diff --git a/.version/2024-05-31.md b/.version/2024-05-31.md
index e1c3913e4..80b66efde 100644
--- a/.version/2024-05-31.md
+++ b/.version/2024-05-31.md
@@ -18,3 +18,4 @@
 | Lab-Project-FreeRTOS-POSIX | freertos/Source/FreeRTOS-Plus-POSIX | https://github.com/sophgo/Lab-Project-FreeRTOS-POSIX.git | sg200x-dev | 5042bfd |
 | cvibuilder | cvibuilder | https://github.com/sophgo/cvibuilder.git | sg200x-dev | 4309f2a |
 | cvikernel | cvikernel | https://github.com/sophgo/cvikernel.git | sg200x-dev | 9f1f57a |
+| cviruntime | cviruntime | https://github.com/sophgo/cviruntime.git | sg200x-dev | 3f49386 |
diff --git a/cviruntime/.gitignore b/cviruntime/.gitignore
new file mode 100644
index 000000000..99a4cb592
--- /dev/null
+++ b/cviruntime/.gitignore
@@ -0,0 +1,3 @@
+build/
+build_sdk/
+**/__pycache__
diff --git a/cviruntime/CMakeLists.txt b/cviruntime/CMakeLists.txt
new file mode 100644
index 000000000..585fd39cf
--- /dev/null
+++ b/cviruntime/CMakeLists.txt
@@ -0,0 +1,196 @@
+cmake_minimum_required(VERSION 3.1.0)
+
+project(cviruntime C CXX)
+
+execute_process(
+  COMMAND git describe --always --tags --dirty
+  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  RESULT_VARIABLE GIT_EXEC_RESULT
+  OUTPUT_VARIABLE GIT_SHORT_HASH)
+
+string(STRIP ${GIT_SHORT_HASH} GIT_SHORT_HASH)
+string(TIMESTAMP BUILD_TIME "%Y%m%d")
+set(RUNTIME_VERSION "${GIT_SHORT_HASH}@${BUILD_TIME}")
+message(STATUS "runtime version: ${RUNTIME_VERSION}")
+add_definitions(-DRUNTIME_VERSION="${RUNTIME_VERSION}")
+
+set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
+set(CMAKE_INSTALL_RPATH "\${ORIGIN}/../lib;\${ORIGIN}/")
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS ON)
+
+option(ENABLE_COMPRESS_CMDBUF "enable compressed cmdbuf" ON)
+option(ENABLE_CPU_FUNC "enable cpu functions" ON)
+option(ENABLE_PMU "enable tpu PMU" ON)
+
+set(SAFETY_FLAGS "-Werror -Wall -Wextra -fno-strict-aliasing -Wno-missing-field-initializers")
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAFETY_FLAGS}")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAFETY_FLAGS}")
+
+if(CMAKE_CROSSCOMPILING)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ftree-vectorize -Wno-unused-parameter")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ftree-vectorize -Wno-unused-parameter")
+endif()
+
+if (NOT DEFINED CHIP)
+  message(FATAL_ERROR "no CHIP specified")
+endif()
+if (NOT DEFINED RUNTIME)
+  message(FATAL_ERROR "no RUNTIME specified")
+endif()
+if(NOT DEFINED CVIKERNEL_PATH)
+  message(FATAL_ERROR "Please set CVIKERNEL_PATH to point to the cvikernel installation")
+endif()
+if (NOT DEFINED FLATBUFFERS_PATH)
+  message(FATAL_ERROR "Please set FLATBUFFERS_PATH to point to the flatbuffers installation")
+endif()
+if(NOT DEFINED CVIBUILDER_PATH)
+  message(FATAL_ERROR "Please set CVIBUILDER_PATH to point to the CVIBUILDER installation")
+endif()
+if(RUNTIME STREQUAL CMODEL)
+  if(NOT DEFINED CMODEL_PATH)
+    message(FATAL_ERROR "Please set CMODEL_PATH to point to the cmodel source installation")
+  endif()
+endif()
+
+message(STATUS "CHIP: ${CHIP}")
+message(STATUS "RUNTIME: ${RUNTIME}")
+message(STATUS "CMODEL_PATH: ${CMODEL_PATH}")
+message(STATUS "CVIKERNEL_PATH: ${CVIKERNEL_PATH}")
+message(STATUS "FLATBUFFERS_PATH: ${FLATBUFFERS_PATH}")
+message(STATUS "CVIBUILDER_PATH: ${CVIBUILDER_PATH}")
+
+if (CHIP STREQUAL cv183x)
+  add_definitions(-DCHIPID=0x1)
+elseif (CHIP STREQUAL cv182x)
+  add_definitions(-DCHIPID=0x2)
+elseif (CHIP STREQUAL cv181x)
+  add_definitions(-DCHIPID=0x3)
+  set(ENABLE_COMPRESS_CMDBUF OFF CACHE BOOL "" FORCE)
+elseif (CHIP STREQUAL cv180x)
+  add_definitions(-DCHIPID=0x4)
+  set(ENABLE_COMPRESS_CMDBUF OFF CACHE BOOL "" FORCE)
+  set(ENABLE_CPU_FUNC OFF CACHE BOOL "" FORCE)
+endif()
+add_definitions(-DCHIP=${CHIP})
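+
+# A typical cmodel configure step looks like the following (illustrative
+# only; the ../install paths are placeholders for wherever cvikernel,
+# flatbuffers, cvibuilder and cmodel were installed):
+#   cmake -G Ninja -DCHIP=cv183x -DRUNTIME=CMODEL \
+#     -DCVIKERNEL_PATH=../install -DFLATBUFFERS_PATH=../install \
+#     -DCVIBUILDER_PATH=../install -DCMODEL_PATH=../install \
+#     -DCMAKE_INSTALL_PREFIX=../install ..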
+
+if (ENABLE_COMPRESS_CMDBUF)
+  add_definitions(-DENABLE_COMPRESS_CMDBUF)
+endif()
+
+if (ENABLE_CPU_FUNC)
+  add_definitions(-DENABLE_CPU_FUNC)
+endif()
+
+if (ENABLE_PMU)
+  add_definitions(-DENABLE_PMU)
+endif()
+
+include_directories(
+  ${PROJECT_SOURCE_DIR}/include
+  ${PROJECT_SOURCE_DIR}/src/common
+  ${CVIBUILDER_PATH}/include
+  ${FLATBUFFERS_PATH}/include
+  ${CVIKERNEL_PATH}/include
+  ${CMAKE_CURRENT_BINARY_DIR})
+
+if (ENABLE_COMPRESS_CMDBUF)
+  include_directories(${PROJECT_SOURCE_DIR}/include/lz4)
+endif()
+
+link_directories(${CVIKERNEL_PATH}/lib)
+set(CVI_LIBS ${CVI_LIBS} cvikernel)
+
+if(RUNTIME STREQUAL CMODEL)
+  include_directories(${CMODEL_PATH}/include)
+  link_directories(${CMODEL_PATH}/lib)
+  set(CVI_LIBS ${CVI_LIBS} cvicmodel)
+endif()
+
+add_subdirectory(src)
+add_subdirectory(tool)
+if (ENABLE_PYRUNTIME STREQUAL "ON")
+  add_subdirectory(python)
+endif()
+
+if (ENABLE_TEST STREQUAL "ON")
+  add_subdirectory(test)
+endif()
+
+if (NOT CMAKE_CROSSCOMPILING)
+  if (ENABLE_TEST STREQUAL "ON")
+    enable_testing()
+  endif()
+endif()
+
+set(HEADERS
+  include/cviruntime.h
+  include/bmruntime.h
+  include/bmruntime_bmkernel.h
+  include/cviruntime_context.h
+  include/cviruntime_extra.h
+  include/cvitpu_debug.h)
+install(FILES ${HEADERS} DESTINATION include)
+
+set(RUNTIME_HEADERS
+  include/runtime/cpu_function.hpp
+  include/runtime/neuron.hpp
+  include/runtime/op_param.hpp)
+install(FILES ${RUNTIME_HEADERS} DESTINATION include/runtime)
+
+if(NOT CMAKE_CROSSCOMPILING)
+  # install the whole sample dir as source code
+  install(DIRECTORY samples DESTINATION .)
+endif()
+
+install(FILES scripts/envs_tpu_sdk.sh
+  PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
+  DESTINATION .)
+
+# The per-chip regression scripts differ only by the chip suffix, so install
+# them with one parameterized block instead of four copies.
+if (CHIP MATCHES "^cv18[0-3]x$")
+  install(FILES scripts/regression_new_models_${CHIP}.sh
+    PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
+    DESTINATION . RENAME regression_models.sh)
+  install(FILES scripts/regression_models_e2e_${CHIP}.sh
+    PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
+    DESTINATION . RENAME regression_models_e2e.sh)
+  install(FILES scripts/regression_samples_${CHIP}.sh
+    PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
+    DESTINATION . RENAME regression_samples.sh)
+endif()
diff --git a/cviruntime/README.md b/cviruntime/README.md
new file mode 100644
index 000000000..e49bf9653
--- /dev/null
+++ b/cviruntime/README.md
@@ -0,0 +1,65 @@
+# runtime
+
+## overview
+
+runtime is the library released in the TPU SDK for developing TPU applications, together with a few tools for testing, benchmarking, and profiling.
+
+tools
+
+* test_cvimodel
+
+## dependency
+
+* cvibuilder (for cvimodel_generated.h)
+* bmkernel (if running bmkernel directly)
+* cmodel (if RUNTIME=CMODEL)
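+
+The installed headers expose the C inference API. A minimal sketch of loading
+and running a cvimodel (names follow the released SDK samples; check
+cviruntime.h in this tree for the authoritative signatures):
+
+```c
+#include "cviruntime.h"
+
+int main(int argc, char **argv) {
+  CVI_MODEL_HANDLE model = NULL;
+  if (CVI_NN_RegisterModel(argv[1], &model) != CVI_RC_SUCCESS)
+    return 1;
+
+  CVI_TENSOR *inputs, *outputs;
+  int32_t input_num, output_num;
+  CVI_NN_GetInputOutputTensors(model, &inputs, &input_num,
+                               &outputs, &output_num);
+
+  /* fill inputs via CVI_NN_TensorPtr()/CVI_NN_TensorSize(), then run */
+  CVI_NN_Forward(model, inputs, input_num, outputs, output_num);
+
+  /* read results from outputs[] the same way, then release the model */
+  CVI_NN_CleanupModel(model);
+  return 0;
+}
+```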
+
+## build
+
+Assuming support, cvibuilder, bmkernel and cmodel are all installed to ../install:
+
+```
+$ cd runtime
+$ mkdir build
+$ cd build
+$ cmake -G Ninja -DCHIP=BM1880v2 -DRUNTIME=CMODEL -DSUPPORT_PATH=../install -DCVIBUILDER_PATH=../install -DCVIKERNEL_PATH=../install -DCMODEL_PATH=../install -DCMAKE_INSTALL_PREFIX=../../install ..
+
+Build
+$ cmake --build .
+$ cmake --build . -- -v
+
+Install
+$ cmake --build . --target install
+$ cmake --build . --target install -- -v
+
+Test
+$ cmake --build . --target test -- -v
+
+Uninstall
+$ xargs rm < install_manifest.txt
+```
+
+## output
+
+## test
+
+```
+$ cd runtime/build
+$ cp bmnet/tests/regression/build/bm1880v2/caffe/resnet50/BM1880v2_resnet50_1.bmodel .
+$ ./test/test_bmnet_bmodel \
+    /data/release/bmnet_models/resnet50/int8/resnet50_input_1_3_224_224.bin \
+    BM1880v2_resnet50_1.bmodel \
+    BM1880v2_resnet50_1_output.bin \
+    1 3 224 224
+```
+
+## TODO
+
+* add SAFETY_FLAGS back
+* for bm1880v2 only, needs refactoring for all chips
+* add cpu layer back (commented out for now, search for SKIP_CPU_LAYER)
diff --git a/cviruntime/build_tpu_sdk.sh b/cviruntime/build_tpu_sdk.sh
new file mode 100755
index 000000000..f91079c03
--- /dev/null
+++ b/cviruntime/build_tpu_sdk.sh
@@ -0,0 +1,160 @@
+#!/bin/bash
+
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+
+echo "TPU_SDK_BUILD_PATH=$TPU_SDK_BUILD_PATH"
+echo "TPU_SDK_INSTALL_PATH=$TPU_SDK_INSTALL_PATH"
+echo "TOP_DIR=$TOP_DIR"
+
+TOOLCHAIN_FILE_PATH=$DIR/scripts/toolchain.cmake
+echo "TOOLCHAIN_FILE_PATH=$TOOLCHAIN_FILE_PATH"
+TOOLCHAIN_AARCH64=$DIR/scripts/toolchain-aarch64-linux.cmake
+TOOLCHAIN_ARM=$DIR/scripts/toolchain-linux-gnueabihf.cmake
+TOOLCHAIN_UCLIBC=$DIR/scripts/toolchain-linux-uclibc.cmake
+TOOLCHAIN_RISCV64=$DIR/scripts/toolchain-riscv64-linux-x86_64.cmake
+TOOLCHAIN_RISCV64_MUSL=$DIR/scripts/toolchain-riscv64-linux-musl-x86_64.cmake
+
+if [ ! 
-e "$OSS_TARBALL_PATH" ]; then + echo "${OSS_TARBALL_PATH} not present, run build_3rd_party first" + exit 1 +fi + +mkdir -p "$TPU_SDK_BUILD_PATH"/build_sdk +mkdir -p "$TPU_SDK_INSTALL_PATH" + +"$OSS_PATH"/run_build.sh -n zlib -e -t "$OSS_TARBALL_PATH" -i "$TPU_SDK_INSTALL_PATH" +"$OSS_PATH"/run_build.sh -n flatbuffers -e -t "$OSS_TARBALL_PATH" -i "$TPU_SDK_INSTALL_PATH"/flatbuffers +"$OSS_PATH"/run_build.sh -n opencv -e -t "$OSS_TARBALL_PATH" -i "$TPU_SDK_INSTALL_PATH"/opencv + +# +# build +# +BUILD_TYPE="RELEASE" +if [ "$BUILD_TYPE" == "RELEASE" ]; then + BUILD_FLAG="-DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_C_FLAGS_RELEASE=-O3 -DCMAKE_CXX_FLAGS_RELEASE=-O3" +else + BUILD_FLAG="-DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_FLAGS=-ggdb" +fi +BUILD_PATH=$TPU_SDK_BUILD_PATH + +CHIP_ID="${CHIP_ARCH,,}" +echo "CHIP_ID=$CHIP_ID" + +# build host flatbuffers +FLATBUFFERS_HOST_PATH=$BUILD_PATH/install_flatbuffers_host +mkdir -p $FLATBUFFERS_HOST_PATH +if [ ! -e $BUILD_PATH/build_flatbuffers_host ]; then + mkdir -p $BUILD_PATH/build_flatbuffers_host +fi +pushd $BUILD_PATH/build_flatbuffers_host +cmake -G Ninja -DCMAKE_INSTALL_PREFIX=$FLATBUFFERS_HOST_PATH \ + $TOP_DIR/flatbuffers +cmake --build . --target install +test $? -ne 0 && echo "build flatbuffers failed !!" && popd && exit 1 +popd + +# build target flat buffer +# move to build_oss + +# generate target-independent flatbuffer schema +CVIMODEL_HOST_PATH=$BUILD_PATH/install_cvimodel_host +if [ ! -e $BUILD_PATH/build_cvimodel ]; then + mkdir -p $BUILD_PATH/build_cvimodel +fi +pushd $BUILD_PATH/build_cvimodel +cmake -G Ninja -DFLATBUFFERS_PATH=$FLATBUFFERS_HOST_PATH \ + -DCMAKE_INSTALL_PREFIX=$CVIMODEL_HOST_PATH \ + $TOP_DIR/cvibuilder +cmake --build . --target install +test $? -ne 0 && echo "build cvibuilder failed !!" && popd && exit 1 +popd + +# build cvikernel +if [ ! -e $BUILD_PATH/build_cvikernel ]; then + mkdir -p $BUILD_PATH/build_cvikernel +fi +pushd $BUILD_PATH/build_cvikernel +cmake -G Ninja $BUILD_FLAG \ + -DCHIP=$CHIP_ID \ + -DCMAKE_TOOLCHAIN_FILE=$TOOLCHAIN_FILE_PATH \ + -DCMAKE_INSTALL_PREFIX=$TPU_SDK_INSTALL_PATH \ + $TOP_DIR/cvikernel +cmake --build . --target install -- -v +test $? -ne 0 && echo "build cvikernel failed !!" && popd && exit 1 +popd + +# build cnpy +if [ ! -e $BUILD_PATH/build_cnpy ]; then + mkdir -p $BUILD_PATH/build_cnpy +fi +pushd $BUILD_PATH/build_cnpy +cmake -G Ninja $BUILD_FLAG \ + -DCMAKE_TOOLCHAIN_FILE=$TOOLCHAIN_FILE_PATH \ + -DCMAKE_INSTALL_PREFIX=$TPU_SDK_INSTALL_PATH \ + $TOP_DIR/cnpy +cmake --build . --target install +test $? -ne 0 && echo "build cnpy failed !!" && popd && exit 1 +popd + +# build runtime + +if [ ! -e $BUILD_PATH/build_cviruntime ]; then + mkdir $BUILD_PATH/build_cviruntime +fi +pushd $BUILD_PATH/build_cviruntime +cmake -G Ninja -DCHIP=$CHIP_ID -DRUNTIME=SOC $BUILD_FLAG \ + -DCMAKE_TOOLCHAIN_FILE=$TOOLCHAIN_FILE_PATH \ + -DCVIKERNEL_PATH=$TPU_SDK_INSTALL_PATH \ + -DCNPY_PATH=$TPU_SDK_INSTALL_PATH/lib \ + -DFLATBUFFERS_PATH=$TPU_SDK_INSTALL_PATH/flatbuffers \ + -DCVIBUILDER_PATH=$CVIMODEL_HOST_PATH \ + -DCMAKE_INSTALL_PREFIX=$TPU_SDK_INSTALL_PATH \ + -DENABLE_TEST=OFF \ + $TOP_DIR/cviruntime +cmake --build . --target install -- -v +test $? -ne 0 && echo "build cviruntime failed !!" && popd && exit 1 +popd + +# build cvimath +if [ ! 
-e $BUILD_PATH/build_cvimath ]; then + mkdir $BUILD_PATH/build_cvimath +fi +pushd $BUILD_PATH/build_cvimath + +cmake -G Ninja \ + -DTOOLCHAIN_ROOT_DIR=$TOOLCHAIN_GCC_PATH \ + -DCMAKE_TOOLCHAIN_FILE=$TOOLCHAIN_FILE_PATH \ + -DTPU_SDK_ROOT=$TPU_SDK_INSTALL_PATH \ + -DCMAKE_INSTALL_PREFIX=$TPU_SDK_INSTALL_PATH \ + $TOP_DIR/cvimath +cmake --build . --target install -- -v +test $? -ne 0 && echo "build cvimath failed !!" && popd && exit 1 +popd + +if [ ! -e $BUILD_PATH/build_samples ]; then + mkdir $BUILD_PATH/build_samples +fi +pushd $BUILD_PATH/build_samples +cmake -G Ninja $BUILD_FLAG \ + -DCMAKE_TOOLCHAIN_FILE=$TOOLCHAIN_FILE_PATH \ + -DTPU_SDK_PATH=$TPU_SDK_INSTALL_PATH \ + -DOPENCV_PATH=$TPU_SDK_INSTALL_PATH/opencv \ + -DCMAKE_INSTALL_PREFIX=$TPU_SDK_INSTALL_PATH/samples \ + $DIR/samples +cmake --build . --target install -- -v +test $? -ne 0 && echo "build samples failed !!" && popd && exit 1 +popd + +# Copy some files for release build +mkdir -p $TPU_SDK_INSTALL_PATH/cmake +cp $TOOLCHAIN_FILE_PATH $TPU_SDK_INSTALL_PATH/cmake +cp $TOOLCHAIN_AARCH64 $TPU_SDK_INSTALL_PATH/cmake +cp $TOOLCHAIN_ARM $TPU_SDK_INSTALL_PATH/cmake +cp $TOOLCHAIN_UCLIBC $TPU_SDK_INSTALL_PATH/cmake +cp $TOOLCHAIN_RISCV64 $TPU_SDK_INSTALL_PATH/cmake +cp $TOOLCHAIN_RISCV64_MUSL $TPU_SDK_INSTALL_PATH/cmake + +# copy lib +mkdir -p "$SYSTEM_OUT_DIR"/lib/ +cp -a "$TPU_SDK_INSTALL_PATH"/lib/*.so* "$SYSTEM_OUT_DIR"/lib/ +cp -a "$TPU_SDK_INSTALL_PATH"/opencv/lib/*.so* "$SYSTEM_OUT_DIR"/lib/ diff --git a/cviruntime/custom_op/example/CMakeLists.txt b/cviruntime/custom_op/example/CMakeLists.txt new file mode 100644 index 000000000..aa5cee510 --- /dev/null +++ b/cviruntime/custom_op/example/CMakeLists.txt @@ -0,0 +1,21 @@ +cmake_minimum_required(VERSION 2.8.0) +project(custom_cpu_function CXX) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS ON) + +if(NOT DEFINED MLIR_INCLUDE) + message(FATAL_ERROR "Please set MLIR_INCLUDE to point to the include path of mlir") +endif() + +include_directories(${PROJECT_SOURCE_DIR}) +include_directories(${MLIR_INCLUDE}) + +add_library(CustomOpPlugin SHARED + LeakyReluOp.cpp + ROIAlignOp.cpp + SoftmaxOp.cpp + UnPoolingOp.cpp) + +install(TARGETS CustomOpPlugin DESTINATION lib/custom_op/) diff --git a/cviruntime/custom_op/example/LeakyReluOp.cpp b/cviruntime/custom_op/example/LeakyReluOp.cpp new file mode 100644 index 000000000..a86213cb4 --- /dev/null +++ b/cviruntime/custom_op/example/LeakyReluOp.cpp @@ -0,0 +1,461 @@ +/* +* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved. 
+*/ +#include "LeakyReluOp.h" +#include "QuantHelper.h" +#include + +#define NPU_SHIFT 5 +#define EU_SHIFT 4 +#define NPU_NUM (1 << NPU_SHIFT) +#define EU_NUM (1 << EU_SHIFT) +#define LOCAL_MEM_SIZE (1 << 15) +#define NEURON_MEMORY 0 +#define WEIGHT_MEMORY 1 + +namespace cvi { + +void LeakyReluOp::interpretFp32( + std::vector>> &operand_tensors, + std::vector> &operand_shapes, + std::shared_ptr> &result_tensor, + std::vector &result_shape) { + int n = operand_shapes[0][0]; + int c = operand_shapes[0][1]; + int h = operand_shapes[0][2]; + int w = operand_shapes[0][3]; + auto input = operand_tensors[0]->data(); + auto output = result_tensor->data(); + auto negative_slope = param.get("negative_slope"); + + for (int i = 0; i < (int)operand_tensors[0]->size(); ++i) { + if (input[i] >= 0) { + output[i] = input[i]; + } else { + output[i] = negative_slope * input[i]; + } + } +} + +void LeakyReluOp::interpretInt8( + std::vector>> &operand_tensors, + std::vector> &operand_shapes, + std::shared_ptr> &result_tensor, + std::vector &result_shape) { + int n = operand_shapes[0][0]; + int c = operand_shapes[0][1]; + int h = operand_shapes[0][2]; + int w = operand_shapes[0][3]; + + auto input = operand_tensors[0]->data(); + auto quant_pos_rshift = + param.has("rshift_pos") ? (float)param.get("rshift_pos") : 0.0f; + auto quant_pos_multiplier = + param.has("m_i8_pos") ? (float)param.get("m_i8_pos") : 0.0f; + auto quant_neg_rshift = (float)param.get("rshift_neg"); + auto quant_neg_multiplier = (float)param.get("m_i8_neg"); + + auto output = result_tensor->data(); + + // rshift and saturate on output + for (int i = 0; i < (int)operand_tensors[0]->size(); ++i) { + if (input[i] > 0) { + if (quant_pos_multiplier != 0.0f) { + output[i] = (float)applyMultiplierAndRShiftAndSaturateInt8( + input[i], (uint32_t)quant_pos_rshift, quant_pos_multiplier, false); + } else { + output[i] = input[i]; + } + } else { + output[i] = (float)applyMultiplierAndRShiftAndSaturateInt8( + input[i], (uint32_t)quant_neg_rshift, quant_neg_multiplier, false); + } + } +} + +void LeakyReluOp::quantizeInt8() { + // support per-tensor only for now + setOpQuantPerchannel(false); + // use rshift and INT8 multiplier + setOpQuantParamType("RSHIFT_AND_M_I8"); + + float negative_slope = param.get("negative_slope"); + std::cout << " negative_slope: " << std::to_string(negative_slope) << "\n"; + + // create tensors for rshift and multiplier + float rshift_pos = 0; + float multiplier_pos = 0; + float rshift_neg = 0; + float multiplier_neg = 0; + + // quantization + float threshold_x = getPrevOpThreshold(); + float threshold_y = getOpThreshold(); + std::cout << "threshold_y = " << std::to_string(threshold_y) + << ", threshold_x = " << std::to_string(threshold_x) << "\n"; + + // positive + double qscale_pos = threshold_x / threshold_y; + if (fabs(threshold_x - threshold_y) < 1e-5 * std::min(threshold_x, threshold_y)) { + // no positive scale + rshift_pos = 0; + multiplier_pos = 0; + std::cout << " Positive: no_scale\n"; + } else { + uint32_t uint_multiplier_pos; + rshift_pos = + (float)findRShiftAndMultiplierFromQScale(qscale_pos, &uint_multiplier_pos, false); + multiplier_pos = (float)uint_multiplier_pos; + std::cout << " Positive: "; + std::cout << " [multiplier : rshift] = [" << std::to_string(multiplier_pos) << " : " + << std::to_string(rshift_pos) << "]\n"; + } + // negative + float qscale_neg = fabs(qscale_pos * negative_slope); + uint32_t uint_multiplier_neg = 0; + rshift_neg = + (float)findRShiftAndMultiplierFromQScale(qscale_neg, 
&uint_multiplier_neg, false); + multiplier_neg = (float)uint_multiplier_neg; + std::cout << " Negative: "; + std::cout << " [multiplier : rshift] = [" << std::to_string(multiplier_neg) << " : " + << std::to_string(rshift_neg) << "]\n"; + + bool do_pos_scale = (multiplier_pos != 0.0) ? true : false; + if (do_pos_scale) { + param.put("rshift_pos", static_cast(rshift_pos)); + param.put("m_i8_pos", static_cast(multiplier_pos)); + } + param.put("rshift_neg", static_cast(rshift_neg)); + param.put("m_i8_neg", static_cast(multiplier_neg)); +} + +void LeakyReluOp::codeGenInt8(void *ctx, + std::vector> &operand_shapes, + std::vector &operand_gaddrs, + std::vector &result_shape, uint64_t result_gaddr, + int layer_id) { + auto pos_rshift = param.has("rshift_pos") ? param.get("rshift_pos") : 0; + auto pos_m_i8 = param.has("m_i8_pos") ? param.get("m_i8_pos") : 0; + auto neg_rshift = param.has("rshift_neg") ? param.get("rshift_neg") : 0; + auto neg_m_i8 = param.has("m_i8_neg") ? param.get("m_i8_neg") : 0; + assert(neg_m_i8); + + int n = operand_shapes[0][0]; + int c = operand_shapes[0][1]; + int h = operand_shapes[0][2]; + int w = operand_shapes[0][3]; + uint64_t operand_gaddr = operand_gaddrs[0]; + uint64_t ga_output = result_gaddr; + + leakyrelu_codegen((cvk_context_t *)ctx, // ctx + layer_id, // layer_id + operand_gaddr, // input_gaddr + result_gaddr, // output_gaddr + n, // input_n + c, // input_c + h, // input_h + w, // input_w + pos_rshift, // GT_right_shift_width + neg_rshift, // LE_right_shift_width + pos_m_i8, // GT_scale + neg_m_i8 // LE_scale + ); +} + +void LeakyReluOp::tdma_load(cvk_context_t *ctx, cvk_tl_t *tlp, uint64_t ga_src) { + cvk_tg_t ts_data; + ts_data.base_reg_index = NEURON_MEMORY; + ts_data.fmt = tlp->fmt; + ts_data.start_address = ga_src; + ts_data.shape = {tlp->shape.n, tlp->shape.c, tlp->shape.h, tlp->shape.w}; + ts_data.stride = ctx->ops->tg_default_stride(ctx, ts_data.shape, ts_data.fmt); + + cvk_tdma_g2l_tensor_copy_param_t p1; + p1.src = &ts_data; + p1.dst = tlp; + ctx->ops->tdma_g2l_tensor_copy(ctx, &p1); +} + +void LeakyReluOp::tdma_store(cvk_context_t *ctx, cvk_tl_t *tlp, uint64_t ga_dst) { + cvk_tg_t ts_data; + ts_data.base_reg_index = NEURON_MEMORY; + ts_data.fmt = tlp->fmt; + ts_data.start_address = ga_dst; + ts_data.shape = {tlp->shape.n, tlp->shape.c, tlp->shape.h, tlp->shape.w}; + ts_data.stride = ctx->ops->tg_default_stride(ctx, ts_data.shape, ts_data.fmt); + + cvk_tdma_l2g_tensor_copy_param_t p1; + p1.src = tlp; + p1.dst = &ts_data; + ctx->ops->tdma_l2g_tensor_copy(ctx, &p1); +} + +void LeakyReluOp::leakyrelu_kernel(cvk_context_t *ctx, int layer_id, cvk_tl_t &bottom, + cvk_tl_t &relu, cvk_tl_t &neg, + int GT_right_shift_width, int LE_right_shift_width, + int GT_scale, int LE_scale) { + bool isIgnorePosPart = (GT_scale == 0); + bool isSlopeSmallerThanOne = ((LE_scale >> LE_right_shift_width) == 0); + + if (isIgnorePosPart) { + cvk_tiu_mul_param_t p4; + p4.res_high = nullptr; + p4.res_low = &relu; + p4.a = ⊥ + p4.b_const.val = LE_scale; + p4.b_const.is_signed = true; + p4.b_is_const = 1; + p4.rshift_bits = LE_right_shift_width; + p4.layer_id = layer_id; + p4.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p4); + + if (isSlopeSmallerThanOne) { + cvk_tiu_max_param_t p1; + p1.max = ⊥ + p1.a = ⊥ + p1.b = &relu; + p1.b_is_const = 0; + p1.layer_id = layer_id; + ctx->ops->tiu_max(ctx, &p1); + } else { + cvk_tiu_min_param_t p1; + p1.min = ⊥ + p1.a = ⊥ + p1.b = &relu; + p1.b_is_const = 0; + p1.layer_id = layer_id; + ctx->ops->tiu_min(ctx, &p1); + } + } else { + // 0. 
relu = relu(bottom) + cvk_tiu_max_param_t p13; + p13.max = &relu; + p13.a = ⊥ + p13.b_is_const = 1; + p13.b_const.is_signed = 1; + p13.b_const.val = 0; + p13.layer_id = layer_id; + ctx->ops->tiu_max(ctx, &p13); + + // 1. relu = (relu * GT_scale) >> GT_right_shift_width + cvk_tiu_mul_param_t p; + p.res_high = nullptr; + p.res_low = &relu; + p.a = &relu; + p.b_const.val = GT_scale; + p.b_const.is_signed = true; + p.b_is_const = 1; + p.rshift_bits = GT_right_shift_width; + p.layer_id = layer_id; + p.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p); + + // 2. neg = neg(0, botom) + cvk_tiu_min_param_t p7; + p7.min = &neg; + p7.a = ⊥ + p7.b_is_const = 1; + p7.b_const.val = 0; + p7.b_const.is_signed = 1; + p7.layer_id = layer_id; + ctx->ops->tiu_min(ctx, &p7); + + // 3. neg (n,c,h,w) = (neg(n,c,h,w) * slope) >> LE_right_shift_width + cvk_tiu_mul_param_t p8; + p8.res_high = nullptr; + p8.res_low = &neg; + p8.a = &neg; + p8.b_const.val = LE_scale; + p8.b_const.is_signed = true; + p8.b_is_const = 1; + p8.rshift_bits = LE_right_shift_width; + p8.layer_id = layer_id; + p8.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p8); + + // 4. bottom = or relu, neg + cvk_tiu_or_int8_param_t p9; + p9.res = ⊥ + p9.a = &relu; + p9.b = &neg; + p9.layer_id = layer_id; + ctx->ops->tiu_or_int8(ctx, &p9); + } +} + +void LeakyReluOp::leakyrelu_codegen(cvk_context_t *ctx, uint32_t layer_id, + uint64_t input_gaddr, uint64_t output_gaddr, + int input_n, int input_c, int input_h, int input_w, + int GT_right_shift_width, int LE_right_shift_width, + int GT_scale, int LE_scale) { + printf("leakyrelu_codegen:\n" + " layer_id %d\n" + " input_gddr: %lx, output_gaddr: %lx\n" + " input (%d, %d, %d, %d)\n" + " GT_scale:%d, LE_scale:%d\n" + " GT_right_shift_width:%d, LE_right_shift_width:%d\n", + layer_id, input_gaddr, output_gaddr, input_n, input_c, input_h, input_w, + GT_scale, LE_scale, GT_right_shift_width, LE_right_shift_width); + + // Split input based on local memory + uint32_t total_eu = NPU_NUM * EU_NUM; + uint32_t lane_size = LOCAL_MEM_SIZE; + uint32_t total_mem_size = NPU_NUM * LOCAL_MEM_SIZE; + uint32_t max_N = (1 << 12) - 1; // 1880v2: 12 bit + uint32_t max_W = (1 << 12) - 1; // 1880v2: 12 bit + uint32_t count = input_n * input_c * input_h * input_w; + uint32_t tiled_N = count / total_eu / 3; // 3 blobs + tiled_N = (tiled_N > max_N) ? 
max_N : tiled_N; + + // local tensor shape(tiled_N, npu_num, 1, eu_num) + cvk_tl_shape_t tl_shape = {tiled_N, static_cast(NPU_NUM), 1, + static_cast(EU_NUM)}; + cvk_tl_stride_t tl_stride = ctx->ops->tl_default_stride(ctx, tl_shape, CVK_FMT_I8, 1); + + // Find max tiled_N + uint32_t required_size = 0; + do { + tl_shape.n = tiled_N; + tl_stride = ctx->ops->tl_default_stride(ctx, tl_shape, CVK_FMT_I8, 1); + required_size = 3 * tl_shape.n * tl_stride.n; // 3 blobs + + if (required_size <= lane_size) { + break; + } + + } while (--tiled_N); + + printf(" tiled_bottom shape (%d, %d, %d, %d), stride (%d, %d, %d, %d)\n" + " required_size %d kB/lane\n", + tl_shape.n, tl_shape.c, tl_shape.h, tl_shape.w, tl_stride.n, tl_stride.c, + tl_stride.h, tl_stride.w, required_size / 1024); + + assert(tiled_N); + if (!tiled_N) { + return; + } + + // Tiled local memory layout: + // tiled bottom/result + // tiled relu + // tiled neg + + // Tiled bottom + required_size /= 3; // for 3 blobs + cvk_tl_t tl_tiled_bottom; + tl_tiled_bottom.start_address = 0; + tl_tiled_bottom.fmt = CVK_FMT_I8; + tl_tiled_bottom.shape = tl_shape; + tl_tiled_bottom.stride = tl_stride; + + // Tiled relu + cvk_tl_t tl_tiled_relu = tl_tiled_bottom; + tl_tiled_relu.start_address = tl_tiled_bottom.start_address + required_size; + + // Tiled neg + cvk_tl_t tl_tiled_neg = tl_tiled_bottom; + tl_tiled_neg.start_address = tl_tiled_relu.start_address + required_size; + + // In unit of tiled_N * npu_num * eu_num + uint32_t global_input_offset = 0; + for (uint32_t i = 0; i < (count / total_eu / tiled_N); i++) { + // Load as a chunk of contiguous memory in global memory, not use global + // shape/stride Local memory use tensor shape to maximize eu utilization. + tdma_load(ctx, &tl_tiled_bottom, input_gaddr + global_input_offset); + leakyrelu_kernel(ctx, layer_id, tl_tiled_bottom, tl_tiled_relu, tl_tiled_neg, + GT_right_shift_width, LE_right_shift_width, GT_scale, LE_scale); + // Store bottom as a chunk of contiguous memory, not use global shape/stride + tdma_store(ctx, &tl_tiled_bottom, output_gaddr + global_input_offset); + + // Next input offset + global_input_offset += tiled_N * total_eu; + + } // for (uint32_t i = 0; i < (count/total_eu/tiled_N); i++) + + // Remaining count, in unit of npu_num * eu_num + if (global_input_offset < count) { + uint32_t tiled_W = (count - global_input_offset) / NPU_NUM; + tiled_N = 1; + do { + tl_shape.n = tiled_N; + tl_shape.w = tiled_W; + tl_stride = ctx->ops->tl_default_stride(ctx, tl_shape, CVK_FMT_I8, 1); + required_size = 3 * tl_shape.n * tl_stride.n; // 3 blobs + + if (required_size <= lane_size && (tiled_W <= max_W)) { + break; + } else { + tiled_W /= 2; + tiled_N *= 2; + } + } while (true); // Magic number for 2^12 -1 - 32 + + if ((count - global_input_offset) % NPU_NUM != 0) { + std::cout << "Remaining size should align npu_num, or die"; + assert(0); + } + + // Update shape, stride + tl_shape.n = tiled_N; + tl_shape.w = tiled_W; + tl_stride = ctx->ops->tl_default_stride(ctx, tl_shape, CVK_FMT_I8, 1); + required_size = tl_shape.n * tl_stride.n; + + printf(" tiled_bottom shape (%d, %d, %d, %d), stride (%d, %d, %d, %d)\n" + " required_size %d kB/lane\n", + tl_shape.n, tl_shape.c, tl_shape.h, tl_shape.w, tl_stride.n, tl_stride.c, + tl_stride.h, tl_stride.w, required_size / 1024); + + // Tiled bottom + tl_tiled_bottom.shape = tl_shape; + tl_tiled_bottom.stride = tl_stride; + + // Tiled bottom with precise stride + cvk_tl_t tl_tiled_bottom_precise_stride = tl_tiled_bottom; + tl_tiled_bottom_precise_stride.stride = { 
+ static_cast(tl_shape.h * tl_shape.w * sizeof(uint8_t)), + static_cast(tl_shape.h * tl_shape.w * sizeof(uint8_t)), + static_cast(tl_shape.w * sizeof(uint8_t)), sizeof(uint8_t)}; + + printf(" tiled_bottom_precise shape (%d, %d, %d, %d), stride (%d, %d, %d, %d)\n" + " required_size %d kB/lane\n", + tl_shape.n, tl_shape.c, tl_shape.h, tl_shape.w, + tl_tiled_bottom_precise_stride.stride.n, + tl_tiled_bottom_precise_stride.stride.c, + tl_tiled_bottom_precise_stride.stride.h, + tl_tiled_bottom_precise_stride.stride.w, required_size / 1024); + + // Tiled relu + tl_tiled_relu = tl_tiled_bottom; + tl_tiled_relu.start_address = tl_tiled_bottom.start_address + required_size; + + // Tiled neg + tl_tiled_neg = tl_tiled_bottom; + tl_tiled_neg.start_address = tl_tiled_relu.start_address + required_size; + + // Load as a chunk of contiguous memory in global memory, not use global + // shape/stride Local memory use tensor shape to maximize eu utilization. + + tdma_load(ctx, &tl_tiled_bottom, input_gaddr + global_input_offset); + + leakyrelu_kernel(ctx, layer_id, tl_tiled_bottom, tl_tiled_relu, tl_tiled_neg, + GT_right_shift_width, LE_right_shift_width, GT_scale, LE_scale); + + // Store bottom as a chunk of contiguous memory, not use global shape/stride + tdma_store(ctx, &tl_tiled_bottom, output_gaddr + global_input_offset); + + global_input_offset += tl_tiled_bottom_precise_stride.shape.n * + tl_tiled_bottom_precise_stride.stride.n * NPU_NUM; + } + + // Remaining count, in unit of eu_num + if (global_input_offset != count) { + printf("global_input_offset != count (%d != %d)/n", global_input_offset, count); + assert(0); + } +} + +RegisterCustomOp(leaky_relu, LeakyReluOp); + +} // namespace cvi \ No newline at end of file diff --git a/cviruntime/custom_op/example/LeakyReluOp.h b/cviruntime/custom_op/example/LeakyReluOp.h new file mode 100644 index 000000000..f1e390265 --- /dev/null +++ b/cviruntime/custom_op/example/LeakyReluOp.h @@ -0,0 +1,44 @@ +/* +* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved. 
+*/ +#ifndef LEAKY_RELU_OP_H_ +#define LEAKY_RELU_OP_H_ + +#include "tpuc/CustomOp.h" +#include + +namespace cvi { + +class LeakyReluOp : public CustomOp { +public: + LeakyReluOp(OpParam ¶m) : CustomOp(param) {} + + void interpretFp32(std::vector>> &operand_tensors, + std::vector> &operand_shapes, + std::shared_ptr> &result_tensor, + std::vector &result_shape); + void interpretInt8(std::vector>> &operand_tensors, + std::vector> &operand_shapes, + std::shared_ptr> &result_tensor, + std::vector &result_shape); + void quantizeInt8(); + void codeGenInt8(void *ctx, + std::vector> &operand_shapes, + std::vector &operand_gaddrs, + std::vector &result_shape, uint64_t result_gaddr, + int layer_id); + +private: + void tdma_load(cvk_context_t *ctx, cvk_tl_t *tlp, uint64_t ga_src); + void tdma_store(cvk_context_t *ctx, cvk_tl_t *tlp, uint64_t ga_dst); + void leakyrelu_kernel(cvk_context_t *ctx, int layer_id, cvk_tl_t &bottom, + cvk_tl_t &relu, cvk_tl_t &neg, int GT_right_shift_width, + int LE_right_shift_width, int GT_scale, int LE_scale); + void leakyrelu_codegen(cvk_context_t *ctx, uint32_t layer_id, uint64_t input_gaddr, + uint64_t output_gaddr, int input_n, int input_c, int input_h, + int input_w, int GT_right_shift_width, int LE_right_shift_width, + int GT_scale, int LE_scale); +}; + +} // namespace cvi +#endif diff --git a/cviruntime/custom_op/example/QuantHelper.h b/cviruntime/custom_op/example/QuantHelper.h new file mode 100644 index 000000000..bc5e00ad6 --- /dev/null +++ b/cviruntime/custom_op/example/QuantHelper.h @@ -0,0 +1,191 @@ +/* +* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved. +*/ +#ifndef CVI_QUANT_HELPER_H +#define CVI_QUANT_HELPER_H +#include +#include +#include +#include +#include + +static int RoundingDivideByPOT(int x, int exponent) { + if (x == 0) { + return 0; + } + if (exponent == 0) { + return x; + } + assert(exponent > 0); + const int shift_vec = -exponent; + const int fixup = (x & shift_vec) >> 31; + const int fixed_up_x = x + fixup; + + int nudge = 1 << (exponent - 1); + int val = (fixed_up_x + nudge) >> exponent; + + return val; +} + +static int SaturatingRoundingDoublingHighMul(int a, int b) { + int64_t a_64(a); + int64_t b_64(b); + int64_t ab_64 = a_64 * b_64; + int nudge = ab_64 >= 0 ? (1 << 30) : (1 - (1 << 30)); + int ab_x2_high32 = static_cast((ab_64 + nudge) / (1ll << 31)); + return ab_x2_high32; +} + +/// saturate a float to range [-128, 127] +static int8_t saturateInt8(float f) { +#if 0 + // cast + int q = (int)f; +#elif 0 + // away_from_zero + int q = (f >= 0) ? (int)std::ceil(f) : (int)std::floor(f); +#elif 0 + // round + int q = (int)std::roundf(f); +#elif 0 + // trancate, (towards zero) + int q = (f >= 0) ? 
(int)std::floor(f) : (int)std::ceil(f); +#elif 1 + // from caffe_int8 + int q = (int)std::floor(f + 0.5); +#else + // looks HW is different than std::round() + // we shall apply round only for input quant() + int q = (int)std::round(f); +#endif + if (q > 127) + q = 127; + if (q < -128) + q = -128; + + return (int8_t)q; +} + +/// Simulate HW behavior, after accumuation +/// apply multiplier, do rshift, and then saturate to INT8 +/// used in BM1880v2 per-channel mode (32bit bias) +/// qdm mode +/// use GOOGLE GEMMLOWP QDM multiply and shift +/// during multiply, a factor of (1 << 31) has been devided +static int8_t applyMultiplierAndRShiftAndSaturateInt8(float v, uint32_t rshift, + uint32_t multiplier, bool qdm) { + if (qdm) { + int32_t q = RoundingDivideByPOT( + SaturatingRoundingDoublingHighMul((int32_t)v, (int32_t)multiplier), rshift); + // llvm::errs() << "v,rshift,multiplier,q = " << v << "," + // << rshift << "," << multiplier << "," << q << "\n"; + return saturateInt8((float)q); + } else { + return saturateInt8(v * multiplier / (1 << rshift)); + } +} + +// reference to reference to [arxiv 1712.05877] +// This implementation comes from tensorflow +// https://github.com/tensorflow/tensorflow/blob/98ff991500a0247f8f57c60db9a206204268bc42/tensorflow/lite/kernels/internal/quantization_util.cc#L52-L90 +#define Tensorflow_QuantizeMultiplier QuantizeMultiplier +static void QuantizeMultiplier(double double_multiplier, int32_t *quantized_multiplier, + int *shift) { + if (double_multiplier == 0.) { + *quantized_multiplier = 0; + *shift = 0; + return; + } + + const double q = std::frexp(double_multiplier, shift); + auto q_fixed = static_cast(std::round(q * (1ll << 31))); + + assert(q_fixed <= (1ll << 31)); + if (q_fixed == (1ll << 31)) { + q_fixed /= 2; + ++*shift; + } + + assert(q_fixed <= std::numeric_limits::max()); + // A shift amount smaller than -31 would cause all bits to be shifted out + // and thus all results would be zero. We implement that instead with + // q_fixed==0, so as to avoid hitting issues with right-shift + // operations with shift amounts greater than 31. Note that this happens + // roughly when abs(double_multiplier) < 2^-31 and the present handling means + // that we're effectively flushing tiny double_multiplier's to zero. + // We could conceivably handle values in the range (roughly) [32, 63] + // as 'denormals' i.e. (shift==0, q_fixed < 2^30). In that point of view + // the present handling is just doing 'flush denormals to zero'. We could + // reconsider and actually generate nonzero denormals if a need arises. 
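+  //
+  // Worked example (illustrative): double_multiplier = 0.0015
+  //   std::frexp(0.0015, &shift) -> q = 0.768, shift = -9
+  //   q_fixed = round(0.768 * 2^31) = 1649267442
+  //   so 0.0015 is returned as multiplier 1649267442 with shift -9,
+  //   i.e. 0.0015 == (1649267442 / 2^31) * 2^-9.
+  //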
+ if (*shift < -31) { + *shift = 0; + q_fixed = 0; + } + *quantized_multiplier = static_cast(q_fixed); +} + +/// find RShift and Multiplier from QScale +/// QScale = Multiplier / (1 << RShift) +/// Multiplier is an integer +/// case 1: specifically multiply a int8/uint8 multplier, then rshift +/// used in layers like element_wise, pooling, concat, etc +/// qdm is false +/// a max_multiplier (127 or 255 normally) has to be provided +/// case 2: qdm mode +/// used in BM1880v2 per-channel conv mode +/// qdm is true +/// reference to [arxiv 1712.05877] +/// choose the int32 value nearest to 2^31 * M0, M0 in [0.5, 1] +/// this value is always at least 2^30 and have at least 30 bits accuracy +/// the max_multiplier argument is ignored, fixed to (1 << 31) +/// if 'uint32_t *multiplier' is present, return multipler alongside +static int8_t findRShiftAndMultiplierFromQScale(double qscale, + uint32_t *multiplier = nullptr, + bool qdm = false, + uint32_t max_multiplier = 127) { + if (qdm) { +#if 0 + max_multiplier = (1 << 31); + for (uint32_t rshift = 0; rshift < 63; ++rshift) { + if ( ((double)qscale * (1ULL << (rshift + 1))) >= (double)max_multiplier ) { + if (multiplier) { + *multiplier = (uint32_t)((double)qscale * (1ULL << rshift)); + } + return rshift - 31; + } + } +#endif + // this ensures if qscale is 0, both multiplier and shift will be 0 + int32_t quantized_multiplier = 0; + int lshift = 0; + Tensorflow_QuantizeMultiplier(qscale, &quantized_multiplier, &lshift); + if (multiplier) + *multiplier = quantized_multiplier; + int rshift = -lshift; + assert(rshift >= 0); + if (rshift > 25) { + std::cout << "WARNING: large rshift = " << rshift << ", qscale = " << qscale + << "\n"; + } + return (int8_t)rshift; + } else { + assert(qscale < max_multiplier); + for (int8_t rshift = 0; rshift < 63; ++rshift) { + if (((double)qscale * (1ULL << (rshift + 1))) >= (double)max_multiplier) { + if (multiplier) { + *multiplier = (uint32_t)((double)qscale * (1ULL << rshift)); + } + return rshift; + } + } + // assert(false); + std::cout << "WARNING: failed to find rshift, qscale = " << std::to_string(qscale) + << "\n"; + // we are here because qscale is too small, return 0 for both shift and multiplier + if (multiplier) { + *multiplier = 0; + } + return 0; + } +} +#endif \ No newline at end of file diff --git a/cviruntime/custom_op/example/README b/cviruntime/custom_op/example/README new file mode 100644 index 000000000..7189711c3 --- /dev/null +++ b/cviruntime/custom_op/example/README @@ -0,0 +1,42 @@ +CustomOp: + Attributes: + `param` : required, a PoolParam struct attributes, carrying + filter size, stride, padding, and do_relu. + `quant` : required, a QuantParam struct attributes. + `name` : required, name for calibration, comparing, or debug. + `do_quant` : required, quantize to int8/bf16 or not. + `threshold_overwrite` : required, overwrite threshold backward/forward or not. + `layer_id` : optional, id for profiling. 
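+
+Plugin:
+  A custom op is a C++ class registered with the plugin library. A minimal
+  sketch, mirroring the bundled LeakyReluOp example (the op name "my_op" and
+  the fp32 body are placeholders; the signature is reconstructed from the
+  headers in this directory, see tpuc/CustomOp.h for the exact interface):
+
+    #include "tpuc/CustomOp.h"
+
+    namespace cvi {
+
+    class MyOp : public CustomOp {
+    public:
+      MyOp(OpParam &param) : CustomOp(param) {}
+
+      // fp32 reference implementation used by the interpreter
+      void interpretFp32(std::vector<std::shared_ptr<std::vector<float>>> &operand_tensors,
+                         std::vector<std::vector<int64_t>> &operand_shapes,
+                         std::shared_ptr<std::vector<float>> &result_tensor,
+                         std::vector<int64_t> &result_shape) {
+        (void)operand_shapes; (void)result_shape;
+        *result_tensor = *operand_tensors[0]; // identity, for illustration only
+      }
+    };
+
+    RegisterCustomOp(my_op, MyOp);
+
+    } // namespace cvi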
+ +FrontEnd: + def convert_leaky_relu_op(self, onnx_node): + assert(onnx_node.op_type == "LeakyRelu") + alpha = onnx_node.attrs.get("alpha", 0.01) + custom_op_param = { + 'tpu': True, + 'do_quant': True, + 'operation_name': 'leaky_relu', + 'threshold_overwrite': 'backward', + 'param': { + 'negative_slope': float(alpha) + } + } + op, input_shape, tensor_type = self.getOperand(onnx_node.inputs[0]) + operands = list() + operands.append(op) + output_shape = input_shape + custom_op = self.CVI.add_custom_op("{}_{}".format(onnx_node.name, onnx_node.op_type), operands, output_shape, **custom_op_param) + self.addOperand(onnx_node.name, custom_op, output_shape, TensorType.ACTIVATION) + +Calibration: + gen_data_list.py /work/dataset/coco/val2017/ 1000 cali_list.txt + python /work/cvitek_mlir/python/run_calibration.py \ + --model_name yolo_v3 yolo_v3_416_onnx_opt.mlir cali_list.txt \ + --input_num=100 --custom_op_plugin libCustomOpPlugin.so + +Quantization & Optimization: + add "--custom-op-plugin libCustomOpPlugin.so" + +Codegen: + add "--custom-op-plugin libCustomOpPlugin.so" & + "--custom-runtime-lib libCustomOpRuntime_arm64.so,libCustomOpRuntime_x86.so" diff --git a/cviruntime/custom_op/example/ROIAlignOp.cpp b/cviruntime/custom_op/example/ROIAlignOp.cpp new file mode 100644 index 000000000..8f586ca38 --- /dev/null +++ b/cviruntime/custom_op/example/ROIAlignOp.cpp @@ -0,0 +1,122 @@ +/* +* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved. +*/ +#include +#include +#include +#include +#include +#include +#include "ROIAlignOp.h" + +namespace cvi { + +void ROIAlignOp::interpretFp32( + std::vector>> &operand_tensors, + std::vector> &operand_shapes, + std::shared_ptr> &result_tensor, + std::vector &result_shape) { + const int32_t pooled_h = param.get("pooled_h"); + const int32_t pooled_w = param.get("pooled_w"); + const float spatial_scale = param.get("spatial_scale"); + + auto data_shape = operand_shapes[0]; + auto roi_shape = operand_shapes[1]; + + const int batch = (int)data_shape[0]; + const int channel = (int)data_shape[1]; + const int height = (int)data_shape[2]; + const int width = (int)data_shape[3]; + + const int rois_num = roi_shape[2]; + assert(batch * rois_num == result_shape[0]); + assert(channel == result_shape[1]); + + float* data = operand_tensors[0]->data(); + float* rois = operand_tensors[1]->data(); + float* result = result_tensor->data(); + + const int one_batch_output_size = rois_num * channel * pooled_h * pooled_w; + + for (int b = 0; b < batch; ++b) { + float* batch_rois = rois + b * rois_num * 5; + float* batch_output = result + b * one_batch_output_size; + for (int roi_idx = 0; roi_idx < rois_num; ++roi_idx) { + const int roi_batch_idx = batch_rois[roi_idx * 5]; + assert(roi_batch_idx == b); + + const float roi_start_x = batch_rois[roi_idx * 5 + 1] * spatial_scale; + const float roi_start_y = batch_rois[roi_idx * 5 + 2] * spatial_scale; + const float roi_end_x = batch_rois[roi_idx * 5 + 3] * spatial_scale; + const float roi_end_y = batch_rois[roi_idx * 5 + 4] * spatial_scale; + + const float roi_w = std::max(roi_end_x - roi_start_x + 1, 1.0f); + const float roi_h = std::max(roi_end_y - roi_start_y + 1, 1.0f); + + float bin_size_w = roi_w / (float)pooled_w; + float bin_size_h = roi_h / (float)pooled_h; + + float* batch_data = data + b * channel * height * width; + + for (int c = 0; c < channel; ++c) { + for (int ph = 0; ph < pooled_h; ++ph) { + for (int pw = 0; pw < pooled_w; ++pw) { + const float region_start_x = std::min(pw * bin_size_w + roi_start_x, 
(float)(width)); + const float region_start_y = std::min(ph * bin_size_h + roi_start_y, (float)(height)); + const float region_end_x = std::min((pw+1) * bin_size_w + roi_start_x, (float)(width)); + const float region_end_y = std::min((ph+1) * bin_size_h + roi_start_y, (float)(height)); + + const int region_grid_w = int(std::ceil(bin_size_w)); + const int region_grid_h = int(std::ceil(bin_size_h)); + + const int output_idx = ph * pooled_w + pw; + if (region_start_x >= region_end_x || region_start_y >= region_end_y) { + batch_output[output_idx] = 0; + continue; + } + + float value = 0; + float fmax = std::numeric_limits::min(); + for (int gh = 0; gh < region_grid_h; ++gh) { + for (int gw = 0; gw < region_grid_w; ++gw) { + float x = roi_start_x + gw; + float y = roi_start_y + gh; + + const int x_low = x; + const int y_low = y; + + const int x_high = x_low + 1; + const int y_high = y_low + 1; + + const float x_ratio = x - x_low; + const float y_ratio = y - y_low; + + const float w1 = (1 - y_ratio) * (1 - x_ratio); + const float w2 = (1 - y_ratio) * x_ratio; + const float w3 = y_ratio * (1 - x_ratio); + const float w4 = y_ratio * x_ratio; + + const float data1 = batch_data[y_low * height + x_low]; + const float data2 = batch_data[y_low * height + x_high]; + const float data3 = batch_data[y_high * height + x_low]; + const float data4 = batch_data[y_high * height + x_high]; + value = w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4; + if (value > fmax) { + fmax = value; + } + } + } + batch_output[output_idx] = fmax; + } + } + + batch_data += height * width; + batch_output += pooled_h * pooled_w; + } + } + } +} + +RegisterCustomOp(roialign, ROIAlignOp); + +} // namespace cvi diff --git a/cviruntime/custom_op/example/ROIAlignOp.h b/cviruntime/custom_op/example/ROIAlignOp.h new file mode 100644 index 000000000..42774a32b --- /dev/null +++ b/cviruntime/custom_op/example/ROIAlignOp.h @@ -0,0 +1,22 @@ +/* +* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved. +*/ +#ifndef ROI_ALIGN_OP_H_ +#define ROI_ALIGN_OP_H_ + +#include "tpuc/CustomOp.h" + +namespace cvi { + +class ROIAlignOp : public CustomOp { +public: + ROIAlignOp(OpParam ¶m) : CustomOp(param) {} + + void interpretFp32(std::vector>> &operand_tensors, + std::vector> &operand_shapes, + std::shared_ptr> &result_tensor, + std::vector &result_shape); +}; + +} // namespace cvi +#endif diff --git a/cviruntime/custom_op/example/SoftmaxOp.cpp b/cviruntime/custom_op/example/SoftmaxOp.cpp new file mode 100644 index 000000000..fdf3c10e5 --- /dev/null +++ b/cviruntime/custom_op/example/SoftmaxOp.cpp @@ -0,0 +1,74 @@ +/* +* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved. 
+*/ +#include +#include +#include +#include +#include "SoftmaxOp.h" + +namespace cvi { + +void SoftmaxOp::interpretFp32( + std::vector>> &operand_tensors, + std::vector> &operand_shapes, + std::shared_ptr> &result_tensor, + std::vector &result_shape) { + (void)result_shape; + auto axis = param.get("axis"); + auto& shape = operand_shapes[0]; + axis = axis % shape.size(); + + int32_t n = 1, inner_dim = 1; + for(int i = 0; i < axis; ++i) { + n *= shape[i]; + } + + for(size_t i = axis + 1; i < shape.size(); ++i) { + inner_dim *= shape[i]; + } + + int32_t c = shape[axis]; + int32_t dim = c * inner_dim; + + float *max = new float[inner_dim]; + float *sum = new float[inner_dim]; + float *p = operand_tensors[0]->data(); + float *q = result_tensor->data(); + + for (int i = 0; i < n; ++i) { + memcpy(max, p, inner_dim * sizeof(float)); + memset(sum, 0, inner_dim * sizeof(float)); + // find max value accross channel + int c_offset = i *dim; + for (int j = 0; j >> &operand_tensors, + std::vector> &operand_shapes, + std::shared_ptr> &result_tensor, + std::vector &result_shape); +}; + +} // namespace cvi +#endif diff --git a/cviruntime/custom_op/example/UnPoolingOp.cpp b/cviruntime/custom_op/example/UnPoolingOp.cpp new file mode 100644 index 000000000..3d2e52a99 --- /dev/null +++ b/cviruntime/custom_op/example/UnPoolingOp.cpp @@ -0,0 +1,307 @@ +/* + * Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved. + */ +#include "UnPoolingOp.h" +#include "QuantHelper.h" +#include + +#define NPU_SHIFT 5 +#define EU_SHIFT 4 +#define NPU_NUM (1 << NPU_SHIFT) +#define EU_NUM (1 << EU_SHIFT) +#define LOCAL_MEM_SIZE (1 << 15) +#define NEURON_MEMORY 0 +#define WEIGHT_MEMORY 1 + +namespace cvi { + +void UnPoolingOp::interpretFp32( + std::vector>> &operand_tensors, + std::vector> &operand_shapes, + std::shared_ptr> &result_tensor, + std::vector &result_shape) { + unpooling(operand_tensors, operand_shapes, result_tensor, result_shape); +} + +void UnPoolingOp::interpretInt8( + std::vector>> &operand_tensors, + std::vector> &operand_shapes, + std::shared_ptr> &result_tensor, + std::vector &result_shape) { + unpooling(operand_tensors, operand_shapes, result_tensor, result_shape); +} + +void UnPoolingOp::quantizeInt8() { + // support per-tensor only for now + setOpQuantPerchannel(false); + // use rshift and INT8 multiplier + setOpQuantParamType("RSHIFT_AND_M_I8"); + + // quantization + float threshold_x = getPrevOpThreshold(); + float threshold_y = getOpThreshold(); + std::cout << "threshold_y = " << std::to_string(threshold_y) + << ", threshold_x = " << std::to_string(threshold_x) << "\n"; +} + +void UnPoolingOp::codeGenInt8(void *ctx, + std::vector> &operand_shapes, + std::vector &operand_gaddrs, + std::vector &result_shape, + uint64_t result_gaddr, int layer_id) { + int n = operand_shapes[0][0]; + int c = operand_shapes[0][1]; + int h = operand_shapes[0][2]; + int w = operand_shapes[0][3]; + uint64_t data_gaddr = operand_gaddrs[0]; + uint64_t mask_gaddr = operand_gaddrs[1]; + uint64_t ga_output = result_gaddr; + + int scale = param.get("scale"); + int unpool_h = param.get("unpool_h"); + int unpool_w = param.get("unpool_w"); + + unpooling_codegen((cvk_context_t *)ctx, // ctx + layer_id, // layer_id + data_gaddr, // data_gaddr + mask_gaddr, // mask_gaddr + ga_output, // output_gaddr + n, c, h, w, // input shape + scale, unpool_h, unpool_w); +} + +void UnPoolingOp::alloc_lmem(cvk_context_t *ctx, uint32_t tiling_c, uint32_t tiling_h, + uint32_t input_c, uint32_t input_h, uint32_t input_w, + uint32_t output_c, uint32_t 
output_h, uint32_t output_w, + cvk_fmt_t fmt, int eu_align, cvk_tl_t &tl_ifmap, cvk_tl_t &tl_working, + cvk_tl_t &tl_mask, cvk_tl_t &tl_ofmap) { + uint32_t tl_offset = 0; + ctx->ops->lmem_init_tensor(ctx, &tl_ifmap, {1, tiling_c, tiling_h, input_w}, fmt, + eu_align); + tl_ifmap.start_address = tl_offset; + tl_offset += ctx->ops->lmem_tensor_to_size(ctx, tl_ifmap.shape, tl_ifmap.fmt, + tl_ifmap.eu_align); + + ctx->ops->lmem_init_tensor(ctx, &tl_working, {1, tiling_c, tiling_h, output_w}, fmt, + eu_align); + tl_working.start_address = tl_offset; + tl_offset += ctx->ops->lmem_tensor_to_size(ctx, tl_working.shape, tl_working.fmt, + tl_working.eu_align); + + uint32_t tiling_oh = tiling_h * (output_h / input_h); + ctx->ops->lmem_init_tensor(ctx, &tl_mask, {1, tiling_c, tiling_oh, output_w}, fmt, + eu_align); + tl_mask.start_address = tl_offset; + tl_offset += ctx->ops->lmem_tensor_to_size(ctx, tl_mask.shape, tl_mask.fmt, + tl_mask.eu_align); + + ctx->ops->lmem_init_tensor(ctx, &tl_ofmap, {1, tiling_c, tiling_oh, output_w}, fmt, + eu_align); + tl_ofmap.start_address = tl_offset; +} + +void UnPoolingOp::tdma_load(cvk_context_t *ctx, cvk_tl_t *tlp, uint64_t ga_src, + cvk_tg_stride_t stride, int32_t n_pos, int32_t c_pos, int32_t h_pos) { + cvk_tg_t ts_data; + ts_data.base_reg_index = NEURON_MEMORY; + ts_data.fmt = tlp->fmt; + ts_data.start_address = ga_src + stride.n * n_pos + stride.c * c_pos + stride.h * h_pos; + ts_data.shape = {tlp->shape.n, tlp->shape.c, tlp->shape.h, tlp->shape.w}; + ts_data.stride = stride; + + cvk_tdma_g2l_tensor_copy_param_t p1; + p1.src = &ts_data; + p1.dst = tlp; + ctx->ops->tdma_g2l_tensor_copy(ctx, &p1); +} + +void UnPoolingOp::unpooling_compute( + cvk_context_t *ctx, uint32_t layer_id, int scale_h, int scale_w, + cvk_tl_t *tl_ifmap, cvk_tl_t *tl_working, cvk_tl_t *tl_mask, cvk_tl_t *tl_ofmap) { + + cvk_tl_stride_t tl_ifmap_fake_stride = {0, tl_ifmap->stride.c, tl_ifmap->stride.h, tl_ifmap->stride.w}; + cvk_tl_t tl_ifmap_fake = {0}; + tl_ifmap_fake.start_address = tl_ifmap->start_address; + tl_ifmap_fake.fmt = tl_ifmap->fmt; + tl_ifmap_fake.shape = {scale_w, tl_ifmap->shape.c, tl_ifmap->shape.h, tl_ifmap->shape.w}; + tl_ifmap_fake.stride = tl_ifmap_fake_stride; + tl_ifmap_fake.eu_align = tl_ifmap->eu_align; + + cvk_tl_stride_t tl_working_fake_stride = { + tl_working->stride.w, tl_working->stride.c, + tl_working->stride.h, tl_working->stride.w * scale_w}; + cvk_tl_t tl_working_fake = {0}; + tl_working_fake.start_address = tl_working->start_address; + tl_working_fake.fmt = tl_working->fmt; + tl_working_fake.shape = {scale_w, tl_ifmap->shape.c, tl_ifmap->shape.h, tl_ifmap->shape.w}; + tl_working_fake.stride = tl_working_fake_stride; + tl_working_fake.eu_align = tl_working->eu_align; + + cvk_tiu_copy_param_t param = {0}; + param.dst = &tl_working_fake; + param.src = &tl_ifmap_fake; + param.layer_id = layer_id; + ctx->ops->tiu_copy(ctx, ¶m); + + cvk_tl_stride_t tl_working_fake2_stride = {0, tl_working->stride.c, tl_working->stride.h, tl_working->stride.w}; + cvk_tl_t tl_working_fake2 = {0}; + tl_working_fake2.start_address = tl_working->start_address; + tl_working_fake2.fmt = tl_working->fmt; + tl_working_fake2.shape = {scale_h, tl_ofmap->shape.c, tl_ifmap->shape.h, tl_ofmap->shape.w}; + tl_working_fake2.stride = tl_working_fake2_stride; + tl_working_fake2.eu_align = tl_working->eu_align; + + cvk_tl_stride_t tl_ofmap_fake_stride = {tl_ofmap->stride.h, tl_ofmap->stride.c, tl_ofmap->stride.h * scale_h, tl_ofmap->stride.w}; + cvk_tl_t tl_ofmap_fake = {0}; + 
tl_ofmap_fake.start_address = tl_ofmap->start_address; + tl_ofmap_fake.fmt = tl_ofmap->fmt; + tl_ofmap_fake.shape = {scale_h, tl_ofmap->shape.c, tl_ifmap->shape.h, tl_ofmap->shape.w}; + tl_ofmap_fake.stride = tl_ofmap_fake_stride; + tl_ofmap_fake.eu_align = tl_ofmap->eu_align; + + cvk_tiu_copy_param_t param2 = {0}; + param2.dst = &tl_ofmap_fake; + param2.src = &tl_working_fake2; + param2.layer_id = layer_id; + ctx->ops->tiu_copy(ctx, ¶m2); + + cvk_tiu_mul_param_t param3 = {0}; + param3.res_high = nullptr; + param3.res_low = tl_ofmap; + param3.a = tl_ofmap; + param3.b_is_const = 0; + param3.b = tl_mask; + param3.layer_id = layer_id; + param3.rshift_bits = 0; + param3.relu_enable = 0; + ctx->ops->tiu_mul(ctx, ¶m3); +} + +void UnPoolingOp::tdma_store(cvk_context_t *ctx, cvk_tl_t *tlp, + uint64_t ga_dst, cvk_tg_stride_t stride, + uint32_t n_pos, uint32_t c_pos, uint32_t h_pos, + uint32_t crop_h, uint32_t crop_w) { + cvk_tl_t tl_src; + tl_src.start_address = tlp->start_address; + tl_src.fmt = tlp->fmt; + tl_src.shape = {tlp->shape.n, tlp->shape.c, tlp->shape.h - crop_h, tlp->shape.w - crop_w}; + tl_src.stride = tlp->stride; + + cvk_tg_t tg_dst; + tg_dst.base_reg_index = NEURON_MEMORY; + tg_dst.fmt = tlp->fmt; + tg_dst.start_address = ga_dst + stride.n * n_pos + stride.c * c_pos + stride.h * h_pos; + tg_dst.shape = {tlp->shape.n, tlp->shape.c, tlp->shape.h - crop_h, tlp->shape.w - crop_w}; + tg_dst.stride = stride; + + cvk_tdma_l2g_tensor_copy_param_t p1; + p1.src = &tl_src; + p1.dst = &tg_dst; + ctx->ops->tdma_l2g_tensor_copy(ctx, &p1); +} + +void UnPoolingOp::unpooling_codegen(cvk_context_t *ctx, uint32_t layer_id, + uint64_t data_gaddr, uint64_t mask_gaddr, uint64_t output_gaddr, + int input_n, int input_c, int input_h, int input_w, + int scale, int unpool_h, int unpool_w) { + printf("unpooling_codegen:\n" + " layer_id %d\n" + " data_gddr: %lx, mask_gaddr: %lx, output_gaddr: %lx\n" + " input (%d, %d, %d, %d)\n" + " scale:%d, unpool_h:%d, unpool_w:%d\n", + layer_id, data_gaddr, mask_gaddr, output_gaddr, input_n, input_c, input_h, + input_w, scale, unpool_h, unpool_w); + + // Split input based on local memory + uint32_t total_eu = NPU_NUM * EU_NUM; + uint32_t lane_size = LOCAL_MEM_SIZE; + uint32_t total_mem_size = NPU_NUM * LOCAL_MEM_SIZE; + uint32_t max_N = (1 << 12) - 1; // 1880v2: 12 bit + uint32_t max_W = (1 << 12) - 1; // 1880v2: 12 bit + uint32_t count = input_n * input_c * input_h * input_w; + + uint32_t output_c = input_c; + uint32_t output_h = input_h * scale; + uint32_t output_w = input_w * scale; + + uint32_t n_step = 1; + uint32_t c_step = 0; + uint32_t h_step = 0; + + h_step = input_h; + uint32_t h_factor = scale; + + for (; h_step > 0; --h_step) { + uint32_t total_size; + for (c_step = input_c; c_step >= (uint32_t)NPU_NUM ; --c_step) { + cvk_tl_shape_t tiled_ifmap_shape = {1, c_step, h_step, input_w}; + uint32_t tiled_ifmap_size = + ctx->ops->lmem_tensor_to_size(ctx, tiled_ifmap_shape, CVK_FMT_I8, 0); + + cvk_tl_shape_t tiled_working_shape = {1, c_step, h_step, output_w}; + uint32_t tiled_working_size = + ctx->ops->lmem_tensor_to_size(ctx, tiled_working_shape, CVK_FMT_I8, 0); + + cvk_tl_shape_t tiled_ofmap_shape = {1, c_step, h_step * h_factor, output_w}; + uint32_t tiled_ofmap_size = + ctx->ops->lmem_tensor_to_size(ctx, tiled_ofmap_shape, CVK_FMT_I8, 0); + + total_size = tiled_ifmap_size + tiled_working_size + tiled_ofmap_size * 2; + if (total_size <= static_cast(LOCAL_MEM_SIZE)) + break; + } + if (total_size <= static_cast(LOCAL_MEM_SIZE)) + break; + } + + printf("tiling: c_step 
%d, h_step %d\n", c_step, h_step); + assert(c_step && h_step && "Expect valid tiling"); + + cvk_tg_stride_t ifmap_stride = { + input_c * input_h * input_w, + input_h * input_w, + input_w}; + cvk_tg_stride_t mask_stride = { + output_c * output_h * output_w, + output_h * output_w, + output_w}; + cvk_tg_stride_t output_stride = { + output_c * unpool_h * unpool_w, + unpool_h * unpool_w, + unpool_w}; + + uint64_t output_offset = 0; + uint32_t crop_h = 0; + uint32_t crop_w = 0; + for (uint32_t n_pos = 0; n_pos < input_n; n_pos += n_step) { + for (uint32_t c_pos = 0; c_pos < input_c; c_pos += c_step) { + uint32_t tiling_c = std::min(input_c - c_pos, c_step); + for (uint32_t h_pos = 0; h_pos < input_h; h_pos += h_step) { + uint32_t tiling_h = std::min(input_h - h_pos, h_step); + + cvk_tl_t tl_ifmap, tl_ofmap, tl_mask, tl_working; + alloc_lmem(ctx, tiling_c, tiling_h, input_c, input_h, input_w, output_c, + output_h, output_w, CVK_FMT_I8, 0, tl_ifmap, tl_working, + tl_mask, tl_ofmap); + + tdma_load(ctx, &tl_ifmap, data_gaddr, ifmap_stride, n_pos, c_pos, h_pos); + tdma_load(ctx, &tl_mask, mask_gaddr, mask_stride, n_pos, c_pos, h_pos * scale); + + unpooling_compute(ctx, layer_id, scale, scale, &tl_ifmap, &tl_working, &tl_mask, &tl_ofmap); + + uint32_t oh_pos = h_pos * scale; + crop_w = output_w - unpool_w; + if (oh_pos + tiling_h * scale > unpool_h) { + crop_h = oh_pos + tiling_h * scale - unpool_h; + } else { + crop_h = 0; + } + tdma_store(ctx, &tl_ofmap, output_gaddr, output_stride, n_pos, c_pos, h_pos * scale, crop_h, crop_w); + } + } + } +} + +RegisterCustomOp(unpooling, UnPoolingOp); + +} // namespace cvi \ No newline at end of file diff --git a/cviruntime/custom_op/example/UnPoolingOp.h b/cviruntime/custom_op/example/UnPoolingOp.h new file mode 100644 index 000000000..34dfdb044 --- /dev/null +++ b/cviruntime/custom_op/example/UnPoolingOp.h @@ -0,0 +1,104 @@ +/* +* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved. 
+*/
+#ifndef UNPOOLING_OP_H_
+#define UNPOOLING_OP_H_
+
+#include "tpuc/CustomOp.h"
+#include <vector>  // NOTE: this header name was lost in formatting; reconstructed from usage
+
+namespace cvi {
+
+class UnPoolingOp : public CustomOp {
+public:
+  UnPoolingOp(OpParam &param) : CustomOp(param) {}
+
+  // NOTE: the template arguments in the signatures below were stripped by
+  // formatting; they are reconstructed to match the custom-op interface.
+  void interpretFp32(std::vector<std::shared_ptr<std::vector<float>>> &operand_tensors,
+                     std::vector<std::vector<int64_t>> &operand_shapes,
+                     std::shared_ptr<std::vector<float>> &result_tensor,
+                     std::vector<int64_t> &result_shape);
+  void interpretInt8(std::vector<std::shared_ptr<std::vector<float>>> &operand_tensors,
+                     std::vector<std::vector<int64_t>> &operand_shapes,
+                     std::shared_ptr<std::vector<float>> &result_tensor,
+                     std::vector<int64_t> &result_shape);
+  void quantizeInt8();
+  void codeGenInt8(void *ctx,
+                   std::vector<std::vector<int64_t>> &operand_shapes,
+                   std::vector<uint64_t> &operand_gaddrs,
+                   std::vector<int64_t> &result_shape, uint64_t result_gaddr,
+                   int layer_id);
+
+private:
+  void alloc_lmem(cvk_context_t *ctx, uint32_t tiling_c, uint32_t tiling_h,
+                  uint32_t input_c, uint32_t input_h, uint32_t input_w,
+                  uint32_t output_c, uint32_t output_h, uint32_t output_w,
+                  cvk_fmt_t fmt, int eu_align, cvk_tl_t &tl_ifmap, cvk_tl_t &tl_working,
+                  cvk_tl_t &tl_ofmap, cvk_tl_t &tl_mask);
+  void tdma_load(cvk_context_t *ctx, cvk_tl_t *tlp, uint64_t ga_src, cvk_tg_stride_t stride,
+                 int n_pos, int c_pos, int h_pos);
+  void unpooling_compute(cvk_context_t *ctx, uint32_t layer_id, int scale_h, int scale_w,
+                         cvk_tl_t *tl_ifmap, cvk_tl_t *tl_working, cvk_tl_t *tl_mask, cvk_tl_t *tl_ofmap);
+  void tdma_store(cvk_context_t *ctx, cvk_tl_t *tlp, uint64_t ga_dst, cvk_tg_stride_t stride,
+                  uint32_t n_pos, uint32_t c_pos, uint32_t h_pos, uint32_t crop_h, uint32_t crop_w);
+  void unpooling_codegen(cvk_context_t *ctx, uint32_t layer_id,
+                         uint64_t data_gaddr, uint64_t mask_gaddr, uint64_t output_gaddr,
+                         int input_n, int input_c, int input_h, int input_w,
+                         int scale, int unpool_h, int unpool_w);
+
+  void unpooling(std::vector<std::shared_ptr<std::vector<float>>> &operand_tensors,
+                 std::vector<std::vector<int64_t>> &operand_shapes,
+                 std::shared_ptr<std::vector<float>> &result_tensor,
+                 std::vector<int64_t> &result_shape) {
+    int in = operand_shapes[0][0];
+    int ic = operand_shapes[0][1];
+    int ih = operand_shapes[0][2];
+    int iw = operand_shapes[0][3];
+
+    int oh = result_shape[2];
+    int ow = result_shape[3];
+
+    float *data = operand_tensors[0]->data();
+    float *mask = operand_tensors[1]->data();
+    float *output = result_tensor->data();
+    auto scale = param.get<int32_t>("scale");
+    auto unpool_h = param.get<int32_t>("unpool_h");
+    auto unpool_w = param.get<int32_t>("unpool_w");
+
+    assert(oh == unpool_h);
+    assert(ow == unpool_w);
+
+    int sh = ih * scale;
+    int sw = iw * scale;
+    // always use float to store int8 value
+    std::vector<float> tmp_out(in * ic * sh * sw);
+
+    for (int n = 0; n < in; n++) {
+      for (int c = 0; c < ic; c++) {
+        for (int h = 0; h < sh; h++) {
+          for (int w = 0; w < sw; w++) {
+            int isw = w / scale;
+            int ish = h / scale;
+            int out_idx = ((n * ic + c) * sh + h) * sw + w;
+            int in_idx = ((n * ic + c) * ih + ish) * iw + isw;
+            tmp_out[out_idx] = data[in_idx] * mask[out_idx];
+          }
+        }
+      }
+    }
+
+    for (int n = 0; n < in; n++) {
+      for (int c = 0; c < ic; c++) {
+        for (int h = 0; h < oh; h++) {
+          for (int w = 0; w < ow; w++) {
+            int out_idx = ((n * ic + c) * oh + h) * ow + w;
+            int in_idx = ((n * ic + c) * sh + h) * sw + w;
+            output[out_idx] = tmp_out[in_idx];
+          }
+        }
+      }
+    }
+  }
+};
+
+} // namespace cvi
+#endif
diff --git a/cviruntime/custom_op/example/build.sh b/cviruntime/custom_op/example/build.sh
new file mode 100755
index 000000000..35280fbeb
--- /dev/null
+++ b/cviruntime/custom_op/example/build.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+set -e
+
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+INSTALL_PATH=$DIR/install
+if [[ !
-e $INSTALL_PATH ]]; then + mkdir $DIR/install +fi +if [ -z "$ARM_TOOLCHAIN_GCC_PATH" ]; then + ARM_TOOLCHAIN_GCC_PATH=$TPU_BASE/host-tools/gcc/gcc-linaro-6.3.1-2017.05-x86_64_aarch64-linux-gnu +fi +export PATH=$ARM_TOOLCHAIN_GCC_PATH/bin:$PATH +export TOOLCHAIN_FILE_PATH=$DIR/cmake/toolchain-aarch64-linux.cmake +export MLIR_INCLUDE=$TPU_BASE/cvitek_mlir/include +export CVIRUNTIME_INCLUDE=$MLIR_INCLUDE +export AARCH64_SYSROOT_PATH=$TPU_BASE/cvitek_sysroot + +if [[ ! -e $DIR/build ]]; then + mkdir $DIR/build +fi +pushd $DIR/build +rm -rf * +cmake -DMLIR_INCLUDE=$MLIR_INCLUDE \ + -DCMAKE_INSTALL_PREFIX=$INSTALL_PATH .. +make install +rm -rf * +cmake -DCVIRUNTIME_INCLUDE=$CVIRUNTIME_INCLUDE \ + -DCMAKE_INSTALL_PREFIX=$INSTALL_PATH ../runtime +make install +rm -rf * +cmake -DCMAKE_SYSROOT=$AARCH64_SYSROOT_PATH \ + -DCMAKE_TOOLCHAIN_FILE=$TOOLCHAIN_FILE_PATH \ + -DCVIRUNTIME_INCLUDE=$CVIRUNTIME_INCLUDE \ + -DCMAKE_INSTALL_PREFIX=$INSTALL_PATH ../runtime +make install +popd \ No newline at end of file diff --git a/cviruntime/custom_op/example/cmake/toolchain-aarch64-linux.cmake b/cviruntime/custom_op/example/cmake/toolchain-aarch64-linux.cmake new file mode 100644 index 000000000..80a02d052 --- /dev/null +++ b/cviruntime/custom_op/example/cmake/toolchain-aarch64-linux.cmake @@ -0,0 +1,54 @@ +include(CMakeForceCompiler) + +# The Generic system name is used for embedded targets (targets without OS) in +# CMake +set( CMAKE_SYSTEM_NAME Linux ) +set( CMAKE_SYSTEM_PROCESSOR aarch64 ) + +# The toolchain prefix for all toolchain executables +set( ARCH arm64 ) + +# specify the cross compiler. We force the compiler so that CMake doesn't +# attempt to build a simple test program as this will fail without us using +# the -nostartfiles option on the command line +if(DEFINED ENV{CROSS_COMPILE_64}) + set(CROSS_COMPILE $ENV{CROSS_COMPILE_64}) +else() + set(CROSS_COMPILE aarch64-linux-gnu-) +endif() + +set(CMAKE_C_COMPILER ${CROSS_COMPILE}gcc) +set(CMAKE_CXX_COMPILER ${CROSS_COMPILE}g++) + +message(STATUS "CMAKE_C_COMPILER: ${CMAKE_C_COMPILER}") +message(STATUS "CMAKE_CXX_COMPILER: ${CMAKE_CXX_COMPILER}") + +# To build the tests, we need to set where the target environment containing +# the required library is. On Debian-like systems, this is +# /usr/aarch64-linux-gnu. +SET(CMAKE_FIND_ROOT_PATH ${AARCH64_SYSROOT_PATH}) +# search for programs in the build host directories +SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +# for libraries and headers in the target directories +SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) + +# We must set the OBJCOPY setting into cache so that it's available to the +# whole project. Otherwise, this does not get set into the CACHE and therefore +# the build doesn't know what the OBJCOPY filepath is +set(CMAKE_OBJCOPY ${CROSS_COMPILE}objcopy + CACHE FILEPATH "The toolchain objcopy command " FORCE ) + +# Set the CMAKE C flags (which should also be used by the assembler! 
+set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Os -std=gnu11" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffunction-sections" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fdata-sections" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-pointer-to-int-cast" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-missing-field-initializers" ) + +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "" ) +set( CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "" ) + +set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-field-initializers" ) +set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses" ) diff --git a/cviruntime/custom_op/example/resnet18_convert.py b/cviruntime/custom_op/example/resnet18_convert.py new file mode 100644 index 000000000..c9be0c1de --- /dev/null +++ b/cviruntime/custom_op/example/resnet18_convert.py @@ -0,0 +1,97 @@ +#!/usr/bin/python3 +""" +Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved. +""" + +import onnx +from cvi_toolkit.transform.BaseConverter import TensorType +from cvi_toolkit.transform.onnx_converter import OnnxConverter +from cvi_toolkit.transform.tflite_converter_int8 import TFLiteConverter +from cvi_toolkit.transform.tensorflow_converter import TFConverter +from cvi_toolkit.utils.log_setting import setup_logger +from cvi_toolkit.data.preprocess import add_preprocess_parser, preprocess + +logger = setup_logger('root', log_level="INFO") + +class MyOnnxConverter(OnnxConverter): + def __init__(self, model_name, onnx_model, mlir_file_path, batch_size=1, preprocessor=None): + super().__init__(model_name, onnx_model, mlir_file_path, batch_size, preprocessor.to_dict()) + self.onnxop_factory['LeakyRelu'] = lambda node: self.convert_leaky_relu(node); + + def convert_graph(self): + """convert all to mlir""" + + # add input op + # add input op + for idx, input in enumerate(self.input_nodes): + input_shape = list() + for i, dim in enumerate(input.type.tensor_type.shape.dim): + # batch size + # dim is zero, mean mutli batch + if i == 0 and dim.dim_value <= 0: + input_shape.append(self.batch_size) + else: + input_shape.append(dim.dim_value) + + if not self.preprocess_args: + input_op = self.CVI.add_input_op(input.name, idx, **{}) + else: + preprocess_hint = { + 'mean': self.preprocess_args['perchannel_mean'], + 'scale': self.preprocess_args['perchannel_scale'], + 'pixel_format': self.preprocess_args["pixel_format"], + 'channel_order': self.preprocess_args["channel_order"], + 'aligned': self.preprocess_args["aligned"], + 'resize_dims': self.preprocess_args['resize_dims'], + 'keep_aspect_ratio': self.preprocess_args['keep_aspect_ratio'] + } + # add input op + input_op = self.CVI.add_input_op(input.name, idx, **preprocess_hint) + self.addOperand(input.name, input_op, input_shape, TensorType.ACTIVATION) + + def NoneAndRaise(node): + raise RuntimeError("{} Op not support now".format(node.op_type)) + # add node op + for n in self.converted_nodes: + self.onnxop_factory.get(n.op_type, lambda x: NoneAndRaise(x))(n) + + self.add_softmax_op() + # add return op + return_op = list() + # Set output + op, _, _ = self.getOperand("prob") + return_op.append(op) + + self.CVI.add_return_op(return_op) + mlir_txt = self.CVI.print_module() + with open(self.mlir_file_path, "w") as f: + f.write(mlir_txt) + + def add_softmax_op(self): + softmax_op_param = { + 'tpu': False, + 'do_quant': False, + 'operation_name': 'mysoftmax', + 'threshold_overwrite': 'none', + 'param': { + 'axis': 1 + } + } + op, input_shape, tensor_type = self.getOperand('output') + 
operands = list() + operands.append(op) + output_shape = input_shape + custom_op = self.CVI.add_custom_op("prob_softmax", operands, output_shape, **softmax_op_param) + self.addOperand("prob", custom_op, output_shape, TensorType.ACTIVATION) + +if __name__ == "__main__": + onnx_model = onnx.load('model/resnet18.onnx') + preprocessor = preprocess() + preprocessor.config(net_input_dims="224,224", + resize_dims="256,256", crop_method='center', keep_aspect_ratio=False, + raw_scale=1.0, mean='0.406,0.456,0.485', std='0.225,0.224,0.229', input_scale=1.0, + channel_order='bgr', pixel_format=None, data_format='nchw', + aligned=False, gray=False) + c = MyOnnxConverter('resnet18', 'model/resnet18.onnx', + 'resnet18.mlir', batch_size=1, preprocessor=preprocessor) + c.run() diff --git a/cviruntime/custom_op/example/roialign.mlir b/cviruntime/custom_op/example/roialign.mlir new file mode 100644 index 000000000..3e94742c4 --- /dev/null +++ b/cviruntime/custom_op/example/roialign.mlir @@ -0,0 +1,11 @@ + + +module { + func @tpu_func(%arg0 : tensor<1x512x38x50xf32>, %arg1 : tensor<1x1x300x5xf32>) -> tensor<300x512x7x7xf32> { + %0 = "tpu.weight_file"() {filename = "roialign_1_06eeeb7e.npz"} : () -> memref<10xf32> + %1 = "tpu.input"(%arg0) {name = "data0", quant = {is_asymmetric = false, is_perchannel = false, mode = "NONE", param_type = "NONE", threshold_max = 0.000000e+00 : f32, threshold_min = 0.000000e+00 : f32, zero_point = 0 : i32}} : (tensor<1x512x38x50xf32>) -> tensor<1x512x38x50xf32> + %2 = "tpu.input"(%arg1) {name = "data1", quant = {is_asymmetric = false, is_perchannel = false, mode = "NONE", param_type = "NONE", threshold_max = 0.000000e+00 : f32, threshold_min = 0.000000e+00 : f32, zero_point = 0 : i32}} : (tensor<1x1x300x5xf32>) -> tensor<1x1x300x5xf32> + %3 = "tpu.custom_op"(%1, %2) {name = "roi_align", operation_name = "roialign", param = {pooled_h = 7 : i32, pooled_w = 7 : i32, spatial_scale = 6.250000e-02 : f32}, quant = {is_asymmetric = false, is_perchannel = false, mode = "NONE", param_type = "NONE", threshold_max = 0.000000e+00 : f32, threshold_min = 0.000000e+00 : f32, zero_point = 0 : i32}} : (tensor<1x512x38x50xf32>, tensor<1x1x300x5xf32>) -> tensor<300x512x7x7xf32> + return %3 : tensor<300x512x7x7xf32> + } +} diff --git a/cviruntime/custom_op/example/runtime/CMakeLists.txt b/cviruntime/custom_op/example/runtime/CMakeLists.txt new file mode 100644 index 000000000..6c4192cf9 --- /dev/null +++ b/cviruntime/custom_op/example/runtime/CMakeLists.txt @@ -0,0 +1,20 @@ +cmake_minimum_required(VERSION 2.8.0) +project(custom_cpu_function CXX) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS ON) + +include_directories(${PROJECT_SOURCE_DIR}) +include_directories(${CVIRUNTIME_INCLUDE}) + +add_library(CustomOpRuntime SHARED + SoftmaxOpRuntime.cpp + ROIAlignOpRuntime.cpp + OpRuntimeRegister.cpp) +if(NOT CMAKE_CROSSCOMPILING) + set_target_properties(CustomOpRuntime PROPERTIES SUFFIX "_x86.so") +else() + set_target_properties(CustomOpRuntime PROPERTIES SUFFIX "_arm64.so") +endif() +install(TARGETS CustomOpRuntime DESTINATION lib/custom_op/) diff --git a/cviruntime/custom_op/example/runtime/OpRuntimeRegister.cpp b/cviruntime/custom_op/example/runtime/OpRuntimeRegister.cpp new file mode 100644 index 000000000..9ab211022 --- /dev/null +++ b/cviruntime/custom_op/example/runtime/OpRuntimeRegister.cpp @@ -0,0 +1,12 @@ +/* +* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved. 
+*/
+// NOTE: one angle-bracketed include was lost in formatting here; the
+// REGISTER_OP_RUNTIME_FUNCS macro comes from the cviruntime custom-op headers.
+#include "SoftmaxOpRuntime.hpp"
+#include "ROIAlignOpRuntime.hpp"
+
+REGISTER_OP_RUNTIME_FUNCS(
+    {(char *)"mysoftmax", SoftmaxOpRuntime::open},
+    {(char *)"roialign", ROIAlignOpRuntime::open}
+    // add more custom op runtime func here.
+);
diff --git a/cviruntime/custom_op/example/runtime/ROIAlignOpRuntime.cpp b/cviruntime/custom_op/example/runtime/ROIAlignOpRuntime.cpp
new file mode 100644
index 000000000..9faad99b5
--- /dev/null
+++ b/cviruntime/custom_op/example/runtime/ROIAlignOpRuntime.cpp
@@ -0,0 +1,124 @@
+#include <algorithm>  // NOTE: the original header names were stripped by
+#include <cassert>    // formatting; these standard headers are reconstructed
+#include <cmath>      // from the calls used below (std::min/max, assert,
+#include <cstdint>    // std::ceil, std::numeric_limits, std::swap, vectors)
+#include <limits>
+#include <memory>
+#include <vector>
+#include "ROIAlignOpRuntime.hpp"
+
+
+ROIAlignOpRuntime::~ROIAlignOpRuntime() {}
+
+// NOTE: the container element types were stripped by formatting;
+// shared_ptr<Neuron> follows the runtime's custom-op interface.
+void ROIAlignOpRuntime::setup(std::vector<std::shared_ptr<cvi::runtime::Neuron>> &inputs,
+                              std::vector<std::shared_ptr<cvi::runtime::Neuron>> &outputs,
+                              cvi::OpParam &param) {
+  pooled_h = param.get<int32_t>("pooled_h");
+  pooled_w = param.get<int32_t>("pooled_w");
+  spatial_scale = param.get<float>("spatial_scale");
+
+  auto on = outputs[0]->shape[0];
+  auto oc = outputs[0]->shape[1];
+  if (inputs[0]->shape[1] == oc && inputs[1]->shape[2] == on) {
+    _bottoms = inputs;
+  } else {
+    std::swap(inputs[0], inputs[1]);
+    _bottoms = inputs;
+  }
+
+  _tops = outputs;
+}
+
+void ROIAlignOpRuntime::run() {
+  auto top_data = _tops[0]->cpu_data();
+
+  size_t bottom_count = _bottoms.size();
+  assert(bottom_count == 2);
+
+  float *data = (float *)_bottoms[0]->cpu_data();
+  float *rois = (float *)_bottoms[1]->cpu_data();
+
+  int num_rois = _bottoms[1]->shape[2];
+  int batch = _bottoms[0]->shape[0];
+  int channel = _bottoms[0]->shape[1];
+  int height = _bottoms[0]->shape[2];
+  int width = _bottoms[0]->shape[3];
+
+  for (int b = 0; b < batch; ++b) {
+    auto batch_rois = rois + _bottoms[1]->offset(b);
+    auto batch_output = top_data + b * num_rois * channel * pooled_h * pooled_w;
+    for (int roi_idx = 0; roi_idx < num_rois; ++roi_idx) {
+      const int roi_batch_idx = batch_rois[roi_idx * 5];
+      assert(roi_batch_idx == b);
+
+      const float roi_start_x = batch_rois[roi_idx * 5 + 1] * spatial_scale;
+      const float roi_start_y = batch_rois[roi_idx * 5 + 2] * spatial_scale;
+      const float roi_end_x = batch_rois[roi_idx * 5 + 3] * spatial_scale;
+      const float roi_end_y = batch_rois[roi_idx * 5 + 4] * spatial_scale;
+
+      const float roi_w = std::max(roi_end_x - roi_start_x + 1, 1.0f);
+      const float roi_h = std::max(roi_end_y - roi_start_y + 1, 1.0f);
+
+      float bin_size_w = roi_w / (float)pooled_w;
+      float bin_size_h = roi_h / (float)pooled_h;
+
+      float* batch_data = data + b * channel * height * width;
+
+      for (int c = 0; c < channel; ++c) {
+        for (int ph = 0; ph < pooled_h; ++ph) {
+          for (int pw = 0; pw < pooled_w; ++pw) {
+            const float region_start_x = std::min(pw * bin_size_w + roi_start_x, (float)(width));
+            const float region_start_y = std::min(ph * bin_size_h + roi_start_y, (float)(height));
+            const float region_end_x = std::min((pw+1) * bin_size_w + roi_start_x, (float)(width));
+            const float region_end_y = std::min((ph+1) * bin_size_h + roi_start_y, (float)(height));
+
+            const int region_grid_w = int(std::ceil(bin_size_w));
+            const int region_grid_h = int(std::ceil(bin_size_h));
+
+            const int output_idx = ph * pooled_w + pw;
+            if (region_start_x >= region_end_x || region_start_y >= region_end_y) {
+              batch_output[output_idx] = 0;
+              continue;
+            }
+
+            float value = 0;
+            // lowest(), not min(): for floats, min() is the smallest positive
+            // value, which would break the max search over negative inputs.
+            float fmax = std::numeric_limits<float>::lowest();
+            for (int gh = 0; gh < region_grid_h; ++gh) {
+              for (int gw = 0; gw < region_grid_w; ++gw) {
+                float x = roi_start_x + gw;
+                float y = roi_start_y + gh;
+
+                const int x_low = x;
+                const int y_low = y;
+
+                const int x_high = x_low + 1;
const int y_high = y_low + 1;
+
+                const float x_ratio = x - x_low;
+                const float y_ratio = y - y_low;
+
+                const float w1 = (1 - y_ratio) * (1 - x_ratio);
+                const float w2 = (1 - y_ratio) * x_ratio;
+                const float w3 = y_ratio * (1 - x_ratio);
+                const float w4 = y_ratio * x_ratio;
+
+                // Row-major (h, w) layout: the stride of one row is `width`;
+                // the original indexed with `height` here, which is only
+                // correct for square feature maps.
+                const float data1 = batch_data[y_low * width + x_low];
+                const float data2 = batch_data[y_low * width + x_high];
+                const float data3 = batch_data[y_high * width + x_low];
+                const float data4 = batch_data[y_high * width + x_high];
+                value = w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
+                if (value > fmax) {
+                  fmax = value;
+                }
+              }
+            }
+            batch_output[output_idx] = fmax;
+          }
+        }
+
+        batch_data += height * width;
+        batch_output += pooled_h * pooled_w;
+      }
+    }
+  }
+}
diff --git a/cviruntime/custom_op/example/runtime/ROIAlignOpRuntime.hpp b/cviruntime/custom_op/example/runtime/ROIAlignOpRuntime.hpp
new file mode 100644
index 000000000..f53be3228
--- /dev/null
+++ b/cviruntime/custom_op/example/runtime/ROIAlignOpRuntime.hpp
@@ -0,0 +1,30 @@
+#include <memory>  // NOTE: the original include list was stripped by formatting;
+#include <vector>  // the cviruntime custom-op headers declaring
+                   // cvi::runtime::ICpuFunction and cvi::OpParam are also needed.
+
+class ROIAlignOpRuntime : public cvi::runtime::ICpuFunction {
+
+public:
+  ROIAlignOpRuntime() {}
+
+  ~ROIAlignOpRuntime();
+  void setup(std::vector<std::shared_ptr<cvi::runtime::Neuron>> &inputs,
+             std::vector<std::shared_ptr<cvi::runtime::Neuron>> &outputs,
+             cvi::OpParam &param);
+  void run();
+
+  static ICpuFunction *open() { return new ROIAlignOpRuntime(); }
+  static void close(ICpuFunction *func) { delete func; }
+
+private:
+  // NOTE: element type reconstructed; the runtime hands tensors to custom
+  // ops as shared pointers (assumed Neuron here).
+  std::vector<std::shared_ptr<cvi::runtime::Neuron>> _bottoms;
+  std::vector<std::shared_ptr<cvi::runtime::Neuron>> _tops;
+
+  int pooled_h;
+  int pooled_w;
+  float spatial_scale;
+};
diff --git a/cviruntime/custom_op/example/runtime/SoftmaxOpRuntime.cpp b/cviruntime/custom_op/example/runtime/SoftmaxOpRuntime.cpp
new file mode 100644
index 000000000..50bcba8a7
--- /dev/null
+++ b/cviruntime/custom_op/example/runtime/SoftmaxOpRuntime.cpp
@@ -0,0 +1,74 @@
+/*
+* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
+*/
+#include <cassert>  // NOTE: the original header names were lost in formatting;
+#include <cmath>    // reconstructed from the assert / std::exp / memcpy /
+#include <cstring>  // memset calls used below
+#include "SoftmaxOpRuntime.hpp"
+
+SoftmaxOpRuntime::~SoftmaxOpRuntime() {
+  if (_max)
+    delete[] _max;
+  if (_sum)
+    delete[] _sum;
+}
+
+void SoftmaxOpRuntime::setup(std::vector<std::shared_ptr<cvi::runtime::Neuron>> &inputs,
+                             std::vector<std::shared_ptr<cvi::runtime::Neuron>> &outputs,
+                             cvi::OpParam &param) {
+  _bottom = inputs[0];
+  _top = outputs[0];
+  _axis = param.get<int32_t>("axis");
+  assert(_axis >= 0);
+  auto shape = _bottom->shape;
+  _axis = _axis % shape.size();
+
+  _n = 1;
+  for(int i = 0; i < _axis; ++i) {
+    _n *= shape[i];
+  }
+
+  _inner_dim = 1;
+  for(size_t i = _axis+1; i < shape.size(); ++i) {
+    _inner_dim *= shape[i];
+  }
+
+  _c = shape[_axis];
+  _dim = _c * _inner_dim;
+
+  _max = new float[_inner_dim];
+  _sum = new float[_inner_dim];
+}
+
+void SoftmaxOpRuntime::run() {
+  auto bottom_data = _bottom->cpu_data();
+  auto top_data = _top->cpu_data();
+
+  for (int i = 0; i < _n; ++i) {
+    // seed the max from this outer slice (the original copied from the start
+    // of the buffer regardless of i)
+    memcpy(_max, bottom_data + i * _dim, _inner_dim * sizeof(float));
+    memset(_sum, 0, _inner_dim * sizeof(float));
+    // find max value across channel
+    int c_offset = i * _dim;
+    for (int j = 0; j < _c; ++j, c_offset += _inner_dim) {
+      for (int k = 0; k < _inner_dim; k++) {
+        if (_max[k] < bottom_data[c_offset + k])
+          _max[k] = bottom_data[c_offset + k];
+      }
+    }
+
+    // calculate exp(x)
+    c_offset = i * _dim;
+    for (int j = 0; j < _c; ++j, c_offset += _inner_dim) {
+      for (int k = 0; k < _inner_dim; k++) {
+        top_data[c_offset + k] = std::exp(bottom_data[c_offset + k] - _max[k]);
+        _sum[k] += top_data[c_offset + k];
+      }
+    }
+
+    c_offset = i * _dim;
+    for (int j = 0; j < _c; ++j, c_offset += _inner_dim) {
+      for (int k = 0; k < _inner_dim; k++) {
+        top_data[c_offset + k] /= _sum[k];
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/cviruntime/custom_op/example/runtime/SoftmaxOpRuntime.hpp b/cviruntime/custom_op/example/runtime/SoftmaxOpRuntime.hpp
new file mode 100644
index 000000000..507bfea62
--- /dev/null
+++ b/cviruntime/custom_op/example/runtime/SoftmaxOpRuntime.hpp
@@ -0,0 +1,36 @@
+/*
+* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
+*/
+#include <memory>  // NOTE: original include list stripped by formatting; the
+#include <vector>  // cviruntime custom-op headers (ICpuFunction, OpParam,
+                   // Neuron) are required here as well.
+
+class SoftmaxOpRuntime : public cvi::runtime::ICpuFunction {
+
+public:
+  SoftmaxOpRuntime() = default;
+  ~SoftmaxOpRuntime();
+
+private:
+  std::shared_ptr<cvi::runtime::Neuron> _bottom;
+  std::shared_ptr<cvi::runtime::Neuron> _top;
+  int _axis;
+  int _inner_dim;
+  int _dim;
+  int _c;
+  int _n;
+  float *_max = nullptr;
+  float *_sum = nullptr;
+
+public:
+  static ICpuFunction *open() { return new SoftmaxOpRuntime(); }
+
+  void setup(std::vector<std::shared_ptr<cvi::runtime::Neuron>> &inputs,
+             std::vector<std::shared_ptr<cvi::runtime::Neuron>> &outputs,
+             cvi::OpParam &param);
+  void run();
+
+};
diff --git a/cviruntime/custom_op/example/segnet_convert.py b/cviruntime/custom_op/example/segnet_convert.py
new file mode 100755
index 000000000..0dc93e500
--- /dev/null
+++ b/cviruntime/custom_op/example/segnet_convert.py
@@ -0,0 +1,70 @@
+#!/usr/bin/python3
+"""
+Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
+""" + +from argparse import ArgumentParser +from cvi_toolkit.transform.BaseConverter import TensorType +from cvi_toolkit.transform.caffe_converter import CaffeConverter +from cvi_toolkit.utils.log_setting import setup_logger +from cvi_toolkit.data.preprocess import preprocess + +logger = setup_logger('root', log_level="INFO") + +class MyCaffeConverter(CaffeConverter): + def __init__(self, model_name, prototxt, caffe_model, mlir_file_path, batch_size=1): + super().__init__(model_name, prototxt, caffe_model, mlir_file_path, batch_size) + self.caffeop_factory['Upsample'] = lambda layer: self.convert_unpooling_op(layer); + + + def convert_unpooling_op(self, layer): + assert(self.layerType(layer) == "Upsample") + data, data_shape, _ = self.getOperand(layer.bottom[0]) + mask, mask_shape, _ = self.getOperand(layer.bottom[1]) + operands = list() + operands.append(data) + operands.append(mask) + + p = layer.upsample_param + scale = p.scale + if p.HasField("upsample_h"): + unpool_h = p.upsample_h + else: + unpool_h = mask_shape[2] + if p.HasField("upsample_w"): + unpool_w = p.upsample_w + else: + unpool_w = mask_shape[3] + + output_shape = [data_shape[0], data_shape[1], unpool_h, unpool_w] + + custom_op_param = { + 'tpu': True, + 'do_quant': True, + 'operation_name': 'unpooling', + 'threshold_overwrite': 'backward', + 'param': { + 'unpool_h': unpool_h, + 'unpool_w': unpool_w, + 'scale': scale + } + } + print("layer name: {}, top name: {}\n".format(layer.name, layer.top[0])) + custom_op = self.CVI.add_custom_op(layer.name, + operands, output_shape, **custom_op_param) + self.addOperand(layer.top[0], custom_op, output_shape, TensorType.ACTIVATION) + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("--model_path", type=str) + parser.add_argument("--model_dat", type=str) + parser.add_argument("--mlir_file_path", type=str) + args = parser.parse_args() + + #preprocessor = preprocess() + #preprocessor.config(net_input_dims="360,480", + # resize_dims="360,480") + + c = MyCaffeConverter('segnet', args.model_path, args.model_dat, + args.mlir_file_path, batch_size=1) + c.run() diff --git a/cviruntime/custom_op/example/yolo_v3_convert.py b/cviruntime/custom_op/example/yolo_v3_convert.py new file mode 100644 index 000000000..032b1a4fe --- /dev/null +++ b/cviruntime/custom_op/example/yolo_v3_convert.py @@ -0,0 +1,52 @@ +#!/usr/bin/python3 +""" +Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved. 
+""" + +import onnx +from cvi_toolkit.transform.BaseConverter import TensorType +from cvi_toolkit.transform.onnx_converter import OnnxConverter +from cvi_toolkit.transform.tflite_converter_int8 import TFLiteConverter +from cvi_toolkit.transform.tensorflow_converter import TFConverter +from cvi_toolkit.utils.log_setting import setup_logger +from cvi_toolkit.data.preprocess import add_preprocess_parser, preprocess + +logger = setup_logger('root', log_level="INFO") + +class MyOnnxConverter(OnnxConverter): + def __init__(self, model_name, onnx_model, mlir_file_path, batch_size=1, preprocessor=None): + super().__init__(model_name, onnx_model, mlir_file_path, batch_size, preprocessor.to_dict()) + self.onnxop_factory['LeakyRelu'] = lambda node: self.convert_leaky_relu(node); + + def convert_leaky_relu(self, onnx_node): + assert(onnx_node.op_type == "LeakyRelu") + alpha = onnx_node.attrs.get("alpha", 0.01) + custom_op_param = { + 'tpu': True, + 'do_quant': True, + 'operation_name': 'leaky_relu', + 'threshold_overwrite': 'backward', + 'param': { + 'negative_slope': float(alpha) + } + } + op, input_shape, tensor_type = self.getOperand(onnx_node.inputs[0]) + operands = list() + operands.append(op) + output_shape = input_shape + custom_op = self.CVI.add_custom_op("{}_{}".format(onnx_node.name, onnx_node.op_type), + operands, output_shape, **custom_op_param) + self.addOperand(onnx_node.name, custom_op, output_shape, TensorType.ACTIVATION) + + +if __name__ == "__main__": + onnx_model = onnx.load('model/yolov3-416.onnx') + preprocessor = preprocess() + preprocessor.config(net_input_dims="416,416", + resize_dims="416,416", crop_method='center', keep_aspect_ratio=True, + raw_scale=1.0, mean='0,0,0', std='1,1,1', input_scale=1.0, + channel_order='bgr', pixel_format=None, data_format='nchw', + aligned=False, gray=False) + c = MyOnnxConverter('yolo_v3', 'model/yolov3-416.onnx', + 'yolo_v3_416.mlir', batch_size=1, preprocessor=preprocessor) + c.run() \ No newline at end of file diff --git a/cviruntime/doc/assets/cmdbuf_debug.png b/cviruntime/doc/assets/cmdbuf_debug.png new file mode 100644 index 000000000..d446fb4b8 Binary files /dev/null and b/cviruntime/doc/assets/cmdbuf_debug.png differ diff --git a/cviruntime/doc/assets/cvimodel.png b/cviruntime/doc/assets/cvimodel.png new file mode 100755 index 000000000..e1c8cc36f Binary files /dev/null and b/cviruntime/doc/assets/cvimodel.png differ diff --git a/cviruntime/doc/assets/framework.jpg b/cviruntime/doc/assets/framework.jpg new file mode 100755 index 000000000..9a21fadd8 Binary files /dev/null and b/cviruntime/doc/assets/framework.jpg differ diff --git a/cviruntime/doc/assets/logo_0.png b/cviruntime/doc/assets/logo_0.png new file mode 100755 index 000000000..a9d4a3b91 Binary files /dev/null and b/cviruntime/doc/assets/logo_0.png differ diff --git a/cviruntime/doc/cvitek_tpu_sdk_development_manual.md b/cviruntime/doc/cvitek_tpu_sdk_development_manual.md new file mode 100755 index 000000000..b1edceb7f --- /dev/null +++ b/cviruntime/doc/cvitek_tpu_sdk_development_manual.md @@ -0,0 +1,1324 @@ +![image](./assets/logo_0.png) + +# CVITEK TPU SDK开发指南 + +> +>文档版本: 1.5.10 +> +>发布日期: 2023-04-17 +> +>适用于CV183x/CV182x/CV181x/CV180x系列芯片 + + + +本文件所含信息归© 算能2022所有。 + +未经授权,严禁全部或部分复制或披露该等信息。 + +
+
+# Legal Notice
+
+Copyright © Sophgo 2022. All rights reserved.
+No part of this document may be excerpted, reproduced, or transmitted in any form without prior written permission from the company.
+
+**Notice**
+The products, services, or features you purchase are subject to Sophgo's commercial contracts and terms, and all or part of the products, services, or features described in this document may be outside the scope of your purchase or use. Unless otherwise agreed by contract, Sophgo makes no representations or warranties, express or implied, regarding the contents of this document. Due to product version upgrades or other reasons, this document is updated from time to time. Unless otherwise agreed, it serves only as usage guidance, and all statements, information, and recommendations in it constitute no warranty of any kind, express or implied.
+
+
+# Table of Contents
+
+[TOC]
+
+
+# Chapter 1 Overview
+
+## 1.1 About the TPU
+
+> The CVITEK TPU is the heterogeneous compute engine of the edge-computing SoC platforms (CV183x/CV182x/CV181x/CV180x) developed by CVITEK. It provides efficient hardware acceleration for mainstream neural-network operations, balancing execution efficiency with programmable flexibility. For compute precision it supports both the efficient INT8 mode and the high-dynamic-range BF16 mode, and the software platform allows flexible mixed-precision configuration.
+
+
+
+## 1.2 About the Toolchain
+
+> The CVITEK TPU toolchain is an efficient, open, transparent, and extensible set of neural-network compilation and optimization tools that keeps the entire model-porting flow under the user's control.
+
+
+
+## 1.3 Software Framework
+
+The TPU software development stack is shown in the figure below:
+
+![](./assets/framework.jpg)
+
+> The software framework consists of two parts: the offline toolchain and the runtime inference library. The offline toolchain includes the model converter, the compiler, and the quantization tools; it takes a user model through import, transformation, quantization, optimization, and code generation, and finally packages the result into an inference model file in the cvimodel format. The runtime library loads the cvimodel, reads its runtime information for setup and resource allocation, loads the weight data and instruction sequences, drives the hardware to execute those instruction sequences, completes the neural-network inference computation, and outputs the result data. The runtime also fully supports a simulation platform, so customers can first port, validate, and accuracy-test a model on the simulator before loading and running it on real hardware.
+
+
+
+## 1.4 Neural-Network Compiler
+
+> The neural-network compiler is developed on the MLIR framework. It converts a model file from an existing framework into the TPU instruction set. For details, see the "TPU-MLIR Quick Start Guide".
+
+## 1.5 Supported Operators
+### 1.5.1 Basic operations:
+>
+> | Operator | Engine | INT8 |
+> | --------------------------------- | --------- | ---- |
+> | Abs | TPU | Y |
+> | Add | TPU | Y |
+> | Arg | TPU + CPU | Y |
+> | AvgPool | TPU | Y |
+> | Cast | TPU | Y |
+> | Clip | TPU | N |
+> | Concat | TPU | Y |
+> | Conv | TPU | Y |
+> | Copy | TPU | Y |
+> | Deconv | TPU | Y |
+> | Depth2Space | TPU | Y |
+> | DetectionOutput | CPU | N |
+> | Elu | TPU | Y |
+> | Exp | TPU | Y |
+> | FrcnDetection | CPU | N |
+> | Gather | CPU | N |
+> | GatherND | CPU | N |
+> | GELU | TPU | Y |
+> | GridSampler | CPU | N |
+> | GRU | TPU | N |
+> | HardSigmoid | TPU | Y |
+> | HardSwish | TPU | Y |
+> | InstanceNorm | CPU | N |
+> | Interp | TPU / CPU | Y |
+> | LayerNorm | TPU | N |
+> | LeakyRelu | TPU | Y |
+> | Log | TPU | Y |
+> | LRN | TPU | N |
+> | LSTM | TPU | N |
+> | MatchTemplate (TM_CCOEFF_NORMED) | TPU | N |
+> | MatMul | TPU | Y |
+> | Max | TPU | Y |
+> | MaxPool | TPU | Y |
+> | Min | TPU | Y |
+> | Mish | TPU | Y |
+> | Mul | TPU | Y |
+> | MulConst | TPU | Y |
+> | Pad | TPU | Y |
+> | Permute | TPU | Y |
+> | PoolMask | TPU | Y |
+> | Pow | TPU | Y |
+> | PRelu | TPU | Y |
+> | Proposal | CPU | N |
+> | Reciprocal | TPU | Y |
+> | Reduce | TPU | Y |
+> | Relu | TPU | Y |
+> | Reshape | None | Y |
+> | RetinaFaceDetection | CPU | N |
+> | Reverse | TPU | Y |
+> | ROIPooling | CPU | N |
+> | Scale | TPU | Y |
+> | ScaleLut | TPU | Y |
+> | ScatterND | TPU | Y |
+> | ShuffleChannel | TPU | Y |
+> | Sigmoid | TPU | Y |
+> | SiLU | TPU | Y |
+> | Slice | TPU | Y |
+> | Softmax | TPU | Y |
+> | Softplus | TPU | Y |
+> | Softsign | TPU | Y |
+> | Sub | TPU | Y |
+> | SubConst | TPU | Y |
+> | SwapChannle | TPU | Y |
+> | Tanh | TPU | Y |
+> | Tile | TPU | Y |
+> | Upsample | TPU | Y |
+> | YoloDetection | CPU | N |
+>
+
+> (1) Engine specifies where an instruction executes: TPU means it runs on the TPU, CPU means it runs on the CPU, TPU + CPU means part of the op's instructions run on the TPU and part on the CPU, TPU / CPU means the instructions run on the TPU or the CPU depending on the mode, and None means no actual instructions are generated.
+
+> (2) INT8 indicates whether the op supports INT8 quantization; instructions executed on the CPU do not need quantization.
+
+### 1.5.2 ONNX operators:
+
+| Operator | Engine |
+| --------------------- | --------- |
+| Abs | TPU |
+| Add | TPU |
+| ArgMax | TPU + CPU |
+| AveragePool | TPU |
+| BatchNormalization | TPU |
+| Cast | TPU |
+| Concat | TPU |
+| Constant | TPU |
+| ConstantOfShape | Numpy |
+| Conv | TPU |
+| Clip | TPU |
+| ConvTranspose | TPU |
+| DepthToSpace | TPU |
+| Div | TPU |
+| Dropout | None |
+| Einsum | TPU |
+| Elu | TPU |
+| Equal | TPU |
+| Exp | TPU |
+| Expand | TPU |
+| Flatten | TPU |
+| Gather | CPU |
+| GatherND | CPU |
+| GELU | TPU |
+| Gemm | TPU |
+| GlobalAveragePool | TPU |
+| GlobalMaxPool | TPU |
+| GridSample | CPU |
+| GRU | TPU |
+| HardSigmoid | TPU |
+| HardSwish | TPU |
+| Identity | None |
+| InstanceNormalization | CPU |
+| LayerNormalization | TPU |
+| LeakyRelu | TPU |
+| Log | TPU |
+| LogSoftmax | TPU |
+| LRN | TPU |
+| LSTM | TPU |
+| MatMul | TPU |
+| Max | TPU |
+| MaxPool | TPU |
+| Min | TPU |
+| Mul | TPU |
+| Neg | TPU |
+| Pad | TPU |
+| PixelNormalization | TPU |
+| PRelu | TPU |
+| Pow | TPU |
+| Reciprocal | TPU |
+| ReduceMean | TPU |
+| ReduceMax | TPU |
+| ReduceMin | TPU |
+| ReduceL2 | TPU |
+| ReduceSum | TPU |
+| Relu | TPU |
+| Reshape | None |
+| Resize | TPU / CPU |
+| ScatterND | TPU |
+| Shape | Numpy |
+| Sigmoid | TPU |
+| Slice | TPU |
+| Softmax | TPU |
+| Softplus | TPU |
+| Squeeze | TPU |
+| Split | TPU |
+| Std | TPU |
+| Sub | TPU |
+| Sum | TPU |
+| Sqrt | TPU |
+| Tanh | TPU |
+| Tile | TPU |
+| Transpose | TPU |
+| Unsqueeze | TPU |
+| Upsample | TPU / CPU |
+| Where | TPU |
+
+### 1.5.3 Caffe operators:
+| Operator | Engine |
+| -------------------- | --------- |
+| ArgMax | TPU + CPU |
+| BatchNorm | TPU |
+| BN | TPU |
+| Concat | TPU |
+| Convolution | TPU |
+| ConvolutionDepthwise | TPU |
+| Crop | TPU |
+| Deconvolution | TPU |
+| DetectionOutput | CPU |
+| Dropout | None |
+| DummyData | None |
+| Embed | CPU |
+| Eltwise | TPU |
+| Flatten | None |
+| FrcnDetection | CPU |
+| InnerProduct | TPU |
+| Input | None |
+| Interp | TPU |
+| ImageData | None |
+| LRN | TPU |
+| LSTM | TPU |
+| Lstm | TPU |
+| MatMul | TPU |
+| Mish | TPU |
+| Normalize | TPU |
+| Padding | TPU |
+| Permute | TPU |
+| Pooling | TPU |
+| Power | TPU |
+| PReLU | TPU |
+| PriorBox | TPU |
+| Proposal | CPU |
+| ReLU | TPU |
+| ReLU6 | TPU |
+| Reorg | TPU |
+| Reshape | None |
+| Reverse | TPU |
+| RetinaFaceDetection | CPU |
+| ROIPooling | CPU |
+| Scale | TPU |
+| ShuffleChannel | TPU |
+| Sigmoid | TPU |
+| Silence | None |
+| Slice | TPU |
+| Softmax | TPU |
+| Split | TPU |
+| TanH | TPU |
+| Tile | TPU |
+| Upsample | TPU |
+| YoloDetection | CPU |
+
+### 1.5.4 Torch operators:
+
+| Operator | Engine |
+| --------------------- | --------- |
+| torch.abs | TPU |
+| torch.add | TPU |
+| torch.addmm | TPU |
+| torch.arange | TPU |
+| torch.bmm | TPU |
+| torch.chunk | TPU |
+| torch.concat | TPU |
+| torch.div | TPU |
+| torch.grid_sampler | CPU |
+| torch.index_select | TPU |
+| torch.matmul | TPU |
+| torch.mean | TPU |
+| torch.mm | TPU |
+| torch.multiply | TPU |
+| torch.ones | TPU |
+| torch.ones_like | TPU |
+| torch.permute | TPU |
+| torch.pow | TPU |
+| torch.reshape | None |
+| torch.select | TPU |
+| torch.slice | TPU |
+| torch.split | TPU |
+| torch.squeeze | TPU |
+| torch.sqrt | TPU |
+| torch.stack | TPU |
+| torch.sub | TPU |
+| torch.sum | TPU |
+| torch.t | TPU |
+| torch.tile | TPU |
+| torch.transpose | TPU |
+| torch.unsqueeze | None |
+| torch.where | TPU |
+| torch.zeros | TPU |
+| torch.zeros_like | TPU |
+| nn.BatchNorm1d | TPU |
+| nn.BatchNorm2d | TPU |
+| nn.ChannelShuffle | TPU |
+| nn.ConstantPad1d | TPU |
+| nn.ConstantPad2d | TPU |
+| nn.Conv1d | TPU |
+| nn.Conv2d | TPU |
+| nn.ConvTranspose2d | TPU |
+| nn.Dropout | None |
+| nn.Embedding | CPU |
+| nn.GELU | TPU |
+| nn.GRU | TPU |
+| nn.Hardsigmoid | TPU |
+| nn.Hardswish | TPU |
+| nn.InstanceNorm2d | CPU |
+| nn.LayerNorm | TPU |
+| nn.Linear | TPU |
+| nn.LogSigmoid | TPU |
+| nn.LSTM | TPU |
+| nn.Mish | TPU |
+| nn.MaxPool1d | TPU |
+| nn.MaxPool2d | TPU |
+| nn.MaxPool3d | TPU |
+| nn.MultiheadAttention | TPU |
+| nn.PixelShuffle | TPU |
+| nn.ReflectionPad1d | TPU |
+| nn.ReflectionPad2d | TPU |
+| nn.ReLU | TPU |
+| nn.ReLU6 | TPU |
+| nn.ReplicationPad1d | TPU |
+| nn.ReplicationPad2d | TPU |
+| nn.Sigmoid | TPU |
+| nn.SiLU | TPU |
+| nn.Softplus | TPU |
+| nn.Softsign | TPU |
+| nn.Tanh | TPU |
+| nn.Upsample | TPU / CPU |
+| F.adaptive_avg_pool2d | TPU |
+| F.avg_pool1d | TPU |
+| F.avg_pool2d | TPU |
+| F.avg_pool3d | TPU |
+| F.elu | TPU |
+| F.interpolate | TPU / CPU |
+| F.leaky_relu | TPU |
+| F.log_softmax | TPU |
+| F.prelu | TPU |
+| F.softmax | TPU |
+| F.softmin | TPU |
+| Tensor.new_zeros | TPU |
+| Tensor.repeat | TPU |
+| Tensor.to | None |
+| Tensor.type_as | None |
+| Tensor.view | None |
+
+## 1.6 Dumping Models to cvimodel Files
+
+> The cvimodel file is the final output of offline compilation; it is handed to the runtime for parsing and online inference. cvimodel also supports:
+
+- Multiple batch sizes and resolutions: different batch sizes and input resolutions require different instruction sequences (because of differences in resources and optimization choices), but share the weight data. The cvimodel file format lets a single model carry inference programs for several batch sizes and resolutions.
+
+- Model partitioning: a model containing operators the TPU does not support can be inferred with the TPU and CPU cooperating. The model is split into several segments, each executed by a specific engine (TPU or CPU).
+
+> A cvimodel generated for the cv183x platform runs on cv183x SoCs such as the 1832/1835/1838; one generated for cv182x runs on cv182x-series chips such as the 1821/1822/1826; one generated for cv181x runs on cv181x-series chips such as the 1810C/1811C/1812C/1810H/1811H/1812H; one generated for cv180x runs on cv180x-series chips.
+
+> cvimodel uses flatbuffers to package the weight data, instruction sequences, and tensor information for deployment to the platform.
+
+- Model:
+
+> The collection of all information about a network model. A single cvimodel contains only one Model object, but it may contain instruction sequences for multiple batch sizes.
+
+- Program:
+
+> The instruction sequences for a given batch size. A program contains TPU segments and CPU segments, holding the instructions that run on the TPU and the code sections that must switch over to the CPU, respectively.
+
+- Routine:
+
+> Currently either a TPU routine or a CPU routine. A single Program may contain several TPU or CPU routines, which are executed in order at runtime.
+
+- Tensor:
+
+> The collective term for input/output tensors, activations, and so on. A tensor carries its name, shape, element data type, and related information.
+
+### 1.6.1 cvimodel structure
+
+> ![](./assets/cvimodel.png)
+>
+> The basic structure of a cvimodel is shown above; it has three segments. The first segment is the file header, containing the magic string, the version number, and the byte count and md5 of the middle segment; it is the basic information needed to parse the file. The middle segment holds the Model structure, including the Program and Routine information used to parse the network structure and instructions. The tail segment is binary data: the weight data, each Program's TPU instruction sequences, and the .so files holding user-defined CPU routines.
+
+## 1.7 Runtime
+
+> The runtime library and the application run in Linux on the ARM/AARCH64/RISCV processor of the CV183x/CV182x/CV181x/CV180x SoC. The runtime provides a set of APIs for the application to call at run time, performing on-device model inference. Its main functions are:
+
+- parsing the cvimodel file;
+
+- loading the weight data, and the instruction sequences matching the configured batch size and resolution;
+
+- loading CPU functions according to the CPU-routine information;
+
+- loading input data;
+
+- executing the inference computation;
+
+- returning the result data.
+
+
+
+### 1.7.1 Python Binding
+
+> The runtime provides a Python binding, making it easy to reuse Python pre- and post-processing code for quick model development, validation, and offline simulation.
+
+
+
+### 1.7.2 Simulator
+
+> Besides driving the hardware, the runtime can drive the simulator through the same API for offline testing and debugging.
+
+
+
+## 1.8 Development Environment
+
+Docker is recommended; see the "TPU Quick Start Guide" for details.
+
+ +
+
+# Chapter 2 Runtime Development Guide
+
+## 2.1 Inspecting a cvimodel
+
+Before deploying a cvimodel in the runtime environment, first use cvimodel_tool on the command line to inspect it: the shapes and names of its input and output tensors, the space taken by weights and activations, and so on:
+```sh
+$ cvimodel_tool -a dump -i xxx.cvimodel
+```
+The output of this command is as follows:
+
+a. Version information:
+```sh
+Cvitek Runtime 1.2.0 # version of the runtime lib
+Mlir Version: tpu_rel_v1.3.4-42-g2bd9f2a54-dirty:20201205
+# version of the toolchain used to compile this cvimodel
+Cvimodel Version: 1.2.0 # version of the cvimodel
+Build at 2020-12-05 23:37:09 # build time of the cvimodel
+CviModel Need ION Memory Size: (5.74 MB) # ION memory the model will use
+```
+
+b. Weight and instruction sections:
+```sh
+ Sections:
+
+ ID   TYPE   NAME         SIZE   OFFSET ENCRYPT MD5
+ 000  weight weight       820800 0      False   49974c...
+ 001  cmdbuf tpu_func0_3e 1440   820800 False   caa513...
+```
+Here size is the memory occupied at deployment by the weights or the cmdbuf (instructions); encrypt indicates whether the section is stored encrypted; MD5 is the hash of the section data, used to check its integrity.
+
+c. Weight tensor list:
+```sh
+WeightMap:
+ID  OFFSET SIZE   TYPE  N   C    H W NAME
+000 1600   819200 int8  400 2048 1 1 filter_quant_lowered
+001 0      1600   int32 400 1    1 1 bias_quant_lowered
+```
+d. Program information:
+
+> A program holds the structural information needed to run inference: the activation memory footprint, the names of the model's input and output tensors, the details of its TPU or CPU routines, the tensor list, and so on.
+```sh
+ Program %0
+ batch_num : 1
+ private_gmem_size: 0 # private gmem size
+ shared_gmem_size: 448 # shared gmem size (a memory region shared by multiple models)
+ inputs: data_quant # model input tensor names; see the tensor list below
+ outputs: input.7_Split_dequant # model output tensor names; see the tensor list below
+
+ routines: # a program is made of one or more tpu/cpu routines
+ %000 tpu # tpu routine
+   inputs : data_quant
+   outputs : input.7_Split_dequant
+   section : tpu_func0_3e
+
+ tensor_map: # tensor list
+ ID  OFFSET TYPE N C H    W QSCALE   MEM    NAME
+ 000 0      int8 1 1 2048 1 5.314389 io_mem data_quant
+ 001 0      int8 1 1 400  1 -        shared fc
+ 002 10     int8 1 1 100  1 0.095460 shared input.7_Split
+ 003 0      fp32 1 1 100  1 -        io_mem input.7_Split_dequant
+```
+
+
+
+
+
+## 2.2 Runtime Development Flow
+
+### Loading the model
+
+> To run inference on a model, the runtime first loads the model file; the object it loads is a cvimodel file.
+
+### Getting the input and output tensors
+
+> Next, the program obtains the input-tensor and output-tensor information through the API. For a cvimodel supporting several batch sizes, the batch size must be specified when getting the tensors. Each tensor has its own name, type, shape, and storage.
+
+### Running inference
+
+> Once the data and buffers are ready, the inference computation can start.
+
+### Pre- and post-processing
+
+> Pre- and post-processing can be handled in several ways:
+
+- In the application: the user adds code according to the model's preprocessing needs.
+
+- Optimized into a preprocessing TPU routine: at model-import time, the corresponding pre- or post-processing operations are added via command options. At compile time, eligible operations are converted into TPU operations and compiled into the TPU section; at run time they are executed by the TPU as part of model execution. The sketch below strings these steps together using the C APIs documented in section 2.3.
+
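+The following is a minimal sketch of this flow, using only the C APIs documented in section 2.3. The model path and the single-input assumption are illustrative, not part of the API, and error handling is reduced to return-code checks:
+
+```c++
+#include <stdio.h>
+#include "cviruntime.h"
+
+int main() {
+  // 1. load the model (the file name is an example)
+  CVI_MODEL_HANDLE model = NULL;
+  if (CVI_NN_RegisterModel("sample.cvimodel", &model) != CVI_RC_SUCCESS)
+    return -1;
+
+  // 2. get the input/output tensors (the runtime allocates their buffers)
+  CVI_TENSOR *inputs = NULL, *outputs = NULL;
+  int32_t input_num = 0, output_num = 0;
+  if (CVI_NN_GetInputOutputTensors(model, &inputs, &input_num,
+                                   &outputs, &output_num) != CVI_RC_SUCCESS)
+    return -1;
+
+  // 3. fill the (single) input tensor with preprocessed data
+  CVI_TENSOR *input =
+      CVI_NN_GetTensorByName(CVI_NN_DEFAULT_TENSOR, inputs, input_num);
+  uint8_t *buf = (uint8_t *)CVI_NN_TensorPtr(input);
+  size_t len = CVI_NN_TensorSize(input);
+  (void)buf; (void)len; // ... copy len bytes of input data into buf ...
+
+  // 4. run inference (blocking)
+  if (CVI_NN_Forward(model, inputs, input_num,
+                     outputs, output_num) != CVI_RC_SUCCESS)
+    return -1;
+
+  // 5. consume the outputs, then release the model
+  CVI_NN_CleanupModel(model);
+  return 0;
+}
+```
+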
+
+## 2.3 Runtime API Reference
+
+### 2.3.1 Runtime C API
+
+The header cviruntime.h defines the data structures and functions of the runtime C API, used for model loading and inference. The corresponding shared library is libcviruntime.so; the static library is libcviruntime-static.a.
+
+#### Data structures
+
+> The TPU runtime uses the following main data structures.
+
+- CVI_FMT: element data type of tensor data
+
+- CVI_MEM_TYPE_E: memory type backing tensor data
+
+- CVI_SHAPE: dimensions of tensor data
+
+- CVI_MEM: memory information of tensor data
+
+- CVI_TENSOR: tensor structure
+
+- CVI_FRAME_TYPE: frame type
+
+- CVI_VIDEO_FRAME_INFO: video-frame structure
+
+- CVI_MODEL_HANDLE: network-model handle
+
+- CVI_CONFIG_OPTION: configuration options
+
+##### CVI_FMT
+```c++
+typedef enum {
+  CVI_FMT_FP32 = 0,
+  CVI_FMT_INT32 = 1,
+  CVI_FMT_UINT32 = 2,
+  CVI_FMT_BF16 = 3,
+  CVI_FMT_INT16 = 4,
+  CVI_FMT_UINT16 = 5,
+  CVI_FMT_INT8 = 6,
+  CVI_FMT_UINT8 = 7
+} CVI_FMT;
+```
+【Description】
+
+Element data types of a TENSOR
+
+| Name | Description |
+|---------------- |--------------|
+| CVI_FMT_FP32 | float, 32-bit |
+| CVI_FMT_INT32 | int32 |
+| CVI_FMT_UINT32 | uint32 |
+| CVI_FMT_BF16 | bfloat16 |
+| CVI_FMT_INT16 | int16 |
+| CVI_FMT_UINT16 | uint16 |
+| CVI_FMT_INT8 | int8 |
+| CVI_FMT_UINT8 | uint8 |
+
+##### CVI_MEM_TYPE_E
+```c++
+typedef enum {
+  CVI_MEM_UNSPECIFIED = 0,
+  CVI_MEM_SYSTEM = 1,
+  CVI_MEM_DEVICE = 2
+} CVI_MEM_TYPE_E;
+```
+【Description】
+
+Defines where the data is stored
+
+| Name | Description |
+|--------------------- |-------------------------------|
+| CVI_MEM_UNSPECIFIED | initial value: memory source not yet specified |
+| CVI_MEM_SYSTEM | memory comes from system memory |
+| CVI_MEM_DEVICE | memory comes from device memory |
+
+##### CVI_SHAPE
+```c++
+#define CVI_DIM_MAX (6)
+
+typedef struct {
+  int32_t dim[CVI_DIM_MAX];
+  size_t dim_size;
+} CVI_SHAPE;
+```
+【Description】
+
+Defines the dimensions of TENSOR data, ordered as the four dimensions n/channel/height/width.
+
+| Name | Description |
+|---|---|
+| dim | size of each dimension |
+| dim_size | number of dimensions, at most 6 |
+
+##### CVI_TENSOR
+```c++
+typedef struct {
+  char *name;
+  CVI_SHAPE shape;
+  CVI_FMT fmt;
+  size_t count;
+  size_t mem_size;
+  uint8_t *sys_mem;
+  uint64_t paddr;
+  CVI_MEM_TYPE_E mem_type;
+  float qscale;
+  ...
+} CVI_TENSOR;
+```
+
+【Description】
+
+Defines the TENSOR structure
+
+| Name | Description |
+|---------- |------------------------|
+| name | tensor name |
+| shape | tensor dimensions |
+| fmt | element data type |
+| count | number of elements |
+| mem_size | size of the tensor's memory |
+| sys_mem | pointer to system memory |
+| paddr | physical address of the memory |
+| mem_type | memory type of the tensor input |
+| qscale | quantization scale factor |
+
+
+##### CVI_FRAME_TYPE
+```c++
+typedef enum {
+  CVI_NN_PIXEL_RGB_PACKED = 0,
+  CVI_NN_PIXEL_BGR_PACKED = 1,
+  CVI_NN_PIXEL_RGB_PLANAR = 2,
+  CVI_NN_PIXEL_BGR_PLANAR = 3,
+  CVI_NN_PIXEL_YUV_420_PLANAR = 13,
+  CVI_NN_PIXEL_GRAYSCALE = 15,
+  CVI_NN_PIXEL_TENSOR = 100,
+  // please don't use below values,
+  // only for backward compatibility
+  CVI_NN_PIXEL_PLANAR = 101,
+  CVI_NN_PIXEL_PACKED = 102
+} CVI_NN_PIXEL_FORMAT_E;
+
+typedef CVI_NN_PIXEL_FORMAT_E CVI_FRAME_TYPE;
+```
+
+【Description】
+
+Defines the format of input data
+
+| Name | Description |
+|------------------- |----------------------|
+| CVI_NN_PIXEL_RGB_PACKED | RGB packed, layout nhwc |
+| CVI_NN_PIXEL_BGR_PACKED | BGR packed, layout nhwc |
+| CVI_NN_PIXEL_RGB_PLANAR | RGB planar, layout nchw |
+| CVI_NN_PIXEL_BGR_PLANAR | BGR planar, layout nchw |
+| CVI_NN_PIXEL_YUV_420_PLANAR | YUV420 planar |
+| CVI_NN_PIXEL_GRAYSCALE | grayscale, YUV400 |
+| CVI_NN_PIXEL_TENSOR | densely packed 4-D tensor (the default type before v1.3) |
+
+##### CVI_VIDEO_FRAME_INFO
+```c++
+typedef struct {
+  CVI_FRAME_TYPE type;
+  CVI_SHAPE shape;
+  CVI_FMT fmt;
+  uint32_t stride[3];
+  uint64_t pyaddr[3];
+} CVI_VIDEO_FRAME_INFO;
+```
+【Description】
+
+Defines the video-frame structure
+
+| Name | Description |
+|---|---|
+| type | frame type |
+| shape | frame dimensions |
+| fmt | element data type |
+| stride | stride of the frame's w dimension, aligned to bytes |
+| pyaddr | physical address of each plane: for PLANAR types fill in the address of every plane; for PACKED types only the start address of the data |
+
+##### CVI_MODEL_HANDLE
+```c++
+typedef void *CVI_MODEL_HANDLE;
+```
+【Description】
+
+Handle of a neural-network model, obtained from CVI_NN_RegisterModel and released with CVI_NN_CleanupModel.
+
+##### CVI_CONFIG_OPTION
+```c++
+typedef enum {
+  OPTION_OUTPUT_ALL_TENSORS = 4,
+  OPTION_PROGRAM_INDEX = 9
+} CVI_CONFIG_OPTION;
+```
+【Description】
+
+Enumerates the model-configuration items that CVI_NN_SetConfig gets or sets:
+
+| Name | Type | Default | Description |
+|---|---|---|---|
+| OPTION_PROGRAM_INDEX | int | 0 | selects the program to run for inference; a cvimodel can hold instruction sequences (programs) for several batch sizes or resolutions, and the program index chooses which program executes |
+| OPTION_OUTPUT_ALL_TENSORS | bool | false | makes the runtime expose every visible TENSOR of the model as an output; useful as a debugging aid |
+
+##### Return codes
+```c++
+ #define CVI_RC_SUCCESS 0     // The operation was successful
+ #define CVI_RC_AGAIN 1       // Not ready yet
+ #define CVI_RC_FAILURE 2     // General failure
+ #define CVI_RC_TIMEOUT 3     // Timeout
+ #define CVI_RC_UNINIT 4      // Uninitialized
+ #define CVI_RC_INVALID_ARG 5 // Arguments invalid
+ #define CVI_RC_NOMEM 6       // Not enough memory
+ #define CVI_RC_DATA_ERR 7    // Data error
+ #define CVI_RC_BUSY 8        // Busy
+ #define CVI_RC_UNSUPPORT 9   // Not supported yet
+ typedef int CVI_RC;
+```
+【Description】
+
+A return code reports whether an API call completed normally: CVI_RC_SUCCESS means success, any other value is a failure.
+
+#### Functions
+
+> The TPU runtime provides the following basic interfaces.
+
+- CVI_NN_RegisterModel: load a neural-network model from a file
+- CVI_NN_RegisterModelFromBuffer: load a network model from memory
+- CVI_NN_RegisterModelFromFd: load a network model from a file descriptor
+- CVI_NN_CloneModel: clone a neural-network model
+- CVI_NN_SetConfig: configure a neural-network model
+- CVI_NN_GetInputOutputTensors: get the input and output TENSOR information
+- CVI_NN_Forward: run inference, synchronous
+- CVI_NN_ForwardAsync: run inference, asynchronous
+- CVI_NN_ForwardWait: wait for an inference task to finish
+- CVI_NN_CleanupModel: release the model's resources
+- CVI_NN_GetTensorByName: get a tensor's information by name
+- CVI_NN_TensorPtr: get a tensor's system-memory pointer
+- CVI_NN_TensorSize: get a tensor's system-memory size
+- CVI_NN_TensorCount: get a tensor's element count
+- CVI_NN_TensorQuantScale: get a tensor's quantization scale
+- CVI_NN_TensorShape: get a tensor's shape
+- CVI_NN_SetTensorPtr: set a tensor's system memory
+- CVI_NN_SetTensorPhysicalAddr: set a tensor's physical memory
+- CVI_NN_SetTensorWithAlignedFrames: copy video-frame data into a tensor
+- CVI_NN_SetTensorWithVideoFrame: copy video-frame data into a tensor
+
+##### CVI_NN_RegisterModel
+
+【Prototype】
+```c++
+ CVI_RC CVI_NN_RegisterModel(
+    const char *model_file,
+    CVI_MODEL_HANDLE *model)
+```
+【Description】
+
+> Loads a cvimodel from a file and returns a model handle. The handle is passed as a parameter to subsequent API calls. When the model is no longer used, release its resources with CVI_NN_CleanupModel.
+
+| Parameter | Description | Input/Output |
+|---|---|---|
+| model_file | cvimodel file name | input |
+| model | network-model handle | output |
+
+
+##### CVI_NN_RegisterModelFromBuffer
+
+【Prototype】
+```c++
+CVI_RC CVI_NN_RegisterModelFromBuffer(
+    const int8_t *buf,
+    uint32_t size,
+    CVI_MODEL_HANDLE *model);
+```
+【Description】
+
+> Loads a cvimodel from memory and returns a model handle. The handle is passed as a parameter to subsequent API calls. When the model is no longer used, release its resources with CVI_NN_CleanupModel.
+
+| Parameter | Description | Input/Output |
+|---------- |-------------|---|
+| buf | memory address | input |
+| size | size of the model in memory | input |
+| model | network-model handle | output |
+
+
+##### CVI_NN_RegisterModelFromFd
+
+【Prototype】
+
+```c++
+CVI_RC CVI_NN_RegisterModelFromFd(
+    const int32_t fd,
+    const size_t ud_offset,
+    CVI_MODEL_HANDLE *model);
+```
+
+【Description】
+
+> Loads a cvimodel starting at offset ud_offset of the file descriptor fd and returns a model handle. The handle is passed as a parameter to subsequent API calls. When the model is no longer used, release its resources with CVI_NN_CleanupModel.
+
+| Parameter | Description | Input/Output |
+| -------- | -------------- | --------- |
+| fd | file descriptor | input |
+| ud_offset | offset of the model within fd | input |
+| model | network-model handle | output |
+
+
+##### CVI_NN_CloneModel
+
+【Prototype】
+```c++
+CVI_RC CVI_NN_CloneModel(
+    CVI_MODEL_HANDLE model,
+    CVI_MODEL_HANDLE *cloned)
+```
+【Description】
+
+> To run different programs of the same cvimodel (for example, instructions for different batch sizes), clone the model with this interface. The cloned handle shares part of its resources with the original handle, which effectively reduces system-memory overhead. When no longer used, the cloned handle must also be released with CVI_NN_CleanupModel.
+
+| Parameter | Description | Input/Output |
+|---|---|---|
+| model | an existing model handle | input |
+| cloned | the cloned model handle | output |
+
+
+##### CVI_NN_SetConfig
+
+【Prototype】
+```c++
+CVI_RC CVI_NN_SetConfig(
+    CVI_MODEL_HANDLE model,
+    CVI_CONFIG_OPTION option,
+    ...)
+```
+
+【Description】
+
+> Configures the model; see [CVI_CONFIG_OPTION](#_CVI_CONFIG_OPTION) for the available options. If the defaults do not need to change, this call can be skipped.
+
+Note: this interface must be called before CVI_NN_GetInputOutputTensors.
+
+| Parameter | Description | Input/Output |
+|---|---|---|
+| model | model handle | input |
+| option | configuration option | input |
+| variadic argument | the value to set, typed according to the option | input |
+
+【Example】
+```c++
+CVI_NN_SetConfig(model, OPTION_PROGRAM_INDEX, 0);
+CVI_NN_SetConfig(model, OPTION_OUTPUT_ALL_TENSORS, false);
+```
+
+##### CVI_NN_GetInputOutputTensors
+
+【Prototype】
+```c++
+CVI_RC CVI_NN_GetInputOutputTensors(
+    CVI_MODEL_HANDLE model,
+    CVI_TENSOR **inputs, int32_t *input_num,
+    CVI_TENSOR **outputs, int32_t *output_num)
+```
+【Description】
+
+> Gets the input and output TENSOR information and allocates memory for the TENSORs.
+
+| Parameter | Description | Input/Output |
+|------------ |-------------|---|
+| model | model handle | input |
+| inputs | array of input TENSORs | output |
+| input_num | number of input TENSORs | output |
+| outputs | array of output TENSORs | output |
+| output_num | number of output TENSORs | output |
+
+
+##### CVI_NN_Forward
+
+【Prototype】
+```c++
+CVI_RC CVI_NN_Forward(
+    CVI_MODEL_HANDLE model,
+    CVI_TENSOR inputs[], int32_t input_num,
+    CVI_TENSOR outputs[], int32_t output_num);
+```
+【Description】
+
+> Runs forward inference on the model. This interface blocks until inference completes. Buffers must already be allocated for inputs and outputs, with the input data already stored in the input buffers; the inference results are written to the output buffers.
+
+| Parameter | Description | Input/Output |
+|---|---|---|
+| model | network-model handle | input |
+| inputs | input tensors | input |
+| input_num | number of input tensors | input |
+| outputs | output tensors | output |
+| output_num | number of output tensors | input |
+
+【Note】
+> If forward inference on an EVB board fails with a cmdbuf error such as "run cmdbuf failed", export the environment variable TPU_ENABLE_PROTECT=1 to debug. With it set, the runtime protects the cmdbuf and weight memory during inference; if anything in the program illegally overwrites the cmdbuf or weights through an out-of-bounds access, a Segmentation fault is raised, after which gdb can be used to locate the out-of-bounds code. Alternatively, catch the segfault signal and print the program's call stack to locate the problem, for example:
+```c++
+#include <stdio.h>    /* the header names were lost in formatting and are */
+#include <signal.h>   /* reconstructed from the calls used below          */
+#include <execinfo.h>
+void sigsegvhandle(int signo) {
+  printf("sigsegvhandle received signal: %d \n", signo);
+  /* output callstack */
+  void *pptrace_raw[32] = {0};
+  char **pptrace_str = NULL;
+  int trace_num = 0, iloop = 0;
+  trace_num = backtrace(pptrace_raw, 32);
+  pptrace_str = (char **)backtrace_symbols(pptrace_raw, trace_num);
+  for (iloop = 0; iloop < trace_num; iloop++) {
+    /* the sample was truncated here in the source; printing each frame
+       is the minimal reconstruction */
+    printf("%s\n", pptrace_str[iloop]);
+  }
+}
+/* register in main(): signal(SIGSEGV, sigsegvhandle); */
+```
+
+##### CVI_NN_CleanupModel
+
+【Prototype】
+```c++
+CVI_RC CVI_NN_CleanupModel(CVI_MODEL_HANDLE model)
+```
+【Description】
+
+> Releases all of the model's resources. If the model has been cloned, this call only decrements the model's reference count; the resources are freed once the count reaches 0.
+
+| Parameter | Description | Input/Output |
+|---|---|---|
+| model | network-model handle | input |
+
+
+##### CVI_NN_GetTensorByName
+
+【Prototype】
+```c++
+CVI_TENSOR *CVI_NN_GetTensorByName(
+    const char *name,
+    CVI_TENSOR *tensors,
+    int32_t num)
+```
+【Description】
+
+Gets the pointer to the tensor with the given name from the tensors array.
+
+| Parameter | Description | Input/Output |
+|---|---|---|
+| name | tensor name; may be CVI_NN_DEFAULT_TENSOR, in which case the single tensor is returned when num is 1 and NULL otherwise | input |
+| tensors | tensor array | input |
+| num | number of tensors | input |
+
+
+##### CVI_NN_TensorPtr
+
+【Prototype】
+
+```c++
+void *CVI_NN_TensorPtr(CVI_TENSOR *tensor)
+```
+
+【Description】
+
+> Gets the system-memory pointer of a TENSOR.
+
+| Parameter | Description | Input/Output |
+|---|---|---|
+| tensor | tensor pointer | input |
+
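+As a short usage sketch, CVI_NN_TensorPtr is typically combined with CVI_NN_TensorCount (two sections below) to walk an output tensor; the fp32 element type here is an assumption that holds only for dequantized outputs:
+
+```c++
+CVI_TENSOR *out = CVI_NN_GetTensorByName(CVI_NN_DEFAULT_TENSOR, outputs, output_num);
+float *vals = (float *)CVI_NN_TensorPtr(out); // valid for CVI_FMT_FP32 tensors
+for (size_t i = 0; i < CVI_NN_TensorCount(out); ++i) {
+  // ... consume vals[i] ...
+}
+```
+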
+
+##### CVI_NN_TensorSize
+
+【Prototype】
+```c++
+size_t CVI_NN_TensorSize(CVI_TENSOR *tensor);
+```
+【Description】
+
+> Gets the size of the tensor in bytes.
+
+| Parameter | Description | Input/Output |
+|--------|--------|-----------|
+| tensor | tensor pointer | input |
+
+
+##### CVI_NN_TensorCount
+
+【Prototype】
+```c++
+size_t CVI_NN_TensorCount(CVI_TENSOR *tensor);
+```
+
+【Description】
+
+> Gets the number of elements in the TENSOR.
+
+| Parameter | Description | Input/Output |
+|--------|------------|----|
+| tensor | tensor pointer | input |
+
+
+##### CVI_NN_TensorQuantScale
+
+【Prototype】
+
+```c++
+float CVI_NN_TensorQuantScale(CVI_TENSOR *tensor)
+```
+【Description】
+
+> Gets the quantization scale of the TENSOR, used for the fp32-to-int8 conversion.
+
+| Parameter | Description | Input/Output |
+|--------|------------|-----------|
+| tensor | tensor pointer | input |
+
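+As a hedged illustration of how the scale is commonly applied when feeding fp32 data to an int8 input tensor (value × qscale, rounded and saturated; the rounding mode of a particular deployment may differ):
+
+```c++
+#include <math.h>
+#include <stdint.h>
+
+// Illustrative helper, not part of the runtime API.
+static inline int8_t quantize_to_int8(float v, float qscale) {
+  float q = roundf(v * qscale);   // qscale from CVI_NN_TensorQuantScale()
+  if (q > 127.f) q = 127.f;       // saturate to the int8 range
+  if (q < -128.f) q = -128.f;
+  return (int8_t)q;
+}
+```
+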
+
+##### CVI_NN_TensorShape
+
+【Prototype】
+```c++
+ CVI_SHAPE CVI_NN_TensorShape(CVI_TENSOR *tensor)
+```
+【Description】
+
+> Gets the shape of the TENSOR.
+
+| Parameter | Description | Input/Output |
+|---|---|---|
+| tensor | tensor pointer | input |
+
+
+##### CVI_NN_SetTensorPtr
+
+【Prototype】
+```c++
+CVI_RC CVI_NN_SetTensorPtr(
+    CVI_TENSOR *tensor,
+    uint8_t *buf)
+```
+【Description】
+
+> Sets the TENSOR's buffer memory.
+
+| Parameter | Description | Input/Output |
+|--------|-----|----|
+| tensor | tensor pointer | input |
+| buf | system-memory pointer | input |
+
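+A short sketch of the zero-copy pattern this enables: point the input tensor at a caller-owned buffer so no extra copy into the tensor's own memory is needed. The buffer name and size are illustrative, and the buffer must stay valid until CVI_NN_Forward returns:
+
+```c++
+// Caller-owned buffer; its size must match the tensor's mem_size.
+static uint8_t frame_buf[1 * 3 * 224 * 224];
+
+CVI_TENSOR *input = CVI_NN_GetTensorByName(CVI_NN_DEFAULT_TENSOR, inputs, input_num);
+CVI_NN_SetTensorPtr(input, frame_buf);
+// ... fill frame_buf, then call CVI_NN_Forward() as usual ...
+```
+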
+
+##### CVI_NN_SetTensorPhysicalAddr
+
+【Prototype】
+
+```c++
+CVI_RC CVI_NN_SetTensorPhysicalAddr(
+    CVI_TENSOR *tensor,
+    uint64_t *paddr)
+```
+
+【Description】
+
+> Sets the TENSOR's physical address. After this call, the memory that was automatically allocated when the TENSOR was initialized is released.
+
+| Parameter | Description | Input/Output |
+| -------- | ----------- | --------- |
+| tensor | tensor pointer | input |
+| paddr | ION-memory address | input |
+
+
+##### CVI_NN_SetTensorWithAlignedFrames
+
+【Prototype】
+```c++
+CVI_RC CVI_NN_SetTensorWithAlignedFrames(
+    CVI_TENSOR *tensor,
+    uint64_t frame_paddrs[],
+    int32_t frame_num,
+    CVI_NN_PIXEL_FORMAT_E pixel_format);
+```
+【Description】
+
+> Copies one or more frames of data into a tensor.
+
+| Parameter | Description | Input/Output |
+|---|---|---|
+| tensor | tensor pointer | input |
+| frame_paddrs | array of physical addresses of the video frames | input |
+| frame_num | number of video frames | input |
+| pixel_format | pixel format of the frames | input |
+
+##### CVI_NN_SetTensorWithVideoFrame
+
+【Prototype】
+```c++
+CVI_RC CVI_NN_SetTensorWithVideoFrame(
+    CVI_MODEL_HANDLE model, CVI_TENSOR* tensor,
+    CVI_VIDEO_FRAME_INFO* video_frame_info);
+```
+【Description】
+
+> Copies video-frame data into a tensor
+
+| Parameter | Description | Input/Output |
+|---|---|---|
+| model | model handle | input |
+| tensor | tensor pointer | input |
+| video_frame_info | video-frame information | input |
+
+### 2.3.2 Runtime Python API
+
+> The runtime wraps the underlying C++ runtime code into a Python API via pybind11. The runtime
+> Python API currently runs under Python 3.6. Its main APIs are as follows:
+
+#### pyruntime.Tensor
+
+Tensor represents a tensor object.
+
+【Prototype】
+
+```python
+class Tensor:
+
+    def __init__(self):
+        self.name = str()
+        self.data = numpy.ndarray()
+```
+【Attributes】
+
+> Tensor.name is the tensor's name;
+>
+> Tensor.data is a numpy
+> array holding the tensor's data. Use data.shape and data.dtype to get the tensor's shape and element data type.
+
+
+#### pyruntime.Model
+
+> Model represents a model object
+
+【Prototype】
+
+```python
+ class Model:
+
+    def __init__(self, model_file, batch_num=0, dump_all_tensors=False):
+        self.inputs = [Tensor]
+        self.outputs = [Tensor]
+
+    def forward(self):
+        pass
+```
+【Attributes】
+
+> Model.inputs is the array of the model's input tensors (pyruntime.Tensor);
+>
+> Model.outputs is the array of the model's output tensors (pyruntime.Tensor).
+
+【Methods】
+
+```python
+def __init__(self, model_file, batch_num=0, dump_all_tensors=False)
+```
+> Constructor of the Model class; registers and configures a cvimodel file.
+
+| Item | Notes |
+|--------------------|------------------------------------------|
+| return value | None |
+| batch_num | int, selects the model's batch size |
+| dump_all_tensors | bool, exposes all visible tensors as output tensors |
+
+```python
+def forward(self)
+```
+> Runs forward inference on the model
+
+| Item | Notes |
+|----|---|
+| return value | None |
+
+
+#### Example
+
+```python
+import numpy as np
+import pyruntime
+
+# initialize the cvimodel
+model = pyruntime.Model("1.cvimodel")
+if model == None:
+    raise Exception("cannot load cvimodel")
+
+# fill data to inputs
+data = model.inputs[0].data
+input_data = np.fromfile("input.bin", dtype=data.dtype).reshape(data.shape)
+data[:] = input_data
+
+# forward
+model.forward()
+
+# get output data
+for out in model.outputs:
+    print(out)
+```
+
+
+
+## 2.4 Runtime Logging
+
+#### Runtime log path
+
+> /var/log/tpu
+
+
+
+#### Runtime log configuration
+
+> Taking setting the output log level to info as an example:
+>
+> edit the file /etc/rsyslog.d/tpu.conf
+>
+> and enter:
+>
+> if $syslogfacility-text == "local6" and ( $syslogseverity <= 6) then /dev/console (levels <= 6 go to the console)
+>
+> if $syslogfacility-text == "local6" and ( $syslogseverity <= 6) then /var/log/tpu (levels <= 6 go to the /var/log/tpu file)
+>
+> then run
+>
+> /etc/init.d/P01rsyslog restart
+>
+> for the change to take effect.
+
+
+
+On the board, running
+
+syslogd -l 8 -O /dev/console
+
+or
+
+busybox syslogd -l 8 -O /dev/console
+
+prints all runtime logs.
+
+
+
+#### Runtime log levels
+
+| Level | Value |
+| :------ | ---- |
+| FATAL | 0 |
+| ERROR | 3 |
+| WARNING | 4 |
+| NOTICE | 5 |
+| INFO | 6 |
+| DEBUG | 7 |
+
+
+
diff --git a/cviruntime/include/bmruntime.h b/cviruntime/include/bmruntime.h
new file mode 100644
index 000000000..60843db78
--- /dev/null
+++ b/cviruntime/include/bmruntime.h
@@ -0,0 +1,107 @@
+/*
+* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
+* +* File Name: bmruntime.h +* Description: +*/ + +#ifndef _BM_RUNTIME_H_ +#define _BM_RUNTIME_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct bm_context; +typedef struct bm_context *bmctx_t; +struct bm_device; +typedef struct bm_device *bmdev_t; +typedef int32_t bmerr_t; + +struct bm_memory; +typedef struct bm_memory *bmmem_t; +typedef bmmem_t bmmem_device_t; + +typedef enum bmfmt_e { + BM_FMT_FP32 = 0, + BM_FMT_FP16 = 1, + BM_FMT_INT16 = 2, + BM_FMT_INT8 = 3, + BM_FMT_BF16 = 4, + BM_FMT_MAX = 5 +} bmfmt_t; + +#define BM_SHAPE_MAX_DIM (4) +typedef struct bmshape_s { + bmfmt_t fmt; + int dim_size; + int dim[BM_SHAPE_MAX_DIM]; +} bmshape_t; + +typedef struct _cvi_array_base { + uint64_t gaddr_base0; + uint64_t gaddr_base1; + uint64_t gaddr_base2; + uint64_t gaddr_base3; + uint64_t gaddr_base4; + uint64_t gaddr_base5; + uint64_t gaddr_base6; + uint64_t gaddr_base7; +} cvi_array_base; + +bmerr_t bm_init(int index, bmctx_t *ctx); +void bm_exit(bmctx_t ctx); + +bmmem_device_t bmmem_device_alloc_raw(bmctx_t ctx, size_t size); +bmmem_device_t bmmem_device_prealloc_raw(bmctx_t ctx, bmmem_device_t mem, uint64_t offset, size_t size); +void bmmem_device_free(bmctx_t ctx, bmmem_device_t mem); +void bmmem_device_free_ex(uint64_t p_addr); + +size_t bmmem_device_size(bmmem_device_t mem); +uint64_t bmmem_device_addr(bmmem_device_t mem); +int32_t bmmem_device_inc_ref(bmmem_device_t mem); +int32_t bmmem_device_dec_ref(bmmem_device_t mem); +uint8_t* bmmem_device_v_addr(bmmem_device_t mem); + +bmerr_t bm_memcpy_s2d(bmctx_t ctx, bmmem_device_t dst, uint8_t* src); +bmerr_t bm_memcpy_d2s(bmctx_t ctx, uint8_t* dst, bmmem_device_t src); +bmerr_t bm_memcpy_s2d_ex(bmctx_t ctx, bmmem_device_t dst, uint8_t* src, uint64_t offset, size_t size); +bmerr_t bm_memcpy_d2s_ex(bmctx_t ctx, uint8_t* dst, bmmem_device_t src, uint64_t offset, size_t size); + +bmerr_t bm_load_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + uint64_t neuron_gaddr, uint64_t weight_gaddr, + bool enable_pmu, bmmem_device_t *cmdbuf_mem); + +bmerr_t bm_run_cmdbuf_ex(bmctx_t ctx, bmmem_device_t cmdbuf_mem, uint16_t *seq_no, + uint64_t input_base_addr, uint64_t output_base_addr); +bmerr_t bm_run_cmdbuf_ex2(bmctx_t ctx, bmmem_device_t cmdbuf_mem, uint16_t *seq_no, + cvi_array_base *array_base); +bmerr_t cvi_run_async(bmctx_t ctx, bmmem_device_t cmdbuf_mem); +bmerr_t cvi_wait_cmdbuf_all(bmctx_t ctx); + +bmerr_t bm_run_cmdbuf(bmctx_t ctx, bmmem_device_t cmdbuf_mem, + uint16_t *seq_no); +bmerr_t bm_send_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + uint16_t *seq_no); +bmerr_t bm_wait_cmdbuf_done(bmctx_t ctx, uint16_t seq_no); +bmerr_t bm_parse_pmubuf(bmmem_device_t cmdbuf_mem, uint8_t **buf_start, uint32_t *buf_len); +bmerr_t bm_run_cmdbuf_pio(bmctx_t ctx, uint8_t *cmdbuf, size_t sz); + +bmerr_t cvi_load_cmdbuf_tee(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + uint64_t neuron_gaddr, uint64_t weight_gaddr, uint32_t weight_len, bmmem_device_t *cmdbuf_mem); +bmerr_t cvi_run_cmdbuf_tee(bmctx_t ctx, uint16_t *seq_no, uint64_t dmabuf_addr, cvi_array_base *array_base); + +void bm_device_set_base_reg(bmctx_t ctx, uint32_t inx, uint64_t addr); +uint64_t bm_device_read_base_reg(bmctx_t ctx, unsigned int inx); + +void cviruntime_cvikernel_create(bmctx_t ctx, void **p_bk_ctx); +void cviruntime_cvikernel_submit(bmctx_t ctx); +void cviruntime_cvikernel_destroy(bmctx_t ctx); + +#ifdef __cplusplus +} +#endif + +#endif /* _BM_RUNTIME_H_ */ diff --git a/cviruntime/include/bmruntime_bmkernel.h b/cviruntime/include/bmruntime_bmkernel.h new file mode 
100644 index 000000000..7ce70d7fd --- /dev/null +++ b/cviruntime/include/bmruntime_bmkernel.h @@ -0,0 +1,26 @@ +/* +* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved. +* +* File Name: bmruntime_bmkernel.h +* Description: +*/ + +#ifndef _BM_RUNTIME_BMKERNEL_H_ +#define _BM_RUNTIME_BMKERNEL_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +void bmruntime_bmkernel_create(bmctx_t ctx, void **p_bk_ctx); +void bmruntime_bmkernel_destroy(bmctx_t ctx); +void bmruntime_bmkernel_submit(bmctx_t ctx); +void bmruntime_bmkernel_submit_pio(bmctx_t ctx); + +#ifdef __cplusplus +} +#endif + +#endif /* _BM_RUNTIME_BMKERNEL_H_ */ diff --git a/cviruntime/include/cviruntime.h b/cviruntime/include/cviruntime.h new file mode 100644 index 000000000..a41027a3a --- /dev/null +++ b/cviruntime/include/cviruntime.h @@ -0,0 +1,301 @@ +/* +* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved. +* +* File Name: cviruntime.h +* Description: +*/ + +#ifndef _CVIRUNTIME_H_ +#define _CVIRUNTIME_H_ + +#include +#include +#include +#include +#include +#include "cvitpu_debug.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// data type of tensor +typedef enum { + CVI_FMT_FP32 = 0, + CVI_FMT_INT32 = 1, + CVI_FMT_UINT32 = 2, + CVI_FMT_BF16 = 3, + CVI_FMT_INT16 = 4, + CVI_FMT_UINT16 = 5, + CVI_FMT_INT8 = 6, + CVI_FMT_UINT8 = 7 +} CVI_FMT; + +// memory source of Tensor buf. +typedef enum { + CVI_MEM_SYSTEM = 1, + CVI_MEM_DEVICE = 2 +} CVI_MEM_TYPE_E; + +// pixel format +typedef enum { + CVI_NN_PIXEL_RGB_PACKED = 0, + CVI_NN_PIXEL_BGR_PACKED = 1, + CVI_NN_PIXEL_RGB_PLANAR = 2, + CVI_NN_PIXEL_BGR_PLANAR = 3, + CVI_NN_PIXEL_YUV_NV12 = 11, + CVI_NN_PIXEL_YUV_NV21 = 12, + CVI_NN_PIXEL_YUV_420_PLANAR = 13, + CVI_NN_PIXEL_GRAYSCALE = 15, + CVI_NN_PIXEL_TENSOR = 100, + CVI_NN_PIXEL_RGBA_PLANAR = 1000, + // please don't use below values, + // only for backward compatibility + CVI_NN_PIXEL_PLANAR = 101, + CVI_NN_PIXEL_PACKED = 102 +} CVI_NN_PIXEL_FORMAT_E; + +typedef enum { + /* + * bool, default value is false, + * if set to true, runtime will output all tensors as + * output tensors for debugging. + */ + OPTION_OUTPUT_ALL_TENSORS = 4, + /* + * unsigned int, default value is 0, + * set program id, for switch programs in cvimodel + */ + OPTION_PROGRAM_INDEX = 9, + // DEPRECATED + OPTION_BATCH_SIZE = 1, + // DEPRECATED + OPTION_SKIP_POSTPROCESS = 6, + // DEPRECATED + OPTION_PREPARE_BUF_FOR_INPUTS = 2, + // DEPRECATED + OPTION_PREPARE_BUF_FOR_OUTPUTS = 3, + // DEPRECATED + OPTION_SKIP_PREPROCESS = 5, + // DEPRECATED + OPTION_INPUT_MEM_TYPE = 7, + // DEPRECATED + OPTION_OUTPUT_MEM_TYPE = 8 +} CVI_CONFIG_OPTION; + +#define CVI_DIM_MAX (6) +typedef struct { + int32_t dim[CVI_DIM_MAX]; + size_t dim_size; +} CVI_SHAPE; + +typedef struct { + char *name; + CVI_SHAPE shape; + CVI_FMT fmt; + size_t count; + size_t mem_size; + uint8_t *sys_mem; + uint64_t paddr; + CVI_MEM_TYPE_E mem_type; + float qscale; + int zero_point; + CVI_NN_PIXEL_FORMAT_E pixel_format; + bool aligned; + float mean[3]; + float scale[3]; + void *owner; + char reserved[32]; +} CVI_TENSOR; + +typedef CVI_NN_PIXEL_FORMAT_E CVI_FRAME_TYPE; +#define CVI_FRAME_PLANAR CVI_NN_PIXEL_PLANAR +#define CVI_FRAME_PACKAGE CVI_NN_PIXEL_PACKED + +typedef struct { + CVI_FRAME_TYPE type; + CVI_SHAPE shape; + CVI_FMT fmt; + uint32_t stride[3]; + uint64_t pyaddr[3]; +} CVI_VIDEO_FRAME_INFO; + +typedef void *CVI_MODEL_HANDLE; + +typedef int CVI_RC; +/* + * Register a cvimodel file to runtime, and return a model handle. + * @param [in] model_file, file name of cvimodel. 
+ * @param [out] model, handle to registered model.
+ */
+CVI_RC CVI_NN_RegisterModel(const char *model_file, CVI_MODEL_HANDLE *model);
+
+/*
+ * Register a cvimodel file from memory, and return a model handle.
+ * @param [in] buf, buffer to store cvimodel data.
+ * @param [in] size, bytes of cvimodel data.
+ * @param [out] model, handle to registered model.
+ */
+CVI_RC CVI_NN_RegisterModelFromBuffer(const int8_t *buf, uint32_t size, CVI_MODEL_HANDLE *model);
+
+CVI_RC CVI_NN_RegisterModelFromFd(const int fd, const size_t ud_offset, CVI_MODEL_HANDLE *model);
+
+/*
+ * Clone the model pointed to by a previous model handle; this increments
+ * the reference count of the model. The returned handle will share resources
+ * with the previous handle, and save considerable memory.
+ * @param [in] model, previous handle of model
+ * @param [out] cloned, cloned handle of same model.
+ */
+CVI_RC CVI_NN_CloneModel(CVI_MODEL_HANDLE model, CVI_MODEL_HANDLE *cloned);
+
+/*
+ * Get version number of cvimodel.
+ * @param [in] model, previous handle of model
+ * @param [out] major version number.
+ * @param [out] minor version number.
+ */
+CVI_RC CVI_NN_GetModelVersion(CVI_MODEL_HANDLE model, int32_t *major, int32_t *minor);
+
+/*
+ * Get target name of cvimodel.
+ * @param [in] model, previous handle of model
+ * @param [out] target name, cv182x, cv183x
+ */
+const char * CVI_NN_GetModelTarget(CVI_MODEL_HANDLE model);
+
+/*
+ * Set the configuration specified by CVI_CONFIG_OPTION.
+ * This API must be called before GetInputOutputTensors if the user
+ * wants to change the default configuration.
+ * All these configurations only need to be set once.
+ * @param [in] model, handle of model
+ * @param [in] option, option defined in enum CVI_CONFIG_OPTION
+ * @param [in] variant value related to parameter option
+ */
+CVI_RC CVI_NN_SetConfig(CVI_MODEL_HANDLE model, CVI_CONFIG_OPTION option, ...);
+
+/*
+ * Get input and output tensors of the model. It needs to be called before
+ * the Forward/ForwardAsync API.
+ * @param [in] model, handle of model.
+ * @param [out] inputs, array of input tensors.
+ * @param [out] input_num, number of input tensors.
+ * @param [out] outputs, array of output tensors.
+ * @param [out] output_num, number of output tensors.
+ */
+CVI_RC CVI_NN_GetInputOutputTensors(CVI_MODEL_HANDLE model, CVI_TENSOR **inputs,
+    int32_t *input_num, CVI_TENSOR **outputs, int32_t *output_num);
+/*
+ * Inference forwarding in blocking mode.
+ */
+CVI_RC CVI_NN_Forward(CVI_MODEL_HANDLE model, CVI_TENSOR inputs[], int32_t input_num,
+    CVI_TENSOR outputs[], int32_t output_num);
+/*
+ * Inference forwarding in asynchronous mode;
+ * wait for the result by calling ForwardWait.
+ */
+CVI_RC CVI_NN_ForwardAsync(CVI_MODEL_HANDLE model, CVI_TENSOR inputs[], int32_t input_num,
+    CVI_TENSOR outputs[], int32_t output_num, void **task_no);
+/*
+ * Wait for the result after doing inference forward in async mode.
+ */
+CVI_RC CVI_NN_ForwardWait(CVI_MODEL_HANDLE model, void *task_no);
+/*
+ * Decrement the reference count of the model.
+ * It will clean up all resources of the model if the reference count
+ * declines to zero.
+ */
+CVI_RC CVI_NN_CleanupModel(CVI_MODEL_HANDLE model);
+
+///
+/// Helper functions
+///
+CVI_RC CVI_NN_GetInputTensors(CVI_MODEL_HANDLE model, CVI_TENSOR **inputs, int32_t *input_num);
+CVI_RC CVI_NN_GetOutputTensors(CVI_MODEL_HANDLE model, CVI_TENSOR **outputs, int32_t *output_num);
+
+#define CVI_NN_DEFAULT_TENSOR (NULL)
+/*
+ * Get tensor from input or output tensors by name.
+ * @param [in] name, name of the wanted tensor.
+ * If the value is CVI_NN_DEFAULT_TENSOR or NULL, return the first tensor.
+ * It also supports wild-card matching if the name ends with a '*' character.
+ * @param [in] tensors, array of input or output tensors.
+ * @param [in] num, number of input or output tensors.
+ */
+CVI_TENSOR *CVI_NN_GetTensorByName(const char *name, CVI_TENSOR *tensors, int32_t num);
+/*
+ * Get name of tensor.
+ */
+char *CVI_NN_TensorName(CVI_TENSOR *tensor);
+/*
+ * Get buffer pointer of tensor.
+ */
+void *CVI_NN_TensorPtr(CVI_TENSOR *tensor);
+/*
+ * Get byte size of tensor's buffer.
+ * tensor size = tensor count * sizeof(tensor data type)
+ */
+size_t CVI_NN_TensorSize(CVI_TENSOR *tensor);
+/*
+ * Get count of elements stored in tensor.
+ */
+size_t CVI_NN_TensorCount(CVI_TENSOR *tensor);
+/*
+ * Get quant scale to do quantization (fp32 -> int8).
+ */
+float CVI_NN_TensorQuantScale(CVI_TENSOR *tensor);
+/*
+ * Get quant zero point to do asymmetric quantization (fp32 -> int8).
+ */
+int CVI_NN_TensorQuantZeroPoint(CVI_TENSOR *tensor);
+/*
+ * Get shape of a tensor.
+ */
+CVI_SHAPE CVI_NN_TensorShape(CVI_TENSOR *tensor);
+
+/*
+ * Set system memory for tensor.
+ */
+CVI_RC CVI_NN_SetTensorPtr(CVI_TENSOR *tensor, void *mem);
+
+/*
+ * Set physical address for tensor.
+ */
+CVI_RC CVI_NN_SetTensorPhysicalAddr(CVI_TENSOR *tensor, uint64_t paddr);
+
+/*
+ * Do data copy from video frame to tensor.
+ * WARNING, this API is DEPRECATED.
+ */
+CVI_RC CVI_NN_SetTensorWithVideoFrame(
+    CVI_MODEL_HANDLE model, CVI_TENSOR* tensor,
+    CVI_VIDEO_FRAME_INFO* video_frame_info);
+
+/*
+ * Do data copy from video frame to tensor.
+ * WARNING, this API is DEPRECATED.
+ */
+CVI_RC CVI_NN_FeedTensorWithFrames(
+    CVI_MODEL_HANDLE model, CVI_TENSOR *tensor,
+    CVI_FRAME_TYPE type, CVI_FMT format,
+    int32_t channel_num, uint64_t *channel_paddrs,
+    int32_t height, int32_t width, uint32_t height_stride);
+
+/*
+ * Fill frames data from VPSS to tensor.
+ */
+CVI_RC CVI_NN_SetTensorWithAlignedFrames(
+    CVI_TENSOR *tensor, uint64_t frame_paddrs[],
+    int32_t frame_num, CVI_NN_PIXEL_FORMAT_E pixel_format);
+
+/*
+ * Set shared memory size before registering all cvimodels.
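+ * A hypothetical example (the 16 MB pool size is illustrative only):
+ *   CVI_NN_Global_SetSharedMemorySize(16 * 1024 * 1024);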
+ */ +void CVI_NN_Global_SetSharedMemorySize(size_t size); + +#ifdef __cplusplus +} +#endif + +#endif // _CVIRUNTIME_H_ diff --git a/cviruntime/include/cviruntime_context.h b/cviruntime/include/cviruntime_context.h new file mode 100644 index 000000000..330e66fd6 --- /dev/null +++ b/cviruntime/include/cviruntime_context.h @@ -0,0 +1,119 @@ +#ifndef _CVIRUNTIME_CONTEXT_H_ +#define _CVIRUNTIME_CONTEXT_H_ + +#include +#include +#include +#include "cvitpu_debug.h" + +#ifdef __cplusplus + extern "C" { +#endif + +typedef void *CVI_RT_HANDLE; +typedef void *CVI_RT_SHANDLE; +typedef void *CVI_RT_KHANDLE; +typedef void *CVI_RT_MEM; +typedef int CVI_RC; + +typedef struct __CVI_RT_ARRAYBASE { + uint64_t gaddr_base0; + uint64_t gaddr_base1; + uint64_t gaddr_base2; + uint64_t gaddr_base3; + uint64_t gaddr_base4; + uint64_t gaddr_base5; + uint64_t gaddr_base6; + uint64_t gaddr_base7; +} CVI_RT_ARRAYBASE; + +typedef enum { + CVI_ALLOC_WEIGHT = 0, + CVI_ALLOC_PROGRAM = 1, + CVI_ALLOC_NEURON = 2, + CVI_ALLOC_SHARED = 3, + CVI_ALLOC_DMABUF = 4, + CVI_ALLOC_UNKNOWN = 5 +} CVI_ALLOC_TYPE; + +typedef CVI_RT_MEM (*CVI_MEM_ALLOC_CB) (CVI_RT_HANDLE, uint64_t, CVI_ALLOC_TYPE, const char *); +typedef void (*CVI_MEM_FREE_CB) (CVI_RT_HANDLE, CVI_RT_MEM); + +CVI_RC CVI_RT_Init(CVI_RT_HANDLE *rt_handle); +CVI_RC CVI_RT_DeInit(CVI_RT_HANDLE rt_handle); + +CVI_RT_KHANDLE CVI_RT_RegisterKernel(CVI_RT_HANDLE rt_handle, uint32_t cmdbuf_size); +CVI_RC CVI_RT_UnRegisterKernel(CVI_RT_KHANDLE rt_khandle); + +CVI_RC CVI_RT_Submit(CVI_RT_KHANDLE rt_khandle); +CVI_RC CVI_RT_SubmitAsync(CVI_RT_KHANDLE rt_khandle, uint8_t submit_previous); +CVI_RC CVI_RT_WaitForAsync(CVI_RT_KHANDLE rt_khandle); + +CVI_RC CVI_RT_LoadCmdbuf( + CVI_RT_HANDLE rt_handle, uint8_t *cmdbuf, + uint64_t cmdbuf_sz, uint64_t gaddr_base0, + uint64_t gaddr_base1, bool enable_pmu, + CVI_RT_MEM *cmdbuf_mem); +CVI_RC CVI_RT_LoadDmabuf( + CVI_RT_HANDLE rt_handle, CVI_RT_MEM dmabuf, + uint64_t cmdbuf_sz, uint64_t gaddr_base0, + uint64_t gaddr_base1, bool enable_pmu, CVI_RT_MEM *dmabuf_mem); +CVI_RC CVI_RT_RunCmdbuf( + CVI_RT_HANDLE rt_handle, CVI_RT_MEM cmdbuf_mem, + uint64_t gaddr_base2, uint64_t gaddr_base3); +CVI_RC CVI_RT_RunCmdbufEx( + CVI_RT_HANDLE rt_handle, CVI_RT_MEM cmdbuf_mem, + CVI_RT_ARRAYBASE *p_array_base); + +CVI_RC CVI_RT_LoadCmdbufTee( + CVI_RT_HANDLE rt_handle, uint8_t *cmdbuf, + size_t sz, uint64_t neuron_gaddr, uint64_t weight_gaddr, + uint32_t weight_len, CVI_RT_MEM *cmdbuf_mem); + +CVI_RC CVI_RT_RunCmdbufTee( + CVI_RT_HANDLE rt_handle, CVI_RT_MEM cmdbuf_mem, + CVI_RT_ARRAYBASE *p_array_base); + +CVI_RT_MEM CVI_RT_MemAlloc(CVI_RT_HANDLE rt_handle, uint64_t size); +CVI_RT_MEM CVI_RT_MemPreAlloc(CVI_RT_MEM mem, uint64_t offset, uint64_t size); +void CVI_RT_MemFree(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem); +void CVI_RT_MemFreeEx(uint64_t p_addr); +uint64_t CVI_RT_MemGetSize(CVI_RT_MEM mem); +uint64_t CVI_RT_MemGetPAddr(CVI_RT_MEM mem); +uint8_t* CVI_RT_MemGetVAddr(CVI_RT_MEM mem); +int32_t CVI_RT_MemIncRef(CVI_RT_MEM mem); +int32_t CVI_RT_MemDecRef(CVI_RT_MEM mem); + +CVI_RC CVI_RT_MemCopyS2D(CVI_RT_HANDLE rt_handle, CVI_RT_MEM dst, uint8_t* src); +CVI_RC CVI_RT_MemCopyD2S(CVI_RT_HANDLE rt_handle, uint8_t* dst, CVI_RT_MEM src); +CVI_RC CVI_RT_MemCopyS2DEx( + CVI_RT_HANDLE rt_handle, CVI_RT_MEM dst, + uint64_t offset, uint64_t len, uint8_t* src); +CVI_RC CVI_RT_MemFlush(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem); +CVI_RC CVI_RT_MemInvld(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem); +CVI_RC CVI_RT_MemFlushEx(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem, uint64_t 
len); +CVI_RC CVI_RT_MemInvldEx(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem, uint64_t len); + +CVI_RC CVI_RT_ParsePmuBuf(CVI_RT_MEM cmdbuf_mem, uint8_t **buf_start, uint32_t *buf_len); + +CVI_RC CVI_RT_SetBaseReg(CVI_RT_HANDLE rt_handle, uint32_t inx, uint64_t base_addr); + +/* + * set memory alloc and free callback function. + * @param [in] CVI_MEM_ALLOC_CB, memory alloc function + * @param [in] CVI_MEM_FREE_CB, memory free function + */ +CVI_RC CVI_RT_Global_SetMemAllocCallback( + CVI_MEM_ALLOC_CB alloc_cb, CVI_MEM_FREE_CB free_cb); + +/* + * reset to default memory alloc and free function. + */ +void CVI_RT_Global_ResetMemAllocCallback(); + +#ifdef __cplusplus +} +#endif + +#endif // _CVIRUNTIME_CONTEXT_H_ + diff --git a/cviruntime/include/cviruntime_extra.h b/cviruntime/include/cviruntime_extra.h new file mode 100644 index 000000000..1ea26f033 --- /dev/null +++ b/cviruntime/include/cviruntime_extra.h @@ -0,0 +1,48 @@ +/* +* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved. +* +* File Name: cviruntime_extra.h +* Description: +*/ + +#ifndef _CVIRUNTIME_EXTRA_H_ +#define _CVIRUNTIME_EXTRA_H_ + +#include +#include +#include +#include +#include +#include "cvitpu_debug.h" +#include "cviruntime.h" +#include "cviruntime_context.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void* CVI_KFUNC_HANDLE; + +/* + * Create tpu kernel function by given parameters. + */ +CVI_KFUNC_HANDLE CVI_NN_PrepareMatrixMulKernelFunc( + CVI_RT_HANDLE ctx, CVI_FMT fmt, uint32_t m, uint32_t k, uint32_t n); +/* + */ +CVI_KFUNC_HANDLE CVI_NN_PrepareGrayImageLightKernelFunc( + CVI_RT_HANDLE ctx, uint32_t ih, uint32_t iw, uint32_t kernel_sz); +/* + * Run tpu kernel function + */ +CVI_RC CVI_NN_RunKernelFunc(CVI_KFUNC_HANDLE kfun, int32_t mem_num, ...); +/* + * Destroy tpu kernel function + */ +CVI_RC CVI_NN_DestroyKernelFunc(CVI_KFUNC_HANDLE kfun); + +#ifdef __cplusplus +} +#endif + +#endif // _CVIRUNTIME_H_ diff --git a/cviruntime/include/cvitpu_debug.h b/cviruntime/include/cvitpu_debug.h new file mode 100644 index 000000000..014564603 --- /dev/null +++ b/cviruntime/include/cvitpu_debug.h @@ -0,0 +1,179 @@ +#ifndef _CVITPU_DEBUG_H_ +#define _CVITPU_DEBUG_H_ + +#include +#include +#include + +#ifndef CVI_SUCCESS +#define CVI_SUCCESS 0 +#endif +#ifndef CVI_FAILURE +#define CVI_FAILURE -1 +#endif +#define CVI_RC_SUCCESS CVI_SUCCESS // The operation was successful +#define CVI_RC_AGAIN CVI_ERR_TPU_AGAIN // Not ready yet +#define CVI_RC_FAILURE CVI_FAILURE // General failure +#define CVI_RC_TIMEOUT CVI_ERR_TPU_TIMEOUT // Timeout +#define CVI_RC_UNINIT CVI_ERR_TPU_UNINIT // Uninitialzed +#define CVI_RC_INVALID_ARG CVI_ERR_TPU_INVALID_ARG // Arguments invalid +#define CVI_RC_NOMEM CVI_ERR_TPU_NOMEM // Not enough memory +#define CVI_RC_DATA_ERR CVI_ERR_TPU_DATA_ERR // Data error +#define CVI_RC_BUSY CVI_ERR_TPU_BUSY // Busy +#define CVI_RC_UNSUPPORT CVI_ERR_TPU_UNSUPPORT // Not supported yet + + +#define LOG_TOWARD_SYSLOG +#if defined(__i386__) || defined(__x86_64__) +#undef LOG_TOWARD_SYSLOG +#endif + +#ifdef LOG_TOWARD_SYSLOG +#define TPU_LOG_FATAL(...) \ + do { \ + syslog(LOG_LOCAL6|0, __VA_ARGS__); \ + } while (0) + +#define TPU_LOG_ERROR(...) \ + do { \ + syslog(LOG_LOCAL6|3, __VA_ARGS__); \ + } while (0) + +#define TPU_LOG_WARNING(...) \ + do { \ + syslog(LOG_LOCAL6|4, __VA_ARGS__); \ + } while (0) + +#define TPU_LOG_NOTICE(...) \ + do { \ + syslog(LOG_LOCAL6|5, __VA_ARGS__); \ + } while (0) + +#define TPU_LOG_INFO(...) 
\ + do { \ + syslog(LOG_LOCAL6|6, __VA_ARGS__); \ + } while (0) + +#define TPU_LOG_DEBUG(...) \ + do { \ + syslog(LOG_LOCAL6|7, __VA_ARGS__); \ + } while (0) + +#else +#define TPU_LOG_FATAL(...) printf(__VA_ARGS__) +#define TPU_LOG_ERROR(...) printf(__VA_ARGS__) +#define TPU_LOG_WARNING(...) printf(__VA_ARGS__) +#define TPU_LOG_NOTICE(...) printf(__VA_ARGS__) +#define TPU_LOG_INFO(...) printf(__VA_ARGS__) +#define TPU_LOG_DEBUG(...) printf(__VA_ARGS__) +#endif + +#define NDEBUG_ASSERT +#ifdef NDEBUG_ASSERT +#define TPU_ASSERT(condition, message) \ + do { \ + if (!(condition)) { \ + TPU_LOG_ERROR("%s ERROR in %s %d\n", message ? message : "", __FILE__, __LINE__); \ + assert(0); \ + } \ + } while (0) +#else +#define TPU_ASSERT(condition, message) \ + do { \ + assert(condition && message); \ + } while (0) +#endif + +//following referened middleware pre-define +/*******************************************************************************/ +/*|----------------------------------------------------------------|*/ +/*| 11| APP_ID | MOD_ID | ERR_LEVEL | ERR_ID |*/ +/*|----------------------------------------------------------------|*/ +/*|<--><--6bits----><----8bits---><--3bits---><------13bits------->|*/ +/*******************************************************************************/ +#define CVI_TPU_ERR_APPID (0x00000000L) +#define CVI_TPU_RUNTIME 0x77 +#define CVI_TPU_ERR(module, level, errid) \ + ((int)(0xC0000000L | (CVI_TPU_ERR_APPID) | ((module) << 16) | ((level)<<13) | (errid))) + +typedef enum _TPU_ERR_LEVEL_E { + TPU_EN_ERR_LEVEL_DEBUG = 0, /* debug-level */ + TPU_EN_ERR_LEVEL_INFO, /* informational */ + TPU_EN_ERR_LEVEL_NOTICE, /* normal but significant condition */ + TPU_EN_ERR_LEVEL_WARNING, /* warning conditions */ + TPU_EN_ERR_LEVEL_ERROR, /* error conditions */ + TPU_EN_ERR_LEVEL_CRIT, /* critical conditions */ + TPU_EN_ERR_LEVEL_ALERT, /* action must be taken immediately */ + TPU_EN_ERR_LEVEL_FATAL, /* just for compatibility with previous version */ + TPU_EN_ERR_LEVEL_BUTT +} TPU_ERR_LEVEL_E; + +/* NOTE! the following defined all common error code, */ +/*** all module must reserved 0~63 for their common error code*/ +typedef enum _TPU_EN_ERR_CODE_E { + TPU_EN_ERR_INVALID_DEVID = 1, /* invalid device ID */ + TPU_EN_ERR_INVALID_CHNID = 2, /* invalid channel ID*/ + TPU_EN_ERR_ILLEGAL_PARAM = 3, + /* at least one parameter is illegal*/ + /* eg, an illegal enumeration value */ + TPU_EN_ERR_EXIST = 4, /* resource exists*/ + TPU_EN_ERR_UNEXIST = 5, /* resource unexists */ + TPU_EN_ERR_NULL_PTR = 6, /* using a NULL point*/ + TPU_EN_ERR_NOT_CONFIG = 7, + /* try to enable or initialize system, device*/ + /* or channel, before configing attribute*/ + TPU_EN_ERR_NOT_SUPPORT = 8, + /* operation or type is not supported by NOW*/ + TPU_EN_ERR_NOT_PERM = 9, + /* operation is not permitted*/ + /* eg, try to change static attribute*/ + TPU_EN_ERR_INVALID_PIPEID = 10, + /* invalid pipe ID*/ + TPU_EN_ERR_INVALID_GRPID = 11, + /* invalid group ID*/ + TPU_EN_ERR_NOMEM = 12, + /* failure caused by malloc memory*/ + TPU_EN_ERR_NOBUF = 13, + /* failure caused by malloc buffer*/ + TPU_EN_ERR_BUF_EMPTY = 14, + /* no data in buffer */ + TPU_EN_ERR_BUF_FULL = 15, + /* no buffer for new data*/ + TPU_EN_ERR_SYS_NOTREADY = 16, + /* System is not ready, maybe not initialized or*/ + /* loaded. Returning the error code when opening*/ + /* a device file failed.*/ + TPU_EN_ERR_BADADDR = 17, + /* bad address,*/ + /* eg. 
used for copy_from_user & copy_to_user*/ + TPU_EN_ERR_BUSY = 18, + /* resource is busy,*/ + /* eg. destroy a venc chn without unregister it */ + TPU_EN_ERR_SIZE_NOT_ENOUGH = 19, + /* buffer size is smaller than the actual size required */ + TPU_EN_ERR_INVALID_VB = 20, + + /* tpu error code extension */ + TPU_EN_ERR_TIMEOUT = 21, + TPU_EN_ERR_DATAERR = 22, + + /* invalid VB handle */ + TPU_EN_ERR_BUTT = 63, + /* maximum code, private error code of all modules*/ + /* must be greater than it */ +} TPU_EN_ERR_CODE_E; + +typedef enum _CVI_TPU_ERRCODE { + CVI_ERR_TPU_SUCCESS = 0, + CVI_ERR_TPU_AGAIN = CVI_TPU_ERR(CVI_TPU_RUNTIME, TPU_EN_ERR_LEVEL_ERROR, TPU_EN_ERR_SYS_NOTREADY), + CVI_ERR_TPU_FAILURE = -1, + CVI_ERR_TPU_TIMEOUT = CVI_TPU_ERR(CVI_TPU_RUNTIME, TPU_EN_ERR_LEVEL_ERROR, TPU_EN_ERR_TIMEOUT), + CVI_ERR_TPU_UNINIT = CVI_TPU_ERR(CVI_TPU_RUNTIME, TPU_EN_ERR_LEVEL_ERROR, TPU_EN_ERR_NOT_CONFIG), + CVI_ERR_TPU_INVALID_ARG = CVI_TPU_ERR(CVI_TPU_RUNTIME, TPU_EN_ERR_LEVEL_ERROR, TPU_EN_ERR_ILLEGAL_PARAM), + CVI_ERR_TPU_NOMEM = CVI_TPU_ERR(CVI_TPU_RUNTIME, TPU_EN_ERR_LEVEL_ERROR, TPU_EN_ERR_NOMEM), + CVI_ERR_TPU_DATA_ERR = CVI_TPU_ERR(CVI_TPU_RUNTIME, TPU_EN_ERR_LEVEL_ERROR, TPU_EN_ERR_DATAERR), + CVI_ERR_TPU_BUSY = CVI_TPU_ERR(CVI_TPU_RUNTIME, TPU_EN_ERR_LEVEL_ERROR, TPU_EN_ERR_BUSY), + CVI_ERR_TPU_UNSUPPORT = CVI_TPU_ERR(CVI_TPU_RUNTIME, TPU_EN_ERR_LEVEL_ERROR, TPU_EN_ERR_NOT_SUPPORT), +} CVI_TPU_ERRCODE; + +#endif diff --git a/cviruntime/include/lz4/lz4.h b/cviruntime/include/lz4/lz4.h new file mode 100644 index 000000000..7ab1e483a --- /dev/null +++ b/cviruntime/include/lz4/lz4.h @@ -0,0 +1,774 @@ +/* + * LZ4 - Fast LZ compression algorithm + * Header File + * Copyright (C) 2011-present, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 homepage : http://www.lz4.org + - LZ4 source repository : https://github.com/lz4/lz4 +*/ +#if defined (__cplusplus) +extern "C" { +#endif + +#ifndef LZ4_H_2983827168210 +#define LZ4_H_2983827168210 + +/* --- Dependency --- */ +#include /* size_t */ + + +/** + Introduction + + LZ4 is lossless compression algorithm, providing compression speed >500 MB/s per core, + scalable with multi-cores CPU. 
It features an extremely fast decoder, with speed in + multiple GB/s per core, typically reaching RAM speed limits on multi-core systems. + + The LZ4 compression library provides in-memory compression and decompression functions. + It gives full buffer control to user. + Compression can be done in: + - a single step (described as Simple Functions) + - a single step, reusing a context (described in Advanced Functions) + - unbounded multiple steps (described as Streaming compression) + + lz4.h generates and decodes LZ4-compressed blocks (doc/lz4_Block_format.md). + Decompressing such a compressed block requires additional metadata. + Exact metadata depends on exact decompression function. + For the typical case of LZ4_decompress_safe(), + metadata includes block's compressed size, and maximum bound of decompressed size. + Each application is free to encode and pass such metadata in whichever way it wants. + + lz4.h only handle blocks, it can not generate Frames. + + Blocks are different from Frames (doc/lz4_Frame_format.md). + Frames bundle both blocks and metadata in a specified manner. + Embedding metadata is required for compressed data to be self-contained and portable. + Frame format is delivered through a companion API, declared in lz4frame.h. + The `lz4` CLI can only manage frames. +*/ + +/*^*************************************************************** +* Export parameters +*****************************************************************/ +/* +* LZ4_DLL_EXPORT : +* Enable exporting of functions when building a Windows DLL +* LZ4LIB_VISIBILITY : +* Control library symbols visibility. +*/ +#ifndef LZ4LIB_VISIBILITY +# if defined(__GNUC__) && (__GNUC__ >= 4) +# define LZ4LIB_VISIBILITY __attribute__ ((visibility ("default"))) +# else +# define LZ4LIB_VISIBILITY +# endif +#endif +#if defined(LZ4_DLL_EXPORT) && (LZ4_DLL_EXPORT==1) +# define LZ4LIB_API __declspec(dllexport) LZ4LIB_VISIBILITY +#elif defined(LZ4_DLL_IMPORT) && (LZ4_DLL_IMPORT==1) +# define LZ4LIB_API __declspec(dllimport) LZ4LIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +#else +# define LZ4LIB_API LZ4LIB_VISIBILITY +#endif + +/*------ Version ------*/ +#define LZ4_VERSION_MAJOR 1 /* for breaking interface changes */ +#define LZ4_VERSION_MINOR 9 /* for new (non-breaking) interface capabilities */ +#define LZ4_VERSION_RELEASE 3 /* for tweaks, bug-fixes, or development */ + +#define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE) + +#define LZ4_LIB_VERSION LZ4_VERSION_MAJOR.LZ4_VERSION_MINOR.LZ4_VERSION_RELEASE +#define LZ4_QUOTE(str) #str +#define LZ4_EXPAND_AND_QUOTE(str) LZ4_QUOTE(str) +#define LZ4_VERSION_STRING LZ4_EXPAND_AND_QUOTE(LZ4_LIB_VERSION) + +LZ4LIB_API int LZ4_versionNumber (void); /**< library version number; useful to check dll version */ +LZ4LIB_API const char* LZ4_versionString (void); /**< library version string; useful to check dll version */ + + +/*-************************************ +* Tuning parameter +**************************************/ +/*! + * LZ4_MEMORY_USAGE : + * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) + * Increasing memory usage improves compression ratio. + * Reduced memory usage may improve speed, thanks to better cache locality. 
+ * Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache + */ +#ifndef LZ4_MEMORY_USAGE +# define LZ4_MEMORY_USAGE 14 +#endif + + +/*-************************************ +* Simple Functions +**************************************/ +/*! LZ4_compress_default() : + * Compresses 'srcSize' bytes from buffer 'src' + * into already allocated 'dst' buffer of size 'dstCapacity'. + * Compression is guaranteed to succeed if 'dstCapacity' >= LZ4_compressBound(srcSize). + * It also runs faster, so it's a recommended setting. + * If the function cannot compress 'src' into a more limited 'dst' budget, + * compression stops *immediately*, and the function result is zero. + * In which case, 'dst' content is undefined (invalid). + * srcSize : max supported value is LZ4_MAX_INPUT_SIZE. + * dstCapacity : size of buffer 'dst' (which must be already allocated) + * @return : the number of bytes written into buffer 'dst' (necessarily <= dstCapacity) + * or 0 if compression fails + * Note : This function is protected against buffer overflow scenarios (never writes outside 'dst' buffer, nor read outside 'source' buffer). + */ +LZ4LIB_API int LZ4_compress_default(const char* src, char* dst, int srcSize, int dstCapacity); + +/*! LZ4_decompress_safe() : + * compressedSize : is the exact complete size of the compressed block. + * dstCapacity : is the size of destination buffer (which must be already allocated), presumed an upper bound of decompressed size. + * @return : the number of bytes decompressed into destination buffer (necessarily <= dstCapacity) + * If destination buffer is not large enough, decoding will stop and output an error code (negative value). + * If the source stream is detected malformed, the function will stop decoding and return a negative result. + * Note 1 : This function is protected against malicious data packets : + * it will never writes outside 'dst' buffer, nor read outside 'source' buffer, + * even if the compressed block is maliciously modified to order the decoder to do these actions. + * In such case, the decoder stops immediately, and considers the compressed block malformed. + * Note 2 : compressedSize and dstCapacity must be provided to the function, the compressed block does not contain them. + * The implementation is free to send / store / derive this information in whichever way is most beneficial. + * If there is a need for a different format which bundles together both compressed data and its metadata, consider looking at lz4frame.h instead. + */ +LZ4LIB_API int LZ4_decompress_safe (const char* src, char* dst, int compressedSize, int dstCapacity); + + +/*-************************************ +* Advanced Functions +**************************************/ +#define LZ4_MAX_INPUT_SIZE 0x7E000000 /* 2 113 929 216 bytes */ +#define LZ4_COMPRESSBOUND(isize) ((unsigned)(isize) > (unsigned)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16) + +/*! LZ4_compressBound() : + Provides the maximum size that LZ4 compression may output in a "worst case" scenario (input data not compressible) + This function is primarily useful for memory allocation purposes (destination buffer size). + Macro LZ4_COMPRESSBOUND() is also provided for compilation-time evaluation (stack memory allocation for example). 
+ Note that LZ4_compress_default() compresses faster when dstCapacity is >= LZ4_compressBound(srcSize) + inputSize : max supported value is LZ4_MAX_INPUT_SIZE + return : maximum output size in a "worst case" scenario + or 0, if input size is incorrect (too large or negative) +*/ +LZ4LIB_API int LZ4_compressBound(int inputSize); + +/*! LZ4_compress_fast() : + Same as LZ4_compress_default(), but allows selection of "acceleration" factor. + The larger the acceleration value, the faster the algorithm, but also the lesser the compression. + It's a trade-off. It can be fine tuned, with each successive value providing roughly +~3% to speed. + An acceleration value of "1" is the same as regular LZ4_compress_default() + Values <= 0 will be replaced by LZ4_ACCELERATION_DEFAULT (currently == 1, see lz4.c). + Values > LZ4_ACCELERATION_MAX will be replaced by LZ4_ACCELERATION_MAX (currently == 65537, see lz4.c). +*/ +LZ4LIB_API int LZ4_compress_fast (const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); + + +/*! LZ4_compress_fast_extState() : + * Same as LZ4_compress_fast(), using an externally allocated memory space for its state. + * Use LZ4_sizeofState() to know how much memory must be allocated, + * and allocate it on 8-bytes boundaries (using `malloc()` typically). + * Then, provide this buffer as `void* state` to compression function. + */ +LZ4LIB_API int LZ4_sizeofState(void); +LZ4LIB_API int LZ4_compress_fast_extState (void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); + + +/*! LZ4_compress_destSize() : + * Reverse the logic : compresses as much data as possible from 'src' buffer + * into already allocated buffer 'dst', of size >= 'targetDestSize'. + * This function either compresses the entire 'src' content into 'dst' if it's large enough, + * or fill 'dst' buffer completely with as much data as possible from 'src'. + * note: acceleration parameter is fixed to "default". + * + * *srcSizePtr : will be modified to indicate how many bytes where read from 'src' to fill 'dst'. + * New value is necessarily <= input value. + * @return : Nb bytes written into 'dst' (necessarily <= targetDestSize) + * or 0 if compression fails. + * + * Note : from v1.8.2 to v1.9.1, this function had a bug (fixed un v1.9.2+): + * the produced compressed content could, in specific circumstances, + * require to be decompressed into a destination buffer larger + * by at least 1 byte than the content to decompress. + * If an application uses `LZ4_compress_destSize()`, + * it's highly recommended to update liblz4 to v1.9.2 or better. + * If this can't be done or ensured, + * the receiving decompression function should provide + * a dstCapacity which is > decompressedSize, by at least 1 byte. + * See https://github.com/lz4/lz4/issues/859 for details + */ +LZ4LIB_API int LZ4_compress_destSize (const char* src, char* dst, int* srcSizePtr, int targetDstSize); + + +/*! LZ4_decompress_safe_partial() : + * Decompress an LZ4 compressed block, of size 'srcSize' at position 'src', + * into destination buffer 'dst' of size 'dstCapacity'. + * Up to 'targetOutputSize' bytes will be decoded. + * The function stops decoding on reaching this objective. + * This can be useful to boost performance + * whenever only the beginning of a block is required. + * + * @return : the number of bytes decoded in `dst` (necessarily <= targetOutputSize) + * If source stream is detected malformed, function returns a negative result. 
+ * + * Note 1 : @return can be < targetOutputSize, if compressed block contains less data. + * + * Note 2 : targetOutputSize must be <= dstCapacity + * + * Note 3 : this function effectively stops decoding on reaching targetOutputSize, + * so dstCapacity is kind of redundant. + * This is because in older versions of this function, + * decoding operation would still write complete sequences. + * Therefore, there was no guarantee that it would stop writing at exactly targetOutputSize, + * it could write more bytes, though only up to dstCapacity. + * Some "margin" used to be required for this operation to work properly. + * Thankfully, this is no longer necessary. + * The function nonetheless keeps the same signature, in an effort to preserve API compatibility. + * + * Note 4 : If srcSize is the exact size of the block, + * then targetOutputSize can be any value, + * including larger than the block's decompressed size. + * The function will, at most, generate block's decompressed size. + * + * Note 5 : If srcSize is _larger_ than block's compressed size, + * then targetOutputSize **MUST** be <= block's decompressed size. + * Otherwise, *silent corruption will occur*. + */ +LZ4LIB_API int LZ4_decompress_safe_partial (const char* src, char* dst, int srcSize, int targetOutputSize, int dstCapacity); + + +/*-********************************************* +* Streaming Compression Functions +***********************************************/ +typedef union LZ4_stream_u LZ4_stream_t; /* incomplete type (defined later) */ + +LZ4LIB_API LZ4_stream_t* LZ4_createStream(void); +LZ4LIB_API int LZ4_freeStream (LZ4_stream_t* streamPtr); + +/*! LZ4_resetStream_fast() : v1.9.0+ + * Use this to prepare an LZ4_stream_t for a new chain of dependent blocks + * (e.g., LZ4_compress_fast_continue()). + * + * An LZ4_stream_t must be initialized once before usage. + * This is automatically done when created by LZ4_createStream(). + * However, should the LZ4_stream_t be simply declared on stack (for example), + * it's necessary to initialize it first, using LZ4_initStream(). + * + * After init, start any new stream with LZ4_resetStream_fast(). + * A same LZ4_stream_t can be re-used multiple times consecutively + * and compress multiple streams, + * provided that it starts each new stream with LZ4_resetStream_fast(). + * + * LZ4_resetStream_fast() is much faster than LZ4_initStream(), + * but is not compatible with memory regions containing garbage data. + * + * Note: it's only useful to call LZ4_resetStream_fast() + * in the context of streaming compression. + * The *extState* functions perform their own resets. + * Invoking LZ4_resetStream_fast() before is redundant, and even counterproductive. + */ +LZ4LIB_API void LZ4_resetStream_fast (LZ4_stream_t* streamPtr); + +/*! LZ4_loadDict() : + * Use this function to reference a static dictionary into LZ4_stream_t. + * The dictionary must remain available during compression. + * LZ4_loadDict() triggers a reset, so any previous data will be forgotten. + * The same dictionary will have to be loaded on decompression side for successful decoding. + * Dictionary are useful for better compression of small data (KB range). + * While LZ4 accept any input as dictionary, + * results are generally better when using Zstandard's Dictionary Builder. + * Loading a size of 0 is allowed, and is the same as reset. + * @return : loaded dictionary size, in bytes (necessarily <= 64 KB) + */ +LZ4LIB_API int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize); + +/*! 
LZ4_compress_fast_continue() : + * Compress 'src' content using data from previously compressed blocks, for better compression ratio. + * 'dst' buffer must be already allocated. + * If dstCapacity >= LZ4_compressBound(srcSize), compression is guaranteed to succeed, and runs faster. + * + * @return : size of compressed block + * or 0 if there is an error (typically, cannot fit into 'dst'). + * + * Note 1 : Each invocation to LZ4_compress_fast_continue() generates a new block. + * Each block has precise boundaries. + * Each block must be decompressed separately, calling LZ4_decompress_*() with relevant metadata. + * It's not possible to append blocks together and expect a single invocation of LZ4_decompress_*() to decompress them together. + * + * Note 2 : The previous 64KB of source data is __assumed__ to remain present, unmodified, at same address in memory ! + * + * Note 3 : When input is structured as a double-buffer, each buffer can have any size, including < 64 KB. + * Make sure that buffers are separated, by at least one byte. + * This construction ensures that each block only depends on previous block. + * + * Note 4 : If input buffer is a ring-buffer, it can have any size, including < 64 KB. + * + * Note 5 : After an error, the stream status is undefined (invalid), it can only be reset or freed. + */ +LZ4LIB_API int LZ4_compress_fast_continue (LZ4_stream_t* streamPtr, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); + +/*! LZ4_saveDict() : + * If last 64KB data cannot be guaranteed to remain available at its current memory location, + * save it into a safer place (char* safeBuffer). + * This is schematically equivalent to a memcpy() followed by LZ4_loadDict(), + * but is much faster, because LZ4_saveDict() doesn't need to rebuild tables. + * @return : saved dictionary size in bytes (necessarily <= maxDictSize), or 0 if error. + */ +LZ4LIB_API int LZ4_saveDict (LZ4_stream_t* streamPtr, char* safeBuffer, int maxDictSize); + + +/*-********************************************** +* Streaming Decompression Functions +* Bufferless synchronous API +************************************************/ +typedef union LZ4_streamDecode_u LZ4_streamDecode_t; /* tracking context */ + +/*! LZ4_createStreamDecode() and LZ4_freeStreamDecode() : + * creation / destruction of streaming decompression tracking context. + * A tracking context can be re-used multiple times. + */ +LZ4LIB_API LZ4_streamDecode_t* LZ4_createStreamDecode(void); +LZ4LIB_API int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream); + +/*! LZ4_setStreamDecode() : + * An LZ4_streamDecode_t context can be allocated once and re-used multiple times. + * Use this function to start decompression of a new stream of blocks. + * A dictionary can optionally be set. Use NULL or size 0 for a reset order. + * Dictionary is presumed stable : it must remain accessible and unmodified during next decompression. + * @return : 1 if OK, 0 if error + */ +LZ4LIB_API int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize); + +/*! LZ4_decoderRingBufferSize() : v1.8.2+ + * Note : in a ring buffer scenario (optional), + * blocks are presumed decompressed next to each other + * up to the moment there is not enough remaining space for next block (remainingSize < maxBlockSize), + * at which stage it resumes from beginning of ring buffer. 
+ * When setting such a ring buffer for streaming decompression, + * provides the minimum size of this ring buffer + * to be compatible with any source respecting maxBlockSize condition. + * @return : minimum ring buffer size, + * or 0 if there is an error (invalid maxBlockSize). + */ +LZ4LIB_API int LZ4_decoderRingBufferSize(int maxBlockSize); +#define LZ4_DECODER_RING_BUFFER_SIZE(maxBlockSize) (65536 + 14 + (maxBlockSize)) /* for static allocation; maxBlockSize presumed valid */ + +/*! LZ4_decompress_*_continue() : + * These decoding functions allow decompression of consecutive blocks in "streaming" mode. + * A block is an unsplittable entity, it must be presented entirely to a decompression function. + * Decompression functions only accepts one block at a time. + * The last 64KB of previously decoded data *must* remain available and unmodified at the memory position where they were decoded. + * If less than 64KB of data has been decoded, all the data must be present. + * + * Special : if decompression side sets a ring buffer, it must respect one of the following conditions : + * - Decompression buffer size is _at least_ LZ4_decoderRingBufferSize(maxBlockSize). + * maxBlockSize is the maximum size of any single block. It can have any value > 16 bytes. + * In which case, encoding and decoding buffers do not need to be synchronized. + * Actually, data can be produced by any source compliant with LZ4 format specification, and respecting maxBlockSize. + * - Synchronized mode : + * Decompression buffer size is _exactly_ the same as compression buffer size, + * and follows exactly same update rule (block boundaries at same positions), + * and decoding function is provided with exact decompressed size of each block (exception for last block of the stream), + * _then_ decoding & encoding ring buffer can have any size, including small ones ( < 64 KB). + * - Decompression buffer is larger than encoding buffer, by a minimum of maxBlockSize more bytes. + * In which case, encoding and decoding buffers do not need to be synchronized, + * and encoding ring buffer can have any size, including small ones ( < 64 KB). + * + * Whenever these conditions are not possible, + * save the last 64KB of decoded data into a safe buffer where it can't be modified during decompression, + * then indicate where this data is saved using LZ4_setStreamDecode(), before decompressing next block. +*/ +LZ4LIB_API int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* src, char* dst, int srcSize, int dstCapacity); + + +/*! LZ4_decompress_*_usingDict() : + * These decoding functions work the same as + * a combination of LZ4_setStreamDecode() followed by LZ4_decompress_*_continue() + * They are stand-alone, and don't need an LZ4_streamDecode_t structure. + * Dictionary is presumed stable : it must remain accessible and unmodified during decompression. + * Performance tip : Decompression speed can be substantially increased + * when dst == dictStart + dictSize. + */ +LZ4LIB_API int LZ4_decompress_safe_usingDict (const char* src, char* dst, int srcSize, int dstCapcity, const char* dictStart, int dictSize); + +#endif /* LZ4_H_2983827168210 */ + + +/*^************************************* + * !!!!!! STATIC LINKING ONLY !!!!!! + ***************************************/ + +/*-**************************************************************************** + * Experimental section + * + * Symbols declared in this section must be considered unstable. 
Their + * signatures or semantics may change, or they may be removed altogether in the + * future. They are therefore only safe to depend on when the caller is + * statically linked against the library. + * + * To protect against unsafe usage, not only are the declarations guarded, + * the definitions are hidden by default + * when building LZ4 as a shared/dynamic library. + * + * In order to access these declarations, + * define LZ4_STATIC_LINKING_ONLY in your application + * before including LZ4's headers. + * + * In order to make their implementations accessible dynamically, you must + * define LZ4_PUBLISH_STATIC_FUNCTIONS when building the LZ4 library. + ******************************************************************************/ + +#ifdef LZ4_STATIC_LINKING_ONLY + +#ifndef LZ4_STATIC_3504398509 +#define LZ4_STATIC_3504398509 + +#ifdef LZ4_PUBLISH_STATIC_FUNCTIONS +#define LZ4LIB_STATIC_API LZ4LIB_API +#else +#define LZ4LIB_STATIC_API +#endif + + +/*! LZ4_compress_fast_extState_fastReset() : + * A variant of LZ4_compress_fast_extState(). + * + * Using this variant avoids an expensive initialization step. + * It is only safe to call if the state buffer is known to be correctly initialized already + * (see above comment on LZ4_resetStream_fast() for a definition of "correctly initialized"). + * From a high level, the difference is that + * this function initializes the provided state with a call to something like LZ4_resetStream_fast() + * while LZ4_compress_fast_extState() starts with a call to LZ4_resetStream(). + */ +LZ4LIB_STATIC_API int LZ4_compress_fast_extState_fastReset (void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration); + +/*! LZ4_attach_dictionary() : + * This is an experimental API that allows + * efficient use of a static dictionary many times. + * + * Rather than re-loading the dictionary buffer into a working context before + * each compression, or copying a pre-loaded dictionary's LZ4_stream_t into a + * working LZ4_stream_t, this function introduces a no-copy setup mechanism, + * in which the working stream references the dictionary stream in-place. + * + * Several assumptions are made about the state of the dictionary stream. + * Currently, only streams which have been prepared by LZ4_loadDict() should + * be expected to work. + * + * Alternatively, the provided dictionaryStream may be NULL, + * in which case any existing dictionary stream is unset. + * + * If a dictionary is provided, it replaces any pre-existing stream history. + * The dictionary contents are the only history that can be referenced and + * logically immediately precede the data compressed in the first subsequent + * compression call. + * + * The dictionary will only remain attached to the working stream through the + * first compression call, at the end of which it is cleared. The dictionary + * stream (and source buffer) must remain in-place / accessible / unchanged + * through the completion of the first compression call on the stream. + */ +LZ4LIB_STATIC_API void LZ4_attach_dictionary(LZ4_stream_t* workingStream, const LZ4_stream_t* dictionaryStream); + + +/*! In-place compression and decompression + * + * It's possible to have input and output sharing the same buffer, + * for highly contrained memory environments. + * In both cases, it requires input to lay at the end of the buffer, + * and decompression to start at beginning of the buffer. + * Buffer size must feature some margin, hence be larger than final size. 
+ * + * |<------------------------buffer--------------------------------->| + * |<-----------compressed data--------->| + * |<-----------decompressed size------------------>| + * |<----margin---->| + * + * This technique is more useful for decompression, + * since decompressed size is typically larger, + * and margin is short. + * + * In-place decompression will work inside any buffer + * which size is >= LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decompressedSize). + * This presumes that decompressedSize > compressedSize. + * Otherwise, it means compression actually expanded data, + * and it would be more efficient to store such data with a flag indicating it's not compressed. + * This can happen when data is not compressible (already compressed, or encrypted). + * + * For in-place compression, margin is larger, as it must be able to cope with both + * history preservation, requiring input data to remain unmodified up to LZ4_DISTANCE_MAX, + * and data expansion, which can happen when input is not compressible. + * As a consequence, buffer size requirements are much higher, + * and memory savings offered by in-place compression are more limited. + * + * There are ways to limit this cost for compression : + * - Reduce history size, by modifying LZ4_DISTANCE_MAX. + * Note that it is a compile-time constant, so all compressions will apply this limit. + * Lower values will reduce compression ratio, except when input_size < LZ4_DISTANCE_MAX, + * so it's a reasonable trick when inputs are known to be small. + * - Require the compressor to deliver a "maximum compressed size". + * This is the `dstCapacity` parameter in `LZ4_compress*()`. + * When this size is < LZ4_COMPRESSBOUND(inputSize), then compression can fail, + * in which case, the return code will be 0 (zero). + * The caller must be ready for these cases to happen, + * and typically design a backup scheme to send data uncompressed. + * The combination of both techniques can significantly reduce + * the amount of margin required for in-place compression. + * + * In-place compression can work in any buffer + * which size is >= (maxCompressedSize) + * with maxCompressedSize == LZ4_COMPRESSBOUND(srcSize) for guaranteed compression success. + * LZ4_COMPRESS_INPLACE_BUFFER_SIZE() depends on both maxCompressedSize and LZ4_DISTANCE_MAX, + * so it's possible to reduce memory requirements by playing with them. + */ + +#define LZ4_DECOMPRESS_INPLACE_MARGIN(compressedSize) (((compressedSize) >> 8) + 32) +#define LZ4_DECOMPRESS_INPLACE_BUFFER_SIZE(decompressedSize) ((decompressedSize) + LZ4_DECOMPRESS_INPLACE_MARGIN(decompressedSize)) /**< note: presumes that compressedSize < decompressedSize. 
note2: margin is overestimated a bit, since it could use compressedSize instead */ + +#ifndef LZ4_DISTANCE_MAX /* history window size; can be user-defined at compile time */ +# define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */ +#endif + +#define LZ4_COMPRESS_INPLACE_MARGIN (LZ4_DISTANCE_MAX + 32) /* LZ4_DISTANCE_MAX can be safely replaced by srcSize when it's smaller */ +#define LZ4_COMPRESS_INPLACE_BUFFER_SIZE(maxCompressedSize) ((maxCompressedSize) + LZ4_COMPRESS_INPLACE_MARGIN) /**< maxCompressedSize is generally LZ4_COMPRESSBOUND(inputSize), but can be set to any lower value, with the risk that compression can fail (return code 0(zero)) */ + +#endif /* LZ4_STATIC_3504398509 */ +#endif /* LZ4_STATIC_LINKING_ONLY */ + + + +#ifndef LZ4_H_98237428734687 +#define LZ4_H_98237428734687 + +/*-************************************************************ + * Private Definitions + ************************************************************** + * Do not use these definitions directly. + * They are only exposed to allow static allocation of `LZ4_stream_t` and `LZ4_streamDecode_t`. + * Accessing members will expose user code to API and/or ABI break in future versions of the library. + **************************************************************/ +#define LZ4_HASHLOG (LZ4_MEMORY_USAGE-2) +#define LZ4_HASHTABLESIZE (1 << LZ4_MEMORY_USAGE) +#define LZ4_HASH_SIZE_U32 (1 << LZ4_HASHLOG) /* required as macro for static allocation */ + +#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# include + typedef int8_t LZ4_i8; + typedef uint8_t LZ4_byte; + typedef uint16_t LZ4_u16; + typedef uint32_t LZ4_u32; +#else + typedef signed char LZ4_i8; + typedef unsigned char LZ4_byte; + typedef unsigned short LZ4_u16; + typedef unsigned int LZ4_u32; +#endif + +typedef struct LZ4_stream_t_internal LZ4_stream_t_internal; +struct LZ4_stream_t_internal { + LZ4_u32 hashTable[LZ4_HASH_SIZE_U32]; + LZ4_u32 currentOffset; + LZ4_u32 tableType; + const LZ4_byte* dictionary; + const LZ4_stream_t_internal* dictCtx; + LZ4_u32 dictSize; +}; + +typedef struct { + const LZ4_byte* externalDict; + size_t extDictSize; + const LZ4_byte* prefixEnd; + size_t prefixSize; +} LZ4_streamDecode_t_internal; + + +/*! LZ4_stream_t : + * Do not use below internal definitions directly ! + * Declare or allocate an LZ4_stream_t instead. + * LZ4_stream_t can also be created using LZ4_createStream(), which is recommended. + * The structure definition can be convenient for static allocation + * (on stack, or as part of larger structure). + * Init this structure with LZ4_initStream() before first use. + * note : only use this definition in association with static linking ! + * this definition is not API/ABI safe, and may change in future versions. + */ +#define LZ4_STREAMSIZE 16416 /* static size, for inter-version compatibility */ +#define LZ4_STREAMSIZE_VOIDP (LZ4_STREAMSIZE / sizeof(void*)) +union LZ4_stream_u { + void* table[LZ4_STREAMSIZE_VOIDP]; + LZ4_stream_t_internal internal_donotuse; +}; /* previously typedef'd to LZ4_stream_t */ + + +/*! LZ4_initStream() : v1.9.0+ + * An LZ4_stream_t structure must be initialized at least once. + * This is automatically done when invoking LZ4_createStream(), + * but it's not when the structure is simply declared on stack (for example). + * + * Use LZ4_initStream() to properly initialize a newly declared LZ4_stream_t. 
+ * It can also initialize any arbitrary buffer of sufficient size, + * and will @return a pointer of proper type upon initialization. + * + * Note : initialization fails if size and alignment conditions are not respected. + * In which case, the function will @return NULL. + * Note2: An LZ4_stream_t structure guarantees correct alignment and size. + * Note3: Before v1.9.0, use LZ4_resetStream() instead + */ +LZ4LIB_API LZ4_stream_t* LZ4_initStream (void* buffer, size_t size); + + +/*! LZ4_streamDecode_t : + * information structure to track an LZ4 stream during decompression. + * init this structure using LZ4_setStreamDecode() before first use. + * note : only use in association with static linking ! + * this definition is not API/ABI safe, + * and may change in a future version ! + */ +#define LZ4_STREAMDECODESIZE_U64 (4 + ((sizeof(void*)==16) ? 2 : 0) /*AS-400*/ ) +#define LZ4_STREAMDECODESIZE (LZ4_STREAMDECODESIZE_U64 * sizeof(unsigned long long)) +union LZ4_streamDecode_u { + unsigned long long table[LZ4_STREAMDECODESIZE_U64]; + LZ4_streamDecode_t_internal internal_donotuse; +} ; /* previously typedef'd to LZ4_streamDecode_t */ + + + +/*-************************************ +* Obsolete Functions +**************************************/ + +/*! Deprecation warnings + * + * Deprecated functions make the compiler generate a warning when invoked. + * This is meant to invite users to update their source code. + * Should deprecation warnings be a problem, it is generally possible to disable them, + * typically with -Wno-deprecated-declarations for gcc + * or _CRT_SECURE_NO_WARNINGS in Visual. + * + * Another method is to define LZ4_DISABLE_DEPRECATE_WARNINGS + * before including the header file. + */ +#ifdef LZ4_DISABLE_DEPRECATE_WARNINGS +# define LZ4_DEPRECATED(message) /* disable deprecation warnings */ +#else +# if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */ +# define LZ4_DEPRECATED(message) [[deprecated(message)]] +# elif defined(_MSC_VER) +# define LZ4_DEPRECATED(message) __declspec(deprecated(message)) +# elif defined(__clang__) || (defined(__GNUC__) && (__GNUC__ * 10 + __GNUC_MINOR__ >= 45)) +# define LZ4_DEPRECATED(message) __attribute__((deprecated(message))) +# elif defined(__GNUC__) && (__GNUC__ * 10 + __GNUC_MINOR__ >= 31) +# define LZ4_DEPRECATED(message) __attribute__((deprecated)) +# else +# pragma message("WARNING: LZ4_DEPRECATED needs custom implementation for this compiler") +# define LZ4_DEPRECATED(message) /* disabled */ +# endif +#endif /* LZ4_DISABLE_DEPRECATE_WARNINGS */ + +/*! 
Obsolete compression functions (since v1.7.3) */ +LZ4_DEPRECATED("use LZ4_compress_default() instead") LZ4LIB_API int LZ4_compress (const char* src, char* dest, int srcSize); +LZ4_DEPRECATED("use LZ4_compress_default() instead") LZ4LIB_API int LZ4_compress_limitedOutput (const char* src, char* dest, int srcSize, int maxOutputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") LZ4LIB_API int LZ4_compress_withState (void* state, const char* source, char* dest, int inputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_extState() instead") LZ4LIB_API int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") LZ4LIB_API int LZ4_compress_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize); +LZ4_DEPRECATED("use LZ4_compress_fast_continue() instead") LZ4LIB_API int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize, int maxOutputSize); + +/*! Obsolete decompression functions (since v1.8.0) */ +LZ4_DEPRECATED("use LZ4_decompress_fast() instead") LZ4LIB_API int LZ4_uncompress (const char* source, char* dest, int outputSize); +LZ4_DEPRECATED("use LZ4_decompress_safe() instead") LZ4LIB_API int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize); + +/* Obsolete streaming functions (since v1.7.0) + * degraded functionality; do not use! + * + * In order to perform streaming compression, these functions depended on data + * that is no longer tracked in the state. They have been preserved as well as + * possible: using them will still produce a correct output. However, they don't + * actually retain any history between compression calls. The compression ratio + * achieved will therefore be no better than compressing each chunk + * independently. + */ +LZ4_DEPRECATED("Use LZ4_createStream() instead") LZ4LIB_API void* LZ4_create (char* inputBuffer); +LZ4_DEPRECATED("Use LZ4_createStream() instead") LZ4LIB_API int LZ4_sizeofStreamState(void); +LZ4_DEPRECATED("Use LZ4_resetStream() instead") LZ4LIB_API int LZ4_resetStreamState(void* state, char* inputBuffer); +LZ4_DEPRECATED("Use LZ4_saveDict() instead") LZ4LIB_API char* LZ4_slideInputBuffer (void* state); + +/*! Obsolete streaming decoding functions (since v1.7.0) */ +LZ4_DEPRECATED("use LZ4_decompress_safe_usingDict() instead") LZ4LIB_API int LZ4_decompress_safe_withPrefix64k (const char* src, char* dst, int compressedSize, int maxDstSize); +LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") LZ4LIB_API int LZ4_decompress_fast_withPrefix64k (const char* src, char* dst, int originalSize); + +/*! Obsolete LZ4_decompress_fast variants (since v1.9.0) : + * These functions used to be faster than LZ4_decompress_safe(), + * but this is no longer the case. They are now slower. + * This is because LZ4_decompress_fast() doesn't know the input size, + * and therefore must progress more cautiously into the input buffer to not read beyond the end of block. + * On top of that `LZ4_decompress_fast()` is not protected vs malformed or malicious inputs, making it a security liability. + * As a consequence, LZ4_decompress_fast() is strongly discouraged, and deprecated. + * + * The last remaining LZ4_decompress_fast() specificity is that + * it can decompress a block without knowing its compressed size. 
+ * Such functionality can be achieved in a more secure manner + * by employing LZ4_decompress_safe_partial(). + * + * Parameters: + * originalSize : is the uncompressed size to regenerate. + * `dst` must be already allocated, its size must be >= 'originalSize' bytes. + * @return : number of bytes read from source buffer (== compressed size). + * The function expects to finish at block's end exactly. + * If the source stream is detected malformed, the function stops decoding and returns a negative result. + * note : LZ4_decompress_fast*() requires originalSize. Thanks to this information, it never writes past the output buffer. + * However, since it doesn't know its 'src' size, it may read an unknown amount of input, past input buffer bounds. + * Also, since match offsets are not validated, match reads from 'src' may underflow too. + * These issues never happen if input (compressed) data is correct. + * But they may happen if input data is invalid (error or intentional tampering). + * As a consequence, use these functions in trusted environments with trusted data **only**. + */ +LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe() instead") +LZ4LIB_API int LZ4_decompress_fast (const char* src, char* dst, int originalSize); +LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe_continue() instead") +LZ4LIB_API int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* src, char* dst, int originalSize); +LZ4_DEPRECATED("This function is deprecated and unsafe. Consider using LZ4_decompress_safe_usingDict() instead") +LZ4LIB_API int LZ4_decompress_fast_usingDict (const char* src, char* dst, int originalSize, const char* dictStart, int dictSize); + +/*! LZ4_resetStream() : + * An LZ4_stream_t structure must be initialized at least once. + * This is done with LZ4_initStream(), or LZ4_resetStream(). + * Consider switching to LZ4_initStream(), + * invoking LZ4_resetStream() will trigger deprecation warnings in the future. + */ +LZ4LIB_API void LZ4_resetStream (LZ4_stream_t* streamPtr); + + +#endif /* LZ4_H_98237428734687 */ + + +#if defined (__cplusplus) +} +#endif diff --git a/cviruntime/include/lz4/lz4frame.h b/cviruntime/include/lz4/lz4frame.h new file mode 100644 index 000000000..4573317ef --- /dev/null +++ b/cviruntime/include/lz4/lz4frame.h @@ -0,0 +1,623 @@ +/* + LZ4 auto-framing library + Header File + Copyright (C) 2011-2017, Yann Collet. + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+  You can contact the author at :
+  - LZ4 source repository : https://github.com/lz4/lz4
+  - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+
+/* LZ4F is a stand-alone API able to create and decode LZ4 frames
+ * conformant with specification v1.6.1 in doc/lz4_Frame_format.md .
+ * Generated frames are compatible with `lz4` CLI.
+ *
+ * LZ4F also offers streaming capabilities.
+ *
+ * lz4.h is not required when using lz4frame.h,
+ * except to extract common constants such as LZ4_VERSION_NUMBER.
+ * */
+
+#ifndef LZ4F_H_09782039843
+#define LZ4F_H_09782039843
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* ---   Dependency   --- */
+#include <stddef.h>   /* size_t */
+
+
+/**
+  Introduction
+
+  lz4frame.h implements LZ4 frame specification (doc/lz4_Frame_format.md).
+  lz4frame.h provides frame compression functions that take care
+  of encoding standard metadata alongside LZ4-compressed blocks.
+*/
+
+/*-***************************************************************
+ *  Compiler specifics
+ *****************************************************************/
+/*  LZ4_DLL_EXPORT :
+ *  Enable exporting of functions when building a Windows DLL
+ *  LZ4FLIB_VISIBILITY :
+ *  Control library symbols visibility.
+ */
+#ifndef LZ4FLIB_VISIBILITY
+#  if defined(__GNUC__) && (__GNUC__ >= 4)
+#    define LZ4FLIB_VISIBILITY __attribute__ ((visibility ("default")))
+#  else
+#    define LZ4FLIB_VISIBILITY
+#  endif
+#endif
+#if defined(LZ4_DLL_EXPORT) && (LZ4_DLL_EXPORT==1)
+#  define LZ4FLIB_API __declspec(dllexport) LZ4FLIB_VISIBILITY
+#elif defined(LZ4_DLL_IMPORT) && (LZ4_DLL_IMPORT==1)
+#  define LZ4FLIB_API __declspec(dllimport) LZ4FLIB_VISIBILITY
+#else
+#  define LZ4FLIB_API LZ4FLIB_VISIBILITY
+#endif
+
+#ifdef LZ4F_DISABLE_DEPRECATE_WARNINGS
+#  define LZ4F_DEPRECATE(x) x
+#else
+#  if defined(_MSC_VER)
+#    define LZ4F_DEPRECATE(x) x   /* __declspec(deprecated) x - only works with C++ */
+#  elif defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 6))
+#    define LZ4F_DEPRECATE(x) x __attribute__((deprecated))
+#  else
+#    define LZ4F_DEPRECATE(x) x   /* no deprecation warning for this compiler */
+#  endif
+#endif
+
+
+/*-************************************
+ *  Error management
+ **************************************/
+typedef size_t LZ4F_errorCode_t;
+
+LZ4FLIB_API unsigned    LZ4F_isError(LZ4F_errorCode_t code);        /**< tells when a function result is an error code */
+LZ4FLIB_API const char* LZ4F_getErrorName(LZ4F_errorCode_t code);   /**< return error code string; for debugging */
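+
+/* Typical checking idiom (an illustrative sketch, not part of the original
+ * header; `result` stands for the return value of any LZ4F function) :
+ *     size_t const result = ...;
+ *     if (LZ4F_isError(result))
+ *         printf("lz4frame error: %s\n", LZ4F_getErrorName(result));
+ */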
+
+
+/*-************************************
+ *  Frame compression types
+ ************************************* */
+/* #define LZ4F_ENABLE_OBSOLETE_ENUMS   // uncomment to enable obsolete enums */
+#ifdef LZ4F_ENABLE_OBSOLETE_ENUMS
+#  define LZ4F_OBSOLETE_ENUM(x) , LZ4F_DEPRECATE(x) = LZ4F_##x
+#else
+#  define LZ4F_OBSOLETE_ENUM(x)
+#endif
+
+/* The larger the block size, the (slightly) better the compression ratio,
+ * though there are diminishing returns.
+ * Larger blocks also increase memory usage on both compression and decompression sides.
+ */
+typedef enum {
+    LZ4F_default=0,
+    LZ4F_max64KB=4,
+    LZ4F_max256KB=5,
+    LZ4F_max1MB=6,
+    LZ4F_max4MB=7
+    LZ4F_OBSOLETE_ENUM(max64KB)
+    LZ4F_OBSOLETE_ENUM(max256KB)
+    LZ4F_OBSOLETE_ENUM(max1MB)
+    LZ4F_OBSOLETE_ENUM(max4MB)
+} LZ4F_blockSizeID_t;
+
+/* Linked blocks sharply reduce inefficiencies when using small blocks:
+ * they compress better.
+ * However, some LZ4 decoders are only compatible with independent blocks */
+typedef enum {
+    LZ4F_blockLinked=0,
+    LZ4F_blockIndependent
+    LZ4F_OBSOLETE_ENUM(blockLinked)
+    LZ4F_OBSOLETE_ENUM(blockIndependent)
+} LZ4F_blockMode_t;
+
+typedef enum {
+    LZ4F_noContentChecksum=0,
+    LZ4F_contentChecksumEnabled
+    LZ4F_OBSOLETE_ENUM(noContentChecksum)
+    LZ4F_OBSOLETE_ENUM(contentChecksumEnabled)
+} LZ4F_contentChecksum_t;
+
+typedef enum {
+    LZ4F_noBlockChecksum=0,
+    LZ4F_blockChecksumEnabled
+} LZ4F_blockChecksum_t;
+
+typedef enum {
+    LZ4F_frame=0,
+    LZ4F_skippableFrame
+    LZ4F_OBSOLETE_ENUM(skippableFrame)
+} LZ4F_frameType_t;
+
+#ifdef LZ4F_ENABLE_OBSOLETE_ENUMS
+typedef LZ4F_blockSizeID_t blockSizeID_t;
+typedef LZ4F_blockMode_t blockMode_t;
+typedef LZ4F_frameType_t frameType_t;
+typedef LZ4F_contentChecksum_t contentChecksum_t;
+#endif
+
+/*! LZ4F_frameInfo_t :
+ *  makes it possible to set or read frame parameters.
+ *  The structure must first be initialized to 0, using memset() or LZ4F_INIT_FRAMEINFO,
+ *  which sets all parameters to default.
+ *  It's then possible to update selectively some parameters */
+typedef struct {
+  LZ4F_blockSizeID_t     blockSizeID;         /* max64KB, max256KB, max1MB, max4MB; 0 == default */
+  LZ4F_blockMode_t       blockMode;           /* LZ4F_blockLinked, LZ4F_blockIndependent; 0 == default */
+  LZ4F_contentChecksum_t contentChecksumFlag; /* 1: frame terminated with 32-bit checksum of decompressed data; 0: disabled (default) */
+  LZ4F_frameType_t       frameType;           /* read-only field : LZ4F_frame or LZ4F_skippableFrame */
+  unsigned long long     contentSize;         /* Size of uncompressed content ; 0 == unknown */
+  unsigned               dictID;              /* Dictionary ID, sent by compressor to help decoder select correct dictionary; 0 == no dictID provided */
+  LZ4F_blockChecksum_t   blockChecksumFlag;   /* 1: each block followed by a checksum of block's compressed data; 0: disabled (default) */
+} LZ4F_frameInfo_t;
+
+#define LZ4F_INIT_FRAMEINFO { LZ4F_default, LZ4F_blockLinked, LZ4F_noContentChecksum, LZ4F_frame, 0ULL, 0U, LZ4F_noBlockChecksum }   /* v1.8.3+ */
+
+/*! LZ4F_preferences_t :
+ *  makes it possible to supply advanced compression instructions to streaming interface.
+ *  The structure must first be initialized to 0, using memset() or LZ4F_INIT_PREFERENCES,
+ *  which sets all parameters to default.
+ *  All reserved fields must be set to zero. */
+typedef struct {
+  LZ4F_frameInfo_t frameInfo;
+  int      compressionLevel;    /* 0: default (fast mode); values > LZ4HC_CLEVEL_MAX count as LZ4HC_CLEVEL_MAX; values < 0 trigger "fast acceleration" */
+  unsigned autoFlush;           /* 1: always flush; reduces usage of internal buffers */
+  unsigned favorDecSpeed;       /* 1: parser favors decompression speed vs compression ratio.
Only works for high compression modes (>= LZ4HC_CLEVEL_OPT_MIN) */   /* v1.8.2+ */
+  unsigned reserved[3];         /* must be zero for forward compatibility */
+} LZ4F_preferences_t;
+
+#define LZ4F_INIT_PREFERENCES { LZ4F_INIT_FRAMEINFO, 0, 0u, 0u, { 0u, 0u, 0u } }   /* v1.8.3+ */
+
+
+/*-*********************************
+*  Simple compression function
+***********************************/
+
+LZ4FLIB_API int LZ4F_compressionLevel_max(void);   /* v1.8.0+ */
+
+/*! LZ4F_compressFrameBound() :
+ *  Returns the maximum possible compressed size with LZ4F_compressFrame() given srcSize and preferences.
+ * `preferencesPtr` is optional. It can be replaced by NULL, in which case, the function will assume default preferences.
+ *  Note : this result is only usable with LZ4F_compressFrame().
+ *         It may also be used with LZ4F_compressUpdate() _if no flush() operation_ is performed.
+ */
+LZ4FLIB_API size_t LZ4F_compressFrameBound(size_t srcSize, const LZ4F_preferences_t* preferencesPtr);
+
+/*! LZ4F_compressFrame() :
+ *  Compress an entire srcBuffer into a valid LZ4 frame.
+ *  dstCapacity MUST be >= LZ4F_compressFrameBound(srcSize, preferencesPtr).
+ *  The LZ4F_preferences_t structure is optional : you can provide NULL as argument. All preferences will be set to default.
+ * @return : number of bytes written into dstBuffer.
+ *           or an error code if it fails (can be tested using LZ4F_isError())
+ */
+LZ4FLIB_API size_t LZ4F_compressFrame(void* dstBuffer, size_t dstCapacity,
+                                      const void* srcBuffer, size_t srcSize,
+                                      const LZ4F_preferences_t* preferencesPtr);
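+
+/* One-shot usage sketch combining the two functions above (illustrative only,
+ * not part of the original header; `src`, `srcSize` and `handle_error` are
+ * hypothetical, and error handling is abbreviated) :
+ *     size_t const bound = LZ4F_compressFrameBound(srcSize, NULL);
+ *     void* const dst = malloc(bound);
+ *     size_t const n = LZ4F_compressFrame(dst, bound, src, srcSize, NULL);
+ *     if (LZ4F_isError(n)) handle_error(LZ4F_getErrorName(n));
+ */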
+
+
+/*-***********************************
+*  Advanced compression functions
+*************************************/
+typedef struct LZ4F_cctx_s LZ4F_cctx;   /* incomplete type */
+typedef LZ4F_cctx* LZ4F_compressionContext_t;   /* for compatibility with previous API version */
+
+typedef struct {
+  unsigned stableSrc;    /* 1 == src content will remain present on future calls to LZ4F_compress(); skip copying src content within tmp buffer */
+  unsigned reserved[3];
+} LZ4F_compressOptions_t;
+
+/*---   Resource Management   ---*/
+
+#define LZ4F_VERSION 100   /* This number can be used to check for an incompatible API breaking change */
+LZ4FLIB_API unsigned LZ4F_getVersion(void);
+
+/*! LZ4F_createCompressionContext() :
+ *  The first thing to do is to create a compressionContext object, which will be used in all compression operations.
+ *  This is achieved using LZ4F_createCompressionContext(), which takes as argument a version.
+ *  The version provided MUST be LZ4F_VERSION. It is intended to track potential version mismatch, notably when using DLL.
+ *  The function will provide a pointer to a fully allocated LZ4F_cctx object.
+ *  If @return != zero, there was an error during context creation.
+ *  Object can release its memory using LZ4F_freeCompressionContext();
+ */
+LZ4FLIB_API LZ4F_errorCode_t LZ4F_createCompressionContext(LZ4F_cctx** cctxPtr, unsigned version);
+LZ4FLIB_API LZ4F_errorCode_t LZ4F_freeCompressionContext(LZ4F_cctx* cctx);
+
+
+/*----    Compression    ----*/
+
+#define LZ4F_HEADER_SIZE_MIN  7   /* LZ4 Frame header size can vary, depending on selected parameters */
+#define LZ4F_HEADER_SIZE_MAX 19
+
+/* Size in bytes of a block header in little-endian format. Highest bit indicates if block data is uncompressed */
+#define LZ4F_BLOCK_HEADER_SIZE 4
+
+/* Size in bytes of a block checksum footer in little-endian format. */
+#define LZ4F_BLOCK_CHECKSUM_SIZE 4
+
+/* Size in bytes of the content checksum. */
+#define LZ4F_CONTENT_CHECKSUM_SIZE 4
+
+/*! LZ4F_compressBegin() :
+ *  will write the frame header into dstBuffer.
+ *  dstCapacity must be >= LZ4F_HEADER_SIZE_MAX bytes.
+ * `prefsPtr` is optional : you can provide NULL as argument, all preferences will then be set to default.
+ * @return : number of bytes written into dstBuffer for the header
+ *           or an error code (which can be tested using LZ4F_isError())
+ */
+LZ4FLIB_API size_t LZ4F_compressBegin(LZ4F_cctx* cctx,
+                                      void* dstBuffer, size_t dstCapacity,
+                                      const LZ4F_preferences_t* prefsPtr);
+
+/*! LZ4F_compressBound() :
+ *  Provides minimum dstCapacity required to guarantee success of
+ *  LZ4F_compressUpdate(), given a srcSize and preferences, for a worst case scenario.
+ *  When srcSize==0, LZ4F_compressBound() provides an upper bound for LZ4F_flush() and LZ4F_compressEnd() instead.
+ *  Note that the result is only valid for a single invocation of LZ4F_compressUpdate().
+ *  When invoking LZ4F_compressUpdate() multiple times,
+ *  if the output buffer is gradually filled up instead of emptied and re-used from its start,
+ *  one must check if there is enough remaining capacity before each invocation, using LZ4F_compressBound().
+ * @return is always the same for a srcSize and prefsPtr.
+ *  prefsPtr is optional : when NULL is provided, preferences will be set to cover worst case scenario.
+ *  tech details :
+ * @return if automatic flushing is not enabled, includes the possibility that internal buffer might already be filled by up to (blockSize-1) bytes.
+ *  It also includes frame footer (ending + checksum), since it might be generated by LZ4F_compressEnd().
+ * @return doesn't include frame header, as it was already generated by LZ4F_compressBegin().
+ */
+LZ4FLIB_API size_t LZ4F_compressBound(size_t srcSize, const LZ4F_preferences_t* prefsPtr);
+
+/*! LZ4F_compressUpdate() :
+ *  LZ4F_compressUpdate() can be called repeatedly to compress as much data as necessary.
+ *  Important rule: dstCapacity MUST be large enough to ensure operation success even in worst case situations.
+ *  This value is provided by LZ4F_compressBound().
+ *  If this condition is not respected, LZ4F_compressUpdate() will fail (result is an errorCode).
+ *  LZ4F_compressUpdate() doesn't guarantee error recovery.
+ *  When an error occurs, compression context must be freed or resized.
+ * `cOptPtr` is optional : NULL can be provided, in which case all options are set to default.
+ * @return : number of bytes written into `dstBuffer` (it can be zero, meaning input data was just buffered).
+ *           or an error code if it fails (which can be tested using LZ4F_isError())
+ */
+LZ4FLIB_API size_t LZ4F_compressUpdate(LZ4F_cctx* cctx,
+                                       void* dstBuffer, size_t dstCapacity,
+                                       const void* srcBuffer, size_t srcSize,
+                                       const LZ4F_compressOptions_t* cOptPtr);
+
+/*! LZ4F_flush() :
+ *  When data must be generated and sent immediately, without waiting for a block to be completely filled,
+ *  it's possible to call LZ4F_flush(). It will immediately compress any data buffered within cctx.
+ * `dstCapacity` must be large enough to ensure the operation will be successful.
+ * `cOptPtr` is optional : it's possible to provide NULL, all options will be set to default.
+ * @return : nb of bytes written into dstBuffer (can be zero, when there is no data stored within cctx)
+ *           or an error code if it fails (which can be tested using LZ4F_isError())
+ *  Note : LZ4F_flush() is guaranteed to be successful when dstCapacity >= LZ4F_compressBound(0, prefsPtr).
+ */
+LZ4FLIB_API size_t LZ4F_flush(LZ4F_cctx* cctx,
+                              void* dstBuffer, size_t dstCapacity,
+                              const LZ4F_compressOptions_t* cOptPtr);
+
+/*! LZ4F_compressEnd() :
+ *  To properly finish an LZ4 frame, invoke LZ4F_compressEnd().
+ *  It will flush whatever data remained within `cctx` (like LZ4F_flush())
+ *  and properly finalize the frame, with an endMark and a checksum.
+ * `cOptPtr` is optional : NULL can be provided, in which case all options will be set to default.
+ * @return : nb of bytes written into dstBuffer, necessarily >= 4 (endMark),
+ *           or an error code if it fails (which can be tested using LZ4F_isError())
+ *  Note : LZ4F_compressEnd() is guaranteed to be successful when dstCapacity >= LZ4F_compressBound(0, prefsPtr).
+ *  A successful call to LZ4F_compressEnd() makes `cctx` available again for another compression task.
+ */
+LZ4FLIB_API size_t LZ4F_compressEnd(LZ4F_cctx* cctx,
+                                    void* dstBuffer, size_t dstCapacity,
+                                    const LZ4F_compressOptions_t* cOptPtr);
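+
+/* Streaming compression sketch chaining the calls above (illustrative only,
+ * not part of the original header; `out`, `outCap`, `chunk` and `chunkSize`
+ * are hypothetical, error checks are elided, and outCap is assumed to be
+ * >= LZ4F_compressBound(chunkSize, NULL) for every call) :
+ *     LZ4F_cctx* cctx;
+ *     LZ4F_createCompressionContext(&cctx, LZ4F_VERSION);
+ *     size_t n = LZ4F_compressBegin(cctx, out, outCap, NULL);    -- writes header
+ *     n = LZ4F_compressUpdate(cctx, out, outCap, chunk, chunkSize, NULL);
+ *     n = LZ4F_compressEnd(cctx, out, outCap, NULL);             -- endMark + checksum
+ *     LZ4F_freeCompressionContext(cctx);
+ */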
+
+
+/*-*********************************
+*  Decompression functions
+***********************************/
+typedef struct LZ4F_dctx_s LZ4F_dctx;   /* incomplete type */
+typedef LZ4F_dctx* LZ4F_decompressionContext_t;   /* compatibility with previous API versions */
+
+typedef struct {
+  unsigned stableDst;    /* pledges that last 64KB decompressed data will remain available unmodified. This optimization skips storage operations in tmp buffers. */
+  unsigned reserved[3];  /* must be set to zero for forward compatibility */
+} LZ4F_decompressOptions_t;
+
+
+/* Resource management */
+
+/*! LZ4F_createDecompressionContext() :
+ *  Create an LZ4F_dctx object, to track all decompression operations.
+ *  The version provided MUST be LZ4F_VERSION.
+ *  The function provides a pointer to an allocated and initialized LZ4F_dctx object.
+ *  The result is an errorCode, which can be tested using LZ4F_isError().
+ *  dctx memory can be released using LZ4F_freeDecompressionContext();
+ *  Result of LZ4F_freeDecompressionContext() indicates current state of decompressionContext when being released.
+ *  That is, it should be == 0 if decompression has been completed fully and correctly.
+ */
+LZ4FLIB_API LZ4F_errorCode_t LZ4F_createDecompressionContext(LZ4F_dctx** dctxPtr, unsigned version);
+LZ4FLIB_API LZ4F_errorCode_t LZ4F_freeDecompressionContext(LZ4F_dctx* dctx);
+
+
+/*-***********************************
+*  Streaming decompression functions
+*************************************/
+
+#define LZ4F_MIN_SIZE_TO_KNOW_HEADER_LENGTH 5
+
+/*! LZ4F_headerSize() : v1.9.0+
+ *  Provide the header size of a frame starting at `src`.
+ * `srcSize` must be >= LZ4F_MIN_SIZE_TO_KNOW_HEADER_LENGTH,
+ *  which is enough to decode the header length.
+ * @return : size of frame header
+ *           or an error code, which can be tested using LZ4F_isError()
+ *  note : Frame header size is variable, but is guaranteed to be
+ *         >= LZ4F_HEADER_SIZE_MIN bytes, and <= LZ4F_HEADER_SIZE_MAX bytes.
+ */
+LZ4FLIB_API size_t LZ4F_headerSize(const void* src, size_t srcSize);
+
+/*! LZ4F_getFrameInfo() :
+ *  This function extracts frame parameters (max blockSize, dictID, etc.).
+ *  Its usage is optional: user can call LZ4F_decompress() directly.
+ *
+ *  Extracted information will fill an existing LZ4F_frameInfo_t structure.
+ *  This can be useful for allocation and dictionary identification purposes.
+ *
+ *  LZ4F_getFrameInfo() can work in the following situations :
+ *
+ *  1) At the beginning of a new frame, before any invocation of LZ4F_decompress().
+ *     It will decode the header from `srcBuffer`,
+ *     consuming the header and starting the decoding process.
+ *
+ *     Input size must be large enough to contain the full frame header.
+ *     Frame header size can be known beforehand by LZ4F_headerSize().
+ *     Frame header size is variable, but is guaranteed to be >= LZ4F_HEADER_SIZE_MIN bytes
+ *     and <= LZ4F_HEADER_SIZE_MAX bytes.
+ *     Hence, blindly providing LZ4F_HEADER_SIZE_MAX bytes or more will always work.
+ *     It's allowed to provide more input data than the header size;
+ *     LZ4F_getFrameInfo() will only consume the header.
+ *
+ *     If input size is not large enough,
+ *     i.e. if it's smaller than the header size,
+ *     the function will fail and return an error code.
+ *
+ *  2) After decoding has been started,
+ *     it's possible to invoke LZ4F_getFrameInfo() anytime
+ *     to extract already decoded frame parameters stored within dctx.
+ *
+ *     Note that, if decoding has barely started,
+ *     and not yet read enough information to decode the header,
+ *     LZ4F_getFrameInfo() will fail.
+ *
+ *  The number of bytes consumed from srcBuffer will be updated in *srcSizePtr (necessarily <= original value).
+ *  LZ4F_getFrameInfo() only consumes bytes when decoding has not yet started,
+ *  and when decoding the header has been successful.
+ *  Decompression must then resume from (srcBuffer + *srcSizePtr).
+ *
+ * @return : a hint about how many srcSize bytes LZ4F_decompress() expects for next call,
+ *           or an error code which can be tested using LZ4F_isError().
+ *  note 1 : in case of error, dctx is not modified. Decoding operation can resume from beginning safely.
+ *  note 2 : frame parameters are *copied into* an already allocated LZ4F_frameInfo_t structure.
+ */
+LZ4FLIB_API size_t LZ4F_getFrameInfo(LZ4F_dctx* dctx,
+                                     LZ4F_frameInfo_t* frameInfoPtr,
+                                     const void* srcBuffer, size_t* srcSizePtr);
+
+/*! LZ4F_decompress() :
+ *  Call this function repeatedly to regenerate data compressed in `srcBuffer`.
+ *
+ *  The function requires a valid dctx state.
+ *  It will read up to *srcSizePtr bytes from srcBuffer,
+ *  and decompress data into dstBuffer, of capacity *dstSizePtr.
+ *
+ *  The nb of bytes consumed from srcBuffer will be written into *srcSizePtr (necessarily <= original value).
+ *  The nb of bytes decompressed into dstBuffer will be written into *dstSizePtr (necessarily <= original value).
+ *
+ *  The function does not necessarily read all input bytes, so always check value in *srcSizePtr.
+ *  Unconsumed source data must be presented again in subsequent invocations.
+ *
+ * `dstBuffer` can freely change between each consecutive function invocation.
+ * `dstBuffer` content will be overwritten.
+ *
+ * @return : a hint of how many `srcSize` bytes LZ4F_decompress() expects for next call.
+ *  Schematically, it's the size of the current (or remaining) compressed block + header of next block.
+ *  Respecting the hint provides some small speed benefit, because it skips intermediate buffers.
+ *  This is just a hint though, it's always possible to provide any srcSize.
+ *
+ *  When a frame is fully decoded, @return will be 0 (no more data expected).
+ *  When provided with more bytes than necessary to decode a frame,
+ *  LZ4F_decompress() will stop reading exactly at end of current frame, and @return 0.
+ *
+ *  If decompression failed, @return is an error code, which can be tested using LZ4F_isError().
+ *  After a decompression error, the `dctx` context is not resumable.
+ *  Use LZ4F_resetDecompressionContext() to return to clean state.
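+ *
+ *  Minimal consumption loop (an illustrative sketch, not part of the original
+ *  header; `src`, `srcLeft`, `dst`, `dstCap` and `use()` are hypothetical) :
+ *      size_t hint = 1;
+ *      while (hint != 0 && srcLeft > 0) {
+ *          size_t srcSize = srcLeft;
+ *          size_t dstSize = dstCap;
+ *          hint = LZ4F_decompress(dctx, dst, &dstSize, src, &srcSize, NULL);
+ *          if (LZ4F_isError(hint)) break;
+ *          src += srcSize; srcLeft -= srcSize;
+ *          use(dst, dstSize);    -- consume the dstSize bytes just produced
+ *      }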
+ * + * After a frame is fully decoded, dctx can be used again to decompress another frame. + */ +LZ4FLIB_API size_t LZ4F_decompress(LZ4F_dctx* dctx, + void* dstBuffer, size_t* dstSizePtr, + const void* srcBuffer, size_t* srcSizePtr, + const LZ4F_decompressOptions_t* dOptPtr); + + +/*! LZ4F_resetDecompressionContext() : added in v1.8.0 + * In case of an error, the context is left in "undefined" state. + * In which case, it's necessary to reset it, before re-using it. + * This method can also be used to abruptly stop any unfinished decompression, + * and start a new one using same context resources. */ +LZ4FLIB_API void LZ4F_resetDecompressionContext(LZ4F_dctx* dctx); /* always successful */ + + + +#if defined (__cplusplus) +} +#endif + +#endif /* LZ4F_H_09782039843 */ + +#if defined(LZ4F_STATIC_LINKING_ONLY) && !defined(LZ4F_H_STATIC_09782039843) +#define LZ4F_H_STATIC_09782039843 + +#if defined (__cplusplus) +extern "C" { +#endif + +/* These declarations are not stable and may change in the future. + * They are therefore only safe to depend on + * when the caller is statically linked against the library. + * To access their declarations, define LZ4F_STATIC_LINKING_ONLY. + * + * By default, these symbols aren't published into shared/dynamic libraries. + * You can override this behavior and force them to be published + * by defining LZ4F_PUBLISH_STATIC_FUNCTIONS. + * Use at your own risk. + */ +#ifdef LZ4F_PUBLISH_STATIC_FUNCTIONS +# define LZ4FLIB_STATIC_API LZ4FLIB_API +#else +# define LZ4FLIB_STATIC_API +#endif + + +/* --- Error List --- */ +#define LZ4F_LIST_ERRORS(ITEM) \ + ITEM(OK_NoError) \ + ITEM(ERROR_GENERIC) \ + ITEM(ERROR_maxBlockSize_invalid) \ + ITEM(ERROR_blockMode_invalid) \ + ITEM(ERROR_contentChecksumFlag_invalid) \ + ITEM(ERROR_compressionLevel_invalid) \ + ITEM(ERROR_headerVersion_wrong) \ + ITEM(ERROR_blockChecksum_invalid) \ + ITEM(ERROR_reservedFlag_set) \ + ITEM(ERROR_allocation_failed) \ + ITEM(ERROR_srcSize_tooLarge) \ + ITEM(ERROR_dstMaxSize_tooSmall) \ + ITEM(ERROR_frameHeader_incomplete) \ + ITEM(ERROR_frameType_unknown) \ + ITEM(ERROR_frameSize_wrong) \ + ITEM(ERROR_srcPtr_wrong) \ + ITEM(ERROR_decompressionFailed) \ + ITEM(ERROR_headerChecksum_invalid) \ + ITEM(ERROR_contentChecksum_invalid) \ + ITEM(ERROR_frameDecoding_alreadyStarted) \ + ITEM(ERROR_maxCode) + +#define LZ4F_GENERATE_ENUM(ENUM) LZ4F_##ENUM, + +/* enum list is exposed, to handle specific errors */ +typedef enum { LZ4F_LIST_ERRORS(LZ4F_GENERATE_ENUM) + _LZ4F_dummy_error_enum_for_c89_never_used } LZ4F_errorCodes; + +LZ4FLIB_STATIC_API LZ4F_errorCodes LZ4F_getErrorCode(size_t functionResult); + +LZ4FLIB_STATIC_API size_t LZ4F_getBlockSize(unsigned); + +/********************************** + * Bulk processing dictionary API + *********************************/ + +/* A Dictionary is useful for the compression of small messages (KB range). + * It dramatically improves compression efficiency. + * + * LZ4 can ingest any input as dictionary, though only the last 64 KB are useful. + * Best results are generally achieved by using Zstandard's Dictionary Builder + * to generate a high-quality dictionary from a set of samples. + * + * Loading a dictionary has a cost, since it involves construction of tables. + * The Bulk processing dictionary API makes it possible to share this cost + * over an arbitrary number of compression jobs, even concurrently, + * markedly improving compression latency for these cases. 
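+ *
+ * Bulk-usage sketch (an illustrative sketch only, not part of the original
+ * header; `dictBuf`, `dictSize`, `cctx`, `dst`, `dstCap`, `src`, `srcSize`
+ * are hypothetical and error checks are elided) :
+ *     LZ4F_CDict* const cdict = LZ4F_createCDict(dictBuf, dictSize);
+ *     size_t const n = LZ4F_compressFrame_usingCDict(cctx, dst, dstCap,
+ *                                                    src, srcSize, cdict, NULL);
+ *     LZ4F_freeCDict(cdict);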
+ *
+ * The same dictionary will have to be used on the decompression side
+ * for decoding to be successful.
+ * To help identify the correct dictionary at decoding stage,
+ * the frame header allows optional embedding of a dictID field.
+ */
+typedef struct LZ4F_CDict_s LZ4F_CDict;
+
+/*! LZ4F_createCDict() :
+ *  When compressing multiple messages / blocks using the same dictionary, it's recommended to load it just once.
+ *  LZ4F_createCDict() will create a digested dictionary, ready to start future compression operations without startup delay.
+ *  An LZ4F_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only.
+ * `dictBuffer` can be released after LZ4F_CDict creation, since its content is copied within CDict */
+LZ4FLIB_STATIC_API LZ4F_CDict* LZ4F_createCDict(const void* dictBuffer, size_t dictSize);
+LZ4FLIB_STATIC_API void        LZ4F_freeCDict(LZ4F_CDict* CDict);
+
+
+/*! LZ4F_compressFrame_usingCDict() :
+ *  Compress an entire srcBuffer into a valid LZ4 frame using a digested Dictionary.
+ *  cctx must point to a context created by LZ4F_createCompressionContext().
+ *  If cdict==NULL, compress without a dictionary.
+ *  dstCapacity MUST be >= LZ4F_compressFrameBound(srcSize, preferencesPtr).
+ *  If this condition is not respected, function will fail (@return an errorCode).
+ *  The LZ4F_preferences_t structure is optional : you may provide NULL as argument,
+ *  but it's not recommended, as the preferences structure is the only way to provide a dictID for the frame header.
+ * @return : number of bytes written into dstBuffer.
+ *           or an error code if it fails (can be tested using LZ4F_isError()) */
+LZ4FLIB_STATIC_API size_t LZ4F_compressFrame_usingCDict(
+    LZ4F_cctx* cctx,
+    void* dst, size_t dstCapacity,
+    const void* src, size_t srcSize,
+    const LZ4F_CDict* cdict,
+    const LZ4F_preferences_t* preferencesPtr);
+
+
+/*! LZ4F_compressBegin_usingCDict() :
+ *  Inits streaming dictionary compression, and writes the frame header into dstBuffer.
+ *  dstCapacity must be >= LZ4F_HEADER_SIZE_MAX bytes.
+ * `prefsPtr` is optional : you may provide NULL as argument;
+ *  however, the preferences structure is the only way to provide a dictID in the frame header.
+ * @return : number of bytes written into dstBuffer for the header,
+ *           or an error code (which can be tested using LZ4F_isError()) */
+LZ4FLIB_STATIC_API size_t LZ4F_compressBegin_usingCDict(
+    LZ4F_cctx* cctx,
+    void* dstBuffer, size_t dstCapacity,
+    const LZ4F_CDict* cdict,
+    const LZ4F_preferences_t* prefsPtr);
+
+
+/*! LZ4F_decompress_usingDict() :
+ *  Same as LZ4F_decompress(), using a predefined dictionary.
+ *  Dictionary is used "in place", without any preprocessing.
+ *  It must remain accessible throughout the entire frame decoding. */
+LZ4FLIB_STATIC_API size_t LZ4F_decompress_usingDict(
+    LZ4F_dctx* dctxPtr,
+    void* dstBuffer, size_t* dstSizePtr,
+    const void* srcBuffer, size_t* srcSizePtr,
+    const void* dict, size_t dictSize,
+    const LZ4F_decompressOptions_t* decompressOptionsPtr);
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* defined(LZ4F_STATIC_LINKING_ONLY) && !defined(LZ4F_H_STATIC_09782039843) */
diff --git a/cviruntime/include/lz4/xxhash.h b/cviruntime/include/lz4/xxhash.h
new file mode 100644
index 000000000..d6bad9433
--- /dev/null
+++ b/cviruntime/include/lz4/xxhash.h
@@ -0,0 +1,328 @@
+/*
+   xxHash - Extremely Fast Hash algorithm
+   Header File
+   Copyright (C) 2012-2016, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+   * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - xxHash source repository : https://github.com/Cyan4973/xxHash
+*/
+
+/* Notice extracted from xxHash homepage :
+
+xxHash is an extremely fast Hash algorithm, running at RAM speed limits.
+It also successfully passes all tests from the SMHasher suite.
+
+Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
+
+Name            Speed        Q.Score   Author
+xxHash          5.4 GB/s      10
+CrapWow         3.2 GB/s       2       Andrew
+MurmurHash 3a   2.7 GB/s      10       Austin Appleby
+SpookyHash      2.0 GB/s      10       Bob Jenkins
+SBox            1.4 GB/s       9       Bret Mulvey
+Lookup3         1.2 GB/s       9       Bob Jenkins
+SuperFastHash   1.2 GB/s       1       Paul Hsieh
+CityHash64      1.05 GB/s     10       Pike & Alakuijala
+FNV             0.55 GB/s      5       Fowler, Noll, Vo
+CRC32           0.43 GB/s      9
+MD5-32          0.33 GB/s     10       Ronald L. Rivest
+SHA1-32         0.28 GB/s     10
+
+Q.Score is a measure of quality of the hash function.
+It depends on successfully passing SMHasher test set.
+10 is a perfect score.
+
+A 64-bit version, named XXH64, is available since r35.
+It offers much better speed, but for 64-bit applications only.
+Name     Speed on 64 bits    Speed on 32 bits
+XXH64       13.8 GB/s            1.9 GB/s
+XXH32        6.8 GB/s            6.0 GB/s
+*/
+
+#ifndef XXHASH_H_5627135585666179
+#define XXHASH_H_5627135585666179 1
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+/* ****************************
+*  Definitions
+******************************/
+#include <stddef.h>   /* size_t */
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+
+
+/* ****************************
+ *  API modifier
+ ******************************/
+/** XXH_INLINE_ALL (and XXH_PRIVATE_API)
+ *  This is useful to include xxhash functions in `static` mode
+ *  in order to inline them, and remove their symbol from the public list.
+ *  Inlining can offer dramatic performance improvement on small keys.
+ *  Methodology :
+ *     #define XXH_INLINE_ALL
+ *     #include "xxhash.h"
+ * `xxhash.c` is automatically included.
+ *  It's not useful to compile and link it as a separate module.
+ */
+#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
+#  ifndef XXH_STATIC_LINKING_ONLY
+#    define XXH_STATIC_LINKING_ONLY
+#  endif
+#  if defined(__GNUC__)
+#    define XXH_PUBLIC_API static __inline __attribute__((unused))
+#  elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#    define XXH_PUBLIC_API static inline
+#  elif defined(_MSC_VER)
+#    define XXH_PUBLIC_API static __inline
+#  else
+     /* this version may generate warnings for unused static functions */
+#    define XXH_PUBLIC_API static
+#  endif
+#else
+#  define XXH_PUBLIC_API   /* do nothing */
+#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */
+
+/*! XXH_NAMESPACE, aka Namespace Emulation :
+ *
+ * If you want to include _and expose_ xxHash functions from within your own library,
+ * but also want to avoid symbol collisions with other libraries which may also include xxHash,
+ *
+ * you can use XXH_NAMESPACE, to automatically prefix any public symbol from xxhash library
+ * with the value of XXH_NAMESPACE (therefore, avoid NULL and numeric values).
+ *
+ * Note that no change is required within the calling program as long as it includes `xxhash.h` :
+ * regular symbol name will be automatically translated by this header.
+ */
+#ifdef XXH_NAMESPACE
+#  define XXH_CAT(A,B) A##B
+#  define XXH_NAME2(A,B) XXH_CAT(A,B)
+#  define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber)
+#  define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32)
+#  define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState)
+#  define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState)
+#  define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset)
+#  define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update)
+#  define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest)
+#  define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState)
+#  define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash)
+#  define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical)
+#  define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64)
+#  define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState)
+#  define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState)
+#  define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset)
+#  define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update)
+#  define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest)
+#  define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState)
+#  define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash)
+#  define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical)
+#endif
+
+
+/* *************************************
+*  Version
+***************************************/
+#define XXH_VERSION_MAJOR    0
+#define XXH_VERSION_MINOR    6
+#define XXH_VERSION_RELEASE  5
+#define XXH_VERSION_NUMBER  (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE)
+XXH_PUBLIC_API unsigned XXH_versionNumber (void);
+
+
+/*-**********************************************************************
+*  32-bit hash
+************************************************************************/
+typedef unsigned int XXH32_hash_t;
+
+/*! XXH32() :
+    Calculate the 32-bit hash of the sequence of "length" bytes stored at memory address "input".
+    The memory between input & input+length must be valid (allocated and read-accessible).
+    "seed" can be used to alter the result predictably.
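+    For example (an illustrative one-shot sketch; `buffer` and `length` are
+    hypothetical) :
+        XXH32_hash_t const h = XXH32(buffer, length, 0);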
+    Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s */
+XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, unsigned int seed);
+
+/*======   Streaming   ======*/
+typedef struct XXH32_state_s XXH32_state_t;   /* incomplete type */
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void);
+XXH_PUBLIC_API XXH_errorcode  XXH32_freeState(XXH32_state_t* statePtr);
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state);
+
+XXH_PUBLIC_API XXH_errorcode XXH32_reset  (XXH32_state_t* statePtr, unsigned int seed);
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH32_hash_t  XXH32_digest (const XXH32_state_t* statePtr);
+
+/*
+ * Streaming functions generate the xxHash of an input provided in multiple segments.
+ * Note that, for small input, they are slower than single-call functions, due to state management.
+ * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized.
+ *
+ * XXH state must first be allocated, using XXH*_createState() .
+ *
+ * Start a new hash by initializing state with a seed, using XXH*_reset().
+ *
+ * Then, feed the hash state by calling XXH*_update() as many times as necessary.
+ * The function returns an error code, with 0 meaning OK, and any other value meaning there is an error.
+ *
+ * Finally, a hash value can be produced anytime, by using XXH*_digest().
+ * This function returns the nn-bit hash as an int or long long.
+ *
+ * It's still possible to continue inserting input into the hash state after a digest,
+ * and generate new hashes later on, by calling XXH*_digest() again.
+ *
+ * When done, free XXH state space if it was allocated dynamically.
+ */
+
+/*======   Canonical representation   ======*/
+
+typedef struct { unsigned char digest[4]; } XXH32_canonical_t;
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
+
+/* The default result type for XXH functions is a primitive unsigned 32-bit or 64-bit integer.
+ * The canonical representation uses human-readable write convention, aka big-endian (large digits first).
+ * These functions allow transformation of hash result into and from its canonical format.
+ * This way, hash values can be written into a file / memory, and remain comparable on different systems and programs.
+ */
+
+
+#ifndef XXH_NO_LONG_LONG
+/*-**********************************************************************
+*  64-bit hash
+************************************************************************/
+typedef unsigned long long XXH64_hash_t;
+
+/*! XXH64() :
+    Calculate the 64-bit hash of the sequence of length "len" stored at memory address "input".
+    "seed" can be used to alter the result predictably.
+    This function runs faster on 64-bit systems, but slower on 32-bit systems (see benchmark).
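+    Streaming sketch (illustrative only; chunk pointers and lengths are
+    hypothetical, and error codes are left unchecked) :
+        XXH64_state_t* const st = XXH64_createState();
+        XXH64_reset(st, 0);
+        XXH64_update(st, chunk1, len1);
+        XXH64_update(st, chunk2, len2);
+        XXH64_hash_t const h = XXH64_digest(st);
+        XXH64_freeState(st);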
+*/
+XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, unsigned long long seed);
+
+/*======   Streaming   ======*/
+typedef struct XXH64_state_s XXH64_state_t;   /* incomplete type */
+XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void);
+XXH_PUBLIC_API XXH_errorcode  XXH64_freeState(XXH64_state_t* statePtr);
+XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dst_state, const XXH64_state_t* src_state);
+
+XXH_PUBLIC_API XXH_errorcode XXH64_reset  (XXH64_state_t* statePtr, unsigned long long seed);
+XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length);
+XXH_PUBLIC_API XXH64_hash_t  XXH64_digest (const XXH64_state_t* statePtr);
+
+/*======   Canonical representation   ======*/
+typedef struct { unsigned char digest[8]; } XXH64_canonical_t;
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash);
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src);
+#endif  /* XXH_NO_LONG_LONG */
+
+
+
+#ifdef XXH_STATIC_LINKING_ONLY
+
+/* ================================================================================================
+   This section contains declarations which are not guaranteed to remain stable.
+   They may change in future versions, becoming incompatible with a different version of the library.
+   These declarations should only be used with static linking.
+   Never use them in association with dynamic linking !
+=================================================================================================== */
+
+/* These definitions are only present to allow
+ * static allocation of XXH state, on stack or in a struct for example.
+ * Never **ever** use members directly. */
+
+#if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   include <stdint.h>
+
+struct XXH32_state_s {
+   uint32_t total_len_32;
+   uint32_t large_len;
+   uint32_t v1;
+   uint32_t v2;
+   uint32_t v3;
+   uint32_t v4;
+   uint32_t mem32[4];
+   uint32_t memsize;
+   uint32_t reserved;   /* never read nor write, might be removed in a future version */
+};   /* typedef'd to XXH32_state_t */
+
+struct XXH64_state_s {
+   uint64_t total_len;
+   uint64_t v1;
+   uint64_t v2;
+   uint64_t v3;
+   uint64_t v4;
+   uint64_t mem64[4];
+   uint32_t memsize;
+   uint32_t reserved[2];   /* never read nor write, might be removed in a future version */
+};   /* typedef'd to XXH64_state_t */
+
+# else
+
+struct XXH32_state_s {
+   unsigned total_len_32;
+   unsigned large_len;
+   unsigned v1;
+   unsigned v2;
+   unsigned v3;
+   unsigned v4;
+   unsigned mem32[4];
+   unsigned memsize;
+   unsigned reserved;   /* never read nor write, might be removed in a future version */
+};   /* typedef'd to XXH32_state_t */
+
+#   ifndef XXH_NO_LONG_LONG  /* remove 64-bit support */
+struct XXH64_state_s {
+   unsigned long long total_len;
+   unsigned long long v1;
+   unsigned long long v2;
+   unsigned long long v3;
+   unsigned long long v4;
+   unsigned long long mem64[4];
+   unsigned memsize;
+   unsigned reserved[2];   /* never read nor write, might be removed in a future version */
+};   /* typedef'd to XXH64_state_t */
+#   endif
+
+# endif
+
+
+#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
+#  include "xxhash.c"   /* include xxhash function bodies as `static`, for inlining */
+#endif
+
+#endif /* XXH_STATIC_LINKING_ONLY */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* XXHASH_H_5627135585666179 */
diff --git a/cviruntime/include/runtime/cpu_function.hpp b/cviruntime/include/runtime/cpu_function.hpp
new file mode 100644
index 000000000..4822f0eec
--- /dev/null
+++ b/cviruntime/include/runtime/cpu_function.hpp
@@ -0,0 +1,74 @@
+/*
+* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
+*/
+
+#ifndef RUNTIME_CPU_FUNCTION_H
+#define RUNTIME_CPU_FUNCTION_H
+#include
+#include
+#include
+#include
+#include
+
+namespace cvi {
+namespace runtime {
+
+class ICpuFunction {
+public:
+  ICpuFunction() {}
+  virtual ~ICpuFunction() {}
+  virtual void setup(tensor_list_t &inputs,
+                     tensor_list_t &outputs,
+                     OpParam &param) = 0;
+  virtual void run() = 0;
+
+protected:
+  template <typename T>
+  void print_data(T data) {
+    if (sizeof(T) == 4) {
+      printf("%e ", (float)data);
+    } else if (sizeof(T) == 1) {
+      printf("%4d ", (int)data);
+    } else {
+      assert(0);
+      std::cout << data << " ";
+    }
+  }
+
+  template <typename T>
+  void dump(const std::string &tag, const T *data, size_t count) {
+    auto ptr = (T *)data;
+    int loop = count / 10;
+    std::cout << "-------Dump " << tag << ", size:" << count << "\n";
+
+    for (int i = 0; i < loop; i++) {
+      for (int j = 0; j < 10; j++) {
+        print_data(*(ptr++));
+      }
+      std::cout << "\n";
+    }
+    for (int j = 0; j < (int)(count % 10); j++) {
+      print_data(*(ptr++));
+    }
+    std::cout << "\n";
+  }
+};
+
+typedef ICpuFunction *(*ICpuFunctionCreate)();
+
+} // namespace runtime
+} // namespace cvi
+
+typedef struct {
+  char *name;
+  cvi::runtime::ICpuFunctionCreate func;
+} CustomOpRuntimeFunc;
+
+#define REGISTER_OP_RUNTIME_FUNCS(X, ...)                                      \
+  extern "C" {                                                                 \
+  CustomOpRuntimeFunc customOpRuntimeFuncs[] = {X, ##__VA_ARGS__};             \
+  int customOpRuntimeFuncsNum =                                                \
+      sizeof(customOpRuntimeFuncs) / sizeof(CustomOpRuntimeFunc);              \
+  }
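+
+// Registration sketch (illustrative only, not part of this header; MyOp and
+// myOpCreate are hypothetical) :
+//   class MyOp : public cvi::runtime::ICpuFunction {
+//   public:
+//     void setup(cvi::runtime::tensor_list_t &inputs,
+//                cvi::runtime::tensor_list_t &outputs,
+//                cvi::OpParam &param) override { /* bind tensors and params */ }
+//     void run() override { /* compute on the bound tensors */ }
+//   };
+//   static cvi::runtime::ICpuFunction *myOpCreate() { return new MyOp(); }
+//   REGISTER_OP_RUNTIME_FUNCS({(char *)"my_op", myOpCreate})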
+
+#endif
diff --git a/cviruntime/include/runtime/debug.h b/cviruntime/include/runtime/debug.h
new file mode 100644
index 000000000..65a28f65b
--- /dev/null
+++ b/cviruntime/include/runtime/debug.h
@@ -0,0 +1,23 @@
+#ifndef _BM_DEBUG_H_
+#define _BM_DEBUG_H_
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cvitpu_debug.h"
+
+
+// print the version of runtime
+void showRuntimeVersion();
+// dump sysfs debug file
+void dumpSysfsDebugFile(const char *path);
+
+void mem_protect(uint8_t *vaddr, size_t size);
+
+void mem_unprotect(uint8_t *vaddr, size_t size);
+
+#endif /* _BM_DEBUG_H_ */
diff --git a/cviruntime/include/runtime/kernel_function.hpp b/cviruntime/include/runtime/kernel_function.hpp
new file mode 100644
index 000000000..6fbd82867
--- /dev/null
+++ b/cviruntime/include/runtime/kernel_function.hpp
@@ -0,0 +1,34 @@
+#ifndef RUNTIME_TDMA_COPY_HPP
+#define RUNTIME_TDMA_COPY_HPP
+
+#include "cviruntime_context.h"
+#include "cviruntime.h"
+#include "cvikernel/cvikernel.h"
+
+namespace cvi {
+namespace runtime {
+
+CVI_RC runtimeExecuteKernelFunction(
+    CVI_RT_HANDLE ctx, CVI_RT_MEM codeBuf,
+    uint64_t gaddrSrc, uint64_t gaddrDst);
+
+CVI_RT_MEM runtimeJitTdmaStrideCopy(
+    CVI_RT_HANDLE ctx, void *cvk, CVI_FMT fmt,
+    cvk_tg_shape_t *shapeDst, cvk_tg_stride_t *strideDst,
+    cvk_tg_shape_t *shapeSrc, cvk_tg_stride_t *strideSrc);
+
+CVI_RT_MEM runtimeJitMatrixMul(
+    CVI_RT_HANDLE ctx, void* cvk_ctx, CVI_FMT fmt,
+    uint32_t m, uint32_t k, uint32_t n);
+
+CVI_RT_MEM runtimeJitEuclideanDistance(
+    CVI_RT_HANDLE ctx, void* cvk_ctx,
+    uint32_t records, uint32_t feature_size);
+
+CVI_RT_MEM runtimeJitGrayImageLight(
+    CVI_RT_HANDLE ctx, void* cvk_ctx,
+    int32_t ih, int32_t iw, int32_t kernel_sz);
+}
+
+}
+#endif
\ No newline at end of file
diff --git a/cviruntime/include/runtime/model.hpp b/cviruntime/include/runtime/model.hpp
new file mode 100644
index 000000000..f000f2368
--- /dev/null
+++ b/cviruntime/include/runtime/model.hpp
@@ -0,0 +1,87 @@
+#ifndef RUNTIME_MODEL_H
+#define RUNTIME_MODEL_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace cvi {
+namespace runtime {
+
+typedef struct {
+  char magic[8];
+  uint32_t body_size;
+  char major;
+  char minor;
+  char md5[16];
+  char chip[16];
+  char padding[2];
+} MODEL_HEADER;
+
+class CviModel {
+public:
+  CviModel(CVI_RT_HANDLE ctx, int count);
+
+  CVI_RC acquire(const int8_t *buf, size_t size);
+  CVI_RC acquire(const std::string &modelFile);
+  CVI_RC acquire(const int fd, const size_t ud_offset);
+  void refer() { ref++; }
+  void release();
+
+  CVI_RC loadProgram(Program **program,
+                     int program_id, bool export_all_tensors,
+                     bool skip_preprocess);
+
+  static std::string getChipType(const std::string &modelFile,
+                                 const int8_t *buf = nullptr, size_t size = 0);
+
+  int32_t program_num;
+  int32_t major_ver = 1;
+  int32_t minor_ver = 2;
+
+  // global info
+  static std::string targetChipType;
+
+private:
+  ~CviModel();
+
+  CVI_RC parse(BaseStream *stream);
+  CVI_RC loadWeight(BaseStream *stream, size_t offset, size_t size);
+  CVI_RC loadDmabuf(BaseStream *stream, size_t offset, size_t size, const cvi::model::Section *section);
+  CVI_RC loadCmdbuf(BaseStream *stream, size_t offset, size_t size, const cvi::model::Section *section);
+  CVI_RC extractSections(BaseStream *stream, size_t bin_offset);
+  CVI_RC parseModelHeader(BaseStream *stream, size_t &payload_sz,
+                          size_t &header_sz);
+  bool checkIfMatchTargetChipType(std::string &target);
+  CVI_RC showAndCheckVersion();
+  void parseProgramNum();
+  void createCpuWeightMap();
+
+  CVI_RT_HANDLE _ctx;
+  std::atomic<int> ref;
+  TaskPool *_pool = nullptr;
+  cvi::model::Model *_fb_model;
+  uint8_t *_model_body = nullptr;
+  CVI_RT_MEM _weight_mem = nullptr;
+  CustomFunctionSection _custom_section;
+  std::vector<CpuRuntimeFunction *> _cpu_functions;
+  tensor_map_t weight_map;
+  dmabuf_map_t dmabuf_map;
+  bool encrypt_model;
+  bool isprotect = false; // protect cmdbuf_mem and weight_mem
+  int _count;
+  std::string _model_name;
+  size_t _max_shared_mem_size;
+};
+
+} // namespace runtime
+} // namespace cvi
+
+#endif
diff --git a/cviruntime/include/runtime/neuron.hpp b/cviruntime/include/runtime/neuron.hpp
new file mode 100644
index 000000000..c9fc5cc17
--- /dev/null
+++ b/cviruntime/include/runtime/neuron.hpp
@@ -0,0 +1,148 @@
+/*
+* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
+*/
+
+#ifndef RUNTIME_NEURON_H
+#define RUNTIME_NEURON_H
+
+#include
+#include
+#include
+#include "cviruntime.h"
+#include "cviruntime_context.h"
+
+namespace cvi {
+namespace runtime {
+
+class Neuron {
+public:
+  enum NeuronState {
+    TPU_MEM = 0,
+    CPU_MEM = 1,
+  };
+
+  enum NeuronType {
+    WEIGHT = 0,
+    ACTIVATION = 1,
+  };
+
+  Neuron(CVI_RT_HANDLE ctx, const void *model_tensor,
+         CVI_RT_MEM weight_mem, const char *model_name);
+  Neuron(CVI_RT_HANDLE ctx, CVI_RT_HANDLE cvk,
+         const void *model_tensor,
+         uint64_t *baseAddrArray,
+         CVI_RT_MEM *baseMemArray,
+         const char *model_name);
+  ~Neuron();
+
+  template <typename T>
+  inline T* cpu_data() {
+    _state = Neuron::CPU_MEM;
+    return (T *)sys_mem();
+  }
+
+  inline size_t count() {
+    return _count;
+  }
+
+  inline size_t size() {
+    return _size;
+  }
+
+  inline size_t offset(int n, int c = 0, int h = 0, int w = 0) {
+    return (((n * shape[1] + c) * shape[2] + h) * shape[3] + w);
+  }
+
+  inline bool overwrote() {
+    return _overwrote;
+  }
+
+  inline void setState(NeuronState state) {
+    _state = state;
+  }
+
+  inline uint8_t *sys_mem() {
+    return (_vaddr ? _vaddr : _cpu_mem);
+  }
+
+  inline uint64_t paddr() {
+    return _paddr;
+  }
+
+  inline float qscale() {
+    return _qscale;
+  }
+
+  inline void setQScale(float scale) {
+    _qscale = scale;
+  }
+
+  inline int zero_point() {
+    return _zero_point;
+  }
+
+  CVI_RC preloadChannelAndCompact(int32_t channel_idx, uint64_t src_paddr);
+  CVI_RC preloadFrameAndCompact(int32_t frame_idx, uint64_t src_paddr);
+  CVI_RC preload(int32_t frame_idx, uint64_t src_paddr);
+
+  void load(CVI_TENSOR &tensor);
+  void store(CVI_TENSOR &tensor);
+  void toCpu();
+  void toTpu();
+  CVI_RC reserveIonMem(int64_t offset);
+  CVI_RC reserveSysMem();
+  void updateBaseAddr(uint64_t paddr);
+  bool isPacked();
+
+private:
+  void updateBaseAddr(CVI_RT_MEM mem);
+  inline void setZeroPoint(int zp) { _zero_point = zp; }
+  void setPixelFormatAndSize(const std::string &pixel_format, int32_t dsize);
+  void setPixelAlign(CVI_NN_PIXEL_FORMAT_E format);
+  uint32_t yuv_size(int n, int c, int h, int w, CVI_NN_PIXEL_FORMAT_E format);
+
+public:
+  std::string name;
+  std::vector<int32_t> shape;
+  CVI_FMT fmt;
+  NeuronType type;
+  CVI_NN_PIXEL_FORMAT_E pixel_format;
+  std::vector<float> scale;
+  std::vector<float> mean;
+  bool aligned = false;
+  int vpss_w_align, vpss_y_align, vpss_channel_align;
+
+private:
+  CVI_RT_HANDLE _ctx;
+  CVI_RT_KHANDLE _cvk;
+  CVI_RT_MEM _streamCopyCmdbuf = nullptr;
+  CVI_RT_MEM _channelPreloadCmdbuf = nullptr;
+  CVI_RT_MEM _framePreloadCmdbuf = nullptr;
+  CVI_RT_MEM _base_mem = nullptr;
+  CVI_RT_MEM _gmem = nullptr;
+  uint8_t* _cpu_mem = nullptr;
+  uint8_t* _vaddr = nullptr;
+  uint64_t _paddr = 0;
+  NeuronState _state;
+  uint32_t _id;
+  uint32_t _count;
+  uint32_t _size;
+  uint32_t _tensor_size = 0;
+  bool _overwrote = false;
+  float _qscale = 1.0f;
+  int _zero_point = 0;
+  uint64_t *_baseAddrArray;
+  CVI_RT_MEM *_baseMemArray;
+  int32_t _baseAddrIndex = 1;
+  std::string _model_name;
+  std::string _module_name;
+};
+
+typedef std::map<std::string, std::shared_ptr<Neuron>> tensor_map_t;
+typedef std::vector<std::shared_ptr<Neuron>> tensor_list_t;
+
+
+} // namespace runtime
+} // namespace cvi
+
+#endif
diff --git a/cviruntime/include/runtime/op_param.hpp b/cviruntime/include/runtime/op_param.hpp
new file mode 100644
index 000000000..0288cfedc
--- /dev/null
+++ b/cviruntime/include/runtime/op_param.hpp
@@ -0,0 +1,53 @@
+/*
+* Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
+*/
+
+#ifndef CVI_RUNTIME_OP_PARAMETER_H
+#define CVI_RUNTIME_OP_PARAMETER_H
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace cvi {
+
+class FieldBase {
+public:
+  FieldBase() = default;
+  virtual ~FieldBase() = default;
+};
+
+template <typename T>
+class Field: public FieldBase {
+public:
+  Field(T& val): data(val) {}
+  T data;
+};
+
+class OpParam {
+public:
+  template <typename T>
+  void put(std::string name, T value) {
+    fields[name] = std::make_shared<Field<T>>(value);
+  }
+
+  template <typename T>
+  T& get(std::string name) {
+    auto f = dynamic_cast<Field<T> *>(fields[name].get());
+    assert(f);
+    return f->data;
+  }
+
+  bool has(std::string name) {
+    auto it = fields.find(name);
+    return it != fields.end();
+  }
+
+private:
+  std::map<std::string, std::shared_ptr<FieldBase>> fields;
+};
+
+}
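+
+// Usage sketch (illustrative only, not part of this header; the names and the
+// "threshold" key are hypothetical) :
+//   cvi::OpParam param;
+//   param.put<float>("threshold", 0.5f);         // stores a Field<float>
+//   float &t = param.get<float>("threshold");    // asserts on a type mismatch
+//   bool present = param.has("threshold");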
+#endif
\ No newline at end of file
diff --git a/cviruntime/include/runtime/program.hpp b/cviruntime/include/runtime/program.hpp
new file mode 100644
index 000000000..e156d44d0
--- /dev/null
+++ b/cviruntime/include/runtime/program.hpp
@@ -0,0 +1,145 @@
+#ifndef RUNTIME_PROGRAM_H
+#define RUNTIME_PROGRAM_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cviruntime.h"
+#include
+#include
+#include
+
+namespace cvi {
+namespace runtime {
+
+typedef std::unordered_map<std::string, CVI_RT_MEM> dmabuf_map_t;
+
+class Routine;
+class Program {
+public:
+  Program(CVI_RT_HANDLE ctx, TaskPool *pool,
+          dmabuf_map_t &dmabuf_map,
+          std::vector<CpuRuntimeFunction *> &functions,
+          tensor_map_t &weight_map,
+          CVI_RT_MEM weight_mem,
+          const char *model_name,
+          size_t max_shared_mem_size);
+  ~Program();
+
+  void setOptions(bool export_all_tensors,
+                  bool skip_preprocess);
+  CVI_RC load(const cvi::model::Program *fb_program);
+
+  bool forward(CVI_TENSOR *inputs, int input_num,
+               CVI_TENSOR *outputs, int output_num);
+
+  void *forwardAsync(CVI_TENSOR *inputs, int input_num,
+                     CVI_TENSOR *outputs, int output_num);
+
+  CVI_RC forwardWait(void *task);
+
+  const tensor_list_t &input_tensors() { return in_tensors; }
+  const tensor_list_t &output_tensors() { return out_tensors; }
+
+  CVI_TENSOR *exportInputs(int32_t &size);
+  CVI_TENSOR *exportOutputs(int32_t &size);
+
+  tensor_list_t in_tensors;
+  tensor_list_t out_tensors;
+  tensor_map_t neuron_map;
+  tensor_map_t &weight_map;
+  dmabuf_map_t &dmabuf_map;
+  std::vector<CpuRuntimeFunction *> &cpu_functions;
+  /* 0: shared_mem,
+   * 1: weight_mem,
+   * 2: private_mem,
+   * 3~7: io_mem
+   */
+  uint64_t baseAddrArray[8];
+  CVI_RT_MEM baseMemArray[8];
+
+private:
+  CVI_RC createNeuronSpace(const cvi::model::Program *fb_program);
+  CVI_RC createNeuronMap(const cvi::model::Program *fb_program);
+  CVI_RC createRoutines(const cvi::model::Program *fb_program);
+  bool run();
+
+  CVI_RT_HANDLE _ctx;
+  CVI_RT_KHANDLE _cvk;
+  bool _export_all_tensors;
+  bool _skip_preprocess;
+  TaskPool *_pool = nullptr;
+  CVI_RT_MEM private_mem = nullptr;
+  CVI_RT_MEM shared_mem = nullptr;
+  std::list<std::shared_ptr<Routine>> _routines;
+  std::string _model_name;
+  size_t _max_shared_mem_size;
+};
+
+class Routine {
+public:
+  Routine(CVI_RT_HANDLE ctx, Program *program, bool tpu)
+      : tpu(tpu), _ctx(ctx), _program(program) {}
+  virtual ~Routine() {}
+  virtual bool initialize(const cvi::model::Routine *routine) = 0;
+  virtual CVI_RC run() = 0;
+  virtual void reset() = 0;
+  virtual CVI_RC prepare() { return CVI_RC_SUCCESS; }
+  tensor_list_t inputs;
+  tensor_list_t outputs;
+  bool tpu;
+
+protected:
+  CVI_RT_HANDLE _ctx;
+  Program *_program;
+};
+
+class TpuRoutine : public Routine {
+public:
+  TpuRoutine(CVI_RT_HANDLE ctx, Program *program)
+      : Routine(ctx, program, true) {}
+  ~TpuRoutine() {
+  }
+
+  bool initialize(const cvi::model::Routine *routine);
+  int init_dmabuf(Program *program, const std::string &name);
+  CVI_RC run();
+  void reset();
+
+private:
+  CVI_RT_MEM buf_mem = nullptr;
+  bool enable_pmu = false;
+  bool encrypted = false;
+};
+
+class CpuRoutine : public Routine {
+public:
+  CpuRoutine(CVI_RT_HANDLE ctx, Program *program)
+      : Routine(ctx, program, false) {}
+  ~CpuRoutine() { delete _func; }
+
+  bool initialize(const cvi::model::Routine *routine);
+  CVI_RC run();
+  CVI_RC prepare();
+  void reset();
+
+private:
+  void fetchQscaleFromDequant(OpParam &param);
+  void handleFuncArgs(const uint8_t *args, OpParam &param);
+  ICpuFunctionCreate _func_open = nullptr;
+  ICpuFunction *_func = nullptr;
+};
+
+} // namespace runtime
+} // namespace cvi
+
+#endif
diff --git a/cviruntime/include/runtime/section.hpp b/cviruntime/include/runtime/section.hpp
new file mode 100644
index 000000000..c1e7f60b6
--- /dev/null
+++ b/cviruntime/include/runtime/section.hpp
@@ -0,0 +1,48 @@
+#ifndef RUNTIME_SECTION_H
+#define RUNTIME_SECTION_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "alloc.h"
+
+namespace cvi {
+namespace runtime {
+
+class WeightSection {
+public:
+  WeightSection(size_t offset, size_t size) : offset(offset), size(size) {}
+
+  size_t offset;
+  size_t size;
+};
+
+class CpuRuntimeFunction {
+public:
+  CpuRuntimeFunction(const std::string &name, ICpuFunctionCreate func_open)
+      : name(name), func_open(func_open) {}
+  ~CpuRuntimeFunction() = default;
+
+  const std::string name;
+  ICpuFunctionCreate func_open;
+};
+
+class CustomFunctionSection {
+public:
+  CustomFunctionSection() = default;
+  ~CustomFunctionSection();
+  bool load(BaseStream *stream, size_t offset, size_t size,
+            std::vector<CpuRuntimeFunction *> &cpu_functions);
+
+private:
+  int shm_fd = 0;
+  void *dso_handle = nullptr;
+};
+
+} // namespace runtime
+} // namespace cvi
+
+#endif
diff --git a/cviruntime/include/runtime/shared_mem.hpp b/cviruntime/include/runtime/shared_mem.hpp
new file mode 100644
index 000000000..0c2fddd06
--- /dev/null
+++ b/cviruntime/include/runtime/shared_mem.hpp
@@ -0,0 +1,16 @@
+#ifndef RUNTIME_SHARED_MEM_H
+#define RUNTIME_SHARED_MEM_H
+
+#include
+
+namespace cvi {
+namespace runtime {
+
+void setSharedMemSize(size_t size);
+CVI_RT_MEM allocateSharedMemory(CVI_RT_HANDLE ctx, size_t size);
+void deallocateSharedMemory(CVI_RT_HANDLE ctx, CVI_RT_MEM mem);
+
+} // namespace runtime
+} // namespace cvi
+
+#endif
diff --git a/cviruntime/include/runtime/stream.hpp b/cviruntime/include/runtime/stream.hpp
new file mode 100644
index 000000000..36682b755
--- /dev/null
+++ b/cviruntime/include/runtime/stream.hpp
@@ -0,0 +1,57 @@
+#ifndef RUNTIME_CVISTREAM_H
+#define RUNTIME_CVISTREAM_H
+
+#include
+#include
+
+namespace cvi {
+namespace runtime {
+
+class BaseStream {
+public:
+  BaseStream() {}
+  virtual ~BaseStream() {}
+
+  size_t length() {
+    return _length;
+  }
+
+  virtual size_t read(uint8_t *buf, size_t offset, size_t size) = 0;
+
+protected:
+  size_t _length = 0;
+};
+
+class FileStream : public BaseStream {
+public:
+  FileStream(const std::string &file_name);
+  ~FileStream();
+  size_t read(uint8_t *buf, size_t offset, size_t size);
+
+private:
+  std::ifstream *_fstream;
+};
+
+class BufferStream : public BaseStream {
+public:
+  BufferStream(const int8_t *buf, size_t size);
+  ~BufferStream() {}
+  size_t read(uint8_t *buf, size_t offset, size_t size);
+
+private:
+  const int8_t *buffer;
+};
+
+class FdStream : public BaseStream {
+public:
+  FdStream(const int fd, const size_t ud_offset);
+  ~FdStream() {}
+  size_t read(uint8_t *buf, size_t offset, size_t size);
+private:
+  int file_descriptor;
+  size_t user_define_offset = 0; // offset of the user-defined file header
+};
+
+} // namespace runtime
+} // namespace cvi
+
+#endif
diff --git a/cviruntime/include/runtime/taskpool.hpp b/cviruntime/include/runtime/taskpool.hpp
new file mode 100644
index 000000000..83384c4de
--- /dev/null
+++ b/cviruntime/include/runtime/taskpool.hpp
@@ -0,0 +1,113 @@
+/*
+ * This file is licensed under the zlib/libpng license, included in this
+ * distribution in the file COPYING.
+ */
+#ifndef RUNTIME_TASKQUE_H
+#define RUNTIME_TASKQUE_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cviruntime.h"
+
+namespace cvi {
+namespace runtime {
+
+class TaskPool;
+
+class Task {
+public:
+  Task(TaskPool *pool, void *program, CVI_TENSOR *inputs, int input_num,
+       CVI_TENSOR *outputs, int output_num);
+
+  void *program;
+  int input_num;
+  int output_num;
+  CVI_TENSOR *inputs;
+  CVI_TENSOR *outputs;
+  CVI_RC retCode = CVI_RC_UNINIT;
+};
+
+class RingQueue {
+public:
+  RingQueue(int capacity) : _capacity(capacity) { _queue.resize(_capacity); }
+
+  ~RingQueue() {}
+
+  void put(Task *task) {
+    std::unique_lock<std::mutex> lock(_mutex);
+    // Block while at most one free slot remains; one slot is always kept
+    // free so that head and tail never collide.
+    while (_capacity - _length <= 1) {
+      _cond_idel.wait(lock);
+    }
+    _queue[_tail] = task;
+    move(_tail);
+    _length++;
+    _cond_busy.notify_one();
+  }
+
+  Task *get() {
+    std::unique_lock<std::mutex> lock(_mutex);
+    // Block until a task is available.
+    while (_length == 0) {
+      _cond_busy.wait(lock);
+    }
+    // The queue is about to leave the almost-full state; wake one producer.
+    if (_capacity - _length == 1) {
+      _cond_idel.notify_one();
+    }
+    auto task = _queue[_head];
+    move(_head);
+    _length--;
+    return task;
+  }
+
+  inline uint32_t move(uint32_t &index) {
+    ++index;
+    index %= _capacity;
+    return index;
+  }
+
+private:
+  uint32_t _capacity;
+  uint32_t _head = 0;
+  uint32_t _tail = 0;
+  uint32_t _length = 0;
+  std::vector<Task *> _queue;
+  std::mutex _mutex;
+  std::condition_variable _cond_busy;
+  std::condition_variable _cond_idel;
+};
+
+class TaskPool {
+public:
+  TaskPool(int pool_size)
+      : _pool_size(pool_size), _queue(pool_size * 4),
+        _started(false), _done(false) {}
+  ~TaskPool();
+
+  void startPool();
+  void addTask(Task *task) { _queue.put(task); }
+  void waitTask(Task *task);
+  void workFunc();
+
+private:
+  void addTerminateTask() { _queue.put(nullptr); }
+  static void run(TaskPool *pool) { pool->workFunc(); }
+
+  int _pool_size;
+  RingQueue _queue;
+  std::atomic<bool> _started;
+  std::atomic<bool> _done;
+  std::mutex _mutex;
+  std::vector<std::thread> _threads;
+  std::condition_variable _cond_feedback;
+};
+
+}
+}
+
+#endif // RUNTIME_TASKQUE_H
\ No newline at end of file
diff --git a/cviruntime/include/runtime/version.h b/cviruntime/include/runtime/version.h
new file mode 100644
index 000000000..525227021
--- /dev/null
+++ b/cviruntime/include/runtime/version.h
@@ -0,0 +1,8 @@
+#ifndef CVIRUNTIME_VERSION_H
+#define CVIRUNTIME_VERSION_H
+
+#define CVIRUNTIME_MAJOR_VER 1
+#define CVIRUNTIME_MINOR_VER 1
+#define CVIRUNTIME_SUBMINOR_VER 1
+
+#endif
diff --git a/cviruntime/python/CMakeLists.txt b/cviruntime/python/CMakeLists.txt
new file mode 100644
index 000000000..fe89d2d3b
--- /dev/null
+++ b/cviruntime/python/CMakeLists.txt
@@ -0,0 +1,15 @@
+cmake_minimum_required(VERSION 3.1.0)
+project(pyruntime)
+
+if(CMAKE_CROSSCOMPILING)
+  include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
+  include_directories(${CMAKE_SYSROOT}/include)
+  link_directories(${CNPY_PATH})
+endif()
+
+add_subdirectory(include/pybind11)
+pybind11_add_module(pyruntime pyruntime.cpp)
+
+set(CVI_LIBS ${CVI_LIBS} cviruntime cvikernel)
+target_link_libraries(pyruntime PRIVATE ${CVI_LIBS})
+install(TARGETS pyruntime DESTINATION python)
diff --git a/cviruntime/python/README b/cviruntime/python/README
new file mode 100644
index 000000000..f93410009
--- /dev/null
+++ b/cviruntime/python/README
@@ -0,0 +1,21 @@
+1. Download pyruntime.tgz from the NAS server ("ai/prebuilt/pyruntime.tgz").
+   This package provides a complete Python 3.6 environment and also contains
+   the torch, torchvision, and numpy modules as well as CVITEK's runtime
+   Python wrapper.
+
+2. Decompress the package on the SoC platform:
+   $ cd /mnt/data
+   $ tar -zxvf pyruntime.tgz ./
+
+3. Set up the environment variables:
+   $ export PATH=/mnt/data/python/bin/:/mnt/data/cvitek_tpu_sdk/bin:${PATH}
+   $ export LD_LIBRARY_PATH=/mnt/data/python/lib:/mnt/data/cvitek_tpu_sdk/lib:${LD_LIBRARY_PATH}
+   $ export PYTHON_EGG_CACHE=/mnt/data/python/.cache/
+
+4. Run the accuracy evaluation script:
+   $ ./eval_imagenet.py \
+       --dataset imagenet/img_val_extracted/ \
+       --cvimodel resnet50_bs4.cvimodel \
+       --count 1000 --batch_size 4 \
+       --image_resize_dims 256,256 \
+       --net_input_dims 224,224 \
+       --mean 104.01,116.67,122.68
diff --git a/cviruntime/python/cvk_test.py b/cviruntime/python/cvk_test.py
new file mode 100644
index 000000000..d24dfc1a8
--- /dev/null
+++ b/cviruntime/python/cvk_test.py
@@ -0,0 +1,16 @@
+import numpy as np
+import pyruntime as rt
+cvk_ctx = rt.CvkContext("CviContext")
+a = np.array([[[[1]]]], dtype=np.int8)
+b = cvk_ctx.lmem_alloc_tensor(a, 1)
+b.shapes()
+b.address()
+c = cvk_ctx.lmem_alloc_tensor(a, 1)
+c.shapes()
+c.address()
+
+cvk_ctx.tdma_g2l_tensor_copy(b, a)
+d = np.array([[[[0]]]], dtype=np.int8)
+cvk_ctx.tdma_l2g_tensor_copy(d, b)
+
+print(a == d)
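+# The two TDMA copies above move `a` from global memory into the local
+# tensor `b` and back out into `d`, so the round trip should leave the
+# data unchanged and the final comparison prints an all-True array.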
diff --git a/cviruntime/python/eval_imagenet.py b/cviruntime/python/eval_imagenet.py
new file mode 100755
index 000000000..7c9cacc91
--- /dev/null
+++ b/cviruntime/python/eval_imagenet.py
@@ -0,0 +1,215 @@
+#!/usr/bin/env python3
+# use pytorch for the dataloader
+# https://github.com/pytorch/examples/blob/master/imagenet/main.py
+
+import argparse
+import os
+import random
+import shutil
+import time
+import warnings
+import numpy as np
+import pyruntime
+from PIL import Image
+from PIL import ImageFile
+ImageFile.LOAD_TRUNCATED_IMAGES = True
+
+import torch
+import torch.nn as nn
+import torch.utils.data
+import torchvision.transforms as transforms
+import torchvision.datasets as datasets
+
+class AverageMeter(object):
+    """Computes and stores the average and current value"""
+    def __init__(self, name, fmt=':f'):
+        self.name = name
+        self.fmt = fmt
+        self.reset()
+
+    def reset(self):
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+
+    def update(self, val, n=1):
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
+
+    def __str__(self):
+        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
+        return fmtstr.format(**self.__dict__)
+
+class ProgressMeter(object):
+    def __init__(self, num_batches, meters, prefix=""):
+        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
+        self.meters = meters
+        self.prefix = prefix
+
+    def display(self, batch):
+        entries = [self.prefix + self.batch_fmtstr.format(batch)]
+        entries += [str(meter) for meter in self.meters]
+        print('\t'.join(entries))
+
+    def _get_batch_fmtstr(self, num_batches):
+        num_digits = len(str(num_batches // 1))
+        fmt = '{:' + str(num_digits) + 'd}'
+        return '[' + fmt + '/' + fmt.format(num_batches) + ']'
+
+def accuracy(output, target, topk=(1,)):
+    """Computes the accuracy over the k top predictions for the specified values of k"""
+    maxk = max(topk)
+    batch_size = target.size(0)
+
+    _, pred = output.topk(maxk, 1, True, True)
+    pred = pred.t()
+    correct = pred.eq(target.view(1, -1).expand_as(pred))
+
+    res = []
+    for k in topk:
+        correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
+        res.append(correct_k.mul_(100.0 / batch_size))
+    return res
+
+def datasetLoader(args):
+    image_resize_dims = [int(s) for s in args.image_resize_dims.split(',')]
+    net_input_dims = [int(s) for s in args.net_input_dims.split(',')]
+    image_resize_dims = [max(x, y) for (x, y) in zip(image_resize_dims, net_input_dims)]
+
+    valdir = os.path.join(args.dataset, 'val')
+    if (args.loader_transforms):
+        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
+                                         std=[0.229, 0.224, 0.225])
+        val_loader = torch.utils.data.DataLoader(
+            datasets.ImageFolder(valdir, transforms.Compose([
+                transforms.Resize(image_resize_dims),
+                transforms.CenterCrop(net_input_dims),
+                transforms.ToTensor(),
+                normalize,
+            ])),
+            batch_size=args.batch_size, shuffle=True)
+    else:
+        val_loader = torch.utils.data.DataLoader(
+            datasets.ImageFolder(valdir, transforms.Compose([
+                transforms.Resize(image_resize_dims),
+                transforms.CenterCrop(net_input_dims),
+                transforms.ToTensor()
+            ])),
+            batch_size=args.batch_size, shuffle=True)
+    return val_loader
+
+def imagePreprocssing(args, images, mean, qscale):
+    inputs = np.array([])
+    for image in images:
+        if args.loader_transforms:
+            # the loader already applied normalization
+            x = image.numpy()
+        else:
+            # pytorch ToTensor() does HWC to CHW and changes the range to [0.0, 1.0];
+            # for pytorch, errors occur if ToTensor is not included in the transforms.
+            # change back to the range [0, 255]
+            x = image.numpy() * 255
+            x = x.astype('uint8')
+        # transposed already in ToTensor()
+        # x = np.transpose(x, (2, 0, 1))
+        # still need the channel swap for caffe models
+        x = x[[2, 1, 0], :, :]
+        x = x.astype(np.float32)
+        if args.raw_scale != 255.0:
+            x = x * args.raw_scale / 255.0
+        # apply mean
+        if mean.size != 0:
+            x -= mean
+        if qscale != 0:
+            x = x * qscale
+        # expand to 4-D again
+        x = np.expand_dims(x, axis=0)
+        if inputs.size:
+            inputs = np.append(inputs, x, axis=0)
+        else:
+            inputs = x
+
+    if args.input_scale != 1.0:
+        inputs *= args.input_scale
+    return inputs
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description="Classification Evaluation on ImageNet Dataset.")
+    parser.add_argument("--cvimodel", type=str)
+    parser.add_argument("--batch_size", type=int, default=1)
+    parser.add_argument("--dataset", type=str, help="The root directory of the ImageNet dataset.")
+    parser.add_argument("--image_resize_dims", type=str, default='256,256')
+    parser.add_argument("--net_input_dims", type=str, default='224,224')
+    parser.add_argument("--raw_scale", type=float, help="Multiply raw input image data by this scale.", default=255.0)
+    parser.add_argument("--mean", help="Per-channel image mean values")
+    parser.add_argument("--input_scale", type=float, help="Multiply input features by this scale.", default=1.0)
+    parser.add_argument("--count", type=int, default=50000)
+    parser.add_argument("--loader_transforms", type=int, help="image transform by torch loader", default=0)
+    args = parser.parse_args()
+
+    if args.mean:
+        mean = np.array([float(s) for s in args.mean.split(',')], dtype=np.float32)
+        mean = mean[:, np.newaxis, np.newaxis]
+    else:
+        mean = np.array([])
+
+    # load the model
+    model = pyruntime.Model(args.cvimodel, args.batch_size)
+    print('load model {}'.format(args.cvimodel))
+
+    val_loader = datasetLoader(args)
+
+    batch_time = AverageMeter('Time', ':6.3f')
+    losses = AverageMeter('Loss', ':.4e')
+    top1 = AverageMeter('Acc@1', ':6.2f')
+    top5 = AverageMeter('Acc@5', ':6.2f')
+    progress = ProgressMeter(len(val_loader) * args.batch_size,
+                             [batch_time, losses, top1, top5],
+                             prefix='Test: ')
+
+    # define the loss function (criterion)
+    criterion = nn.CrossEntropyLoss()
+    threshold = ((50 + args.batch_size - 1) // args.batch_size) * args.batch_size
+    total = len(val_loader) * args.batch_size
+    count = 0
+    end = time.time()
+    for i, (images, target) in enumerate(val_loader):
+        # preprocessing
+        x = imagePreprocssing(args, images, mean, model.inputs[0].qscale)
+        # inference
+        model.inputs[0].data[:] = x
+        model.forward()
+
+        # validate the output probabilities
+        assert(len(model.outputs) == 1)
+        res = model.outputs[0].data
+        prob = np.reshape(res, (res.shape[0], res.shape[1]))
+        output = torch.from_numpy(prob)
+
+        # loss
+        loss = criterion(output, target)
+
+        # measure accuracy and record loss
+        acc1, acc5 = accuracy(output, target, topk=(1, 5))
+        losses.update(loss.item(), images.size(0))
+        top1.update(acc1[0], images.size(0))
+        top5.update(acc5[0], images.size(0))
+
+        # measure elapsed time
+        batch_time.update(time.time() - end)
+        end = time.time()
+
+        count += args.batch_size
+        if count % threshold == 0:
+            progress.display(count)
+        if count >= args.count:
+            progress.display(count)
+            break
+        if count + args.batch_size > total:
+            progress.display(count)
+            break
+    print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'
+          .format(top1=top1, top5=top5))
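Note: the pyruntime inference flow exercised by eval_imagenet.py above reduces to a
handful of calls. A minimal sketch against the same Python API, assuming a compiled
cvimodel is available (the file name below is illustrative only, not part of this diff):

    import numpy as np
    import pyruntime

    # load a compiled cvimodel with a given batch size (positional args, as above)
    model = pyruntime.Model("resnet50_bs4.cvimodel", 4)
    inp = model.inputs[0]
    # fill the input tensor with a preprocessed batch; a qscale of 0 means no
    # quantization scaling is needed, mirroring imagePreprocssing above
    x = np.zeros(inp.data.shape, dtype=np.float32)
    if inp.qscale != 0:
        x = x * inp.qscale
    inp.data[:] = x
    model.forward()
    prob = model.outputs[0].data  # raw network output as a numpy array
    print(prob.shape)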
diff --git a/cviruntime/python/include/aarch64-linux-gnu/python3.5m/pyconfig.h b/cviruntime/python/include/aarch64-linux-gnu/python3.5m/pyconfig.h new file mode 100644 index 000000000..7418b1c41 --- /dev/null +++ b/cviruntime/python/include/aarch64-linux-gnu/python3.5m/pyconfig.h @@ -0,0 +1,1552 @@ +/* pyconfig.h. Generated from pyconfig.h.in by configure. */ +/* pyconfig.h.in. Generated from configure.ac by autoheader. */ + + +#ifndef Py_PYCONFIG_H +#define Py_PYCONFIG_H + + +/* Define if building universal (internal helper macro) */ +/* #undef AC_APPLE_UNIVERSAL_BUILD */ + +/* Define for AIX if your compiler is a genuine IBM xlC/xlC_r and you want + support for AIX C++ shared extension modules. */ +/* #undef AIX_GENUINE_CPLUSPLUS */ + +/* The Android API level. */ +/* #undef ANDROID_API_LEVEL */ + +/* Define if C doubles are 64-bit IEEE 754 binary format, stored in ARM + mixed-endian order (byte order 45670123) */ +/* #undef DOUBLE_IS_ARM_MIXED_ENDIAN_IEEE754 */ + +/* Define if C doubles are 64-bit IEEE 754 binary format, stored with the most + significant byte first */ +/* #undef DOUBLE_IS_BIG_ENDIAN_IEEE754 */ + +/* Define if C doubles are 64-bit IEEE 754 binary format, stored with the + least significant byte first */ +#define DOUBLE_IS_LITTLE_ENDIAN_IEEE754 1 + +/* Define if --enable-ipv6 is specified */ +#define ENABLE_IPV6 1 + +/* Define if flock needs to be linked with bsd library. */ +/* #undef FLOCK_NEEDS_LIBBSD */ + +/* Define if getpgrp() must be called as getpgrp(0).
*/ +/* #undef GETPGRP_HAVE_ARG */ + +/* Define if gettimeofday() does not have second (timezone) argument This is + the case on Motorola V4 (R40V4.2) */ +/* #undef GETTIMEOFDAY_NO_TZ */ + +/* Define to 1 if you have the `accept4' function. */ +#define HAVE_ACCEPT4 1 + +/* Define to 1 if you have the `acosh' function. */ +#define HAVE_ACOSH 1 + +/* struct addrinfo (netdb.h) */ +#define HAVE_ADDRINFO 1 + +/* Define to 1 if you have the `alarm' function. */ +#define HAVE_ALARM 1 + +/* Define if aligned memory access is required */ +/* #undef HAVE_ALIGNED_REQUIRED */ + +/* Define to 1 if you have the header file. */ +#define HAVE_ALLOCA_H 1 + +/* Define this if your time.h defines altzone. */ +/* #undef HAVE_ALTZONE */ + +/* Define to 1 if you have the `asinh' function. */ +#define HAVE_ASINH 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_ASM_TYPES_H 1 + +/* Define to 1 if you have the `atanh' function. */ +#define HAVE_ATANH 1 + +/* Define to 1 if you have the `bind_textdomain_codeset' function. */ +#define HAVE_BIND_TEXTDOMAIN_CODESET 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_BLUETOOTH_BLUETOOTH_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_BLUETOOTH_H */ + +/* Define if mbstowcs(NULL, "text", 0) does not return the number of wide + chars that would be converted. */ +/* #undef HAVE_BROKEN_MBSTOWCS */ + +/* Define if nice() returns success/failure instead of the new priority. */ +/* #undef HAVE_BROKEN_NICE */ + +/* Define if the system reports an invalid PIPE_BUF value. */ +/* #undef HAVE_BROKEN_PIPE_BUF */ + +/* Define if poll() sets errno on invalid file descriptors. */ +/* #undef HAVE_BROKEN_POLL */ + +/* Define if the Posix semaphores do not work on your system */ +/* #undef HAVE_BROKEN_POSIX_SEMAPHORES */ + +/* Define if pthread_sigmask() does not work on your system. */ +/* #undef HAVE_BROKEN_PTHREAD_SIGMASK */ + +/* define to 1 if your sem_getvalue is broken. */ +/* #undef HAVE_BROKEN_SEM_GETVALUE */ + +/* Define if `unsetenv` does not return an int. */ +/* #undef HAVE_BROKEN_UNSETENV */ + +/* Has builtin atomics */ +#define HAVE_BUILTIN_ATOMIC 1 + +/* Define to 1 if you have the 'chflags' function. */ +/* #undef HAVE_CHFLAGS */ + +/* Define to 1 if you have the `chown' function. */ +#define HAVE_CHOWN 1 + +/* Define if you have the 'chroot' function. */ +#define HAVE_CHROOT 1 + +/* Define to 1 if you have the `clock' function. */ +#define HAVE_CLOCK 1 + +/* Define to 1 if you have the `clock_getres' function. */ +#define HAVE_CLOCK_GETRES 1 + +/* Define to 1 if you have the `clock_gettime' function. */ +#define HAVE_CLOCK_GETTIME 1 + +/* Define to 1 if you have the `clock_settime' function. */ +#define HAVE_CLOCK_SETTIME 1 + +/* Define if the C compiler supports computed gotos. */ +#define HAVE_COMPUTED_GOTOS 1 + +/* Define to 1 if you have the `confstr' function. */ +#define HAVE_CONFSTR 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_CONIO_H */ + +/* Define to 1 if you have the `copysign' function. */ +#define HAVE_COPYSIGN 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_CRYPT_H 1 + +/* Define to 1 if you have the `ctermid' function. */ +#define HAVE_CTERMID 1 + +/* Define if you have the 'ctermid_r' function. */ +/* #undef HAVE_CTERMID_R */ + +/* Define if you have the 'filter' function. */ +/* #undef HAVE_CURSES_FILTER */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_CURSES_H */ + +/* Define if you have the 'has_key' function. 
*/ +/* #undef HAVE_CURSES_HAS_KEY */ + +/* Define if you have the 'immedok' function. */ +/* #undef HAVE_CURSES_IMMEDOK */ + +/* Define if you have the 'is_pad' function or macro. */ +/* #undef HAVE_CURSES_IS_PAD */ + +/* Define if you have the 'is_term_resized' function. */ +/* #undef HAVE_CURSES_IS_TERM_RESIZED */ + +/* Define if you have the 'resizeterm' function. */ +/* #undef HAVE_CURSES_RESIZETERM */ + +/* Define if you have the 'resize_term' function. */ +/* #undef HAVE_CURSES_RESIZE_TERM */ + +/* Define if you have the 'syncok' function. */ +/* #undef HAVE_CURSES_SYNCOK */ + +/* Define if you have the 'typeahead' function. */ +/* #undef HAVE_CURSES_TYPEAHEAD */ + +/* Define if you have the 'use_env' function. */ +/* #undef HAVE_CURSES_USE_ENV */ + +/* Define if you have the 'wchgat' function. */ +/* #undef HAVE_CURSES_WCHGAT */ + +/* Define to 1 if you have the declaration of `isfinite', and to 0 if you + don't. */ +#define HAVE_DECL_ISFINITE 1 + +/* Define to 1 if you have the declaration of `isinf', and to 0 if you don't. + */ +#define HAVE_DECL_ISINF 1 + +/* Define to 1 if you have the declaration of `isnan', and to 0 if you don't. + */ +#define HAVE_DECL_ISNAN 1 + +/* Define to 1 if you have the declaration of `RTLD_DEEPBIND', and to 0 if you + don't. */ +#define HAVE_DECL_RTLD_DEEPBIND 1 + +/* Define to 1 if you have the declaration of `RTLD_GLOBAL', and to 0 if you + don't. */ +#define HAVE_DECL_RTLD_GLOBAL 1 + +/* Define to 1 if you have the declaration of `RTLD_LAZY', and to 0 if you + don't. */ +#define HAVE_DECL_RTLD_LAZY 1 + +/* Define to 1 if you have the declaration of `RTLD_LOCAL', and to 0 if you + don't. */ +#define HAVE_DECL_RTLD_LOCAL 1 + +/* Define to 1 if you have the declaration of `RTLD_NODELETE', and to 0 if you + don't. */ +#define HAVE_DECL_RTLD_NODELETE 1 + +/* Define to 1 if you have the declaration of `RTLD_NOLOAD', and to 0 if you + don't. */ +#define HAVE_DECL_RTLD_NOLOAD 1 + +/* Define to 1 if you have the declaration of `RTLD_NOW', and to 0 if you + don't. */ +#define HAVE_DECL_RTLD_NOW 1 + +/* Define to 1 if you have the declaration of `tzname', and to 0 if you don't. + */ +/* #undef HAVE_DECL_TZNAME */ + +/* Define to 1 if you have the device macros. */ +#define HAVE_DEVICE_MACROS 1 + +/* Define to 1 if you have the /dev/ptc device file. */ +/* #undef HAVE_DEV_PTC */ + +/* Define to 1 if you have the /dev/ptmx device file. */ +#define HAVE_DEV_PTMX 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_DIRECT_H */ + +/* Define to 1 if the dirent structure has a d_type field */ +#define HAVE_DIRENT_D_TYPE 1 + +/* Define to 1 if you have the header file, and it defines `DIR'. + */ +#define HAVE_DIRENT_H 1 + +/* Define if you have the 'dirfd' function or macro. */ +#define HAVE_DIRFD 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_DLFCN_H 1 + +/* Define to 1 if you have the `dlopen' function. */ +#define HAVE_DLOPEN 1 + +/* Define to 1 if you have the `dup2' function. */ +#define HAVE_DUP2 1 + +/* Define to 1 if you have the `dup3' function. */ +#define HAVE_DUP3 1 + +/* Defined when any dynamic module loading is enabled. */ +#define HAVE_DYNAMIC_LOADING 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_ENDIAN_H 1 + +/* Define if you have the 'epoll' functions. */ +#define HAVE_EPOLL 1 + +/* Define if you have the 'epoll_create1' function. */ +#define HAVE_EPOLL_CREATE1 1 + +/* Define to 1 if you have the `erf' function. */ +#define HAVE_ERF 1 + +/* Define to 1 if you have the `erfc' function. 
*/ +#define HAVE_ERFC 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_ERRNO_H 1 + +/* Define to 1 if you have the `execv' function. */ +#define HAVE_EXECV 1 + +/* Define to 1 if you have the `expm1' function. */ +#define HAVE_EXPM1 1 + +/* Define to 1 if you have the `faccessat' function. */ +#define HAVE_FACCESSAT 1 + +/* Define if you have the 'fchdir' function. */ +#define HAVE_FCHDIR 1 + +/* Define to 1 if you have the `fchmod' function. */ +#define HAVE_FCHMOD 1 + +/* Define to 1 if you have the `fchmodat' function. */ +#define HAVE_FCHMODAT 1 + +/* Define to 1 if you have the `fchown' function. */ +#define HAVE_FCHOWN 1 + +/* Define to 1 if you have the `fchownat' function. */ +#define HAVE_FCHOWNAT 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_FCNTL_H 1 + +/* Define if you have the 'fdatasync' function. */ +#define HAVE_FDATASYNC 1 + +/* Define to 1 if you have the `fdopendir' function. */ +#define HAVE_FDOPENDIR 1 + +/* Define to 1 if you have the `fexecve' function. */ +#define HAVE_FEXECVE 1 + +/* Define to 1 if you have the `finite' function. */ +#define HAVE_FINITE 1 + +/* Define to 1 if you have the `flock' function. */ +#define HAVE_FLOCK 1 + +/* Define to 1 if you have the `fork' function. */ +#define HAVE_FORK 1 + +/* Define to 1 if you have the `forkpty' function. */ +#define HAVE_FORKPTY 1 + +/* Define to 1 if you have the `fpathconf' function. */ +#define HAVE_FPATHCONF 1 + +/* Define to 1 if you have the `fseek64' function. */ +/* #undef HAVE_FSEEK64 */ + +/* Define to 1 if you have the `fseeko' function. */ +#define HAVE_FSEEKO 1 + +/* Define to 1 if you have the `fstatat' function. */ +#define HAVE_FSTATAT 1 + +/* Define to 1 if you have the `fstatvfs' function. */ +#define HAVE_FSTATVFS 1 + +/* Define if you have the 'fsync' function. */ +#define HAVE_FSYNC 1 + +/* Define to 1 if you have the `ftell64' function. */ +/* #undef HAVE_FTELL64 */ + +/* Define to 1 if you have the `ftello' function. */ +#define HAVE_FTELLO 1 + +/* Define to 1 if you have the `ftime' function. */ +#define HAVE_FTIME 1 + +/* Define to 1 if you have the `ftruncate' function. */ +#define HAVE_FTRUNCATE 1 + +/* Define to 1 if you have the `futimens' function. */ +#define HAVE_FUTIMENS 1 + +/* Define to 1 if you have the `futimes' function. */ +#define HAVE_FUTIMES 1 + +/* Define to 1 if you have the `futimesat' function. */ +#define HAVE_FUTIMESAT 1 + +/* Define to 1 if you have the `gai_strerror' function. */ +#define HAVE_GAI_STRERROR 1 + +/* Define to 1 if you have the `gamma' function. */ +#define HAVE_GAMMA 1 + +/* Define if we can use gcc inline assembler to get and set mc68881 fpcr */ +/* #undef HAVE_GCC_ASM_FOR_MC68881 */ + +/* Define if we can use x64 gcc inline assembler */ +/* #undef HAVE_GCC_ASM_FOR_X64 */ + +/* Define if we can use gcc inline assembler to get and set x87 control word + */ +/* #undef HAVE_GCC_ASM_FOR_X87 */ + +/* Define if your compiler provides __uint128_t */ +#define HAVE_GCC_UINT128_T 1 + +/* Define if you have the getaddrinfo function. */ +#define HAVE_GETADDRINFO 1 + +/* Define this if you have flockfile(), getc_unlocked(), and funlockfile() */ +#define HAVE_GETC_UNLOCKED 1 + +/* Define to 1 if you have the `getentropy' function. */ +/* #undef HAVE_GETENTROPY */ + +/* Define to 1 if you have the `getgrouplist' function. */ +#define HAVE_GETGROUPLIST 1 + +/* Define to 1 if you have the `getgroups' function. */ +#define HAVE_GETGROUPS 1 + +/* Define to 1 if you have the `gethostbyname' function. 
*/ +/* #undef HAVE_GETHOSTBYNAME */ + +/* Define this if you have some version of gethostbyname_r() */ +#define HAVE_GETHOSTBYNAME_R 1 + +/* Define this if you have the 3-arg version of gethostbyname_r(). */ +/* #undef HAVE_GETHOSTBYNAME_R_3_ARG */ + +/* Define this if you have the 5-arg version of gethostbyname_r(). */ +/* #undef HAVE_GETHOSTBYNAME_R_5_ARG */ + +/* Define this if you have the 6-arg version of gethostbyname_r(). */ +#define HAVE_GETHOSTBYNAME_R_6_ARG 1 + +/* Define to 1 if you have the `getitimer' function. */ +#define HAVE_GETITIMER 1 + +/* Define to 1 if you have the `getloadavg' function. */ +#define HAVE_GETLOADAVG 1 + +/* Define to 1 if you have the `getlogin' function. */ +#define HAVE_GETLOGIN 1 + +/* Define to 1 if you have the `getnameinfo' function. */ +#define HAVE_GETNAMEINFO 1 + +/* Define if you have the 'getpagesize' function. */ +#define HAVE_GETPAGESIZE 1 + +/* Define to 1 if you have the `getpeername' function. */ +#define HAVE_GETPEERNAME 1 + +/* Define to 1 if you have the `getpgid' function. */ +#define HAVE_GETPGID 1 + +/* Define to 1 if you have the `getpgrp' function. */ +#define HAVE_GETPGRP 1 + +/* Define to 1 if you have the `getpid' function. */ +#define HAVE_GETPID 1 + +/* Define to 1 if you have the `getpriority' function. */ +#define HAVE_GETPRIORITY 1 + +/* Define to 1 if you have the `getpwent' function. */ +#define HAVE_GETPWENT 1 + +/* Define to 1 if the getrandom() function is available */ +/* #undef HAVE_GETRANDOM */ + +/* Define to 1 if the Linux getrandom() syscall is available */ +#define HAVE_GETRANDOM_SYSCALL 1 + +/* Define to 1 if you have the `getresgid' function. */ +#define HAVE_GETRESGID 1 + +/* Define to 1 if you have the `getresuid' function. */ +#define HAVE_GETRESUID 1 + +/* Define to 1 if you have the `getsid' function. */ +#define HAVE_GETSID 1 + +/* Define to 1 if you have the `getspent' function. */ +#define HAVE_GETSPENT 1 + +/* Define to 1 if you have the `getspnam' function. */ +#define HAVE_GETSPNAM 1 + +/* Define to 1 if you have the `gettimeofday' function. */ +#define HAVE_GETTIMEOFDAY 1 + +/* Define to 1 if you have the `getwd' function. */ +#define HAVE_GETWD 1 + +/* Define if glibc has incorrect _FORTIFY_SOURCE wrappers for memmove and + bcopy. */ +/* #undef HAVE_GLIBC_MEMMOVE_BUG */ + +/* Define to 1 if you have the header file. */ +#define HAVE_GRP_H 1 + +/* Define if you have the 'hstrerror' function. */ +#define HAVE_HSTRERROR 1 + +/* Define this if you have le64toh() */ +#define HAVE_HTOLE64 1 + +/* Define to 1 if you have the `hypot' function. */ +#define HAVE_HYPOT 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_IEEEFP_H */ + +/* Define to 1 if you have the 'if_nameindex' function. */ +#define HAVE_IF_NAMEINDEX 1 + +/* Define if you have the 'inet_aton' function. */ +#define HAVE_INET_ATON 1 + +/* Define if you have the 'inet_pton' function. */ +#define HAVE_INET_PTON 1 + +/* Define to 1 if you have the `initgroups' function. */ +#define HAVE_INITGROUPS 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_IO_H */ + +/* Define if gcc has the ipa-pure-const bug. */ +/* #undef HAVE_IPA_PURE_CONST_BUG */ + +/* Define to 1 if you have the `kill' function. */ +#define HAVE_KILL 1 + +/* Define to 1 if you have the `killpg' function. */ +#define HAVE_KILLPG 1 + +/* Define if you have the 'kqueue' functions. */ +/* #undef HAVE_KQUEUE */ + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_LANGINFO_H 1 + +/* Defined to enable large file support when an off_t is bigger than a long + and long long is available and at least as big as an off_t. You may need to + add some flags for configuration and compilation to enable this mode. (For + Solaris and Linux, the necessary defines are already defined.) */ +/* #undef HAVE_LARGEFILE_SUPPORT */ + +/* Define to 1 if you have the 'lchflags' function. */ +/* #undef HAVE_LCHFLAGS */ + +/* Define to 1 if you have the `lchmod' function. */ +/* #undef HAVE_LCHMOD */ + +/* Define to 1 if you have the `lchown' function. */ +#define HAVE_LCHOWN 1 + +/* Define to 1 if you have the `lgamma' function. */ +#define HAVE_LGAMMA 1 + +/* Define to 1 if you have the `dl' library (-ldl). */ +#define HAVE_LIBDL 1 + +/* Define to 1 if you have the `dld' library (-ldld). */ +/* #undef HAVE_LIBDLD */ + +/* Define to 1 if you have the `ieee' library (-lieee). */ +/* #undef HAVE_LIBIEEE */ + +/* Define to 1 if you have the header file. */ +#define HAVE_LIBINTL_H 1 + +/* Define if you have the readline library (-lreadline). */ +/* #undef HAVE_LIBREADLINE */ + +/* Define to 1 if you have the `resolv' library (-lresolv). */ +/* #undef HAVE_LIBRESOLV */ + +/* Define to 1 if you have the `sendfile' library (-lsendfile). */ +/* #undef HAVE_LIBSENDFILE */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_LIBUTIL_H */ + +/* Define if you have the 'link' function. */ +#define HAVE_LINK 1 + +/* Define to 1 if you have the `linkat' function. */ +#define HAVE_LINKAT 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_LINUX_CAN_BCM_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_LINUX_CAN_H 1 + +/* Define if compiling using Linux 3.6 or later. */ +#define HAVE_LINUX_CAN_RAW_FD_FRAMES 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_LINUX_CAN_RAW_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_LINUX_NETLINK_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_LINUX_RANDOM_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_LINUX_TIPC_H 1 + +/* Define to 1 if you have the 'lockf' function and the F_LOCK macro. */ +#define HAVE_LOCKF 1 + +/* Define to 1 if you have the `log1p' function. */ +#define HAVE_LOG1P 1 + +/* Define to 1 if you have the `log2' function. */ +#define HAVE_LOG2 1 + +/* Define this if you have the type long double. */ +#define HAVE_LONG_DOUBLE 1 + +/* Define to 1 if you have the `lstat' function. */ +#define HAVE_LSTAT 1 + +/* Define to 1 if you have the `lutimes' function. */ +#define HAVE_LUTIMES 1 + +/* Define this if you have the makedev macro. */ +#define HAVE_MAKEDEV 1 + +/* Define to 1 if you have the `mbrtowc' function. */ +#define HAVE_MBRTOWC 1 + +/* Define to 1 if you have the `memmove' function. */ +#define HAVE_MEMMOVE 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the `memrchr' function. */ +#define HAVE_MEMRCHR 1 + +/* Define to 1 if you have the `mkdirat' function. */ +#define HAVE_MKDIRAT 1 + +/* Define to 1 if you have the `mkfifo' function. */ +#define HAVE_MKFIFO 1 + +/* Define to 1 if you have the `mkfifoat' function. */ +#define HAVE_MKFIFOAT 1 + +/* Define to 1 if you have the `mknod' function. */ +#define HAVE_MKNOD 1 + +/* Define to 1 if you have the `mknodat' function. */ +#define HAVE_MKNODAT 1 + +/* Define to 1 if you have the `mktime' function. */ +#define HAVE_MKTIME 1 + +/* Define to 1 if you have the `mmap' function. 
*/ +#define HAVE_MMAP 1 + +/* Define to 1 if you have the `mremap' function. */ +#define HAVE_MREMAP 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_NCURSES_H */ + +/* Define to 1 if you have the header file, and it defines `DIR'. */ +/* #undef HAVE_NDIR_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_NETPACKET_PACKET_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_NET_IF_H 1 + +/* Define to 1 if you have the `nice' function. */ +#define HAVE_NICE 1 + +/* Define to 1 if you have the `openat' function. */ +#define HAVE_OPENAT 1 + +/* Define to 1 if you have the `openpty' function. */ +#define HAVE_OPENPTY 1 + +/* Define to 1 if you have the `pathconf' function. */ +#define HAVE_PATHCONF 1 + +/* Define to 1 if you have the `pause' function. */ +#define HAVE_PAUSE 1 + +/* Define to 1 if you have the `pipe2' function. */ +#define HAVE_PIPE2 1 + +/* Define to 1 if you have the `plock' function. */ +/* #undef HAVE_PLOCK */ + +/* Define to 1 if you have the `poll' function. */ +#define HAVE_POLL 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_POLL_H 1 + +/* Define to 1 if you have the `posix_fadvise' function. */ +#define HAVE_POSIX_FADVISE 1 + +/* Define to 1 if you have the `posix_fallocate' function. */ +#define HAVE_POSIX_FALLOCATE 1 + +/* Define to 1 if you have the `pread' function. */ +#define HAVE_PREAD 1 + +/* Define if you have the 'prlimit' functions. */ +#define HAVE_PRLIMIT 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_PROCESS_H */ + +/* Define if your compiler supports function prototype */ +#define HAVE_PROTOTYPES 1 + +/* Define to 1 if you have the `pthread_atfork' function. */ +#define HAVE_PTHREAD_ATFORK 1 + +/* Defined for Solaris 2.6 bug in pthread header. */ +/* #undef HAVE_PTHREAD_DESTRUCTOR */ + +/* Define to 1 if you have the header file. */ +#define HAVE_PTHREAD_H 1 + +/* Define to 1 if you have the `pthread_init' function. */ +/* #undef HAVE_PTHREAD_INIT */ + +/* Define to 1 if you have the `pthread_kill' function. */ +#define HAVE_PTHREAD_KILL 1 + +/* Define to 1 if you have the `pthread_sigmask' function. */ +#define HAVE_PTHREAD_SIGMASK 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_PTY_H 1 + +/* Define to 1 if you have the `putenv' function. */ +#define HAVE_PUTENV 1 + +/* Define to 1 if you have the `pwrite' function. */ +#define HAVE_PWRITE 1 + +/* Define to 1 if you have the `readlink' function. */ +#define HAVE_READLINK 1 + +/* Define to 1 if you have the `readlinkat' function. */ +#define HAVE_READLINKAT 1 + +/* Define to 1 if you have the `readv' function. */ +#define HAVE_READV 1 + +/* Define to 1 if you have the `realpath' function. */ +#define HAVE_REALPATH 1 + +/* Define to 1 if you have the `renameat' function. */ +#define HAVE_RENAMEAT 1 + +/* Define if readline supports append_history */ +/* #undef HAVE_RL_APPEND_HISTORY */ + +/* Define if you have readline 2.1 */ +/* #undef HAVE_RL_CALLBACK */ + +/* Define if you can turn off readline's signal handling. 
*/ +/* #undef HAVE_RL_CATCH_SIGNAL */ + +/* Define if you have readline 2.2 */ +/* #undef HAVE_RL_COMPLETION_APPEND_CHARACTER */ + +/* Define if you have readline 4.0 */ +/* #undef HAVE_RL_COMPLETION_DISPLAY_MATCHES_HOOK */ + +/* Define if you have readline 4.2 */ +/* #undef HAVE_RL_COMPLETION_MATCHES */ + +/* Define if you have rl_completion_suppress_append */ +/* #undef HAVE_RL_COMPLETION_SUPPRESS_APPEND */ + +/* Define if you have readline 4.0 */ +/* #undef HAVE_RL_PRE_INPUT_HOOK */ + +/* Define if you have readline 4.0 */ +/* #undef HAVE_RL_RESIZE_TERMINAL */ + +/* Define to 1 if you have the `round' function. */ +#define HAVE_ROUND 1 + +/* Define to 1 if you have the `sched_get_priority_max' function. */ +#define HAVE_SCHED_GET_PRIORITY_MAX 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SCHED_H 1 + +/* Define to 1 if you have the `sched_rr_get_interval' function. */ +#define HAVE_SCHED_RR_GET_INTERVAL 1 + +/* Define to 1 if you have the `sched_setaffinity' function. */ +#define HAVE_SCHED_SETAFFINITY 1 + +/* Define to 1 if you have the `sched_setparam' function. */ +#define HAVE_SCHED_SETPARAM 1 + +/* Define to 1 if you have the `sched_setscheduler' function. */ +#define HAVE_SCHED_SETSCHEDULER 1 + +/* Define to 1 if you have the `select' function. */ +#define HAVE_SELECT 1 + +/* Define to 1 if you have the `sem_getvalue' function. */ +#define HAVE_SEM_GETVALUE 1 + +/* Define to 1 if you have the `sem_open' function. */ +#define HAVE_SEM_OPEN 1 + +/* Define to 1 if you have the `sem_timedwait' function. */ +#define HAVE_SEM_TIMEDWAIT 1 + +/* Define to 1 if you have the `sem_unlink' function. */ +#define HAVE_SEM_UNLINK 1 + +/* Define to 1 if you have the `sendfile' function. */ +#define HAVE_SENDFILE 1 + +/* Define to 1 if you have the `setegid' function. */ +#define HAVE_SETEGID 1 + +/* Define to 1 if you have the `seteuid' function. */ +#define HAVE_SETEUID 1 + +/* Define to 1 if you have the `setgid' function. */ +#define HAVE_SETGID 1 + +/* Define if you have the 'setgroups' function. */ +#define HAVE_SETGROUPS 1 + +/* Define to 1 if you have the `sethostname' function. */ +#define HAVE_SETHOSTNAME 1 + +/* Define to 1 if you have the `setitimer' function. */ +#define HAVE_SETITIMER 1 + +/* Define to 1 if you have the `setlocale' function. */ +#define HAVE_SETLOCALE 1 + +/* Define to 1 if you have the `setpgid' function. */ +#define HAVE_SETPGID 1 + +/* Define to 1 if you have the `setpgrp' function. */ +#define HAVE_SETPGRP 1 + +/* Define to 1 if you have the `setpriority' function. */ +#define HAVE_SETPRIORITY 1 + +/* Define to 1 if you have the `setregid' function. */ +#define HAVE_SETREGID 1 + +/* Define to 1 if you have the `setresgid' function. */ +#define HAVE_SETRESGID 1 + +/* Define to 1 if you have the `setresuid' function. */ +#define HAVE_SETRESUID 1 + +/* Define to 1 if you have the `setreuid' function. */ +#define HAVE_SETREUID 1 + +/* Define to 1 if you have the `setsid' function. */ +#define HAVE_SETSID 1 + +/* Define to 1 if you have the `setuid' function. */ +#define HAVE_SETUID 1 + +/* Define to 1 if you have the `setvbuf' function. */ +#define HAVE_SETVBUF 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SHADOW_H 1 + +/* Define to 1 if you have the `sigaction' function. */ +#define HAVE_SIGACTION 1 + +/* Define to 1 if you have the `sigaltstack' function. */ +#define HAVE_SIGALTSTACK 1 + +/* Define to 1 if you have the `siginterrupt' function. */ +#define HAVE_SIGINTERRUPT 1 + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_SIGNAL_H 1 + +/* Define to 1 if you have the `sigpending' function. */ +#define HAVE_SIGPENDING 1 + +/* Define to 1 if you have the `sigrelse' function. */ +#define HAVE_SIGRELSE 1 + +/* Define to 1 if you have the `sigtimedwait' function. */ +#define HAVE_SIGTIMEDWAIT 1 + +/* Define to 1 if you have the `sigwait' function. */ +#define HAVE_SIGWAIT 1 + +/* Define to 1 if you have the `sigwaitinfo' function. */ +#define HAVE_SIGWAITINFO 1 + +/* Define to 1 if you have the `snprintf' function. */ +#define HAVE_SNPRINTF 1 + +/* struct sockaddr_alg (linux/if_alg.h) */ +#define HAVE_SOCKADDR_ALG 1 + +/* Define if sockaddr has sa_len member */ +/* #undef HAVE_SOCKADDR_SA_LEN */ + +/* struct sockaddr_storage (sys/socket.h) */ +#define HAVE_SOCKADDR_STORAGE 1 + +/* Define if you have the 'socketpair' function. */ +#define HAVE_SOCKETPAIR 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SPAWN_H 1 + +/* Define if your compiler provides ssize_t */ +#define HAVE_SSIZE_T 1 + +/* Define to 1 if you have the `statvfs' function. */ +#define HAVE_STATVFS 1 + +/* Define if you have struct stat.st_mtim.tv_nsec */ +#define HAVE_STAT_TV_NSEC 1 + +/* Define if you have struct stat.st_mtimensec */ +/* #undef HAVE_STAT_TV_NSEC2 */ + +/* Define if your compiler supports variable length function prototypes (e.g. + void fprintf(FILE *, char *, ...);) *and* */ +#define HAVE_STDARG_PROTOTYPES 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Has stdatomic.h with atomic_int */ +#define HAVE_STD_ATOMIC 1 + +/* Define to 1 if you have the `strdup' function. */ +#define HAVE_STRDUP 1 + +/* Define to 1 if you have the `strftime' function. */ +#define HAVE_STRFTIME 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the `strlcpy' function. */ +/* #undef HAVE_STRLCPY */ + +/* Define to 1 if you have the header file. */ +#define HAVE_STROPTS_H 1 + +/* Define to 1 if `pw_gecos' is a member of `struct passwd'. */ +#define HAVE_STRUCT_PASSWD_PW_GECOS 1 + +/* Define to 1 if `pw_passwd' is a member of `struct passwd'. */ +#define HAVE_STRUCT_PASSWD_PW_PASSWD 1 + +/* Define to 1 if `st_birthtime' is a member of `struct stat'. */ +/* #undef HAVE_STRUCT_STAT_ST_BIRTHTIME */ + +/* Define to 1 if `st_blksize' is a member of `struct stat'. */ +#define HAVE_STRUCT_STAT_ST_BLKSIZE 1 + +/* Define to 1 if `st_blocks' is a member of `struct stat'. */ +#define HAVE_STRUCT_STAT_ST_BLOCKS 1 + +/* Define to 1 if `st_flags' is a member of `struct stat'. */ +/* #undef HAVE_STRUCT_STAT_ST_FLAGS */ + +/* Define to 1 if `st_gen' is a member of `struct stat'. */ +/* #undef HAVE_STRUCT_STAT_ST_GEN */ + +/* Define to 1 if `st_rdev' is a member of `struct stat'. */ +#define HAVE_STRUCT_STAT_ST_RDEV 1 + +/* Define to 1 if `tm_zone' is a member of `struct tm'. */ +#define HAVE_STRUCT_TM_TM_ZONE 1 + +/* Define if you have the 'symlink' function. */ +#define HAVE_SYMLINK 1 + +/* Define to 1 if you have the `symlinkat' function. */ +#define HAVE_SYMLINKAT 1 + +/* Define to 1 if you have the `sync' function. */ +#define HAVE_SYNC 1 + +/* Define to 1 if you have the `sysconf' function. */ +#define HAVE_SYSCONF 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYSEXITS_H 1 + +/* Define to 1 if you have the header file. 
*/ +/* #undef HAVE_SYS_AUDIOIO_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_BSDTTY_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_DEVPOLL_H */ + +/* Define to 1 if you have the header file, and it defines `DIR'. + */ +/* #undef HAVE_SYS_DIR_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_ENDIAN_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_EPOLL_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_EVENT_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_FILE_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_IOCTL_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_KERN_CONTROL_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_LOADAVG_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_LOCK_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_MKDEV_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_MODEM_H */ + +/* Define to 1 if you have the header file, and it defines `DIR'. + */ +/* #undef HAVE_SYS_NDIR_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_PARAM_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_POLL_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_RANDOM_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_RESOURCE_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SELECT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SENDFILE_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SOCKET_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STATVFS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SYSCALL_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SYSMACROS_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SYS_DOMAIN_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_TERMIO_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIMES_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIME_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_UIO_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_UN_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_UTSNAME_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_WAIT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_XATTR_H 1 + +/* Define to 1 if you have the `tcgetpgrp' function. */ +#define HAVE_TCGETPGRP 1 + +/* Define to 1 if you have the `tcsetpgrp' function. */ +#define HAVE_TCSETPGRP 1 + +/* Define to 1 if you have the `tempnam' function. */ +#define HAVE_TEMPNAM 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_TERMIOS_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_TERM_H */ + +/* Define to 1 if you have the `tgamma' function. */ +#define HAVE_TGAMMA 1 + +/* Define to 1 if you have the `timegm' function. */ +#define HAVE_TIMEGM 1 + +/* Define to 1 if you have the `times' function. 
*/ +#define HAVE_TIMES 1 + +/* Define to 1 if you have the `tmpfile' function. */ +#define HAVE_TMPFILE 1 + +/* Define to 1 if you have the `tmpnam' function. */ +#define HAVE_TMPNAM 1 + +/* Define to 1 if you have the `tmpnam_r' function. */ +#define HAVE_TMPNAM_R 1 + +/* Define to 1 if your `struct tm' has `tm_zone'. Deprecated, use + `HAVE_STRUCT_TM_TM_ZONE' instead. */ +#define HAVE_TM_ZONE 1 + +/* Define to 1 if you have the `truncate' function. */ +#define HAVE_TRUNCATE 1 + +/* Define to 1 if you don't have `tm_zone' but do have the external array + `tzname'. */ +/* #undef HAVE_TZNAME */ + +/* Define this if you have tcl and TCL_UTF_MAX==6 */ +/* #undef HAVE_UCS4_TCL */ + +/* Define to 1 if you have the `uname' function. */ +#define HAVE_UNAME 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UNISTD_H 1 + +/* Define to 1 if you have the `unlinkat' function. */ +#define HAVE_UNLINKAT 1 + +/* Define to 1 if you have the `unsetenv' function. */ +#define HAVE_UNSETENV 1 + +/* Define if you have a useable wchar_t type defined in wchar.h; useable means + wchar_t must be an unsigned type with at least 16 bits. (see + Include/unicodeobject.h). */ +#define HAVE_USABLE_WCHAR_T 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_UTIL_H */ + +/* Define to 1 if you have the `utimensat' function. */ +#define HAVE_UTIMENSAT 1 + +/* Define to 1 if you have the `utimes' function. */ +#define HAVE_UTIMES 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UTIME_H 1 + +/* Define to 1 if you have the `wait3' function. */ +#define HAVE_WAIT3 1 + +/* Define to 1 if you have the `wait4' function. */ +#define HAVE_WAIT4 1 + +/* Define to 1 if you have the `waitid' function. */ +#define HAVE_WAITID 1 + +/* Define to 1 if you have the `waitpid' function. */ +#define HAVE_WAITPID 1 + +/* Define if the compiler provides a wchar.h header file. */ +#define HAVE_WCHAR_H 1 + +/* Define to 1 if you have the `wcscoll' function. */ +#define HAVE_WCSCOLL 1 + +/* Define to 1 if you have the `wcsftime' function. */ +#define HAVE_WCSFTIME 1 + +/* Define to 1 if you have the `wcsxfrm' function. */ +#define HAVE_WCSXFRM 1 + +/* Define to 1 if you have the `wmemcmp' function. */ +#define HAVE_WMEMCMP 1 + +/* Define if tzset() actually switches the local timezone in a meaningful way. + */ +#define HAVE_WORKING_TZSET 1 + +/* Define to 1 if you have the `writev' function. */ +#define HAVE_WRITEV 1 + +/* Define if the zlib library has inflateCopy */ +#define HAVE_ZLIB_COPY 1 + +/* Define to 1 if you have the `_getpty' function. */ +/* #undef HAVE__GETPTY */ + +/* Define if log1p(-0.) is 0. rather than -0. */ +/* #undef LOG1P_DROPS_ZERO_SIGN */ + +/* Define to 1 if `major', `minor', and `makedev' are declared in . + */ +/* #undef MAJOR_IN_MKDEV */ + +/* Define to 1 if `major', `minor', and `makedev' are declared in + . */ +/* #undef MAJOR_IN_SYSMACROS */ + +/* Define if mvwdelch in curses.h is an expression. */ +/* #undef MVWDELCH_IS_EXPRESSION */ + +/* Define to the address where bug reports for this package should be sent. */ +/* #undef PACKAGE_BUGREPORT */ + +/* Define to the full name of this package. */ +/* #undef PACKAGE_NAME */ + +/* Define to the full name and version of this package. */ +/* #undef PACKAGE_STRING */ + +/* Define to the one symbol short name of this package. */ +/* #undef PACKAGE_TARNAME */ + +/* Define to the home page for this package. */ +/* #undef PACKAGE_URL */ + +/* Define to the version of this package. 
*/ +/* #undef PACKAGE_VERSION */ + +/* Define if POSIX semaphores aren't enabled on your system */ +/* #undef POSIX_SEMAPHORES_NOT_ENABLED */ + +/* Defined if PTHREAD_SCOPE_SYSTEM supported. */ +#define PTHREAD_SYSTEM_SCHED_SUPPORTED 1 + +/* Define as the preferred size in bits of long digits */ +/* #undef PYLONG_BITS_IN_DIGIT */ + +/* Define to printf format modifier for Py_ssize_t */ +#define PY_FORMAT_SIZE_T "z" + +/* Define if you want to build an interpreter with many run-time checks. */ +/* #undef Py_DEBUG */ + +/* Defined if Python is built as a shared library. */ +#define Py_ENABLE_SHARED 1 + +/* Define hash algorithm for str, bytes and memoryview. SipHash24: 1, FNV: 2, + externally defined: 0 */ +/* #undef Py_HASH_ALGORITHM */ + +/* assume C89 semantics that RETSIGTYPE is always void */ +#define RETSIGTYPE void + +/* Define if setpgrp() must be called as setpgrp(0, 0). */ +/* #undef SETPGRP_HAVE_ARG */ + +/* Define if i>>j for signed int i does not extend the sign bit when i < 0 */ +/* #undef SIGNED_RIGHT_SHIFT_ZERO_FILLS */ + +/* The size of `double', as computed by sizeof. */ +#define SIZEOF_DOUBLE 8 + +/* The size of `float', as computed by sizeof. */ +#define SIZEOF_FLOAT 4 + +/* The size of `fpos_t', as computed by sizeof. */ +#define SIZEOF_FPOS_T 16 + +/* The size of `int', as computed by sizeof. */ +#define SIZEOF_INT 4 + +/* The size of `long', as computed by sizeof. */ +#define SIZEOF_LONG 8 + +/* The size of `long double', as computed by sizeof. */ +#define SIZEOF_LONG_DOUBLE 16 + +/* The size of `long long', as computed by sizeof. */ +#define SIZEOF_LONG_LONG 8 + +/* The size of `off_t', as computed by sizeof. */ +#define SIZEOF_OFF_T 8 + +/* The size of `pid_t', as computed by sizeof. */ +#define SIZEOF_PID_T 4 + +/* The size of `pthread_t', as computed by sizeof. */ +#define SIZEOF_PTHREAD_T 8 + +/* The size of `short', as computed by sizeof. */ +#define SIZEOF_SHORT 2 + +/* The size of `size_t', as computed by sizeof. */ +#define SIZEOF_SIZE_T 8 + +/* The size of `time_t', as computed by sizeof. */ +#define SIZEOF_TIME_T 8 + +/* The size of `uintptr_t', as computed by sizeof. */ +#define SIZEOF_UINTPTR_T 8 + +/* The size of `void *', as computed by sizeof. */ +#define SIZEOF_VOID_P 8 + +/* The size of `wchar_t', as computed by sizeof. */ +#define SIZEOF_WCHAR_T 4 + +/* The size of `_Bool', as computed by sizeof. */ +#define SIZEOF__BOOL 1 + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Define if you can safely include both and + (which you can't on SCO ODT 3.0). */ +#define SYS_SELECT_WITH_SYS_TIME 1 + +/* Define if tanh(-0.) is -0., or if platform doesn't have signed zeros */ +#define TANH_PRESERVES_ZERO_SIGN 1 + +/* Library needed by timemodule.c: librt may be needed for clock_gettime() */ +/* #undef TIMEMODULE_LIB */ + +/* Define to 1 if you can safely include both and . */ +#define TIME_WITH_SYS_TIME 1 + +/* Define to 1 if your declares `struct tm'. */ +/* #undef TM_IN_SYS_TIME */ + +/* Define if you want to use computed gotos in ceval.c. */ +/* #undef USE_COMPUTED_GOTOS */ + +/* Define to use the C99 inline keyword. */ +#define USE_INLINE 1 + +/* Enable extensions on AIX 3, Interix. */ +#ifndef _ALL_SOURCE +# define _ALL_SOURCE 1 +#endif +/* Enable GNU extensions on systems that have them. */ +#ifndef _GNU_SOURCE +# define _GNU_SOURCE 1 +#endif +/* Enable threading extensions on Solaris. */ +#ifndef _POSIX_PTHREAD_SEMANTICS +# define _POSIX_PTHREAD_SEMANTICS 1 +#endif +/* Enable extensions on HP NonStop. 
*/ +#ifndef _TANDEM_SOURCE +# define _TANDEM_SOURCE 1 +#endif +/* Enable general extensions on Solaris. */ +#ifndef __EXTENSIONS__ +# define __EXTENSIONS__ 1 +#endif + + +/* Define if you want SIGFPE handled (see Include/pyfpe.h). */ +/* #undef WANT_SIGFPE_HANDLER */ + +/* Define if WINDOW in curses.h offers a field _flags. */ +/* #undef WINDOW_HAS_FLAGS */ + +/* Define if you want documentation strings in extension modules */ +#define WITH_DOC_STRINGS 1 + +/* Define if you want to compile in DTrace support */ +/* #undef WITH_DTRACE */ + +/* Define if you want to use the new-style (Openstep, Rhapsody, MacOS) dynamic + linker (dyld) instead of the old-style (NextStep) dynamic linker (rld). + Dyld is necessary to support frameworks. */ +/* #undef WITH_DYLD */ + +/* Define to 1 if libintl is needed for locale functions. */ +/* #undef WITH_LIBINTL */ + +/* Define if you want to produce an OpenStep/Rhapsody framework (shared + library plus accessory files). */ +/* #undef WITH_NEXT_FRAMEWORK */ + +/* Define if you want to compile in Python-specific mallocs */ +#define WITH_PYMALLOC 1 + +/* Define if you want to compile in rudimentary thread support */ +#define WITH_THREAD 1 + +/* Define if you want pymalloc to be disabled when running under valgrind */ +/* #undef WITH_VALGRIND */ + +/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most + significant byte first (like Motorola and SPARC, unlike Intel). */ +#if defined AC_APPLE_UNIVERSAL_BUILD +# if defined __BIG_ENDIAN__ +# define WORDS_BIGENDIAN 1 +# endif +#else +# ifndef WORDS_BIGENDIAN +/* # undef WORDS_BIGENDIAN */ +# endif +#endif + +/* Define if arithmetic is subject to x87-style double rounding issue */ +/* #undef X87_DOUBLE_ROUNDING */ + +/* Define on OpenBSD to activate all library features */ +/* #undef _BSD_SOURCE */ + +/* Define on Irix to enable u_int */ +#define _BSD_TYPES 1 + +/* Define on Darwin to activate all library features */ +#define _DARWIN_C_SOURCE 1 + +/* This must be set to 64 on some systems to enable large file support. */ +#define _FILE_OFFSET_BITS 64 + +/* Define on Linux to activate all library features */ +#define _GNU_SOURCE 1 + +/* Define to include mbstate_t for mbrtowc */ +/* #undef _INCLUDE__STDC_A1_SOURCE */ + +/* This must be defined on some systems to enable large file support. */ +#define _LARGEFILE_SOURCE 1 + +/* This must be defined on AIX systems to enable large file support. */ +/* #undef _LARGE_FILES */ + +/* Define to 1 if on MINIX. */ +/* #undef _MINIX */ + +/* Define on NetBSD to activate all library features */ +#define _NETBSD_SOURCE 1 + +/* Define to 2 if the system does not provide POSIX.1 features except with + this defined. */ +/* #undef _POSIX_1_SOURCE */ + +/* Define to activate features from IEEE Stds 1003.1-2008 */ +#define _POSIX_C_SOURCE 200809L + +/* Define to 1 if you need to in order for `stat' and other things to work. */ +/* #undef _POSIX_SOURCE */ + +/* Define if you have POSIX threads, and your system does not define that. */ +/* #undef _POSIX_THREADS */ + +/* Define to force use of thread-safe errno, h_errno, and other functions */ +/* #undef _REENTRANT */ + +/* Define to the level of X/Open that your system supports */ +#define _XOPEN_SOURCE 700 + +/* Define to activate Unix95-and-earlier features */ +#define _XOPEN_SOURCE_EXTENDED 1 + +/* Define on FreeBSD to activate all library features */ +#define __BSD_VISIBLE 1 + +/* Define to 1 if type `char' is unsigned and you are not using gcc. 
*/ +#ifndef __CHAR_UNSIGNED__ +/* # undef __CHAR_UNSIGNED__ */ +#endif + +/* Define to 'long' if doesn't define. */ +/* #undef clock_t */ + +/* Define to empty if `const' does not conform to ANSI C. */ +/* #undef const */ + +/* Define to `int' if doesn't define. */ +/* #undef gid_t */ + +/* Define to `__inline__' or `__inline' if that's what the C compiler + calls it, or to nothing if 'inline' is not supported under any name. */ +#ifndef __cplusplus +/* #undef inline */ +#endif + +/* Define to `int' if does not define. */ +/* #undef mode_t */ + +/* Define to `long int' if does not define. */ +/* #undef off_t */ + +/* Define to `int' if does not define. */ +/* #undef pid_t */ + +/* Define to empty if the keyword does not work. */ +/* #undef signed */ + +/* Define to `unsigned int' if does not define. */ +/* #undef size_t */ + +/* Define to `int' if does not define. */ +/* #undef socklen_t */ + +/* Define to `int' if doesn't define. */ +/* #undef uid_t */ + +/* Define to empty if the keyword does not work. */ +/* #undef volatile */ + + +/* Define the macros needed if on a UnixWare 7.x system. */ +#if defined(__USLC__) && defined(__SCO_VERSION__) +#define STRICT_SYSV_CURSES /* Don't use ncurses extensions */ +#endif + +#endif /*Py_PYCONFIG_H*/ + diff --git a/cviruntime/python/include/aarch64-linux-gnu/python3.6m/pyconfig.h b/cviruntime/python/include/aarch64-linux-gnu/python3.6m/pyconfig.h new file mode 100644 index 000000000..7418b1c41 --- /dev/null +++ b/cviruntime/python/include/aarch64-linux-gnu/python3.6m/pyconfig.h @@ -0,0 +1,1552 @@ +/* pyconfig.h. Generated from pyconfig.h.in by configure. */ +/* pyconfig.h.in. Generated from configure.ac by autoheader. */ + + +#ifndef Py_PYCONFIG_H +#define Py_PYCONFIG_H + + +/* Define if building universal (internal helper macro) */ +/* #undef AC_APPLE_UNIVERSAL_BUILD */ + +/* Define for AIX if your compiler is a genuine IBM xlC/xlC_r and you want + support for AIX C++ shared extension modules. */ +/* #undef AIX_GENUINE_CPLUSPLUS */ + +/* The Android API level. */ +/* #undef ANDROID_API_LEVEL */ + +/* Define if C doubles are 64-bit IEEE 754 binary format, stored in ARM + mixed-endian order (byte order 45670123) */ +/* #undef DOUBLE_IS_ARM_MIXED_ENDIAN_IEEE754 */ + +/* Define if C doubles are 64-bit IEEE 754 binary format, stored with the most + significant byte first */ +/* #undef DOUBLE_IS_BIG_ENDIAN_IEEE754 */ + +/* Define if C doubles are 64-bit IEEE 754 binary format, stored with the + least significant byte first */ +#define DOUBLE_IS_LITTLE_ENDIAN_IEEE754 1 + +/* Define if --enable-ipv6 is specified */ +#define ENABLE_IPV6 1 + +/* Define if flock needs to be linked with bsd library. */ +/* #undef FLOCK_NEEDS_LIBBSD */ + +/* Define if getpgrp() must be called as getpgrp(0). */ +/* #undef GETPGRP_HAVE_ARG */ + +/* Define if gettimeofday() does not have second (timezone) argument This is + the case on Motorola V4 (R40V4.2) */ +/* #undef GETTIMEOFDAY_NO_TZ */ + +/* Define to 1 if you have the `accept4' function. */ +#define HAVE_ACCEPT4 1 + +/* Define to 1 if you have the `acosh' function. */ +#define HAVE_ACOSH 1 + +/* struct addrinfo (netdb.h) */ +#define HAVE_ADDRINFO 1 + +/* Define to 1 if you have the `alarm' function. */ +#define HAVE_ALARM 1 + +/* Define if aligned memory access is required */ +/* #undef HAVE_ALIGNED_REQUIRED */ + +/* Define to 1 if you have the header file. */ +#define HAVE_ALLOCA_H 1 + +/* Define this if your time.h defines altzone. */ +/* #undef HAVE_ALTZONE */ + +/* Define to 1 if you have the `asinh' function. 
*/ +#define HAVE_ASINH 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_ASM_TYPES_H 1 + +/* Define to 1 if you have the `atanh' function. */ +#define HAVE_ATANH 1 + +/* Define to 1 if you have the `bind_textdomain_codeset' function. */ +#define HAVE_BIND_TEXTDOMAIN_CODESET 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_BLUETOOTH_BLUETOOTH_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_BLUETOOTH_H */ + +/* Define if mbstowcs(NULL, "text", 0) does not return the number of wide + chars that would be converted. */ +/* #undef HAVE_BROKEN_MBSTOWCS */ + +/* Define if nice() returns success/failure instead of the new priority. */ +/* #undef HAVE_BROKEN_NICE */ + +/* Define if the system reports an invalid PIPE_BUF value. */ +/* #undef HAVE_BROKEN_PIPE_BUF */ + +/* Define if poll() sets errno on invalid file descriptors. */ +/* #undef HAVE_BROKEN_POLL */ + +/* Define if the Posix semaphores do not work on your system */ +/* #undef HAVE_BROKEN_POSIX_SEMAPHORES */ + +/* Define if pthread_sigmask() does not work on your system. */ +/* #undef HAVE_BROKEN_PTHREAD_SIGMASK */ + +/* define to 1 if your sem_getvalue is broken. */ +/* #undef HAVE_BROKEN_SEM_GETVALUE */ + +/* Define if `unsetenv` does not return an int. */ +/* #undef HAVE_BROKEN_UNSETENV */ + +/* Has builtin atomics */ +#define HAVE_BUILTIN_ATOMIC 1 + +/* Define to 1 if you have the 'chflags' function. */ +/* #undef HAVE_CHFLAGS */ + +/* Define to 1 if you have the `chown' function. */ +#define HAVE_CHOWN 1 + +/* Define if you have the 'chroot' function. */ +#define HAVE_CHROOT 1 + +/* Define to 1 if you have the `clock' function. */ +#define HAVE_CLOCK 1 + +/* Define to 1 if you have the `clock_getres' function. */ +#define HAVE_CLOCK_GETRES 1 + +/* Define to 1 if you have the `clock_gettime' function. */ +#define HAVE_CLOCK_GETTIME 1 + +/* Define to 1 if you have the `clock_settime' function. */ +#define HAVE_CLOCK_SETTIME 1 + +/* Define if the C compiler supports computed gotos. */ +#define HAVE_COMPUTED_GOTOS 1 + +/* Define to 1 if you have the `confstr' function. */ +#define HAVE_CONFSTR 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_CONIO_H */ + +/* Define to 1 if you have the `copysign' function. */ +#define HAVE_COPYSIGN 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_CRYPT_H 1 + +/* Define to 1 if you have the `ctermid' function. */ +#define HAVE_CTERMID 1 + +/* Define if you have the 'ctermid_r' function. */ +/* #undef HAVE_CTERMID_R */ + +/* Define if you have the 'filter' function. */ +/* #undef HAVE_CURSES_FILTER */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_CURSES_H */ + +/* Define if you have the 'has_key' function. */ +/* #undef HAVE_CURSES_HAS_KEY */ + +/* Define if you have the 'immedok' function. */ +/* #undef HAVE_CURSES_IMMEDOK */ + +/* Define if you have the 'is_pad' function or macro. */ +/* #undef HAVE_CURSES_IS_PAD */ + +/* Define if you have the 'is_term_resized' function. */ +/* #undef HAVE_CURSES_IS_TERM_RESIZED */ + +/* Define if you have the 'resizeterm' function. */ +/* #undef HAVE_CURSES_RESIZETERM */ + +/* Define if you have the 'resize_term' function. */ +/* #undef HAVE_CURSES_RESIZE_TERM */ + +/* Define if you have the 'syncok' function. */ +/* #undef HAVE_CURSES_SYNCOK */ + +/* Define if you have the 'typeahead' function. */ +/* #undef HAVE_CURSES_TYPEAHEAD */ + +/* Define if you have the 'use_env' function. 
*/ +/* #undef HAVE_CURSES_USE_ENV */ + +/* Define if you have the 'wchgat' function. */ +/* #undef HAVE_CURSES_WCHGAT */ + +/* Define to 1 if you have the declaration of `isfinite', and to 0 if you + don't. */ +#define HAVE_DECL_ISFINITE 1 + +/* Define to 1 if you have the declaration of `isinf', and to 0 if you don't. + */ +#define HAVE_DECL_ISINF 1 + +/* Define to 1 if you have the declaration of `isnan', and to 0 if you don't. + */ +#define HAVE_DECL_ISNAN 1 + +/* Define to 1 if you have the declaration of `RTLD_DEEPBIND', and to 0 if you + don't. */ +#define HAVE_DECL_RTLD_DEEPBIND 1 + +/* Define to 1 if you have the declaration of `RTLD_GLOBAL', and to 0 if you + don't. */ +#define HAVE_DECL_RTLD_GLOBAL 1 + +/* Define to 1 if you have the declaration of `RTLD_LAZY', and to 0 if you + don't. */ +#define HAVE_DECL_RTLD_LAZY 1 + +/* Define to 1 if you have the declaration of `RTLD_LOCAL', and to 0 if you + don't. */ +#define HAVE_DECL_RTLD_LOCAL 1 + +/* Define to 1 if you have the declaration of `RTLD_NODELETE', and to 0 if you + don't. */ +#define HAVE_DECL_RTLD_NODELETE 1 + +/* Define to 1 if you have the declaration of `RTLD_NOLOAD', and to 0 if you + don't. */ +#define HAVE_DECL_RTLD_NOLOAD 1 + +/* Define to 1 if you have the declaration of `RTLD_NOW', and to 0 if you + don't. */ +#define HAVE_DECL_RTLD_NOW 1 + +/* Define to 1 if you have the declaration of `tzname', and to 0 if you don't. + */ +/* #undef HAVE_DECL_TZNAME */ + +/* Define to 1 if you have the device macros. */ +#define HAVE_DEVICE_MACROS 1 + +/* Define to 1 if you have the /dev/ptc device file. */ +/* #undef HAVE_DEV_PTC */ + +/* Define to 1 if you have the /dev/ptmx device file. */ +#define HAVE_DEV_PTMX 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_DIRECT_H */ + +/* Define to 1 if the dirent structure has a d_type field */ +#define HAVE_DIRENT_D_TYPE 1 + +/* Define to 1 if you have the header file, and it defines `DIR'. + */ +#define HAVE_DIRENT_H 1 + +/* Define if you have the 'dirfd' function or macro. */ +#define HAVE_DIRFD 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_DLFCN_H 1 + +/* Define to 1 if you have the `dlopen' function. */ +#define HAVE_DLOPEN 1 + +/* Define to 1 if you have the `dup2' function. */ +#define HAVE_DUP2 1 + +/* Define to 1 if you have the `dup3' function. */ +#define HAVE_DUP3 1 + +/* Defined when any dynamic module loading is enabled. */ +#define HAVE_DYNAMIC_LOADING 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_ENDIAN_H 1 + +/* Define if you have the 'epoll' functions. */ +#define HAVE_EPOLL 1 + +/* Define if you have the 'epoll_create1' function. */ +#define HAVE_EPOLL_CREATE1 1 + +/* Define to 1 if you have the `erf' function. */ +#define HAVE_ERF 1 + +/* Define to 1 if you have the `erfc' function. */ +#define HAVE_ERFC 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_ERRNO_H 1 + +/* Define to 1 if you have the `execv' function. */ +#define HAVE_EXECV 1 + +/* Define to 1 if you have the `expm1' function. */ +#define HAVE_EXPM1 1 + +/* Define to 1 if you have the `faccessat' function. */ +#define HAVE_FACCESSAT 1 + +/* Define if you have the 'fchdir' function. */ +#define HAVE_FCHDIR 1 + +/* Define to 1 if you have the `fchmod' function. */ +#define HAVE_FCHMOD 1 + +/* Define to 1 if you have the `fchmodat' function. */ +#define HAVE_FCHMODAT 1 + +/* Define to 1 if you have the `fchown' function. */ +#define HAVE_FCHOWN 1 + +/* Define to 1 if you have the `fchownat' function. 
*/ +#define HAVE_FCHOWNAT 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_FCNTL_H 1 + +/* Define if you have the 'fdatasync' function. */ +#define HAVE_FDATASYNC 1 + +/* Define to 1 if you have the `fdopendir' function. */ +#define HAVE_FDOPENDIR 1 + +/* Define to 1 if you have the `fexecve' function. */ +#define HAVE_FEXECVE 1 + +/* Define to 1 if you have the `finite' function. */ +#define HAVE_FINITE 1 + +/* Define to 1 if you have the `flock' function. */ +#define HAVE_FLOCK 1 + +/* Define to 1 if you have the `fork' function. */ +#define HAVE_FORK 1 + +/* Define to 1 if you have the `forkpty' function. */ +#define HAVE_FORKPTY 1 + +/* Define to 1 if you have the `fpathconf' function. */ +#define HAVE_FPATHCONF 1 + +/* Define to 1 if you have the `fseek64' function. */ +/* #undef HAVE_FSEEK64 */ + +/* Define to 1 if you have the `fseeko' function. */ +#define HAVE_FSEEKO 1 + +/* Define to 1 if you have the `fstatat' function. */ +#define HAVE_FSTATAT 1 + +/* Define to 1 if you have the `fstatvfs' function. */ +#define HAVE_FSTATVFS 1 + +/* Define if you have the 'fsync' function. */ +#define HAVE_FSYNC 1 + +/* Define to 1 if you have the `ftell64' function. */ +/* #undef HAVE_FTELL64 */ + +/* Define to 1 if you have the `ftello' function. */ +#define HAVE_FTELLO 1 + +/* Define to 1 if you have the `ftime' function. */ +#define HAVE_FTIME 1 + +/* Define to 1 if you have the `ftruncate' function. */ +#define HAVE_FTRUNCATE 1 + +/* Define to 1 if you have the `futimens' function. */ +#define HAVE_FUTIMENS 1 + +/* Define to 1 if you have the `futimes' function. */ +#define HAVE_FUTIMES 1 + +/* Define to 1 if you have the `futimesat' function. */ +#define HAVE_FUTIMESAT 1 + +/* Define to 1 if you have the `gai_strerror' function. */ +#define HAVE_GAI_STRERROR 1 + +/* Define to 1 if you have the `gamma' function. */ +#define HAVE_GAMMA 1 + +/* Define if we can use gcc inline assembler to get and set mc68881 fpcr */ +/* #undef HAVE_GCC_ASM_FOR_MC68881 */ + +/* Define if we can use x64 gcc inline assembler */ +/* #undef HAVE_GCC_ASM_FOR_X64 */ + +/* Define if we can use gcc inline assembler to get and set x87 control word + */ +/* #undef HAVE_GCC_ASM_FOR_X87 */ + +/* Define if your compiler provides __uint128_t */ +#define HAVE_GCC_UINT128_T 1 + +/* Define if you have the getaddrinfo function. */ +#define HAVE_GETADDRINFO 1 + +/* Define this if you have flockfile(), getc_unlocked(), and funlockfile() */ +#define HAVE_GETC_UNLOCKED 1 + +/* Define to 1 if you have the `getentropy' function. */ +/* #undef HAVE_GETENTROPY */ + +/* Define to 1 if you have the `getgrouplist' function. */ +#define HAVE_GETGROUPLIST 1 + +/* Define to 1 if you have the `getgroups' function. */ +#define HAVE_GETGROUPS 1 + +/* Define to 1 if you have the `gethostbyname' function. */ +/* #undef HAVE_GETHOSTBYNAME */ + +/* Define this if you have some version of gethostbyname_r() */ +#define HAVE_GETHOSTBYNAME_R 1 + +/* Define this if you have the 3-arg version of gethostbyname_r(). */ +/* #undef HAVE_GETHOSTBYNAME_R_3_ARG */ + +/* Define this if you have the 5-arg version of gethostbyname_r(). */ +/* #undef HAVE_GETHOSTBYNAME_R_5_ARG */ + +/* Define this if you have the 6-arg version of gethostbyname_r(). */ +#define HAVE_GETHOSTBYNAME_R_6_ARG 1 + +/* Define to 1 if you have the `getitimer' function. */ +#define HAVE_GETITIMER 1 + +/* Define to 1 if you have the `getloadavg' function. */ +#define HAVE_GETLOADAVG 1 + +/* Define to 1 if you have the `getlogin' function. 
*/ +#define HAVE_GETLOGIN 1 + +/* Define to 1 if you have the `getnameinfo' function. */ +#define HAVE_GETNAMEINFO 1 + +/* Define if you have the 'getpagesize' function. */ +#define HAVE_GETPAGESIZE 1 + +/* Define to 1 if you have the `getpeername' function. */ +#define HAVE_GETPEERNAME 1 + +/* Define to 1 if you have the `getpgid' function. */ +#define HAVE_GETPGID 1 + +/* Define to 1 if you have the `getpgrp' function. */ +#define HAVE_GETPGRP 1 + +/* Define to 1 if you have the `getpid' function. */ +#define HAVE_GETPID 1 + +/* Define to 1 if you have the `getpriority' function. */ +#define HAVE_GETPRIORITY 1 + +/* Define to 1 if you have the `getpwent' function. */ +#define HAVE_GETPWENT 1 + +/* Define to 1 if the getrandom() function is available */ +/* #undef HAVE_GETRANDOM */ + +/* Define to 1 if the Linux getrandom() syscall is available */ +#define HAVE_GETRANDOM_SYSCALL 1 + +/* Define to 1 if you have the `getresgid' function. */ +#define HAVE_GETRESGID 1 + +/* Define to 1 if you have the `getresuid' function. */ +#define HAVE_GETRESUID 1 + +/* Define to 1 if you have the `getsid' function. */ +#define HAVE_GETSID 1 + +/* Define to 1 if you have the `getspent' function. */ +#define HAVE_GETSPENT 1 + +/* Define to 1 if you have the `getspnam' function. */ +#define HAVE_GETSPNAM 1 + +/* Define to 1 if you have the `gettimeofday' function. */ +#define HAVE_GETTIMEOFDAY 1 + +/* Define to 1 if you have the `getwd' function. */ +#define HAVE_GETWD 1 + +/* Define if glibc has incorrect _FORTIFY_SOURCE wrappers for memmove and + bcopy. */ +/* #undef HAVE_GLIBC_MEMMOVE_BUG */ + +/* Define to 1 if you have the header file. */ +#define HAVE_GRP_H 1 + +/* Define if you have the 'hstrerror' function. */ +#define HAVE_HSTRERROR 1 + +/* Define this if you have le64toh() */ +#define HAVE_HTOLE64 1 + +/* Define to 1 if you have the `hypot' function. */ +#define HAVE_HYPOT 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_IEEEFP_H */ + +/* Define to 1 if you have the 'if_nameindex' function. */ +#define HAVE_IF_NAMEINDEX 1 + +/* Define if you have the 'inet_aton' function. */ +#define HAVE_INET_ATON 1 + +/* Define if you have the 'inet_pton' function. */ +#define HAVE_INET_PTON 1 + +/* Define to 1 if you have the `initgroups' function. */ +#define HAVE_INITGROUPS 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_IO_H */ + +/* Define if gcc has the ipa-pure-const bug. */ +/* #undef HAVE_IPA_PURE_CONST_BUG */ + +/* Define to 1 if you have the `kill' function. */ +#define HAVE_KILL 1 + +/* Define to 1 if you have the `killpg' function. */ +#define HAVE_KILLPG 1 + +/* Define if you have the 'kqueue' functions. */ +/* #undef HAVE_KQUEUE */ + +/* Define to 1 if you have the header file. */ +#define HAVE_LANGINFO_H 1 + +/* Defined to enable large file support when an off_t is bigger than a long + and long long is available and at least as big as an off_t. You may need to + add some flags for configuration and compilation to enable this mode. (For + Solaris and Linux, the necessary defines are already defined.) */ +/* #undef HAVE_LARGEFILE_SUPPORT */ + +/* Define to 1 if you have the 'lchflags' function. */ +/* #undef HAVE_LCHFLAGS */ + +/* Define to 1 if you have the `lchmod' function. */ +/* #undef HAVE_LCHMOD */ + +/* Define to 1 if you have the `lchown' function. */ +#define HAVE_LCHOWN 1 + +/* Define to 1 if you have the `lgamma' function. 
*/ +#define HAVE_LGAMMA 1 + +/* Define to 1 if you have the `dl' library (-ldl). */ +#define HAVE_LIBDL 1 + +/* Define to 1 if you have the `dld' library (-ldld). */ +/* #undef HAVE_LIBDLD */ + +/* Define to 1 if you have the `ieee' library (-lieee). */ +/* #undef HAVE_LIBIEEE */ + +/* Define to 1 if you have the header file. */ +#define HAVE_LIBINTL_H 1 + +/* Define if you have the readline library (-lreadline). */ +/* #undef HAVE_LIBREADLINE */ + +/* Define to 1 if you have the `resolv' library (-lresolv). */ +/* #undef HAVE_LIBRESOLV */ + +/* Define to 1 if you have the `sendfile' library (-lsendfile). */ +/* #undef HAVE_LIBSENDFILE */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_LIBUTIL_H */ + +/* Define if you have the 'link' function. */ +#define HAVE_LINK 1 + +/* Define to 1 if you have the `linkat' function. */ +#define HAVE_LINKAT 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_LINUX_CAN_BCM_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_LINUX_CAN_H 1 + +/* Define if compiling using Linux 3.6 or later. */ +#define HAVE_LINUX_CAN_RAW_FD_FRAMES 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_LINUX_CAN_RAW_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_LINUX_NETLINK_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_LINUX_RANDOM_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_LINUX_TIPC_H 1 + +/* Define to 1 if you have the 'lockf' function and the F_LOCK macro. */ +#define HAVE_LOCKF 1 + +/* Define to 1 if you have the `log1p' function. */ +#define HAVE_LOG1P 1 + +/* Define to 1 if you have the `log2' function. */ +#define HAVE_LOG2 1 + +/* Define this if you have the type long double. */ +#define HAVE_LONG_DOUBLE 1 + +/* Define to 1 if you have the `lstat' function. */ +#define HAVE_LSTAT 1 + +/* Define to 1 if you have the `lutimes' function. */ +#define HAVE_LUTIMES 1 + +/* Define this if you have the makedev macro. */ +#define HAVE_MAKEDEV 1 + +/* Define to 1 if you have the `mbrtowc' function. */ +#define HAVE_MBRTOWC 1 + +/* Define to 1 if you have the `memmove' function. */ +#define HAVE_MEMMOVE 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the `memrchr' function. */ +#define HAVE_MEMRCHR 1 + +/* Define to 1 if you have the `mkdirat' function. */ +#define HAVE_MKDIRAT 1 + +/* Define to 1 if you have the `mkfifo' function. */ +#define HAVE_MKFIFO 1 + +/* Define to 1 if you have the `mkfifoat' function. */ +#define HAVE_MKFIFOAT 1 + +/* Define to 1 if you have the `mknod' function. */ +#define HAVE_MKNOD 1 + +/* Define to 1 if you have the `mknodat' function. */ +#define HAVE_MKNODAT 1 + +/* Define to 1 if you have the `mktime' function. */ +#define HAVE_MKTIME 1 + +/* Define to 1 if you have the `mmap' function. */ +#define HAVE_MMAP 1 + +/* Define to 1 if you have the `mremap' function. */ +#define HAVE_MREMAP 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_NCURSES_H */ + +/* Define to 1 if you have the header file, and it defines `DIR'. */ +/* #undef HAVE_NDIR_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_NETPACKET_PACKET_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_NET_IF_H 1 + +/* Define to 1 if you have the `nice' function. */ +#define HAVE_NICE 1 + +/* Define to 1 if you have the `openat' function. */ +#define HAVE_OPENAT 1 + +/* Define to 1 if you have the `openpty' function. 
*/ +#define HAVE_OPENPTY 1 + +/* Define to 1 if you have the `pathconf' function. */ +#define HAVE_PATHCONF 1 + +/* Define to 1 if you have the `pause' function. */ +#define HAVE_PAUSE 1 + +/* Define to 1 if you have the `pipe2' function. */ +#define HAVE_PIPE2 1 + +/* Define to 1 if you have the `plock' function. */ +/* #undef HAVE_PLOCK */ + +/* Define to 1 if you have the `poll' function. */ +#define HAVE_POLL 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_POLL_H 1 + +/* Define to 1 if you have the `posix_fadvise' function. */ +#define HAVE_POSIX_FADVISE 1 + +/* Define to 1 if you have the `posix_fallocate' function. */ +#define HAVE_POSIX_FALLOCATE 1 + +/* Define to 1 if you have the `pread' function. */ +#define HAVE_PREAD 1 + +/* Define if you have the 'prlimit' functions. */ +#define HAVE_PRLIMIT 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_PROCESS_H */ + +/* Define if your compiler supports function prototype */ +#define HAVE_PROTOTYPES 1 + +/* Define to 1 if you have the `pthread_atfork' function. */ +#define HAVE_PTHREAD_ATFORK 1 + +/* Defined for Solaris 2.6 bug in pthread header. */ +/* #undef HAVE_PTHREAD_DESTRUCTOR */ + +/* Define to 1 if you have the header file. */ +#define HAVE_PTHREAD_H 1 + +/* Define to 1 if you have the `pthread_init' function. */ +/* #undef HAVE_PTHREAD_INIT */ + +/* Define to 1 if you have the `pthread_kill' function. */ +#define HAVE_PTHREAD_KILL 1 + +/* Define to 1 if you have the `pthread_sigmask' function. */ +#define HAVE_PTHREAD_SIGMASK 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_PTY_H 1 + +/* Define to 1 if you have the `putenv' function. */ +#define HAVE_PUTENV 1 + +/* Define to 1 if you have the `pwrite' function. */ +#define HAVE_PWRITE 1 + +/* Define to 1 if you have the `readlink' function. */ +#define HAVE_READLINK 1 + +/* Define to 1 if you have the `readlinkat' function. */ +#define HAVE_READLINKAT 1 + +/* Define to 1 if you have the `readv' function. */ +#define HAVE_READV 1 + +/* Define to 1 if you have the `realpath' function. */ +#define HAVE_REALPATH 1 + +/* Define to 1 if you have the `renameat' function. */ +#define HAVE_RENAMEAT 1 + +/* Define if readline supports append_history */ +/* #undef HAVE_RL_APPEND_HISTORY */ + +/* Define if you have readline 2.1 */ +/* #undef HAVE_RL_CALLBACK */ + +/* Define if you can turn off readline's signal handling. */ +/* #undef HAVE_RL_CATCH_SIGNAL */ + +/* Define if you have readline 2.2 */ +/* #undef HAVE_RL_COMPLETION_APPEND_CHARACTER */ + +/* Define if you have readline 4.0 */ +/* #undef HAVE_RL_COMPLETION_DISPLAY_MATCHES_HOOK */ + +/* Define if you have readline 4.2 */ +/* #undef HAVE_RL_COMPLETION_MATCHES */ + +/* Define if you have rl_completion_suppress_append */ +/* #undef HAVE_RL_COMPLETION_SUPPRESS_APPEND */ + +/* Define if you have readline 4.0 */ +/* #undef HAVE_RL_PRE_INPUT_HOOK */ + +/* Define if you have readline 4.0 */ +/* #undef HAVE_RL_RESIZE_TERMINAL */ + +/* Define to 1 if you have the `round' function. */ +#define HAVE_ROUND 1 + +/* Define to 1 if you have the `sched_get_priority_max' function. */ +#define HAVE_SCHED_GET_PRIORITY_MAX 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SCHED_H 1 + +/* Define to 1 if you have the `sched_rr_get_interval' function. */ +#define HAVE_SCHED_RR_GET_INTERVAL 1 + +/* Define to 1 if you have the `sched_setaffinity' function. */ +#define HAVE_SCHED_SETAFFINITY 1 + +/* Define to 1 if you have the `sched_setparam' function. 
*/ +#define HAVE_SCHED_SETPARAM 1 + +/* Define to 1 if you have the `sched_setscheduler' function. */ +#define HAVE_SCHED_SETSCHEDULER 1 + +/* Define to 1 if you have the `select' function. */ +#define HAVE_SELECT 1 + +/* Define to 1 if you have the `sem_getvalue' function. */ +#define HAVE_SEM_GETVALUE 1 + +/* Define to 1 if you have the `sem_open' function. */ +#define HAVE_SEM_OPEN 1 + +/* Define to 1 if you have the `sem_timedwait' function. */ +#define HAVE_SEM_TIMEDWAIT 1 + +/* Define to 1 if you have the `sem_unlink' function. */ +#define HAVE_SEM_UNLINK 1 + +/* Define to 1 if you have the `sendfile' function. */ +#define HAVE_SENDFILE 1 + +/* Define to 1 if you have the `setegid' function. */ +#define HAVE_SETEGID 1 + +/* Define to 1 if you have the `seteuid' function. */ +#define HAVE_SETEUID 1 + +/* Define to 1 if you have the `setgid' function. */ +#define HAVE_SETGID 1 + +/* Define if you have the 'setgroups' function. */ +#define HAVE_SETGROUPS 1 + +/* Define to 1 if you have the `sethostname' function. */ +#define HAVE_SETHOSTNAME 1 + +/* Define to 1 if you have the `setitimer' function. */ +#define HAVE_SETITIMER 1 + +/* Define to 1 if you have the `setlocale' function. */ +#define HAVE_SETLOCALE 1 + +/* Define to 1 if you have the `setpgid' function. */ +#define HAVE_SETPGID 1 + +/* Define to 1 if you have the `setpgrp' function. */ +#define HAVE_SETPGRP 1 + +/* Define to 1 if you have the `setpriority' function. */ +#define HAVE_SETPRIORITY 1 + +/* Define to 1 if you have the `setregid' function. */ +#define HAVE_SETREGID 1 + +/* Define to 1 if you have the `setresgid' function. */ +#define HAVE_SETRESGID 1 + +/* Define to 1 if you have the `setresuid' function. */ +#define HAVE_SETRESUID 1 + +/* Define to 1 if you have the `setreuid' function. */ +#define HAVE_SETREUID 1 + +/* Define to 1 if you have the `setsid' function. */ +#define HAVE_SETSID 1 + +/* Define to 1 if you have the `setuid' function. */ +#define HAVE_SETUID 1 + +/* Define to 1 if you have the `setvbuf' function. */ +#define HAVE_SETVBUF 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SHADOW_H 1 + +/* Define to 1 if you have the `sigaction' function. */ +#define HAVE_SIGACTION 1 + +/* Define to 1 if you have the `sigaltstack' function. */ +#define HAVE_SIGALTSTACK 1 + +/* Define to 1 if you have the `siginterrupt' function. */ +#define HAVE_SIGINTERRUPT 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SIGNAL_H 1 + +/* Define to 1 if you have the `sigpending' function. */ +#define HAVE_SIGPENDING 1 + +/* Define to 1 if you have the `sigrelse' function. */ +#define HAVE_SIGRELSE 1 + +/* Define to 1 if you have the `sigtimedwait' function. */ +#define HAVE_SIGTIMEDWAIT 1 + +/* Define to 1 if you have the `sigwait' function. */ +#define HAVE_SIGWAIT 1 + +/* Define to 1 if you have the `sigwaitinfo' function. */ +#define HAVE_SIGWAITINFO 1 + +/* Define to 1 if you have the `snprintf' function. */ +#define HAVE_SNPRINTF 1 + +/* struct sockaddr_alg (linux/if_alg.h) */ +#define HAVE_SOCKADDR_ALG 1 + +/* Define if sockaddr has sa_len member */ +/* #undef HAVE_SOCKADDR_SA_LEN */ + +/* struct sockaddr_storage (sys/socket.h) */ +#define HAVE_SOCKADDR_STORAGE 1 + +/* Define if you have the 'socketpair' function. */ +#define HAVE_SOCKETPAIR 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SPAWN_H 1 + +/* Define if your compiler provides ssize_t */ +#define HAVE_SSIZE_T 1 + +/* Define to 1 if you have the `statvfs' function. 
*/ +#define HAVE_STATVFS 1 + +/* Define if you have struct stat.st_mtim.tv_nsec */ +#define HAVE_STAT_TV_NSEC 1 + +/* Define if you have struct stat.st_mtimensec */ +/* #undef HAVE_STAT_TV_NSEC2 */ + +/* Define if your compiler supports variable length function prototypes (e.g. + void fprintf(FILE *, char *, ...);) *and* */ +#define HAVE_STDARG_PROTOTYPES 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Has stdatomic.h with atomic_int */ +#define HAVE_STD_ATOMIC 1 + +/* Define to 1 if you have the `strdup' function. */ +#define HAVE_STRDUP 1 + +/* Define to 1 if you have the `strftime' function. */ +#define HAVE_STRFTIME 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the `strlcpy' function. */ +/* #undef HAVE_STRLCPY */ + +/* Define to 1 if you have the header file. */ +#define HAVE_STROPTS_H 1 + +/* Define to 1 if `pw_gecos' is a member of `struct passwd'. */ +#define HAVE_STRUCT_PASSWD_PW_GECOS 1 + +/* Define to 1 if `pw_passwd' is a member of `struct passwd'. */ +#define HAVE_STRUCT_PASSWD_PW_PASSWD 1 + +/* Define to 1 if `st_birthtime' is a member of `struct stat'. */ +/* #undef HAVE_STRUCT_STAT_ST_BIRTHTIME */ + +/* Define to 1 if `st_blksize' is a member of `struct stat'. */ +#define HAVE_STRUCT_STAT_ST_BLKSIZE 1 + +/* Define to 1 if `st_blocks' is a member of `struct stat'. */ +#define HAVE_STRUCT_STAT_ST_BLOCKS 1 + +/* Define to 1 if `st_flags' is a member of `struct stat'. */ +/* #undef HAVE_STRUCT_STAT_ST_FLAGS */ + +/* Define to 1 if `st_gen' is a member of `struct stat'. */ +/* #undef HAVE_STRUCT_STAT_ST_GEN */ + +/* Define to 1 if `st_rdev' is a member of `struct stat'. */ +#define HAVE_STRUCT_STAT_ST_RDEV 1 + +/* Define to 1 if `tm_zone' is a member of `struct tm'. */ +#define HAVE_STRUCT_TM_TM_ZONE 1 + +/* Define if you have the 'symlink' function. */ +#define HAVE_SYMLINK 1 + +/* Define to 1 if you have the `symlinkat' function. */ +#define HAVE_SYMLINKAT 1 + +/* Define to 1 if you have the `sync' function. */ +#define HAVE_SYNC 1 + +/* Define to 1 if you have the `sysconf' function. */ +#define HAVE_SYSCONF 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYSEXITS_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_AUDIOIO_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_BSDTTY_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_DEVPOLL_H */ + +/* Define to 1 if you have the header file, and it defines `DIR'. + */ +/* #undef HAVE_SYS_DIR_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_ENDIAN_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_EPOLL_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_EVENT_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_FILE_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_IOCTL_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_KERN_CONTROL_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_LOADAVG_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_LOCK_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_MKDEV_H */ + +/* Define to 1 if you have the header file. 
*/ +/* #undef HAVE_SYS_MODEM_H */ + +/* Define to 1 if you have the header file, and it defines `DIR'. + */ +/* #undef HAVE_SYS_NDIR_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_PARAM_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_POLL_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_RANDOM_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_RESOURCE_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SELECT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SENDFILE_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SOCKET_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STATVFS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SYSCALL_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SYSMACROS_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SYS_DOMAIN_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_TERMIO_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIMES_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIME_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_UIO_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_UN_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_UTSNAME_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_WAIT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_XATTR_H 1 + +/* Define to 1 if you have the `tcgetpgrp' function. */ +#define HAVE_TCGETPGRP 1 + +/* Define to 1 if you have the `tcsetpgrp' function. */ +#define HAVE_TCSETPGRP 1 + +/* Define to 1 if you have the `tempnam' function. */ +#define HAVE_TEMPNAM 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_TERMIOS_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_TERM_H */ + +/* Define to 1 if you have the `tgamma' function. */ +#define HAVE_TGAMMA 1 + +/* Define to 1 if you have the `timegm' function. */ +#define HAVE_TIMEGM 1 + +/* Define to 1 if you have the `times' function. */ +#define HAVE_TIMES 1 + +/* Define to 1 if you have the `tmpfile' function. */ +#define HAVE_TMPFILE 1 + +/* Define to 1 if you have the `tmpnam' function. */ +#define HAVE_TMPNAM 1 + +/* Define to 1 if you have the `tmpnam_r' function. */ +#define HAVE_TMPNAM_R 1 + +/* Define to 1 if your `struct tm' has `tm_zone'. Deprecated, use + `HAVE_STRUCT_TM_TM_ZONE' instead. */ +#define HAVE_TM_ZONE 1 + +/* Define to 1 if you have the `truncate' function. */ +#define HAVE_TRUNCATE 1 + +/* Define to 1 if you don't have `tm_zone' but do have the external array + `tzname'. */ +/* #undef HAVE_TZNAME */ + +/* Define this if you have tcl and TCL_UTF_MAX==6 */ +/* #undef HAVE_UCS4_TCL */ + +/* Define to 1 if you have the `uname' function. */ +#define HAVE_UNAME 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UNISTD_H 1 + +/* Define to 1 if you have the `unlinkat' function. */ +#define HAVE_UNLINKAT 1 + +/* Define to 1 if you have the `unsetenv' function. 
*/ +#define HAVE_UNSETENV 1 + +/* Define if you have a useable wchar_t type defined in wchar.h; useable means + wchar_t must be an unsigned type with at least 16 bits. (see + Include/unicodeobject.h). */ +#define HAVE_USABLE_WCHAR_T 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_UTIL_H */ + +/* Define to 1 if you have the `utimensat' function. */ +#define HAVE_UTIMENSAT 1 + +/* Define to 1 if you have the `utimes' function. */ +#define HAVE_UTIMES 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UTIME_H 1 + +/* Define to 1 if you have the `wait3' function. */ +#define HAVE_WAIT3 1 + +/* Define to 1 if you have the `wait4' function. */ +#define HAVE_WAIT4 1 + +/* Define to 1 if you have the `waitid' function. */ +#define HAVE_WAITID 1 + +/* Define to 1 if you have the `waitpid' function. */ +#define HAVE_WAITPID 1 + +/* Define if the compiler provides a wchar.h header file. */ +#define HAVE_WCHAR_H 1 + +/* Define to 1 if you have the `wcscoll' function. */ +#define HAVE_WCSCOLL 1 + +/* Define to 1 if you have the `wcsftime' function. */ +#define HAVE_WCSFTIME 1 + +/* Define to 1 if you have the `wcsxfrm' function. */ +#define HAVE_WCSXFRM 1 + +/* Define to 1 if you have the `wmemcmp' function. */ +#define HAVE_WMEMCMP 1 + +/* Define if tzset() actually switches the local timezone in a meaningful way. + */ +#define HAVE_WORKING_TZSET 1 + +/* Define to 1 if you have the `writev' function. */ +#define HAVE_WRITEV 1 + +/* Define if the zlib library has inflateCopy */ +#define HAVE_ZLIB_COPY 1 + +/* Define to 1 if you have the `_getpty' function. */ +/* #undef HAVE__GETPTY */ + +/* Define if log1p(-0.) is 0. rather than -0. */ +/* #undef LOG1P_DROPS_ZERO_SIGN */ + +/* Define to 1 if `major', `minor', and `makedev' are declared in . + */ +/* #undef MAJOR_IN_MKDEV */ + +/* Define to 1 if `major', `minor', and `makedev' are declared in + . */ +/* #undef MAJOR_IN_SYSMACROS */ + +/* Define if mvwdelch in curses.h is an expression. */ +/* #undef MVWDELCH_IS_EXPRESSION */ + +/* Define to the address where bug reports for this package should be sent. */ +/* #undef PACKAGE_BUGREPORT */ + +/* Define to the full name of this package. */ +/* #undef PACKAGE_NAME */ + +/* Define to the full name and version of this package. */ +/* #undef PACKAGE_STRING */ + +/* Define to the one symbol short name of this package. */ +/* #undef PACKAGE_TARNAME */ + +/* Define to the home page for this package. */ +/* #undef PACKAGE_URL */ + +/* Define to the version of this package. */ +/* #undef PACKAGE_VERSION */ + +/* Define if POSIX semaphores aren't enabled on your system */ +/* #undef POSIX_SEMAPHORES_NOT_ENABLED */ + +/* Defined if PTHREAD_SCOPE_SYSTEM supported. */ +#define PTHREAD_SYSTEM_SCHED_SUPPORTED 1 + +/* Define as the preferred size in bits of long digits */ +/* #undef PYLONG_BITS_IN_DIGIT */ + +/* Define to printf format modifier for Py_ssize_t */ +#define PY_FORMAT_SIZE_T "z" + +/* Define if you want to build an interpreter with many run-time checks. */ +/* #undef Py_DEBUG */ + +/* Defined if Python is built as a shared library. */ +#define Py_ENABLE_SHARED 1 + +/* Define hash algorithm for str, bytes and memoryview. SipHash24: 1, FNV: 2, + externally defined: 0 */ +/* #undef Py_HASH_ALGORITHM */ + +/* assume C89 semantics that RETSIGTYPE is always void */ +#define RETSIGTYPE void + +/* Define if setpgrp() must be called as setpgrp(0, 0). 
*/ +/* #undef SETPGRP_HAVE_ARG */ + +/* Define if i>>j for signed int i does not extend the sign bit when i < 0 */ +/* #undef SIGNED_RIGHT_SHIFT_ZERO_FILLS */ + +/* The size of `double', as computed by sizeof. */ +#define SIZEOF_DOUBLE 8 + +/* The size of `float', as computed by sizeof. */ +#define SIZEOF_FLOAT 4 + +/* The size of `fpos_t', as computed by sizeof. */ +#define SIZEOF_FPOS_T 16 + +/* The size of `int', as computed by sizeof. */ +#define SIZEOF_INT 4 + +/* The size of `long', as computed by sizeof. */ +#define SIZEOF_LONG 8 + +/* The size of `long double', as computed by sizeof. */ +#define SIZEOF_LONG_DOUBLE 16 + +/* The size of `long long', as computed by sizeof. */ +#define SIZEOF_LONG_LONG 8 + +/* The size of `off_t', as computed by sizeof. */ +#define SIZEOF_OFF_T 8 + +/* The size of `pid_t', as computed by sizeof. */ +#define SIZEOF_PID_T 4 + +/* The size of `pthread_t', as computed by sizeof. */ +#define SIZEOF_PTHREAD_T 8 + +/* The size of `short', as computed by sizeof. */ +#define SIZEOF_SHORT 2 + +/* The size of `size_t', as computed by sizeof. */ +#define SIZEOF_SIZE_T 8 + +/* The size of `time_t', as computed by sizeof. */ +#define SIZEOF_TIME_T 8 + +/* The size of `uintptr_t', as computed by sizeof. */ +#define SIZEOF_UINTPTR_T 8 + +/* The size of `void *', as computed by sizeof. */ +#define SIZEOF_VOID_P 8 + +/* The size of `wchar_t', as computed by sizeof. */ +#define SIZEOF_WCHAR_T 4 + +/* The size of `_Bool', as computed by sizeof. */ +#define SIZEOF__BOOL 1 + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Define if you can safely include both and + (which you can't on SCO ODT 3.0). */ +#define SYS_SELECT_WITH_SYS_TIME 1 + +/* Define if tanh(-0.) is -0., or if platform doesn't have signed zeros */ +#define TANH_PRESERVES_ZERO_SIGN 1 + +/* Library needed by timemodule.c: librt may be needed for clock_gettime() */ +/* #undef TIMEMODULE_LIB */ + +/* Define to 1 if you can safely include both and . */ +#define TIME_WITH_SYS_TIME 1 + +/* Define to 1 if your declares `struct tm'. */ +/* #undef TM_IN_SYS_TIME */ + +/* Define if you want to use computed gotos in ceval.c. */ +/* #undef USE_COMPUTED_GOTOS */ + +/* Define to use the C99 inline keyword. */ +#define USE_INLINE 1 + +/* Enable extensions on AIX 3, Interix. */ +#ifndef _ALL_SOURCE +# define _ALL_SOURCE 1 +#endif +/* Enable GNU extensions on systems that have them. */ +#ifndef _GNU_SOURCE +# define _GNU_SOURCE 1 +#endif +/* Enable threading extensions on Solaris. */ +#ifndef _POSIX_PTHREAD_SEMANTICS +# define _POSIX_PTHREAD_SEMANTICS 1 +#endif +/* Enable extensions on HP NonStop. */ +#ifndef _TANDEM_SOURCE +# define _TANDEM_SOURCE 1 +#endif +/* Enable general extensions on Solaris. */ +#ifndef __EXTENSIONS__ +# define __EXTENSIONS__ 1 +#endif + + +/* Define if you want SIGFPE handled (see Include/pyfpe.h). */ +/* #undef WANT_SIGFPE_HANDLER */ + +/* Define if WINDOW in curses.h offers a field _flags. */ +/* #undef WINDOW_HAS_FLAGS */ + +/* Define if you want documentation strings in extension modules */ +#define WITH_DOC_STRINGS 1 + +/* Define if you want to compile in DTrace support */ +/* #undef WITH_DTRACE */ + +/* Define if you want to use the new-style (Openstep, Rhapsody, MacOS) dynamic + linker (dyld) instead of the old-style (NextStep) dynamic linker (rld). + Dyld is necessary to support frameworks. */ +/* #undef WITH_DYLD */ + +/* Define to 1 if libintl is needed for locale functions. 
*/ +/* #undef WITH_LIBINTL */ + +/* Define if you want to produce an OpenStep/Rhapsody framework (shared + library plus accessory files). */ +/* #undef WITH_NEXT_FRAMEWORK */ + +/* Define if you want to compile in Python-specific mallocs */ +#define WITH_PYMALLOC 1 + +/* Define if you want to compile in rudimentary thread support */ +#define WITH_THREAD 1 + +/* Define if you want pymalloc to be disabled when running under valgrind */ +/* #undef WITH_VALGRIND */ + +/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most + significant byte first (like Motorola and SPARC, unlike Intel). */ +#if defined AC_APPLE_UNIVERSAL_BUILD +# if defined __BIG_ENDIAN__ +# define WORDS_BIGENDIAN 1 +# endif +#else +# ifndef WORDS_BIGENDIAN +/* # undef WORDS_BIGENDIAN */ +# endif +#endif + +/* Define if arithmetic is subject to x87-style double rounding issue */ +/* #undef X87_DOUBLE_ROUNDING */ + +/* Define on OpenBSD to activate all library features */ +/* #undef _BSD_SOURCE */ + +/* Define on Irix to enable u_int */ +#define _BSD_TYPES 1 + +/* Define on Darwin to activate all library features */ +#define _DARWIN_C_SOURCE 1 + +/* This must be set to 64 on some systems to enable large file support. */ +#define _FILE_OFFSET_BITS 64 + +/* Define on Linux to activate all library features */ +#define _GNU_SOURCE 1 + +/* Define to include mbstate_t for mbrtowc */ +/* #undef _INCLUDE__STDC_A1_SOURCE */ + +/* This must be defined on some systems to enable large file support. */ +#define _LARGEFILE_SOURCE 1 + +/* This must be defined on AIX systems to enable large file support. */ +/* #undef _LARGE_FILES */ + +/* Define to 1 if on MINIX. */ +/* #undef _MINIX */ + +/* Define on NetBSD to activate all library features */ +#define _NETBSD_SOURCE 1 + +/* Define to 2 if the system does not provide POSIX.1 features except with + this defined. */ +/* #undef _POSIX_1_SOURCE */ + +/* Define to activate features from IEEE Stds 1003.1-2008 */ +#define _POSIX_C_SOURCE 200809L + +/* Define to 1 if you need to in order for `stat' and other things to work. */ +/* #undef _POSIX_SOURCE */ + +/* Define if you have POSIX threads, and your system does not define that. */ +/* #undef _POSIX_THREADS */ + +/* Define to force use of thread-safe errno, h_errno, and other functions */ +/* #undef _REENTRANT */ + +/* Define to the level of X/Open that your system supports */ +#define _XOPEN_SOURCE 700 + +/* Define to activate Unix95-and-earlier features */ +#define _XOPEN_SOURCE_EXTENDED 1 + +/* Define on FreeBSD to activate all library features */ +#define __BSD_VISIBLE 1 + +/* Define to 1 if type `char' is unsigned and you are not using gcc. */ +#ifndef __CHAR_UNSIGNED__ +/* # undef __CHAR_UNSIGNED__ */ +#endif + +/* Define to 'long' if doesn't define. */ +/* #undef clock_t */ + +/* Define to empty if `const' does not conform to ANSI C. */ +/* #undef const */ + +/* Define to `int' if doesn't define. */ +/* #undef gid_t */ + +/* Define to `__inline__' or `__inline' if that's what the C compiler + calls it, or to nothing if 'inline' is not supported under any name. */ +#ifndef __cplusplus +/* #undef inline */ +#endif + +/* Define to `int' if does not define. */ +/* #undef mode_t */ + +/* Define to `long int' if does not define. */ +/* #undef off_t */ + +/* Define to `int' if does not define. */ +/* #undef pid_t */ + +/* Define to empty if the keyword does not work. */ +/* #undef signed */ + +/* Define to `unsigned int' if does not define. */ +/* #undef size_t */ + +/* Define to `int' if does not define. 
*/ +/* #undef socklen_t */ + +/* Define to `int' if doesn't define. */ +/* #undef uid_t */ + +/* Define to empty if the keyword does not work. */ +/* #undef volatile */ + + +/* Define the macros needed if on a UnixWare 7.x system. */ +#if defined(__USLC__) && defined(__SCO_VERSION__) +#define STRICT_SYSV_CURSES /* Don't use ncurses extensions */ +#endif + +#endif /*Py_PYCONFIG_H*/ + diff --git a/cviruntime/python/include/pybind11/CMakeLists.txt b/cviruntime/python/include/pybind11/CMakeLists.txt new file mode 100644 index 000000000..08e17b53c --- /dev/null +++ b/cviruntime/python/include/pybind11/CMakeLists.txt @@ -0,0 +1,114 @@ +# CMakeLists.txt -- Build system for the pybind11 modules +# +# Copyright (c) 2015 Wenzel Jakob +# +# All rights reserved. Use of this source code is governed by a +# BSD-style license that can be found in the LICENSE file. + +cmake_minimum_required(VERSION 2.8.12) + +if (POLICY CMP0048) + # cmake warns if loaded from a min-3.0-required parent dir, so silence the warning: + cmake_policy(SET CMP0048 NEW) +endif() + +# CMake versions < 3.4.0 do not support try_compile/pthread checks without C as active language. +if(CMAKE_VERSION VERSION_LESS 3.4.0) + project(pybind11) +else() + project(pybind11 CXX) +endif() + +# Check if pybind11 is being used directly or via add_subdirectory +set(PYBIND11_MASTER_PROJECT OFF) +if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR) + set(PYBIND11_MASTER_PROJECT ON) +endif() + + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/tools") + +include(pybind11Tools) + +# Cache variables so pybind11_add_module can be used in parent projects +set(PYBIND11_INCLUDE_DIR "${CMAKE_CURRENT_LIST_DIR}/include" CACHE INTERNAL "") +set(PYTHON_INCLUDE_DIRS ${PYTHON_INCLUDE_DIRS} CACHE INTERNAL "") +set(PYTHON_LIBRARIES ${PYTHON_LIBRARIES} CACHE INTERNAL "") +set(PYTHON_MODULE_PREFIX ${PYTHON_MODULE_PREFIX} CACHE INTERNAL "") +set(PYTHON_MODULE_EXTENSION ${PYTHON_MODULE_EXTENSION} CACHE INTERNAL "") +set(PYTHON_VERSION_MAJOR ${PYTHON_VERSION_MAJOR} CACHE INTERNAL "") +set(PYTHON_VERSION_MINOR ${PYTHON_VERSION_MINOR} CACHE INTERNAL "") + +# NB: when adding a header don't forget to also add it to setup.py +set(PYBIND11_HEADERS + include/pybind11/detail/class.h + include/pybind11/detail/common.h + include/pybind11/detail/descr.h + include/pybind11/detail/init.h + include/pybind11/detail/internals.h + include/pybind11/detail/typeid.h + include/pybind11/attr.h + include/pybind11/buffer_info.h + include/pybind11/cast.h + include/pybind11/chrono.h + include/pybind11/common.h + include/pybind11/complex.h + include/pybind11/options.h + include/pybind11/eigen.h + include/pybind11/embed.h + include/pybind11/eval.h + include/pybind11/functional.h + include/pybind11/numpy.h + include/pybind11/operators.h + include/pybind11/pybind11.h + include/pybind11/pytypes.h + include/pybind11/stl.h + include/pybind11/stl_bind.h +) +string(REPLACE "include/" "${CMAKE_CURRENT_SOURCE_DIR}/include/" + PYBIND11_HEADERS "${PYBIND11_HEADERS}") + +include(GNUInstallDirs) +include(CMakePackageConfigHelpers) + +# extract project version from source +file(STRINGS "${PYBIND11_INCLUDE_DIR}/pybind11/detail/common.h" pybind11_version_defines + REGEX "#define PYBIND11_VERSION_(MAJOR|MINOR|PATCH) ") +foreach(ver ${pybind11_version_defines}) + if (ver MATCHES "#define PYBIND11_VERSION_(MAJOR|MINOR|PATCH) +([^ ]+)$") + set(PYBIND11_VERSION_${CMAKE_MATCH_1} "${CMAKE_MATCH_2}" CACHE INTERNAL "") + endif() +endforeach() +set(${PROJECT_NAME}_VERSION 
    ${PYBIND11_VERSION_MAJOR}.${PYBIND11_VERSION_MINOR}.${PYBIND11_VERSION_PATCH})
+message(STATUS "pybind11 v${${PROJECT_NAME}_VERSION}")
+
+option (USE_PYTHON_INCLUDE_DIR "Install pybind11 headers in Python include directory instead of default installation prefix" OFF)
+if (USE_PYTHON_INCLUDE_DIR)
+    file(RELATIVE_PATH CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_PREFIX} ${PYTHON_INCLUDE_DIRS})
+endif()
+
+if(NOT (CMAKE_VERSION VERSION_LESS 3.0)) # CMake >= 3.0
+  # Build an interface library target:
+  add_library(pybind11 INTERFACE)
+  add_library(pybind11::pybind11 ALIAS pybind11) # to match exported target
+  target_include_directories(pybind11 INTERFACE $<BUILD_INTERFACE:${PYBIND11_INCLUDE_DIR}>
+                                                $<BUILD_INTERFACE:${PYTHON_INCLUDE_DIRS}>
+                                                $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
+  target_compile_options(pybind11 INTERFACE $<BUILD_INTERFACE:${PYBIND11_CPP_STANDARD}>)
+
+  add_library(module INTERFACE)
+  add_library(pybind11::module ALIAS module)
+  if(NOT MSVC)
+    target_compile_options(module INTERFACE -fvisibility=hidden)
+  endif()
+  target_link_libraries(module INTERFACE pybind11::pybind11)
+  if(WIN32 OR CYGWIN)
+    target_link_libraries(module INTERFACE $<BUILD_INTERFACE:${PYTHON_LIBRARIES}>)
+  elseif(APPLE)
+    target_link_libraries(module INTERFACE "-undefined dynamic_lookup")
+  endif()
+
+  add_library(embed INTERFACE)
+  add_library(pybind11::embed ALIAS embed)
+  target_link_libraries(embed INTERFACE pybind11::pybind11 $<BUILD_INTERFACE:${PYTHON_LIBRARIES}>)
+endif()
diff --git a/cviruntime/python/include/pybind11/CONTRIBUTING.md b/cviruntime/python/include/pybind11/CONTRIBUTING.md
new file mode 100644
index 000000000..01596d94f
--- /dev/null
+++ b/cviruntime/python/include/pybind11/CONTRIBUTING.md
@@ -0,0 +1,49 @@
+Thank you for your interest in this project! Please refer to the following
+sections on how to contribute code and bug reports.
+
+### Reporting bugs
+
+At the moment, this project is run in the spare time of a single person
+([Wenzel Jakob](http://rgl.epfl.ch/people/wjakob)) with very limited resources
+for issue tracker tickets. Thus, before submitting a question or bug report,
+please take a moment of your time and ensure that your issue isn't already
+discussed in the project documentation provided at
+[http://pybind11.readthedocs.org/en/latest](http://pybind11.readthedocs.org/en/latest).
+
+Assuming that you have identified a previously unknown problem or an important
+question, it's essential that you submit a self-contained and minimal piece of
+code that reproduces the problem. In other words: no external dependencies,
+isolate the function(s) that cause breakage, submit matched and complete C++
+and Python snippets that can be easily compiled and run on my end.
+
+### Pull requests
+
+Contributions are submitted, reviewed, and accepted using GitHub pull requests.
+Please refer to [this
+article](https://help.github.com/articles/using-pull-requests) for details and
+adhere to the following rules to make the process as smooth as possible:
+
+* Make a new branch for every feature you're working on.
+* Make small and clean pull requests that are easy to review but make sure they
+  do add value by themselves.
+* Add tests for any new functionality and run the test suite (``make pytest``)
+  to ensure that no existing features break.
+* Please run ``flake8`` and ``tools/check-style.sh`` to check your code matches
+  the project style. (Note that ``check-style.sh`` requires ``gawk``.)
+* This project has a strong focus on providing general solutions using a
+  minimal amount of code, thus small pull requests are greatly preferred.
+
+### Licensing of contributions
+
+pybind11 is provided under a BSD-style license that can be found in the
+``LICENSE`` file. By using, distributing, or contributing to this project, you
+agree to the terms and conditions of this license.
+
+You are under no obligation whatsoever to provide any bug fixes, patches, or
+upgrades to the features, functionality or performance of the source code
+("Enhancements") to anyone; however, if you choose to make your Enhancements
+available either publicly, or directly to the author of this software, without
+imposing a separate written license agreement for such Enhancements, then you
+hereby grant the following license: a non-exclusive, royalty-free perpetual
+license to install, use, modify, prepare derivative works, incorporate into
+other computer software, distribute, and sublicense such enhancements or
+derivative works thereof, in binary and source code form.
diff --git a/cviruntime/python/include/pybind11/LICENSE b/cviruntime/python/include/pybind11/LICENSE
new file mode 100644
index 000000000..6f15578cc
--- /dev/null
+++ b/cviruntime/python/include/pybind11/LICENSE
@@ -0,0 +1,29 @@
+Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>, All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+   may be used to endorse or promote products derived from this software
+   without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+Please also refer to the file CONTRIBUTING.md, which clarifies licensing of
+external contributions to this project including patches, pull requests, etc.
diff --git a/cviruntime/python/include/pybind11/README.md b/cviruntime/python/include/pybind11/README.md new file mode 100644 index 000000000..35d2d76ff --- /dev/null +++ b/cviruntime/python/include/pybind11/README.md @@ -0,0 +1,129 @@ +![pybind11 logo](https://github.com/pybind/pybind11/raw/master/docs/pybind11-logo.png) + +# pybind11 — Seamless operability between C++11 and Python + +[![Documentation Status](https://readthedocs.org/projects/pybind11/badge/?version=master)](http://pybind11.readthedocs.org/en/master/?badge=master) +[![Documentation Status](https://readthedocs.org/projects/pybind11/badge/?version=stable)](http://pybind11.readthedocs.org/en/stable/?badge=stable) +[![Gitter chat](https://img.shields.io/gitter/room/gitterHQ/gitter.svg)](https://gitter.im/pybind/Lobby) +[![Build Status](https://travis-ci.org/pybind/pybind11.svg?branch=master)](https://travis-ci.org/pybind/pybind11) +[![Build status](https://ci.appveyor.com/api/projects/status/riaj54pn4h08xy40?svg=true)](https://ci.appveyor.com/project/wjakob/pybind11) + +**pybind11** is a lightweight header-only library that exposes C++ types in Python +and vice versa, mainly to create Python bindings of existing C++ code. Its +goals and syntax are similar to the excellent +[Boost.Python](http://www.boost.org/doc/libs/1_58_0/libs/python/doc/) library +by David Abrahams: to minimize boilerplate code in traditional extension +modules by inferring type information using compile-time introspection. + +The main issue with Boost.Python—and the reason for creating such a similar +project—is Boost. Boost is an enormously large and complex suite of utility +libraries that works with almost every C++ compiler in existence. This +compatibility has its cost: arcane template tricks and workarounds are +necessary to support the oldest and buggiest of compiler specimens. Now that +C++11-compatible compilers are widely available, this heavy machinery has +become an excessively large and unnecessary dependency. + +Think of this library as a tiny self-contained version of Boost.Python with +everything stripped away that isn't relevant for binding generation. Without +comments, the core header files only require ~4K lines of code and depend on +Python (2.7 or 3.x, or PyPy2.7 >= 5.7) and the C++ standard library. This +compact implementation was possible thanks to some of the new C++11 language +features (specifically: tuples, lambda functions and variadic templates). Since +its creation, this library has grown beyond Boost.Python in many ways, leading +to dramatically simpler binding code in many common situations. + +Tutorial and reference documentation is provided at +[http://pybind11.readthedocs.org/en/master](http://pybind11.readthedocs.org/en/master). +A PDF version of the manual is available +[here](https://media.readthedocs.org/pdf/pybind11/master/pybind11.pdf). 
+
+## Core features
+pybind11 can map the following core C++ features to Python (a brief usage
+sketch follows this README):
+
+- Functions accepting and returning custom data structures per value, reference, or pointer
+- Instance methods and static methods
+- Overloaded functions
+- Instance attributes and static attributes
+- Arbitrary exception types
+- Enumerations
+- Callbacks
+- Iterators and ranges
+- Custom operators
+- Single and multiple inheritance
+- STL data structures
+- Smart pointers with reference counting like ``std::shared_ptr``
+- Internal references with correct reference counting
+- C++ classes with virtual (and pure virtual) methods can be extended in Python
+
+## Goodies
+In addition to the core functionality, pybind11 provides some extra goodies:
+
+- Python 2.7, 3.x, and PyPy (PyPy2.7 >= 5.7) are supported with an
+  implementation-agnostic interface.
+
+- It is possible to bind C++11 lambda functions with captured variables. The
+  lambda capture data is stored inside the resulting Python function object.
+
+- pybind11 uses C++11 move constructors and move assignment operators whenever
+  possible to efficiently transfer custom data types.
+
+- It's easy to expose the internal storage of custom data types through
+  Python's buffer protocol. This is handy e.g. for fast conversion between
+  C++ matrix classes like Eigen and NumPy without expensive copy operations.
+
+- pybind11 can automatically vectorize functions so that they are transparently
+  applied to all entries of one or more NumPy array arguments.
+
+- Python's slice-based access and assignment operations can be supported with
+  just a few lines of code.
+
+- Everything is contained in just a few header files; there is no need to link
+  against any additional libraries.
+
+- Binaries are generally smaller by a factor of at least 2 compared to
+  equivalent bindings generated by Boost.Python. A recent pybind11 conversion
+  of PyRosetta, an enormous Boost.Python binding project,
+  [reported](http://graylab.jhu.edu/RosettaCon2016/PyRosetta-4.pdf) a binary
+  size reduction of **5.4x** and compile time reduction by **5.8x**.
+
+- Function signatures are precomputed at compile time (using ``constexpr``),
+  leading to smaller binaries.
+
+- With little extra effort, C++ types can be pickled and unpickled similar to
+  regular Python objects.
+
+## Supported compilers
+
+1. Clang/LLVM 3.3 or newer (for Apple Xcode's clang, this is 5.0.0 or newer)
+2. GCC 4.8 or newer
+3. Microsoft Visual Studio 2015 Update 3 or newer
+4. Intel C++ compiler 17 or newer (16 with pybind11 v2.0 and 15 with pybind11 v2.0 and a [workaround](https://github.com/pybind/pybind11/issues/276))
+5. Cygwin/GCC (tested on 2.5.1)
+
+## About
+
+This project was created by [Wenzel Jakob](http://rgl.epfl.ch/people/wjakob).
+Significant features and/or improvements to the code were contributed by
+Jonas Adler,
+Lori A. Burns,
+Sylvain Corlay,
+Trent Houliston,
+Axel Huebl,
+@hulucc,
+Sergey Lyskov,
+Johan Mabille,
+Tomasz Miąsko,
+Dean Moldovan,
+Ben Pritchard,
+Jason Rhinelander,
+Boris Schäling,
+Pim Schellart,
+Henry Schreiner,
+Ivan Smirnov, and
+Patrick Stewart.
+
+### License
+
+pybind11 is provided under a BSD-style license that can be found in the
+``LICENSE`` file. By using, distributing, or contributing to this project,
+you agree to the terms and conditions of this license.
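To make the README's feature lists concrete, here is a minimal, hypothetical binding module. It is not part of this diff: the module name `example`, the `Pet` class, and the `add` function are illustrative assumptions only, while the API calls (`PYBIND11_MODULE`, `py::class_`, `py::arg`) come from the vendored headers.

```cpp
// Hypothetical sketch, not part of the vendored sources.
#include <pybind11/pybind11.h>
#include <string>

namespace py = pybind11;

// A toy C++ type to expose to Python.
struct Pet {
    explicit Pet(std::string name) : name(std::move(name)) {}
    std::string name;
};

// A free function to expose with named arguments.
int add(int i, int j) { return i + j; }

PYBIND11_MODULE(example, m) {
    m.doc() = "minimal pybind11 example";  // module docstring

    // Function binding with a docstring and keyword arguments.
    m.def("add", &add, "Add two integers", py::arg("i"), py::arg("j"));

    // Class binding: constructor plus a read/write instance attribute.
    py::class_<Pet>(m, "Pet")
        .def(py::init<std::string>())
        .def_readwrite("name", &Pet::name);
}
```

Compiled into a shared library (for instance via the `pybind11_add_module` helper whose cache variables are set up in the CMakeLists.txt above), the result can be imported from Python as `import example`.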
diff --git a/cviruntime/python/include/pybind11/include/pybind11/attr.h b/cviruntime/python/include/pybind11/include/pybind11/attr.h
new file mode 100644
index 000000000..6962d6fc5
--- /dev/null
+++ b/cviruntime/python/include/pybind11/include/pybind11/attr.h
@@ -0,0 +1,493 @@
+/*
+    pybind11/attr.h: Infrastructure for processing custom
+    type and function attributes
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "cast.h"
+
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+/// \addtogroup annotations
+/// @{
+
+/// Annotation for methods
+struct is_method { handle class_; is_method(const handle &c) : class_(c) { } };
+
+/// Annotation for operators
+struct is_operator { };
+
+/// Annotation for parent scope
+struct scope { handle value; scope(const handle &s) : value(s) { } };
+
+/// Annotation for documentation
+struct doc { const char *value; doc(const char *value) : value(value) { } };
+
+/// Annotation for function names
+struct name { const char *value; name(const char *value) : value(value) { } };
+
+/// Annotation indicating that a function is an overload associated with a given "sibling"
+struct sibling { handle value; sibling(const handle &value) : value(value.ptr()) { } };
+
+/// Annotation indicating that a class derives from another given type
+template <typename T> struct base {
+    PYBIND11_DEPRECATED("base<T>() was deprecated in favor of specifying 'T' as a template argument to class_")
+    base() { }
+};
+
+/// Keep patient alive while nurse lives
+template <size_t Nurse, size_t Patient> struct keep_alive { };
+
+/// Annotation indicating that a class is involved in a multiple inheritance relationship
+struct multiple_inheritance { };
+
+/// Annotation which enables dynamic attributes, i.e. adds `__dict__` to a class
+struct dynamic_attr { };
+
+/// Annotation which enables the buffer protocol for a type
+struct buffer_protocol { };
+
+/// Annotation which requests that a special metaclass is created for a type
+struct metaclass {
+    handle value;
+
+    PYBIND11_DEPRECATED("py::metaclass() is no longer required. It's turned on by default now.")
+    metaclass() {}
+
+    /// Override pybind11's default metaclass
+    explicit metaclass(handle value) : value(value) { }
+};
+
+/// Annotation that marks a class as local to the module:
+struct module_local { const bool value; constexpr module_local(bool v = true) : value(v) { } };
+
+/// Annotation to mark enums as an arithmetic type
+struct arithmetic { };
+
+/** \rst
+    A call policy which places one or more guard variables (``Ts...``) around the function call.
+
+    For example, this definition:
+
+    .. code-block:: cpp
+
+        m.def("foo", foo, py::call_guard<T>());
+
+    is equivalent to the following pseudocode:
+
+    .. code-block:: cpp
+
+        m.def("foo", [](args...) {
+            T scope_guard;
+            return foo(args...); // forwarded arguments
+        });
+ \endrst */
+template <typename... Ts> struct call_guard;
+
+template <> struct call_guard<> { using type = detail::void_type; };
+
+template <typename T>
+struct call_guard<T> {
+    static_assert(std::is_default_constructible<T>::value,
+                  "The guard type must be default constructible");
+
+    using type = T;
+};
+
+template <typename T, typename... Ts>
+struct call_guard<T, Ts...> {
+    struct type {
+        T guard{}; // Compose multiple guard types with left-to-right default-constructor order
+        typename call_guard<Ts...>::type next{};
+    };
+};
+
+/// @} annotations
+
+NAMESPACE_BEGIN(detail)
+/* Forward declarations */
+enum op_id : int;
+enum op_type : int;
+struct undefined_t;
+template <op_id id, op_type ot, typename L, typename R> struct op_;
+inline void keep_alive_impl(size_t Nurse, size_t Patient, function_call &call, handle ret);
+
+/// Internal data structure which holds metadata about a keyword argument
+struct argument_record {
+    const char *name;  ///< Argument name
+    const char *descr; ///< Human-readable version of the argument value
+    handle value;      ///< Associated Python object
+    bool convert : 1;  ///< True if the argument is allowed to convert when loading
+    bool none : 1;     ///< True if None is allowed when loading
+
+    argument_record(const char *name, const char *descr, handle value, bool convert, bool none)
+        : name(name), descr(descr), value(value), convert(convert), none(none) { }
+};
+
+/// Internal data structure which holds metadata about a bound function (signature, overloads, etc.)
+struct function_record {
+    function_record()
+        : is_constructor(false), is_new_style_constructor(false), is_stateless(false),
+          is_operator(false), has_args(false), has_kwargs(false), is_method(false) { }
+
+    /// Function name
+    char *name = nullptr; /* why no C++ strings? They generate heavier code.. */
+
+    // User-specified documentation string
+    char *doc = nullptr;
+
+    /// Human-readable version of the function signature
+    char *signature = nullptr;
+
+    /// List of registered keyword arguments
+    std::vector<argument_record> args;
+
+    /// Pointer to lambda function which converts arguments and performs the actual call
+    handle (*impl) (function_call &) = nullptr;
+
+    /// Storage for the wrapped function pointer and captured data, if any
+    void *data[3] = { };
+
+    /// Pointer to custom destructor for 'data' (if needed)
+    void (*free_data) (function_record *ptr) = nullptr;
+
+    /// Return value policy associated with this function
+    return_value_policy policy = return_value_policy::automatic;
+
+    /// True if name == '__init__'
+    bool is_constructor : 1;
+
+    /// True if this is a new-style `__init__` defined in `detail/init.h`
+    bool is_new_style_constructor : 1;
+
+    /// True if this is a stateless function pointer
+    bool is_stateless : 1;
+
+    /// True if this is an operator (__add__), etc.
+ bool is_operator : 1; + + /// True if the function has a '*args' argument + bool has_args : 1; + + /// True if the function has a '**kwargs' argument + bool has_kwargs : 1; + + /// True if this is a method + bool is_method : 1; + + /// Number of arguments (including py::args and/or py::kwargs, if present) + std::uint16_t nargs; + + /// Python method object + PyMethodDef *def = nullptr; + + /// Python handle to the parent scope (a class or a module) + handle scope; + + /// Python handle to the sibling function representing an overload chain + handle sibling; + + /// Pointer to next overload + function_record *next = nullptr; +}; + +/// Special data structure which (temporarily) holds metadata about a bound class +struct type_record { + PYBIND11_NOINLINE type_record() + : multiple_inheritance(false), dynamic_attr(false), buffer_protocol(false), + default_holder(true), module_local(false) { } + + /// Handle to the parent scope + handle scope; + + /// Name of the class + const char *name = nullptr; + + // Pointer to RTTI type_info data structure + const std::type_info *type = nullptr; + + /// How large is the underlying C++ type? + size_t type_size = 0; + + /// What is the alignment of the underlying C++ type? + size_t type_align = 0; + + /// How large is the type's holder? + size_t holder_size = 0; + + /// The global operator new can be overridden with a class-specific variant + void *(*operator_new)(size_t) = nullptr; + + /// Function pointer to class_<..>::init_instance + void (*init_instance)(instance *, const void *) = nullptr; + + /// Function pointer to class_<..>::dealloc + void (*dealloc)(detail::value_and_holder &) = nullptr; + + /// List of base classes of the newly created type + list bases; + + /// Optional docstring + const char *doc = nullptr; + + /// Custom metaclass (optional) + handle metaclass; + + /// Multiple inheritance marker + bool multiple_inheritance : 1; + + /// Does the class manage a __dict__? + bool dynamic_attr : 1; + + /// Does the class implement the buffer protocol? + bool buffer_protocol : 1; + + /// Is the default (unique_ptr) holder type used? + bool default_holder : 1; + + /// Is the class definition local to the module shared object? + bool module_local : 1; + + PYBIND11_NOINLINE void add_base(const std::type_info &base, void *(*caster)(void *)) { + auto base_info = detail::get_type_info(base, false); + if (!base_info) { + std::string tname(base.name()); + detail::clean_type_id(tname); + pybind11_fail("generic_type: type \"" + std::string(name) + + "\" referenced unknown base type \"" + tname + "\""); + } + + if (default_holder != base_info->default_holder) { + std::string tname(base.name()); + detail::clean_type_id(tname); + pybind11_fail("generic_type: type \"" + std::string(name) + "\" " + + (default_holder ? "does not have" : "has") + + " a non-default holder type while its base \"" + tname + "\" " + + (base_info->default_holder ? "does not" : "does")); + } + + bases.append((PyObject *) base_info->type); + + if (base_info->type->tp_dictoffset != 0) + dynamic_attr = true; + + if (caster) + base_info->implicit_casts.emplace_back(type, caster); + } +}; + +inline function_call::function_call(const function_record &f, handle p) : + func(f), parent(p) { + args.reserve(f.nargs); + args_convert.reserve(f.nargs); +} + +/// Tag for a new-style `__init__` defined in `detail/init.h` +struct is_new_style_constructor { }; + +/** + * Partial template specializations to process custom attributes provided to + * cpp_function_ and class_. 
These are either used to initialize the respective + * fields in the type_record and function_record data structures or executed at + * runtime to deal with custom call policies (e.g. keep_alive). + */ +template struct process_attribute; + +template struct process_attribute_default { + /// Default implementation: do nothing + static void init(const T &, function_record *) { } + static void init(const T &, type_record *) { } + static void precall(function_call &) { } + static void postcall(function_call &, handle) { } +}; + +/// Process an attribute specifying the function's name +template <> struct process_attribute : process_attribute_default { + static void init(const name &n, function_record *r) { r->name = const_cast(n.value); } +}; + +/// Process an attribute specifying the function's docstring +template <> struct process_attribute : process_attribute_default { + static void init(const doc &n, function_record *r) { r->doc = const_cast(n.value); } +}; + +/// Process an attribute specifying the function's docstring (provided as a C-style string) +template <> struct process_attribute : process_attribute_default { + static void init(const char *d, function_record *r) { r->doc = const_cast(d); } + static void init(const char *d, type_record *r) { r->doc = const_cast(d); } +}; +template <> struct process_attribute : process_attribute { }; + +/// Process an attribute indicating the function's return value policy +template <> struct process_attribute : process_attribute_default { + static void init(const return_value_policy &p, function_record *r) { r->policy = p; } +}; + +/// Process an attribute which indicates that this is an overloaded function associated with a given sibling +template <> struct process_attribute : process_attribute_default { + static void init(const sibling &s, function_record *r) { r->sibling = s.value; } +}; + +/// Process an attribute which indicates that this function is a method +template <> struct process_attribute : process_attribute_default { + static void init(const is_method &s, function_record *r) { r->is_method = true; r->scope = s.class_; } +}; + +/// Process an attribute which indicates the parent scope of a method +template <> struct process_attribute : process_attribute_default { + static void init(const scope &s, function_record *r) { r->scope = s.value; } +}; + +/// Process an attribute which indicates that this function is an operator +template <> struct process_attribute : process_attribute_default { + static void init(const is_operator &, function_record *r) { r->is_operator = true; } +}; + +template <> struct process_attribute : process_attribute_default { + static void init(const is_new_style_constructor &, function_record *r) { r->is_new_style_constructor = true; } +}; + +/// Process a keyword argument attribute (*without* a default value) +template <> struct process_attribute : process_attribute_default { + static void init(const arg &a, function_record *r) { + if (r->is_method && r->args.empty()) + r->args.emplace_back("self", nullptr, handle(), true /*convert*/, false /*none not allowed*/); + r->args.emplace_back(a.name, nullptr, handle(), !a.flag_noconvert, a.flag_none); + } +}; + +/// Process a keyword argument attribute (*with* a default value) +template <> struct process_attribute : process_attribute_default { + static void init(const arg_v &a, function_record *r) { + if (r->is_method && r->args.empty()) + r->args.emplace_back("self", nullptr /*descr*/, handle() /*parent*/, true /*convert*/, false /*none not allowed*/); + + if (!a.value) { 
+#if !defined(NDEBUG) + std::string descr("'"); + if (a.name) descr += std::string(a.name) + ": "; + descr += a.type + "'"; + if (r->is_method) { + if (r->name) + descr += " in method '" + (std::string) str(r->scope) + "." + (std::string) r->name + "'"; + else + descr += " in method of '" + (std::string) str(r->scope) + "'"; + } else if (r->name) { + descr += " in function '" + (std::string) r->name + "'"; + } + pybind11_fail("arg(): could not convert default argument " + + descr + " into a Python object (type not registered yet?)"); +#else + pybind11_fail("arg(): could not convert default argument " + "into a Python object (type not registered yet?). " + "Compile in debug mode for more information."); +#endif + } + r->args.emplace_back(a.name, a.descr, a.value.inc_ref(), !a.flag_noconvert, a.flag_none); + } +}; + +/// Process a parent class attribute. Single inheritance only (class_ itself already guarantees that) +template +struct process_attribute::value>> : process_attribute_default { + static void init(const handle &h, type_record *r) { r->bases.append(h); } +}; + +/// Process a parent class attribute (deprecated, does not support multiple inheritance) +template +struct process_attribute> : process_attribute_default> { + static void init(const base &, type_record *r) { r->add_base(typeid(T), nullptr); } +}; + +/// Process a multiple inheritance attribute +template <> +struct process_attribute : process_attribute_default { + static void init(const multiple_inheritance &, type_record *r) { r->multiple_inheritance = true; } +}; + +template <> +struct process_attribute : process_attribute_default { + static void init(const dynamic_attr &, type_record *r) { r->dynamic_attr = true; } +}; + +template <> +struct process_attribute : process_attribute_default { + static void init(const buffer_protocol &, type_record *r) { r->buffer_protocol = true; } +}; + +template <> +struct process_attribute : process_attribute_default { + static void init(const metaclass &m, type_record *r) { r->metaclass = m.value; } +}; + +template <> +struct process_attribute : process_attribute_default { + static void init(const module_local &l, type_record *r) { r->module_local = l.value; } +}; + +/// Process an 'arithmetic' attribute for enums (does nothing here) +template <> +struct process_attribute : process_attribute_default {}; + +template +struct process_attribute> : process_attribute_default> { }; + +/** + * Process a keep_alive call policy -- invokes keep_alive_impl during the + * pre-call handler if both Nurse, Patient != 0 and use the post-call handler + * otherwise + */ +template struct process_attribute> : public process_attribute_default> { + template = 0> + static void precall(function_call &call) { keep_alive_impl(Nurse, Patient, call, handle()); } + template = 0> + static void postcall(function_call &, handle) { } + template = 0> + static void precall(function_call &) { } + template = 0> + static void postcall(function_call &call, handle ret) { keep_alive_impl(Nurse, Patient, call, ret); } +}; + +/// Recursively iterate over variadic template arguments +template struct process_attributes { + static void init(const Args&... args, function_record *r) { + int unused[] = { 0, (process_attribute::type>::init(args, r), 0) ... }; + ignore_unused(unused); + } + static void init(const Args&... args, type_record *r) { + int unused[] = { 0, (process_attribute::type>::init(args, r), 0) ... 
}; + ignore_unused(unused); + } + static void precall(function_call &call) { + int unused[] = { 0, (process_attribute::type>::precall(call), 0) ... }; + ignore_unused(unused); + } + static void postcall(function_call &call, handle fn_ret) { + int unused[] = { 0, (process_attribute::type>::postcall(call, fn_ret), 0) ... }; + ignore_unused(unused); + } +}; + +template +using is_call_guard = is_instantiation; + +/// Extract the ``type`` from the first `call_guard` in `Extras...` (or `void_type` if none found) +template +using extract_guard_t = typename exactly_one_t, Extra...>::type; + +/// Check the number of named arguments at compile time +template ::value...), + size_t self = constexpr_sum(std::is_same::value...)> +constexpr bool expected_num_args(size_t nargs, bool has_args, bool has_kwargs) { + return named == 0 || (self + named + has_args + has_kwargs) == nargs; +} + +NAMESPACE_END(detail) +NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/cviruntime/python/include/pybind11/include/pybind11/buffer_info.h b/cviruntime/python/include/pybind11/include/pybind11/buffer_info.h new file mode 100644 index 000000000..1f4115a1f --- /dev/null +++ b/cviruntime/python/include/pybind11/include/pybind11/buffer_info.h @@ -0,0 +1,114 @@ +/* + pybind11/buffer_info.h: Python buffer object interface + + Copyright (c) 2016 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. +*/ + +#pragma once + +#include "detail/common.h" + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) + +/// Information record describing a Python buffer object +struct buffer_info { + void *ptr = nullptr; // Pointer to the underlying storage + ssize_t itemsize = 0; // Size of individual items in bytes + ssize_t size = 0; // Total number of entries + std::string format; // For homogeneous buffers, this should be set to format_descriptor::format() + ssize_t ndim = 0; // Number of dimensions + std::vector shape; // Shape of the tensor (1 entry per dimension) + std::vector strides; // Number of bytes between adjacent entries (for each per dimension) + bool readonly = false; // flag to indicate if the underlying storage may be written to + + buffer_info() { } + + buffer_info(void *ptr, ssize_t itemsize, const std::string &format, ssize_t ndim, + detail::any_container shape_in, detail::any_container strides_in, bool readonly=false) + : ptr(ptr), itemsize(itemsize), size(1), format(format), ndim(ndim), + shape(std::move(shape_in)), strides(std::move(strides_in)), readonly(readonly) { + if (ndim != (ssize_t) shape.size() || ndim != (ssize_t) strides.size()) + pybind11_fail("buffer_info: ndim doesn't match shape and/or strides length"); + for (size_t i = 0; i < (size_t) ndim; ++i) + size *= shape[i]; + } + + template + buffer_info(T *ptr, detail::any_container shape_in, detail::any_container strides_in, bool readonly=false) + : buffer_info(private_ctr_tag(), ptr, sizeof(T), format_descriptor::format(), static_cast(shape_in->size()), std::move(shape_in), std::move(strides_in), readonly) { } + + buffer_info(void *ptr, ssize_t itemsize, const std::string &format, ssize_t size, bool readonly=false) + : buffer_info(ptr, itemsize, format, 1, {size}, {itemsize}, readonly) { } + + template + buffer_info(T *ptr, ssize_t size, bool readonly=false) + : buffer_info(ptr, sizeof(T), format_descriptor::format(), size, readonly) { } + + template + buffer_info(const T *ptr, ssize_t size, bool readonly=true) + : buffer_info(const_cast(ptr), sizeof(T), format_descriptor::format(), size, 
readonly) { } + + explicit buffer_info(Py_buffer *view, bool ownview = true) + : buffer_info(view->buf, view->itemsize, view->format, view->ndim, + {view->shape, view->shape + view->ndim}, {view->strides, view->strides + view->ndim}, view->readonly) { + this->view = view; + this->ownview = ownview; + } + + buffer_info(const buffer_info &) = delete; + buffer_info& operator=(const buffer_info &) = delete; + + buffer_info(buffer_info &&other) { + (*this) = std::move(other); + } + + buffer_info& operator=(buffer_info &&rhs) { + ptr = rhs.ptr; + itemsize = rhs.itemsize; + size = rhs.size; + format = std::move(rhs.format); + ndim = rhs.ndim; + shape = std::move(rhs.shape); + strides = std::move(rhs.strides); + std::swap(view, rhs.view); + std::swap(ownview, rhs.ownview); + readonly = rhs.readonly; + return *this; + } + + ~buffer_info() { + if (view && ownview) { PyBuffer_Release(view); delete view; } + } + +private: + struct private_ctr_tag { }; + + buffer_info(private_ctr_tag, void *ptr, ssize_t itemsize, const std::string &format, ssize_t ndim, + detail::any_container &&shape_in, detail::any_container &&strides_in, bool readonly) + : buffer_info(ptr, itemsize, format, ndim, std::move(shape_in), std::move(strides_in), readonly) { } + + Py_buffer *view = nullptr; + bool ownview = false; +}; + +NAMESPACE_BEGIN(detail) + +template struct compare_buffer_info { + static bool compare(const buffer_info& b) { + return b.format == format_descriptor::format() && b.itemsize == (ssize_t) sizeof(T); + } +}; + +template struct compare_buffer_info::value>> { + static bool compare(const buffer_info& b) { + return (size_t) b.itemsize == sizeof(T) && (b.format == format_descriptor::value || + ((sizeof(T) == sizeof(long)) && b.format == (std::is_unsigned::value ? "L" : "l")) || + ((sizeof(T) == sizeof(size_t)) && b.format == (std::is_unsigned::value ? "N" : "n"))); + } +}; + +NAMESPACE_END(detail) +NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/cviruntime/python/include/pybind11/include/pybind11/cast.h b/cviruntime/python/include/pybind11/include/pybind11/cast.h new file mode 100644 index 000000000..fcfd0a8e3 --- /dev/null +++ b/cviruntime/python/include/pybind11/include/pybind11/cast.h @@ -0,0 +1,2185 @@ +/* + pybind11/cast.h: Partial template specializations to cast between + C++ and Python types + + Copyright (c) 2016 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. +*/ + +#pragma once + +#include "pytypes.h" +#include "detail/typeid.h" +#include "detail/descr.h" +#include "detail/internals.h" +#include +#include +#include +#include + +#if defined(PYBIND11_CPP17) +# if defined(__has_include) +# if __has_include() +# define PYBIND11_HAS_STRING_VIEW +# endif +# elif defined(_MSC_VER) +# define PYBIND11_HAS_STRING_VIEW +# endif +#endif +#ifdef PYBIND11_HAS_STRING_VIEW +#include +#endif + +#if defined(__cpp_lib_char8_t) && __cpp_lib_char8_t >= 201811L +# define PYBIND11_HAS_U8STRING +#endif + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +NAMESPACE_BEGIN(detail) + +/// A life support system for temporary objects created by `type_caster::load()`. +/// Adding a patient will keep it alive up until the enclosing function returns. +class loader_life_support { +public: + /// A new patient frame is created when a function is entered + loader_life_support() { + get_internals().loader_patient_stack.push_back(nullptr); + } + + /// ... 
and destroyed after it returns + ~loader_life_support() { + auto &stack = get_internals().loader_patient_stack; + if (stack.empty()) + pybind11_fail("loader_life_support: internal error"); + + auto ptr = stack.back(); + stack.pop_back(); + Py_CLEAR(ptr); + + // A heuristic to reduce the stack's capacity (e.g. after long recursive calls) + if (stack.capacity() > 16 && stack.size() != 0 && stack.capacity() / stack.size() > 2) + stack.shrink_to_fit(); + } + + /// This can only be used inside a pybind11-bound function, either by `argument_loader` + /// at argument preparation time or by `py::cast()` at execution time. + PYBIND11_NOINLINE static void add_patient(handle h) { + auto &stack = get_internals().loader_patient_stack; + if (stack.empty()) + throw cast_error("When called outside a bound function, py::cast() cannot " + "do Python -> C++ conversions which require the creation " + "of temporary values"); + + auto &list_ptr = stack.back(); + if (list_ptr == nullptr) { + list_ptr = PyList_New(1); + if (!list_ptr) + pybind11_fail("loader_life_support: error allocating list"); + PyList_SET_ITEM(list_ptr, 0, h.inc_ref().ptr()); + } else { + auto result = PyList_Append(list_ptr, h.ptr()); + if (result == -1) + pybind11_fail("loader_life_support: error adding patient"); + } + } +}; + +// Gets the cache entry for the given type, creating it if necessary. The return value is the pair +// returned by emplace, i.e. an iterator for the entry and a bool set to `true` if the entry was +// just created. +inline std::pair all_type_info_get_cache(PyTypeObject *type); + +// Populates a just-created cache entry. +PYBIND11_NOINLINE inline void all_type_info_populate(PyTypeObject *t, std::vector &bases) { + std::vector check; + for (handle parent : reinterpret_borrow(t->tp_bases)) + check.push_back((PyTypeObject *) parent.ptr()); + + auto const &type_dict = get_internals().registered_types_py; + for (size_t i = 0; i < check.size(); i++) { + auto type = check[i]; + // Ignore Python2 old-style class super type: + if (!PyType_Check((PyObject *) type)) continue; + + // Check `type` in the current set of registered python types: + auto it = type_dict.find(type); + if (it != type_dict.end()) { + // We found a cache entry for it, so it's either pybind-registered or has pre-computed + // pybind bases, but we have to make sure we haven't already seen the type(s) before: we + // want to follow Python/virtual C++ rules that there should only be one instance of a + // common base. + for (auto *tinfo : it->second) { + // NB: Could use a second set here, rather than doing a linear search, but since + // having a large number of immediate pybind11-registered types seems fairly + // unlikely, that probably isn't worthwhile. + bool found = false; + for (auto *known : bases) { + if (known == tinfo) { found = true; break; } + } + if (!found) bases.push_back(tinfo); + } + } + else if (type->tp_bases) { + // It's some python type, so keep follow its bases classes to look for one or more + // registered types + if (i + 1 == check.size()) { + // When we're at the end, we can pop off the current element to avoid growing + // `check` when adding just one base (which is typical--i.e. when there is no + // multiple inheritance) + check.pop_back(); + i--; + } + for (handle parent : reinterpret_borrow(type->tp_bases)) + check.push_back((PyTypeObject *) parent.ptr()); + } + } +} + +/** + * Extracts vector of type_info pointers of pybind-registered roots of the given Python type. 
Will + * be just 1 pybind type for the Python type of a pybind-registered class, or for any Python-side + * derived class that uses single inheritance. Will contain as many types as required for a Python + * class that uses multiple inheritance to inherit (directly or indirectly) from multiple + * pybind-registered classes. Will be empty if neither the type nor any base classes are + * pybind-registered. + * + * The value is cached for the lifetime of the Python type. + */ +inline const std::vector &all_type_info(PyTypeObject *type) { + auto ins = all_type_info_get_cache(type); + if (ins.second) + // New cache entry: populate it + all_type_info_populate(type, ins.first->second); + + return ins.first->second; +} + +/** + * Gets a single pybind11 type info for a python type. Returns nullptr if neither the type nor any + * ancestors are pybind11-registered. Throws an exception if there are multiple bases--use + * `all_type_info` instead if you want to support multiple bases. + */ +PYBIND11_NOINLINE inline detail::type_info* get_type_info(PyTypeObject *type) { + auto &bases = all_type_info(type); + if (bases.size() == 0) + return nullptr; + if (bases.size() > 1) + pybind11_fail("pybind11::detail::get_type_info: type has multiple pybind11-registered bases"); + return bases.front(); +} + +inline detail::type_info *get_local_type_info(const std::type_index &tp) { + auto &locals = registered_local_types_cpp(); + auto it = locals.find(tp); + if (it != locals.end()) + return it->second; + return nullptr; +} + +inline detail::type_info *get_global_type_info(const std::type_index &tp) { + auto &types = get_internals().registered_types_cpp; + auto it = types.find(tp); + if (it != types.end()) + return it->second; + return nullptr; +} + +/// Return the type info for a given C++ type; on lookup failure can either throw or return nullptr. +PYBIND11_NOINLINE inline detail::type_info *get_type_info(const std::type_index &tp, + bool throw_if_missing = false) { + if (auto ltype = get_local_type_info(tp)) + return ltype; + if (auto gtype = get_global_type_info(tp)) + return gtype; + + if (throw_if_missing) { + std::string tname = tp.name(); + detail::clean_type_id(tname); + pybind11_fail("pybind11::detail::get_type_info: unable to find type info for \"" + tname + "\""); + } + return nullptr; +} + +PYBIND11_NOINLINE inline handle get_type_handle(const std::type_info &tp, bool throw_if_missing) { + detail::type_info *type_info = get_type_info(tp, throw_if_missing); + return handle(type_info ? ((PyObject *) type_info->type) : nullptr); +} + +struct value_and_holder { + instance *inst = nullptr; + size_t index = 0u; + const detail::type_info *type = nullptr; + void **vh = nullptr; + + // Main constructor for a found value/holder: + value_and_holder(instance *i, const detail::type_info *type, size_t vpos, size_t index) : + inst{i}, index{index}, type{type}, + vh{inst->simple_layout ? inst->simple_value_holder : &inst->nonsimple.values_and_holders[vpos]} + {} + + // Default constructor (used to signal a value-and-holder not found by get_value_and_holder()) + value_and_holder() {} + + // Used for past-the-end iterator + value_and_holder(size_t index) : index{index} {} + + template V *&value_ptr() const { + return reinterpret_cast(vh[0]); + } + // True if this `value_and_holder` has a non-null value pointer + explicit operator bool() const { return value_ptr(); } + + template H &holder() const { + return reinterpret_cast(vh[1]); + } + bool holder_constructed() const { + return inst->simple_layout + ? 
inst->simple_holder_constructed + : inst->nonsimple.status[index] & instance::status_holder_constructed; + } + void set_holder_constructed(bool v = true) { + if (inst->simple_layout) + inst->simple_holder_constructed = v; + else if (v) + inst->nonsimple.status[index] |= instance::status_holder_constructed; + else + inst->nonsimple.status[index] &= (uint8_t) ~instance::status_holder_constructed; + } + bool instance_registered() const { + return inst->simple_layout + ? inst->simple_instance_registered + : inst->nonsimple.status[index] & instance::status_instance_registered; + } + void set_instance_registered(bool v = true) { + if (inst->simple_layout) + inst->simple_instance_registered = v; + else if (v) + inst->nonsimple.status[index] |= instance::status_instance_registered; + else + inst->nonsimple.status[index] &= (uint8_t) ~instance::status_instance_registered; + } +}; + +// Container for accessing and iterating over an instance's values/holders +struct values_and_holders { +private: + instance *inst; + using type_vec = std::vector; + const type_vec &tinfo; + +public: + values_and_holders(instance *inst) : inst{inst}, tinfo(all_type_info(Py_TYPE(inst))) {} + + struct iterator { + private: + instance *inst = nullptr; + const type_vec *types = nullptr; + value_and_holder curr; + friend struct values_and_holders; + iterator(instance *inst, const type_vec *tinfo) + : inst{inst}, types{tinfo}, + curr(inst /* instance */, + types->empty() ? nullptr : (*types)[0] /* type info */, + 0, /* vpos: (non-simple types only): the first vptr comes first */ + 0 /* index */) + {} + // Past-the-end iterator: + iterator(size_t end) : curr(end) {} + public: + bool operator==(const iterator &other) { return curr.index == other.curr.index; } + bool operator!=(const iterator &other) { return curr.index != other.curr.index; } + iterator &operator++() { + if (!inst->simple_layout) + curr.vh += 1 + (*types)[curr.index]->holder_size_in_ptrs; + ++curr.index; + curr.type = curr.index < types->size() ? (*types)[curr.index] : nullptr; + return *this; + } + value_and_holder &operator*() { return curr; } + value_and_holder *operator->() { return &curr; } + }; + + iterator begin() { return iterator(inst, &tinfo); } + iterator end() { return iterator(tinfo.size()); } + + iterator find(const type_info *find_type) { + auto it = begin(), endit = end(); + while (it != endit && it->type != find_type) ++it; + return it; + } + + size_t size() { return tinfo.size(); } +}; + +/** + * Extracts C++ value and holder pointer references from an instance (which may contain multiple + * values/holders for python-side multiple inheritance) that match the given type. Throws an error + * if the given type (or ValueType, if omitted) is not a pybind11 base of the given instance. If + * `find_type` is omitted (or explicitly specified as nullptr) the first value/holder are returned, + * regardless of type (and the resulting .type will be nullptr). + * + * The returned object should be short-lived: in particular, it must not outlive the called-upon + * instance. 
+ */ +PYBIND11_NOINLINE inline value_and_holder instance::get_value_and_holder(const type_info *find_type /*= nullptr default in common.h*/, bool throw_if_missing /*= true in common.h*/) { + // Optimize common case: + if (!find_type || Py_TYPE(this) == find_type->type) + return value_and_holder(this, find_type, 0, 0); + + detail::values_and_holders vhs(this); + auto it = vhs.find(find_type); + if (it != vhs.end()) + return *it; + + if (!throw_if_missing) + return value_and_holder(); + +#if defined(NDEBUG) + pybind11_fail("pybind11::detail::instance::get_value_and_holder: " + "type is not a pybind11 base of the given instance " + "(compile in debug mode for type details)"); +#else + pybind11_fail("pybind11::detail::instance::get_value_and_holder: `" + + std::string(find_type->type->tp_name) + "' is not a pybind11 base of the given `" + + std::string(Py_TYPE(this)->tp_name) + "' instance"); +#endif +} + +PYBIND11_NOINLINE inline void instance::allocate_layout() { + auto &tinfo = all_type_info(Py_TYPE(this)); + + const size_t n_types = tinfo.size(); + + if (n_types == 0) + pybind11_fail("instance allocation failed: new instance has no pybind11-registered base types"); + + simple_layout = + n_types == 1 && tinfo.front()->holder_size_in_ptrs <= instance_simple_holder_in_ptrs(); + + // Simple path: no python-side multiple inheritance, and a small-enough holder + if (simple_layout) { + simple_value_holder[0] = nullptr; + simple_holder_constructed = false; + simple_instance_registered = false; + } + else { // multiple base types or a too-large holder + // Allocate space to hold: [v1*][h1][v2*][h2]...[bb...] where [vN*] is a value pointer, + // [hN] is the (uninitialized) holder instance for value N, and [bb...] is a set of bool + // values that tracks whether each associated holder has been initialized. Each [block] is + // padded, if necessary, to an integer multiple of sizeof(void *). + size_t space = 0; + for (auto t : tinfo) { + space += 1; // value pointer + space += t->holder_size_in_ptrs; // holder instance + } + size_t flags_at = space; + space += size_in_ptrs(n_types); // status bytes (holder_constructed and instance_registered) + + // Allocate space for flags, values, and holders, and initialize it to 0 (flags and values, + // in particular, need to be 0). Use Python's memory allocation functions: in Python 3.6 + // they default to using pymalloc, which is designed to be efficient for small allocations + // like the one we're doing here; in earlier versions (and for larger allocations) they are + // just wrappers around malloc. 
+#if PY_VERSION_HEX >= 0x03050000 + nonsimple.values_and_holders = (void **) PyMem_Calloc(space, sizeof(void *)); + if (!nonsimple.values_and_holders) throw std::bad_alloc(); +#else + nonsimple.values_and_holders = (void **) PyMem_New(void *, space); + if (!nonsimple.values_and_holders) throw std::bad_alloc(); + std::memset(nonsimple.values_and_holders, 0, space * sizeof(void *)); +#endif + nonsimple.status = reinterpret_cast(&nonsimple.values_and_holders[flags_at]); + } + owned = true; +} + +PYBIND11_NOINLINE inline void instance::deallocate_layout() { + if (!simple_layout) + PyMem_Free(nonsimple.values_and_holders); +} + +PYBIND11_NOINLINE inline bool isinstance_generic(handle obj, const std::type_info &tp) { + handle type = detail::get_type_handle(tp, false); + if (!type) + return false; + return isinstance(obj, type); +} + +PYBIND11_NOINLINE inline std::string error_string() { + if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_RuntimeError, "Unknown internal error occurred"); + return "Unknown internal error occurred"; + } + + error_scope scope; // Preserve error state + + std::string errorString; + if (scope.type) { + errorString += handle(scope.type).attr("__name__").cast(); + errorString += ": "; + } + if (scope.value) + errorString += (std::string) str(scope.value); + + PyErr_NormalizeException(&scope.type, &scope.value, &scope.trace); + +#if PY_MAJOR_VERSION >= 3 + if (scope.trace != nullptr) + PyException_SetTraceback(scope.value, scope.trace); +#endif + +#if !defined(PYPY_VERSION) + if (scope.trace) { + PyTracebackObject *trace = (PyTracebackObject *) scope.trace; + + /* Get the deepest trace possible */ + while (trace->tb_next) + trace = trace->tb_next; + + PyFrameObject *frame = trace->tb_frame; + errorString += "\n\nAt:\n"; + while (frame) { + int lineno = PyFrame_GetLineNumber(frame); + errorString += + " " + handle(frame->f_code->co_filename).cast() + + "(" + std::to_string(lineno) + "): " + + handle(frame->f_code->co_name).cast() + "\n"; + frame = frame->f_back; + } + } +#endif + + return errorString; +} + +PYBIND11_NOINLINE inline handle get_object_handle(const void *ptr, const detail::type_info *type ) { + auto &instances = get_internals().registered_instances; + auto range = instances.equal_range(ptr); + for (auto it = range.first; it != range.second; ++it) { + for (auto vh : values_and_holders(it->second)) { + if (vh.type == type) + return handle((PyObject *) it->second); + } + } + return handle(); +} + +inline PyThreadState *get_thread_state_unchecked() { +#if defined(PYPY_VERSION) + return PyThreadState_GET(); +#elif PY_VERSION_HEX < 0x03000000 + return _PyThreadState_Current; +#elif PY_VERSION_HEX < 0x03050000 + return (PyThreadState*) _Py_atomic_load_relaxed(&_PyThreadState_Current); +#elif PY_VERSION_HEX < 0x03050200 + return (PyThreadState*) _PyThreadState_Current.value; +#else + return _PyThreadState_UncheckedGet(); +#endif +} + +// Forward declarations +inline void keep_alive_impl(handle nurse, handle patient); +inline PyObject *make_new_instance(PyTypeObject *type); + +class type_caster_generic { +public: + PYBIND11_NOINLINE type_caster_generic(const std::type_info &type_info) + : typeinfo(get_type_info(type_info)), cpptype(&type_info) { } + + type_caster_generic(const type_info *typeinfo) + : typeinfo(typeinfo), cpptype(typeinfo ? 
typeinfo->cpptype : nullptr) { } + + bool load(handle src, bool convert) { + return load_impl(src, convert); + } + + PYBIND11_NOINLINE static handle cast(const void *_src, return_value_policy policy, handle parent, + const detail::type_info *tinfo, + void *(*copy_constructor)(const void *), + void *(*move_constructor)(const void *), + const void *existing_holder = nullptr) { + if (!tinfo) // no type info: error will be set already + return handle(); + + void *src = const_cast(_src); + if (src == nullptr) + return none().release(); + + auto it_instances = get_internals().registered_instances.equal_range(src); + for (auto it_i = it_instances.first; it_i != it_instances.second; ++it_i) { + for (auto instance_type : detail::all_type_info(Py_TYPE(it_i->second))) { + if (instance_type && same_type(*instance_type->cpptype, *tinfo->cpptype)) + return handle((PyObject *) it_i->second).inc_ref(); + } + } + + auto inst = reinterpret_steal(make_new_instance(tinfo->type)); + auto wrapper = reinterpret_cast(inst.ptr()); + wrapper->owned = false; + void *&valueptr = values_and_holders(wrapper).begin()->value_ptr(); + + switch (policy) { + case return_value_policy::automatic: + case return_value_policy::take_ownership: + valueptr = src; + wrapper->owned = true; + break; + + case return_value_policy::automatic_reference: + case return_value_policy::reference: + valueptr = src; + wrapper->owned = false; + break; + + case return_value_policy::copy: + if (copy_constructor) + valueptr = copy_constructor(src); + else { +#if defined(NDEBUG) + throw cast_error("return_value_policy = copy, but type is " + "non-copyable! (compile in debug mode for details)"); +#else + std::string type_name(tinfo->cpptype->name()); + detail::clean_type_id(type_name); + throw cast_error("return_value_policy = copy, but type " + + type_name + " is non-copyable!"); +#endif + } + wrapper->owned = true; + break; + + case return_value_policy::move: + if (move_constructor) + valueptr = move_constructor(src); + else if (copy_constructor) + valueptr = copy_constructor(src); + else { +#if defined(NDEBUG) + throw cast_error("return_value_policy = move, but type is neither " + "movable nor copyable! " + "(compile in debug mode for details)"); +#else + std::string type_name(tinfo->cpptype->name()); + detail::clean_type_id(type_name); + throw cast_error("return_value_policy = move, but type " + + type_name + " is neither movable nor copyable!"); +#endif + } + wrapper->owned = true; + break; + + case return_value_policy::reference_internal: + valueptr = src; + wrapper->owned = false; + keep_alive_impl(inst, parent); + break; + + default: + throw cast_error("unhandled return_value_policy: should not happen!"); + } + + tinfo->init_instance(wrapper, existing_holder); + + return inst.release(); + } + + // Base methods for generic caster; there are overridden in copyable_holder_caster + void load_value(value_and_holder &&v_h) { + auto *&vptr = v_h.value_ptr(); + // Lazy allocation for unallocated values: + if (vptr == nullptr) { + auto *type = v_h.type ? 
v_h.type : typeinfo; + if (type->operator_new) { + vptr = type->operator_new(type->type_size); + } else { + #if defined(__cpp_aligned_new) && (!defined(_MSC_VER) || _MSC_VER >= 1912) + if (type->type_align > __STDCPP_DEFAULT_NEW_ALIGNMENT__) + vptr = ::operator new(type->type_size, + std::align_val_t(type->type_align)); + else + #endif + vptr = ::operator new(type->type_size); + } + } + value = vptr; + } + bool try_implicit_casts(handle src, bool convert) { + for (auto &cast : typeinfo->implicit_casts) { + type_caster_generic sub_caster(*cast.first); + if (sub_caster.load(src, convert)) { + value = cast.second(sub_caster.value); + return true; + } + } + return false; + } + bool try_direct_conversions(handle src) { + for (auto &converter : *typeinfo->direct_conversions) { + if (converter(src.ptr(), value)) + return true; + } + return false; + } + void check_holder_compat() {} + + PYBIND11_NOINLINE static void *local_load(PyObject *src, const type_info *ti) { + auto caster = type_caster_generic(ti); + if (caster.load(src, false)) + return caster.value; + return nullptr; + } + + /// Try to load with foreign typeinfo, if available. Used when there is no + /// native typeinfo, or when the native one wasn't able to produce a value. + PYBIND11_NOINLINE bool try_load_foreign_module_local(handle src) { + constexpr auto *local_key = PYBIND11_MODULE_LOCAL_ID; + const auto pytype = src.get_type(); + if (!hasattr(pytype, local_key)) + return false; + + type_info *foreign_typeinfo = reinterpret_borrow(getattr(pytype, local_key)); + // Only consider this foreign loader if actually foreign and is a loader of the correct cpp type + if (foreign_typeinfo->module_local_load == &local_load + || (cpptype && !same_type(*cpptype, *foreign_typeinfo->cpptype))) + return false; + + if (auto result = foreign_typeinfo->module_local_load(src.ptr(), foreign_typeinfo)) { + value = result; + return true; + } + return false; + } + + // Implementation of `load`; this takes the type of `this` so that it can dispatch the relevant + // bits of code between here and copyable_holder_caster where the two classes need different + // logic (without having to resort to virtual inheritance). + template + PYBIND11_NOINLINE bool load_impl(handle src, bool convert) { + if (!src) return false; + if (!typeinfo) return try_load_foreign_module_local(src); + if (src.is_none()) { + // Defer accepting None to other overloads (if we aren't in convert mode): + if (!convert) return false; + value = nullptr; + return true; + } + + auto &this_ = static_cast(*this); + this_.check_holder_compat(); + + PyTypeObject *srctype = Py_TYPE(src.ptr()); + + // Case 1: If src is an exact type match for the target type then we can reinterpret_cast + // the instance's value pointer to the target type: + if (srctype == typeinfo->type) { + this_.load_value(reinterpret_cast(src.ptr())->get_value_and_holder()); + return true; + } + // Case 2: We have a derived class + else if (PyType_IsSubtype(srctype, typeinfo->type)) { + auto &bases = all_type_info(srctype); + bool no_cpp_mi = typeinfo->simple_type; + + // Case 2a: the python type is a Python-inherited derived class that inherits from just + // one simple (no MI) pybind11 class, or is an exact match, so the C++ instance is of + // the right type and we can use reinterpret_cast. 
+ // (This is essentially the same as case 2b, but because not using multiple inheritance + // is extremely common, we handle it specially to avoid the loop iterator and type + // pointer lookup overhead) + if (bases.size() == 1 && (no_cpp_mi || bases.front()->type == typeinfo->type)) { + this_.load_value(reinterpret_cast(src.ptr())->get_value_and_holder()); + return true; + } + // Case 2b: the python type inherits from multiple C++ bases. Check the bases to see if + // we can find an exact match (or, for a simple C++ type, an inherited match); if so, we + // can safely reinterpret_cast to the relevant pointer. + else if (bases.size() > 1) { + for (auto base : bases) { + if (no_cpp_mi ? PyType_IsSubtype(base->type, typeinfo->type) : base->type == typeinfo->type) { + this_.load_value(reinterpret_cast(src.ptr())->get_value_and_holder(base)); + return true; + } + } + } + + // Case 2c: C++ multiple inheritance is involved and we couldn't find an exact type match + // in the registered bases, above, so try implicit casting (needed for proper C++ casting + // when MI is involved). + if (this_.try_implicit_casts(src, convert)) + return true; + } + + // Perform an implicit conversion + if (convert) { + for (auto &converter : typeinfo->implicit_conversions) { + auto temp = reinterpret_steal(converter(src.ptr(), typeinfo->type)); + if (load_impl(temp, false)) { + loader_life_support::add_patient(temp); + return true; + } + } + if (this_.try_direct_conversions(src)) + return true; + } + + // Failed to match local typeinfo. Try again with global. + if (typeinfo->module_local) { + if (auto gtype = get_global_type_info(*typeinfo->cpptype)) { + typeinfo = gtype; + return load(src, false); + } + } + + // Global typeinfo has precedence over foreign module_local + return try_load_foreign_module_local(src); + } + + + // Called to do type lookup and wrap the pointer and type in a pair when a dynamic_cast + // isn't needed or can't be used. If the type is unknown, sets the error and returns a pair + // with .second = nullptr. (p.first = nullptr is not an error: it becomes None). + PYBIND11_NOINLINE static std::pair src_and_type( + const void *src, const std::type_info &cast_type, const std::type_info *rtti_type = nullptr) { + if (auto *tpi = get_type_info(cast_type)) + return {src, const_cast(tpi)}; + + // Not found, set error: + std::string tname = rtti_type ? rtti_type->name() : cast_type.name(); + detail::clean_type_id(tname); + std::string msg = "Unregistered type : " + tname; + PyErr_SetString(PyExc_TypeError, msg.c_str()); + return {nullptr, nullptr}; + } + + const type_info *typeinfo = nullptr; + const std::type_info *cpptype = nullptr; + void *value = nullptr; +}; + +/** + * Determine suitable casting operator for pointer-or-lvalue-casting type casters. The type caster + * needs to provide `operator T*()` and `operator T&()` operators. + * + * If the type supports moving the value away via an `operator T&&() &&` method, it should use + * `movable_cast_op_type` instead. + */ +template +using cast_op_type = + conditional_t>::value, + typename std::add_pointer>::type, + typename std::add_lvalue_reference>::type>; + +/** + * Determine suitable casting operator for a type caster with a movable value. Such a type caster + * needs to provide `operator T*()`, `operator T&()`, and `operator T&&() &&`. The latter will be + * called in appropriate contexts where the value can be moved rather than copied. + * + * These operator are automatically provided when using the PYBIND11_TYPE_CASTER macro. 
+ */ +template +using movable_cast_op_type = + conditional_t::type>::value, + typename std::add_pointer>::type, + conditional_t::value, + typename std::add_rvalue_reference>::type, + typename std::add_lvalue_reference>::type>>; + +// std::is_copy_constructible isn't quite enough: it lets std::vector (and similar) through when +// T is non-copyable, but code containing such a copy constructor fails to actually compile. +template struct is_copy_constructible : std::is_copy_constructible {}; + +// Specialization for types that appear to be copy constructible but also look like stl containers +// (we specifically check for: has `value_type` and `reference` with `reference = value_type&`): if +// so, copy constructability depends on whether the value_type is copy constructible. +template struct is_copy_constructible, + std::is_same, + // Avoid infinite recursion + negation> + >::value>> : is_copy_constructible {}; + +// Likewise for std::pair +// (after C++17 it is mandatory that the copy constructor not exist when the two types aren't themselves +// copy constructible, but this can not be relied upon when T1 or T2 are themselves containers). +template struct is_copy_constructible> + : all_of, is_copy_constructible> {}; + +// The same problems arise with std::is_copy_assignable, so we use the same workaround. +template struct is_copy_assignable : std::is_copy_assignable {}; +template struct is_copy_assignable, + std::is_same + >::value>> : is_copy_assignable {}; +template struct is_copy_assignable> + : all_of, is_copy_assignable> {}; + +NAMESPACE_END(detail) + +// polymorphic_type_hook::get(src, tinfo) determines whether the object pointed +// to by `src` actually is an instance of some class derived from `itype`. +// If so, it sets `tinfo` to point to the std::type_info representing that derived +// type, and returns a pointer to the start of the most-derived object of that type +// (in which `src` is a subobject; this will be the same address as `src` in most +// single inheritance cases). If not, or if `src` is nullptr, it simply returns `src` +// and leaves `tinfo` at its default value of nullptr. +// +// The default polymorphic_type_hook just returns src. A specialization for polymorphic +// types determines the runtime type of the passed object and adjusts the this-pointer +// appropriately via dynamic_cast. This is what enables a C++ Animal* to appear +// to Python as a Dog (if Dog inherits from Animal, Animal is polymorphic, Dog is +// registered with pybind11, and this Animal is in fact a Dog). +// +// You may specialize polymorphic_type_hook yourself for types that want to appear +// polymorphic to Python but do not use C++ RTTI. (This is a not uncommon pattern +// in performance-sensitive applications, used most notably in LLVM.) +// +// polymorphic_type_hook_base allows users to specialize polymorphic_type_hook with +// std::enable_if. User provided specializations will always have higher priority than +// the default implementation and specialization provided in polymorphic_type_hook_base. +template +struct polymorphic_type_hook_base +{ + static const void *get(const itype *src, const std::type_info*&) { return src; } +}; +template +struct polymorphic_type_hook_base::value>> +{ + static const void *get(const itype *src, const std::type_info*& type) { + type = src ? 
&typeid(*src) : nullptr; + return dynamic_cast(src); + } +}; +template +struct polymorphic_type_hook : public polymorphic_type_hook_base {}; + +NAMESPACE_BEGIN(detail) + +/// Generic type caster for objects stored on the heap +template class type_caster_base : public type_caster_generic { + using itype = intrinsic_t; + +public: + static constexpr auto name = _(); + + type_caster_base() : type_caster_base(typeid(type)) { } + explicit type_caster_base(const std::type_info &info) : type_caster_generic(info) { } + + static handle cast(const itype &src, return_value_policy policy, handle parent) { + if (policy == return_value_policy::automatic || policy == return_value_policy::automatic_reference) + policy = return_value_policy::copy; + return cast(&src, policy, parent); + } + + static handle cast(itype &&src, return_value_policy, handle parent) { + return cast(&src, return_value_policy::move, parent); + } + + // Returns a (pointer, type_info) pair taking care of necessary type lookup for a + // polymorphic type (using RTTI by default, but can be overridden by specializing + // polymorphic_type_hook). If the instance isn't derived, returns the base version. + static std::pair src_and_type(const itype *src) { + auto &cast_type = typeid(itype); + const std::type_info *instance_type = nullptr; + const void *vsrc = polymorphic_type_hook::get(src, instance_type); + if (instance_type && !same_type(cast_type, *instance_type)) { + // This is a base pointer to a derived type. If the derived type is registered + // with pybind11, we want to make the full derived object available. + // In the typical case where itype is polymorphic, we get the correct + // derived pointer (which may be != base pointer) by a dynamic_cast to + // most derived type. If itype is not polymorphic, we won't get here + // except via a user-provided specialization of polymorphic_type_hook, + // and the user has promised that no this-pointer adjustment is + // required in that case, so it's OK to use static_cast. + if (const auto *tpi = get_type_info(*instance_type)) + return {vsrc, tpi}; + } + // Otherwise we have either a nullptr, an `itype` pointer, or an unknown derived pointer, so + // don't do a cast + return type_caster_generic::src_and_type(src, cast_type, instance_type); + } + + static handle cast(const itype *src, return_value_policy policy, handle parent) { + auto st = src_and_type(src); + return type_caster_generic::cast( + st.first, policy, parent, st.second, + make_copy_constructor(src), make_move_constructor(src)); + } + + static handle cast_holder(const itype *src, const void *holder) { + auto st = src_and_type(src); + return type_caster_generic::cast( + st.first, return_value_policy::take_ownership, {}, st.second, + nullptr, nullptr, holder); + } + + template using cast_op_type = detail::cast_op_type; + + operator itype*() { return (type *) value; } + operator itype&() { if (!value) throw reference_cast_error(); return *((itype *) value); } + +protected: + using Constructor = void *(*)(const void *); + + /* Only enabled when the types are {copy,move}-constructible *and* when the type + does not have a private operator new implementation. 
*/ + template ::value>> + static auto make_copy_constructor(const T *x) -> decltype(new T(*x), Constructor{}) { + return [](const void *arg) -> void * { + return new T(*reinterpret_cast(arg)); + }; + } + + template ::value>> + static auto make_move_constructor(const T *x) -> decltype(new T(std::move(*const_cast(x))), Constructor{}) { + return [](const void *arg) -> void * { + return new T(std::move(*const_cast(reinterpret_cast(arg)))); + }; + } + + static Constructor make_copy_constructor(...) { return nullptr; } + static Constructor make_move_constructor(...) { return nullptr; } +}; + +template class type_caster : public type_caster_base { }; +template using make_caster = type_caster>; + +// Shortcut for calling a caster's `cast_op_type` cast operator for casting a type_caster to a T +template typename make_caster::template cast_op_type cast_op(make_caster &caster) { + return caster.operator typename make_caster::template cast_op_type(); +} +template typename make_caster::template cast_op_type::type> +cast_op(make_caster &&caster) { + return std::move(caster).operator + typename make_caster::template cast_op_type::type>(); +} + +template class type_caster> { +private: + using caster_t = make_caster; + caster_t subcaster; + using subcaster_cast_op_type = typename caster_t::template cast_op_type; + static_assert(std::is_same::type &, subcaster_cast_op_type>::value, + "std::reference_wrapper caster requires T to have a caster with an `T &` operator"); +public: + bool load(handle src, bool convert) { return subcaster.load(src, convert); } + static constexpr auto name = caster_t::name; + static handle cast(const std::reference_wrapper &src, return_value_policy policy, handle parent) { + // It is definitely wrong to take ownership of this pointer, so mask that rvp + if (policy == return_value_policy::take_ownership || policy == return_value_policy::automatic) + policy = return_value_policy::automatic_reference; + return caster_t::cast(&src.get(), policy, parent); + } + template using cast_op_type = std::reference_wrapper; + operator std::reference_wrapper() { return subcaster.operator subcaster_cast_op_type&(); } +}; + +#define PYBIND11_TYPE_CASTER(type, py_name) \ + protected: \ + type value; \ + public: \ + static constexpr auto name = py_name; \ + template >::value, int> = 0> \ + static handle cast(T_ *src, return_value_policy policy, handle parent) { \ + if (!src) return none().release(); \ + if (policy == return_value_policy::take_ownership) { \ + auto h = cast(std::move(*src), policy, parent); delete src; return h; \ + } else { \ + return cast(*src, policy, parent); \ + } \ + } \ + operator type*() { return &value; } \ + operator type&() { return value; } \ + operator type&&() && { return std::move(value); } \ + template using cast_op_type = pybind11::detail::movable_cast_op_type + + +template using is_std_char_type = any_of< + std::is_same, /* std::string */ +#if defined(PYBIND11_HAS_U8STRING) + std::is_same, /* std::u8string */ +#endif + std::is_same, /* std::u16string */ + std::is_same, /* std::u32string */ + std::is_same /* std::wstring */ +>; + +template +struct type_caster::value && !is_std_char_type::value>> { + using _py_type_0 = conditional_t; + using _py_type_1 = conditional_t::value, _py_type_0, typename std::make_unsigned<_py_type_0>::type>; + using py_type = conditional_t::value, double, _py_type_1>; +public: + + bool load(handle src, bool convert) { + py_type py_value; + + if (!src) + return false; + + if (std::is_floating_point::value) { + if (convert || 
PyFloat_Check(src.ptr())) + py_value = (py_type) PyFloat_AsDouble(src.ptr()); + else + return false; + } else if (PyFloat_Check(src.ptr())) { + return false; + } else if (std::is_unsigned::value) { + py_value = as_unsigned(src.ptr()); + } else { // signed integer: + py_value = sizeof(T) <= sizeof(long) + ? (py_type) PyLong_AsLong(src.ptr()) + : (py_type) PYBIND11_LONG_AS_LONGLONG(src.ptr()); + } + + bool py_err = py_value == (py_type) -1 && PyErr_Occurred(); + + // Protect std::numeric_limits::min/max with parentheses + if (py_err || (std::is_integral::value && sizeof(py_type) != sizeof(T) && + (py_value < (py_type) (std::numeric_limits::min)() || + py_value > (py_type) (std::numeric_limits::max)()))) { + bool type_error = py_err && PyErr_ExceptionMatches( +#if PY_VERSION_HEX < 0x03000000 && !defined(PYPY_VERSION) + PyExc_SystemError +#else + PyExc_TypeError +#endif + ); + PyErr_Clear(); + if (type_error && convert && PyNumber_Check(src.ptr())) { + auto tmp = reinterpret_steal(std::is_floating_point::value + ? PyNumber_Float(src.ptr()) + : PyNumber_Long(src.ptr())); + PyErr_Clear(); + return load(tmp, false); + } + return false; + } + + value = (T) py_value; + return true; + } + + template + static typename std::enable_if::value, handle>::type + cast(U src, return_value_policy /* policy */, handle /* parent */) { + return PyFloat_FromDouble((double) src); + } + + template + static typename std::enable_if::value && std::is_signed::value && (sizeof(U) <= sizeof(long)), handle>::type + cast(U src, return_value_policy /* policy */, handle /* parent */) { + return PYBIND11_LONG_FROM_SIGNED((long) src); + } + + template + static typename std::enable_if::value && std::is_unsigned::value && (sizeof(U) <= sizeof(unsigned long)), handle>::type + cast(U src, return_value_policy /* policy */, handle /* parent */) { + return PYBIND11_LONG_FROM_UNSIGNED((unsigned long) src); + } + + template + static typename std::enable_if::value && std::is_signed::value && (sizeof(U) > sizeof(long)), handle>::type + cast(U src, return_value_policy /* policy */, handle /* parent */) { + return PyLong_FromLongLong((long long) src); + } + + template + static typename std::enable_if::value && std::is_unsigned::value && (sizeof(U) > sizeof(unsigned long)), handle>::type + cast(U src, return_value_policy /* policy */, handle /* parent */) { + return PyLong_FromUnsignedLongLong((unsigned long long) src); + } + + PYBIND11_TYPE_CASTER(T, _::value>("int", "float")); +}; + +template struct void_caster { +public: + bool load(handle src, bool) { + if (src && src.is_none()) + return true; + return false; + } + static handle cast(T, return_value_policy /* policy */, handle /* parent */) { + return none().inc_ref(); + } + PYBIND11_TYPE_CASTER(T, _("None")); +}; + +template <> class type_caster : public void_caster {}; + +template <> class type_caster : public type_caster { +public: + using type_caster::cast; + + bool load(handle h, bool) { + if (!h) { + return false; + } else if (h.is_none()) { + value = nullptr; + return true; + } + + /* Check if this is a capsule */ + if (isinstance(h)) { + value = reinterpret_borrow(h); + return true; + } + + /* Check if this is a C++ type */ + auto &bases = all_type_info((PyTypeObject *) h.get_type().ptr()); + if (bases.size() == 1) { // Only allowing loading from a single-value type + value = values_and_holders(reinterpret_cast(h.ptr())).begin()->value_ptr(); + return true; + } + + /* Fail */ + return false; + } + + static handle cast(const void *ptr, return_value_policy /* policy */, handle /* 
parent */) { + if (ptr) + return capsule(ptr).release(); + else + return none().inc_ref(); + } + + template using cast_op_type = void*&; + operator void *&() { return value; } + static constexpr auto name = _("capsule"); +private: + void *value = nullptr; +}; + +template <> class type_caster : public void_caster { }; + +template <> class type_caster { +public: + bool load(handle src, bool convert) { + if (!src) return false; + else if (src.ptr() == Py_True) { value = true; return true; } + else if (src.ptr() == Py_False) { value = false; return true; } + else if (convert || !strcmp("numpy.bool_", Py_TYPE(src.ptr())->tp_name)) { + // (allow non-implicit conversion for numpy booleans) + + Py_ssize_t res = -1; + if (src.is_none()) { + res = 0; // None is implicitly converted to False + } + #if defined(PYPY_VERSION) + // On PyPy, check that "__bool__" (or "__nonzero__" on Python 2.7) attr exists + else if (hasattr(src, PYBIND11_BOOL_ATTR)) { + res = PyObject_IsTrue(src.ptr()); + } + #else + // Alternate approach for CPython: this does the same as the above, but optimized + // using the CPython API so as to avoid an unneeded attribute lookup. + else if (auto tp_as_number = src.ptr()->ob_type->tp_as_number) { + if (PYBIND11_NB_BOOL(tp_as_number)) { + res = (*PYBIND11_NB_BOOL(tp_as_number))(src.ptr()); + } + } + #endif + if (res == 0 || res == 1) { + value = (bool) res; + return true; + } else { + PyErr_Clear(); + } + } + return false; + } + static handle cast(bool src, return_value_policy /* policy */, handle /* parent */) { + return handle(src ? Py_True : Py_False).inc_ref(); + } + PYBIND11_TYPE_CASTER(bool, _("bool")); +}; + +// Helper class for UTF-{8,16,32} C++ stl strings: +template struct string_caster { + using CharT = typename StringType::value_type; + + // Simplify life by being able to assume standard char sizes (the standard only guarantees + // minimums, but Python requires exact sizes) + static_assert(!std::is_same::value || sizeof(CharT) == 1, "Unsupported char size != 1"); +#if defined(PYBIND11_HAS_U8STRING) + static_assert(!std::is_same::value || sizeof(CharT) == 1, "Unsupported char8_t size != 1"); +#endif + static_assert(!std::is_same::value || sizeof(CharT) == 2, "Unsupported char16_t size != 2"); + static_assert(!std::is_same::value || sizeof(CharT) == 4, "Unsupported char32_t size != 4"); + // wchar_t can be either 16 bits (Windows) or 32 (everywhere else) + static_assert(!std::is_same::value || sizeof(CharT) == 2 || sizeof(CharT) == 4, + "Unsupported wchar_t size != 2/4"); + static constexpr size_t UTF_N = 8 * sizeof(CharT); + + bool load(handle src, bool) { +#if PY_MAJOR_VERSION < 3 + object temp; +#endif + handle load_src = src; + if (!src) { + return false; + } else if (!PyUnicode_Check(load_src.ptr())) { +#if PY_MAJOR_VERSION >= 3 + return load_bytes(load_src); +#else + if (std::is_same::value) { + return load_bytes(load_src); + } + + // The below is a guaranteed failure in Python 3 when PyUnicode_Check returns false + if (!PYBIND11_BYTES_CHECK(load_src.ptr())) + return false; + + temp = reinterpret_steal(PyUnicode_FromObject(load_src.ptr())); + if (!temp) { PyErr_Clear(); return false; } + load_src = temp; +#endif + } + + object utfNbytes = reinterpret_steal(PyUnicode_AsEncodedString( + load_src.ptr(), UTF_N == 8 ? "utf-8" : UTF_N == 16 ? 
"utf-16" : "utf-32", nullptr)); + if (!utfNbytes) { PyErr_Clear(); return false; } + + const CharT *buffer = reinterpret_cast(PYBIND11_BYTES_AS_STRING(utfNbytes.ptr())); + size_t length = (size_t) PYBIND11_BYTES_SIZE(utfNbytes.ptr()) / sizeof(CharT); + if (UTF_N > 8) { buffer++; length--; } // Skip BOM for UTF-16/32 + value = StringType(buffer, length); + + // If we're loading a string_view we need to keep the encoded Python object alive: + if (IsView) + loader_life_support::add_patient(utfNbytes); + + return true; + } + + static handle cast(const StringType &src, return_value_policy /* policy */, handle /* parent */) { + const char *buffer = reinterpret_cast(src.data()); + ssize_t nbytes = ssize_t(src.size() * sizeof(CharT)); + handle s = decode_utfN(buffer, nbytes); + if (!s) throw error_already_set(); + return s; + } + + PYBIND11_TYPE_CASTER(StringType, _(PYBIND11_STRING_NAME)); + +private: + static handle decode_utfN(const char *buffer, ssize_t nbytes) { +#if !defined(PYPY_VERSION) + return + UTF_N == 8 ? PyUnicode_DecodeUTF8(buffer, nbytes, nullptr) : + UTF_N == 16 ? PyUnicode_DecodeUTF16(buffer, nbytes, nullptr, nullptr) : + PyUnicode_DecodeUTF32(buffer, nbytes, nullptr, nullptr); +#else + // PyPy seems to have multiple problems related to PyUnicode_UTF*: the UTF8 version + // sometimes segfaults for unknown reasons, while the UTF16 and 32 versions require a + // non-const char * arguments, which is also a nuisance, so bypass the whole thing by just + // passing the encoding as a string value, which works properly: + return PyUnicode_Decode(buffer, nbytes, UTF_N == 8 ? "utf-8" : UTF_N == 16 ? "utf-16" : "utf-32", nullptr); +#endif + } + + // When loading into a std::string or char*, accept a bytes object as-is (i.e. + // without any encoding/decoding attempt). For other C++ char sizes this is a no-op. + // which supports loading a unicode from a str, doesn't take this path. + template + bool load_bytes(enable_if_t::value, handle> src) { + if (PYBIND11_BYTES_CHECK(src.ptr())) { + // We were passed a Python 3 raw bytes; accept it into a std::string or char* + // without any encoding attempt. + const char *bytes = PYBIND11_BYTES_AS_STRING(src.ptr()); + if (bytes) { + value = StringType(bytes, (size_t) PYBIND11_BYTES_SIZE(src.ptr())); + return true; + } + } + + return false; + } + + template + bool load_bytes(enable_if_t::value, handle>) { return false; } +}; + +template +struct type_caster, enable_if_t::value>> + : string_caster> {}; + +#ifdef PYBIND11_HAS_STRING_VIEW +template +struct type_caster, enable_if_t::value>> + : string_caster, true> {}; +#endif + +// Type caster for C-style strings. We basically use a std::string type caster, but also add the +// ability to use None as a nullptr char* (which the string caster doesn't allow). 
+template struct type_caster::value>> { + using StringType = std::basic_string; + using StringCaster = type_caster; + StringCaster str_caster; + bool none = false; + CharT one_char = 0; +public: + bool load(handle src, bool convert) { + if (!src) return false; + if (src.is_none()) { + // Defer accepting None to other overloads (if we aren't in convert mode): + if (!convert) return false; + none = true; + return true; + } + return str_caster.load(src, convert); + } + + static handle cast(const CharT *src, return_value_policy policy, handle parent) { + if (src == nullptr) return pybind11::none().inc_ref(); + return StringCaster::cast(StringType(src), policy, parent); + } + + static handle cast(CharT src, return_value_policy policy, handle parent) { + if (std::is_same::value) { + handle s = PyUnicode_DecodeLatin1((const char *) &src, 1, nullptr); + if (!s) throw error_already_set(); + return s; + } + return StringCaster::cast(StringType(1, src), policy, parent); + } + + operator CharT*() { return none ? nullptr : const_cast(static_cast(str_caster).c_str()); } + operator CharT&() { + if (none) + throw value_error("Cannot convert None to a character"); + + auto &value = static_cast(str_caster); + size_t str_len = value.size(); + if (str_len == 0) + throw value_error("Cannot convert empty string to a character"); + + // If we're in UTF-8 mode, we have two possible failures: one for a unicode character that + // is too high, and one for multiple unicode characters (caught later), so we need to figure + // out how long the first encoded character is in bytes to distinguish between these two + // errors. We also allow want to allow unicode characters U+0080 through U+00FF, as those + // can fit into a single char value. + if (StringCaster::UTF_N == 8 && str_len > 1 && str_len <= 4) { + unsigned char v0 = static_cast(value[0]); + size_t char0_bytes = !(v0 & 0x80) ? 1 : // low bits only: 0-127 + (v0 & 0xE0) == 0xC0 ? 2 : // 0b110xxxxx - start of 2-byte sequence + (v0 & 0xF0) == 0xE0 ? 3 : // 0b1110xxxx - start of 3-byte sequence + 4; // 0b11110xxx - start of 4-byte sequence + + if (char0_bytes == str_len) { + // If we have a 128-255 value, we can decode it into a single char: + if (char0_bytes == 2 && (v0 & 0xFC) == 0xC0) { // 0x110000xx 0x10xxxxxx + one_char = static_cast(((v0 & 3) << 6) + (static_cast(value[1]) & 0x3F)); + return one_char; + } + // Otherwise we have a single character, but it's > U+00FF + throw value_error("Character code point not in range(0x100)"); + } + } + + // UTF-16 is much easier: we can only have a surrogate pair for values above U+FFFF, thus a + // surrogate pair with total length 2 instantly indicates a range error (but not a "your + // string was too long" error). + else if (StringCaster::UTF_N == 16 && str_len == 2) { + one_char = static_cast(value[0]); + if (one_char >= 0xD800 && one_char < 0xE000) + throw value_error("Character code point not in range(0x10000)"); + } + + if (str_len != 1) + throw value_error("Expected a character, but multi-character string found"); + + one_char = value[0]; + return one_char; + } + + static constexpr auto name = _(PYBIND11_STRING_NAME); + template using cast_op_type = pybind11::detail::cast_op_type<_T>; +}; + +// Base implementation for std::tuple and std::pair +template class Tuple, typename... 
Ts> class tuple_caster { + using type = Tuple; + static constexpr auto size = sizeof...(Ts); + using indices = make_index_sequence; +public: + + bool load(handle src, bool convert) { + if (!isinstance(src)) + return false; + const auto seq = reinterpret_borrow(src); + if (seq.size() != size) + return false; + return load_impl(seq, convert, indices{}); + } + + template + static handle cast(T &&src, return_value_policy policy, handle parent) { + return cast_impl(std::forward(src), policy, parent, indices{}); + } + + static constexpr auto name = _("Tuple[") + concat(make_caster::name...) + _("]"); + + template using cast_op_type = type; + + operator type() & { return implicit_cast(indices{}); } + operator type() && { return std::move(*this).implicit_cast(indices{}); } + +protected: + template + type implicit_cast(index_sequence) & { return type(cast_op(std::get(subcasters))...); } + template + type implicit_cast(index_sequence) && { return type(cast_op(std::move(std::get(subcasters)))...); } + + static constexpr bool load_impl(const sequence &, bool, index_sequence<>) { return true; } + + template + bool load_impl(const sequence &seq, bool convert, index_sequence) { +#ifdef __cpp_fold_expressions + if ((... || !std::get(subcasters).load(seq[Is], convert))) + return false; +#else + for (bool r : {std::get(subcasters).load(seq[Is], convert)...}) + if (!r) + return false; +#endif + return true; + } + + /* Implementation: Convert a C++ tuple into a Python tuple */ + template + static handle cast_impl(T &&src, return_value_policy policy, handle parent, index_sequence) { + std::array entries{{ + reinterpret_steal(make_caster::cast(std::get(std::forward(src)), policy, parent))... + }}; + for (const auto &entry: entries) + if (!entry) + return handle(); + tuple result(size); + int counter = 0; + for (auto & entry: entries) + PyTuple_SET_ITEM(result.ptr(), counter++, entry.release().ptr()); + return result.release(); + } + + Tuple...> subcasters; +}; + +template class type_caster> + : public tuple_caster {}; + +template class type_caster> + : public tuple_caster {}; + +/// Helper class which abstracts away certain actions. Users can provide specializations for +/// custom holders, but it's only necessary if the type has a non-standard interface. +template +struct holder_helper { + static auto get(const T &p) -> decltype(p.get()) { return p.get(); } +}; + +/// Type caster for holder types like std::shared_ptr, etc. 
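+///
+/// std::shared_ptr is covered by the specialization further below; a custom holder
+/// is normally registered through the PYBIND11_DECLARE_HOLDER_TYPE macro defined
+/// later in this header (sketch; `MyHolder` is a hypothetical holder template):
+///
+///     PYBIND11_DECLARE_HOLDER_TYPE(T, MyHolder<T>);
+///     py::class_<Widget, MyHolder<Widget>>(m, "Widget");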
+template +struct copyable_holder_caster : public type_caster_base { +public: + using base = type_caster_base; + static_assert(std::is_base_of>::value, + "Holder classes are only supported for custom types"); + using base::base; + using base::cast; + using base::typeinfo; + using base::value; + + bool load(handle src, bool convert) { + return base::template load_impl>(src, convert); + } + + explicit operator type*() { return this->value; } + explicit operator type&() { return *(this->value); } + explicit operator holder_type*() { return std::addressof(holder); } + + // Workaround for Intel compiler bug + // see pybind11 issue 94 + #if defined(__ICC) || defined(__INTEL_COMPILER) + operator holder_type&() { return holder; } + #else + explicit operator holder_type&() { return holder; } + #endif + + static handle cast(const holder_type &src, return_value_policy, handle) { + const auto *ptr = holder_helper::get(src); + return type_caster_base::cast_holder(ptr, &src); + } + +protected: + friend class type_caster_generic; + void check_holder_compat() { + if (typeinfo->default_holder) + throw cast_error("Unable to load a custom holder type from a default-holder instance"); + } + + bool load_value(value_and_holder &&v_h) { + if (v_h.holder_constructed()) { + value = v_h.value_ptr(); + holder = v_h.template holder(); + return true; + } else { + throw cast_error("Unable to cast from non-held to held instance (T& to Holder) " +#if defined(NDEBUG) + "(compile in debug mode for type information)"); +#else + "of type '" + type_id() + "''"); +#endif + } + } + + template ::value, int> = 0> + bool try_implicit_casts(handle, bool) { return false; } + + template ::value, int> = 0> + bool try_implicit_casts(handle src, bool convert) { + for (auto &cast : typeinfo->implicit_casts) { + copyable_holder_caster sub_caster(*cast.first); + if (sub_caster.load(src, convert)) { + value = cast.second(sub_caster.value); + holder = holder_type(sub_caster.holder, (type *) value); + return true; + } + } + return false; + } + + static bool try_direct_conversions(handle) { return false; } + + + holder_type holder; +}; + +/// Specialize for the common std::shared_ptr, so users don't need to +template +class type_caster> : public copyable_holder_caster> { }; + +template +struct move_only_holder_caster { + static_assert(std::is_base_of, type_caster>::value, + "Holder classes are only supported for custom types"); + + static handle cast(holder_type &&src, return_value_policy, handle) { + auto *ptr = holder_helper::get(src); + return type_caster_base::cast_holder(ptr, std::addressof(src)); + } + static constexpr auto name = type_caster_base::name; +}; + +template +class type_caster> + : public move_only_holder_caster> { }; + +template +using type_caster_holder = conditional_t::value, + copyable_holder_caster, + move_only_holder_caster>; + +template struct always_construct_holder { static constexpr bool value = Value; }; + +/// Create a specialization for custom holder types (silently ignores std::shared_ptr) +#define PYBIND11_DECLARE_HOLDER_TYPE(type, holder_type, ...) 
\ + namespace pybind11 { namespace detail { \ + template \ + struct always_construct_holder : always_construct_holder { }; \ + template \ + class type_caster::value>> \ + : public type_caster_holder { }; \ + }} + +// PYBIND11_DECLARE_HOLDER_TYPE holder types: +template struct is_holder_type : + std::is_base_of, detail::type_caster> {}; +// Specialization for always-supported unique_ptr holders: +template struct is_holder_type> : + std::true_type {}; + +template struct handle_type_name { static constexpr auto name = _(); }; +template <> struct handle_type_name { static constexpr auto name = _(PYBIND11_BYTES_NAME); }; +template <> struct handle_type_name { static constexpr auto name = _("*args"); }; +template <> struct handle_type_name { static constexpr auto name = _("**kwargs"); }; + +template +struct pyobject_caster { + template ::value, int> = 0> + bool load(handle src, bool /* convert */) { value = src; return static_cast(value); } + + template ::value, int> = 0> + bool load(handle src, bool /* convert */) { + if (!isinstance(src)) + return false; + value = reinterpret_borrow(src); + return true; + } + + static handle cast(const handle &src, return_value_policy /* policy */, handle /* parent */) { + return src.inc_ref(); + } + PYBIND11_TYPE_CASTER(type, handle_type_name::name); +}; + +template +class type_caster::value>> : public pyobject_caster { }; + +// Our conditions for enabling moving are quite restrictive: +// At compile time: +// - T needs to be a non-const, non-pointer, non-reference type +// - type_caster::operator T&() must exist +// - the type must be move constructible (obviously) +// At run-time: +// - if the type is non-copy-constructible, the object must be the sole owner of the type (i.e. it +// must have ref_count() == 1)h +// If any of the above are not satisfied, we fall back to copying. +template using move_is_plain_type = satisfies_none_of; +template struct move_always : std::false_type {}; +template struct move_always, + negation>, + std::is_move_constructible, + std::is_same>().operator T&()), T&> +>::value>> : std::true_type {}; +template struct move_if_unreferenced : std::false_type {}; +template struct move_if_unreferenced, + negation>, + std::is_move_constructible, + std::is_same>().operator T&()), T&> +>::value>> : std::true_type {}; +template using move_never = none_of, move_if_unreferenced>; + +// Detect whether returning a `type` from a cast on type's type_caster is going to result in a +// reference or pointer to a local variable of the type_caster. Basically, only +// non-reference/pointer `type`s and reference/pointers from a type_caster_generic are safe; +// everything else returns a reference/pointer to a local variable. +template using cast_is_temporary_value_reference = bool_constant< + (std::is_reference::value || std::is_pointer::value) && + !std::is_base_of>::value && + !std::is_same, void>::value +>; + +// When a value returned from a C++ function is being cast back to Python, we almost always want to +// force `policy = move`, regardless of the return value policy the function/method was declared +// with. +template struct return_value_policy_override { + static return_value_policy policy(return_value_policy p) { return p; } +}; + +template struct return_value_policy_override>::value, void>> { + static return_value_policy policy(return_value_policy p) { + return !std::is_lvalue_reference::value && + !std::is_pointer::value + ? 
return_value_policy::move : p; + } +}; + +// Basic python -> C++ casting; throws if casting fails +template type_caster &load_type(type_caster &conv, const handle &handle) { + if (!conv.load(handle, true)) { +#if defined(NDEBUG) + throw cast_error("Unable to cast Python instance to C++ type (compile in debug mode for details)"); +#else + throw cast_error("Unable to cast Python instance of type " + + (std::string) str(handle.get_type()) + " to C++ type '" + type_id() + "'"); +#endif + } + return conv; +} +// Wrapper around the above that also constructs and returns a type_caster +template make_caster load_type(const handle &handle) { + make_caster conv; + load_type(conv, handle); + return conv; +} + +NAMESPACE_END(detail) + +// pytype -> C++ type +template ::value, int> = 0> +T cast(const handle &handle) { + using namespace detail; + static_assert(!cast_is_temporary_value_reference::value, + "Unable to cast type to reference: value is local to type caster"); + return cast_op(load_type(handle)); +} + +// pytype -> pytype (calls converting constructor) +template ::value, int> = 0> +T cast(const handle &handle) { return T(reinterpret_borrow(handle)); } + +// C++ type -> py::object +template ::value, int> = 0> +object cast(const T &value, return_value_policy policy = return_value_policy::automatic_reference, + handle parent = handle()) { + if (policy == return_value_policy::automatic) + policy = std::is_pointer::value ? return_value_policy::take_ownership : return_value_policy::copy; + else if (policy == return_value_policy::automatic_reference) + policy = std::is_pointer::value ? return_value_policy::reference : return_value_policy::copy; + return reinterpret_steal(detail::make_caster::cast(value, policy, parent)); +} + +template T handle::cast() const { return pybind11::cast(*this); } +template <> inline void handle::cast() const { return; } + +template +detail::enable_if_t::value, T> move(object &&obj) { + if (obj.ref_count() > 1) +#if defined(NDEBUG) + throw cast_error("Unable to cast Python instance to C++ rvalue: instance has multiple references" + " (compile in debug mode for details)"); +#else + throw cast_error("Unable to move from Python " + (std::string) str(obj.get_type()) + + " instance to C++ " + type_id() + " instance: instance has multiple references"); +#endif + + // Move into a temporary and return that, because the reference may be a local value of `conv` + T ret = std::move(detail::load_type(obj).operator T&()); + return ret; +} + +// Calling cast() on an rvalue calls pybind::cast with the object rvalue, which does: +// - If we have to move (because T has no copy constructor), do it. This will fail if the moved +// object has multiple references, but trying to copy will fail to compile. +// - If both movable and copyable, check ref count: if 1, move; otherwise copy +// - Otherwise (not movable), copy. 
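+//
+// Illustrative sketch (hypothetical bound type `Widget`): if `o` is the sole
+// py::object reference to a Widget instance, then
+//
+//     Widget w = std::move(o).cast<Widget>();   // moves, since o.ref_count() == 1
+//
+// whereas a second live reference to the same instance makes the identical call
+// fall back to a copy (a move-only type would instead throw, as move() above shows).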
+template detail::enable_if_t::value, T> cast(object &&object) { + return move(std::move(object)); +} +template detail::enable_if_t::value, T> cast(object &&object) { + if (object.ref_count() > 1) + return cast(object); + else + return move(std::move(object)); +} +template detail::enable_if_t::value, T> cast(object &&object) { + return cast(object); +} + +template T object::cast() const & { return pybind11::cast(*this); } +template T object::cast() && { return pybind11::cast(std::move(*this)); } +template <> inline void object::cast() const & { return; } +template <> inline void object::cast() && { return; } + +NAMESPACE_BEGIN(detail) + +// Declared in pytypes.h: +template ::value, int>> +object object_or_cast(T &&o) { return pybind11::cast(std::forward(o)); } + +struct overload_unused {}; // Placeholder type for the unneeded (and dead code) static variable in the OVERLOAD_INT macro +template using overload_caster_t = conditional_t< + cast_is_temporary_value_reference::value, make_caster, overload_unused>; + +// Trampoline use: for reference/pointer types to value-converted values, we do a value cast, then +// store the result in the given variable. For other types, this is a no-op. +template enable_if_t::value, T> cast_ref(object &&o, make_caster &caster) { + return cast_op(load_type(caster, o)); +} +template enable_if_t::value, T> cast_ref(object &&, overload_unused &) { + pybind11_fail("Internal error: cast_ref fallback invoked"); } + +// Trampoline use: Having a pybind11::cast with an invalid reference type is going to static_assert, even +// though if it's in dead code, so we provide a "trampoline" to pybind11::cast that only does anything in +// cases where pybind11::cast is valid. +template enable_if_t::value, T> cast_safe(object &&o) { + return pybind11::cast(std::move(o)); } +template enable_if_t::value, T> cast_safe(object &&) { + pybind11_fail("Internal error: cast_safe fallback invoked"); } +template <> inline void cast_safe(object &&) {} + +NAMESPACE_END(detail) + +template +tuple make_tuple() { return tuple(0); } + +template tuple make_tuple(Args&&... args_) { + constexpr size_t size = sizeof...(Args); + std::array args { + { reinterpret_steal(detail::make_caster::cast( + std::forward(args_), policy, nullptr))... } + }; + for (size_t i = 0; i < args.size(); i++) { + if (!args[i]) { +#if defined(NDEBUG) + throw cast_error("make_tuple(): unable to convert arguments to Python object (compile in debug mode for details)"); +#else + std::array argtypes { {type_id()...} }; + throw cast_error("make_tuple(): unable to convert argument of type '" + + argtypes[i] + "' to Python object"); +#endif + } + } + tuple result(size); + int counter = 0; + for (auto &arg_value : args) + PyTuple_SET_ITEM(result.ptr(), counter++, arg_value.release().ptr()); + return result; +} + +/// \ingroup annotations +/// Annotation for arguments +struct arg { + /// Constructs an argument with the name of the argument; if null or omitted, this is a positional argument. + constexpr explicit arg(const char *name = nullptr) : name(name), flag_noconvert(false), flag_none(true) { } + /// Assign a value to this argument + template arg_v operator=(T &&value) const; + /// Indicate that the type should not be converted in the type caster + arg &noconvert(bool flag = true) { flag_noconvert = flag; return *this; } + /// Indicates that the argument should/shouldn't allow None (e.g. 
for nullable pointer args) + arg &none(bool flag = true) { flag_none = flag; return *this; } + + const char *name; ///< If non-null, this is a named kwargs argument + bool flag_noconvert : 1; ///< If set, do not allow conversion (requires a supporting type caster!) + bool flag_none : 1; ///< If set (the default), allow None to be passed to this argument +}; + +/// \ingroup annotations +/// Annotation for arguments with values +struct arg_v : arg { +private: + template + arg_v(arg &&base, T &&x, const char *descr = nullptr) + : arg(base), + value(reinterpret_steal( + detail::make_caster::cast(x, return_value_policy::automatic, {}) + )), + descr(descr) +#if !defined(NDEBUG) + , type(type_id()) +#endif + { } + +public: + /// Direct construction with name, default, and description + template + arg_v(const char *name, T &&x, const char *descr = nullptr) + : arg_v(arg(name), std::forward(x), descr) { } + + /// Called internally when invoking `py::arg("a") = value` + template + arg_v(const arg &base, T &&x, const char *descr = nullptr) + : arg_v(arg(base), std::forward(x), descr) { } + + /// Same as `arg::noconvert()`, but returns *this as arg_v&, not arg& + arg_v &noconvert(bool flag = true) { arg::noconvert(flag); return *this; } + + /// Same as `arg::nonone()`, but returns *this as arg_v&, not arg& + arg_v &none(bool flag = true) { arg::none(flag); return *this; } + + /// The default value + object value; + /// The (optional) description of the default value + const char *descr; +#if !defined(NDEBUG) + /// The C++ type name of the default value (only available when compiled in debug mode) + std::string type; +#endif +}; + +template +arg_v arg::operator=(T &&value) const { return {std::move(*this), std::forward(value)}; } + +/// Alias for backward compatibility -- to be removed in version 2.0 +template using arg_t = arg_v; + +inline namespace literals { +/** \rst + String literal version of `arg` + \endrst */ +constexpr arg operator"" _a(const char *name, size_t) { return arg(name); } +} + +NAMESPACE_BEGIN(detail) + +// forward declaration (definition in attr.h) +struct function_record; + +/// Internal data associated with a single function call +struct function_call { + function_call(const function_record &f, handle p); // Implementation in attr.h + + /// The function data: + const function_record &func; + + /// Arguments passed to the function: + std::vector args; + + /// The `convert` value the arguments should be loaded with + std::vector args_convert; + + /// Extra references for the optional `py::args` and/or `py::kwargs` arguments (which, if + /// present, are also in `args` but without a reference). 
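+    // (Sketch: for a binding like `m.def("f", &f, "a"_a, "x"_a = 0)`, assuming
+    // `using namespace pybind11::literals`, a Python call `f(1, x=2)` reaches this
+    // structure with `args`/`args_convert` sized from the function_record above.)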
+ object args_ref, kwargs_ref; + + /// The parent, if any + handle parent; + + /// If this is a call to an initializer, this argument contains `self` + handle init_self; +}; + + +/// Helper class which loads arguments for C++ functions called from Python +template +class argument_loader { + using indices = make_index_sequence; + + template using argument_is_args = std::is_same, args>; + template using argument_is_kwargs = std::is_same, kwargs>; + // Get args/kwargs argument positions relative to the end of the argument list: + static constexpr auto args_pos = constexpr_first() - (int) sizeof...(Args), + kwargs_pos = constexpr_first() - (int) sizeof...(Args); + + static constexpr bool args_kwargs_are_last = kwargs_pos >= - 1 && args_pos >= kwargs_pos - 1; + + static_assert(args_kwargs_are_last, "py::args/py::kwargs are only permitted as the last argument(s) of a function"); + +public: + static constexpr bool has_kwargs = kwargs_pos < 0; + static constexpr bool has_args = args_pos < 0; + + static constexpr auto arg_names = concat(type_descr(make_caster::name)...); + + bool load_args(function_call &call) { + return load_impl_sequence(call, indices{}); + } + + template + enable_if_t::value, Return> call(Func &&f) && { + return std::move(*this).template call_impl(std::forward(f), indices{}, Guard{}); + } + + template + enable_if_t::value, void_type> call(Func &&f) && { + std::move(*this).template call_impl(std::forward(f), indices{}, Guard{}); + return void_type(); + } + +private: + + static bool load_impl_sequence(function_call &, index_sequence<>) { return true; } + + template + bool load_impl_sequence(function_call &call, index_sequence) { +#ifdef __cpp_fold_expressions + if ((... || !std::get(argcasters).load(call.args[Is], call.args_convert[Is]))) + return false; +#else + for (bool r : {std::get(argcasters).load(call.args[Is], call.args_convert[Is])...}) + if (!r) + return false; +#endif + return true; + } + + template + Return call_impl(Func &&f, index_sequence, Guard &&) && { + return std::forward(f)(cast_op(std::move(std::get(argcasters)))...); + } + + std::tuple...> argcasters; +}; + +/// Helper class which collects only positional arguments for a Python function call. +/// A fancier version below can collect any argument, but this one is optimal for simple calls. +template +class simple_collector { +public: + template + explicit simple_collector(Ts &&...values) + : m_args(pybind11::make_tuple(std::forward(values)...)) { } + + const tuple &args() const & { return m_args; } + dict kwargs() const { return {}; } + + tuple args() && { return std::move(m_args); } + + /// Call a Python function and pass the collected arguments + object call(PyObject *ptr) const { + PyObject *result = PyObject_CallObject(ptr, m_args.ptr()); + if (!result) + throw error_already_set(); + return reinterpret_steal(result); + } + +private: + tuple m_args; +}; + +/// Helper class which collects positional, keyword, * and ** arguments for a Python function call +template +class unpacking_collector { +public: + template + explicit unpacking_collector(Ts &&...values) { + // Tuples aren't (easily) resizable so a list is needed for collection, + // but the actual function call strictly requires a tuple. + auto args_list = list(); + int _[] = { 0, (process(args_list, std::forward(values)), 0)... 
}; + ignore_unused(_); + + m_args = std::move(args_list); + } + + const tuple &args() const & { return m_args; } + const dict &kwargs() const & { return m_kwargs; } + + tuple args() && { return std::move(m_args); } + dict kwargs() && { return std::move(m_kwargs); } + + /// Call a Python function and pass the collected arguments + object call(PyObject *ptr) const { + PyObject *result = PyObject_Call(ptr, m_args.ptr(), m_kwargs.ptr()); + if (!result) + throw error_already_set(); + return reinterpret_steal(result); + } + +private: + template + void process(list &args_list, T &&x) { + auto o = reinterpret_steal(detail::make_caster::cast(std::forward(x), policy, {})); + if (!o) { +#if defined(NDEBUG) + argument_cast_error(); +#else + argument_cast_error(std::to_string(args_list.size()), type_id()); +#endif + } + args_list.append(o); + } + + void process(list &args_list, detail::args_proxy ap) { + for (const auto &a : ap) + args_list.append(a); + } + + void process(list &/*args_list*/, arg_v a) { + if (!a.name) +#if defined(NDEBUG) + nameless_argument_error(); +#else + nameless_argument_error(a.type); +#endif + + if (m_kwargs.contains(a.name)) { +#if defined(NDEBUG) + multiple_values_error(); +#else + multiple_values_error(a.name); +#endif + } + if (!a.value) { +#if defined(NDEBUG) + argument_cast_error(); +#else + argument_cast_error(a.name, a.type); +#endif + } + m_kwargs[a.name] = a.value; + } + + void process(list &/*args_list*/, detail::kwargs_proxy kp) { + if (!kp) + return; + for (const auto &k : reinterpret_borrow(kp)) { + if (m_kwargs.contains(k.first)) { +#if defined(NDEBUG) + multiple_values_error(); +#else + multiple_values_error(str(k.first)); +#endif + } + m_kwargs[k.first] = k.second; + } + } + + [[noreturn]] static void nameless_argument_error() { + throw type_error("Got kwargs without a name; only named arguments " + "may be passed via py::arg() to a python function call. " + "(compile in debug mode for details)"); + } + [[noreturn]] static void nameless_argument_error(std::string type) { + throw type_error("Got kwargs without a name of type '" + type + "'; only named " + "arguments may be passed via py::arg() to a python function call. 
"); + } + [[noreturn]] static void multiple_values_error() { + throw type_error("Got multiple values for keyword argument " + "(compile in debug mode for details)"); + } + + [[noreturn]] static void multiple_values_error(std::string name) { + throw type_error("Got multiple values for keyword argument '" + name + "'"); + } + + [[noreturn]] static void argument_cast_error() { + throw cast_error("Unable to convert call argument to Python object " + "(compile in debug mode for details)"); + } + + [[noreturn]] static void argument_cast_error(std::string name, std::string type) { + throw cast_error("Unable to convert call argument '" + name + + "' of type '" + type + "' to Python object"); + } + +private: + tuple m_args; + dict m_kwargs; +}; + +/// Collect only positional arguments for a Python function call +template ...>::value>> +simple_collector collect_arguments(Args &&...args) { + return simple_collector(std::forward(args)...); +} + +/// Collect all arguments, including keywords and unpacking (only instantiated when needed) +template ...>::value>> +unpacking_collector collect_arguments(Args &&...args) { + // Following argument order rules for generalized unpacking according to PEP 448 + static_assert( + constexpr_last() < constexpr_first() + && constexpr_last() < constexpr_first(), + "Invalid function call: positional args must precede keywords and ** unpacking; " + "* unpacking must precede ** unpacking" + ); + return unpacking_collector(std::forward(args)...); +} + +template +template +object object_api::operator()(Args &&...args) const { + return detail::collect_arguments(std::forward(args)...).call(derived().ptr()); +} + +template +template +object object_api::call(Args &&...args) const { + return operator()(std::forward(args)...); +} + +NAMESPACE_END(detail) + +#define PYBIND11_MAKE_OPAQUE(...) \ + namespace pybind11 { namespace detail { \ + template<> class type_caster<__VA_ARGS__> : public type_caster_base<__VA_ARGS__> { }; \ + }} + +/// Lets you pass a type containing a `,` through a macro parameter without needing a separate +/// typedef, e.g.: `PYBIND11_OVERLOAD(PYBIND11_TYPE(ReturnType), PYBIND11_TYPE(Parent), f, arg)` +#define PYBIND11_TYPE(...) __VA_ARGS__ + +NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/cviruntime/python/include/pybind11/include/pybind11/chrono.h b/cviruntime/python/include/pybind11/include/pybind11/chrono.h new file mode 100644 index 000000000..ea777e696 --- /dev/null +++ b/cviruntime/python/include/pybind11/include/pybind11/chrono.h @@ -0,0 +1,184 @@ +/* + pybind11/chrono.h: Transparent conversion between std::chrono and python's datetime + + Copyright (c) 2016 Trent Houliston and + Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. 
+*/ + +#pragma once + +#include "pybind11.h" +#include +#include +#include +#include + +// Backport the PyDateTime_DELTA functions from Python3.3 if required +#ifndef PyDateTime_DELTA_GET_DAYS +#define PyDateTime_DELTA_GET_DAYS(o) (((PyDateTime_Delta*)o)->days) +#endif +#ifndef PyDateTime_DELTA_GET_SECONDS +#define PyDateTime_DELTA_GET_SECONDS(o) (((PyDateTime_Delta*)o)->seconds) +#endif +#ifndef PyDateTime_DELTA_GET_MICROSECONDS +#define PyDateTime_DELTA_GET_MICROSECONDS(o) (((PyDateTime_Delta*)o)->microseconds) +#endif + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +NAMESPACE_BEGIN(detail) + +template class duration_caster { +public: + typedef typename type::rep rep; + typedef typename type::period period; + + typedef std::chrono::duration> days; + + bool load(handle src, bool) { + using namespace std::chrono; + + // Lazy initialise the PyDateTime import + if (!PyDateTimeAPI) { PyDateTime_IMPORT; } + + if (!src) return false; + // If invoked with datetime.delta object + if (PyDelta_Check(src.ptr())) { + value = type(duration_cast>( + days(PyDateTime_DELTA_GET_DAYS(src.ptr())) + + seconds(PyDateTime_DELTA_GET_SECONDS(src.ptr())) + + microseconds(PyDateTime_DELTA_GET_MICROSECONDS(src.ptr())))); + return true; + } + // If invoked with a float we assume it is seconds and convert + else if (PyFloat_Check(src.ptr())) { + value = type(duration_cast>(duration(PyFloat_AsDouble(src.ptr())))); + return true; + } + else return false; + } + + // If this is a duration just return it back + static const std::chrono::duration& get_duration(const std::chrono::duration &src) { + return src; + } + + // If this is a time_point get the time_since_epoch + template static std::chrono::duration get_duration(const std::chrono::time_point> &src) { + return src.time_since_epoch(); + } + + static handle cast(const type &src, return_value_policy /* policy */, handle /* parent */) { + using namespace std::chrono; + + // Use overloaded function to get our duration from our source + // Works out if it is a duration or time_point and get the duration + auto d = get_duration(src); + + // Lazy initialise the PyDateTime import + if (!PyDateTimeAPI) { PyDateTime_IMPORT; } + + // Declare these special duration types so the conversions happen with the correct primitive types (int) + using dd_t = duration>; + using ss_t = duration>; + using us_t = duration; + + auto dd = duration_cast(d); + auto subd = d - dd; + auto ss = duration_cast(subd); + auto us = duration_cast(subd - ss); + return PyDelta_FromDSU(dd.count(), ss.count(), us.count()); + } + + PYBIND11_TYPE_CASTER(type, _("datetime.timedelta")); +}; + +// This is for casting times on the system clock into datetime.datetime instances +template class type_caster> { +public: + typedef std::chrono::time_point type; + bool load(handle src, bool) { + using namespace std::chrono; + + // Lazy initialise the PyDateTime import + if (!PyDateTimeAPI) { PyDateTime_IMPORT; } + + if (!src) return false; + + std::tm cal; + microseconds msecs; + + if (PyDateTime_Check(src.ptr())) { + cal.tm_sec = PyDateTime_DATE_GET_SECOND(src.ptr()); + cal.tm_min = PyDateTime_DATE_GET_MINUTE(src.ptr()); + cal.tm_hour = PyDateTime_DATE_GET_HOUR(src.ptr()); + cal.tm_mday = PyDateTime_GET_DAY(src.ptr()); + cal.tm_mon = PyDateTime_GET_MONTH(src.ptr()) - 1; + cal.tm_year = PyDateTime_GET_YEAR(src.ptr()) - 1900; + cal.tm_isdst = -1; + msecs = microseconds(PyDateTime_DATE_GET_MICROSECOND(src.ptr())); + } else if (PyDate_Check(src.ptr())) { + cal.tm_sec = 0; + cal.tm_min = 0; + cal.tm_hour = 0; + cal.tm_mday = 
PyDateTime_GET_DAY(src.ptr()); + cal.tm_mon = PyDateTime_GET_MONTH(src.ptr()) - 1; + cal.tm_year = PyDateTime_GET_YEAR(src.ptr()) - 1900; + cal.tm_isdst = -1; + msecs = microseconds(0); + } else if (PyTime_Check(src.ptr())) { + cal.tm_sec = PyDateTime_TIME_GET_SECOND(src.ptr()); + cal.tm_min = PyDateTime_TIME_GET_MINUTE(src.ptr()); + cal.tm_hour = PyDateTime_TIME_GET_HOUR(src.ptr()); + cal.tm_mday = 1; // This date (day, month, year) = (1, 0, 70) + cal.tm_mon = 0; // represents 1-Jan-1970, which is the first + cal.tm_year = 70; // earliest available date for Python's datetime + cal.tm_isdst = -1; + msecs = microseconds(PyDateTime_TIME_GET_MICROSECOND(src.ptr())); + } + else return false; + + value = system_clock::from_time_t(std::mktime(&cal)) + msecs; + return true; + } + + static handle cast(const std::chrono::time_point &src, return_value_policy /* policy */, handle /* parent */) { + using namespace std::chrono; + + // Lazy initialise the PyDateTime import + if (!PyDateTimeAPI) { PyDateTime_IMPORT; } + + std::time_t tt = system_clock::to_time_t(time_point_cast(src)); + // this function uses static memory so it's best to copy it out asap just in case + // otherwise other code that is using localtime may break this (not just python code) + std::tm localtime = *std::localtime(&tt); + + // Declare these special duration types so the conversions happen with the correct primitive types (int) + using us_t = duration; + + return PyDateTime_FromDateAndTime(localtime.tm_year + 1900, + localtime.tm_mon + 1, + localtime.tm_mday, + localtime.tm_hour, + localtime.tm_min, + localtime.tm_sec, + (duration_cast(src.time_since_epoch() % seconds(1))).count()); + } + PYBIND11_TYPE_CASTER(type, _("datetime.datetime")); +}; + +// Other clocks that are not the system clock are not measured as datetime.datetime objects +// since they are not measured on calendar time. So instead we just make them timedeltas +// Or if they have passed us a time as a float we convert that +template class type_caster> +: public duration_caster> { +}; + +template class type_caster> +: public duration_caster> { +}; + +NAMESPACE_END(detail) +NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/cviruntime/python/include/pybind11/include/pybind11/common.h b/cviruntime/python/include/pybind11/include/pybind11/common.h new file mode 100644 index 000000000..6c8a4f1e8 --- /dev/null +++ b/cviruntime/python/include/pybind11/include/pybind11/common.h @@ -0,0 +1,2 @@ +#include "detail/common.h" +#warning "Including 'common.h' is deprecated. It will be removed in v3.0. Use 'pybind11.h'." diff --git a/cviruntime/python/include/pybind11/include/pybind11/complex.h b/cviruntime/python/include/pybind11/include/pybind11/complex.h new file mode 100644 index 000000000..3f8963857 --- /dev/null +++ b/cviruntime/python/include/pybind11/include/pybind11/complex.h @@ -0,0 +1,65 @@ +/* + pybind11/complex.h: Complex number support + + Copyright (c) 2016 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. 
+*/
+
+#pragma once
+
+#include "pybind11.h"
+#include <complex>
+
+/// glibc defines I as a macro which breaks things, e.g., boost template names
+#ifdef I
+#  undef I
+#endif
+
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+template <typename T> struct format_descriptor<std::complex<T>, detail::enable_if_t<std::is_floating_point<T>::value>> {
+    static constexpr const char c = format_descriptor<T>::c;
+    static constexpr const char value[3] = { 'Z', c, '\0' };
+    static std::string format() { return std::string(value); }
+};
+
+#ifndef PYBIND11_CPP17
+
+template <typename T> constexpr const char format_descriptor<
+    std::complex<T>, detail::enable_if_t<std::is_floating_point<T>::value>>::value[3];
+
+#endif
+
+NAMESPACE_BEGIN(detail)
+
+template <typename T> struct is_fmt_numeric<std::complex<T>, detail::enable_if_t<std::is_floating_point<T>::value>> {
+    static constexpr bool value = true;
+    static constexpr int index = is_fmt_numeric<T>::index + 3;
+};
+
+template <typename T> class type_caster<std::complex<T>> {
+public:
+    bool load(handle src, bool convert) {
+        if (!src)
+            return false;
+        if (!convert && !PyComplex_Check(src.ptr()))
+            return false;
+        Py_complex result = PyComplex_AsCComplex(src.ptr());
+        if (result.real == -1.0 && PyErr_Occurred()) {
+            PyErr_Clear();
+            return false;
+        }
+        value = std::complex<T>((T) result.real, (T) result.imag);
+        return true;
+    }
+
+    static handle cast(const std::complex<T> &src, return_value_policy /* policy */, handle /* parent */) {
+        return PyComplex_FromDoubles((double) src.real(), (double) src.imag());
+    }
+
+    PYBIND11_TYPE_CASTER(std::complex<T>, _("complex"));
+};
+NAMESPACE_END(detail)
+NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/cviruntime/python/include/pybind11/include/pybind11/detail/class.h b/cviruntime/python/include/pybind11/include/pybind11/detail/class.h
new file mode 100644
index 000000000..edfa7de68
--- /dev/null
+++ b/cviruntime/python/include/pybind11/include/pybind11/detail/class.h
@@ -0,0 +1,639 @@
+/*
+    pybind11/detail/class.h: Python C API implementation details for py::class_
+
+    Copyright (c) 2017 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "../attr.h"
+#include "../options.h"
+
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+NAMESPACE_BEGIN(detail)
+
+#if PY_VERSION_HEX >= 0x03030000
+#  define PYBIND11_BUILTIN_QUALNAME
+#  define PYBIND11_SET_OLDPY_QUALNAME(obj, nameobj)
+#else
+// In pre-3.3 Python, we still set __qualname__ so that we can produce reliable function type
+// signatures; in 3.3+ this macro expands to nothing:
+#  define PYBIND11_SET_OLDPY_QUALNAME(obj, nameobj) setattr((PyObject *) obj, "__qualname__", nameobj)
+#endif
+
+inline PyTypeObject *type_incref(PyTypeObject *type) {
+    Py_INCREF(type);
+    return type;
+}
+
+#if !defined(PYPY_VERSION)
+
+/// `pybind11_static_property.__get__()`: Always pass the class instead of the instance.
+extern "C" inline PyObject *pybind11_static_get(PyObject *self, PyObject * /*ob*/, PyObject *cls) {
+    return PyProperty_Type.tp_descr_get(self, cls, cls);
+}
+
+/// `pybind11_static_property.__set__()`: Just like the above `__get__()`.
+extern "C" inline int pybind11_static_set(PyObject *self, PyObject *obj, PyObject *value) {
+    PyObject *cls = PyType_Check(obj) ? obj : (PyObject *) Py_TYPE(obj);
+    return PyProperty_Type.tp_descr_set(self, cls, value);
+}
+
+/** A `static_property` is the same as a `property` but the `__get__()` and `__set__()`
+    methods are modified to always use the object type instead of a concrete instance.
+    Return value: New reference.
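+
+    Instances of this type are what `py::class_<T>::def_readonly_static()` and
+    `def_readwrite_static()` install on the wrapped class, e.g. (illustrative):
+
+        py::class_<Config>(m, "Config")
+            .def_readwrite_static("verbosity", &Config::verbosity);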
*/ +inline PyTypeObject *make_static_property_type() { + constexpr auto *name = "pybind11_static_property"; + auto name_obj = reinterpret_steal(PYBIND11_FROM_STRING(name)); + + /* Danger zone: from now (and until PyType_Ready), make sure to + issue no Python C API calls which could potentially invoke the + garbage collector (the GC will call type_traverse(), which will in + turn find the newly constructed type in an invalid state) */ + auto heap_type = (PyHeapTypeObject *) PyType_Type.tp_alloc(&PyType_Type, 0); + if (!heap_type) + pybind11_fail("make_static_property_type(): error allocating type!"); + + heap_type->ht_name = name_obj.inc_ref().ptr(); +#ifdef PYBIND11_BUILTIN_QUALNAME + heap_type->ht_qualname = name_obj.inc_ref().ptr(); +#endif + + auto type = &heap_type->ht_type; + type->tp_name = name; + type->tp_base = type_incref(&PyProperty_Type); + type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE; + type->tp_descr_get = pybind11_static_get; + type->tp_descr_set = pybind11_static_set; + + if (PyType_Ready(type) < 0) + pybind11_fail("make_static_property_type(): failure in PyType_Ready()!"); + + setattr((PyObject *) type, "__module__", str("pybind11_builtins")); + PYBIND11_SET_OLDPY_QUALNAME(type, name_obj); + + return type; +} + +#else // PYPY + +/** PyPy has some issues with the above C API, so we evaluate Python code instead. + This function will only be called once so performance isn't really a concern. + Return value: New reference. */ +inline PyTypeObject *make_static_property_type() { + auto d = dict(); + PyObject *result = PyRun_String(R"(\ + class pybind11_static_property(property): + def __get__(self, obj, cls): + return property.__get__(self, cls, cls) + + def __set__(self, obj, value): + cls = obj if isinstance(obj, type) else type(obj) + property.__set__(self, cls, value) + )", Py_file_input, d.ptr(), d.ptr() + ); + if (result == nullptr) + throw error_already_set(); + Py_DECREF(result); + return (PyTypeObject *) d["pybind11_static_property"].cast().release().ptr(); +} + +#endif // PYPY + +/** Types with static properties need to handle `Type.static_prop = x` in a specific way. + By default, Python replaces the `static_property` itself, but for wrapped C++ types + we need to call `static_property.__set__()` in order to propagate the new value to + the underlying C++ data structure. */ +extern "C" inline int pybind11_meta_setattro(PyObject* obj, PyObject* name, PyObject* value) { + // Use `_PyType_Lookup()` instead of `PyObject_GetAttr()` in order to get the raw + // descriptor (`property`) instead of calling `tp_descr_get` (`property.__get__()`). + PyObject *descr = _PyType_Lookup((PyTypeObject *) obj, name); + + // The following assignment combinations are possible: + // 1. `Type.static_prop = value` --> descr_set: `Type.static_prop.__set__(value)` + // 2. `Type.static_prop = other_static_prop` --> setattro: replace existing `static_prop` + // 3. `Type.regular_attribute = value` --> setattro: regular attribute assignment + const auto static_prop = (PyObject *) get_internals().static_property_type; + const auto call_descr_set = descr && PyObject_IsInstance(descr, static_prop) + && !PyObject_IsInstance(value, static_prop); + if (call_descr_set) { + // Call `static_property.__set__()` instead of replacing the `static_property`. 
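+        // (Concretely: with a static_property installed for, say, `Config.verbosity`,
+        // the assignment `Config.verbosity = 3` takes this branch and updates the
+        // underlying C++ static instead of shadowing the descriptor with a plain int.)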
+#if !defined(PYPY_VERSION) + return Py_TYPE(descr)->tp_descr_set(descr, obj, value); +#else + if (PyObject *result = PyObject_CallMethod(descr, "__set__", "OO", obj, value)) { + Py_DECREF(result); + return 0; + } else { + return -1; + } +#endif + } else { + // Replace existing attribute. + return PyType_Type.tp_setattro(obj, name, value); + } +} + +#if PY_MAJOR_VERSION >= 3 +/** + * Python 3's PyInstanceMethod_Type hides itself via its tp_descr_get, which prevents aliasing + * methods via cls.attr("m2") = cls.attr("m1"): instead the tp_descr_get returns a plain function, + * when called on a class, or a PyMethod, when called on an instance. Override that behaviour here + * to do a special case bypass for PyInstanceMethod_Types. + */ +extern "C" inline PyObject *pybind11_meta_getattro(PyObject *obj, PyObject *name) { + PyObject *descr = _PyType_Lookup((PyTypeObject *) obj, name); + if (descr && PyInstanceMethod_Check(descr)) { + Py_INCREF(descr); + return descr; + } + else { + return PyType_Type.tp_getattro(obj, name); + } +} +#endif + +/** This metaclass is assigned by default to all pybind11 types and is required in order + for static properties to function correctly. Users may override this using `py::metaclass`. + Return value: New reference. */ +inline PyTypeObject* make_default_metaclass() { + constexpr auto *name = "pybind11_type"; + auto name_obj = reinterpret_steal(PYBIND11_FROM_STRING(name)); + + /* Danger zone: from now (and until PyType_Ready), make sure to + issue no Python C API calls which could potentially invoke the + garbage collector (the GC will call type_traverse(), which will in + turn find the newly constructed type in an invalid state) */ + auto heap_type = (PyHeapTypeObject *) PyType_Type.tp_alloc(&PyType_Type, 0); + if (!heap_type) + pybind11_fail("make_default_metaclass(): error allocating metaclass!"); + + heap_type->ht_name = name_obj.inc_ref().ptr(); +#ifdef PYBIND11_BUILTIN_QUALNAME + heap_type->ht_qualname = name_obj.inc_ref().ptr(); +#endif + + auto type = &heap_type->ht_type; + type->tp_name = name; + type->tp_base = type_incref(&PyType_Type); + type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE; + + type->tp_setattro = pybind11_meta_setattro; +#if PY_MAJOR_VERSION >= 3 + type->tp_getattro = pybind11_meta_getattro; +#endif + + if (PyType_Ready(type) < 0) + pybind11_fail("make_default_metaclass(): failure in PyType_Ready()!"); + + setattr((PyObject *) type, "__module__", str("pybind11_builtins")); + PYBIND11_SET_OLDPY_QUALNAME(type, name_obj); + + return type; +} + +/// For multiple inheritance types we need to recursively register/deregister base pointers for any +/// base classes with pointers that are difference from the instance value pointer so that we can +/// correctly recognize an offset base class pointer. This calls a function with any offset base ptrs. 
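+/// For example (sketch): given `struct C : A, B {}`, the `B` subobject generally sits
+/// at a nonzero offset from the `C*` value pointer; registering that offset pointer
+/// too lets a later lookup of the corresponding `B*` find the existing instance.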
+inline void traverse_offset_bases(void *valueptr, const detail::type_info *tinfo, instance *self, + bool (*f)(void * /*parentptr*/, instance * /*self*/)) { + for (handle h : reinterpret_borrow(tinfo->type->tp_bases)) { + if (auto parent_tinfo = get_type_info((PyTypeObject *) h.ptr())) { + for (auto &c : parent_tinfo->implicit_casts) { + if (c.first == tinfo->cpptype) { + auto *parentptr = c.second(valueptr); + if (parentptr != valueptr) + f(parentptr, self); + traverse_offset_bases(parentptr, parent_tinfo, self, f); + break; + } + } + } + } +} + +inline bool register_instance_impl(void *ptr, instance *self) { + get_internals().registered_instances.emplace(ptr, self); + return true; // unused, but gives the same signature as the deregister func +} +inline bool deregister_instance_impl(void *ptr, instance *self) { + auto ®istered_instances = get_internals().registered_instances; + auto range = registered_instances.equal_range(ptr); + for (auto it = range.first; it != range.second; ++it) { + if (Py_TYPE(self) == Py_TYPE(it->second)) { + registered_instances.erase(it); + return true; + } + } + return false; +} + +inline void register_instance(instance *self, void *valptr, const type_info *tinfo) { + register_instance_impl(valptr, self); + if (!tinfo->simple_ancestors) + traverse_offset_bases(valptr, tinfo, self, register_instance_impl); +} + +inline bool deregister_instance(instance *self, void *valptr, const type_info *tinfo) { + bool ret = deregister_instance_impl(valptr, self); + if (!tinfo->simple_ancestors) + traverse_offset_bases(valptr, tinfo, self, deregister_instance_impl); + return ret; +} + +/// Instance creation function for all pybind11 types. It allocates the internal instance layout for +/// holding C++ objects and holders. Allocation is done lazily (the first time the instance is cast +/// to a reference or pointer), and initialization is done by an `__init__` function. +inline PyObject *make_new_instance(PyTypeObject *type) { +#if defined(PYPY_VERSION) + // PyPy gets tp_basicsize wrong (issue 2482) under multiple inheritance when the first inherited + // object is a a plain Python type (i.e. not derived from an extension type). Fix it. + ssize_t instance_size = static_cast(sizeof(instance)); + if (type->tp_basicsize < instance_size) { + type->tp_basicsize = instance_size; + } +#endif + PyObject *self = type->tp_alloc(type, 0); + auto inst = reinterpret_cast(self); + // Allocate the value/holder internals: + inst->allocate_layout(); + + inst->owned = true; + + return self; +} + +/// Instance creation function for all pybind11 types. It only allocates space for the +/// C++ object, but doesn't call the constructor -- an `__init__` function must do that. +extern "C" inline PyObject *pybind11_object_new(PyTypeObject *type, PyObject *, PyObject *) { + return make_new_instance(type); +} + +/// An `__init__` function constructs the C++ object. Users should provide at least one +/// of these using `py::init` or directly with `.def(__init__, ...)`. Otherwise, the +/// following default function will be used which simply throws an exception. 
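+/// Sketch of the resulting default behaviour (hypothetical binding):
+///
+///     py::class_<Widget>(m, "Widget");   // no py::init<...>() supplied
+///     // Python: Widget()  ->  TypeError: Widget: No constructor defined!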
+extern "C" inline int pybind11_object_init(PyObject *self, PyObject *, PyObject *) { + PyTypeObject *type = Py_TYPE(self); + std::string msg; +#if defined(PYPY_VERSION) + msg += handle((PyObject *) type).attr("__module__").cast() + "."; +#endif + msg += type->tp_name; + msg += ": No constructor defined!"; + PyErr_SetString(PyExc_TypeError, msg.c_str()); + return -1; +} + +inline void add_patient(PyObject *nurse, PyObject *patient) { + auto &internals = get_internals(); + auto instance = reinterpret_cast(nurse); + instance->has_patients = true; + Py_INCREF(patient); + internals.patients[nurse].push_back(patient); +} + +inline void clear_patients(PyObject *self) { + auto instance = reinterpret_cast(self); + auto &internals = get_internals(); + auto pos = internals.patients.find(self); + assert(pos != internals.patients.end()); + // Clearing the patients can cause more Python code to run, which + // can invalidate the iterator. Extract the vector of patients + // from the unordered_map first. + auto patients = std::move(pos->second); + internals.patients.erase(pos); + instance->has_patients = false; + for (PyObject *&patient : patients) + Py_CLEAR(patient); +} + +/// Clears all internal data from the instance and removes it from registered instances in +/// preparation for deallocation. +inline void clear_instance(PyObject *self) { + auto instance = reinterpret_cast(self); + + // Deallocate any values/holders, if present: + for (auto &v_h : values_and_holders(instance)) { + if (v_h) { + + // We have to deregister before we call dealloc because, for virtual MI types, we still + // need to be able to get the parent pointers. + if (v_h.instance_registered() && !deregister_instance(instance, v_h.value_ptr(), v_h.type)) + pybind11_fail("pybind11_object_dealloc(): Tried to deallocate unregistered instance!"); + + if (instance->owned || v_h.holder_constructed()) + v_h.type->dealloc(v_h); + } + } + // Deallocate the value/holder layout internals: + instance->deallocate_layout(); + + if (instance->weakrefs) + PyObject_ClearWeakRefs(self); + + PyObject **dict_ptr = _PyObject_GetDictPtr(self); + if (dict_ptr) + Py_CLEAR(*dict_ptr); + + if (instance->has_patients) + clear_patients(self); +} + +/// Instance destructor function for all pybind11 types. It calls `type_info.dealloc` +/// to destroy the C++ object itself, while the rest is Python bookkeeping. +extern "C" inline void pybind11_object_dealloc(PyObject *self) { + clear_instance(self); + + auto type = Py_TYPE(self); + type->tp_free(self); + +#if PY_VERSION_HEX < 0x03080000 + // `type->tp_dealloc != pybind11_object_dealloc` means that we're being called + // as part of a derived type's dealloc, in which case we're not allowed to decref + // the type here. For cross-module compatibility, we shouldn't compare directly + // with `pybind11_object_dealloc`, but with the common one stashed in internals. + auto pybind11_object_type = (PyTypeObject *) get_internals().instance_base; + if (type->tp_dealloc == pybind11_object_type->tp_dealloc) + Py_DECREF(type); +#else + // This was not needed before Python 3.8 (Python issue 35810) + // https://github.com/pybind/pybind11/issues/1946 + Py_DECREF(type); +#endif +} + +/** Create the type which can be used as a common base for all classes. This is + needed in order to satisfy Python's requirements for multiple inheritance. + Return value: New reference. 
*/
+inline PyObject *make_object_base_type(PyTypeObject *metaclass) {
+    constexpr auto *name = "pybind11_object";
+    auto name_obj = reinterpret_steal<object>(PYBIND11_FROM_STRING(name));
+
+    /* Danger zone: from now (and until PyType_Ready), make sure to
+       issue no Python C API calls which could potentially invoke the
+       garbage collector (the GC will call type_traverse(), which will in
+       turn find the newly constructed type in an invalid state) */
+    auto heap_type = (PyHeapTypeObject *) metaclass->tp_alloc(metaclass, 0);
+    if (!heap_type)
+        pybind11_fail("make_object_base_type(): error allocating type!");
+
+    heap_type->ht_name = name_obj.inc_ref().ptr();
+#ifdef PYBIND11_BUILTIN_QUALNAME
+    heap_type->ht_qualname = name_obj.inc_ref().ptr();
+#endif
+
+    auto type = &heap_type->ht_type;
+    type->tp_name = name;
+    type->tp_base = type_incref(&PyBaseObject_Type);
+    type->tp_basicsize = static_cast<ssize_t>(sizeof(instance));
+    type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
+
+    type->tp_new = pybind11_object_new;
+    type->tp_init = pybind11_object_init;
+    type->tp_dealloc = pybind11_object_dealloc;
+
+    /* Support weak references (needed for the keep_alive feature) */
+    type->tp_weaklistoffset = offsetof(instance, weakrefs);
+
+    if (PyType_Ready(type) < 0)
+        pybind11_fail("PyType_Ready failed in make_object_base_type():" + error_string());
+
+    setattr((PyObject *) type, "__module__", str("pybind11_builtins"));
+    PYBIND11_SET_OLDPY_QUALNAME(type, name_obj);
+
+    assert(!PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC));
+    return (PyObject *) heap_type;
+}
+
+/// dynamic_attr: Support for `d = instance.__dict__`.
+extern "C" inline PyObject *pybind11_get_dict(PyObject *self, void *) {
+    PyObject *&dict = *_PyObject_GetDictPtr(self);
+    if (!dict)
+        dict = PyDict_New();
+    Py_XINCREF(dict);
+    return dict;
+}
+
+/// dynamic_attr: Support for `instance.__dict__ = dict()`.
+extern "C" inline int pybind11_set_dict(PyObject *self, PyObject *new_dict, void *) {
+    if (!PyDict_Check(new_dict)) {
+        PyErr_Format(PyExc_TypeError, "__dict__ must be set to a dictionary, not a '%.200s'",
+                     Py_TYPE(new_dict)->tp_name);
+        return -1;
+    }
+    PyObject *&dict = *_PyObject_GetDictPtr(self);
+    Py_INCREF(new_dict);
+    Py_CLEAR(dict);
+    dict = new_dict;
+    return 0;
+}
+
+/// dynamic_attr: Allow the garbage collector to traverse the internal instance `__dict__`.
+extern "C" inline int pybind11_traverse(PyObject *self, visitproc visit, void *arg) {
+    PyObject *&dict = *_PyObject_GetDictPtr(self);
+    Py_VISIT(dict);
+    return 0;
+}
+
+/// dynamic_attr: Allow the GC to clear the dictionary.
+extern "C" inline int pybind11_clear(PyObject *self) {
+    PyObject *&dict = *_PyObject_GetDictPtr(self);
+    Py_CLEAR(dict);
+    return 0;
+}
+
+/// Give instances of this type a `__dict__` and opt into garbage collection.
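+///
+/// Illustrative usage (not part of this header): dynamic attributes are requested
+/// per class via the public API, e.g.
+///
+///     py::class_<Pet>(m, "Pet", py::dynamic_attr());
+///     // Python: p = Pet(); p.nickname = "Rex"   # now allowed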
+inline void enable_dynamic_attributes(PyHeapTypeObject *heap_type) {
+    auto type = &heap_type->ht_type;
+#if defined(PYPY_VERSION)
+    pybind11_fail(std::string(type->tp_name) + ": dynamic attributes are "
+                                               "currently not supported in "
+                                               "conjunction with PyPy!");
+#endif
+    type->tp_flags |= Py_TPFLAGS_HAVE_GC;
+    type->tp_dictoffset = type->tp_basicsize; // place dict at the end
+    type->tp_basicsize += (ssize_t)sizeof(PyObject *); // and allocate enough space for it
+    type->tp_traverse = pybind11_traverse;
+    type->tp_clear = pybind11_clear;
+
+    static PyGetSetDef getset[] = {
+        {const_cast<char *>("__dict__"), pybind11_get_dict, pybind11_set_dict, nullptr, nullptr},
+        {nullptr, nullptr, nullptr, nullptr, nullptr}
+    };
+    type->tp_getset = getset;
+}
+
+/// buffer_protocol: Fill in the view as specified by flags.
+extern "C" inline int pybind11_getbuffer(PyObject *obj, Py_buffer *view, int flags) {
+    // Look for a `get_buffer` implementation in this type's info or any bases (following MRO).
+    type_info *tinfo = nullptr;
+    for (auto type : reinterpret_borrow<tuple>(Py_TYPE(obj)->tp_mro)) {
+        tinfo = get_type_info((PyTypeObject *) type.ptr());
+        if (tinfo && tinfo->get_buffer)
+            break;
+    }
+    if (view == nullptr || !tinfo || !tinfo->get_buffer) {
+        if (view)
+            view->obj = nullptr;
+        PyErr_SetString(PyExc_BufferError, "pybind11_getbuffer(): Internal error");
+        return -1;
+    }
+    std::memset(view, 0, sizeof(Py_buffer));
+    buffer_info *info = tinfo->get_buffer(obj, tinfo->get_buffer_data);
+    view->obj = obj;
+    view->ndim = 1;
+    view->internal = info;
+    view->buf = info->ptr;
+    view->itemsize = info->itemsize;
+    view->len = view->itemsize;
+    for (auto s : info->shape)
+        view->len *= s;
+    view->readonly = info->readonly;
+    if ((flags & PyBUF_WRITABLE) == PyBUF_WRITABLE && info->readonly) {
+        if (view)
+            view->obj = nullptr;
+        PyErr_SetString(PyExc_BufferError, "Writable buffer requested for readonly storage");
+        return -1;
+    }
+    if ((flags & PyBUF_FORMAT) == PyBUF_FORMAT)
+        view->format = const_cast<char *>(info->format.c_str());
+    if ((flags & PyBUF_STRIDES) == PyBUF_STRIDES) {
+        view->ndim = (int) info->ndim;
+        view->strides = &info->strides[0];
+        view->shape = &info->shape[0];
+    }
+    Py_INCREF(view->obj);
+    return 0;
+}
+
+/// buffer_protocol: Release the resources of the buffer.
+extern "C" inline void pybind11_releasebuffer(PyObject *, Py_buffer *view) {
+    delete (buffer_info *) view->internal;
+}
+
+/// Give this type a buffer interface.
+inline void enable_buffer_protocol(PyHeapTypeObject *heap_type) {
+    heap_type->ht_type.tp_as_buffer = &heap_type->as_buffer;
+#if PY_MAJOR_VERSION < 3
+    heap_type->ht_type.tp_flags |= Py_TPFLAGS_HAVE_NEWBUFFER;
+#endif
+
+    heap_type->as_buffer.bf_getbuffer = pybind11_getbuffer;
+    heap_type->as_buffer.bf_releasebuffer = pybind11_releasebuffer;
+}
+
+/** Create a brand new Python type according to the `type_record` specification.
+    Return value: New reference. */
+inline PyObject* make_new_python_type(const type_record &rec) {
+    auto name = reinterpret_steal<object>(PYBIND11_FROM_STRING(rec.name));
+
+    auto qualname = name;
+    if (rec.scope && !PyModule_Check(rec.scope.ptr()) && hasattr(rec.scope, "__qualname__")) {
+#if PY_MAJOR_VERSION >= 3
+        qualname = reinterpret_steal<object>(
+            PyUnicode_FromFormat("%U.%U", rec.scope.attr("__qualname__").ptr(), name.ptr()));
+#else
+        qualname = str(rec.scope.attr("__qualname__").cast<std::string>() + "."
+                       + rec.name);
+#endif
+    }
+
+    object module;
+    if (rec.scope) {
+        if (hasattr(rec.scope, "__module__"))
+            module = rec.scope.attr("__module__");
+        else if (hasattr(rec.scope, "__name__"))
+            module = rec.scope.attr("__name__");
+    }
+
+    auto full_name = c_str(
+#if !defined(PYPY_VERSION)
+        module ? str(module).cast<std::string>() + "." + rec.name :
+#endif
+        rec.name);
+
+    char *tp_doc = nullptr;
+    if (rec.doc && options::show_user_defined_docstrings()) {
+        /* Allocate memory for docstring (using PyObject_MALLOC, since
+           Python will free this later on) */
+        size_t size = strlen(rec.doc) + 1;
+        tp_doc = (char *) PyObject_MALLOC(size);
+        memcpy((void *) tp_doc, rec.doc, size);
+    }
+
+    auto &internals = get_internals();
+    auto bases = tuple(rec.bases);
+    auto base = (bases.size() == 0) ? internals.instance_base
+                                    : bases[0].ptr();
+
+    /* Danger zone: from now (and until PyType_Ready), make sure to
+       issue no Python C API calls which could potentially invoke the
+       garbage collector (the GC will call type_traverse(), which will in
+       turn find the newly constructed type in an invalid state) */
+    auto metaclass = rec.metaclass.ptr() ? (PyTypeObject *) rec.metaclass.ptr()
+                                         : internals.default_metaclass;
+
+    auto heap_type = (PyHeapTypeObject *) metaclass->tp_alloc(metaclass, 0);
+    if (!heap_type)
+        pybind11_fail(std::string(rec.name) + ": Unable to create type object!");
+
+    heap_type->ht_name = name.release().ptr();
+#ifdef PYBIND11_BUILTIN_QUALNAME
+    heap_type->ht_qualname = qualname.inc_ref().ptr();
+#endif
+
+    auto type = &heap_type->ht_type;
+    type->tp_name = full_name;
+    type->tp_doc = tp_doc;
+    type->tp_base = type_incref((PyTypeObject *) base);
+    type->tp_basicsize = static_cast<ssize_t>(sizeof(instance));
+    if (bases.size() > 0)
+        type->tp_bases = bases.release().ptr();
+
+    /* Don't inherit base __init__ */
+    type->tp_init = pybind11_object_init;
+
+    /* Supported protocols */
+    type->tp_as_number = &heap_type->as_number;
+    type->tp_as_sequence = &heap_type->as_sequence;
+    type->tp_as_mapping = &heap_type->as_mapping;
+#if PY_VERSION_HEX >= 0x03050000
+    type->tp_as_async = &heap_type->as_async;
+#endif
+
+    /* Flags */
+    type->tp_flags |= Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
+#if PY_MAJOR_VERSION < 3
+    type->tp_flags |= Py_TPFLAGS_CHECKTYPES;
+#endif
+
+    if (rec.dynamic_attr)
+        enable_dynamic_attributes(heap_type);
+
+    if (rec.buffer_protocol)
+        enable_buffer_protocol(heap_type);
+
+    if (PyType_Ready(type) < 0)
+        pybind11_fail(std::string(rec.name) + ": PyType_Ready failed (" + error_string() + ")!");
+
+    assert(rec.dynamic_attr ? PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC)
+                            : !PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC));
+
+    /* Register type with the parent scope */
+    if (rec.scope)
+        setattr(rec.scope, rec.name, (PyObject *) type);
+    else
+        Py_INCREF(type); // Keep it alive forever (reference leak)
+
+    if (module) // Needed by pydoc
+        setattr((PyObject *) type, "__module__", module);
+
+    PYBIND11_SET_OLDPY_QUALNAME(type, qualname);
+
+    return (PyObject *) type;
+}
+
+NAMESPACE_END(detail)
+NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/cviruntime/python/include/pybind11/include/pybind11/detail/common.h b/cviruntime/python/include/pybind11/include/pybind11/detail/common.h
new file mode 100644
index 000000000..dd6267936
--- /dev/null
+++ b/cviruntime/python/include/pybind11/include/pybind11/detail/common.h
@@ -0,0 +1,820 @@
+/*
+    pybind11/detail/common.h -- Basic macros
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved.
Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. +*/ + +#pragma once + +#if !defined(NAMESPACE_BEGIN) +# define NAMESPACE_BEGIN(name) namespace name { +#endif +#if !defined(NAMESPACE_END) +# define NAMESPACE_END(name) } +#endif + +// Robust support for some features and loading modules compiled against different pybind versions +// requires forcing hidden visibility on pybind code, so we enforce this by setting the attribute on +// the main `pybind11` namespace. +#if !defined(PYBIND11_NAMESPACE) +# ifdef __GNUG__ +# define PYBIND11_NAMESPACE pybind11 __attribute__((visibility("hidden"))) +# else +# define PYBIND11_NAMESPACE pybind11 +# endif +#endif + +#if !(defined(_MSC_VER) && __cplusplus == 199711L) && !defined(__INTEL_COMPILER) +# if __cplusplus >= 201402L +# define PYBIND11_CPP14 +# if __cplusplus >= 201703L +# define PYBIND11_CPP17 +# endif +# endif +#elif defined(_MSC_VER) && __cplusplus == 199711L +// MSVC sets _MSVC_LANG rather than __cplusplus (supposedly until the standard is fully implemented) +// Unless you use the /Zc:__cplusplus flag on Visual Studio 2017 15.7 Preview 3 or newer +# if _MSVC_LANG >= 201402L +# define PYBIND11_CPP14 +# if _MSVC_LANG > 201402L && _MSC_VER >= 1910 +# define PYBIND11_CPP17 +# endif +# endif +#endif + +// Compiler version assertions +#if defined(__INTEL_COMPILER) +# if __INTEL_COMPILER < 1700 +# error pybind11 requires Intel C++ compiler v17 or newer +# endif +#elif defined(__clang__) && !defined(__apple_build_version__) +# if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ < 3) +# error pybind11 requires clang 3.3 or newer +# endif +#elif defined(__clang__) +// Apple changes clang version macros to its Xcode version; the first Xcode release based on +// (upstream) clang 3.3 was Xcode 5: +# if __clang_major__ < 5 +# error pybind11 requires Xcode/clang 5.0 or newer +# endif +#elif defined(__GNUG__) +# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8) +# error pybind11 requires gcc 4.8 or newer +# endif +#elif defined(_MSC_VER) +// Pybind hits various compiler bugs in 2015u2 and earlier, and also makes use of some stl features +// (e.g. 
std::negation) added in 2015u3:
+#  if _MSC_FULL_VER < 190024210
+#    error pybind11 requires MSVC 2015 update 3 or newer
+#  endif
+#endif
+
+#if !defined(PYBIND11_EXPORT)
+#  if defined(WIN32) || defined(_WIN32)
+#    define PYBIND11_EXPORT __declspec(dllexport)
+#  else
+#    define PYBIND11_EXPORT __attribute__ ((visibility("default")))
+#  endif
+#endif
+
+#if defined(_MSC_VER)
+#  define PYBIND11_NOINLINE __declspec(noinline)
+#else
+#  define PYBIND11_NOINLINE __attribute__ ((noinline))
+#endif
+
+#if defined(PYBIND11_CPP14)
+#  define PYBIND11_DEPRECATED(reason) [[deprecated(reason)]]
+#else
+#  define PYBIND11_DEPRECATED(reason) __attribute__((deprecated(reason)))
+#endif
+
+#define PYBIND11_VERSION_MAJOR 2
+#define PYBIND11_VERSION_MINOR 5
+#define PYBIND11_VERSION_PATCH dev1
+
+/// Include Python header, disable linking to pythonX_d.lib on Windows in debug mode
+#if defined(_MSC_VER)
+#  if (PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION < 4)
+#    define HAVE_ROUND 1
+#  endif
+#  pragma warning(push)
+#  pragma warning(disable: 4510 4610 4512 4005)
+#  if defined(_DEBUG) && !defined(Py_DEBUG)
+#    define PYBIND11_DEBUG_MARKER
+#    undef _DEBUG
+#  endif
+#endif
+
+#include <Python.h>
+#include <frameobject.h>
+#include <pythread.h>
+
+/* Python #defines overrides for all sorts of core functions, which
+   tends to wreak havoc in C++ codebases that expect these to work
+   like regular functions (potentially with several overloads) */
+#if defined(isalnum)
+#  undef isalnum
+#  undef isalpha
+#  undef islower
+#  undef isspace
+#  undef isupper
+#  undef tolower
+#  undef toupper
+#endif
+
+#if defined(copysign)
+#  undef copysign
+#endif
+
+#if defined(_MSC_VER)
+#  if defined(PYBIND11_DEBUG_MARKER)
+#    define _DEBUG
+#    undef PYBIND11_DEBUG_MARKER
+#  endif
+#  pragma warning(pop)
+#endif
+
+#include <cstddef>
+#include <cstring>
+#include <forward_list>
+#include <vector>
+#include <string>
+#include <stdexcept>
+#include <unordered_set>
+#include <unordered_map>
+#include <memory>
+#include <typeindex>
+#include <type_traits>
+
+#if PY_MAJOR_VERSION >= 3 /// Compatibility macros for various Python versions
+#define PYBIND11_INSTANCE_METHOD_NEW(ptr, class_) PyInstanceMethod_New(ptr)
+#define PYBIND11_INSTANCE_METHOD_CHECK PyInstanceMethod_Check
+#define PYBIND11_INSTANCE_METHOD_GET_FUNCTION PyInstanceMethod_GET_FUNCTION
+#define PYBIND11_BYTES_CHECK PyBytes_Check
+#define PYBIND11_BYTES_FROM_STRING PyBytes_FromString
+#define PYBIND11_BYTES_FROM_STRING_AND_SIZE PyBytes_FromStringAndSize
+#define PYBIND11_BYTES_AS_STRING_AND_SIZE PyBytes_AsStringAndSize
+#define PYBIND11_BYTES_AS_STRING PyBytes_AsString
+#define PYBIND11_BYTES_SIZE PyBytes_Size
+#define PYBIND11_LONG_CHECK(o) PyLong_Check(o)
+#define PYBIND11_LONG_AS_LONGLONG(o) PyLong_AsLongLong(o)
+#define PYBIND11_LONG_FROM_SIGNED(o) PyLong_FromSsize_t((ssize_t) o)
+#define PYBIND11_LONG_FROM_UNSIGNED(o) PyLong_FromSize_t((size_t) o)
+#define PYBIND11_BYTES_NAME "bytes"
+#define PYBIND11_STRING_NAME "str"
+#define PYBIND11_SLICE_OBJECT PyObject
+#define PYBIND11_FROM_STRING PyUnicode_FromString
+#define PYBIND11_STR_TYPE ::pybind11::str
+#define PYBIND11_BOOL_ATTR "__bool__"
+#define PYBIND11_NB_BOOL(ptr) ((ptr)->nb_bool)
+// Providing a separate declaration to make Clang's -Wmissing-prototypes happy
+#define PYBIND11_PLUGIN_IMPL(name) \
+    extern "C" PYBIND11_EXPORT PyObject *PyInit_##name(); \
+    extern "C" PYBIND11_EXPORT PyObject *PyInit_##name()
+
+#else
+#define PYBIND11_INSTANCE_METHOD_NEW(ptr, class_) PyMethod_New(ptr, nullptr, class_)
+#define PYBIND11_INSTANCE_METHOD_CHECK PyMethod_Check
+#define PYBIND11_INSTANCE_METHOD_GET_FUNCTION PyMethod_GET_FUNCTION
+#define PYBIND11_BYTES_CHECK PyString_Check
+#define PYBIND11_BYTES_FROM_STRING
PyString_FromString +#define PYBIND11_BYTES_FROM_STRING_AND_SIZE PyString_FromStringAndSize +#define PYBIND11_BYTES_AS_STRING_AND_SIZE PyString_AsStringAndSize +#define PYBIND11_BYTES_AS_STRING PyString_AsString +#define PYBIND11_BYTES_SIZE PyString_Size +#define PYBIND11_LONG_CHECK(o) (PyInt_Check(o) || PyLong_Check(o)) +#define PYBIND11_LONG_AS_LONGLONG(o) (PyInt_Check(o) ? (long long) PyLong_AsLong(o) : PyLong_AsLongLong(o)) +#define PYBIND11_LONG_FROM_SIGNED(o) PyInt_FromSsize_t((ssize_t) o) // Returns long if needed. +#define PYBIND11_LONG_FROM_UNSIGNED(o) PyInt_FromSize_t((size_t) o) // Returns long if needed. +#define PYBIND11_BYTES_NAME "str" +#define PYBIND11_STRING_NAME "unicode" +#define PYBIND11_SLICE_OBJECT PySliceObject +#define PYBIND11_FROM_STRING PyString_FromString +#define PYBIND11_STR_TYPE ::pybind11::bytes +#define PYBIND11_BOOL_ATTR "__nonzero__" +#define PYBIND11_NB_BOOL(ptr) ((ptr)->nb_nonzero) +// Providing a separate PyInit decl to make Clang's -Wmissing-prototypes happy +#define PYBIND11_PLUGIN_IMPL(name) \ + static PyObject *pybind11_init_wrapper(); \ + extern "C" PYBIND11_EXPORT void init##name(); \ + extern "C" PYBIND11_EXPORT void init##name() { \ + (void)pybind11_init_wrapper(); \ + } \ + PyObject *pybind11_init_wrapper() +#endif + +#if PY_VERSION_HEX >= 0x03050000 && PY_VERSION_HEX < 0x03050200 +extern "C" { + struct _Py_atomic_address { void *value; }; + PyAPI_DATA(_Py_atomic_address) _PyThreadState_Current; +} +#endif + +#define PYBIND11_TRY_NEXT_OVERLOAD ((PyObject *) 1) // special failure return code +#define PYBIND11_STRINGIFY(x) #x +#define PYBIND11_TOSTRING(x) PYBIND11_STRINGIFY(x) +#define PYBIND11_CONCAT(first, second) first##second +#define PYBIND11_ENSURE_INTERNALS_READY \ + pybind11::detail::get_internals(); + +#define PYBIND11_CHECK_PYTHON_VERSION \ + { \ + const char *compiled_ver = PYBIND11_TOSTRING(PY_MAJOR_VERSION) \ + "." PYBIND11_TOSTRING(PY_MINOR_VERSION); \ + const char *runtime_ver = Py_GetVersion(); \ + size_t len = std::strlen(compiled_ver); \ + if (std::strncmp(runtime_ver, compiled_ver, len) != 0 \ + || (runtime_ver[len] >= '0' && runtime_ver[len] <= '9')) { \ + PyErr_Format(PyExc_ImportError, \ + "Python version mismatch: module was compiled for Python %s, " \ + "but the interpreter version is incompatible: %s.", \ + compiled_ver, runtime_ver); \ + return nullptr; \ + } \ + } + +#define PYBIND11_CATCH_INIT_EXCEPTIONS \ + catch (pybind11::error_already_set &e) { \ + PyErr_SetString(PyExc_ImportError, e.what()); \ + return nullptr; \ + } catch (const std::exception &e) { \ + PyErr_SetString(PyExc_ImportError, e.what()); \ + return nullptr; \ + } \ + +/** \rst + ***Deprecated in favor of PYBIND11_MODULE*** + + This macro creates the entry point that will be invoked when the Python interpreter + imports a plugin library. Please create a `module` in the function body and return + the pointer to its underlying Python object at the end. + + .. 
code-block:: cpp + + PYBIND11_PLUGIN(example) { + pybind11::module m("example", "pybind11 example plugin"); + /// Set up bindings here + return m.ptr(); + } +\endrst */ +#define PYBIND11_PLUGIN(name) \ + PYBIND11_DEPRECATED("PYBIND11_PLUGIN is deprecated, use PYBIND11_MODULE") \ + static PyObject *pybind11_init(); \ + PYBIND11_PLUGIN_IMPL(name) { \ + PYBIND11_CHECK_PYTHON_VERSION \ + PYBIND11_ENSURE_INTERNALS_READY \ + try { \ + return pybind11_init(); \ + } PYBIND11_CATCH_INIT_EXCEPTIONS \ + } \ + PyObject *pybind11_init() + +/** \rst + This macro creates the entry point that will be invoked when the Python interpreter + imports an extension module. The module name is given as the fist argument and it + should not be in quotes. The second macro argument defines a variable of type + `py::module` which can be used to initialize the module. + + .. code-block:: cpp + + PYBIND11_MODULE(example, m) { + m.doc() = "pybind11 example module"; + + // Add bindings here + m.def("foo", []() { + return "Hello, World!"; + }); + } +\endrst */ +#define PYBIND11_MODULE(name, variable) \ + static void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &); \ + PYBIND11_PLUGIN_IMPL(name) { \ + PYBIND11_CHECK_PYTHON_VERSION \ + PYBIND11_ENSURE_INTERNALS_READY \ + auto m = pybind11::module(PYBIND11_TOSTRING(name)); \ + try { \ + PYBIND11_CONCAT(pybind11_init_, name)(m); \ + return m.ptr(); \ + } PYBIND11_CATCH_INIT_EXCEPTIONS \ + } \ + void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &variable) + + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) + +using ssize_t = Py_ssize_t; +using size_t = std::size_t; + +/// Approach used to cast a previously unknown C++ instance into a Python object +enum class return_value_policy : uint8_t { + /** This is the default return value policy, which falls back to the policy + return_value_policy::take_ownership when the return value is a pointer. + Otherwise, it uses return_value::move or return_value::copy for rvalue + and lvalue references, respectively. See below for a description of what + all of these different policies do. */ + automatic = 0, + + /** As above, but use policy return_value_policy::reference when the return + value is a pointer. This is the default conversion policy for function + arguments when calling Python functions manually from C++ code (i.e. via + handle::operator()). You probably won't need to use this. */ + automatic_reference, + + /** Reference an existing object (i.e. do not create a new copy) and take + ownership. Python will call the destructor and delete operator when the + object’s reference count reaches zero. Undefined behavior ensues when + the C++ side does the same.. */ + take_ownership, + + /** Create a new copy of the returned object, which will be owned by + Python. This policy is comparably safe because the lifetimes of the two + instances are decoupled. */ + copy, + + /** Use std::move to move the return value contents into a new instance + that will be owned by Python. This policy is comparably safe because the + lifetimes of the two instances (move source and destination) are + decoupled. */ + move, + + /** Reference an existing object, but do not take ownership. The C++ side + is responsible for managing the object’s lifetime and deallocating it + when it is no longer used. Warning: undefined behavior will ensue when + the C++ side deletes an object that is still referenced and used by + Python. */ + reference, + + /** This policy only applies to methods and properties. 
It references the + object without taking ownership similar to the above + return_value_policy::reference policy. In contrast to that policy, the + function or property’s implicit this argument (called the parent) is + considered to be the the owner of the return value (the child). + pybind11 then couples the lifetime of the parent to the child via a + reference relationship that ensures that the parent cannot be garbage + collected while Python is still using the child. More advanced + variations of this scheme are also possible using combinations of + return_value_policy::reference and the keep_alive call policy */ + reference_internal +}; + +NAMESPACE_BEGIN(detail) + +inline static constexpr int log2(size_t n, int k = 0) { return (n <= 1) ? k : log2(n >> 1, k + 1); } + +// Returns the size as a multiple of sizeof(void *), rounded up. +inline static constexpr size_t size_in_ptrs(size_t s) { return 1 + ((s - 1) >> log2(sizeof(void *))); } + +/** + * The space to allocate for simple layout instance holders (see below) in multiple of the size of + * a pointer (e.g. 2 means 16 bytes on 64-bit architectures). The default is the minimum required + * to holder either a std::unique_ptr or std::shared_ptr (which is almost always + * sizeof(std::shared_ptr)). + */ +constexpr size_t instance_simple_holder_in_ptrs() { + static_assert(sizeof(std::shared_ptr) >= sizeof(std::unique_ptr), + "pybind assumes std::shared_ptrs are at least as big as std::unique_ptrs"); + return size_in_ptrs(sizeof(std::shared_ptr)); +} + +// Forward declarations +struct type_info; +struct value_and_holder; + +struct nonsimple_values_and_holders { + void **values_and_holders; + uint8_t *status; +}; + +/// The 'instance' type which needs to be standard layout (need to be able to use 'offsetof') +struct instance { + PyObject_HEAD + /// Storage for pointers and holder; see simple_layout, below, for a description + union { + void *simple_value_holder[1 + instance_simple_holder_in_ptrs()]; + nonsimple_values_and_holders nonsimple; + }; + /// Weak references + PyObject *weakrefs; + /// If true, the pointer is owned which means we're free to manage it with a holder. + bool owned : 1; + /** + * An instance has two possible value/holder layouts. + * + * Simple layout (when this flag is true), means the `simple_value_holder` is set with a pointer + * and the holder object governing that pointer, i.e. [val1*][holder]. This layout is applied + * whenever there is no python-side multiple inheritance of bound C++ types *and* the type's + * holder will fit in the default space (which is large enough to hold either a std::unique_ptr + * or std::shared_ptr). + * + * Non-simple layout applies when using custom holders that require more space than `shared_ptr` + * (which is typically the size of two pointers), or when multiple inheritance is used on the + * python side. Non-simple layout allocates the required amount of memory to have multiple + * bound C++ classes as parents. Under this layout, `nonsimple.values_and_holders` is set to a + * pointer to allocated space of the required space to hold a sequence of value pointers and + * holders followed `status`, a set of bit flags (1 byte each), i.e. + * [val1*][holder1][val2*][holder2]...[bb...] where each [block] is rounded up to a multiple of + * `sizeof(void *)`. `nonsimple.status` is, for convenience, a pointer to the + * beginning of the [bb...] block (but not independently allocated). 
+ * + * Status bits indicate whether the associated holder is constructed (& + * status_holder_constructed) and whether the value pointer is registered (& + * status_instance_registered) in `registered_instances`. + */ + bool simple_layout : 1; + /// For simple layout, tracks whether the holder has been constructed + bool simple_holder_constructed : 1; + /// For simple layout, tracks whether the instance is registered in `registered_instances` + bool simple_instance_registered : 1; + /// If true, get_internals().patients has an entry for this object + bool has_patients : 1; + + /// Initializes all of the above type/values/holders data (but not the instance values themselves) + void allocate_layout(); + + /// Destroys/deallocates all of the above + void deallocate_layout(); + + /// Returns the value_and_holder wrapper for the given type (or the first, if `find_type` + /// omitted). Returns a default-constructed (with `.inst = nullptr`) object on failure if + /// `throw_if_missing` is false. + value_and_holder get_value_and_holder(const type_info *find_type = nullptr, bool throw_if_missing = true); + + /// Bit values for the non-simple status flags + static constexpr uint8_t status_holder_constructed = 1; + static constexpr uint8_t status_instance_registered = 2; +}; + +static_assert(std::is_standard_layout::value, "Internal error: `pybind11::detail::instance` is not standard layout!"); + +/// from __cpp_future__ import (convenient aliases from C++14/17) +#if defined(PYBIND11_CPP14) && (!defined(_MSC_VER) || _MSC_VER >= 1910) +using std::enable_if_t; +using std::conditional_t; +using std::remove_cv_t; +using std::remove_reference_t; +#else +template using enable_if_t = typename std::enable_if::type; +template using conditional_t = typename std::conditional::type; +template using remove_cv_t = typename std::remove_cv::type; +template using remove_reference_t = typename std::remove_reference::type; +#endif + +/// Index sequences +#if defined(PYBIND11_CPP14) +using std::index_sequence; +using std::make_index_sequence; +#else +template struct index_sequence { }; +template struct make_index_sequence_impl : make_index_sequence_impl { }; +template struct make_index_sequence_impl <0, S...> { typedef index_sequence type; }; +template using make_index_sequence = typename make_index_sequence_impl::type; +#endif + +/// Make an index sequence of the indices of true arguments +template struct select_indices_impl { using type = ISeq; }; +template struct select_indices_impl, I, B, Bs...> + : select_indices_impl, index_sequence>, I + 1, Bs...> {}; +template using select_indices = typename select_indices_impl, 0, Bs...>::type; + +/// Backports of std::bool_constant and std::negation to accommodate older compilers +template using bool_constant = std::integral_constant; +template struct negation : bool_constant { }; + +template struct void_t_impl { using type = void; }; +template using void_t = typename void_t_impl::type; + +/// Compile-time all/any/none of that check the boolean value of all template types +#if defined(__cpp_fold_expressions) && !(defined(_MSC_VER) && (_MSC_VER < 1916)) +template using all_of = bool_constant<(Ts::value && ...)>; +template using any_of = bool_constant<(Ts::value || ...)>; +#elif !defined(_MSC_VER) +template struct bools {}; +template using all_of = std::is_same< + bools, + bools>; +template using any_of = negation...>>; +#else +// MSVC has trouble with the above, but supports std::conjunction, which we can use instead (albeit +// at a slight loss of compilation efficiency). 
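+//
+// Usage sketch (illustrative, not part of this header): whichever backend is selected,
+// these traits fold a pack of boolean type traits into a single constant, e.g.
+//
+//     static_assert(all_of<std::is_integral<int>, std::is_signed<int>>::value, "");
+//     static_assert(any_of<std::is_pointer<int>, std::is_integral<int>>::value, "");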
+template using all_of = std::conjunction; +template using any_of = std::disjunction; +#endif +template using none_of = negation>; + +template class... Predicates> using satisfies_all_of = all_of...>; +template class... Predicates> using satisfies_any_of = any_of...>; +template class... Predicates> using satisfies_none_of = none_of...>; + +/// Strip the class from a method type +template struct remove_class { }; +template struct remove_class { typedef R type(A...); }; +template struct remove_class { typedef R type(A...); }; + +/// Helper template to strip away type modifiers +template struct intrinsic_type { typedef T type; }; +template struct intrinsic_type { typedef typename intrinsic_type::type type; }; +template struct intrinsic_type { typedef typename intrinsic_type::type type; }; +template struct intrinsic_type { typedef typename intrinsic_type::type type; }; +template struct intrinsic_type { typedef typename intrinsic_type::type type; }; +template struct intrinsic_type { typedef typename intrinsic_type::type type; }; +template struct intrinsic_type { typedef typename intrinsic_type::type type; }; +template using intrinsic_t = typename intrinsic_type::type; + +/// Helper type to replace 'void' in some expressions +struct void_type { }; + +/// Helper template which holds a list of types +template struct type_list { }; + +/// Compile-time integer sum +#ifdef __cpp_fold_expressions +template constexpr size_t constexpr_sum(Ts... ns) { return (0 + ... + size_t{ns}); } +#else +constexpr size_t constexpr_sum() { return 0; } +template +constexpr size_t constexpr_sum(T n, Ts... ns) { return size_t{n} + constexpr_sum(ns...); } +#endif + +NAMESPACE_BEGIN(constexpr_impl) +/// Implementation details for constexpr functions +constexpr int first(int i) { return i; } +template +constexpr int first(int i, T v, Ts... vs) { return v ? i : first(i + 1, vs...); } + +constexpr int last(int /*i*/, int result) { return result; } +template +constexpr int last(int i, int result, T v, Ts... vs) { return last(i + 1, v ? i : result, vs...); } +NAMESPACE_END(constexpr_impl) + +/// Return the index of the first type in Ts which satisfies Predicate. Returns sizeof...(Ts) if +/// none match. +template class Predicate, typename... Ts> +constexpr int constexpr_first() { return constexpr_impl::first(0, Predicate::value...); } + +/// Return the index of the last type in Ts which satisfies Predicate, or -1 if none match. +template class Predicate, typename... Ts> +constexpr int constexpr_last() { return constexpr_impl::last(0, -1, Predicate::value...); } + +/// Return the Nth element from the parameter pack +template +struct pack_element { using type = typename pack_element::type; }; +template +struct pack_element<0, T, Ts...> { using type = T; }; + +/// Return the one and only type which matches the predicate, or Default if none match. +/// If more than one type matches the predicate, fail at compile-time. +template class Predicate, typename Default, typename... Ts> +struct exactly_one { + static constexpr auto found = constexpr_sum(Predicate::value...); + static_assert(found <= 1, "Found more than one type matching the predicate"); + + static constexpr auto index = found ? constexpr_first() : 0; + using type = conditional_t::type, Default>; +}; +template class P, typename Default> +struct exactly_one { using type = Default; }; + +template class Predicate, typename Default, typename... 
Ts> +using exactly_one_t = typename exactly_one::type; + +/// Defer the evaluation of type T until types Us are instantiated +template struct deferred_type { using type = T; }; +template using deferred_t = typename deferred_type::type; + +/// Like is_base_of, but requires a strict base (i.e. `is_strict_base_of::value == false`, +/// unlike `std::is_base_of`) +template using is_strict_base_of = bool_constant< + std::is_base_of::value && !std::is_same::value>; + +/// Like is_base_of, but also requires that the base type is accessible (i.e. that a Derived pointer +/// can be converted to a Base pointer) +template using is_accessible_base_of = bool_constant< + std::is_base_of::value && std::is_convertible::value>; + +template class Base> +struct is_template_base_of_impl { + template static std::true_type check(Base *); + static std::false_type check(...); +}; + +/// Check if a template is the base of a type. For example: +/// `is_template_base_of` is true if `struct T : Base {}` where U can be anything +template class Base, typename T> +#if !defined(_MSC_VER) +using is_template_base_of = decltype(is_template_base_of_impl::check((intrinsic_t*)nullptr)); +#else // MSVC2015 has trouble with decltype in template aliases +struct is_template_base_of : decltype(is_template_base_of_impl::check((intrinsic_t*)nullptr)) { }; +#endif + +/// Check if T is an instantiation of the template `Class`. For example: +/// `is_instantiation` is true if `T == shared_ptr` where U can be anything. +template class Class, typename T> +struct is_instantiation : std::false_type { }; +template class Class, typename... Us> +struct is_instantiation> : std::true_type { }; + +/// Check if T is std::shared_ptr where U can be anything +template using is_shared_ptr = is_instantiation; + +/// Check if T looks like an input iterator +template struct is_input_iterator : std::false_type {}; +template +struct is_input_iterator()), decltype(++std::declval())>> + : std::true_type {}; + +template using is_function_pointer = bool_constant< + std::is_pointer::value && std::is_function::type>::value>; + +template struct strip_function_object { + using type = typename remove_class::type; +}; + +// Extracts the function signature from a function, function pointer or lambda. +template > +using function_signature_t = conditional_t< + std::is_function::value, + F, + typename conditional_t< + std::is_pointer::value || std::is_member_pointer::value, + std::remove_pointer, + strip_function_object + >::type +>; + +/// Returns true if the type looks like a lambda: that is, isn't a function, pointer or member +/// pointer. Note that this can catch all sorts of other things, too; this is intended to be used +/// in a place where passing a lambda makes sense. +template using is_lambda = satisfies_none_of, + std::is_function, std::is_pointer, std::is_member_pointer>; + +/// Ignore that a variable is unused in compiler warnings +inline void ignore_unused(const int *) { } + +/// Apply a function over each element of a parameter pack +#ifdef __cpp_fold_expressions +#define PYBIND11_EXPAND_SIDE_EFFECTS(PATTERN) (((PATTERN), void()), ...) 
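+// Usage sketch (illustrative): the macro evaluates an expression once per element of a
+// parameter pack, purely for its side effects, e.g.
+//
+//     template <typename... Args> void print_all(const Args &...args) {
+//         PYBIND11_EXPAND_SIDE_EFFECTS(std::cout << args << '\n');
+//     }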
+#else +using expand_side_effects = bool[]; +#define PYBIND11_EXPAND_SIDE_EFFECTS(PATTERN) pybind11::detail::expand_side_effects{ ((PATTERN), void(), false)..., false } +#endif + +NAMESPACE_END(detail) + +/// C++ bindings of builtin Python exceptions +class builtin_exception : public std::runtime_error { +public: + using std::runtime_error::runtime_error; + /// Set the error using the Python C API + virtual void set_error() const = 0; +}; + +#define PYBIND11_RUNTIME_EXCEPTION(name, type) \ + class name : public builtin_exception { public: \ + using builtin_exception::builtin_exception; \ + name() : name("") { } \ + void set_error() const override { PyErr_SetString(type, what()); } \ + }; + +PYBIND11_RUNTIME_EXCEPTION(stop_iteration, PyExc_StopIteration) +PYBIND11_RUNTIME_EXCEPTION(index_error, PyExc_IndexError) +PYBIND11_RUNTIME_EXCEPTION(key_error, PyExc_KeyError) +PYBIND11_RUNTIME_EXCEPTION(value_error, PyExc_ValueError) +PYBIND11_RUNTIME_EXCEPTION(type_error, PyExc_TypeError) +PYBIND11_RUNTIME_EXCEPTION(buffer_error, PyExc_BufferError) +PYBIND11_RUNTIME_EXCEPTION(import_error, PyExc_ImportError) +PYBIND11_RUNTIME_EXCEPTION(cast_error, PyExc_RuntimeError) /// Thrown when pybind11::cast or handle::call fail due to a type casting error +PYBIND11_RUNTIME_EXCEPTION(reference_cast_error, PyExc_RuntimeError) /// Used internally + +[[noreturn]] PYBIND11_NOINLINE inline void pybind11_fail(const char *reason) { throw std::runtime_error(reason); } +[[noreturn]] PYBIND11_NOINLINE inline void pybind11_fail(const std::string &reason) { throw std::runtime_error(reason); } + +template struct format_descriptor { }; + +NAMESPACE_BEGIN(detail) +// Returns the index of the given type in the type char array below, and in the list in numpy.h +// The order here is: bool; 8 ints ((signed,unsigned)x(8,16,32,64)bits); float,double,long double; +// complex float,double,long double. Note that the long double types only participate when long +// double is actually longer than double (it isn't under MSVC). +// NB: not only the string below but also complex.h and numpy.h rely on this order. +template struct is_fmt_numeric { static constexpr bool value = false; }; +template struct is_fmt_numeric::value>> { + static constexpr bool value = true; + static constexpr int index = std::is_same::value ? 0 : 1 + ( + std::is_integral::value ? detail::log2(sizeof(T))*2 + std::is_unsigned::value : 8 + ( + std::is_same::value ? 1 : std::is_same::value ? 
2 : 0)); +}; +NAMESPACE_END(detail) + +template struct format_descriptor::value>> { + static constexpr const char c = "?bBhHiIqQfdg"[detail::is_fmt_numeric::index]; + static constexpr const char value[2] = { c, '\0' }; + static std::string format() { return std::string(1, c); } +}; + +#if !defined(PYBIND11_CPP17) + +template constexpr const char format_descriptor< + T, detail::enable_if_t::value>>::value[2]; + +#endif + +/// RAII wrapper that temporarily clears any Python error state +struct error_scope { + PyObject *type, *value, *trace; + error_scope() { PyErr_Fetch(&type, &value, &trace); } + ~error_scope() { PyErr_Restore(type, value, trace); } +}; + +/// Dummy destructor wrapper that can be used to expose classes with a private destructor +struct nodelete { template void operator()(T*) { } }; + +NAMESPACE_BEGIN(detail) +template +struct overload_cast_impl { + constexpr overload_cast_impl() {} // MSVC 2015 needs this + + template + constexpr auto operator()(Return (*pf)(Args...)) const noexcept + -> decltype(pf) { return pf; } + + template + constexpr auto operator()(Return (Class::*pmf)(Args...), std::false_type = {}) const noexcept + -> decltype(pmf) { return pmf; } + + template + constexpr auto operator()(Return (Class::*pmf)(Args...) const, std::true_type) const noexcept + -> decltype(pmf) { return pmf; } +}; +NAMESPACE_END(detail) + +// overload_cast requires variable templates: C++14 +#if defined(PYBIND11_CPP14) +#define PYBIND11_OVERLOAD_CAST 1 +/// Syntax sugar for resolving overloaded function pointers: +/// - regular: static_cast(&Class::func) +/// - sweet: overload_cast(&Class::func) +template +static constexpr detail::overload_cast_impl overload_cast = {}; +// MSVC 2015 only accepts this particular initialization syntax for this variable template. +#endif + +/// Const member function selector for overload_cast +/// - regular: static_cast(&Class::func) +/// - sweet: overload_cast(&Class::func, const_) +static constexpr auto const_ = std::true_type{}; + +#if !defined(PYBIND11_CPP14) // no overload_cast: providing something that static_assert-fails: +template struct overload_cast { + static_assert(detail::deferred_t::value, + "pybind11::overload_cast<...> requires compiling in C++14 mode"); +}; +#endif // overload_cast + +NAMESPACE_BEGIN(detail) + +// Adaptor for converting arbitrary container arguments into a vector; implicitly convertible from +// any standard container (or C-style array) supporting std::begin/std::end, any singleton +// arithmetic type (if T is arithmetic), or explicitly constructible from an iterator pair. +template +class any_container { + std::vector v; +public: + any_container() = default; + + // Can construct from a pair of iterators + template ::value>> + any_container(It first, It last) : v(first, last) { } + + // Implicit conversion constructor from any arbitrary container type with values convertible to T + template ())), T>::value>> + any_container(const Container &c) : any_container(std::begin(c), std::end(c)) { } + + // initializer_list's aren't deducible, so don't get matched by the above template; we need this + // to explicitly allow implicit conversion from one: + template ::value>> + any_container(const std::initializer_list &c) : any_container(c.begin(), c.end()) { } + + // Avoid copying if given an rvalue vector of the correct type. 
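+// Usage sketch (illustrative; `set_shape` is a hypothetical helper): any_container<ssize_t>
+// lets callers pass a shape as a standard container, a C array, or a braced list
+// interchangeably, e.g.
+//
+//     void set_shape(any_container<ssize_t> shape);
+//     set_shape({2, 3, 4});                    // from an initializer_list
+//     set_shape(std::vector<ssize_t>{2, 3});   // an rvalue vector is moved straight in,
+//                                              // via the constructor below: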
+ any_container(std::vector &&v) : v(std::move(v)) { } + + // Moves the vector out of an rvalue any_container + operator std::vector &&() && { return std::move(v); } + + // Dereferencing obtains a reference to the underlying vector + std::vector &operator*() { return v; } + const std::vector &operator*() const { return v; } + + // -> lets you call methods on the underlying vector + std::vector *operator->() { return &v; } + const std::vector *operator->() const { return &v; } +}; + +NAMESPACE_END(detail) + + + +NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/cviruntime/python/include/pybind11/include/pybind11/detail/descr.h b/cviruntime/python/include/pybind11/include/pybind11/detail/descr.h new file mode 100644 index 000000000..8d404e534 --- /dev/null +++ b/cviruntime/python/include/pybind11/include/pybind11/detail/descr.h @@ -0,0 +1,100 @@ +/* + pybind11/detail/descr.h: Helper type for concatenating type signatures at compile time + + Copyright (c) 2016 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. +*/ + +#pragma once + +#include "common.h" + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +NAMESPACE_BEGIN(detail) + +#if !defined(_MSC_VER) +# define PYBIND11_DESCR_CONSTEXPR static constexpr +#else +# define PYBIND11_DESCR_CONSTEXPR const +#endif + +/* Concatenate type signatures at compile time */ +template +struct descr { + char text[N + 1]; + + constexpr descr() : text{'\0'} { } + constexpr descr(char const (&s)[N+1]) : descr(s, make_index_sequence()) { } + + template + constexpr descr(char const (&s)[N+1], index_sequence) : text{s[Is]..., '\0'} { } + + template + constexpr descr(char c, Chars... cs) : text{c, static_cast(cs)..., '\0'} { } + + static constexpr std::array types() { + return {{&typeid(Ts)..., nullptr}}; + } +}; + +template +constexpr descr plus_impl(const descr &a, const descr &b, + index_sequence, index_sequence) { + return {a.text[Is1]..., b.text[Is2]...}; +} + +template +constexpr descr operator+(const descr &a, const descr &b) { + return plus_impl(a, b, make_index_sequence(), make_index_sequence()); +} + +template +constexpr descr _(char const(&text)[N]) { return descr(text); } +constexpr descr<0> _(char const(&)[1]) { return {}; } + +template struct int_to_str : int_to_str { }; +template struct int_to_str<0, Digits...> { + static constexpr auto digits = descr(('0' + Digits)...); +}; + +// Ternary description (like std::conditional) +template +constexpr enable_if_t> _(char const(&text1)[N1], char const(&)[N2]) { + return _(text1); +} +template +constexpr enable_if_t> _(char const(&)[N1], char const(&text2)[N2]) { + return _(text2); +} + +template +constexpr enable_if_t _(const T1 &d, const T2 &) { return d; } +template +constexpr enable_if_t _(const T1 &, const T2 &d) { return d; } + +template auto constexpr _() -> decltype(int_to_str::digits) { + return int_to_str::digits; +} + +template constexpr descr<1, Type> _() { return {'%'}; } + +constexpr descr<0> concat() { return {}; } + +template +constexpr descr concat(const descr &descr) { return descr; } + +template +constexpr auto concat(const descr &d, const Args &...args) + -> decltype(std::declval>() + concat(args...)) { + return d + _(", ") + concat(args...); +} + +template +constexpr descr type_descr(const descr &descr) { + return _("{") + descr + _("}"); +} + +NAMESPACE_END(detail) +NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/cviruntime/python/include/pybind11/include/pybind11/detail/init.h 
b/cviruntime/python/include/pybind11/include/pybind11/detail/init.h new file mode 100644 index 000000000..acfe00bdb --- /dev/null +++ b/cviruntime/python/include/pybind11/include/pybind11/detail/init.h @@ -0,0 +1,335 @@ +/* + pybind11/detail/init.h: init factory function implementation and support code. + + Copyright (c) 2017 Jason Rhinelander + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. +*/ + +#pragma once + +#include "class.h" + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +NAMESPACE_BEGIN(detail) + +template <> +class type_caster { +public: + bool load(handle h, bool) { + value = reinterpret_cast(h.ptr()); + return true; + } + + template using cast_op_type = value_and_holder &; + operator value_and_holder &() { return *value; } + static constexpr auto name = _(); + +private: + value_and_holder *value = nullptr; +}; + +NAMESPACE_BEGIN(initimpl) + +inline void no_nullptr(void *ptr) { + if (!ptr) throw type_error("pybind11::init(): factory function returned nullptr"); +} + +// Implementing functions for all forms of py::init<...> and py::init(...) +template using Cpp = typename Class::type; +template using Alias = typename Class::type_alias; +template using Holder = typename Class::holder_type; + +template using is_alias_constructible = std::is_constructible, Cpp &&>; + +// Takes a Cpp pointer and returns true if it actually is a polymorphic Alias instance. +template = 0> +bool is_alias(Cpp *ptr) { + return dynamic_cast *>(ptr) != nullptr; +} +// Failing fallback version of the above for a no-alias class (always returns false) +template +constexpr bool is_alias(void *) { return false; } + +// Constructs and returns a new object; if the given arguments don't map to a constructor, we fall +// back to brace aggregate initiailization so that for aggregate initialization can be used with +// py::init, e.g. `py::init` to initialize a `struct T { int a; int b; }`. For +// non-aggregate types, we need to use an ordinary T(...) constructor (invoking as `T{...}` usually +// works, but will not do the expected thing when `T` has an `initializer_list` constructor). +template ::value, int> = 0> +inline Class *construct_or_initialize(Args &&...args) { return new Class(std::forward(args)...); } +template ::value, int> = 0> +inline Class *construct_or_initialize(Args &&...args) { return new Class{std::forward(args)...}; } + +// Attempts to constructs an alias using a `Alias(Cpp &&)` constructor. This allows types with +// an alias to provide only a single Cpp factory function as long as the Alias can be +// constructed from an rvalue reference of the base Cpp type. This means that Alias classes +// can, when appropriate, simply define a `Alias(Cpp &&)` constructor rather than needing to +// inherit all the base class constructors. +template +void construct_alias_from_cpp(std::true_type /*is_alias_constructible*/, + value_and_holder &v_h, Cpp &&base) { + v_h.value_ptr() = new Alias(std::move(base)); +} +template +[[noreturn]] void construct_alias_from_cpp(std::false_type /*!is_alias_constructible*/, + value_and_holder &, Cpp &&) { + throw type_error("pybind11::init(): unable to convert returned instance to required " + "alias class: no `Alias(Class &&)` constructor available"); +} + +// Error-generating fallback for factories that don't match one of the below construction +// mechanisms. +template +void construct(...) 
{ + static_assert(!std::is_same::value /* always false */, + "pybind11::init(): init function must return a compatible pointer, " + "holder, or value"); +} + +// Pointer return v1: the factory function returns a class pointer for a registered class. +// If we don't need an alias (because this class doesn't have one, or because the final type is +// inherited on the Python side) we can simply take over ownership. Otherwise we need to try to +// construct an Alias from the returned base instance. +template +void construct(value_and_holder &v_h, Cpp *ptr, bool need_alias) { + no_nullptr(ptr); + if (Class::has_alias && need_alias && !is_alias(ptr)) { + // We're going to try to construct an alias by moving the cpp type. Whether or not + // that succeeds, we still need to destroy the original cpp pointer (either the + // moved away leftover, if the alias construction works, or the value itself if we + // throw an error), but we can't just call `delete ptr`: it might have a special + // deleter, or might be shared_from_this. So we construct a holder around it as if + // it was a normal instance, then steal the holder away into a local variable; thus + // the holder and destruction happens when we leave the C++ scope, and the holder + // class gets to handle the destruction however it likes. + v_h.value_ptr() = ptr; + v_h.set_instance_registered(true); // To prevent init_instance from registering it + v_h.type->init_instance(v_h.inst, nullptr); // Set up the holder + Holder temp_holder(std::move(v_h.holder>())); // Steal the holder + v_h.type->dealloc(v_h); // Destroys the moved-out holder remains, resets value ptr to null + v_h.set_instance_registered(false); + + construct_alias_from_cpp(is_alias_constructible{}, v_h, std::move(*ptr)); + } else { + // Otherwise the type isn't inherited, so we don't need an Alias + v_h.value_ptr() = ptr; + } +} + +// Pointer return v2: a factory that always returns an alias instance ptr. We simply take over +// ownership of the pointer. +template = 0> +void construct(value_and_holder &v_h, Alias *alias_ptr, bool) { + no_nullptr(alias_ptr); + v_h.value_ptr() = static_cast *>(alias_ptr); +} + +// Holder return: copy its pointer, and move or copy the returned holder into the new instance's +// holder. This also handles types like std::shared_ptr and std::unique_ptr where T is a +// derived type (through those holder's implicit conversion from derived class holder constructors). +template +void construct(value_and_holder &v_h, Holder holder, bool need_alias) { + auto *ptr = holder_helper>::get(holder); + // If we need an alias, check that the held pointer is actually an alias instance + if (Class::has_alias && need_alias && !is_alias(ptr)) + throw type_error("pybind11::init(): construction failed: returned holder-wrapped instance " + "is not an alias instance"); + + v_h.value_ptr() = ptr; + v_h.type->init_instance(v_h.inst, &holder); +} + +// return-by-value version 1: returning a cpp class by value. If the class has an alias and an +// alias is required the alias must have an `Alias(Cpp &&)` constructor so that we can construct +// the alias from the base when needed (i.e. because of Python-side inheritance). When we don't +// need it, we simply move-construct the cpp value into a new instance. 
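+//
+// Usage sketch (illustrative; `Database` is a hypothetical bound class): this is the
+// path taken by a py::init() factory that returns by value, e.g.
+//
+//     py::class_<Database>(m, "Database")
+//         .def(py::init([](std::string url) { return Database(url); }));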
+template +void construct(value_and_holder &v_h, Cpp &&result, bool need_alias) { + static_assert(std::is_move_constructible>::value, + "pybind11::init() return-by-value factory function requires a movable class"); + if (Class::has_alias && need_alias) + construct_alias_from_cpp(is_alias_constructible{}, v_h, std::move(result)); + else + v_h.value_ptr() = new Cpp(std::move(result)); +} + +// return-by-value version 2: returning a value of the alias type itself. We move-construct an +// Alias instance (even if no the python-side inheritance is involved). The is intended for +// cases where Alias initialization is always desired. +template +void construct(value_and_holder &v_h, Alias &&result, bool) { + static_assert(std::is_move_constructible>::value, + "pybind11::init() return-by-alias-value factory function requires a movable alias class"); + v_h.value_ptr() = new Alias(std::move(result)); +} + +// Implementing class for py::init<...>() +template +struct constructor { + template = 0> + static void execute(Class &cl, const Extra&... extra) { + cl.def("__init__", [](value_and_holder &v_h, Args... args) { + v_h.value_ptr() = construct_or_initialize>(std::forward(args)...); + }, is_new_style_constructor(), extra...); + } + + template , Args...>::value, int> = 0> + static void execute(Class &cl, const Extra&... extra) { + cl.def("__init__", [](value_and_holder &v_h, Args... args) { + if (Py_TYPE(v_h.inst) == v_h.type->type) + v_h.value_ptr() = construct_or_initialize>(std::forward(args)...); + else + v_h.value_ptr() = construct_or_initialize>(std::forward(args)...); + }, is_new_style_constructor(), extra...); + } + + template , Args...>::value, int> = 0> + static void execute(Class &cl, const Extra&... extra) { + cl.def("__init__", [](value_and_holder &v_h, Args... args) { + v_h.value_ptr() = construct_or_initialize>(std::forward(args)...); + }, is_new_style_constructor(), extra...); + } +}; + +// Implementing class for py::init_alias<...>() +template struct alias_constructor { + template , Args...>::value, int> = 0> + static void execute(Class &cl, const Extra&... extra) { + cl.def("__init__", [](value_and_holder &v_h, Args... args) { + v_h.value_ptr() = construct_or_initialize>(std::forward(args)...); + }, is_new_style_constructor(), extra...); + } +}; + +// Implementation class for py::init(Func) and py::init(Func, AliasFunc) +template , typename = function_signature_t> +struct factory; + +// Specialization for py::init(Func) +template +struct factory { + remove_reference_t class_factory; + + factory(Func &&f) : class_factory(std::forward(f)) { } + + // The given class either has no alias or has no separate alias factory; + // this always constructs the class itself. If the class is registered with an alias + // type and an alias instance is needed (i.e. because the final type is a Python class + // inheriting from the C++ type) the returned value needs to either already be an alias + // instance, or the alias needs to be constructible from a `Class &&` argument. + template + void execute(Class &cl, const Extra &...extra) && { + #if defined(PYBIND11_CPP14) + cl.def("__init__", [func = std::move(class_factory)] + #else + auto &func = class_factory; + cl.def("__init__", [func] + #endif + (value_and_holder &v_h, Args... 
+            construct<Class>(v_h, func(std::forward<Args>(args)...),
+                             Py_TYPE(v_h.inst) != v_h.type->type);
+        }, is_new_style_constructor(), extra...);
+    }
+};
+
+// Specialization for py::init(Func, AliasFunc)
+template <typename CFunc, typename AFunc,
+          typename CReturn, typename... CArgs, typename AReturn, typename... AArgs>
+struct factory<CFunc, AFunc, CReturn(CArgs...), AReturn(AArgs...)> {
+    static_assert(sizeof...(CArgs) == sizeof...(AArgs),
+        "pybind11::init(class_factory, alias_factory): class and alias factories "
+        "must have identical argument signatures");
+    static_assert(all_of<std::is_same<CArgs, AArgs>...>::value,
+        "pybind11::init(class_factory, alias_factory): class and alias factories "
+        "must have identical argument signatures");
+
+    remove_reference_t<CFunc> class_factory;
+    remove_reference_t<AFunc> alias_factory;
+
+    factory(CFunc &&c, AFunc &&a)
+        : class_factory(std::forward<CFunc>(c)), alias_factory(std::forward<AFunc>(a)) { }
+
+    // The class factory is called when the `self` type passed to `__init__` is the direct
+    // class (i.e. not inherited), the alias factory when `self` is a Python-side subtype.
+    template <typename Class, typename... Extra>
+    void execute(Class &cl, const Extra&... extra) && {
+        static_assert(Class::has_alias, "The two-argument version of `py::init()` can "
+                                        "only be used if the class has an alias");
+        #if defined(PYBIND11_CPP14)
+        cl.def("__init__", [class_func = std::move(class_factory), alias_func = std::move(alias_factory)]
+        #else
+        auto &class_func = class_factory;
+        auto &alias_func = alias_factory;
+        cl.def("__init__", [class_func, alias_func]
+        #endif
+                (value_and_holder &v_h, CArgs... args) {
+            if (Py_TYPE(v_h.inst) == v_h.type->type)
+                // If the instance type equals the registered type we don't have inheritance, so
+                // don't need the alias and can construct using the class function:
+                construct<Class>(v_h, class_func(std::forward<CArgs>(args)...), false);
+            else
+                construct<Class>(v_h, alias_func(std::forward<CArgs>(args)...), true);
+        }, is_new_style_constructor(), extra...);
+    }
+};
+
+/// Set just the C++ state. Same as `__init__`.
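+///
+/// These `setstate` helpers back `py::pickle()`. A hedged usage sketch, with a
+/// hypothetical `Pickleable` class exposing an `int` constructor and `value()`:
+///
+///     py::class_<Pickleable>(m, "Pickleable")
+///         .def(py::init<int>())
+///         .def(py::pickle(
+///             [](const Pickleable &p) { return py::make_tuple(p.value()); }, // __getstate__
+///             [](py::tuple t) { return Pickleable(t[0].cast<int>()); }));    // __setstate__
+///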
+template <typename Class, typename T>
+void setstate(value_and_holder &v_h, T &&result, bool need_alias) {
+    construct<Class>(v_h, std::forward<T>(result), need_alias);
+}
+
+/// Set both the C++ and Python states
+template <typename Class, typename T, typename O,
+          enable_if_t<std::is_base_of<object, intrinsic_t<O>>::value, int> = 0>
+void setstate(value_and_holder &v_h, std::pair<T, O> &&result, bool need_alias) {
+    construct<Class>(v_h, std::move(result.first), need_alias);
+    setattr((PyObject *) v_h.inst, "__dict__", result.second);
+}
+
+/// Implementation for py::pickle(GetState, SetState)
+template <typename Get, typename Set,
+          typename = function_signature_t<Get>, typename = function_signature_t<Set>>
+struct pickle_factory;
+
+template <typename Get, typename Set,
+          typename RetState, typename Self, typename NewInstance, typename ArgState>
+struct pickle_factory<Get, Set, RetState(Self), NewInstance(ArgState)> {
+    static_assert(std::is_same<intrinsic_t<RetState>, intrinsic_t<ArgState>>::value,
+                  "The type returned by `__getstate__` must be the same "
+                  "as the argument accepted by `__setstate__`");
+
+    remove_reference_t<Get> get;
+    remove_reference_t<Set> set;
+
+    pickle_factory(Get get, Set set)
+        : get(std::forward<Get>(get)), set(std::forward<Set>(set)) { }
+
+    template <typename Class, typename... Extra>
+    void execute(Class &cl, const Extra &...extra) && {
+        cl.def("__getstate__", std::move(get));
+
+#if defined(PYBIND11_CPP14)
+        cl.def("__setstate__", [func = std::move(set)]
+#else
+        auto &func = set;
+        cl.def("__setstate__", [func]
+#endif
+        (value_and_holder &v_h, ArgState state) {
+            setstate<Class>(v_h, func(std::forward<ArgState>(state)),
+                            Py_TYPE(v_h.inst) != v_h.type->type);
+        }, is_new_style_constructor(), extra...);
+    }
+};
+
+NAMESPACE_END(initimpl)
+NAMESPACE_END(detail)
+NAMESPACE_END(pybind11)
diff --git a/cviruntime/python/include/pybind11/include/pybind11/detail/internals.h b/cviruntime/python/include/pybind11/include/pybind11/detail/internals.h
new file mode 100644
index 000000000..6224dfb22
--- /dev/null
+++ b/cviruntime/python/include/pybind11/include/pybind11/detail/internals.h
@@ -0,0 +1,349 @@
+/*
+    pybind11/detail/internals.h: Internal data structure and related functions
+
+    Copyright (c) 2017 Wenzel Jakob
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "../pytypes.h"
+
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+NAMESPACE_BEGIN(detail)
+// Forward declarations
+inline PyTypeObject *make_static_property_type();
+inline PyTypeObject *make_default_metaclass();
+inline PyObject *make_object_base_type(PyTypeObject *metaclass);
+
+// The old Python Thread Local Storage (TLS) API is deprecated in Python 3.7 in favor of the new
+// Thread Specific Storage (TSS) API.
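+// Roughly, the lifecycle behind the macros below (a sketch; `key` is whatever
+// PYBIND11_TLS_KEY_INIT declared): PYBIND11_TLS_REPLACE_VALUE(key, value) stores a
+// value, PYBIND11_TLS_GET_VALUE(key) reads it back on the same thread, and
+// PYBIND11_TLS_FREE(key) releases the key (a no-op before Python 3.7).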
+#if PY_VERSION_HEX >= 0x03070000
+#  define PYBIND11_TLS_KEY_INIT(var) Py_tss_t *var = nullptr
+#  define PYBIND11_TLS_GET_VALUE(key) PyThread_tss_get((key))
+#  define PYBIND11_TLS_REPLACE_VALUE(key, value) PyThread_tss_set((key), (value))
+#  define PYBIND11_TLS_DELETE_VALUE(key) PyThread_tss_set((key), nullptr)
+#  define PYBIND11_TLS_FREE(key) PyThread_tss_free(key)
+#else
+    // Usually an int but a long on Cygwin64 with Python 3.x
+#  define PYBIND11_TLS_KEY_INIT(var) decltype(PyThread_create_key()) var = 0
+#  define PYBIND11_TLS_GET_VALUE(key) PyThread_get_key_value((key))
+#  if PY_MAJOR_VERSION < 3
+#    define PYBIND11_TLS_DELETE_VALUE(key) \
+         PyThread_delete_key_value(key)
+#    define PYBIND11_TLS_REPLACE_VALUE(key, value) \
+         do { \
+             PyThread_delete_key_value((key)); \
+             PyThread_set_key_value((key), (value)); \
+         } while (false)
+#  else
+#    define PYBIND11_TLS_DELETE_VALUE(key) \
+         PyThread_set_key_value((key), nullptr)
+#    define PYBIND11_TLS_REPLACE_VALUE(key, value) \
+         PyThread_set_key_value((key), (value))
+#  endif
+#  define PYBIND11_TLS_FREE(key) (void)key
+#endif
+
+// Python loads modules by default with dlopen with the RTLD_LOCAL flag; under libc++ and possibly
+// other STLs, this means `typeid(A)` from one module won't equal `typeid(A)` from another module
+// even when `A` is the same, non-hidden-visibility type (e.g. from a common include). Under
+// libstdc++, this doesn't happen: equality and the type_index hash are based on the type name,
+// which works. If not under a known-good stl, provide our own name-based hash and equality
+// functions that use the type name.
+#if defined(__GLIBCXX__)
+inline bool same_type(const std::type_info &lhs, const std::type_info &rhs) { return lhs == rhs; }
+using type_hash = std::hash<std::type_index>;
+using type_equal_to = std::equal_to<std::type_index>;
+#else
+inline bool same_type(const std::type_info &lhs, const std::type_info &rhs) {
+    return lhs.name() == rhs.name() || std::strcmp(lhs.name(), rhs.name()) == 0;
+}
+
+struct type_hash {
+    size_t operator()(const std::type_index &t) const {
+        size_t hash = 5381;
+        const char *ptr = t.name();
+        while (auto c = static_cast<unsigned char>(*ptr++))
+            hash = (hash * 33) ^ c;
+        return hash;
+    }
+};
+
+struct type_equal_to {
+    bool operator()(const std::type_index &lhs, const std::type_index &rhs) const {
+        return lhs.name() == rhs.name() || std::strcmp(lhs.name(), rhs.name()) == 0;
+    }
+};
+#endif
+
+template <typename value_type>
+using type_map = std::unordered_map<std::type_index, value_type, type_hash, type_equal_to>;
+
+struct overload_hash {
+    inline size_t operator()(const std::pair<const PyObject *, const char *>& v) const {
+        size_t value = std::hash<const void *>()(v.first);
+        value ^= std::hash<const void *>()(v.second) + 0x9e3779b9 + (value<<6) + (value>>2);
+        return value;
+    }
+};
+
+/// Internal data structure used to track registered instances and types.
+/// Whenever binary incompatible changes are made to this structure,
+/// `PYBIND11_INTERNALS_VERSION` must be incremented.
+struct internals {
+    type_map<type_info *> registered_types_cpp; // std::type_index -> pybind11's type information
+    std::unordered_map<PyTypeObject *, std::vector<type_info *>> registered_types_py; // PyTypeObject* -> base type_info(s)
+    std::unordered_multimap<const void *, instance*> registered_instances; // void * -> instance*
+    std::unordered_set<std::pair<const PyObject *, const char *>, overload_hash> inactive_overload_cache;
+    type_map<std::vector<bool (*)(PyObject *, void *&)>> direct_conversions;
+    std::unordered_map<const PyObject *, std::vector<PyObject *>> patients;
+    std::forward_list<void (*) (std::exception_ptr)> registered_exception_translators;
+    std::unordered_map<std::string, void *> shared_data; // Custom data to be shared across extensions
+    std::vector<PyObject *> loader_patient_stack; // Used by `loader_life_support`
+    std::forward_list<std::string> static_strings; // Stores the std::strings backing detail::c_str()
+    PyTypeObject *static_property_type;
+    PyTypeObject *default_metaclass;
+    PyObject *instance_base;
+#if defined(WITH_THREAD)
+    PYBIND11_TLS_KEY_INIT(tstate);
+    PyInterpreterState *istate = nullptr;
+    ~internals() {
+        // This destructor is called *after* Py_Finalize() in finalize_interpreter().
+        // That *SHOULD BE* fine. The following details what happens when PyThread_tss_free is called.
+        // PYBIND11_TLS_FREE is PyThread_tss_free on python 3.7+. On older python, it does nothing.
+        // PyThread_tss_free calls PyThread_tss_delete and PyMem_RawFree.
+        // PyThread_tss_delete just calls TlsFree (on Windows) or pthread_key_delete (on *NIX). Neither
+        // of those have anything to do with CPython internals.
+        // PyMem_RawFree *requires* that the `tstate` be allocated with the CPython allocator.
+        PYBIND11_TLS_FREE(tstate);
+    }
+#endif
+};
+
+/// Additional type information which does not fit into the PyTypeObject.
+/// Changes to this struct also require bumping `PYBIND11_INTERNALS_VERSION`.
+struct type_info {
+    PyTypeObject *type;
+    const std::type_info *cpptype;
+    size_t type_size, type_align, holder_size_in_ptrs;
+    void *(*operator_new)(size_t);
+    void (*init_instance)(instance *, const void *);
+    void (*dealloc)(value_and_holder &v_h);
+    std::vector<PyObject *(*)(PyObject *, PyTypeObject *)> implicit_conversions;
+    std::vector<std::pair<const std::type_info *, void *(*)(void *)>> implicit_casts;
+    std::vector<bool (*)(PyObject *, void *&)> *direct_conversions;
+    buffer_info *(*get_buffer)(PyObject *, void *) = nullptr;
+    void *get_buffer_data = nullptr;
+    void *(*module_local_load)(PyObject *, const type_info *) = nullptr;
+    /* A simple type never occurs as a (direct or indirect) parent
+     * of a class that makes use of multiple inheritance */
+    bool simple_type : 1;
+    /* True if there is no multiple inheritance in this type's inheritance tree */
+    bool simple_ancestors : 1;
+    /* for base vs derived holder_type checks */
+    bool default_holder : 1;
+    /* true if this is a type registered with py::module_local */
+    bool module_local : 1;
+};
+
+/// Tracks the `internals` and `type_info` ABI version independent of the main library version
+#define PYBIND11_INTERNALS_VERSION 4
+
+/// On MSVC, debug and release builds are not ABI-compatible!
+#if defined(_MSC_VER) && defined(_DEBUG)
+#  define PYBIND11_BUILD_TYPE "_debug"
+#else
+#  define PYBIND11_BUILD_TYPE ""
+#endif
+
+/// Let's assume that different compilers are ABI-incompatible.
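+/// With the tags defined below, the ID assembled from these pieces comes out as, e.g.
+/// (illustrative, GCC + libstdc++ + threads): "__pybind11_internals_v4_gcc_libstdcpp_cxxabi1011__",
+/// where the trailing ABI number varies with the toolchain.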
+#if defined(_MSC_VER) +# define PYBIND11_COMPILER_TYPE "_msvc" +#elif defined(__INTEL_COMPILER) +# define PYBIND11_COMPILER_TYPE "_icc" +#elif defined(__clang__) +# define PYBIND11_COMPILER_TYPE "_clang" +#elif defined(__PGI) +# define PYBIND11_COMPILER_TYPE "_pgi" +#elif defined(__MINGW32__) +# define PYBIND11_COMPILER_TYPE "_mingw" +#elif defined(__CYGWIN__) +# define PYBIND11_COMPILER_TYPE "_gcc_cygwin" +#elif defined(__GNUC__) +# define PYBIND11_COMPILER_TYPE "_gcc" +#else +# define PYBIND11_COMPILER_TYPE "_unknown" +#endif + +#if defined(_LIBCPP_VERSION) +# define PYBIND11_STDLIB "_libcpp" +#elif defined(__GLIBCXX__) || defined(__GLIBCPP__) +# define PYBIND11_STDLIB "_libstdcpp" +#else +# define PYBIND11_STDLIB "" +#endif + +/// On Linux/OSX, changes in __GXX_ABI_VERSION__ indicate ABI incompatibility. +#if defined(__GXX_ABI_VERSION) +# define PYBIND11_BUILD_ABI "_cxxabi" PYBIND11_TOSTRING(__GXX_ABI_VERSION) +#else +# define PYBIND11_BUILD_ABI "" +#endif + +#if defined(WITH_THREAD) +# define PYBIND11_INTERNALS_KIND "" +#else +# define PYBIND11_INTERNALS_KIND "_without_thread" +#endif + +#define PYBIND11_INTERNALS_ID "__pybind11_internals_v" \ + PYBIND11_TOSTRING(PYBIND11_INTERNALS_VERSION) PYBIND11_INTERNALS_KIND PYBIND11_COMPILER_TYPE PYBIND11_STDLIB PYBIND11_BUILD_ABI PYBIND11_BUILD_TYPE "__" + +#define PYBIND11_MODULE_LOCAL_ID "__pybind11_module_local_v" \ + PYBIND11_TOSTRING(PYBIND11_INTERNALS_VERSION) PYBIND11_INTERNALS_KIND PYBIND11_COMPILER_TYPE PYBIND11_STDLIB PYBIND11_BUILD_ABI PYBIND11_BUILD_TYPE "__" + +/// Each module locally stores a pointer to the `internals` data. The data +/// itself is shared among modules with the same `PYBIND11_INTERNALS_ID`. +inline internals **&get_internals_pp() { + static internals **internals_pp = nullptr; + return internals_pp; +} + +inline void translate_exception(std::exception_ptr p) { + try { + if (p) std::rethrow_exception(p); + } catch (error_already_set &e) { e.restore(); return; + } catch (const builtin_exception &e) { e.set_error(); return; + } catch (const std::bad_alloc &e) { PyErr_SetString(PyExc_MemoryError, e.what()); return; + } catch (const std::domain_error &e) { PyErr_SetString(PyExc_ValueError, e.what()); return; + } catch (const std::invalid_argument &e) { PyErr_SetString(PyExc_ValueError, e.what()); return; + } catch (const std::length_error &e) { PyErr_SetString(PyExc_ValueError, e.what()); return; + } catch (const std::out_of_range &e) { PyErr_SetString(PyExc_IndexError, e.what()); return; + } catch (const std::range_error &e) { PyErr_SetString(PyExc_ValueError, e.what()); return; + } catch (const std::overflow_error &e) { PyErr_SetString(PyExc_OverflowError, e.what()); return; + } catch (const std::exception &e) { PyErr_SetString(PyExc_RuntimeError, e.what()); return; + } catch (...) { + PyErr_SetString(PyExc_RuntimeError, "Caught an unknown exception!"); + return; + } +} + +#if !defined(__GLIBCXX__) +inline void translate_local_exception(std::exception_ptr p) { + try { + if (p) std::rethrow_exception(p); + } catch (error_already_set &e) { e.restore(); return; + } catch (const builtin_exception &e) { e.set_error(); return; + } +} +#endif + +/// Return a reference to the current `internals` data +PYBIND11_NOINLINE inline internals &get_internals() { + auto **&internals_pp = get_internals_pp(); + if (internals_pp && *internals_pp) + return **internals_pp; + + // Ensure that the GIL is held since we will need to make Python calls. + // Cannot use py::gil_scoped_acquire here since that constructor calls get_internals. 
+    struct gil_scoped_acquire_local {
+        gil_scoped_acquire_local() : state (PyGILState_Ensure()) {}
+        ~gil_scoped_acquire_local() { PyGILState_Release(state); }
+        const PyGILState_STATE state;
+    } gil;
+
+    constexpr auto *id = PYBIND11_INTERNALS_ID;
+    auto builtins = handle(PyEval_GetBuiltins());
+    if (builtins.contains(id) && isinstance<capsule>(builtins[id])) {
+        internals_pp = static_cast<internals **>(capsule(builtins[id]));
+
+        // We loaded builtins through python's builtins, which means that our `error_already_set`
+        // and `builtin_exception` may be different local classes than the ones set up in the
+        // initial exception translator, below, so add another for our local exception classes.
+        //
+        // libstdc++ doesn't require this (types there are identified only by name)
+#if !defined(__GLIBCXX__)
+        (*internals_pp)->registered_exception_translators.push_front(&translate_local_exception);
+#endif
+    } else {
+        if (!internals_pp) internals_pp = new internals*();
+        auto *&internals_ptr = *internals_pp;
+        internals_ptr = new internals();
+#if defined(WITH_THREAD)
+        PyEval_InitThreads();
+        PyThreadState *tstate = PyThreadState_Get();
+        #if PY_VERSION_HEX >= 0x03070000
+            internals_ptr->tstate = PyThread_tss_alloc();
+            if (!internals_ptr->tstate || PyThread_tss_create(internals_ptr->tstate))
+                pybind11_fail("get_internals: could not successfully initialize the TSS key!");
+            PyThread_tss_set(internals_ptr->tstate, tstate);
+        #else
+            internals_ptr->tstate = PyThread_create_key();
+            if (internals_ptr->tstate == -1)
+                pybind11_fail("get_internals: could not successfully initialize the TLS key!");
+            PyThread_set_key_value(internals_ptr->tstate, tstate);
+        #endif
+        internals_ptr->istate = tstate->interp;
+#endif
+        builtins[id] = capsule(internals_pp);
+        internals_ptr->registered_exception_translators.push_front(&translate_exception);
+        internals_ptr->static_property_type = make_static_property_type();
+        internals_ptr->default_metaclass = make_default_metaclass();
+        internals_ptr->instance_base = make_object_base_type(internals_ptr->default_metaclass);
+    }
+    return **internals_pp;
+}
+
+/// Works like `internals.registered_types_cpp`, but for module-local registered types:
+inline type_map<type_info *> &registered_local_types_cpp() {
+    static type_map<type_info *> locals{};
+    return locals;
+}
+
+/// Constructs a std::string with the given arguments, stores it in `internals`, and returns its
+/// `c_str()`. Such string objects have a long storage duration -- the internal strings are only
+/// cleared when the program exits or after interpreter shutdown (when embedding), and so are
+/// suitable for c-style strings needed by Python internals (such as PyTypeObject's tp_name).
+template <typename... Args>
+const char *c_str(Args &&...args) {
+    auto &strings = get_internals().static_strings;
+    strings.emplace_front(std::forward<Args>(args)...);
+    return strings.front().c_str();
+}
+
+NAMESPACE_END(detail)
+
+/// Returns a named pointer that is shared among all extension modules (using the same
+/// pybind11 version) running in the current interpreter. Names starting with underscores
+/// are reserved for internal usage. Returns `nullptr` if no matching entry was found.
+inline PYBIND11_NOINLINE void *get_shared_data(const std::string &name) {
+    auto &internals = detail::get_internals();
+    auto it = internals.shared_data.find(name);
+    return it != internals.shared_data.end() ? it->second : nullptr;
+}
+
+/// Set the shared data that can be later recovered by `get_shared_data()`.
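+///
+/// A hedged sketch ("my_state" is an arbitrary example key): one extension calls
+/// `py::set_shared_data("my_state", ptr)`, and any other extension built against the
+/// same pybind11 internals can read the same pointer back with
+/// `py::get_shared_data("my_state")` (which returns nullptr for unknown names).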
+inline PYBIND11_NOINLINE void *set_shared_data(const std::string &name, void *data) { + detail::get_internals().shared_data[name] = data; + return data; +} + +/// Returns a typed reference to a shared data entry (by using `get_shared_data()`) if +/// such entry exists. Otherwise, a new object of default-constructible type `T` is +/// added to the shared data under the given name and a reference to it is returned. +template +T &get_or_create_shared_data(const std::string &name) { + auto &internals = detail::get_internals(); + auto it = internals.shared_data.find(name); + T *ptr = (T *) (it != internals.shared_data.end() ? it->second : nullptr); + if (!ptr) { + ptr = new T(); + internals.shared_data[name] = ptr; + } + return *ptr; +} + +NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/cviruntime/python/include/pybind11/include/pybind11/detail/typeid.h b/cviruntime/python/include/pybind11/include/pybind11/detail/typeid.h new file mode 100644 index 000000000..9c8a4fc69 --- /dev/null +++ b/cviruntime/python/include/pybind11/include/pybind11/detail/typeid.h @@ -0,0 +1,55 @@ +/* + pybind11/detail/typeid.h: Compiler-independent access to type identifiers + + Copyright (c) 2016 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. +*/ + +#pragma once + +#include +#include + +#if defined(__GNUG__) +#include +#endif + +#include "common.h" + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +NAMESPACE_BEGIN(detail) +/// Erase all occurrences of a substring +inline void erase_all(std::string &string, const std::string &search) { + for (size_t pos = 0;;) { + pos = string.find(search, pos); + if (pos == std::string::npos) break; + string.erase(pos, search.length()); + } +} + +PYBIND11_NOINLINE inline void clean_type_id(std::string &name) { +#if defined(__GNUG__) + int status = 0; + std::unique_ptr res { + abi::__cxa_demangle(name.c_str(), nullptr, nullptr, &status), std::free }; + if (status == 0) + name = res.get(); +#else + detail::erase_all(name, "class "); + detail::erase_all(name, "struct "); + detail::erase_all(name, "enum "); +#endif + detail::erase_all(name, "pybind11::"); +} +NAMESPACE_END(detail) + +/// Return a string representation of a C++ type +template static std::string type_id() { + std::string name(typeid(T).name()); + detail::clean_type_id(name); + return name; +} + +NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/cviruntime/python/include/pybind11/include/pybind11/eigen.h b/cviruntime/python/include/pybind11/include/pybind11/eigen.h new file mode 100644 index 000000000..d963d9650 --- /dev/null +++ b/cviruntime/python/include/pybind11/include/pybind11/eigen.h @@ -0,0 +1,607 @@ +/* + pybind11/eigen.h: Transparent conversion for dense and sparse Eigen matrices + + Copyright (c) 2016 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. 
+*/ + +#pragma once + +#include "numpy.h" + +#if defined(__INTEL_COMPILER) +# pragma warning(disable: 1682) // implicit conversion of a 64-bit integral type to a smaller integral type (potential portability problem) +#elif defined(__GNUG__) || defined(__clang__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wconversion" +# pragma GCC diagnostic ignored "-Wdeprecated-declarations" +# ifdef __clang__ +// Eigen generates a bunch of implicit-copy-constructor-is-deprecated warnings with -Wdeprecated +// under Clang, so disable that warning here: +# pragma GCC diagnostic ignored "-Wdeprecated" +# endif +# if __GNUC__ >= 7 +# pragma GCC diagnostic ignored "-Wint-in-bool-context" +# endif +#endif + +#if defined(_MSC_VER) +# pragma warning(push) +# pragma warning(disable: 4127) // warning C4127: Conditional expression is constant +# pragma warning(disable: 4996) // warning C4996: std::unary_negate is deprecated in C++17 +#endif + +#include +#include + +// Eigen prior to 3.2.7 doesn't have proper move constructors--but worse, some classes get implicit +// move constructors that break things. We could detect this an explicitly copy, but an extra copy +// of matrices seems highly undesirable. +static_assert(EIGEN_VERSION_AT_LEAST(3,2,7), "Eigen support in pybind11 requires Eigen >= 3.2.7"); + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) + +// Provide a convenience alias for easier pass-by-ref usage with fully dynamic strides: +using EigenDStride = Eigen::Stride; +template using EigenDRef = Eigen::Ref; +template using EigenDMap = Eigen::Map; + +NAMESPACE_BEGIN(detail) + +#if EIGEN_VERSION_AT_LEAST(3,3,0) +using EigenIndex = Eigen::Index; +#else +using EigenIndex = EIGEN_DEFAULT_DENSE_INDEX_TYPE; +#endif + +// Matches Eigen::Map, Eigen::Ref, blocks, etc: +template using is_eigen_dense_map = all_of, std::is_base_of, T>>; +template using is_eigen_mutable_map = std::is_base_of, T>; +template using is_eigen_dense_plain = all_of>, is_template_base_of>; +template using is_eigen_sparse = is_template_base_of; +// Test for objects inheriting from EigenBase that aren't captured by the above. This +// basically covers anything that can be assigned to a dense matrix but that don't have a typical +// matrix data layout that can be copied from their .data(). For example, DiagonalMatrix and +// SelfAdjointView fall into this category. +template using is_eigen_other = all_of< + is_template_base_of, + negation, is_eigen_dense_plain, is_eigen_sparse>> +>; + +// Captures numpy/eigen conformability status (returned by EigenProps::conformable()): +template struct EigenConformable { + bool conformable = false; + EigenIndex rows = 0, cols = 0; + EigenDStride stride{0, 0}; // Only valid if negativestrides is false! + bool negativestrides = false; // If true, do not use stride! + + EigenConformable(bool fits = false) : conformable{fits} {} + // Matrix type: + EigenConformable(EigenIndex r, EigenIndex c, + EigenIndex rstride, EigenIndex cstride) : + conformable{true}, rows{r}, cols{c} { + // TODO: when Eigen bug #747 is fixed, remove the tests for non-negativity. http://eigen.tuxfamily.org/bz/show_bug.cgi?id=747 + if (rstride < 0 || cstride < 0) { + negativestrides = true; + } else { + stride = {EigenRowMajor ? rstride : cstride /* outer stride */, + EigenRowMajor ? cstride : rstride /* inner stride */ }; + } + } + // Vector type: + EigenConformable(EigenIndex r, EigenIndex c, EigenIndex stride) + : EigenConformable(r, c, r == 1 ? c*stride : stride, c == 1 ? 
r : r*stride) {} + + template bool stride_compatible() const { + // To have compatible strides, we need (on both dimensions) one of fully dynamic strides, + // matching strides, or a dimension size of 1 (in which case the stride value is irrelevant) + return + !negativestrides && + (props::inner_stride == Eigen::Dynamic || props::inner_stride == stride.inner() || + (EigenRowMajor ? cols : rows) == 1) && + (props::outer_stride == Eigen::Dynamic || props::outer_stride == stride.outer() || + (EigenRowMajor ? rows : cols) == 1); + } + operator bool() const { return conformable; } +}; + +template struct eigen_extract_stride { using type = Type; }; +template +struct eigen_extract_stride> { using type = StrideType; }; +template +struct eigen_extract_stride> { using type = StrideType; }; + +// Helper struct for extracting information from an Eigen type +template struct EigenProps { + using Type = Type_; + using Scalar = typename Type::Scalar; + using StrideType = typename eigen_extract_stride::type; + static constexpr EigenIndex + rows = Type::RowsAtCompileTime, + cols = Type::ColsAtCompileTime, + size = Type::SizeAtCompileTime; + static constexpr bool + row_major = Type::IsRowMajor, + vector = Type::IsVectorAtCompileTime, // At least one dimension has fixed size 1 + fixed_rows = rows != Eigen::Dynamic, + fixed_cols = cols != Eigen::Dynamic, + fixed = size != Eigen::Dynamic, // Fully-fixed size + dynamic = !fixed_rows && !fixed_cols; // Fully-dynamic size + + template using if_zero = std::integral_constant; + static constexpr EigenIndex inner_stride = if_zero::value, + outer_stride = if_zero::value; + static constexpr bool dynamic_stride = inner_stride == Eigen::Dynamic && outer_stride == Eigen::Dynamic; + static constexpr bool requires_row_major = !dynamic_stride && !vector && (row_major ? inner_stride : outer_stride) == 1; + static constexpr bool requires_col_major = !dynamic_stride && !vector && (row_major ? outer_stride : inner_stride) == 1; + + // Takes an input array and determines whether we can make it fit into the Eigen type. If + // the array is a vector, we attempt to fit it into either an Eigen 1xN or Nx1 vector + // (preferring the latter if it will fit in either, i.e. for a fully dynamic matrix type). + static EigenConformable conformable(const array &a) { + const auto dims = a.ndim(); + if (dims < 1 || dims > 2) + return false; + + if (dims == 2) { // Matrix type: require exact match (or dynamic) + + EigenIndex + np_rows = a.shape(0), + np_cols = a.shape(1), + np_rstride = a.strides(0) / static_cast(sizeof(Scalar)), + np_cstride = a.strides(1) / static_cast(sizeof(Scalar)); + if ((fixed_rows && np_rows != rows) || (fixed_cols && np_cols != cols)) + return false; + + return {np_rows, np_cols, np_rstride, np_cstride}; + } + + // Otherwise we're storing an n-vector. Only one of the strides will be used, but whichever + // is used, we want the (single) numpy stride value. + const EigenIndex n = a.shape(0), + stride = a.strides(0) / static_cast(sizeof(Scalar)); + + if (vector) { // Eigen type is a compile-time vector + if (fixed && size != n) + return false; // Vector size mismatch + return {rows == 1 ? 1 : n, cols == 1 ? 1 : n, stride}; + } + else if (fixed) { + // The type has a fixed size, but is not a vector: abort + return false; + } + else if (fixed_cols) { + // Since this isn't a vector, cols must be != 1. We allow this only if it exactly + // equals the number of elements (rows is Dynamic, and so 1 row is allowed). 
+ if (cols != n) return false; + return {1, n, stride}; + } + else { + // Otherwise it's either fully dynamic, or column dynamic; both become a column vector + if (fixed_rows && rows != n) return false; + return {n, 1, stride}; + } + } + + static constexpr bool show_writeable = is_eigen_dense_map::value && is_eigen_mutable_map::value; + static constexpr bool show_order = is_eigen_dense_map::value; + static constexpr bool show_c_contiguous = show_order && requires_row_major; + static constexpr bool show_f_contiguous = !show_c_contiguous && show_order && requires_col_major; + + static constexpr auto descriptor = + _("numpy.ndarray[") + npy_format_descriptor::name + + _("[") + _(_<(size_t) rows>(), _("m")) + + _(", ") + _(_<(size_t) cols>(), _("n")) + + _("]") + + // For a reference type (e.g. Ref) we have other constraints that might need to be + // satisfied: writeable=True (for a mutable reference), and, depending on the map's stride + // options, possibly f_contiguous or c_contiguous. We include them in the descriptor output + // to provide some hint as to why a TypeError is occurring (otherwise it can be confusing to + // see that a function accepts a 'numpy.ndarray[float64[3,2]]' and an error message that you + // *gave* a numpy.ndarray of the right type and dimensions. + _(", flags.writeable", "") + + _(", flags.c_contiguous", "") + + _(", flags.f_contiguous", "") + + _("]"); +}; + +// Casts an Eigen type to numpy array. If given a base, the numpy array references the src data, +// otherwise it'll make a copy. writeable lets you turn off the writeable flag for the array. +template handle eigen_array_cast(typename props::Type const &src, handle base = handle(), bool writeable = true) { + constexpr ssize_t elem_size = sizeof(typename props::Scalar); + array a; + if (props::vector) + a = array({ src.size() }, { elem_size * src.innerStride() }, src.data(), base); + else + a = array({ src.rows(), src.cols() }, { elem_size * src.rowStride(), elem_size * src.colStride() }, + src.data(), base); + + if (!writeable) + array_proxy(a.ptr())->flags &= ~detail::npy_api::NPY_ARRAY_WRITEABLE_; + + return a.release(); +} + +// Takes an lvalue ref to some Eigen type and a (python) base object, creating a numpy array that +// reference the Eigen object's data with `base` as the python-registered base class (if omitted, +// the base will be set to None, and lifetime management is up to the caller). The numpy array is +// non-writeable if the given type is const. +template +handle eigen_ref_array(Type &src, handle parent = none()) { + // none here is to get past array's should-we-copy detection, which currently always + // copies when there is no base. Setting the base to None should be harmless. + return eigen_array_cast(src, parent, !std::is_const::value); +} + +// Takes a pointer to some dense, plain Eigen type, builds a capsule around it, then returns a numpy +// array that references the encapsulated data with a python-side reference to the capsule to tie +// its destruction to that of any dependent python objects. Const-ness is determined by whether or +// not the Type of the pointer given is const. +template ::value>> +handle eigen_encapsulate(Type *src) { + capsule base(src, [](void *o) { delete static_cast(o); }); + return eigen_ref_array(*src, base); +} + +// Type caster for regular, dense matrix types (e.g. MatrixXd), but not maps/refs/etc. of dense +// types. 
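+// For illustration (a sketch; `m` is a py::module in user binding code):
+//
+//     m.def("scale", [](const Eigen::MatrixXd &x, double s) { return (s * x).eval(); });
+//
+// accepts any conformable 2-D float64 numpy array (copied into a MatrixXd by this
+// caster) and returns a fresh numpy array.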
+template +struct type_caster::value>> { + using Scalar = typename Type::Scalar; + using props = EigenProps; + + bool load(handle src, bool convert) { + // If we're in no-convert mode, only load if given an array of the correct type + if (!convert && !isinstance>(src)) + return false; + + // Coerce into an array, but don't do type conversion yet; the copy below handles it. + auto buf = array::ensure(src); + + if (!buf) + return false; + + auto dims = buf.ndim(); + if (dims < 1 || dims > 2) + return false; + + auto fits = props::conformable(buf); + if (!fits) + return false; + + // Allocate the new type, then build a numpy reference into it + value = Type(fits.rows, fits.cols); + auto ref = reinterpret_steal(eigen_ref_array(value)); + if (dims == 1) ref = ref.squeeze(); + else if (ref.ndim() == 1) buf = buf.squeeze(); + + int result = detail::npy_api::get().PyArray_CopyInto_(ref.ptr(), buf.ptr()); + + if (result < 0) { // Copy failed! + PyErr_Clear(); + return false; + } + + return true; + } + +private: + + // Cast implementation + template + static handle cast_impl(CType *src, return_value_policy policy, handle parent) { + switch (policy) { + case return_value_policy::take_ownership: + case return_value_policy::automatic: + return eigen_encapsulate(src); + case return_value_policy::move: + return eigen_encapsulate(new CType(std::move(*src))); + case return_value_policy::copy: + return eigen_array_cast(*src); + case return_value_policy::reference: + case return_value_policy::automatic_reference: + return eigen_ref_array(*src); + case return_value_policy::reference_internal: + return eigen_ref_array(*src, parent); + default: + throw cast_error("unhandled return_value_policy: should not happen!"); + }; + } + +public: + + // Normal returned non-reference, non-const value: + static handle cast(Type &&src, return_value_policy /* policy */, handle parent) { + return cast_impl(&src, return_value_policy::move, parent); + } + // If you return a non-reference const, we mark the numpy array readonly: + static handle cast(const Type &&src, return_value_policy /* policy */, handle parent) { + return cast_impl(&src, return_value_policy::move, parent); + } + // lvalue reference return; default (automatic) becomes copy + static handle cast(Type &src, return_value_policy policy, handle parent) { + if (policy == return_value_policy::automatic || policy == return_value_policy::automatic_reference) + policy = return_value_policy::copy; + return cast_impl(&src, policy, parent); + } + // const lvalue reference return; default (automatic) becomes copy + static handle cast(const Type &src, return_value_policy policy, handle parent) { + if (policy == return_value_policy::automatic || policy == return_value_policy::automatic_reference) + policy = return_value_policy::copy; + return cast(&src, policy, parent); + } + // non-const pointer return + static handle cast(Type *src, return_value_policy policy, handle parent) { + return cast_impl(src, policy, parent); + } + // const pointer return + static handle cast(const Type *src, return_value_policy policy, handle parent) { + return cast_impl(src, policy, parent); + } + + static constexpr auto name = props::descriptor; + + operator Type*() { return &value; } + operator Type&() { return value; } + operator Type&&() && { return std::move(value); } + template using cast_op_type = movable_cast_op_type; + +private: + Type value; +}; + +// Base class for casting reference/map/block/etc. objects back to python. 
+template struct eigen_map_caster { +private: + using props = EigenProps; + +public: + + // Directly referencing a ref/map's data is a bit dangerous (whatever the map/ref points to has + // to stay around), but we'll allow it under the assumption that you know what you're doing (and + // have an appropriate keep_alive in place). We return a numpy array pointing directly at the + // ref's data (The numpy array ends up read-only if the ref was to a const matrix type.) Note + // that this means you need to ensure you don't destroy the object in some other way (e.g. with + // an appropriate keep_alive, or with a reference to a statically allocated matrix). + static handle cast(const MapType &src, return_value_policy policy, handle parent) { + switch (policy) { + case return_value_policy::copy: + return eigen_array_cast(src); + case return_value_policy::reference_internal: + return eigen_array_cast(src, parent, is_eigen_mutable_map::value); + case return_value_policy::reference: + case return_value_policy::automatic: + case return_value_policy::automatic_reference: + return eigen_array_cast(src, none(), is_eigen_mutable_map::value); + default: + // move, take_ownership don't make any sense for a ref/map: + pybind11_fail("Invalid return_value_policy for Eigen Map/Ref/Block type"); + } + } + + static constexpr auto name = props::descriptor; + + // Explicitly delete these: support python -> C++ conversion on these (i.e. these can be return + // types but not bound arguments). We still provide them (with an explicitly delete) so that + // you end up here if you try anyway. + bool load(handle, bool) = delete; + operator MapType() = delete; + template using cast_op_type = MapType; +}; + +// We can return any map-like object (but can only load Refs, specialized next): +template struct type_caster::value>> + : eigen_map_caster {}; + +// Loader for Ref<...> arguments. See the documentation for info on how to make this work without +// copying (it requires some extra effort in many cases). +template +struct type_caster< + Eigen::Ref, + enable_if_t>::value> +> : public eigen_map_caster> { +private: + using Type = Eigen::Ref; + using props = EigenProps; + using Scalar = typename props::Scalar; + using MapType = Eigen::Map; + using Array = array_t; + static constexpr bool need_writeable = is_eigen_mutable_map::value; + // Delay construction (these have no default constructor) + std::unique_ptr map; + std::unique_ptr ref; + // Our array. When possible, this is just a numpy array pointing to the source data, but + // sometimes we can't avoid copying (e.g. input is not a numpy array at all, has an incompatible + // layout, or is an array of a type that needs to be converted). Using a numpy temporary + // (rather than an Eigen temporary) saves an extra copy when we need both type conversion and + // storage order conversion. (Note that we refuse to use this temporary copy when loading an + // argument for a Ref with M non-const, i.e. a read-write reference). + Array copy_or_ref; +public: + bool load(handle src, bool convert) { + // First check whether what we have is already an array of the right type. If not, we can't + // avoid a copy (because the copy is also going to do type conversion). 
+ bool need_copy = !isinstance(src); + + EigenConformable fits; + if (!need_copy) { + // We don't need a converting copy, but we also need to check whether the strides are + // compatible with the Ref's stride requirements + Array aref = reinterpret_borrow(src); + + if (aref && (!need_writeable || aref.writeable())) { + fits = props::conformable(aref); + if (!fits) return false; // Incompatible dimensions + if (!fits.template stride_compatible()) + need_copy = true; + else + copy_or_ref = std::move(aref); + } + else { + need_copy = true; + } + } + + if (need_copy) { + // We need to copy: If we need a mutable reference, or we're not supposed to convert + // (either because we're in the no-convert overload pass, or because we're explicitly + // instructed not to copy (via `py::arg().noconvert()`) we have to fail loading. + if (!convert || need_writeable) return false; + + Array copy = Array::ensure(src); + if (!copy) return false; + fits = props::conformable(copy); + if (!fits || !fits.template stride_compatible()) + return false; + copy_or_ref = std::move(copy); + loader_life_support::add_patient(copy_or_ref); + } + + ref.reset(); + map.reset(new MapType(data(copy_or_ref), fits.rows, fits.cols, make_stride(fits.stride.outer(), fits.stride.inner()))); + ref.reset(new Type(*map)); + + return true; + } + + operator Type*() { return ref.get(); } + operator Type&() { return *ref; } + template using cast_op_type = pybind11::detail::cast_op_type<_T>; + +private: + template ::value, int> = 0> + Scalar *data(Array &a) { return a.mutable_data(); } + + template ::value, int> = 0> + const Scalar *data(Array &a) { return a.data(); } + + // Attempt to figure out a constructor of `Stride` that will work. + // If both strides are fixed, use a default constructor: + template using stride_ctor_default = bool_constant< + S::InnerStrideAtCompileTime != Eigen::Dynamic && S::OuterStrideAtCompileTime != Eigen::Dynamic && + std::is_default_constructible::value>; + // Otherwise, if there is a two-index constructor, assume it is (outer,inner) like + // Eigen::Stride, and use it: + template using stride_ctor_dual = bool_constant< + !stride_ctor_default::value && std::is_constructible::value>; + // Otherwise, if there is a one-index constructor, and just one of the strides is dynamic, use + // it (passing whichever stride is dynamic). + template using stride_ctor_outer = bool_constant< + !any_of, stride_ctor_dual>::value && + S::OuterStrideAtCompileTime == Eigen::Dynamic && S::InnerStrideAtCompileTime != Eigen::Dynamic && + std::is_constructible::value>; + template using stride_ctor_inner = bool_constant< + !any_of, stride_ctor_dual>::value && + S::InnerStrideAtCompileTime == Eigen::Dynamic && S::OuterStrideAtCompileTime != Eigen::Dynamic && + std::is_constructible::value>; + + template ::value, int> = 0> + static S make_stride(EigenIndex, EigenIndex) { return S(); } + template ::value, int> = 0> + static S make_stride(EigenIndex outer, EigenIndex inner) { return S(outer, inner); } + template ::value, int> = 0> + static S make_stride(EigenIndex outer, EigenIndex) { return S(outer); } + template ::value, int> = 0> + static S make_stride(EigenIndex, EigenIndex inner) { return S(inner); } + +}; + +// type_caster for special matrix types (e.g. DiagonalMatrix), which are EigenBase, but not +// EigenDense (i.e. they don't have a data(), at least not with the usual matrix layout). +// load() is not supported, but we can cast them into the python domain by first copying to a +// regular Eigen::Matrix, then casting that. 
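+// e.g. (a sketch): a bound function returning an Eigen::DiagonalMatrix<double, 3>
+// reaches Python as an ordinary 3x3 numpy array produced from that dense copy.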
+template +struct type_caster::value>> { +protected: + using Matrix = Eigen::Matrix; + using props = EigenProps; +public: + static handle cast(const Type &src, return_value_policy /* policy */, handle /* parent */) { + handle h = eigen_encapsulate(new Matrix(src)); + return h; + } + static handle cast(const Type *src, return_value_policy policy, handle parent) { return cast(*src, policy, parent); } + + static constexpr auto name = props::descriptor; + + // Explicitly delete these: support python -> C++ conversion on these (i.e. these can be return + // types but not bound arguments). We still provide them (with an explicitly delete) so that + // you end up here if you try anyway. + bool load(handle, bool) = delete; + operator Type() = delete; + template using cast_op_type = Type; +}; + +template +struct type_caster::value>> { + typedef typename Type::Scalar Scalar; + typedef remove_reference_t().outerIndexPtr())> StorageIndex; + typedef typename Type::Index Index; + static constexpr bool rowMajor = Type::IsRowMajor; + + bool load(handle src, bool) { + if (!src) + return false; + + auto obj = reinterpret_borrow(src); + object sparse_module = module::import("scipy.sparse"); + object matrix_type = sparse_module.attr( + rowMajor ? "csr_matrix" : "csc_matrix"); + + if (!obj.get_type().is(matrix_type)) { + try { + obj = matrix_type(obj); + } catch (const error_already_set &) { + return false; + } + } + + auto values = array_t((object) obj.attr("data")); + auto innerIndices = array_t((object) obj.attr("indices")); + auto outerIndices = array_t((object) obj.attr("indptr")); + auto shape = pybind11::tuple((pybind11::object) obj.attr("shape")); + auto nnz = obj.attr("nnz").cast(); + + if (!values || !innerIndices || !outerIndices) + return false; + + value = Eigen::MappedSparseMatrix( + shape[0].cast(), shape[1].cast(), nnz, + outerIndices.mutable_data(), innerIndices.mutable_data(), values.mutable_data()); + + return true; + } + + static handle cast(const Type &src, return_value_policy /* policy */, handle /* parent */) { + const_cast(src).makeCompressed(); + + object matrix_type = module::import("scipy.sparse").attr( + rowMajor ? "csr_matrix" : "csc_matrix"); + + array data(src.nonZeros(), src.valuePtr()); + array outerIndices((rowMajor ? src.rows() : src.cols()) + 1, src.outerIndexPtr()); + array innerIndices(src.nonZeros(), src.innerIndexPtr()); + + return matrix_type( + std::make_tuple(data, innerIndices, outerIndices), + std::make_pair(src.rows(), src.cols()) + ).release(); + } + + PYBIND11_TYPE_CASTER(Type, _<(Type::IsRowMajor) != 0>("scipy.sparse.csr_matrix[", "scipy.sparse.csc_matrix[") + + npy_format_descriptor::name + _("]")); +}; + +NAMESPACE_END(detail) +NAMESPACE_END(PYBIND11_NAMESPACE) + +#if defined(__GNUG__) || defined(__clang__) +# pragma GCC diagnostic pop +#elif defined(_MSC_VER) +# pragma warning(pop) +#endif diff --git a/cviruntime/python/include/pybind11/include/pybind11/embed.h b/cviruntime/python/include/pybind11/include/pybind11/embed.h new file mode 100644 index 000000000..f814c783e --- /dev/null +++ b/cviruntime/python/include/pybind11/include/pybind11/embed.h @@ -0,0 +1,202 @@ +/* + pybind11/embed.h: Support for embedding the interpreter + + Copyright (c) 2017 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. 
+*/ + +#pragma once + +#include "pybind11.h" +#include "eval.h" + +#if defined(PYPY_VERSION) +# error Embedding the interpreter is not supported with PyPy +#endif + +#if PY_MAJOR_VERSION >= 3 +# define PYBIND11_EMBEDDED_MODULE_IMPL(name) \ + extern "C" PyObject *pybind11_init_impl_##name(); \ + extern "C" PyObject *pybind11_init_impl_##name() { \ + return pybind11_init_wrapper_##name(); \ + } +#else +# define PYBIND11_EMBEDDED_MODULE_IMPL(name) \ + extern "C" void pybind11_init_impl_##name(); \ + extern "C" void pybind11_init_impl_##name() { \ + pybind11_init_wrapper_##name(); \ + } +#endif + +/** \rst + Add a new module to the table of builtins for the interpreter. Must be + defined in global scope. The first macro parameter is the name of the + module (without quotes). The second parameter is the variable which will + be used as the interface to add functions and classes to the module. + + .. code-block:: cpp + + PYBIND11_EMBEDDED_MODULE(example, m) { + // ... initialize functions and classes here + m.def("foo", []() { + return "Hello, World!"; + }); + } + \endrst */ +#define PYBIND11_EMBEDDED_MODULE(name, variable) \ + static void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &); \ + static PyObject PYBIND11_CONCAT(*pybind11_init_wrapper_, name)() { \ + auto m = pybind11::module(PYBIND11_TOSTRING(name)); \ + try { \ + PYBIND11_CONCAT(pybind11_init_, name)(m); \ + return m.ptr(); \ + } catch (pybind11::error_already_set &e) { \ + PyErr_SetString(PyExc_ImportError, e.what()); \ + return nullptr; \ + } catch (const std::exception &e) { \ + PyErr_SetString(PyExc_ImportError, e.what()); \ + return nullptr; \ + } \ + } \ + PYBIND11_EMBEDDED_MODULE_IMPL(name) \ + pybind11::detail::embedded_module name(PYBIND11_TOSTRING(name), \ + PYBIND11_CONCAT(pybind11_init_impl_, name)); \ + void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &variable) + + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +NAMESPACE_BEGIN(detail) + +/// Python 2.7/3.x compatible version of `PyImport_AppendInittab` and error checks. +struct embedded_module { +#if PY_MAJOR_VERSION >= 3 + using init_t = PyObject *(*)(); +#else + using init_t = void (*)(); +#endif + embedded_module(const char *name, init_t init) { + if (Py_IsInitialized()) + pybind11_fail("Can't add new modules after the interpreter has been initialized"); + + auto result = PyImport_AppendInittab(name, init); + if (result == -1) + pybind11_fail("Insufficient memory to add a new module"); + } +}; + +NAMESPACE_END(detail) + +/** \rst + Initialize the Python interpreter. No other pybind11 or CPython API functions can be + called before this is done; with the exception of `PYBIND11_EMBEDDED_MODULE`. The + optional parameter can be used to skip the registration of signal handlers (see the + `Python documentation`_ for details). Calling this function again after the interpreter + has already been initialized is a fatal error. + + If initializing the Python interpreter fails, then the program is terminated. (This + is controlled by the CPython runtime and is an exception to pybind11's normal behavior + of throwing exceptions on errors.) + + .. _Python documentation: https://docs.python.org/3/c-api/init.html#c.Py_InitializeEx + \endrst */ +inline void initialize_interpreter(bool init_signal_handlers = true) { + if (Py_IsInitialized()) + pybind11_fail("The interpreter is already running"); + + Py_InitializeEx(init_signal_handlers ? 
1 : 0); + + // Make .py files in the working directory available by default + module::import("sys").attr("path").cast().append("."); +} + +/** \rst + Shut down the Python interpreter. No pybind11 or CPython API functions can be called + after this. In addition, pybind11 objects must not outlive the interpreter: + + .. code-block:: cpp + + { // BAD + py::initialize_interpreter(); + auto hello = py::str("Hello, World!"); + py::finalize_interpreter(); + } // <-- BOOM, hello's destructor is called after interpreter shutdown + + { // GOOD + py::initialize_interpreter(); + { // scoped + auto hello = py::str("Hello, World!"); + } // <-- OK, hello is cleaned up properly + py::finalize_interpreter(); + } + + { // BETTER + py::scoped_interpreter guard{}; + auto hello = py::str("Hello, World!"); + } + + .. warning:: + + The interpreter can be restarted by calling `initialize_interpreter` again. + Modules created using pybind11 can be safely re-initialized. However, Python + itself cannot completely unload binary extension modules and there are several + caveats with regard to interpreter restarting. All the details can be found + in the CPython documentation. In short, not all interpreter memory may be + freed, either due to reference cycles or user-created global data. + + \endrst */ +inline void finalize_interpreter() { + handle builtins(PyEval_GetBuiltins()); + const char *id = PYBIND11_INTERNALS_ID; + + // Get the internals pointer (without creating it if it doesn't exist). It's possible for the + // internals to be created during Py_Finalize() (e.g. if a py::capsule calls `get_internals()` + // during destruction), so we get the pointer-pointer here and check it after Py_Finalize(). + detail::internals **internals_ptr_ptr = detail::get_internals_pp(); + // It could also be stashed in builtins, so look there too: + if (builtins.contains(id) && isinstance(builtins[id])) + internals_ptr_ptr = capsule(builtins[id]); + + Py_Finalize(); + + if (internals_ptr_ptr) { + delete *internals_ptr_ptr; + *internals_ptr_ptr = nullptr; + } +} + +/** \rst + Scope guard version of `initialize_interpreter` and `finalize_interpreter`. + This a move-only guard and only a single instance can exist. + + .. code-block:: cpp + + #include + + int main() { + py::scoped_interpreter guard{}; + py::print(Hello, World!); + } // <-- interpreter shutdown + \endrst */ +class scoped_interpreter { +public: + scoped_interpreter(bool init_signal_handlers = true) { + initialize_interpreter(init_signal_handlers); + } + + scoped_interpreter(const scoped_interpreter &) = delete; + scoped_interpreter(scoped_interpreter &&other) noexcept { other.is_valid = false; } + scoped_interpreter &operator=(const scoped_interpreter &) = delete; + scoped_interpreter &operator=(scoped_interpreter &&) = delete; + + ~scoped_interpreter() { + if (is_valid) + finalize_interpreter(); + } + +private: + bool is_valid = true; +}; + +NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/cviruntime/python/include/pybind11/include/pybind11/eval.h b/cviruntime/python/include/pybind11/include/pybind11/eval.h new file mode 100644 index 000000000..ea85ba1db --- /dev/null +++ b/cviruntime/python/include/pybind11/include/pybind11/eval.h @@ -0,0 +1,117 @@ +/* + pybind11/exec.h: Support for evaluating Python expressions and statements + from strings and files + + Copyright (c) 2016 Klemens Morgenstern and + Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. 
+*/ + +#pragma once + +#include "pybind11.h" + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) + +enum eval_mode { + /// Evaluate a string containing an isolated expression + eval_expr, + + /// Evaluate a string containing a single statement. Returns \c none + eval_single_statement, + + /// Evaluate a string containing a sequence of statement. Returns \c none + eval_statements +}; + +template +object eval(str expr, object global = globals(), object local = object()) { + if (!local) + local = global; + + /* PyRun_String does not accept a PyObject / encoding specifier, + this seems to be the only alternative */ + std::string buffer = "# -*- coding: utf-8 -*-\n" + (std::string) expr; + + int start; + switch (mode) { + case eval_expr: start = Py_eval_input; break; + case eval_single_statement: start = Py_single_input; break; + case eval_statements: start = Py_file_input; break; + default: pybind11_fail("invalid evaluation mode"); + } + + PyObject *result = PyRun_String(buffer.c_str(), start, global.ptr(), local.ptr()); + if (!result) + throw error_already_set(); + return reinterpret_steal(result); +} + +template +object eval(const char (&s)[N], object global = globals(), object local = object()) { + /* Support raw string literals by removing common leading whitespace */ + auto expr = (s[0] == '\n') ? str(module::import("textwrap").attr("dedent")(s)) + : str(s); + return eval(expr, global, local); +} + +inline void exec(str expr, object global = globals(), object local = object()) { + eval(expr, global, local); +} + +template +void exec(const char (&s)[N], object global = globals(), object local = object()) { + eval(s, global, local); +} + +template +object eval_file(str fname, object global = globals(), object local = object()) { + if (!local) + local = global; + + int start; + switch (mode) { + case eval_expr: start = Py_eval_input; break; + case eval_single_statement: start = Py_single_input; break; + case eval_statements: start = Py_file_input; break; + default: pybind11_fail("invalid evaluation mode"); + } + + int closeFile = 1; + std::string fname_str = (std::string) fname; +#if PY_VERSION_HEX >= 0x03040000 + FILE *f = _Py_fopen_obj(fname.ptr(), "r"); +#elif PY_VERSION_HEX >= 0x03000000 + FILE *f = _Py_fopen(fname.ptr(), "r"); +#else + /* No unicode support in open() :( */ + auto fobj = reinterpret_steal(PyFile_FromString( + const_cast(fname_str.c_str()), + const_cast("r"))); + FILE *f = nullptr; + if (fobj) + f = PyFile_AsFile(fobj.ptr()); + closeFile = 0; +#endif + if (!f) { + PyErr_Clear(); + pybind11_fail("File \"" + fname_str + "\" could not be opened!"); + } + +#if PY_VERSION_HEX < 0x03000000 && defined(PYPY_VERSION) + PyObject *result = PyRun_File(f, fname_str.c_str(), start, global.ptr(), + local.ptr()); + (void) closeFile; +#else + PyObject *result = PyRun_FileEx(f, fname_str.c_str(), start, global.ptr(), + local.ptr(), closeFile); +#endif + + if (!result) + throw error_already_set(); + return reinterpret_steal(result); +} + +NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/cviruntime/python/include/pybind11/include/pybind11/functional.h b/cviruntime/python/include/pybind11/include/pybind11/functional.h new file mode 100644 index 000000000..f8bda6483 --- /dev/null +++ b/cviruntime/python/include/pybind11/include/pybind11/functional.h @@ -0,0 +1,101 @@ +/* + pybind11/functional.h: std::function<> support + + Copyright (c) 2016 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. 
+*/ + +#pragma once + +#include "pybind11.h" +#include + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +NAMESPACE_BEGIN(detail) + +template +struct type_caster> { + using type = std::function; + using retval_type = conditional_t::value, void_type, Return>; + using function_type = Return (*) (Args...); + +public: + bool load(handle src, bool convert) { + if (src.is_none()) { + // Defer accepting None to other overloads (if we aren't in convert mode): + if (!convert) return false; + return true; + } + + if (!isinstance(src)) + return false; + + auto func = reinterpret_borrow(src); + + /* + When passing a C++ function as an argument to another C++ + function via Python, every function call would normally involve + a full C++ -> Python -> C++ roundtrip, which can be prohibitive. + Here, we try to at least detect the case where the function is + stateless (i.e. function pointer or lambda function without + captured variables), in which case the roundtrip can be avoided. + */ + if (auto cfunc = func.cpp_function()) { + auto c = reinterpret_borrow(PyCFunction_GET_SELF(cfunc.ptr())); + auto rec = (function_record *) c; + + if (rec && rec->is_stateless && + same_type(typeid(function_type), *reinterpret_cast(rec->data[1]))) { + struct capture { function_type f; }; + value = ((capture *) &rec->data)->f; + return true; + } + } + + // ensure GIL is held during functor destruction + struct func_handle { + function f; + func_handle(function&& f_) : f(std::move(f_)) {} + func_handle(const func_handle&) = default; + ~func_handle() { + gil_scoped_acquire acq; + function kill_f(std::move(f)); + } + }; + + // to emulate 'move initialization capture' in C++11 + struct func_wrapper { + func_handle hfunc; + func_wrapper(func_handle&& hf): hfunc(std::move(hf)) {} + Return operator()(Args... args) const { + gil_scoped_acquire acq; + object retval(hfunc.f(std::forward(args)...)); + /* Visual studio 2015 parser issue: need parentheses around this expression */ + return (retval.template cast()); + } + }; + + value = func_wrapper(func_handle(std::move(func))); + return true; + } + + template + static handle cast(Func &&f_, return_value_policy policy, handle /* parent */) { + if (!f_) + return none().inc_ref(); + + auto result = f_.template target(); + if (result) + return cpp_function(*result, policy).release(); + else + return cpp_function(std::forward(f_), policy).release(); + } + + PYBIND11_TYPE_CASTER(type, _("Callable[[") + concat(make_caster::name...) + _("], ") + + make_caster::name + _("]")); +}; + +NAMESPACE_END(detail) +NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/cviruntime/python/include/pybind11/include/pybind11/iostream.h b/cviruntime/python/include/pybind11/include/pybind11/iostream.h new file mode 100644 index 000000000..c43b7c93a --- /dev/null +++ b/cviruntime/python/include/pybind11/include/pybind11/iostream.h @@ -0,0 +1,209 @@ +/* + pybind11/iostream.h -- Tools to assist with redirecting cout and cerr to Python + + Copyright (c) 2017 Henry F. Schreiner + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. 
diff --git a/cviruntime/python/include/pybind11/include/pybind11/iostream.h b/cviruntime/python/include/pybind11/include/pybind11/iostream.h
new file mode 100644
index 000000000..c43b7c93a
--- /dev/null
+++ b/cviruntime/python/include/pybind11/include/pybind11/iostream.h
@@ -0,0 +1,209 @@
+/*
+    pybind11/iostream.h -- Tools to assist with redirecting cout and cerr to Python
+
+    Copyright (c) 2017 Henry F. Schreiner
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "pybind11.h"
+
+#include <streambuf>
+#include <ostream>
+#include <string>
+#include <memory>
+#include <iostream>
+
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+NAMESPACE_BEGIN(detail)
+
+// Buffer that writes to Python instead of C++
+class pythonbuf : public std::streambuf {
+private:
+    using traits_type = std::streambuf::traits_type;
+
+    const size_t buf_size;
+    std::unique_ptr<char[]> d_buffer;
+    object pywrite;
+    object pyflush;
+
+    int overflow(int c) {
+        if (!traits_type::eq_int_type(c, traits_type::eof())) {
+            *pptr() = traits_type::to_char_type(c);
+            pbump(1);
+        }
+        return sync() == 0 ? traits_type::not_eof(c) : traits_type::eof();
+    }
+
+    int sync() {
+        if (pbase() != pptr()) {
+            // This subtraction cannot be negative, so dropping the sign
+            str line(pbase(), static_cast<size_t>(pptr() - pbase()));
+
+            {
+                gil_scoped_acquire tmp;
+                pywrite(line);
+                pyflush();
+            }
+
+            setp(pbase(), epptr());
+        }
+        return 0;
+    }
+
+public:
+
+    pythonbuf(object pyostream, size_t buffer_size = 1024)
+        : buf_size(buffer_size),
+          d_buffer(new char[buf_size]),
+          pywrite(pyostream.attr("write")),
+          pyflush(pyostream.attr("flush")) {
+        setp(d_buffer.get(), d_buffer.get() + buf_size - 1);
+    }
+
+    pythonbuf(pythonbuf&&) = default;
+
+    /// Sync before destroy
+    ~pythonbuf() {
+        sync();
+    }
+};
+
+NAMESPACE_END(detail)
+
+
+/** \rst
+    This is a move-only guard that redirects output.
+
+    .. code-block:: cpp
+
+        #include <pybind11/iostream.h>
+
+        ...
+
+        {
+            py::scoped_ostream_redirect output;
+            std::cout << "Hello, World!"; // Python stdout
+        } // <-- return std::cout to normal
+
+    You can explicitly pass the c++ stream and the python object,
+    for example to guard stderr instead.
+
+    .. code-block:: cpp
+
+        {
+            py::scoped_ostream_redirect output{std::cerr, py::module::import("sys").attr("stderr")};
+            std::cerr << "Hello, World!";
+        }
+ \endrst */
+class scoped_ostream_redirect {
+protected:
+    std::streambuf *old;
+    std::ostream &costream;
+    detail::pythonbuf buffer;
+
+public:
+    scoped_ostream_redirect(
+            std::ostream &costream = std::cout,
+            object pyostream = module::import("sys").attr("stdout"))
+        : costream(costream), buffer(pyostream) {
+        old = costream.rdbuf(&buffer);
+    }
+
+    ~scoped_ostream_redirect() {
+        costream.rdbuf(old);
+    }
+
+    scoped_ostream_redirect(const scoped_ostream_redirect &) = delete;
+    scoped_ostream_redirect(scoped_ostream_redirect &&other) = default;
+    scoped_ostream_redirect &operator=(const scoped_ostream_redirect &) = delete;
+    scoped_ostream_redirect &operator=(scoped_ostream_redirect &&) = delete;
+};
+
+
+/** \rst
+    Like `scoped_ostream_redirect`, but redirects cerr by default. This class
+    is provided primarily to make ``py::call_guard`` easier to make.
+
+    .. code-block:: cpp
+
+     m.def("noisy_func", &noisy_func,
+           py::call_guard<scoped_ostream_redirect,
+                          scoped_estream_redirect>());
+
+\endrst */
+class scoped_estream_redirect : public scoped_ostream_redirect {
+public:
+    scoped_estream_redirect(
+            std::ostream &costream = std::cerr,
+            object pyostream = module::import("sys").attr("stderr"))
+        : scoped_ostream_redirect(costream, pyostream) {}
+};
+
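+// Example (editor's sketch, not upstream code): the guards above compose with
+// py::call_guard, so a binding can capture a function's C++ output per call:
+//
+//     void noisy() { std::cout << "captured by sys.stdout\n"; }
+//     m.def("noisy", &noisy, py::call_guard<scoped_ostream_redirect>());
+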
+NAMESPACE_BEGIN(detail)
+
+// Class to redirect output as a context manager. C++ backend.
+class OstreamRedirect {
+    bool do_stdout_;
+    bool do_stderr_;
+    std::unique_ptr<scoped_ostream_redirect> redirect_stdout;
+    std::unique_ptr<scoped_estream_redirect> redirect_stderr;
+
+public:
+    OstreamRedirect(bool do_stdout = true, bool do_stderr = true)
+        : do_stdout_(do_stdout), do_stderr_(do_stderr) {}
+
+    void enter() {
+        if (do_stdout_)
+            redirect_stdout.reset(new scoped_ostream_redirect());
+        if (do_stderr_)
+            redirect_stderr.reset(new scoped_estream_redirect());
+    }
+
+    void exit() {
+        redirect_stdout.reset();
+        redirect_stderr.reset();
+    }
+};
+
+NAMESPACE_END(detail)
+
+/** \rst
+    This is a helper function to add a C++ redirect context manager to Python
+    instead of using a C++ guard. To use it, add the following to your binding code:
+
+    .. code-block:: cpp
+
+        #include <pybind11/iostream.h>
+
+        ...
+
+        py::add_ostream_redirect(m, "ostream_redirect");
+
+    You now have a Python context manager that redirects your output:
+
+    .. code-block:: python
+
+        with m.ostream_redirect():
+            m.print_to_cout_function()
+
+    This manager can optionally be told which streams to operate on:
+
+    .. code-block:: python
+
+        with m.ostream_redirect(stdout=True, stderr=True):
+            m.noisy_function_with_error_printing()
+
+ \endrst */
+inline class_<detail::OstreamRedirect> add_ostream_redirect(module m, std::string name = "ostream_redirect") {
+    return class_<detail::OstreamRedirect>(m, name.c_str(), module_local())
+        .def(init<bool, bool>(), arg("stdout")=true, arg("stderr")=true)
+        .def("__enter__", &detail::OstreamRedirect::enter)
+        .def("__exit__", [](detail::OstreamRedirect &self_, args) { self_.exit(); });
+}
+
+NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/cviruntime/python/include/pybind11/include/pybind11/numpy.h b/cviruntime/python/include/pybind11/include/pybind11/numpy.h
new file mode 100644
index 000000000..ba41a223d
--- /dev/null
+++ b/cviruntime/python/include/pybind11/include/pybind11/numpy.h
@@ -0,0 +1,1642 @@
+/*
+    pybind11/numpy.h: Basic NumPy support, vectorize() wrapper
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "pybind11.h"
+#include "complex.h"
+#include <numeric>
+#include <algorithm>
+#include <array>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <initializer_list>
+#include <functional>
+#include <utility>
+#include <vector>
+#include <string>
+#include <typeindex>
+
+#if defined(_MSC_VER)
+#  pragma warning(push)
+#  pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
+#endif
+
+/* This will be true on all flat address space platforms and allows us to reduce the
+   whole npy_intp / ssize_t / Py_intptr_t business down to just ssize_t for all size
+   and dimension types (e.g. shape, strides, indexing), instead of inflicting this
+   upon the library user.
*/ +static_assert(sizeof(ssize_t) == sizeof(Py_intptr_t), "ssize_t != Py_intptr_t"); + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) + +class array; // Forward declaration + +NAMESPACE_BEGIN(detail) +template struct npy_format_descriptor; + +struct PyArrayDescr_Proxy { + PyObject_HEAD + PyObject *typeobj; + char kind; + char type; + char byteorder; + char flags; + int type_num; + int elsize; + int alignment; + char *subarray; + PyObject *fields; + PyObject *names; +}; + +struct PyArray_Proxy { + PyObject_HEAD + char *data; + int nd; + ssize_t *dimensions; + ssize_t *strides; + PyObject *base; + PyObject *descr; + int flags; +}; + +struct PyVoidScalarObject_Proxy { + PyObject_VAR_HEAD + char *obval; + PyArrayDescr_Proxy *descr; + int flags; + PyObject *base; +}; + +struct numpy_type_info { + PyObject* dtype_ptr; + std::string format_str; +}; + +struct numpy_internals { + std::unordered_map registered_dtypes; + + numpy_type_info *get_type_info(const std::type_info& tinfo, bool throw_if_missing = true) { + auto it = registered_dtypes.find(std::type_index(tinfo)); + if (it != registered_dtypes.end()) + return &(it->second); + if (throw_if_missing) + pybind11_fail(std::string("NumPy type info missing for ") + tinfo.name()); + return nullptr; + } + + template numpy_type_info *get_type_info(bool throw_if_missing = true) { + return get_type_info(typeid(typename std::remove_cv::type), throw_if_missing); + } +}; + +inline PYBIND11_NOINLINE void load_numpy_internals(numpy_internals* &ptr) { + ptr = &get_or_create_shared_data("_numpy_internals"); +} + +inline numpy_internals& get_numpy_internals() { + static numpy_internals* ptr = nullptr; + if (!ptr) + load_numpy_internals(ptr); + return *ptr; +} + +template struct same_size { + template using as = bool_constant; +}; + +template constexpr int platform_lookup() { return -1; } + +// Lookup a type according to its size, and return a value corresponding to the NumPy typenum. +template +constexpr int platform_lookup(int I, Ints... Is) { + return sizeof(Concrete) == sizeof(T) ? I : platform_lookup(Is...); +} + +struct npy_api { + enum constants { + NPY_ARRAY_C_CONTIGUOUS_ = 0x0001, + NPY_ARRAY_F_CONTIGUOUS_ = 0x0002, + NPY_ARRAY_OWNDATA_ = 0x0004, + NPY_ARRAY_FORCECAST_ = 0x0010, + NPY_ARRAY_ENSUREARRAY_ = 0x0040, + NPY_ARRAY_ALIGNED_ = 0x0100, + NPY_ARRAY_WRITEABLE_ = 0x0400, + NPY_BOOL_ = 0, + NPY_BYTE_, NPY_UBYTE_, + NPY_SHORT_, NPY_USHORT_, + NPY_INT_, NPY_UINT_, + NPY_LONG_, NPY_ULONG_, + NPY_LONGLONG_, NPY_ULONGLONG_, + NPY_FLOAT_, NPY_DOUBLE_, NPY_LONGDOUBLE_, + NPY_CFLOAT_, NPY_CDOUBLE_, NPY_CLONGDOUBLE_, + NPY_OBJECT_ = 17, + NPY_STRING_, NPY_UNICODE_, NPY_VOID_, + // Platform-dependent normalization + NPY_INT8_ = NPY_BYTE_, + NPY_UINT8_ = NPY_UBYTE_, + NPY_INT16_ = NPY_SHORT_, + NPY_UINT16_ = NPY_USHORT_, + // `npy_common.h` defines the integer aliases. In order, it checks: + // NPY_BITSOF_LONG, NPY_BITSOF_LONGLONG, NPY_BITSOF_INT, NPY_BITSOF_SHORT, NPY_BITSOF_CHAR + // and assigns the alias to the first matching size, so we should check in this order. 
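+        // For example, on LP64 platforms (e.g. x86-64 Linux, 8-byte long) the
+        // lookups below resolve NPY_INT64_ to NPY_LONG_, while on LLP64
+        // Windows (4-byte long) NPY_INT64_ resolves to NPY_LONGLONG_.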
+ NPY_INT32_ = platform_lookup( + NPY_LONG_, NPY_INT_, NPY_SHORT_), + NPY_UINT32_ = platform_lookup( + NPY_ULONG_, NPY_UINT_, NPY_USHORT_), + NPY_INT64_ = platform_lookup( + NPY_LONG_, NPY_LONGLONG_, NPY_INT_), + NPY_UINT64_ = platform_lookup( + NPY_ULONG_, NPY_ULONGLONG_, NPY_UINT_), + }; + + typedef struct { + Py_intptr_t *ptr; + int len; + } PyArray_Dims; + + static npy_api& get() { + static npy_api api = lookup(); + return api; + } + + bool PyArray_Check_(PyObject *obj) const { + return (bool) PyObject_TypeCheck(obj, PyArray_Type_); + } + bool PyArrayDescr_Check_(PyObject *obj) const { + return (bool) PyObject_TypeCheck(obj, PyArrayDescr_Type_); + } + + unsigned int (*PyArray_GetNDArrayCFeatureVersion_)(); + PyObject *(*PyArray_DescrFromType_)(int); + PyObject *(*PyArray_NewFromDescr_) + (PyTypeObject *, PyObject *, int, Py_intptr_t *, + Py_intptr_t *, void *, int, PyObject *); + PyObject *(*PyArray_DescrNewFromType_)(int); + int (*PyArray_CopyInto_)(PyObject *, PyObject *); + PyObject *(*PyArray_NewCopy_)(PyObject *, int); + PyTypeObject *PyArray_Type_; + PyTypeObject *PyVoidArrType_Type_; + PyTypeObject *PyArrayDescr_Type_; + PyObject *(*PyArray_DescrFromScalar_)(PyObject *); + PyObject *(*PyArray_FromAny_) (PyObject *, PyObject *, int, int, int, PyObject *); + int (*PyArray_DescrConverter_) (PyObject *, PyObject **); + bool (*PyArray_EquivTypes_) (PyObject *, PyObject *); + int (*PyArray_GetArrayParamsFromObject_)(PyObject *, PyObject *, char, PyObject **, int *, + Py_ssize_t *, PyObject **, PyObject *); + PyObject *(*PyArray_Squeeze_)(PyObject *); + int (*PyArray_SetBaseObject_)(PyObject *, PyObject *); + PyObject* (*PyArray_Resize_)(PyObject*, PyArray_Dims*, int, int); +private: + enum functions { + API_PyArray_GetNDArrayCFeatureVersion = 211, + API_PyArray_Type = 2, + API_PyArrayDescr_Type = 3, + API_PyVoidArrType_Type = 39, + API_PyArray_DescrFromType = 45, + API_PyArray_DescrFromScalar = 57, + API_PyArray_FromAny = 69, + API_PyArray_Resize = 80, + API_PyArray_CopyInto = 82, + API_PyArray_NewCopy = 85, + API_PyArray_NewFromDescr = 94, + API_PyArray_DescrNewFromType = 9, + API_PyArray_DescrConverter = 174, + API_PyArray_EquivTypes = 182, + API_PyArray_GetArrayParamsFromObject = 278, + API_PyArray_Squeeze = 136, + API_PyArray_SetBaseObject = 282 + }; + + static npy_api lookup() { + module m = module::import("numpy.core.multiarray"); + auto c = m.attr("_ARRAY_API"); +#if PY_MAJOR_VERSION >= 3 + void **api_ptr = (void **) PyCapsule_GetPointer(c.ptr(), NULL); +#else + void **api_ptr = (void **) PyCObject_AsVoidPtr(c.ptr()); +#endif + npy_api api; +#define DECL_NPY_API(Func) api.Func##_ = (decltype(api.Func##_)) api_ptr[API_##Func]; + DECL_NPY_API(PyArray_GetNDArrayCFeatureVersion); + if (api.PyArray_GetNDArrayCFeatureVersion_() < 0x7) + pybind11_fail("pybind11 numpy support requires numpy >= 1.7.0"); + DECL_NPY_API(PyArray_Type); + DECL_NPY_API(PyVoidArrType_Type); + DECL_NPY_API(PyArrayDescr_Type); + DECL_NPY_API(PyArray_DescrFromType); + DECL_NPY_API(PyArray_DescrFromScalar); + DECL_NPY_API(PyArray_FromAny); + DECL_NPY_API(PyArray_Resize); + DECL_NPY_API(PyArray_CopyInto); + DECL_NPY_API(PyArray_NewCopy); + DECL_NPY_API(PyArray_NewFromDescr); + DECL_NPY_API(PyArray_DescrNewFromType); + DECL_NPY_API(PyArray_DescrConverter); + DECL_NPY_API(PyArray_EquivTypes); + DECL_NPY_API(PyArray_GetArrayParamsFromObject); + DECL_NPY_API(PyArray_Squeeze); + DECL_NPY_API(PyArray_SetBaseObject); +#undef DECL_NPY_API + return api; + } +}; + +inline PyArray_Proxy* array_proxy(void* ptr) { + return 
reinterpret_cast(ptr); +} + +inline const PyArray_Proxy* array_proxy(const void* ptr) { + return reinterpret_cast(ptr); +} + +inline PyArrayDescr_Proxy* array_descriptor_proxy(PyObject* ptr) { + return reinterpret_cast(ptr); +} + +inline const PyArrayDescr_Proxy* array_descriptor_proxy(const PyObject* ptr) { + return reinterpret_cast(ptr); +} + +inline bool check_flags(const void* ptr, int flag) { + return (flag == (array_proxy(ptr)->flags & flag)); +} + +template struct is_std_array : std::false_type { }; +template struct is_std_array> : std::true_type { }; +template struct is_complex : std::false_type { }; +template struct is_complex> : std::true_type { }; + +template struct array_info_scalar { + typedef T type; + static constexpr bool is_array = false; + static constexpr bool is_empty = false; + static constexpr auto extents = _(""); + static void append_extents(list& /* shape */) { } +}; +// Computes underlying type and a comma-separated list of extents for array +// types (any mix of std::array and built-in arrays). An array of char is +// treated as scalar because it gets special handling. +template struct array_info : array_info_scalar { }; +template struct array_info> { + using type = typename array_info::type; + static constexpr bool is_array = true; + static constexpr bool is_empty = (N == 0) || array_info::is_empty; + static constexpr size_t extent = N; + + // appends the extents to shape + static void append_extents(list& shape) { + shape.append(N); + array_info::append_extents(shape); + } + + static constexpr auto extents = _::is_array>( + concat(_(), array_info::extents), _() + ); +}; +// For numpy we have special handling for arrays of characters, so we don't include +// the size in the array extents. +template struct array_info : array_info_scalar { }; +template struct array_info> : array_info_scalar> { }; +template struct array_info : array_info> { }; +template using remove_all_extents_t = typename array_info::type; + +template using is_pod_struct = all_of< + std::is_standard_layout, // since we're accessing directly in memory we need a standard layout type +#if !defined(__GNUG__) || defined(_LIBCPP_VERSION) || defined(_GLIBCXX_USE_CXX11_ABI) + // _GLIBCXX_USE_CXX11_ABI indicates that we're using libstdc++ from GCC 5 or newer, independent + // of the actual compiler (Clang can also use libstdc++, but it always defines __GNUC__ == 4). + std::is_trivially_copyable, +#else + // GCC 4 doesn't implement is_trivially_copyable, so approximate it + std::is_trivially_destructible, + satisfies_any_of, +#endif + satisfies_none_of +>; + +template ssize_t byte_offset_unsafe(const Strides &) { return 0; } +template +ssize_t byte_offset_unsafe(const Strides &strides, ssize_t i, Ix... index) { + return i * strides[Dim] + byte_offset_unsafe(strides, index...); +} + +/** + * Proxy class providing unsafe, unchecked const access to array data. This is constructed through + * the `unchecked()` method of `array` or the `unchecked()` method of `array_t`. `Dims` + * will be -1 for dimensions determined at runtime. + */ +template +class unchecked_reference { +protected: + static constexpr bool Dynamic = Dims < 0; + const unsigned char *data_; + // Storing the shape & strides in local variables (i.e. 
these arrays) allows the compiler to + // make large performance gains on big, nested loops, but requires compile-time dimensions + conditional_t> + shape_, strides_; + const ssize_t dims_; + + friend class pybind11::array; + // Constructor for compile-time dimensions: + template + unchecked_reference(const void *data, const ssize_t *shape, const ssize_t *strides, enable_if_t) + : data_{reinterpret_cast(data)}, dims_{Dims} { + for (size_t i = 0; i < (size_t) dims_; i++) { + shape_[i] = shape[i]; + strides_[i] = strides[i]; + } + } + // Constructor for runtime dimensions: + template + unchecked_reference(const void *data, const ssize_t *shape, const ssize_t *strides, enable_if_t dims) + : data_{reinterpret_cast(data)}, shape_{shape}, strides_{strides}, dims_{dims} {} + +public: + /** + * Unchecked const reference access to data at the given indices. For a compile-time known + * number of dimensions, this requires the correct number of arguments; for run-time + * dimensionality, this is not checked (and so is up to the caller to use safely). + */ + template const T &operator()(Ix... index) const { + static_assert(ssize_t{sizeof...(Ix)} == Dims || Dynamic, + "Invalid number of indices for unchecked array reference"); + return *reinterpret_cast(data_ + byte_offset_unsafe(strides_, ssize_t(index)...)); + } + /** + * Unchecked const reference access to data; this operator only participates if the reference + * is to a 1-dimensional array. When present, this is exactly equivalent to `obj(index)`. + */ + template > + const T &operator[](ssize_t index) const { return operator()(index); } + + /// Pointer access to the data at the given indices. + template const T *data(Ix... ix) const { return &operator()(ssize_t(ix)...); } + + /// Returns the item size, i.e. sizeof(T) + constexpr static ssize_t itemsize() { return sizeof(T); } + + /// Returns the shape (i.e. size) of dimension `dim` + ssize_t shape(ssize_t dim) const { return shape_[(size_t) dim]; } + + /// Returns the number of dimensions of the array + ssize_t ndim() const { return dims_; } + + /// Returns the total number of elements in the referenced array, i.e. the product of the shapes + template + enable_if_t size() const { + return std::accumulate(shape_.begin(), shape_.end(), (ssize_t) 1, std::multiplies()); + } + template + enable_if_t size() const { + return std::accumulate(shape_, shape_ + ndim(), (ssize_t) 1, std::multiplies()); + } + + /// Returns the total number of bytes used by the referenced data. Note that the actual span in + /// memory may be larger if the referenced array has non-contiguous strides (e.g. for a slice). + ssize_t nbytes() const { + return size() * itemsize(); + } +}; + +template +class unchecked_mutable_reference : public unchecked_reference { + friend class pybind11::array; + using ConstBase = unchecked_reference; + using ConstBase::ConstBase; + using ConstBase::Dynamic; +public: + /// Mutable, unchecked access to data at the given indices. + template T& operator()(Ix... index) { + static_assert(ssize_t{sizeof...(Ix)} == Dims || Dynamic, + "Invalid number of indices for unchecked array reference"); + return const_cast(ConstBase::operator()(index...)); + } + /** + * Mutable, unchecked access data at the given index; this operator only participates if the + * reference is to a 1-dimensional array (or has runtime dimensions). When present, this is + * exactly equivalent to `obj(index)`. 
+ */ + template > + T &operator[](ssize_t index) { return operator()(index); } + + /// Mutable pointer access to the data at the given indices. + template T *mutable_data(Ix... ix) { return &operator()(ssize_t(ix)...); } +}; + +template +struct type_caster> { + static_assert(Dim == 0 && Dim > 0 /* always fail */, "unchecked array proxy object is not castable"); +}; +template +struct type_caster> : type_caster> {}; + +NAMESPACE_END(detail) + +class dtype : public object { +public: + PYBIND11_OBJECT_DEFAULT(dtype, object, detail::npy_api::get().PyArrayDescr_Check_); + + explicit dtype(const buffer_info &info) { + dtype descr(_dtype_from_pep3118()(PYBIND11_STR_TYPE(info.format))); + // If info.itemsize == 0, use the value calculated from the format string + m_ptr = descr.strip_padding(info.itemsize ? info.itemsize : descr.itemsize()).release().ptr(); + } + + explicit dtype(const std::string &format) { + m_ptr = from_args(pybind11::str(format)).release().ptr(); + } + + dtype(const char *format) : dtype(std::string(format)) { } + + dtype(list names, list formats, list offsets, ssize_t itemsize) { + dict args; + args["names"] = names; + args["formats"] = formats; + args["offsets"] = offsets; + args["itemsize"] = pybind11::int_(itemsize); + m_ptr = from_args(args).release().ptr(); + } + + /// This is essentially the same as calling numpy.dtype(args) in Python. + static dtype from_args(object args) { + PyObject *ptr = nullptr; + if (!detail::npy_api::get().PyArray_DescrConverter_(args.ptr(), &ptr) || !ptr) + throw error_already_set(); + return reinterpret_steal(ptr); + } + + /// Return dtype associated with a C++ type. + template static dtype of() { + return detail::npy_format_descriptor::type>::dtype(); + } + + /// Size of the data type in bytes. + ssize_t itemsize() const { + return detail::array_descriptor_proxy(m_ptr)->elsize; + } + + /// Returns true for structured data types. + bool has_fields() const { + return detail::array_descriptor_proxy(m_ptr)->names != nullptr; + } + + /// Single-character type code. + char kind() const { + return detail::array_descriptor_proxy(m_ptr)->kind; + } + +private: + static object _dtype_from_pep3118() { + static PyObject *obj = module::import("numpy.core._internal") + .attr("_dtype_from_pep3118").cast().release().ptr(); + return reinterpret_borrow(obj); + } + + dtype strip_padding(ssize_t itemsize) { + // Recursively strip all void fields with empty names that are generated for + // padding fields (as of NumPy v1.11). 
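+        // For example, a C struct with interior padding is described by the
+        // PEP 3118 parser using unnamed void ('V') fields that cover the
+        // padding bytes; those are filtered out below and the named fields
+        // are re-emitted in offset order.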
+ if (!has_fields()) + return *this; + + struct field_descr { PYBIND11_STR_TYPE name; object format; pybind11::int_ offset; }; + std::vector field_descriptors; + + for (auto field : attr("fields").attr("items")()) { + auto spec = field.cast(); + auto name = spec[0].cast(); + auto format = spec[1].cast()[0].cast(); + auto offset = spec[1].cast()[1].cast(); + if (!len(name) && format.kind() == 'V') + continue; + field_descriptors.push_back({(PYBIND11_STR_TYPE) name, format.strip_padding(format.itemsize()), offset}); + } + + std::sort(field_descriptors.begin(), field_descriptors.end(), + [](const field_descr& a, const field_descr& b) { + return a.offset.cast() < b.offset.cast(); + }); + + list names, formats, offsets; + for (auto& descr : field_descriptors) { + names.append(descr.name); + formats.append(descr.format); + offsets.append(descr.offset); + } + return dtype(names, formats, offsets, itemsize); + } +}; + +class array : public buffer { +public: + PYBIND11_OBJECT_CVT(array, buffer, detail::npy_api::get().PyArray_Check_, raw_array) + + enum { + c_style = detail::npy_api::NPY_ARRAY_C_CONTIGUOUS_, + f_style = detail::npy_api::NPY_ARRAY_F_CONTIGUOUS_, + forcecast = detail::npy_api::NPY_ARRAY_FORCECAST_ + }; + + array() : array({{0}}, static_cast(nullptr)) {} + + using ShapeContainer = detail::any_container; + using StridesContainer = detail::any_container; + + // Constructs an array taking shape/strides from arbitrary container types + array(const pybind11::dtype &dt, ShapeContainer shape, StridesContainer strides, + const void *ptr = nullptr, handle base = handle()) { + + if (strides->empty()) + *strides = c_strides(*shape, dt.itemsize()); + + auto ndim = shape->size(); + if (ndim != strides->size()) + pybind11_fail("NumPy: shape ndim doesn't match strides ndim"); + auto descr = dt; + + int flags = 0; + if (base && ptr) { + if (isinstance(base)) + /* Copy flags from base (except ownership bit) */ + flags = reinterpret_borrow(base).flags() & ~detail::npy_api::NPY_ARRAY_OWNDATA_; + else + /* Writable by default, easy to downgrade later on if needed */ + flags = detail::npy_api::NPY_ARRAY_WRITEABLE_; + } + + auto &api = detail::npy_api::get(); + auto tmp = reinterpret_steal(api.PyArray_NewFromDescr_( + api.PyArray_Type_, descr.release().ptr(), (int) ndim, shape->data(), strides->data(), + const_cast(ptr), flags, nullptr)); + if (!tmp) + throw error_already_set(); + if (ptr) { + if (base) { + api.PyArray_SetBaseObject_(tmp.ptr(), base.inc_ref().ptr()); + } else { + tmp = reinterpret_steal(api.PyArray_NewCopy_(tmp.ptr(), -1 /* any order */)); + } + } + m_ptr = tmp.release().ptr(); + } + + array(const pybind11::dtype &dt, ShapeContainer shape, const void *ptr = nullptr, handle base = handle()) + : array(dt, std::move(shape), {}, ptr, base) { } + + template ::value && !std::is_same::value>> + array(const pybind11::dtype &dt, T count, const void *ptr = nullptr, handle base = handle()) + : array(dt, {{count}}, ptr, base) { } + + template + array(ShapeContainer shape, StridesContainer strides, const T *ptr, handle base = handle()) + : array(pybind11::dtype::of(), std::move(shape), std::move(strides), ptr, base) { } + + template + array(ShapeContainer shape, const T *ptr, handle base = handle()) + : array(std::move(shape), {}, ptr, base) { } + + template + explicit array(ssize_t count, const T *ptr, handle base = handle()) : array({count}, {}, ptr, base) { } + + explicit array(const buffer_info &info) + : array(pybind11::dtype(info), info.shape, info.strides, info.ptr) { } + + /// Array descriptor 
(dtype) + pybind11::dtype dtype() const { + return reinterpret_borrow(detail::array_proxy(m_ptr)->descr); + } + + /// Total number of elements + ssize_t size() const { + return std::accumulate(shape(), shape() + ndim(), (ssize_t) 1, std::multiplies()); + } + + /// Byte size of a single element + ssize_t itemsize() const { + return detail::array_descriptor_proxy(detail::array_proxy(m_ptr)->descr)->elsize; + } + + /// Total number of bytes + ssize_t nbytes() const { + return size() * itemsize(); + } + + /// Number of dimensions + ssize_t ndim() const { + return detail::array_proxy(m_ptr)->nd; + } + + /// Base object + object base() const { + return reinterpret_borrow(detail::array_proxy(m_ptr)->base); + } + + /// Dimensions of the array + const ssize_t* shape() const { + return detail::array_proxy(m_ptr)->dimensions; + } + + /// Dimension along a given axis + ssize_t shape(ssize_t dim) const { + if (dim >= ndim()) + fail_dim_check(dim, "invalid axis"); + return shape()[dim]; + } + + /// Strides of the array + const ssize_t* strides() const { + return detail::array_proxy(m_ptr)->strides; + } + + /// Stride along a given axis + ssize_t strides(ssize_t dim) const { + if (dim >= ndim()) + fail_dim_check(dim, "invalid axis"); + return strides()[dim]; + } + + /// Return the NumPy array flags + int flags() const { + return detail::array_proxy(m_ptr)->flags; + } + + /// If set, the array is writeable (otherwise the buffer is read-only) + bool writeable() const { + return detail::check_flags(m_ptr, detail::npy_api::NPY_ARRAY_WRITEABLE_); + } + + /// If set, the array owns the data (will be freed when the array is deleted) + bool owndata() const { + return detail::check_flags(m_ptr, detail::npy_api::NPY_ARRAY_OWNDATA_); + } + + /// Pointer to the contained data. If index is not provided, points to the + /// beginning of the buffer. May throw if the index would lead to out of bounds access. + template const void* data(Ix... index) const { + return static_cast(detail::array_proxy(m_ptr)->data + offset_at(index...)); + } + + /// Mutable pointer to the contained data. If index is not provided, points to the + /// beginning of the buffer. May throw if the index would lead to out of bounds access. + /// May throw if the array is not writeable. + template void* mutable_data(Ix... index) { + check_writeable(); + return static_cast(detail::array_proxy(m_ptr)->data + offset_at(index...)); + } + + /// Byte offset from beginning of the array to a given index (full or partial). + /// May throw if the index would lead to out of bounds access. + template ssize_t offset_at(Ix... index) const { + if ((ssize_t) sizeof...(index) > ndim()) + fail_dim_check(sizeof...(index), "too many indices for an array"); + return byte_offset(ssize_t(index)...); + } + + ssize_t offset_at() const { return 0; } + + /// Item count from beginning of the array to a given index (full or partial). + /// May throw if the index would lead to out of bounds access. + template ssize_t index_at(Ix... index) const { + return offset_at(index...) / itemsize(); + } + + /** + * Returns a proxy object that provides access to the array's data without bounds or + * dimensionality checking. Will throw if the array is missing the `writeable` flag. Use with + * care: the array must not be destroyed or reshaped for the duration of the returned object, + * and the caller must take care not to access invalid dimensions or dimension indices. 
+ */ + template detail::unchecked_mutable_reference mutable_unchecked() & { + if (Dims >= 0 && ndim() != Dims) + throw std::domain_error("array has incorrect number of dimensions: " + std::to_string(ndim()) + + "; expected " + std::to_string(Dims)); + return detail::unchecked_mutable_reference(mutable_data(), shape(), strides(), ndim()); + } + + /** + * Returns a proxy object that provides const access to the array's data without bounds or + * dimensionality checking. Unlike `mutable_unchecked()`, this does not require that the + * underlying array have the `writable` flag. Use with care: the array must not be destroyed or + * reshaped for the duration of the returned object, and the caller must take care not to access + * invalid dimensions or dimension indices. + */ + template detail::unchecked_reference unchecked() const & { + if (Dims >= 0 && ndim() != Dims) + throw std::domain_error("array has incorrect number of dimensions: " + std::to_string(ndim()) + + "; expected " + std::to_string(Dims)); + return detail::unchecked_reference(data(), shape(), strides(), ndim()); + } + + /// Return a new view with all of the dimensions of length 1 removed + array squeeze() { + auto& api = detail::npy_api::get(); + return reinterpret_steal(api.PyArray_Squeeze_(m_ptr)); + } + + /// Resize array to given shape + /// If refcheck is true and more that one reference exist to this array + /// then resize will succeed only if it makes a reshape, i.e. original size doesn't change + void resize(ShapeContainer new_shape, bool refcheck = true) { + detail::npy_api::PyArray_Dims d = { + new_shape->data(), int(new_shape->size()) + }; + // try to resize, set ordering param to -1 cause it's not used anyway + object new_array = reinterpret_steal( + detail::npy_api::get().PyArray_Resize_(m_ptr, &d, int(refcheck), -1) + ); + if (!new_array) throw error_already_set(); + if (isinstance(new_array)) { *this = std::move(new_array); } + } + + /// Ensure that the argument is a NumPy array + /// In case of an error, nullptr is returned and the Python error is cleared. + static array ensure(handle h, int ExtraFlags = 0) { + auto result = reinterpret_steal(raw_array(h.ptr(), ExtraFlags)); + if (!result) + PyErr_Clear(); + return result; + } + +protected: + template friend struct detail::npy_format_descriptor; + + void fail_dim_check(ssize_t dim, const std::string& msg) const { + throw index_error(msg + ": " + std::to_string(dim) + + " (ndim = " + std::to_string(ndim()) + ")"); + } + + template ssize_t byte_offset(Ix... index) const { + check_dimensions(index...); + return detail::byte_offset_unsafe(strides(), ssize_t(index)...); + } + + void check_writeable() const { + if (!writeable()) + throw std::domain_error("array is not writeable"); + } + + // Default, C-style strides + static std::vector c_strides(const std::vector &shape, ssize_t itemsize) { + auto ndim = shape.size(); + std::vector strides(ndim, itemsize); + if (ndim > 0) + for (size_t i = ndim - 1; i > 0; --i) + strides[i - 1] = strides[i] * shape[i]; + return strides; + } + + // F-style strides; default when constructing an array_t with `ExtraFlags & f_style` + static std::vector f_strides(const std::vector &shape, ssize_t itemsize) { + auto ndim = shape.size(); + std::vector strides(ndim, itemsize); + for (size_t i = 1; i < ndim; ++i) + strides[i] = strides[i - 1] * shape[i - 1]; + return strides; + } + + template void check_dimensions(Ix... 
index) const { + check_dimensions_impl(ssize_t(0), shape(), ssize_t(index)...); + } + + void check_dimensions_impl(ssize_t, const ssize_t*) const { } + + template void check_dimensions_impl(ssize_t axis, const ssize_t* shape, ssize_t i, Ix... index) const { + if (i >= *shape) { + throw index_error(std::string("index ") + std::to_string(i) + + " is out of bounds for axis " + std::to_string(axis) + + " with size " + std::to_string(*shape)); + } + check_dimensions_impl(axis + 1, shape + 1, index...); + } + + /// Create array from any object -- always returns a new reference + static PyObject *raw_array(PyObject *ptr, int ExtraFlags = 0) { + if (ptr == nullptr) { + PyErr_SetString(PyExc_ValueError, "cannot create a pybind11::array from a nullptr"); + return nullptr; + } + return detail::npy_api::get().PyArray_FromAny_( + ptr, nullptr, 0, 0, detail::npy_api::NPY_ARRAY_ENSUREARRAY_ | ExtraFlags, nullptr); + } +}; + +template class array_t : public array { +private: + struct private_ctor {}; + // Delegating constructor needed when both moving and accessing in the same constructor + array_t(private_ctor, ShapeContainer &&shape, StridesContainer &&strides, const T *ptr, handle base) + : array(std::move(shape), std::move(strides), ptr, base) {} +public: + static_assert(!detail::array_info::is_array, "Array types cannot be used with array_t"); + + using value_type = T; + + array_t() : array(0, static_cast(nullptr)) {} + array_t(handle h, borrowed_t) : array(h, borrowed_t{}) { } + array_t(handle h, stolen_t) : array(h, stolen_t{}) { } + + PYBIND11_DEPRECATED("Use array_t::ensure() instead") + array_t(handle h, bool is_borrowed) : array(raw_array_t(h.ptr()), stolen_t{}) { + if (!m_ptr) PyErr_Clear(); + if (!is_borrowed) Py_XDECREF(h.ptr()); + } + + array_t(const object &o) : array(raw_array_t(o.ptr()), stolen_t{}) { + if (!m_ptr) throw error_already_set(); + } + + explicit array_t(const buffer_info& info) : array(info) { } + + array_t(ShapeContainer shape, StridesContainer strides, const T *ptr = nullptr, handle base = handle()) + : array(std::move(shape), std::move(strides), ptr, base) { } + + explicit array_t(ShapeContainer shape, const T *ptr = nullptr, handle base = handle()) + : array_t(private_ctor{}, std::move(shape), + ExtraFlags & f_style ? f_strides(*shape, itemsize()) : c_strides(*shape, itemsize()), + ptr, base) { } + + explicit array_t(size_t count, const T *ptr = nullptr, handle base = handle()) + : array({count}, {}, ptr, base) { } + + constexpr ssize_t itemsize() const { + return sizeof(T); + } + + template ssize_t index_at(Ix... index) const { + return offset_at(index...) / itemsize(); + } + + template const T* data(Ix... index) const { + return static_cast(array::data(index...)); + } + + template T* mutable_data(Ix... index) { + return static_cast(array::mutable_data(index...)); + } + + // Reference to element at a given index + template const T& at(Ix... index) const { + if ((ssize_t) sizeof...(index) != ndim()) + fail_dim_check(sizeof...(index), "index dimension mismatch"); + return *(static_cast(array::data()) + byte_offset(ssize_t(index)...) / itemsize()); + } + + // Mutable reference to element at a given index + template T& mutable_at(Ix... index) { + if ((ssize_t) sizeof...(index) != ndim()) + fail_dim_check(sizeof...(index), "index dimension mismatch"); + return *(static_cast(array::mutable_data()) + byte_offset(ssize_t(index)...) / itemsize()); + } + + /** + * Returns a proxy object that provides access to the array's data without bounds or + * dimensionality checking. 
Will throw if the array is missing the `writeable` flag. Use with + * care: the array must not be destroyed or reshaped for the duration of the returned object, + * and the caller must take care not to access invalid dimensions or dimension indices. + */ + template detail::unchecked_mutable_reference mutable_unchecked() & { + return array::mutable_unchecked(); + } + + /** + * Returns a proxy object that provides const access to the array's data without bounds or + * dimensionality checking. Unlike `unchecked()`, this does not require that the underlying + * array have the `writable` flag. Use with care: the array must not be destroyed or reshaped + * for the duration of the returned object, and the caller must take care not to access invalid + * dimensions or dimension indices. + */ + template detail::unchecked_reference unchecked() const & { + return array::unchecked(); + } + + /// Ensure that the argument is a NumPy array of the correct dtype (and if not, try to convert + /// it). In case of an error, nullptr is returned and the Python error is cleared. + static array_t ensure(handle h) { + auto result = reinterpret_steal(raw_array_t(h.ptr())); + if (!result) + PyErr_Clear(); + return result; + } + + static bool check_(handle h) { + const auto &api = detail::npy_api::get(); + return api.PyArray_Check_(h.ptr()) + && api.PyArray_EquivTypes_(detail::array_proxy(h.ptr())->descr, dtype::of().ptr()); + } + +protected: + /// Create array from any object -- always returns a new reference + static PyObject *raw_array_t(PyObject *ptr) { + if (ptr == nullptr) { + PyErr_SetString(PyExc_ValueError, "cannot create a pybind11::array_t from a nullptr"); + return nullptr; + } + return detail::npy_api::get().PyArray_FromAny_( + ptr, dtype::of().release().ptr(), 0, 0, + detail::npy_api::NPY_ARRAY_ENSUREARRAY_ | ExtraFlags, nullptr); + } +}; + +template +struct format_descriptor::value>> { + static std::string format() { + return detail::npy_format_descriptor::type>::format(); + } +}; + +template struct format_descriptor { + static std::string format() { return std::to_string(N) + "s"; } +}; +template struct format_descriptor> { + static std::string format() { return std::to_string(N) + "s"; } +}; + +template +struct format_descriptor::value>> { + static std::string format() { + return format_descriptor< + typename std::remove_cv::type>::type>::format(); + } +}; + +template +struct format_descriptor::is_array>> { + static std::string format() { + using namespace detail; + static constexpr auto extents = _("(") + array_info::extents + _(")"); + return extents.text + format_descriptor>::format(); + } +}; + +NAMESPACE_BEGIN(detail) +template +struct pyobject_caster> { + using type = array_t; + + bool load(handle src, bool convert) { + if (!convert && !type::check_(src)) + return false; + value = type::ensure(src); + return static_cast(value); + } + + static handle cast(const handle &src, return_value_policy /* policy */, handle /* parent */) { + return src.inc_ref(); + } + PYBIND11_TYPE_CASTER(type, handle_type_name::name); +}; + +template +struct compare_buffer_info::value>> { + static bool compare(const buffer_info& b) { + return npy_api::get().PyArray_EquivTypes_(dtype::of().ptr(), dtype(b).ptr()); + } +}; + +template +struct npy_format_descriptor_name; + +template +struct npy_format_descriptor_name::value>> { + static constexpr auto name = _::value>( + _("bool"), _::value>("int", "uint") + _() + ); +}; + +template +struct npy_format_descriptor_name::value>> { + static constexpr auto name = _::value || 
std::is_same::value>( + _("float") + _(), _("longdouble") + ); +}; + +template +struct npy_format_descriptor_name::value>> { + static constexpr auto name = _::value + || std::is_same::value>( + _("complex") + _(), _("longcomplex") + ); +}; + +template +struct npy_format_descriptor::value>> + : npy_format_descriptor_name { +private: + // NB: the order here must match the one in common.h + constexpr static const int values[15] = { + npy_api::NPY_BOOL_, + npy_api::NPY_BYTE_, npy_api::NPY_UBYTE_, npy_api::NPY_INT16_, npy_api::NPY_UINT16_, + npy_api::NPY_INT32_, npy_api::NPY_UINT32_, npy_api::NPY_INT64_, npy_api::NPY_UINT64_, + npy_api::NPY_FLOAT_, npy_api::NPY_DOUBLE_, npy_api::NPY_LONGDOUBLE_, + npy_api::NPY_CFLOAT_, npy_api::NPY_CDOUBLE_, npy_api::NPY_CLONGDOUBLE_ + }; + +public: + static constexpr int value = values[detail::is_fmt_numeric::index]; + + static pybind11::dtype dtype() { + if (auto ptr = npy_api::get().PyArray_DescrFromType_(value)) + return reinterpret_steal(ptr); + pybind11_fail("Unsupported buffer format!"); + } +}; + +#define PYBIND11_DECL_CHAR_FMT \ + static constexpr auto name = _("S") + _(); \ + static pybind11::dtype dtype() { return pybind11::dtype(std::string("S") + std::to_string(N)); } +template struct npy_format_descriptor { PYBIND11_DECL_CHAR_FMT }; +template struct npy_format_descriptor> { PYBIND11_DECL_CHAR_FMT }; +#undef PYBIND11_DECL_CHAR_FMT + +template struct npy_format_descriptor::is_array>> { +private: + using base_descr = npy_format_descriptor::type>; +public: + static_assert(!array_info::is_empty, "Zero-sized arrays are not supported"); + + static constexpr auto name = _("(") + array_info::extents + _(")") + base_descr::name; + static pybind11::dtype dtype() { + list shape; + array_info::append_extents(shape); + return pybind11::dtype::from_args(pybind11::make_tuple(base_descr::dtype(), shape)); + } +}; + +template struct npy_format_descriptor::value>> { +private: + using base_descr = npy_format_descriptor::type>; +public: + static constexpr auto name = base_descr::name; + static pybind11::dtype dtype() { return base_descr::dtype(); } +}; + +struct field_descriptor { + const char *name; + ssize_t offset; + ssize_t size; + std::string format; + dtype descr; +}; + +inline PYBIND11_NOINLINE void register_structured_dtype( + any_container fields, + const std::type_info& tinfo, ssize_t itemsize, + bool (*direct_converter)(PyObject *, void *&)) { + + auto& numpy_internals = get_numpy_internals(); + if (numpy_internals.get_type_info(tinfo, false)) + pybind11_fail("NumPy: dtype is already registered"); + + // Use ordered fields because order matters as of NumPy 1.14: + // https://docs.scipy.org/doc/numpy/release.html#multiple-field-indexing-assignment-of-structured-arrays + std::vector ordered_fields(std::move(fields)); + std::sort(ordered_fields.begin(), ordered_fields.end(), + [](const field_descriptor &a, const field_descriptor &b) { return a.offset < b.offset; }); + + list names, formats, offsets; + for (auto& field : ordered_fields) { + if (!field.descr) + pybind11_fail(std::string("NumPy: unsupported field dtype: `") + + field.name + "` @ " + tinfo.name()); + names.append(PYBIND11_STR_TYPE(field.name)); + formats.append(field.descr); + offsets.append(pybind11::int_(field.offset)); + } + auto dtype_ptr = pybind11::dtype(names, formats, offsets, itemsize).release().ptr(); + + // There is an existing bug in NumPy (as of v1.11): trailing bytes are + // not encoded explicitly into the format string. 
This will supposedly + // get fixed in v1.12; for further details, see these: + // - https://github.com/numpy/numpy/issues/7797 + // - https://github.com/numpy/numpy/pull/7798 + // Because of this, we won't use numpy's logic to generate buffer format + // strings and will just do it ourselves. + ssize_t offset = 0; + std::ostringstream oss; + // mark the structure as unaligned with '^', because numpy and C++ don't + // always agree about alignment (particularly for complex), and we're + // explicitly listing all our padding. This depends on none of the fields + // overriding the endianness. Putting the ^ in front of individual fields + // isn't guaranteed to work due to https://github.com/numpy/numpy/issues/9049 + oss << "^T{"; + for (auto& field : ordered_fields) { + if (field.offset > offset) + oss << (field.offset - offset) << 'x'; + oss << field.format << ':' << field.name << ':'; + offset = field.offset + field.size; + } + if (itemsize > offset) + oss << (itemsize - offset) << 'x'; + oss << '}'; + auto format_str = oss.str(); + + // Sanity check: verify that NumPy properly parses our buffer format string + auto& api = npy_api::get(); + auto arr = array(buffer_info(nullptr, itemsize, format_str, 1)); + if (!api.PyArray_EquivTypes_(dtype_ptr, arr.dtype().ptr())) + pybind11_fail("NumPy: invalid buffer descriptor!"); + + auto tindex = std::type_index(tinfo); + numpy_internals.registered_dtypes[tindex] = { dtype_ptr, format_str }; + get_internals().direct_conversions[tindex].push_back(direct_converter); +} + +template struct npy_format_descriptor { + static_assert(is_pod_struct::value, "Attempt to use a non-POD or unimplemented POD type as a numpy dtype"); + + static constexpr auto name = make_caster::name; + + static pybind11::dtype dtype() { + return reinterpret_borrow(dtype_ptr()); + } + + static std::string format() { + static auto format_str = get_numpy_internals().get_type_info(true)->format_str; + return format_str; + } + + static void register_dtype(any_container fields) { + register_structured_dtype(std::move(fields), typeid(typename std::remove_cv::type), + sizeof(T), &direct_converter); + } + +private: + static PyObject* dtype_ptr() { + static PyObject* ptr = get_numpy_internals().get_type_info(true)->dtype_ptr; + return ptr; + } + + static bool direct_converter(PyObject *obj, void*& value) { + auto& api = npy_api::get(); + if (!PyObject_TypeCheck(obj, api.PyVoidArrType_Type_)) + return false; + if (auto descr = reinterpret_steal(api.PyArray_DescrFromScalar_(obj))) { + if (api.PyArray_EquivTypes_(dtype_ptr(), descr.ptr())) { + value = ((PyVoidScalarObject_Proxy *) obj)->obval; + return true; + } + } + return false; + } +}; + +#ifdef __CLION_IDE__ // replace heavy macro with dummy code for the IDE (doesn't affect code) +# define PYBIND11_NUMPY_DTYPE(Type, ...) ((void)0) +# define PYBIND11_NUMPY_DTYPE_EX(Type, ...) ((void)0) +#else + +#define PYBIND11_FIELD_DESCRIPTOR_EX(T, Field, Name) \ + ::pybind11::detail::field_descriptor { \ + Name, offsetof(T, Field), sizeof(decltype(std::declval().Field)), \ + ::pybind11::format_descriptor().Field)>::format(), \ + ::pybind11::detail::npy_format_descriptor().Field)>::dtype() \ + } + +// Extract name, offset and format descriptor for a struct field +#define PYBIND11_FIELD_DESCRIPTOR(T, Field) PYBIND11_FIELD_DESCRIPTOR_EX(T, Field, #Field) + +// The main idea of this macro is borrowed from https://github.com/swansontec/map-macro +// (C) William Swanson, Paul Fultz +#define PYBIND11_EVAL0(...) __VA_ARGS__ +#define PYBIND11_EVAL1(...) 
PYBIND11_EVAL0 (PYBIND11_EVAL0 (PYBIND11_EVAL0 (__VA_ARGS__))) +#define PYBIND11_EVAL2(...) PYBIND11_EVAL1 (PYBIND11_EVAL1 (PYBIND11_EVAL1 (__VA_ARGS__))) +#define PYBIND11_EVAL3(...) PYBIND11_EVAL2 (PYBIND11_EVAL2 (PYBIND11_EVAL2 (__VA_ARGS__))) +#define PYBIND11_EVAL4(...) PYBIND11_EVAL3 (PYBIND11_EVAL3 (PYBIND11_EVAL3 (__VA_ARGS__))) +#define PYBIND11_EVAL(...) PYBIND11_EVAL4 (PYBIND11_EVAL4 (PYBIND11_EVAL4 (__VA_ARGS__))) +#define PYBIND11_MAP_END(...) +#define PYBIND11_MAP_OUT +#define PYBIND11_MAP_COMMA , +#define PYBIND11_MAP_GET_END() 0, PYBIND11_MAP_END +#define PYBIND11_MAP_NEXT0(test, next, ...) next PYBIND11_MAP_OUT +#define PYBIND11_MAP_NEXT1(test, next) PYBIND11_MAP_NEXT0 (test, next, 0) +#define PYBIND11_MAP_NEXT(test, next) PYBIND11_MAP_NEXT1 (PYBIND11_MAP_GET_END test, next) +#ifdef _MSC_VER // MSVC is not as eager to expand macros, hence this workaround +#define PYBIND11_MAP_LIST_NEXT1(test, next) \ + PYBIND11_EVAL0 (PYBIND11_MAP_NEXT0 (test, PYBIND11_MAP_COMMA next, 0)) +#else +#define PYBIND11_MAP_LIST_NEXT1(test, next) \ + PYBIND11_MAP_NEXT0 (test, PYBIND11_MAP_COMMA next, 0) +#endif +#define PYBIND11_MAP_LIST_NEXT(test, next) \ + PYBIND11_MAP_LIST_NEXT1 (PYBIND11_MAP_GET_END test, next) +#define PYBIND11_MAP_LIST0(f, t, x, peek, ...) \ + f(t, x) PYBIND11_MAP_LIST_NEXT (peek, PYBIND11_MAP_LIST1) (f, t, peek, __VA_ARGS__) +#define PYBIND11_MAP_LIST1(f, t, x, peek, ...) \ + f(t, x) PYBIND11_MAP_LIST_NEXT (peek, PYBIND11_MAP_LIST0) (f, t, peek, __VA_ARGS__) +// PYBIND11_MAP_LIST(f, t, a1, a2, ...) expands to f(t, a1), f(t, a2), ... +#define PYBIND11_MAP_LIST(f, t, ...) \ + PYBIND11_EVAL (PYBIND11_MAP_LIST1 (f, t, __VA_ARGS__, (), 0)) + +#define PYBIND11_NUMPY_DTYPE(Type, ...) \ + ::pybind11::detail::npy_format_descriptor::register_dtype \ + (::std::vector<::pybind11::detail::field_descriptor> \ + {PYBIND11_MAP_LIST (PYBIND11_FIELD_DESCRIPTOR, Type, __VA_ARGS__)}) + +#ifdef _MSC_VER +#define PYBIND11_MAP2_LIST_NEXT1(test, next) \ + PYBIND11_EVAL0 (PYBIND11_MAP_NEXT0 (test, PYBIND11_MAP_COMMA next, 0)) +#else +#define PYBIND11_MAP2_LIST_NEXT1(test, next) \ + PYBIND11_MAP_NEXT0 (test, PYBIND11_MAP_COMMA next, 0) +#endif +#define PYBIND11_MAP2_LIST_NEXT(test, next) \ + PYBIND11_MAP2_LIST_NEXT1 (PYBIND11_MAP_GET_END test, next) +#define PYBIND11_MAP2_LIST0(f, t, x1, x2, peek, ...) \ + f(t, x1, x2) PYBIND11_MAP2_LIST_NEXT (peek, PYBIND11_MAP2_LIST1) (f, t, peek, __VA_ARGS__) +#define PYBIND11_MAP2_LIST1(f, t, x1, x2, peek, ...) \ + f(t, x1, x2) PYBIND11_MAP2_LIST_NEXT (peek, PYBIND11_MAP2_LIST0) (f, t, peek, __VA_ARGS__) +// PYBIND11_MAP2_LIST(f, t, a1, a2, ...) expands to f(t, a1, a2), f(t, a3, a4), ... +#define PYBIND11_MAP2_LIST(f, t, ...) \ + PYBIND11_EVAL (PYBIND11_MAP2_LIST1 (f, t, __VA_ARGS__, (), 0)) + +#define PYBIND11_NUMPY_DTYPE_EX(Type, ...) 
\ + ::pybind11::detail::npy_format_descriptor::register_dtype \ + (::std::vector<::pybind11::detail::field_descriptor> \ + {PYBIND11_MAP2_LIST (PYBIND11_FIELD_DESCRIPTOR_EX, Type, __VA_ARGS__)}) + +#endif // __CLION_IDE__ + +template +using array_iterator = typename std::add_pointer::type; + +template +array_iterator array_begin(const buffer_info& buffer) { + return array_iterator(reinterpret_cast(buffer.ptr)); +} + +template +array_iterator array_end(const buffer_info& buffer) { + return array_iterator(reinterpret_cast(buffer.ptr) + buffer.size); +} + +class common_iterator { +public: + using container_type = std::vector; + using value_type = container_type::value_type; + using size_type = container_type::size_type; + + common_iterator() : p_ptr(0), m_strides() {} + + common_iterator(void* ptr, const container_type& strides, const container_type& shape) + : p_ptr(reinterpret_cast(ptr)), m_strides(strides.size()) { + m_strides.back() = static_cast(strides.back()); + for (size_type i = m_strides.size() - 1; i != 0; --i) { + size_type j = i - 1; + value_type s = static_cast(shape[i]); + m_strides[j] = strides[j] + m_strides[i] - strides[i] * s; + } + } + + void increment(size_type dim) { + p_ptr += m_strides[dim]; + } + + void* data() const { + return p_ptr; + } + +private: + char* p_ptr; + container_type m_strides; +}; + +template class multi_array_iterator { +public: + using container_type = std::vector; + + multi_array_iterator(const std::array &buffers, + const container_type &shape) + : m_shape(shape.size()), m_index(shape.size(), 0), + m_common_iterator() { + + // Manual copy to avoid conversion warning if using std::copy + for (size_t i = 0; i < shape.size(); ++i) + m_shape[i] = shape[i]; + + container_type strides(shape.size()); + for (size_t i = 0; i < N; ++i) + init_common_iterator(buffers[i], shape, m_common_iterator[i], strides); + } + + multi_array_iterator& operator++() { + for (size_t j = m_index.size(); j != 0; --j) { + size_t i = j - 1; + if (++m_index[i] != m_shape[i]) { + increment_common_iterator(i); + break; + } else { + m_index[i] = 0; + } + } + return *this; + } + + template T* data() const { + return reinterpret_cast(m_common_iterator[K].data()); + } + +private: + + using common_iter = common_iterator; + + void init_common_iterator(const buffer_info &buffer, + const container_type &shape, + common_iter &iterator, + container_type &strides) { + auto buffer_shape_iter = buffer.shape.rbegin(); + auto buffer_strides_iter = buffer.strides.rbegin(); + auto shape_iter = shape.rbegin(); + auto strides_iter = strides.rbegin(); + + while (buffer_shape_iter != buffer.shape.rend()) { + if (*shape_iter == *buffer_shape_iter) + *strides_iter = *buffer_strides_iter; + else + *strides_iter = 0; + + ++buffer_shape_iter; + ++buffer_strides_iter; + ++shape_iter; + ++strides_iter; + } + + std::fill(strides_iter, strides.rend(), 0); + iterator = common_iter(buffer.ptr, strides, shape); + } + + void increment_common_iterator(size_t dim) { + for (auto &iter : m_common_iterator) + iter.increment(dim); + } + + container_type m_shape; + container_type m_index; + std::array m_common_iterator; +}; + +enum class broadcast_trivial { non_trivial, c_trivial, f_trivial }; + +// Populates the shape and number of dimensions for the set of buffers. 
Returns a broadcast_trivial +// enum value indicating whether the broadcast is "trivial"--that is, has each buffer being either a +// singleton or a full-size, C-contiguous (`c_trivial`) or Fortran-contiguous (`f_trivial`) storage +// buffer; returns `non_trivial` otherwise. +template +broadcast_trivial broadcast(const std::array &buffers, ssize_t &ndim, std::vector &shape) { + ndim = std::accumulate(buffers.begin(), buffers.end(), ssize_t(0), [](ssize_t res, const buffer_info &buf) { + return std::max(res, buf.ndim); + }); + + shape.clear(); + shape.resize((size_t) ndim, 1); + + // Figure out the output size, and make sure all input arrays conform (i.e. are either size 1 or + // the full size). + for (size_t i = 0; i < N; ++i) { + auto res_iter = shape.rbegin(); + auto end = buffers[i].shape.rend(); + for (auto shape_iter = buffers[i].shape.rbegin(); shape_iter != end; ++shape_iter, ++res_iter) { + const auto &dim_size_in = *shape_iter; + auto &dim_size_out = *res_iter; + + // Each input dimension can either be 1 or `n`, but `n` values must match across buffers + if (dim_size_out == 1) + dim_size_out = dim_size_in; + else if (dim_size_in != 1 && dim_size_in != dim_size_out) + pybind11_fail("pybind11::vectorize: incompatible size/dimension of inputs!"); + } + } + + bool trivial_broadcast_c = true; + bool trivial_broadcast_f = true; + for (size_t i = 0; i < N && (trivial_broadcast_c || trivial_broadcast_f); ++i) { + if (buffers[i].size == 1) + continue; + + // Require the same number of dimensions: + if (buffers[i].ndim != ndim) + return broadcast_trivial::non_trivial; + + // Require all dimensions be full-size: + if (!std::equal(buffers[i].shape.cbegin(), buffers[i].shape.cend(), shape.cbegin())) + return broadcast_trivial::non_trivial; + + // Check for C contiguity (but only if previous inputs were also C contiguous) + if (trivial_broadcast_c) { + ssize_t expect_stride = buffers[i].itemsize; + auto end = buffers[i].shape.crend(); + for (auto shape_iter = buffers[i].shape.crbegin(), stride_iter = buffers[i].strides.crbegin(); + trivial_broadcast_c && shape_iter != end; ++shape_iter, ++stride_iter) { + if (expect_stride == *stride_iter) + expect_stride *= *shape_iter; + else + trivial_broadcast_c = false; + } + } + + // Check for Fortran contiguity (if previous inputs were also F contiguous) + if (trivial_broadcast_f) { + ssize_t expect_stride = buffers[i].itemsize; + auto end = buffers[i].shape.cend(); + for (auto shape_iter = buffers[i].shape.cbegin(), stride_iter = buffers[i].strides.cbegin(); + trivial_broadcast_f && shape_iter != end; ++shape_iter, ++stride_iter) { + if (expect_stride == *stride_iter) + expect_stride *= *shape_iter; + else + trivial_broadcast_f = false; + } + } + } + + return + trivial_broadcast_c ? broadcast_trivial::c_trivial : + trivial_broadcast_f ? broadcast_trivial::f_trivial : + broadcast_trivial::non_trivial; +} + +template +struct vectorize_arg { + static_assert(!std::is_rvalue_reference::value, "Functions with rvalue reference arguments cannot be vectorized"); + // The wrapped function gets called with this type: + using call_type = remove_reference_t; + // Is this a vectorized argument? 
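+    // (Arithmetic, complex and registered POD types taken by value or by
+    // const lvalue reference are broadcast as NumPy arrays, e.g. through
+    // m.def("f", py::vectorize(my_func)); any other parameter type is
+    // passed through to the wrapped function unchanged.)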
+ static constexpr bool vectorize = + satisfies_any_of::value && + satisfies_none_of::value && + (!std::is_reference::value || + (std::is_lvalue_reference::value && std::is_const::value)); + // Accept this type: an array for vectorized types, otherwise the type as-is: + using type = conditional_t, array::forcecast>, T>; +}; + +template +struct vectorize_helper { +private: + static constexpr size_t N = sizeof...(Args); + static constexpr size_t NVectorized = constexpr_sum(vectorize_arg::vectorize...); + static_assert(NVectorized >= 1, + "pybind11::vectorize(...) requires a function with at least one vectorizable argument"); + +public: + template + explicit vectorize_helper(T &&f) : f(std::forward(f)) { } + + object operator()(typename vectorize_arg::type... args) { + return run(args..., + make_index_sequence(), + select_indices::vectorize...>(), + make_index_sequence()); + } + +private: + remove_reference_t f; + + // Internal compiler error in MSVC 19.16.27025.1 (Visual Studio 2017 15.9.4), when compiling with "/permissive-" flag + // when arg_call_types is manually inlined. + using arg_call_types = std::tuple::call_type...>; + template using param_n_t = typename std::tuple_element::type; + + // Runs a vectorized function given arguments tuple and three index sequences: + // - Index is the full set of 0 ... (N-1) argument indices; + // - VIndex is the subset of argument indices with vectorized parameters, letting us access + // vectorized arguments (anything not in this sequence is passed through) + // - BIndex is a incremental sequence (beginning at 0) of the same size as VIndex, so that + // we can store vectorized buffer_infos in an array (argument VIndex has its buffer at + // index BIndex in the array). + template object run( + typename vectorize_arg::type &...args, + index_sequence i_seq, index_sequence vi_seq, index_sequence bi_seq) { + + // Pointers to values the function was called with; the vectorized ones set here will start + // out as array_t pointers, but they will be changed them to T pointers before we make + // call the wrapped function. Non-vectorized pointers are left as-is. + std::array params{{ &args... }}; + + // The array of `buffer_info`s of vectorized arguments: + std::array buffers{{ reinterpret_cast(params[VIndex])->request()... }}; + + /* Determine dimensions parameters of output array */ + ssize_t nd = 0; + std::vector shape(0); + auto trivial = broadcast(buffers, nd, shape); + size_t ndim = (size_t) nd; + + size_t size = std::accumulate(shape.begin(), shape.end(), (size_t) 1, std::multiplies()); + + // If all arguments are 0-dimension arrays (i.e. single values) return a plain value (i.e. + // not wrapped in an array). 
+        if (size == 1 && ndim == 0) {
+            PYBIND11_EXPAND_SIDE_EFFECTS(params[VIndex] = buffers[BIndex].ptr);
+            return cast(f(*reinterpret_cast<param_n_t<Index> *>(params[Index])...));
+        }
+
+        array_t<Return> result;
+        if (trivial == broadcast_trivial::f_trivial) result = array_t<Return, array::f_style>(shape);
+        else result = array_t<Return>(shape);
+
+        if (size == 0) return std::move(result);
+
+        /* Call the function */
+        if (trivial == broadcast_trivial::non_trivial)
+            apply_broadcast(buffers, params, result, i_seq, vi_seq, bi_seq);
+        else
+            apply_trivial(buffers, params, result.mutable_data(), size, i_seq, vi_seq, bi_seq);
+
+        return std::move(result);
+    }
+
+    template <size_t... Index, size_t... VIndex, size_t... BIndex>
+    void apply_trivial(std::array<buffer_info, NVectorized> &buffers,
+                       std::array<void *, N> &params,
+                       Return *out,
+                       size_t size,
+                       index_sequence<Index...>, index_sequence<VIndex...>, index_sequence<BIndex...>) {
+
+        // Initialize an array of mutable byte references and sizes with references set to the
+        // appropriate pointer in `params`; as we iterate, we'll increment each pointer by its size
+        // (except for singletons, which get an increment of 0).
+        std::array<std::pair<unsigned char *&, size_t>, NVectorized> vecparams{{
+            std::pair<unsigned char *&, size_t>(
+                    reinterpret_cast<unsigned char *&>(params[VIndex] = buffers[BIndex].ptr),
+                    buffers[BIndex].size == 1 ? 0 : sizeof(param_n_t<VIndex>)
+            )...
+        }};
+
+        for (size_t i = 0; i < size; ++i) {
+            out[i] = f(*reinterpret_cast<param_n_t<Index> *>(params[Index])...);
+            for (auto &x : vecparams) x.first += x.second;
+        }
+    }
+
+    template <size_t... Index, size_t... VIndex, size_t... BIndex>
+    void apply_broadcast(std::array<buffer_info, NVectorized> &buffers,
+                         std::array<void *, N> &params,
+                         array_t<Return> &output_array,
+                         index_sequence<Index...>, index_sequence<VIndex...>, index_sequence<BIndex...>) {
+
+        buffer_info output = output_array.request();
+        multi_array_iterator<NVectorized> input_iter(buffers, output.shape);
+
+        for (array_iterator<Return> iter = array_begin<Return>(output), end = array_end<Return>(output);
+             iter != end;
+             ++iter, ++input_iter) {
+            PYBIND11_EXPAND_SIDE_EFFECTS((
+                params[VIndex] = input_iter.template data<BIndex>()
+            ));
+            *iter = f(*reinterpret_cast<param_n_t<Index> *>(std::get<Index>(params))...);
+        }
+    }
+};
+
+template <typename Func, typename Return, typename... Args>
+vectorize_helper<Func, Return, Args...>
+vectorize_extractor(const Func &f, Return (*) (Args ...)) {
+    return detail::vectorize_helper<Func, Return, Args...>(f);
+}
+
+template <typename T, int Flags> struct handle_type_name<array_t<T, Flags>> {
+    static constexpr auto name = _("numpy.ndarray[") + npy_format_descriptor<T>::name + _("]");
+};
+
+NAMESPACE_END(detail)
+
+// Vanilla pointer vectorizer:
+template <typename Return, typename... Args>
+detail::vectorize_helper<Return (*)(Args...), Return, Args...>
+vectorize(Return (*f) (Args ...)) {
+    return detail::vectorize_helper<Return (*)(Args...), Return, Args...>(f);
+}
+
+// lambda vectorizer:
+template <typename Func, detail::enable_if_t<detail::is_lambda<Func>::value, int> = 0>
+auto vectorize(Func &&f) -> decltype(
+        detail::vectorize_extractor(std::forward<Func>(f), (detail::function_signature_t<Func> *) nullptr)) {
+    return detail::vectorize_extractor(std::forward<Func>(f), (detail::function_signature_t<Func> *) nullptr);
+}
+
+// Vectorize a class method (non-const):
+template <typename Return, typename Class, typename... Args,
+          typename Helper = detail::vectorize_helper<decltype(std::mem_fn(std::declval<Return (Class::*)(Args...)>())), Return, Class *, Args...>>
+Helper vectorize(Return (Class::*f)(Args...)) {
+    return Helper(std::mem_fn(f));
+}
+
+// Vectorize a class method (const):
+template <typename Return, typename Class, typename... Args,
+          typename Helper = detail::vectorize_helper<decltype(std::mem_fn(std::declval<Return (Class::*)(Args...) const>())), Return, const Class *, Args...>>
+Helper vectorize(Return (Class::*f)(Args...) const) {
+    return Helper(std::mem_fn(f));
+}
+
+NAMESPACE_END(PYBIND11_NAMESPACE)
+
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
diff --git a/cviruntime/python/include/pybind11/include/pybind11/operators.h b/cviruntime/python/include/pybind11/include/pybind11/operators.h
new file mode 100644
index 000000000..b3dd62c3b
--- /dev/null
+++ b/cviruntime/python/include/pybind11/include/pybind11/operators.h
@@ -0,0 +1,168 @@
+/*
+    pybind11/operators.h: Metatemplates for operator overloading
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/ + +#pragma once + +#include "pybind11.h" + +#if defined(__clang__) && !defined(__INTEL_COMPILER) +# pragma clang diagnostic ignored "-Wunsequenced" // multiple unsequenced modifications to 'self' (when using def(py::self OP Type())) +#elif defined(_MSC_VER) +# pragma warning(push) +# pragma warning(disable: 4127) // warning C4127: Conditional expression is constant +#endif + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +NAMESPACE_BEGIN(detail) + +/// Enumeration with all supported operator types +enum op_id : int { + op_add, op_sub, op_mul, op_div, op_mod, op_divmod, op_pow, op_lshift, + op_rshift, op_and, op_xor, op_or, op_neg, op_pos, op_abs, op_invert, + op_int, op_long, op_float, op_str, op_cmp, op_gt, op_ge, op_lt, op_le, + op_eq, op_ne, op_iadd, op_isub, op_imul, op_idiv, op_imod, op_ilshift, + op_irshift, op_iand, op_ixor, op_ior, op_complex, op_bool, op_nonzero, + op_repr, op_truediv, op_itruediv, op_hash +}; + +enum op_type : int { + op_l, /* base type on left */ + op_r, /* base type on right */ + op_u /* unary operator */ +}; + +struct self_t { }; +static const self_t self = self_t(); + +/// Type for an unused type slot +struct undefined_t { }; + +/// Don't warn about an unused variable +inline self_t __self() { return self; } + +/// base template of operator implementations +template struct op_impl { }; + +/// Operator implementation generator +template struct op_ { + template void execute(Class &cl, const Extra&... extra) const { + using Base = typename Class::type; + using L_type = conditional_t::value, Base, L>; + using R_type = conditional_t::value, Base, R>; + using op = op_impl; + cl.def(op::name(), &op::execute, is_operator(), extra...); + #if PY_MAJOR_VERSION < 3 + if (id == op_truediv || id == op_itruediv) + cl.def(id == op_itruediv ? "__idiv__" : ot == op_l ? "__div__" : "__rdiv__", + &op::execute, is_operator(), extra...); + #endif + } + template void execute_cast(Class &cl, const Extra&... extra) const { + using Base = typename Class::type; + using L_type = conditional_t::value, Base, L>; + using R_type = conditional_t::value, Base, R>; + using op = op_impl; + cl.def(op::name(), &op::execute_cast, is_operator(), extra...); + #if PY_MAJOR_VERSION < 3 + if (id == op_truediv || id == op_itruediv) + cl.def(id == op_itruediv ? "__idiv__" : ot == op_l ? 
"__div__" : "__rdiv__", + &op::execute, is_operator(), extra...); + #endif + } +}; + +#define PYBIND11_BINARY_OPERATOR(id, rid, op, expr) \ +template struct op_impl { \ + static char const* name() { return "__" #id "__"; } \ + static auto execute(const L &l, const R &r) -> decltype(expr) { return (expr); } \ + static B execute_cast(const L &l, const R &r) { return B(expr); } \ +}; \ +template struct op_impl { \ + static char const* name() { return "__" #rid "__"; } \ + static auto execute(const R &r, const L &l) -> decltype(expr) { return (expr); } \ + static B execute_cast(const R &r, const L &l) { return B(expr); } \ +}; \ +inline op_ op(const self_t &, const self_t &) { \ + return op_(); \ +} \ +template op_ op(const self_t &, const T &) { \ + return op_(); \ +} \ +template op_ op(const T &, const self_t &) { \ + return op_(); \ +} + +#define PYBIND11_INPLACE_OPERATOR(id, op, expr) \ +template struct op_impl { \ + static char const* name() { return "__" #id "__"; } \ + static auto execute(L &l, const R &r) -> decltype(expr) { return expr; } \ + static B execute_cast(L &l, const R &r) { return B(expr); } \ +}; \ +template op_ op(const self_t &, const T &) { \ + return op_(); \ +} + +#define PYBIND11_UNARY_OPERATOR(id, op, expr) \ +template struct op_impl { \ + static char const* name() { return "__" #id "__"; } \ + static auto execute(const L &l) -> decltype(expr) { return expr; } \ + static B execute_cast(const L &l) { return B(expr); } \ +}; \ +inline op_ op(const self_t &) { \ + return op_(); \ +} + +PYBIND11_BINARY_OPERATOR(sub, rsub, operator-, l - r) +PYBIND11_BINARY_OPERATOR(add, radd, operator+, l + r) +PYBIND11_BINARY_OPERATOR(mul, rmul, operator*, l * r) +PYBIND11_BINARY_OPERATOR(truediv, rtruediv, operator/, l / r) +PYBIND11_BINARY_OPERATOR(mod, rmod, operator%, l % r) +PYBIND11_BINARY_OPERATOR(lshift, rlshift, operator<<, l << r) +PYBIND11_BINARY_OPERATOR(rshift, rrshift, operator>>, l >> r) +PYBIND11_BINARY_OPERATOR(and, rand, operator&, l & r) +PYBIND11_BINARY_OPERATOR(xor, rxor, operator^, l ^ r) +PYBIND11_BINARY_OPERATOR(eq, eq, operator==, l == r) +PYBIND11_BINARY_OPERATOR(ne, ne, operator!=, l != r) +PYBIND11_BINARY_OPERATOR(or, ror, operator|, l | r) +PYBIND11_BINARY_OPERATOR(gt, lt, operator>, l > r) +PYBIND11_BINARY_OPERATOR(ge, le, operator>=, l >= r) +PYBIND11_BINARY_OPERATOR(lt, gt, operator<, l < r) +PYBIND11_BINARY_OPERATOR(le, ge, operator<=, l <= r) +//PYBIND11_BINARY_OPERATOR(pow, rpow, pow, std::pow(l, r)) +PYBIND11_INPLACE_OPERATOR(iadd, operator+=, l += r) +PYBIND11_INPLACE_OPERATOR(isub, operator-=, l -= r) +PYBIND11_INPLACE_OPERATOR(imul, operator*=, l *= r) +PYBIND11_INPLACE_OPERATOR(itruediv, operator/=, l /= r) +PYBIND11_INPLACE_OPERATOR(imod, operator%=, l %= r) +PYBIND11_INPLACE_OPERATOR(ilshift, operator<<=, l <<= r) +PYBIND11_INPLACE_OPERATOR(irshift, operator>>=, l >>= r) +PYBIND11_INPLACE_OPERATOR(iand, operator&=, l &= r) +PYBIND11_INPLACE_OPERATOR(ixor, operator^=, l ^= r) +PYBIND11_INPLACE_OPERATOR(ior, operator|=, l |= r) +PYBIND11_UNARY_OPERATOR(neg, operator-, -l) +PYBIND11_UNARY_OPERATOR(pos, operator+, +l) +PYBIND11_UNARY_OPERATOR(abs, abs, std::abs(l)) +PYBIND11_UNARY_OPERATOR(hash, hash, std::hash()(l)) +PYBIND11_UNARY_OPERATOR(invert, operator~, (~l)) +PYBIND11_UNARY_OPERATOR(bool, operator!, !!l) +PYBIND11_UNARY_OPERATOR(int, int_, (int) l) +PYBIND11_UNARY_OPERATOR(float, float_, (double) l) + +#undef PYBIND11_BINARY_OPERATOR +#undef PYBIND11_INPLACE_OPERATOR +#undef PYBIND11_UNARY_OPERATOR +NAMESPACE_END(detail) + +using 
detail::self; + +NAMESPACE_END(PYBIND11_NAMESPACE) + +#if defined(_MSC_VER) +# pragma warning(pop) +#endif diff --git a/cviruntime/python/include/pybind11/include/pybind11/options.h b/cviruntime/python/include/pybind11/include/pybind11/options.h new file mode 100644 index 000000000..cc1e1f6f0 --- /dev/null +++ b/cviruntime/python/include/pybind11/include/pybind11/options.h @@ -0,0 +1,65 @@ +/* + pybind11/options.h: global settings that are configurable at runtime. + + Copyright (c) 2016 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. +*/ + +#pragma once + +#include "detail/common.h" + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) + +class options { +public: + + // Default RAII constructor, which leaves settings as they currently are. + options() : previous_state(global_state()) {} + + // Class is non-copyable. + options(const options&) = delete; + options& operator=(const options&) = delete; + + // Destructor, which restores settings that were in effect before. + ~options() { + global_state() = previous_state; + } + + // Setter methods (affect the global state): + + options& disable_user_defined_docstrings() & { global_state().show_user_defined_docstrings = false; return *this; } + + options& enable_user_defined_docstrings() & { global_state().show_user_defined_docstrings = true; return *this; } + + options& disable_function_signatures() & { global_state().show_function_signatures = false; return *this; } + + options& enable_function_signatures() & { global_state().show_function_signatures = true; return *this; } + + // Getter methods (return the global state): + + static bool show_user_defined_docstrings() { return global_state().show_user_defined_docstrings; } + + static bool show_function_signatures() { return global_state().show_function_signatures; } + + // This type is not meant to be allocated on the heap. + void* operator new(size_t) = delete; + +private: + + struct state { + bool show_user_defined_docstrings = true; //< Include user-supplied texts in docstrings. + bool show_function_signatures = true; //< Include auto-generated function signatures in docstrings. + }; + + static state &global_state() { + static state instance; + return instance; + } + + state previous_state; +}; + +NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/cviruntime/python/include/pybind11/include/pybind11/pybind11.h b/cviruntime/python/include/pybind11/include/pybind11/pybind11.h new file mode 100644 index 000000000..d95d61f7b --- /dev/null +++ b/cviruntime/python/include/pybind11/include/pybind11/pybind11.h @@ -0,0 +1,2183 @@ +/* + pybind11/pybind11.h: Main header file of the C++11 python + binding generator library + + Copyright (c) 2016 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. 
+*/ + +#pragma once + +#if defined(__INTEL_COMPILER) +# pragma warning push +# pragma warning disable 68 // integer conversion resulted in a change of sign +# pragma warning disable 186 // pointless comparison of unsigned integer with zero +# pragma warning disable 878 // incompatible exception specifications +# pragma warning disable 1334 // the "template" keyword used for syntactic disambiguation may only be used within a template +# pragma warning disable 1682 // implicit conversion of a 64-bit integral type to a smaller integral type (potential portability problem) +# pragma warning disable 1786 // function "strdup" was declared deprecated +# pragma warning disable 1875 // offsetof applied to non-POD (Plain Old Data) types is nonstandard +# pragma warning disable 2196 // warning #2196: routine is both "inline" and "noinline" +#elif defined(_MSC_VER) +# pragma warning(push) +# pragma warning(disable: 4100) // warning C4100: Unreferenced formal parameter +# pragma warning(disable: 4127) // warning C4127: Conditional expression is constant +# pragma warning(disable: 4512) // warning C4512: Assignment operator was implicitly defined as deleted +# pragma warning(disable: 4800) // warning C4800: 'int': forcing value to bool 'true' or 'false' (performance warning) +# pragma warning(disable: 4996) // warning C4996: The POSIX name for this item is deprecated. Instead, use the ISO C and C++ conformant name +# pragma warning(disable: 4702) // warning C4702: unreachable code +# pragma warning(disable: 4522) // warning C4522: multiple assignment operators specified +#elif defined(__GNUG__) && !defined(__clang__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wunused-but-set-parameter" +# pragma GCC diagnostic ignored "-Wunused-but-set-variable" +# pragma GCC diagnostic ignored "-Wmissing-field-initializers" +# pragma GCC diagnostic ignored "-Wstrict-aliasing" +# pragma GCC diagnostic ignored "-Wattributes" +# if __GNUC__ >= 7 +# pragma GCC diagnostic ignored "-Wnoexcept-type" +# endif +#endif + +#include "attr.h" +#include "options.h" +#include "detail/class.h" +#include "detail/init.h" + +#if defined(__GNUG__) && !defined(__clang__) +# include +#endif + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) + +/// Wraps an arbitrary C++ function/method/lambda function/.. into a callable Python object +class cpp_function : public function { +public: + cpp_function() { } + cpp_function(std::nullptr_t) { } + + /// Construct a cpp_function from a vanilla function pointer + template + cpp_function(Return (*f)(Args...), const Extra&... extra) { + initialize(f, f, extra...); + } + + /// Construct a cpp_function from a lambda function (possibly with internal state) + template ::value>> + cpp_function(Func &&f, const Extra&... extra) { + initialize(std::forward(f), + (detail::function_signature_t *) nullptr, extra...); + } + + /// Construct a cpp_function from a class method (non-const) + template + cpp_function(Return (Class::*f)(Arg...), const Extra&... extra) { + initialize([f](Class *c, Arg... args) -> Return { return (c->*f)(args...); }, + (Return (*) (Class *, Arg...)) nullptr, extra...); + } + + /// Construct a cpp_function from a class method (const) + template + cpp_function(Return (Class::*f)(Arg...) const, const Extra&... extra) { + initialize([f](const Class *c, Arg... 
args) -> Return { return (c->*f)(args...); }, + (Return (*)(const Class *, Arg ...)) nullptr, extra...); + } + + /// Return the function name + object name() const { return attr("__name__"); } + +protected: + /// Space optimization: don't inline this frequently instantiated fragment + PYBIND11_NOINLINE detail::function_record *make_function_record() { + return new detail::function_record(); + } + + /// Special internal constructor for functors, lambda functions, etc. + template + void initialize(Func &&f, Return (*)(Args...), const Extra&... extra) { + using namespace detail; + struct capture { remove_reference_t f; }; + + /* Store the function including any extra state it might have (e.g. a lambda capture object) */ + auto rec = make_function_record(); + + /* Store the capture object directly in the function record if there is enough space */ + if (sizeof(capture) <= sizeof(rec->data)) { + /* Without these pragmas, GCC warns that there might not be + enough space to use the placement new operator. However, the + 'if' statement above ensures that this is the case. */ +#if defined(__GNUG__) && !defined(__clang__) && __GNUC__ >= 6 +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wplacement-new" +#endif + new ((capture *) &rec->data) capture { std::forward(f) }; +#if defined(__GNUG__) && !defined(__clang__) && __GNUC__ >= 6 +# pragma GCC diagnostic pop +#endif + if (!std::is_trivially_destructible::value) + rec->free_data = [](function_record *r) { ((capture *) &r->data)->~capture(); }; + } else { + rec->data[0] = new capture { std::forward(f) }; + rec->free_data = [](function_record *r) { delete ((capture *) r->data[0]); }; + } + + /* Type casters for the function arguments and return value */ + using cast_in = argument_loader; + using cast_out = make_caster< + conditional_t::value, void_type, Return> + >; + + static_assert(expected_num_args(sizeof...(Args), cast_in::has_args, cast_in::has_kwargs), + "The number of argument annotations does not match the number of function arguments"); + + /* Dispatch code which converts function arguments and performs the actual function call */ + rec->impl = [](function_call &call) -> handle { + cast_in args_converter; + + /* Try to cast the function arguments into the C++ domain */ + if (!args_converter.load_args(call)) + return PYBIND11_TRY_NEXT_OVERLOAD; + + /* Invoke call policy pre-call hook */ + process_attributes::precall(call); + + /* Get a pointer to the capture object */ + auto data = (sizeof(capture) <= sizeof(call.func.data) + ? 
&call.func.data : call.func.data[0]); + capture *cap = const_cast(reinterpret_cast(data)); + + /* Override policy for rvalues -- usually to enforce rvp::move on an rvalue */ + return_value_policy policy = return_value_policy_override::policy(call.func.policy); + + /* Function scope guard -- defaults to the compile-to-nothing `void_type` */ + using Guard = extract_guard_t; + + /* Perform the function call */ + handle result = cast_out::cast( + std::move(args_converter).template call(cap->f), policy, call.parent); + + /* Invoke call policy post-call hook */ + process_attributes::postcall(call, result); + + return result; + }; + + /* Process any user-provided function attributes */ + process_attributes::init(extra..., rec); + + /* Generate a readable signature describing the function's arguments and return value types */ + static constexpr auto signature = _("(") + cast_in::arg_names + _(") -> ") + cast_out::name; + PYBIND11_DESCR_CONSTEXPR auto types = decltype(signature)::types(); + + /* Register the function with Python from generic (non-templated) code */ + initialize_generic(rec, signature.text, types.data(), sizeof...(Args)); + + if (cast_in::has_args) rec->has_args = true; + if (cast_in::has_kwargs) rec->has_kwargs = true; + + /* Stash some additional information used by an important optimization in 'functional.h' */ + using FunctionType = Return (*)(Args...); + constexpr bool is_function_ptr = + std::is_convertible::value && + sizeof(capture) == sizeof(void *); + if (is_function_ptr) { + rec->is_stateless = true; + rec->data[1] = const_cast(reinterpret_cast(&typeid(FunctionType))); + } + } + + /// Register a function call with Python (generic non-templated code goes here) + void initialize_generic(detail::function_record *rec, const char *text, + const std::type_info *const *types, size_t args) { + + /* Create copies of all referenced C-style strings */ + rec->name = strdup(rec->name ? rec->name : ""); + if (rec->doc) rec->doc = strdup(rec->doc); + for (auto &a: rec->args) { + if (a.name) + a.name = strdup(a.name); + if (a.descr) + a.descr = strdup(a.descr); + else if (a.value) + a.descr = strdup(a.value.attr("__repr__")().cast().c_str()); + } + + rec->is_constructor = !strcmp(rec->name, "__init__") || !strcmp(rec->name, "__setstate__"); + +#if !defined(NDEBUG) && !defined(PYBIND11_DISABLE_NEW_STYLE_INIT_WARNING) + if (rec->is_constructor && !rec->is_new_style_constructor) { + const auto class_name = std::string(((PyTypeObject *) rec->scope.ptr())->tp_name); + const auto func_name = std::string(rec->name); + PyErr_WarnEx( + PyExc_FutureWarning, + ("pybind11-bound class '" + class_name + "' is using an old-style " + "placement-new '" + func_name + "' which has been deprecated. See " + "the upgrade guide in pybind11's docs. This message is only visible " + "when compiled in debug mode.").c_str(), 0 + ); + } +#endif + + /* Generate a proper function signature */ + std::string signature; + size_t type_index = 0, arg_index = 0; + for (auto *pc = text; *pc != '\0'; ++pc) { + const auto c = *pc; + + if (c == '{') { + // Write arg name for everything except *args and **kwargs. + if (*(pc + 1) == '*') + continue; + + if (arg_index < rec->args.size() && rec->args[arg_index].name) { + signature += rec->args[arg_index].name; + } else if (arg_index == 0 && rec->is_method) { + signature += "self"; + } else { + signature += "arg" + std::to_string(arg_index - (rec->is_method ? 1 : 0)); + } + signature += ": "; + } else if (c == '}') { + // Write default value if available. 
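+                // (Editor's sketch, not part of the upstream header: for a binding
+                // such as
+                //     m.def("scale", [](int x, float f) { return x * f; },
+                //           py::arg("x"), py::arg("f") = 2.0f);
+                // this loop expands the brace/percent text template into the
+                // docstring signature
+                //     scale(x: int, f: float = 2.0) -> float
+                // taking argument names from rec->args and default values from
+                // their repr().)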
+ if (arg_index < rec->args.size() && rec->args[arg_index].descr) { + signature += " = "; + signature += rec->args[arg_index].descr; + } + arg_index++; + } else if (c == '%') { + const std::type_info *t = types[type_index++]; + if (!t) + pybind11_fail("Internal error while parsing type signature (1)"); + if (auto tinfo = detail::get_type_info(*t)) { + handle th((PyObject *) tinfo->type); + signature += + th.attr("__module__").cast() + "." + + th.attr("__qualname__").cast(); // Python 3.3+, but we backport it to earlier versions + } else if (rec->is_new_style_constructor && arg_index == 0) { + // A new-style `__init__` takes `self` as `value_and_holder`. + // Rewrite it to the proper class type. + signature += + rec->scope.attr("__module__").cast() + "." + + rec->scope.attr("__qualname__").cast(); + } else { + std::string tname(t->name()); + detail::clean_type_id(tname); + signature += tname; + } + } else { + signature += c; + } + } + if (arg_index != args || types[type_index] != nullptr) + pybind11_fail("Internal error while parsing type signature (2)"); + +#if PY_MAJOR_VERSION < 3 + if (strcmp(rec->name, "__next__") == 0) { + std::free(rec->name); + rec->name = strdup("next"); + } else if (strcmp(rec->name, "__bool__") == 0) { + std::free(rec->name); + rec->name = strdup("__nonzero__"); + } +#endif + rec->signature = strdup(signature.c_str()); + rec->args.shrink_to_fit(); + rec->nargs = (std::uint16_t) args; + + if (rec->sibling && PYBIND11_INSTANCE_METHOD_CHECK(rec->sibling.ptr())) + rec->sibling = PYBIND11_INSTANCE_METHOD_GET_FUNCTION(rec->sibling.ptr()); + + detail::function_record *chain = nullptr, *chain_start = rec; + if (rec->sibling) { + if (PyCFunction_Check(rec->sibling.ptr())) { + auto rec_capsule = reinterpret_borrow(PyCFunction_GET_SELF(rec->sibling.ptr())); + chain = (detail::function_record *) rec_capsule; + /* Never append a method to an overload chain of a parent class; + instead, hide the parent's overloads in this case */ + if (!chain->scope.is(rec->scope)) + chain = nullptr; + } + // Don't trigger for things like the default __init__, which are wrapper_descriptors that we are intentionally replacing + else if (!rec->sibling.is_none() && rec->name[0] != '_') + pybind11_fail("Cannot overload existing non-function object \"" + std::string(rec->name) + + "\" with a function of the same name"); + } + + if (!chain) { + /* No existing overload was found, create a new function object */ + rec->def = new PyMethodDef(); + std::memset(rec->def, 0, sizeof(PyMethodDef)); + rec->def->ml_name = rec->name; + rec->def->ml_meth = reinterpret_cast(reinterpret_cast(*dispatcher)); + rec->def->ml_flags = METH_VARARGS | METH_KEYWORDS; + + capsule rec_capsule(rec, [](void *ptr) { + destruct((detail::function_record *) ptr); + }); + + object scope_module; + if (rec->scope) { + if (hasattr(rec->scope, "__module__")) { + scope_module = rec->scope.attr("__module__"); + } else if (hasattr(rec->scope, "__name__")) { + scope_module = rec->scope.attr("__name__"); + } + } + + m_ptr = PyCFunction_NewEx(rec->def, rec_capsule.ptr(), scope_module.ptr()); + if (!m_ptr) + pybind11_fail("cpp_function::cpp_function(): Could not allocate function object"); + } else { + /* Append at the end of the overload chain */ + m_ptr = rec->sibling.ptr(); + inc_ref(); + chain_start = chain; + if (chain->is_method != rec->is_method) + pybind11_fail("overloading a method with both static and instance methods is not supported; " + #if defined(NDEBUG) + "compile in debug mode for more details" + #else + "error while 
attempting to bind " + std::string(rec->is_method ? "instance" : "static") + " method " + + std::string(pybind11::str(rec->scope.attr("__name__"))) + "." + std::string(rec->name) + signature + #endif + ); + while (chain->next) + chain = chain->next; + chain->next = rec; + } + + std::string signatures; + int index = 0; + /* Create a nice pydoc rec including all signatures and + docstrings of the functions in the overload chain */ + if (chain && options::show_function_signatures()) { + // First a generic signature + signatures += rec->name; + signatures += "(*args, **kwargs)\n"; + signatures += "Overloaded function.\n\n"; + } + // Then specific overload signatures + bool first_user_def = true; + for (auto it = chain_start; it != nullptr; it = it->next) { + if (options::show_function_signatures()) { + if (index > 0) signatures += "\n"; + if (chain) + signatures += std::to_string(++index) + ". "; + signatures += rec->name; + signatures += it->signature; + signatures += "\n"; + } + if (it->doc && strlen(it->doc) > 0 && options::show_user_defined_docstrings()) { + // If we're appending another docstring, and aren't printing function signatures, we + // need to append a newline first: + if (!options::show_function_signatures()) { + if (first_user_def) first_user_def = false; + else signatures += "\n"; + } + if (options::show_function_signatures()) signatures += "\n"; + signatures += it->doc; + if (options::show_function_signatures()) signatures += "\n"; + } + } + + /* Install docstring */ + PyCFunctionObject *func = (PyCFunctionObject *) m_ptr; + if (func->m_ml->ml_doc) + std::free(const_cast(func->m_ml->ml_doc)); + func->m_ml->ml_doc = strdup(signatures.c_str()); + + if (rec->is_method) { + m_ptr = PYBIND11_INSTANCE_METHOD_NEW(m_ptr, rec->scope.ptr()); + if (!m_ptr) + pybind11_fail("cpp_function::cpp_function(): Could not allocate instance method object"); + Py_DECREF(func); + } + } + + /// When a cpp_function is GCed, release any memory allocated by pybind11 + static void destruct(detail::function_record *rec) { + while (rec) { + detail::function_record *next = rec->next; + if (rec->free_data) + rec->free_data(rec); + std::free((char *) rec->name); + std::free((char *) rec->doc); + std::free((char *) rec->signature); + for (auto &arg: rec->args) { + std::free(const_cast(arg.name)); + std::free(const_cast(arg.descr)); + arg.value.dec_ref(); + } + if (rec->def) { + std::free(const_cast(rec->def->ml_doc)); + delete rec->def; + } + delete rec; + rec = next; + } + } + + /// Main dispatch logic for calls to functions bound using pybind11 + static PyObject *dispatcher(PyObject *self, PyObject *args_in, PyObject *kwargs_in) { + using namespace detail; + + /* Iterator over the list of potentially admissible overloads */ + const function_record *overloads = (function_record *) PyCapsule_GetPointer(self, nullptr), + *it = overloads; + + /* Need to know how many arguments + keyword arguments there are to pick the right overload */ + const size_t n_args_in = (size_t) PyTuple_GET_SIZE(args_in); + + handle parent = n_args_in > 0 ? PyTuple_GET_ITEM(args_in, 0) : nullptr, + result = PYBIND11_TRY_NEXT_OVERLOAD; + + auto self_value_and_holder = value_and_holder(); + if (overloads->is_constructor) { + const auto tinfo = get_type_info((PyTypeObject *) overloads->scope.ptr()); + const auto pi = reinterpret_cast(parent.ptr()); + self_value_and_holder = pi->get_value_and_holder(tinfo, false); + + if (!self_value_and_holder.type || !self_value_and_holder.inst) { + PyErr_SetString(PyExc_TypeError, "__init__(self, ...) 
called with invalid `self` argument"); + return nullptr; + } + + // If this value is already registered it must mean __init__ is invoked multiple times; + // we really can't support that in C++, so just ignore the second __init__. + if (self_value_and_holder.instance_registered()) + return none().release().ptr(); + } + + try { + // We do this in two passes: in the first pass, we load arguments with `convert=false`; + // in the second, we allow conversion (except for arguments with an explicit + // py::arg().noconvert()). This lets us prefer calls without conversion, with + // conversion as a fallback. + std::vector second_pass; + + // However, if there are no overloads, we can just skip the no-convert pass entirely + const bool overloaded = it != nullptr && it->next != nullptr; + + for (; it != nullptr; it = it->next) { + + /* For each overload: + 1. Copy all positional arguments we were given, also checking to make sure that + named positional arguments weren't *also* specified via kwarg. + 2. If we weren't given enough, try to make up the omitted ones by checking + whether they were provided by a kwarg matching the `py::arg("name")` name. If + so, use it (and remove it from kwargs; if not, see if the function binding + provided a default that we can use. + 3. Ensure that either all keyword arguments were "consumed", or that the function + takes a kwargs argument to accept unconsumed kwargs. + 4. Any positional arguments still left get put into a tuple (for args), and any + leftover kwargs get put into a dict. + 5. Pack everything into a vector; if we have py::args or py::kwargs, they are an + extra tuple or dict at the end of the positional arguments. + 6. Call the function call dispatcher (function_record::impl) + + If one of these fail, move on to the next overload and keep trying until we get a + result other than PYBIND11_TRY_NEXT_OVERLOAD. + */ + + const function_record &func = *it; + size_t pos_args = func.nargs; // Number of positional arguments that we need + if (func.has_args) --pos_args; // (but don't count py::args + if (func.has_kwargs) --pos_args; // or py::kwargs) + + if (!func.has_args && n_args_in > pos_args) + continue; // Too many arguments for this overload + + if (n_args_in < pos_args && func.args.size() < pos_args) + continue; // Not enough arguments given, and not enough defaults to fill in the blanks + + function_call call(func, parent); + + size_t args_to_copy = (std::min)(pos_args, n_args_in); // Protect std::min with parentheses + size_t args_copied = 0; + + // 0. Inject new-style `self` argument + if (func.is_new_style_constructor) { + // The `value` may have been preallocated by an old-style `__init__` + // if it was a preceding candidate for overload resolution. + if (self_value_and_holder) + self_value_and_holder.type->dealloc(self_value_and_holder); + + call.init_self = PyTuple_GET_ITEM(args_in, 0); + call.args.push_back(reinterpret_cast(&self_value_and_holder)); + call.args_convert.push_back(false); + ++args_copied; + } + + // 1. Copy any position arguments given. + bool bad_arg = false; + for (; args_copied < args_to_copy; ++args_copied) { + const argument_record *arg_rec = args_copied < func.args.size() ? 
&func.args[args_copied] : nullptr; + if (kwargs_in && arg_rec && arg_rec->name && PyDict_GetItemString(kwargs_in, arg_rec->name)) { + bad_arg = true; + break; + } + + handle arg(PyTuple_GET_ITEM(args_in, args_copied)); + if (arg_rec && !arg_rec->none && arg.is_none()) { + bad_arg = true; + break; + } + call.args.push_back(arg); + call.args_convert.push_back(arg_rec ? arg_rec->convert : true); + } + if (bad_arg) + continue; // Maybe it was meant for another overload (issue #688) + + // We'll need to copy this if we steal some kwargs for defaults + dict kwargs = reinterpret_borrow(kwargs_in); + + // 2. Check kwargs and, failing that, defaults that may help complete the list + if (args_copied < pos_args) { + bool copied_kwargs = false; + + for (; args_copied < pos_args; ++args_copied) { + const auto &arg = func.args[args_copied]; + + handle value; + if (kwargs_in && arg.name) + value = PyDict_GetItemString(kwargs.ptr(), arg.name); + + if (value) { + // Consume a kwargs value + if (!copied_kwargs) { + kwargs = reinterpret_steal(PyDict_Copy(kwargs.ptr())); + copied_kwargs = true; + } + PyDict_DelItemString(kwargs.ptr(), arg.name); + } else if (arg.value) { + value = arg.value; + } + + if (value) { + call.args.push_back(value); + call.args_convert.push_back(arg.convert); + } + else + break; + } + + if (args_copied < pos_args) + continue; // Not enough arguments, defaults, or kwargs to fill the positional arguments + } + + // 3. Check everything was consumed (unless we have a kwargs arg) + if (kwargs && kwargs.size() > 0 && !func.has_kwargs) + continue; // Unconsumed kwargs, but no py::kwargs argument to accept them + + // 4a. If we have a py::args argument, create a new tuple with leftovers + if (func.has_args) { + tuple extra_args; + if (args_to_copy == 0) { + // We didn't copy out any position arguments from the args_in tuple, so we + // can reuse it directly without copying: + extra_args = reinterpret_borrow(args_in); + } else if (args_copied >= n_args_in) { + extra_args = tuple(0); + } else { + size_t args_size = n_args_in - args_copied; + extra_args = tuple(args_size); + for (size_t i = 0; i < args_size; ++i) { + extra_args[i] = PyTuple_GET_ITEM(args_in, args_copied + i); + } + } + call.args.push_back(extra_args); + call.args_convert.push_back(false); + call.args_ref = std::move(extra_args); + } + + // 4b. If we have a py::kwargs, pass on any remaining kwargs + if (func.has_kwargs) { + if (!kwargs.ptr()) + kwargs = dict(); // If we didn't get one, send an empty one + call.args.push_back(kwargs); + call.args_convert.push_back(false); + call.kwargs_ref = std::move(kwargs); + } + + // 5. Put everything in a vector. Not technically step 5, we've been building it + // in `call.args` all along. + #if !defined(NDEBUG) + if (call.args.size() != func.nargs || call.args_convert.size() != func.nargs) + pybind11_fail("Internal error: function call dispatcher inserted wrong number of arguments!"); + #endif + + std::vector second_pass_convert; + if (overloaded) { + // We're in the first no-convert pass, so swap out the conversion flags for a + // set of all-false flags. If the call fails, we'll swap the flags back in for + // the conversion-allowed call below. + second_pass_convert.resize(func.nargs, false); + call.args_convert.swap(second_pass_convert); + } + + // 6. Call the function. 
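+                // (Editor's sketch, not part of the upstream header: given
+                //     m.def("g", [](double d) { return "float overload"; });
+                //     m.def("g", [](py::str s) { return "str overload"; });
+                // g(1.5) succeeds in this first, no-convert pass, while g(1)
+                // matches neither overload exactly and is only resolved in the
+                // second pass below, where the int may convert to double.)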
+ try { + loader_life_support guard{}; + result = func.impl(call); + } catch (reference_cast_error &) { + result = PYBIND11_TRY_NEXT_OVERLOAD; + } + + if (result.ptr() != PYBIND11_TRY_NEXT_OVERLOAD) + break; + + if (overloaded) { + // The (overloaded) call failed; if the call has at least one argument that + // permits conversion (i.e. it hasn't been explicitly specified `.noconvert()`) + // then add this call to the list of second pass overloads to try. + for (size_t i = func.is_method ? 1 : 0; i < pos_args; i++) { + if (second_pass_convert[i]) { + // Found one: swap the converting flags back in and store the call for + // the second pass. + call.args_convert.swap(second_pass_convert); + second_pass.push_back(std::move(call)); + break; + } + } + } + } + + if (overloaded && !second_pass.empty() && result.ptr() == PYBIND11_TRY_NEXT_OVERLOAD) { + // The no-conversion pass finished without success, try again with conversion allowed + for (auto &call : second_pass) { + try { + loader_life_support guard{}; + result = call.func.impl(call); + } catch (reference_cast_error &) { + result = PYBIND11_TRY_NEXT_OVERLOAD; + } + + if (result.ptr() != PYBIND11_TRY_NEXT_OVERLOAD) { + // The error reporting logic below expects 'it' to be valid, as it would be + // if we'd encountered this failure in the first-pass loop. + if (!result) + it = &call.func; + break; + } + } + } + } catch (error_already_set &e) { + e.restore(); + return nullptr; +#if defined(__GNUG__) && !defined(__clang__) + } catch ( abi::__forced_unwind& ) { + throw; +#endif + } catch (...) { + /* When an exception is caught, give each registered exception + translator a chance to translate it to a Python exception + in reverse order of registration. + + A translator may choose to do one of the following: + + - catch the exception and call PyErr_SetString or PyErr_SetObject + to set a standard (or custom) Python exception, or + - do nothing and let the exception fall through to the next translator, or + - delegate translation to the next translator by throwing a new type of exception. */ + + auto last_exception = std::current_exception(); + auto ®istered_exception_translators = get_internals().registered_exception_translators; + for (auto& translator : registered_exception_translators) { + try { + translator(last_exception); + } catch (...) { + last_exception = std::current_exception(); + continue; + } + return nullptr; + } + PyErr_SetString(PyExc_SystemError, "Exception escaped from default exception translator!"); + return nullptr; + } + + auto append_note_if_missing_header_is_suspected = [](std::string &msg) { + if (msg.find("std::") != std::string::npos) { + msg += "\n\n" + "Did you forget to `#include `? Or ,\n" + ", , etc. Some automatic\n" + "conversions are optional and require extra headers to be included\n" + "when compiling your pybind11 module."; + } + }; + + if (result.ptr() == PYBIND11_TRY_NEXT_OVERLOAD) { + if (overloads->is_operator) + return handle(Py_NotImplemented).inc_ref().ptr(); + + std::string msg = std::string(overloads->name) + "(): incompatible " + + std::string(overloads->is_constructor ? "constructor" : "function") + + " arguments. The following argument types are supported:\n"; + + int ctr = 0; + for (const function_record *it2 = overloads; it2 != nullptr; it2 = it2->next) { + msg += " "+ std::to_string(++ctr) + ". "; + + bool wrote_sig = false; + if (overloads->is_constructor) { + // For a constructor, rewrite `(self: Object, arg0, ...) 
-> NoneType` as `Object(arg0, ...)` + std::string sig = it2->signature; + size_t start = sig.find('(') + 7; // skip "(self: " + if (start < sig.size()) { + // End at the , for the next argument + size_t end = sig.find(", "), next = end + 2; + size_t ret = sig.rfind(" -> "); + // Or the ), if there is no comma: + if (end >= sig.size()) next = end = sig.find(')'); + if (start < end && next < sig.size()) { + msg.append(sig, start, end - start); + msg += '('; + msg.append(sig, next, ret - next); + wrote_sig = true; + } + } + } + if (!wrote_sig) msg += it2->signature; + + msg += "\n"; + } + msg += "\nInvoked with: "; + auto args_ = reinterpret_borrow(args_in); + bool some_args = false; + for (size_t ti = overloads->is_constructor ? 1 : 0; ti < args_.size(); ++ti) { + if (!some_args) some_args = true; + else msg += ", "; + msg += pybind11::repr(args_[ti]); + } + if (kwargs_in) { + auto kwargs = reinterpret_borrow(kwargs_in); + if (kwargs.size() > 0) { + if (some_args) msg += "; "; + msg += "kwargs: "; + bool first = true; + for (auto kwarg : kwargs) { + if (first) first = false; + else msg += ", "; + msg += pybind11::str("{}={!r}").format(kwarg.first, kwarg.second); + } + } + } + + append_note_if_missing_header_is_suspected(msg); + PyErr_SetString(PyExc_TypeError, msg.c_str()); + return nullptr; + } else if (!result) { + std::string msg = "Unable to convert function return value to a " + "Python type! The signature was\n\t"; + msg += it->signature; + append_note_if_missing_header_is_suspected(msg); + PyErr_SetString(PyExc_TypeError, msg.c_str()); + return nullptr; + } else { + if (overloads->is_constructor && !self_value_and_holder.holder_constructed()) { + auto *pi = reinterpret_cast(parent.ptr()); + self_value_and_holder.type->init_instance(pi, nullptr); + } + return result.ptr(); + } + } +}; + +/// Wrapper for Python extension modules +class module : public object { +public: + PYBIND11_OBJECT_DEFAULT(module, object, PyModule_Check) + + /// Create a new top-level Python module with the given name and docstring + explicit module(const char *name, const char *doc = nullptr) { + if (!options::show_user_defined_docstrings()) doc = nullptr; +#if PY_MAJOR_VERSION >= 3 + PyModuleDef *def = new PyModuleDef(); + std::memset(def, 0, sizeof(PyModuleDef)); + def->m_name = name; + def->m_doc = doc; + def->m_size = -1; + Py_INCREF(def); + m_ptr = PyModule_Create(def); +#else + m_ptr = Py_InitModule3(name, nullptr, doc); +#endif + if (m_ptr == nullptr) + pybind11_fail("Internal error in module::module()"); + inc_ref(); + } + + /** \rst + Create Python binding for a new function within the module scope. ``Func`` + can be a plain C++ function, a function pointer, or a lambda function. For + details on the ``Extra&& ... extra`` argument, see section :ref:`extras`. + \endrst */ + template + module &def(const char *name_, Func &&f, const Extra& ... extra) { + cpp_function func(std::forward(f), name(name_), scope(*this), + sibling(getattr(*this, name_, none())), extra...); + // NB: allow overwriting here because cpp_function sets up a chain with the intention of + // overwriting (and has already checked internally that it isn't overwriting non-functions). + add_object(name_, func, true /* overwrite */); + return *this; + } + + /** \rst + Create and return a new Python submodule with the given name and docstring. + This also works recursively, i.e. + + .. 
code-block:: cpp + + py::module m("example", "pybind11 example plugin"); + py::module m2 = m.def_submodule("sub", "A submodule of 'example'"); + py::module m3 = m2.def_submodule("subsub", "A submodule of 'example.sub'"); + \endrst */ + module def_submodule(const char *name, const char *doc = nullptr) { + std::string full_name = std::string(PyModule_GetName(m_ptr)) + + std::string(".") + std::string(name); + auto result = reinterpret_borrow(PyImport_AddModule(full_name.c_str())); + if (doc && options::show_user_defined_docstrings()) + result.attr("__doc__") = pybind11::str(doc); + attr(name) = result; + return result; + } + + /// Import and return a module or throws `error_already_set`. + static module import(const char *name) { + PyObject *obj = PyImport_ImportModule(name); + if (!obj) + throw error_already_set(); + return reinterpret_steal(obj); + } + + /// Reload the module or throws `error_already_set`. + void reload() { + PyObject *obj = PyImport_ReloadModule(ptr()); + if (!obj) + throw error_already_set(); + *this = reinterpret_steal(obj); + } + + // Adds an object to the module using the given name. Throws if an object with the given name + // already exists. + // + // overwrite should almost always be false: attempting to overwrite objects that pybind11 has + // established will, in most cases, break things. + PYBIND11_NOINLINE void add_object(const char *name, handle obj, bool overwrite = false) { + if (!overwrite && hasattr(*this, name)) + pybind11_fail("Error during initialization: multiple incompatible definitions with name \"" + + std::string(name) + "\""); + + PyModule_AddObject(ptr(), name, obj.inc_ref().ptr() /* steals a reference */); + } +}; + +/// \ingroup python_builtins +/// Return a dictionary representing the global variables in the current execution frame, +/// or ``__main__.__dict__`` if there is no frame (usually when the interpreter is embedded). +inline dict globals() { + PyObject *p = PyEval_GetGlobals(); + return reinterpret_borrow(p ? p : module::import("__main__").attr("__dict__").ptr()); +} + +NAMESPACE_BEGIN(detail) +/// Generic support for creating new Python heap types +class generic_type : public object { + template friend class class_; +public: + PYBIND11_OBJECT_DEFAULT(generic_type, object, PyType_Check) +protected: + void initialize(const type_record &rec) { + if (rec.scope && hasattr(rec.scope, rec.name)) + pybind11_fail("generic_type: cannot initialize type \"" + std::string(rec.name) + + "\": an object with that name is already defined"); + + if (rec.module_local ? 
get_local_type_info(*rec.type) : get_global_type_info(*rec.type)) + pybind11_fail("generic_type: type \"" + std::string(rec.name) + + "\" is already registered!"); + + m_ptr = make_new_python_type(rec); + + /* Register supplemental type information in C++ dict */ + auto *tinfo = new detail::type_info(); + tinfo->type = (PyTypeObject *) m_ptr; + tinfo->cpptype = rec.type; + tinfo->type_size = rec.type_size; + tinfo->type_align = rec.type_align; + tinfo->operator_new = rec.operator_new; + tinfo->holder_size_in_ptrs = size_in_ptrs(rec.holder_size); + tinfo->init_instance = rec.init_instance; + tinfo->dealloc = rec.dealloc; + tinfo->simple_type = true; + tinfo->simple_ancestors = true; + tinfo->default_holder = rec.default_holder; + tinfo->module_local = rec.module_local; + + auto &internals = get_internals(); + auto tindex = std::type_index(*rec.type); + tinfo->direct_conversions = &internals.direct_conversions[tindex]; + if (rec.module_local) + registered_local_types_cpp()[tindex] = tinfo; + else + internals.registered_types_cpp[tindex] = tinfo; + internals.registered_types_py[(PyTypeObject *) m_ptr] = { tinfo }; + + if (rec.bases.size() > 1 || rec.multiple_inheritance) { + mark_parents_nonsimple(tinfo->type); + tinfo->simple_ancestors = false; + } + else if (rec.bases.size() == 1) { + auto parent_tinfo = get_type_info((PyTypeObject *) rec.bases[0].ptr()); + tinfo->simple_ancestors = parent_tinfo->simple_ancestors; + } + + if (rec.module_local) { + // Stash the local typeinfo and loader so that external modules can access it. + tinfo->module_local_load = &type_caster_generic::local_load; + setattr(m_ptr, PYBIND11_MODULE_LOCAL_ID, capsule(tinfo)); + } + } + + /// Helper function which tags all parents of a type using mult. inheritance + void mark_parents_nonsimple(PyTypeObject *value) { + auto t = reinterpret_borrow(value->tp_bases); + for (handle h : t) { + auto tinfo2 = get_type_info((PyTypeObject *) h.ptr()); + if (tinfo2) + tinfo2->simple_type = false; + mark_parents_nonsimple((PyTypeObject *) h.ptr()); + } + } + + void install_buffer_funcs( + buffer_info *(*get_buffer)(PyObject *, void *), + void *get_buffer_data) { + PyHeapTypeObject *type = (PyHeapTypeObject*) m_ptr; + auto tinfo = detail::get_type_info(&type->ht_type); + + if (!type->ht_type.tp_as_buffer) + pybind11_fail( + "To be able to register buffer protocol support for the type '" + + std::string(tinfo->type->tp_name) + + "' the associated class<>(..) invocation must " + "include the pybind11::buffer_protocol() annotation!"); + + tinfo->get_buffer = get_buffer; + tinfo->get_buffer_data = get_buffer_data; + } + + // rec_func must be set for either fget or fset. + void def_property_static_impl(const char *name, + handle fget, handle fset, + detail::function_record *rec_func) { + const auto is_static = rec_func && !(rec_func->is_method && rec_func->scope); + const auto has_doc = rec_func && rec_func->doc && pybind11::options::show_user_defined_docstrings(); + auto property = handle((PyObject *) (is_static ? get_internals().static_property_type + : &PyProperty_Type)); + attr(name) = property(fget.ptr() ? fget : none(), + fset.ptr() ? fset : none(), + /*deleter*/none(), + pybind11::str(has_doc ? rec_func->doc : "")); + } +}; + +/// Set the pointer to operator new if it exists. The cast is needed because it can be overloaded. +template (T::operator new))>> +void set_operator_new(type_record *r) { r->operator_new = &T::operator new; } + +template void set_operator_new(...) 
{ } + +template struct has_operator_delete : std::false_type { }; +template struct has_operator_delete(T::operator delete))>> + : std::true_type { }; +template struct has_operator_delete_size : std::false_type { }; +template struct has_operator_delete_size(T::operator delete))>> + : std::true_type { }; +/// Call class-specific delete if it exists or global otherwise. Can also be an overload set. +template ::value, int> = 0> +void call_operator_delete(T *p, size_t, size_t) { T::operator delete(p); } +template ::value && has_operator_delete_size::value, int> = 0> +void call_operator_delete(T *p, size_t s, size_t) { T::operator delete(p, s); } + +inline void call_operator_delete(void *p, size_t s, size_t a) { + (void)s; (void)a; + #if defined(__cpp_aligned_new) && (!defined(_MSC_VER) || _MSC_VER >= 1912) + if (a > __STDCPP_DEFAULT_NEW_ALIGNMENT__) { + #ifdef __cpp_sized_deallocation + ::operator delete(p, s, std::align_val_t(a)); + #else + ::operator delete(p, std::align_val_t(a)); + #endif + return; + } + #endif + #ifdef __cpp_sized_deallocation + ::operator delete(p, s); + #else + ::operator delete(p); + #endif +} + +NAMESPACE_END(detail) + +/// Given a pointer to a member function, cast it to its `Derived` version. +/// Forward everything else unchanged. +template +auto method_adaptor(F &&f) -> decltype(std::forward(f)) { return std::forward(f); } + +template +auto method_adaptor(Return (Class::*pmf)(Args...)) -> Return (Derived::*)(Args...) { + static_assert(detail::is_accessible_base_of::value, + "Cannot bind an inaccessible base class method; use a lambda definition instead"); + return pmf; +} + +template +auto method_adaptor(Return (Class::*pmf)(Args...) const) -> Return (Derived::*)(Args...) const { + static_assert(detail::is_accessible_base_of::value, + "Cannot bind an inaccessible base class method; use a lambda definition instead"); + return pmf; +} + +template +class class_ : public detail::generic_type { + template using is_holder = detail::is_holder_type; + template using is_subtype = detail::is_strict_base_of; + template using is_base = detail::is_strict_base_of; + // struct instead of using here to help MSVC: + template struct is_valid_class_option : + detail::any_of, is_subtype, is_base> {}; + +public: + using type = type_; + using type_alias = detail::exactly_one_t; + constexpr static bool has_alias = !std::is_void::value; + using holder_type = detail::exactly_one_t, options...>; + + static_assert(detail::all_of...>::value, + "Unknown/invalid class_ template parameters provided"); + + static_assert(!has_alias || std::is_polymorphic::value, + "Cannot use an alias class with a non-polymorphic type"); + + PYBIND11_OBJECT(class_, generic_type, PyType_Check) + + template + class_(handle scope, const char *name, const Extra &... extra) { + using namespace detail; + + // MI can only be specified via class_ template options, not constructor parameters + static_assert( + none_of...>::value || // no base class arguments, or: + ( constexpr_sum(is_pyobject::value...) == 1 && // Exactly one base + constexpr_sum(is_base::value...) 
== 0 && // no template option bases + none_of...>::value), // no multiple_inheritance attr + "Error: multiple inheritance bases must be specified via class_ template options"); + + type_record record; + record.scope = scope; + record.name = name; + record.type = &typeid(type); + record.type_size = sizeof(conditional_t); + record.type_align = alignof(conditional_t&); + record.holder_size = sizeof(holder_type); + record.init_instance = init_instance; + record.dealloc = dealloc; + record.default_holder = detail::is_instantiation::value; + + set_operator_new(&record); + + /* Register base classes specified via template arguments to class_, if any */ + PYBIND11_EXPAND_SIDE_EFFECTS(add_base(record)); + + /* Process optional arguments, if any */ + process_attributes::init(extra..., &record); + + generic_type::initialize(record); + + if (has_alias) { + auto &instances = record.module_local ? registered_local_types_cpp() : get_internals().registered_types_cpp; + instances[std::type_index(typeid(type_alias))] = instances[std::type_index(typeid(type))]; + } + } + + template ::value, int> = 0> + static void add_base(detail::type_record &rec) { + rec.add_base(typeid(Base), [](void *src) -> void * { + return static_cast(reinterpret_cast(src)); + }); + } + + template ::value, int> = 0> + static void add_base(detail::type_record &) { } + + template + class_ &def(const char *name_, Func&& f, const Extra&... extra) { + cpp_function cf(method_adaptor(std::forward(f)), name(name_), is_method(*this), + sibling(getattr(*this, name_, none())), extra...); + attr(cf.name()) = cf; + return *this; + } + + template class_ & + def_static(const char *name_, Func &&f, const Extra&... extra) { + static_assert(!std::is_member_function_pointer::value, + "def_static(...) called with a non-static member function pointer"); + cpp_function cf(std::forward(f), name(name_), scope(*this), + sibling(getattr(*this, name_, none())), extra...); + attr(cf.name()) = staticmethod(cf); + return *this; + } + + template + class_ &def(const detail::op_ &op, const Extra&... extra) { + op.execute(*this, extra...); + return *this; + } + + template + class_ & def_cast(const detail::op_ &op, const Extra&... extra) { + op.execute_cast(*this, extra...); + return *this; + } + + template + class_ &def(const detail::initimpl::constructor &init, const Extra&... extra) { + init.execute(*this, extra...); + return *this; + } + + template + class_ &def(const detail::initimpl::alias_constructor &init, const Extra&... extra) { + init.execute(*this, extra...); + return *this; + } + + template + class_ &def(detail::initimpl::factory &&init, const Extra&... extra) { + std::move(init).execute(*this, extra...); + return *this; + } + + template + class_ &def(detail::initimpl::pickle_factory &&pf, const Extra &...extra) { + std::move(pf).execute(*this, extra...); + return *this; + } + + template class_& def_buffer(Func &&func) { + struct capture { Func func; }; + capture *ptr = new capture { std::forward(func) }; + install_buffer_funcs([](PyObject *obj, void *ptr) -> buffer_info* { + detail::make_caster caster; + if (!caster.load(obj, false)) + return nullptr; + return new buffer_info(((capture *) ptr)->func(caster)); + }, ptr); + return *this; + } + + template + class_ &def_buffer(Return (Class::*func)(Args...)) { + return def_buffer([func] (type &obj) { return (obj.*func)(); }); + } + + template + class_ &def_buffer(Return (Class::*func)(Args...) 
const) { + return def_buffer([func] (const type &obj) { return (obj.*func)(); }); + } + + template + class_ &def_readwrite(const char *name, D C::*pm, const Extra&... extra) { + static_assert(std::is_same::value || std::is_base_of::value, "def_readwrite() requires a class member (or base class member)"); + cpp_function fget([pm](const type &c) -> const D &{ return c.*pm; }, is_method(*this)), + fset([pm](type &c, const D &value) { c.*pm = value; }, is_method(*this)); + def_property(name, fget, fset, return_value_policy::reference_internal, extra...); + return *this; + } + + template + class_ &def_readonly(const char *name, const D C::*pm, const Extra& ...extra) { + static_assert(std::is_same::value || std::is_base_of::value, "def_readonly() requires a class member (or base class member)"); + cpp_function fget([pm](const type &c) -> const D &{ return c.*pm; }, is_method(*this)); + def_property_readonly(name, fget, return_value_policy::reference_internal, extra...); + return *this; + } + + template + class_ &def_readwrite_static(const char *name, D *pm, const Extra& ...extra) { + cpp_function fget([pm](object) -> const D &{ return *pm; }, scope(*this)), + fset([pm](object, const D &value) { *pm = value; }, scope(*this)); + def_property_static(name, fget, fset, return_value_policy::reference, extra...); + return *this; + } + + template + class_ &def_readonly_static(const char *name, const D *pm, const Extra& ...extra) { + cpp_function fget([pm](object) -> const D &{ return *pm; }, scope(*this)); + def_property_readonly_static(name, fget, return_value_policy::reference, extra...); + return *this; + } + + /// Uses return_value_policy::reference_internal by default + template + class_ &def_property_readonly(const char *name, const Getter &fget, const Extra& ...extra) { + return def_property_readonly(name, cpp_function(method_adaptor(fget)), + return_value_policy::reference_internal, extra...); + } + + /// Uses cpp_function's return_value_policy by default + template + class_ &def_property_readonly(const char *name, const cpp_function &fget, const Extra& ...extra) { + return def_property(name, fget, nullptr, extra...); + } + + /// Uses return_value_policy::reference by default + template + class_ &def_property_readonly_static(const char *name, const Getter &fget, const Extra& ...extra) { + return def_property_readonly_static(name, cpp_function(fget), return_value_policy::reference, extra...); + } + + /// Uses cpp_function's return_value_policy by default + template + class_ &def_property_readonly_static(const char *name, const cpp_function &fget, const Extra& ...extra) { + return def_property_static(name, fget, nullptr, extra...); + } + + /// Uses return_value_policy::reference_internal by default + template + class_ &def_property(const char *name, const Getter &fget, const Setter &fset, const Extra& ...extra) { + return def_property(name, fget, cpp_function(method_adaptor(fset)), extra...); + } + template + class_ &def_property(const char *name, const Getter &fget, const cpp_function &fset, const Extra& ...extra) { + return def_property(name, cpp_function(method_adaptor(fget)), fset, + return_value_policy::reference_internal, extra...); + } + + /// Uses cpp_function's return_value_policy by default + template + class_ &def_property(const char *name, const cpp_function &fget, const cpp_function &fset, const Extra& ...extra) { + return def_property_static(name, fget, fset, is_method(*this), extra...); + } + + /// Uses return_value_policy::reference by default + template + class_ 
&def_property_static(const char *name, const Getter &fget, const cpp_function &fset, const Extra& ...extra) { + return def_property_static(name, cpp_function(fget), fset, return_value_policy::reference, extra...); + } + + /// Uses cpp_function's return_value_policy by default + template + class_ &def_property_static(const char *name, const cpp_function &fget, const cpp_function &fset, const Extra& ...extra) { + static_assert( 0 == detail::constexpr_sum(std::is_base_of::value...), + "Argument annotations are not allowed for properties"); + auto rec_fget = get_function_record(fget), rec_fset = get_function_record(fset); + auto *rec_active = rec_fget; + if (rec_fget) { + char *doc_prev = rec_fget->doc; /* 'extra' field may include a property-specific documentation string */ + detail::process_attributes::init(extra..., rec_fget); + if (rec_fget->doc && rec_fget->doc != doc_prev) { + free(doc_prev); + rec_fget->doc = strdup(rec_fget->doc); + } + } + if (rec_fset) { + char *doc_prev = rec_fset->doc; + detail::process_attributes::init(extra..., rec_fset); + if (rec_fset->doc && rec_fset->doc != doc_prev) { + free(doc_prev); + rec_fset->doc = strdup(rec_fset->doc); + } + if (! rec_active) rec_active = rec_fset; + } + def_property_static_impl(name, fget, fset, rec_active); + return *this; + } + +private: + /// Initialize holder object, variant 1: object derives from enable_shared_from_this + template + static void init_holder(detail::instance *inst, detail::value_and_holder &v_h, + const holder_type * /* unused */, const std::enable_shared_from_this * /* dummy */) { + try { + auto sh = std::dynamic_pointer_cast( + v_h.value_ptr()->shared_from_this()); + if (sh) { + new (std::addressof(v_h.holder())) holder_type(std::move(sh)); + v_h.set_holder_constructed(); + } + } catch (const std::bad_weak_ptr &) {} + + if (!v_h.holder_constructed() && inst->owned) { + new (std::addressof(v_h.holder())) holder_type(v_h.value_ptr()); + v_h.set_holder_constructed(); + } + } + + static void init_holder_from_existing(const detail::value_and_holder &v_h, + const holder_type *holder_ptr, std::true_type /*is_copy_constructible*/) { + new (std::addressof(v_h.holder())) holder_type(*reinterpret_cast(holder_ptr)); + } + + static void init_holder_from_existing(const detail::value_and_holder &v_h, + const holder_type *holder_ptr, std::false_type /*is_copy_constructible*/) { + new (std::addressof(v_h.holder())) holder_type(std::move(*const_cast(holder_ptr))); + } + + /// Initialize holder object, variant 2: try to construct from existing holder object, if possible + static void init_holder(detail::instance *inst, detail::value_and_holder &v_h, + const holder_type *holder_ptr, const void * /* dummy -- not enable_shared_from_this) */) { + if (holder_ptr) { + init_holder_from_existing(v_h, holder_ptr, std::is_copy_constructible()); + v_h.set_holder_constructed(); + } else if (inst->owned || detail::always_construct_holder::value) { + new (std::addressof(v_h.holder())) holder_type(v_h.value_ptr()); + v_h.set_holder_constructed(); + } + } + + /// Performs instance initialization including constructing a holder and registering the known + /// instance. Should be called as soon as the `type` value_ptr is set for an instance. Takes an + /// optional pointer to an existing holder to use; if not specified and the instance is + /// `.owned`, a new holder will be constructed to manage the value pointer. 
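+    ///
+    /// Illustrative sketch (not part of the upstream header): given a hypothetical
+    /// binding with a custom holder, e.g.
+    ///
+    ///     py::class_<Widget, std::shared_ptr<Widget>>(m, "Widget");
+    ///
+    /// every instance created from the Python side eventually reaches
+    /// init_instance(), which first registers the raw Widget pointer and then
+    /// constructs the shared_ptr holder in-place via init_holder().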
+ static void init_instance(detail::instance *inst, const void *holder_ptr) { + auto v_h = inst->get_value_and_holder(detail::get_type_info(typeid(type))); + if (!v_h.instance_registered()) { + register_instance(inst, v_h.value_ptr(), v_h.type); + v_h.set_instance_registered(); + } + init_holder(inst, v_h, (const holder_type *) holder_ptr, v_h.value_ptr()); + } + + /// Deallocates an instance; via holder, if constructed; otherwise via operator delete. + static void dealloc(detail::value_and_holder &v_h) { + if (v_h.holder_constructed()) { + v_h.holder().~holder_type(); + v_h.set_holder_constructed(false); + } + else { + detail::call_operator_delete(v_h.value_ptr(), + v_h.type->type_size, + v_h.type->type_align + ); + } + v_h.value_ptr() = nullptr; + } + + static detail::function_record *get_function_record(handle h) { + h = detail::get_function(h); + return h ? (detail::function_record *) reinterpret_borrow(PyCFunction_GET_SELF(h.ptr())) + : nullptr; + } +}; + +/// Binds an existing constructor taking arguments Args... +template detail::initimpl::constructor init() { return {}; } +/// Like `init()`, but the instance is always constructed through the alias class (even +/// when not inheriting on the Python side). +template detail::initimpl::alias_constructor init_alias() { return {}; } + +/// Binds a factory function as a constructor +template > +Ret init(Func &&f) { return {std::forward(f)}; } + +/// Dual-argument factory function: the first function is called when no alias is needed, the second +/// when an alias is needed (i.e. due to python-side inheritance). Arguments must be identical. +template > +Ret init(CFunc &&c, AFunc &&a) { + return {std::forward(c), std::forward(a)}; +} + +/// Binds pickling functions `__getstate__` and `__setstate__` and ensures that the type +/// returned by `__getstate__` is the same as the argument accepted by `__setstate__`. 
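+///
+/// A hedged usage sketch for a hypothetical struct ``Point { int x; }``:
+///
+///     py::class_<Point>(m, "Point")
+///         .def(py::init<int>())
+///         .def(py::pickle(
+///             [](const Point &p) { return py::make_tuple(p.x); },    // __getstate__
+///             [](py::tuple t) { return Point(t[0].cast<int>()); }    // __setstate__
+///         ));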
+template +detail::initimpl::pickle_factory pickle(GetState &&g, SetState &&s) { + return {std::forward(g), std::forward(s)}; +} + +NAMESPACE_BEGIN(detail) +struct enum_base { + enum_base(handle base, handle parent) : m_base(base), m_parent(parent) { } + + PYBIND11_NOINLINE void init(bool is_arithmetic, bool is_convertible) { + m_base.attr("__entries") = dict(); + auto property = handle((PyObject *) &PyProperty_Type); + auto static_property = handle((PyObject *) get_internals().static_property_type); + + m_base.attr("__repr__") = cpp_function( + [](handle arg) -> str { + handle type = arg.get_type(); + object type_name = type.attr("__name__"); + dict entries = type.attr("__entries"); + for (const auto &kv : entries) { + object other = kv.second[int_(0)]; + if (other.equal(arg)) + return pybind11::str("{}.{}").format(type_name, kv.first); + } + return pybind11::str("{}.???").format(type_name); + }, is_method(m_base) + ); + + m_base.attr("name") = property(cpp_function( + [](handle arg) -> str { + dict entries = arg.get_type().attr("__entries"); + for (const auto &kv : entries) { + if (handle(kv.second[int_(0)]).equal(arg)) + return pybind11::str(kv.first); + } + return "???"; + }, is_method(m_base) + )); + + m_base.attr("__doc__") = static_property(cpp_function( + [](handle arg) -> std::string { + std::string docstring; + dict entries = arg.attr("__entries"); + if (((PyTypeObject *) arg.ptr())->tp_doc) + docstring += std::string(((PyTypeObject *) arg.ptr())->tp_doc) + "\n\n"; + docstring += "Members:"; + for (const auto &kv : entries) { + auto key = std::string(pybind11::str(kv.first)); + auto comment = kv.second[int_(1)]; + docstring += "\n\n " + key; + if (!comment.is_none()) + docstring += " : " + (std::string) pybind11::str(comment); + } + return docstring; + } + ), none(), none(), ""); + + m_base.attr("__members__") = static_property(cpp_function( + [](handle arg) -> dict { + dict entries = arg.attr("__entries"), m; + for (const auto &kv : entries) + m[kv.first] = kv.second[int_(0)]; + return m; + }), none(), none(), "" + ); + + #define PYBIND11_ENUM_OP_STRICT(op, expr, strict_behavior) \ + m_base.attr(op) = cpp_function( \ + [](object a, object b) { \ + if (!a.get_type().is(b.get_type())) \ + strict_behavior; \ + return expr; \ + }, \ + is_method(m_base)) + + #define PYBIND11_ENUM_OP_CONV(op, expr) \ + m_base.attr(op) = cpp_function( \ + [](object a_, object b_) { \ + int_ a(a_), b(b_); \ + return expr; \ + }, \ + is_method(m_base)) + + #define PYBIND11_ENUM_OP_CONV_LHS(op, expr) \ + m_base.attr(op) = cpp_function( \ + [](object a_, object b) { \ + int_ a(a_); \ + return expr; \ + }, \ + is_method(m_base)) + + if (is_convertible) { + PYBIND11_ENUM_OP_CONV_LHS("__eq__", !b.is_none() && a.equal(b)); + PYBIND11_ENUM_OP_CONV_LHS("__ne__", b.is_none() || !a.equal(b)); + + if (is_arithmetic) { + PYBIND11_ENUM_OP_CONV("__lt__", a < b); + PYBIND11_ENUM_OP_CONV("__gt__", a > b); + PYBIND11_ENUM_OP_CONV("__le__", a <= b); + PYBIND11_ENUM_OP_CONV("__ge__", a >= b); + PYBIND11_ENUM_OP_CONV("__and__", a & b); + PYBIND11_ENUM_OP_CONV("__rand__", a & b); + PYBIND11_ENUM_OP_CONV("__or__", a | b); + PYBIND11_ENUM_OP_CONV("__ror__", a | b); + PYBIND11_ENUM_OP_CONV("__xor__", a ^ b); + PYBIND11_ENUM_OP_CONV("__rxor__", a ^ b); + m_base.attr("__invert__") = cpp_function( + [](object arg) { return ~(int_(arg)); }, is_method(m_base)); + } + } else { + PYBIND11_ENUM_OP_STRICT("__eq__", int_(a).equal(int_(b)), return false); + PYBIND11_ENUM_OP_STRICT("__ne__", !int_(a).equal(int_(b)), return true); + + if 
(is_arithmetic) { + #define PYBIND11_THROW throw type_error("Expected an enumeration of matching type!"); + PYBIND11_ENUM_OP_STRICT("__lt__", int_(a) < int_(b), PYBIND11_THROW); + PYBIND11_ENUM_OP_STRICT("__gt__", int_(a) > int_(b), PYBIND11_THROW); + PYBIND11_ENUM_OP_STRICT("__le__", int_(a) <= int_(b), PYBIND11_THROW); + PYBIND11_ENUM_OP_STRICT("__ge__", int_(a) >= int_(b), PYBIND11_THROW); + #undef PYBIND11_THROW + } + } + + #undef PYBIND11_ENUM_OP_CONV_LHS + #undef PYBIND11_ENUM_OP_CONV + #undef PYBIND11_ENUM_OP_STRICT + + object getstate = cpp_function( + [](object arg) { return int_(arg); }, is_method(m_base)); + + m_base.attr("__getstate__") = getstate; + m_base.attr("__hash__") = getstate; + } + + PYBIND11_NOINLINE void value(char const* name_, object value, const char *doc = nullptr) { + dict entries = m_base.attr("__entries"); + str name(name_); + if (entries.contains(name)) { + std::string type_name = (std::string) str(m_base.attr("__name__")); + throw value_error(type_name + ": element \"" + std::string(name_) + "\" already exists!"); + } + + entries[name] = std::make_pair(value, doc); + m_base.attr(name) = value; + } + + PYBIND11_NOINLINE void export_values() { + dict entries = m_base.attr("__entries"); + for (const auto &kv : entries) + m_parent.attr(kv.first) = kv.second[int_(0)]; + } + + handle m_base; + handle m_parent; +}; + +NAMESPACE_END(detail) + +/// Binds C++ enumerations and enumeration classes to Python +template class enum_ : public class_ { +public: + using Base = class_; + using Base::def; + using Base::attr; + using Base::def_property_readonly; + using Base::def_property_readonly_static; + using Scalar = typename std::underlying_type::type; + + template + enum_(const handle &scope, const char *name, const Extra&... extra) + : class_(scope, name, extra...), m_base(*this, scope) { + constexpr bool is_arithmetic = detail::any_of...>::value; + constexpr bool is_convertible = std::is_convertible::value; + m_base.init(is_arithmetic, is_convertible); + + def(init([](Scalar i) { return static_cast(i); })); + def("__int__", [](Type value) { return (Scalar) value; }); + #if PY_MAJOR_VERSION < 3 + def("__long__", [](Type value) { return (Scalar) value; }); + #endif + #if PY_MAJOR_VERSION > 3 || (PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION >= 8) + def("__index__", [](Type value) { return (Scalar) value; }); + #endif + + cpp_function setstate( + [](Type &value, Scalar arg) { value = static_cast(arg); }, + is_method(*this)); + attr("__setstate__") = setstate; + } + + /// Export enumeration entries into the parent scope + enum_& export_values() { + m_base.export_values(); + return *this; + } + + /// Add an enumeration entry + enum_& value(char const* name, Type value, const char *doc = nullptr) { + m_base.value(name, pybind11::cast(value, return_value_policy::copy), doc); + return *this; + } + +private: + detail::enum_base m_base; +}; + +NAMESPACE_BEGIN(detail) + + +inline void keep_alive_impl(handle nurse, handle patient) { + if (!nurse || !patient) + pybind11_fail("Could not activate keep_alive!"); + + if (patient.is_none() || nurse.is_none()) + return; /* Nothing to keep alive or nothing to be kept alive by */ + + auto tinfo = all_type_info(Py_TYPE(nurse.ptr())); + if (!tinfo.empty()) { + /* It's a pybind-registered type, so we can store the patient in the + * internal list. */ + add_patient(nurse.ptr(), patient.ptr()); + } + else { + /* Fall back to clever approach based on weak references taken from + * Boost.Python. 
This is not used for pybind-registered types because + * the objects can be destroyed out-of-order in a GC pass. */ + cpp_function disable_lifesupport( + [patient](handle weakref) { patient.dec_ref(); weakref.dec_ref(); }); + + weakref wr(nurse, disable_lifesupport); + + patient.inc_ref(); /* reference patient and leak the weak reference */ + (void) wr.release(); + } +} + +PYBIND11_NOINLINE inline void keep_alive_impl(size_t Nurse, size_t Patient, function_call &call, handle ret) { + auto get_arg = [&](size_t n) { + if (n == 0) + return ret; + else if (n == 1 && call.init_self) + return call.init_self; + else if (n <= call.args.size()) + return call.args[n - 1]; + return handle(); + }; + + keep_alive_impl(get_arg(Nurse), get_arg(Patient)); +} + +inline std::pair all_type_info_get_cache(PyTypeObject *type) { + auto res = get_internals().registered_types_py +#ifdef __cpp_lib_unordered_map_try_emplace + .try_emplace(type); +#else + .emplace(type, std::vector()); +#endif + if (res.second) { + // New cache entry created; set up a weak reference to automatically remove it if the type + // gets destroyed: + weakref((PyObject *) type, cpp_function([type](handle wr) { + get_internals().registered_types_py.erase(type); + wr.dec_ref(); + })).release(); + } + + return res; +} + +template +struct iterator_state { + Iterator it; + Sentinel end; + bool first_or_done; +}; + +NAMESPACE_END(detail) + +/// Makes a python iterator from a first and past-the-end C++ InputIterator. +template ()), + typename... Extra> +iterator make_iterator(Iterator first, Sentinel last, Extra &&... extra) { + typedef detail::iterator_state state; + + if (!detail::get_type_info(typeid(state), false)) { + class_(handle(), "iterator", pybind11::module_local()) + .def("__iter__", [](state &s) -> state& { return s; }) + .def("__next__", [](state &s) -> ValueType { + if (!s.first_or_done) + ++s.it; + else + s.first_or_done = false; + if (s.it == s.end) { + s.first_or_done = true; + throw stop_iteration(); + } + return *s.it; + }, std::forward(extra)..., Policy); + } + + return cast(state{first, last, true}); +} + +/// Makes an python iterator over the keys (`.first`) of a iterator over pairs from a +/// first and past-the-end InputIterator. +template ()).first), + typename... Extra> +iterator make_key_iterator(Iterator first, Sentinel last, Extra &&... extra) { + typedef detail::iterator_state state; + + if (!detail::get_type_info(typeid(state), false)) { + class_(handle(), "iterator", pybind11::module_local()) + .def("__iter__", [](state &s) -> state& { return s; }) + .def("__next__", [](state &s) -> KeyType { + if (!s.first_or_done) + ++s.it; + else + s.first_or_done = false; + if (s.it == s.end) { + s.first_or_done = true; + throw stop_iteration(); + } + return (*s.it).first; + }, std::forward(extra)..., Policy); + } + + return cast(state{first, last, true}); +} + +/// Makes an iterator over values of an stl container or other container supporting +/// `std::begin()`/`std::end()` +template iterator make_iterator(Type &value, Extra&&... extra) { + return make_iterator(std::begin(value), std::end(value), extra...); +} + +/// Makes an iterator over the keys (`.first`) of a stl map-like container supporting +/// `std::begin()`/`std::end()` +template iterator make_key_iterator(Type &value, Extra&&... 
extra) { + return make_key_iterator(std::begin(value), std::end(value), extra...); +} + +template void implicitly_convertible() { + struct set_flag { + bool &flag; + set_flag(bool &flag) : flag(flag) { flag = true; } + ~set_flag() { flag = false; } + }; + auto implicit_caster = [](PyObject *obj, PyTypeObject *type) -> PyObject * { + static bool currently_used = false; + if (currently_used) // implicit conversions are non-reentrant + return nullptr; + set_flag flag_helper(currently_used); + if (!detail::make_caster().load(obj, false)) + return nullptr; + tuple args(1); + args[0] = obj; + PyObject *result = PyObject_Call((PyObject *) type, args.ptr(), nullptr); + if (result == nullptr) + PyErr_Clear(); + return result; + }; + + if (auto tinfo = detail::get_type_info(typeid(OutputType))) + tinfo->implicit_conversions.push_back(implicit_caster); + else + pybind11_fail("implicitly_convertible: Unable to find type " + type_id()); +} + +template +void register_exception_translator(ExceptionTranslator&& translator) { + detail::get_internals().registered_exception_translators.push_front( + std::forward(translator)); +} + +/** + * Wrapper to generate a new Python exception type. + * + * This should only be used with PyErr_SetString for now. + * It is not (yet) possible to use as a py::base. + * Template type argument is reserved for future use. + */ +template +class exception : public object { +public: + exception() = default; + exception(handle scope, const char *name, PyObject *base = PyExc_Exception) { + std::string full_name = scope.attr("__name__").cast() + + std::string(".") + name; + m_ptr = PyErr_NewException(const_cast(full_name.c_str()), base, NULL); + if (hasattr(scope, name)) + pybind11_fail("Error during initialization: multiple incompatible " + "definitions with name \"" + std::string(name) + "\""); + scope.attr(name) = *this; + } + + // Sets the current python exception to this exception object with the given message + void operator()(const char *message) { + PyErr_SetString(m_ptr, message); + } +}; + +NAMESPACE_BEGIN(detail) +// Returns a reference to a function-local static exception object used in the simple +// register_exception approach below. (It would be simpler to have the static local variable +// directly in register_exception, but that makes clang <3.5 segfault - issue #1349). +template +exception &get_exception_object() { static exception ex; return ex; } +NAMESPACE_END(detail) + +/** + * Registers a Python exception in `m` of the given `name` and installs an exception translator to + * translate the C++ exception to the created Python exception using the exceptions what() method. + * This is intended for simple exception translations; for more complex translation, register the + * exception object and translator directly. + */ +template +exception ®ister_exception(handle scope, + const char *name, + PyObject *base = PyExc_Exception) { + auto &ex = detail::get_exception_object(); + if (!ex) ex = exception(scope, name, base); + + register_exception_translator([](std::exception_ptr p) { + if (!p) return; + try { + std::rethrow_exception(p); + } catch (const CppException &e) { + detail::get_exception_object()(e.what()); + } + }); + return ex; +} + +NAMESPACE_BEGIN(detail) +PYBIND11_NOINLINE inline void print(tuple args, dict kwargs) { + auto strings = tuple(args.size()); + for (size_t i = 0; i < args.size(); ++i) { + strings[i] = str(args[i]); + } + auto sep = kwargs.contains("sep") ? 
kwargs["sep"] : cast(" "); + auto line = sep.attr("join")(strings); + + object file; + if (kwargs.contains("file")) { + file = kwargs["file"].cast(); + } else { + try { + file = module::import("sys").attr("stdout"); + } catch (const error_already_set &) { + /* If print() is called from code that is executed as + part of garbage collection during interpreter shutdown, + importing 'sys' can fail. Give up rather than crashing the + interpreter in this case. */ + return; + } + } + + auto write = file.attr("write"); + write(line); + write(kwargs.contains("end") ? kwargs["end"] : cast("\n")); + + if (kwargs.contains("flush") && kwargs["flush"].cast()) + file.attr("flush")(); +} +NAMESPACE_END(detail) + +template +void print(Args &&...args) { + auto c = detail::collect_arguments(std::forward(args)...); + detail::print(c.args(), c.kwargs()); +} + +#if defined(WITH_THREAD) && !defined(PYPY_VERSION) + +/* The functions below essentially reproduce the PyGILState_* API using a RAII + * pattern, but there are a few important differences: + * + * 1. When acquiring the GIL from an non-main thread during the finalization + * phase, the GILState API blindly terminates the calling thread, which + * is often not what is wanted. This API does not do this. + * + * 2. The gil_scoped_release function can optionally cut the relationship + * of a PyThreadState and its associated thread, which allows moving it to + * another thread (this is a fairly rare/advanced use case). + * + * 3. The reference count of an acquired thread state can be controlled. This + * can be handy to prevent cases where callbacks issued from an external + * thread would otherwise constantly construct and destroy thread state data + * structures. + * + * See the Python bindings of NanoGUI (http://github.com/wjakob/nanogui) for an + * example which uses features 2 and 3 to migrate the Python thread of + * execution to another thread (to run the event loop on the original thread, + * in this case). + */ + +class gil_scoped_acquire { +public: + PYBIND11_NOINLINE gil_scoped_acquire() { + auto const &internals = detail::get_internals(); + tstate = (PyThreadState *) PYBIND11_TLS_GET_VALUE(internals.tstate); + + if (!tstate) { + /* Check if the GIL was acquired using the PyGILState_* API instead (e.g. if + calling from a Python thread). Since we use a different key, this ensures + we don't create a new thread state and deadlock in PyEval_AcquireThread + below. Note we don't save this state with internals.tstate, since we don't + create it we would fail to clear it (its reference count should be > 0). 
*/ + tstate = PyGILState_GetThisThreadState(); + } + + if (!tstate) { + tstate = PyThreadState_New(internals.istate); + #if !defined(NDEBUG) + if (!tstate) + pybind11_fail("scoped_acquire: could not create thread state!"); + #endif + tstate->gilstate_counter = 0; + PYBIND11_TLS_REPLACE_VALUE(internals.tstate, tstate); + } else { + release = detail::get_thread_state_unchecked() != tstate; + } + + if (release) { + /* Work around an annoying assertion in PyThreadState_Swap */ + #if defined(Py_DEBUG) + PyInterpreterState *interp = tstate->interp; + tstate->interp = nullptr; + #endif + PyEval_AcquireThread(tstate); + #if defined(Py_DEBUG) + tstate->interp = interp; + #endif + } + + inc_ref(); + } + + void inc_ref() { + ++tstate->gilstate_counter; + } + + PYBIND11_NOINLINE void dec_ref() { + --tstate->gilstate_counter; + #if !defined(NDEBUG) + if (detail::get_thread_state_unchecked() != tstate) + pybind11_fail("scoped_acquire::dec_ref(): thread state must be current!"); + if (tstate->gilstate_counter < 0) + pybind11_fail("scoped_acquire::dec_ref(): reference count underflow!"); + #endif + if (tstate->gilstate_counter == 0) { + #if !defined(NDEBUG) + if (!release) + pybind11_fail("scoped_acquire::dec_ref(): internal error!"); + #endif + PyThreadState_Clear(tstate); + PyThreadState_DeleteCurrent(); + PYBIND11_TLS_DELETE_VALUE(detail::get_internals().tstate); + release = false; + } + } + + PYBIND11_NOINLINE ~gil_scoped_acquire() { + dec_ref(); + if (release) + PyEval_SaveThread(); + } +private: + PyThreadState *tstate = nullptr; + bool release = true; +}; + +class gil_scoped_release { +public: + explicit gil_scoped_release(bool disassoc = false) : disassoc(disassoc) { + // `get_internals()` must be called here unconditionally in order to initialize + // `internals.tstate` for subsequent `gil_scoped_acquire` calls. Otherwise, an + // initialization race could occur as multiple threads try `gil_scoped_acquire`. 
+ const auto &internals = detail::get_internals(); + tstate = PyEval_SaveThread(); + if (disassoc) { + auto key = internals.tstate; + PYBIND11_TLS_DELETE_VALUE(key); + } + } + ~gil_scoped_release() { + if (!tstate) + return; + PyEval_RestoreThread(tstate); + if (disassoc) { + auto key = detail::get_internals().tstate; + PYBIND11_TLS_REPLACE_VALUE(key, tstate); + } + } +private: + PyThreadState *tstate; + bool disassoc; +}; +#elif defined(PYPY_VERSION) +class gil_scoped_acquire { + PyGILState_STATE state; +public: + gil_scoped_acquire() { state = PyGILState_Ensure(); } + ~gil_scoped_acquire() { PyGILState_Release(state); } +}; + +class gil_scoped_release { + PyThreadState *state; +public: + gil_scoped_release() { state = PyEval_SaveThread(); } + ~gil_scoped_release() { PyEval_RestoreThread(state); } +}; +#else +class gil_scoped_acquire { }; +class gil_scoped_release { }; +#endif + +error_already_set::~error_already_set() { + if (m_type) { + gil_scoped_acquire gil; + error_scope scope; + m_type.release().dec_ref(); + m_value.release().dec_ref(); + m_trace.release().dec_ref(); + } +} + +inline function get_type_overload(const void *this_ptr, const detail::type_info *this_type, const char *name) { + handle self = detail::get_object_handle(this_ptr, this_type); + if (!self) + return function(); + handle type = self.get_type(); + auto key = std::make_pair(type.ptr(), name); + + /* Cache functions that aren't overloaded in Python to avoid + many costly Python dictionary lookups below */ + auto &cache = detail::get_internals().inactive_overload_cache; + if (cache.find(key) != cache.end()) + return function(); + + function overload = getattr(self, name, function()); + if (overload.is_cpp_function()) { + cache.insert(key); + return function(); + } + + /* Don't call dispatch code if invoked from overridden function. + Unfortunately this doesn't work on PyPy. */ +#if !defined(PYPY_VERSION) + PyFrameObject *frame = PyThreadState_Get()->frame; + if (frame && (std::string) str(frame->f_code->co_name) == name && + frame->f_code->co_argcount > 0) { + PyFrame_FastToLocals(frame); + PyObject *self_caller = PyDict_GetItem( + frame->f_locals, PyTuple_GET_ITEM(frame->f_code->co_varnames, 0)); + if (self_caller == self.ptr()) + return function(); + } +#else + /* PyPy currently doesn't provide a detailed cpyext emulation of + frame objects, so we have to emulate this using Python. This + is going to be slow..*/ + dict d; d["self"] = self; d["name"] = pybind11::str(name); + PyObject *result = PyRun_String( + "import inspect\n" + "frame = inspect.currentframe()\n" + "if frame is not None:\n" + " frame = frame.f_back\n" + " if frame is not None and str(frame.f_code.co_name) == name and " + "frame.f_code.co_argcount > 0:\n" + " self_caller = frame.f_locals[frame.f_code.co_varnames[0]]\n" + " if self_caller == self:\n" + " self = None\n", + Py_file_input, d.ptr(), d.ptr()); + if (result == nullptr) + throw error_already_set(); + if (d["self"].is_none()) + return function(); + Py_DECREF(result); +#endif + + return overload; +} + +/** \rst + Try to retrieve a python method by the provided name from the instance pointed to by the this_ptr. + + :this_ptr: The pointer to the object the overload should be retrieved for. This should be the first + non-trampoline class encountered in the inheritance chain. + :name: The name of the overloaded Python method to retrieve. + :return: The Python method by this name from the object or an empty function wrapper. 
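+
+   A hedged sketch (mirroring the ``PYBIND11_OVERLOAD`` example further below,
+   with a hypothetical bound type ``Animal`` exposing a virtual ``go`` method):
+
+   .. code-block:: cpp
+
+       pybind11::function overload = pybind11::get_overload(static_cast<const Animal *>(this), "go");
+       if (overload)
+           return overload(n_times).cast<std::string>();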
+ \endrst */ +template function get_overload(const T *this_ptr, const char *name) { + auto tinfo = detail::get_type_info(typeid(T)); + return tinfo ? get_type_overload(this_ptr, tinfo, name) : function(); +} + +#define PYBIND11_OVERLOAD_INT(ret_type, cname, name, ...) { \ + pybind11::gil_scoped_acquire gil; \ + pybind11::function overload = pybind11::get_overload(static_cast(this), name); \ + if (overload) { \ + auto o = overload(__VA_ARGS__); \ + if (pybind11::detail::cast_is_temporary_value_reference::value) { \ + static pybind11::detail::overload_caster_t caster; \ + return pybind11::detail::cast_ref(std::move(o), caster); \ + } \ + else return pybind11::detail::cast_safe(std::move(o)); \ + } \ + } + +/** \rst + Macro to populate the virtual method in the trampoline class. This macro tries to look up a method named 'fn' + from the Python side, deals with the :ref:`gil` and necessary argument conversions to call this method and return + the appropriate type. See :ref:`overriding_virtuals` for more information. This macro should be used when the method + name in C is not the same as the method name in Python. For example with `__str__`. + + .. code-block:: cpp + + std::string toString() override { + PYBIND11_OVERLOAD_NAME( + std::string, // Return type (ret_type) + Animal, // Parent class (cname) + toString, // Name of function in C++ (name) + "__str__", // Name of method in Python (fn) + ); + } +\endrst */ +#define PYBIND11_OVERLOAD_NAME(ret_type, cname, name, fn, ...) \ + PYBIND11_OVERLOAD_INT(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname), name, __VA_ARGS__) \ + return cname::fn(__VA_ARGS__) + +/** \rst + Macro for pure virtual functions, this function is identical to :c:macro:`PYBIND11_OVERLOAD_NAME`, except that it + throws if no overload can be found. +\endrst */ +#define PYBIND11_OVERLOAD_PURE_NAME(ret_type, cname, name, fn, ...) \ + PYBIND11_OVERLOAD_INT(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname), name, __VA_ARGS__) \ + pybind11::pybind11_fail("Tried to call pure virtual function \"" PYBIND11_STRINGIFY(cname) "::" name "\""); + +/** \rst + Macro to populate the virtual method in the trampoline class. This macro tries to look up the method + from the Python side, deals with the :ref:`gil` and necessary argument conversions to call this method and return + the appropriate type. This macro should be used if the method name in C and in Python are identical. + See :ref:`overriding_virtuals` for more information. + + .. code-block:: cpp + + class PyAnimal : public Animal { + public: + // Inherit the constructors + using Animal::Animal; + + // Trampoline (need one for each virtual function) + std::string go(int n_times) override { + PYBIND11_OVERLOAD_PURE( + std::string, // Return type (ret_type) + Animal, // Parent class (cname) + go, // Name of function in C++ (must match Python name) (fn) + n_times // Argument(s) (...) + ); + } + }; +\endrst */ +#define PYBIND11_OVERLOAD(ret_type, cname, fn, ...) \ + PYBIND11_OVERLOAD_NAME(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname), #fn, fn, __VA_ARGS__) + +/** \rst + Macro for pure virtual functions, this function is identical to :c:macro:`PYBIND11_OVERLOAD`, except that it throws + if no overload can be found. +\endrst */ +#define PYBIND11_OVERLOAD_PURE(ret_type, cname, fn, ...) 
\ + PYBIND11_OVERLOAD_PURE_NAME(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname), #fn, fn, __VA_ARGS__) + +NAMESPACE_END(PYBIND11_NAMESPACE) + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +# pragma warning(pop) +#elif defined(__GNUG__) && !defined(__clang__) +# pragma GCC diagnostic pop +#endif diff --git a/cviruntime/python/include/pybind11/include/pybind11/pytypes.h b/cviruntime/python/include/pybind11/include/pybind11/pytypes.h new file mode 100644 index 000000000..4003d6918 --- /dev/null +++ b/cviruntime/python/include/pybind11/include/pybind11/pytypes.h @@ -0,0 +1,1484 @@ +/* + pybind11/pytypes.h: Convenience wrapper classes for basic Python types + + Copyright (c) 2016 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. +*/ + +#pragma once + +#include "detail/common.h" +#include "buffer_info.h" +#include +#include + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) + +/* A few forward declarations */ +class handle; class object; +class str; class iterator; +struct arg; struct arg_v; + +NAMESPACE_BEGIN(detail) +class args_proxy; +inline bool isinstance_generic(handle obj, const std::type_info &tp); + +// Accessor forward declarations +template class accessor; +namespace accessor_policies { + struct obj_attr; + struct str_attr; + struct generic_item; + struct sequence_item; + struct list_item; + struct tuple_item; +} +using obj_attr_accessor = accessor; +using str_attr_accessor = accessor; +using item_accessor = accessor; +using sequence_accessor = accessor; +using list_accessor = accessor; +using tuple_accessor = accessor; + +/// Tag and check to identify a class which implements the Python object API +class pyobject_tag { }; +template using is_pyobject = std::is_base_of>; + +/** \rst + A mixin class which adds common functions to `handle`, `object` and various accessors. + The only requirement for `Derived` is to implement ``PyObject *Derived::ptr() const``. +\endrst */ +template +class object_api : public pyobject_tag { + const Derived &derived() const { return static_cast(*this); } + +public: + /** \rst + Return an iterator equivalent to calling ``iter()`` in Python. The object + must be a collection which supports the iteration protocol. + \endrst */ + iterator begin() const; + /// Return a sentinel which ends iteration. + iterator end() const; + + /** \rst + Return an internal functor to invoke the object's sequence protocol. Casting + the returned ``detail::item_accessor`` instance to a `handle` or `object` + subclass causes a corresponding call to ``__getitem__``. Assigning a `handle` + or `object` subclass causes a call to ``__setitem__``. + \endrst */ + item_accessor operator[](handle key) const; + /// See above (the only difference is that they key is provided as a string literal) + item_accessor operator[](const char *key) const; + + /** \rst + Return an internal functor to access the object's attributes. Casting the + returned ``detail::obj_attr_accessor`` instance to a `handle` or `object` + subclass causes a corresponding call to ``getattr``. Assigning a `handle` + or `object` subclass causes a call to ``setattr``. + \endrst */ + obj_attr_accessor attr(handle key) const; + /// See above (the only difference is that they key is provided as a string literal) + str_attr_accessor attr(const char *key) const; + + /** \rst + Matches * unpacking in Python, e.g. to unpack arguments out of a ``tuple`` + or ``list`` for a function call. Applying another * to the result yields + ** unpacking, e.g. 
to unpack a dict as function keyword arguments. + See :ref:`calling_python_functions`. + \endrst */ + args_proxy operator*() const; + + /// Check if the given item is contained within this object, i.e. ``item in obj``. + template bool contains(T &&item) const; + + /** \rst + Assuming the Python object is a function or implements the ``__call__`` + protocol, ``operator()`` invokes the underlying function, passing an + arbitrary set of parameters. The result is returned as a `object` and + may need to be converted back into a Python object using `handle::cast()`. + + When some of the arguments cannot be converted to Python objects, the + function will throw a `cast_error` exception. When the Python function + call fails, a `error_already_set` exception is thrown. + \endrst */ + template + object operator()(Args &&...args) const; + template + PYBIND11_DEPRECATED("call(...) was deprecated in favor of operator()(...)") + object call(Args&&... args) const; + + /// Equivalent to ``obj is other`` in Python. + bool is(object_api const& other) const { return derived().ptr() == other.derived().ptr(); } + /// Equivalent to ``obj is None`` in Python. + bool is_none() const { return derived().ptr() == Py_None; } + /// Equivalent to obj == other in Python + bool equal(object_api const &other) const { return rich_compare(other, Py_EQ); } + bool not_equal(object_api const &other) const { return rich_compare(other, Py_NE); } + bool operator<(object_api const &other) const { return rich_compare(other, Py_LT); } + bool operator<=(object_api const &other) const { return rich_compare(other, Py_LE); } + bool operator>(object_api const &other) const { return rich_compare(other, Py_GT); } + bool operator>=(object_api const &other) const { return rich_compare(other, Py_GE); } + + object operator-() const; + object operator~() const; + object operator+(object_api const &other) const; + object operator+=(object_api const &other) const; + object operator-(object_api const &other) const; + object operator-=(object_api const &other) const; + object operator*(object_api const &other) const; + object operator*=(object_api const &other) const; + object operator/(object_api const &other) const; + object operator/=(object_api const &other) const; + object operator|(object_api const &other) const; + object operator|=(object_api const &other) const; + object operator&(object_api const &other) const; + object operator&=(object_api const &other) const; + object operator^(object_api const &other) const; + object operator^=(object_api const &other) const; + object operator<<(object_api const &other) const; + object operator<<=(object_api const &other) const; + object operator>>(object_api const &other) const; + object operator>>=(object_api const &other) const; + + PYBIND11_DEPRECATED("Use py::str(obj) instead") + pybind11::str str() const; + + /// Get or set the object's docstring, i.e. ``obj.__doc__``. + str_attr_accessor doc() const; + + /// Return the object's current reference count + int ref_count() const { return static_cast(Py_REFCNT(derived().ptr())); } + /// Return a handle to the Python type object underlying the instance + handle get_type() const; + +private: + bool rich_compare(object_api const &other, int value) const; +}; + +NAMESPACE_END(detail) + +/** \rst + Holds a reference to a Python object (no reference counting) + + The `handle` class is a thin wrapper around an arbitrary Python object (i.e. a + ``PyObject *`` in Python's C API). 
It does not perform any automatic reference
+    counting and merely provides a basic C++ interface to various Python API functions.
+
+    .. seealso::
+        The `object` class inherits from `handle` and adds automatic reference
+        counting features.
+\endrst */
+class handle : public detail::object_api<handle> {
+public:
+    /// The default constructor creates a handle with a ``nullptr``-valued pointer
+    handle() = default;
+    /// Creates a ``handle`` from the given raw Python object pointer
+    handle(PyObject *ptr) : m_ptr(ptr) { } // Allow implicit conversion from PyObject*
+
+    /// Return the underlying ``PyObject *`` pointer
+    PyObject *ptr() const { return m_ptr; }
+    PyObject *&ptr() { return m_ptr; }
+
+    /** \rst
+        Manually increase the reference count of the Python object. Usually, it is
+        preferable to use the `object` class which derives from `handle` and calls
+        this function automatically. Returns a reference to itself.
+    \endrst */
+    const handle& inc_ref() const & { Py_XINCREF(m_ptr); return *this; }
+
+    /** \rst
+        Manually decrease the reference count of the Python object. Usually, it is
+        preferable to use the `object` class which derives from `handle` and calls
+        this function automatically. Returns a reference to itself.
+    \endrst */
+    const handle& dec_ref() const & { Py_XDECREF(m_ptr); return *this; }
+
+    /** \rst
+        Attempt to cast the Python object into the given C++ type. A `cast_error`
+        will be thrown upon failure.
+    \endrst */
+    template <typename T> T cast() const;
+    /// Return ``true`` when the `handle` wraps a valid Python object
+    explicit operator bool() const { return m_ptr != nullptr; }
+    /** \rst
+        Deprecated: Check that the underlying pointers are the same.
+        Equivalent to ``obj1 is obj2`` in Python.
+    \endrst */
+    PYBIND11_DEPRECATED("Use obj1.is(obj2) instead")
+    bool operator==(const handle &h) const { return m_ptr == h.m_ptr; }
+    PYBIND11_DEPRECATED("Use !obj1.is(obj2) instead")
+    bool operator!=(const handle &h) const { return m_ptr != h.m_ptr; }
+    PYBIND11_DEPRECATED("Use handle::operator bool() instead")
+    bool check() const { return m_ptr != nullptr; }
+protected:
+    PyObject *m_ptr = nullptr;
+};
+
+/** \rst
+    Holds a reference to a Python object (with reference counting)
+
+    Like `handle`, the `object` class is a thin wrapper around an arbitrary Python
+    object (i.e. a ``PyObject *`` in Python's C API). In contrast to `handle`, it
+    optionally increases the object's reference count upon construction, and it
+    *always* decreases the reference count when the `object` instance goes out of
+    scope and is destructed. When using `object` instances consistently, it is much
+    easier to get reference counting right at the first attempt.
+\endrst */
+class object : public handle {
+public:
+    object() = default;
+    PYBIND11_DEPRECATED("Use reinterpret_borrow<object>() or reinterpret_steal<object>()")
+    object(handle h, bool is_borrowed) : handle(h) { if (is_borrowed) inc_ref(); }
+    /// Copy constructor; always increases the reference count
+    object(const object &o) : handle(o) { inc_ref(); }
+    /// Move constructor; steals the object from ``other`` and preserves its reference count
+    object(object &&other) noexcept { m_ptr = other.m_ptr; other.m_ptr = nullptr; }
+    /// Destructor; automatically calls `handle::dec_ref()`
+    ~object() { dec_ref(); }
+
+    /** \rst
+        Resets the internal pointer to ``nullptr`` without decreasing the
+        object's reference count. The function returns a raw handle to the original
+        Python object.
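+
+        A hedged sketch: handing ownership of the reference to a C API call that
+        expects to receive a new (owned) reference:
+
+        .. code-block:: cpp
+
+            pybind11::object o = pybind11::none();
+            PyObject *raw = o.release().ptr();   // `o` no longer owns the reference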
+ \endrst */ + handle release() { + PyObject *tmp = m_ptr; + m_ptr = nullptr; + return handle(tmp); + } + + object& operator=(const object &other) { + other.inc_ref(); + dec_ref(); + m_ptr = other.m_ptr; + return *this; + } + + object& operator=(object &&other) noexcept { + if (this != &other) { + handle temp(m_ptr); + m_ptr = other.m_ptr; + other.m_ptr = nullptr; + temp.dec_ref(); + } + return *this; + } + + // Calling cast() on an object lvalue just copies (via handle::cast) + template T cast() const &; + // Calling on an object rvalue does a move, if needed and/or possible + template T cast() &&; + +protected: + // Tags for choosing constructors from raw PyObject * + struct borrowed_t { }; + struct stolen_t { }; + + template friend T reinterpret_borrow(handle); + template friend T reinterpret_steal(handle); + +public: + // Only accessible from derived classes and the reinterpret_* functions + object(handle h, borrowed_t) : handle(h) { inc_ref(); } + object(handle h, stolen_t) : handle(h) { } +}; + +/** \rst + Declare that a `handle` or ``PyObject *`` is a certain type and borrow the reference. + The target type ``T`` must be `object` or one of its derived classes. The function + doesn't do any conversions or checks. It's up to the user to make sure that the + target type is correct. + + .. code-block:: cpp + + PyObject *p = PyList_GetItem(obj, index); + py::object o = reinterpret_borrow(p); + // or + py::tuple t = reinterpret_borrow(p); // <-- `p` must be already be a `tuple` +\endrst */ +template T reinterpret_borrow(handle h) { return {h, object::borrowed_t{}}; } + +/** \rst + Like `reinterpret_borrow`, but steals the reference. + + .. code-block:: cpp + + PyObject *p = PyObject_Str(obj); + py::str s = reinterpret_steal(p); // <-- `p` must be already be a `str` +\endrst */ +template T reinterpret_steal(handle h) { return {h, object::stolen_t{}}; } + +NAMESPACE_BEGIN(detail) +inline std::string error_string(); +NAMESPACE_END(detail) + +/// Fetch and hold an error which was already set in Python. An instance of this is typically +/// thrown to propagate python-side errors back through C++ which can either be caught manually or +/// else falls back to the function dispatcher (which then raises the captured error back to +/// python). +class error_already_set : public std::runtime_error { +public: + /// Constructs a new exception from the current Python error indicator, if any. The current + /// Python error indicator will be cleared. + error_already_set() : std::runtime_error(detail::error_string()) { + PyErr_Fetch(&m_type.ptr(), &m_value.ptr(), &m_trace.ptr()); + } + + error_already_set(const error_already_set &) = default; + error_already_set(error_already_set &&) = default; + + inline ~error_already_set(); + + /// Give the currently-held error back to Python, if any. If there is currently a Python error + /// already set it is cleared first. After this call, the current object no longer stores the + /// error variables (but the `.what()` string is still available). + void restore() { PyErr_Restore(m_type.release().ptr(), m_value.release().ptr(), m_trace.release().ptr()); } + + // Does nothing; provided for backwards compatibility. + PYBIND11_DEPRECATED("Use of error_already_set.clear() is deprecated") + void clear() {} + + /// Check if the currently trapped error type matches the given Python exception class (or a + /// subclass thereof). May also be passed a tuple to search for any exception class matches in + /// the given tuple. 
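+    ///
+    /// A hedged sketch of the intended use:
+    ///
+    ///     try {
+    ///         pybind11::getattr(obj, "missing_attribute");
+    ///     } catch (pybind11::error_already_set &e) {
+    ///         if (e.matches(PyExc_AttributeError)) { /* expected, swallow it */ }
+    ///         else throw;
+    ///     }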
+ bool matches(handle exc) const { return PyErr_GivenExceptionMatches(m_type.ptr(), exc.ptr()); } + + const object& type() const { return m_type; } + const object& value() const { return m_value; } + const object& trace() const { return m_trace; } + +private: + object m_type, m_value, m_trace; +}; + +/** \defgroup python_builtins _ + Unless stated otherwise, the following C++ functions behave the same + as their Python counterparts. + */ + +/** \ingroup python_builtins + \rst + Return true if ``obj`` is an instance of ``T``. Type ``T`` must be a subclass of + `object` or a class which was exposed to Python as ``py::class_``. +\endrst */ +template ::value, int> = 0> +bool isinstance(handle obj) { return T::check_(obj); } + +template ::value, int> = 0> +bool isinstance(handle obj) { return detail::isinstance_generic(obj, typeid(T)); } + +template <> inline bool isinstance(handle obj) = delete; +template <> inline bool isinstance(handle obj) { return obj.ptr() != nullptr; } + +/// \ingroup python_builtins +/// Return true if ``obj`` is an instance of the ``type``. +inline bool isinstance(handle obj, handle type) { + const auto result = PyObject_IsInstance(obj.ptr(), type.ptr()); + if (result == -1) + throw error_already_set(); + return result != 0; +} + +/// \addtogroup python_builtins +/// @{ +inline bool hasattr(handle obj, handle name) { + return PyObject_HasAttr(obj.ptr(), name.ptr()) == 1; +} + +inline bool hasattr(handle obj, const char *name) { + return PyObject_HasAttrString(obj.ptr(), name) == 1; +} + +inline void delattr(handle obj, handle name) { + if (PyObject_DelAttr(obj.ptr(), name.ptr()) != 0) { throw error_already_set(); } +} + +inline void delattr(handle obj, const char *name) { + if (PyObject_DelAttrString(obj.ptr(), name) != 0) { throw error_already_set(); } +} + +inline object getattr(handle obj, handle name) { + PyObject *result = PyObject_GetAttr(obj.ptr(), name.ptr()); + if (!result) { throw error_already_set(); } + return reinterpret_steal(result); +} + +inline object getattr(handle obj, const char *name) { + PyObject *result = PyObject_GetAttrString(obj.ptr(), name); + if (!result) { throw error_already_set(); } + return reinterpret_steal(result); +} + +inline object getattr(handle obj, handle name, handle default_) { + if (PyObject *result = PyObject_GetAttr(obj.ptr(), name.ptr())) { + return reinterpret_steal(result); + } else { + PyErr_Clear(); + return reinterpret_borrow(default_); + } +} + +inline object getattr(handle obj, const char *name, handle default_) { + if (PyObject *result = PyObject_GetAttrString(obj.ptr(), name)) { + return reinterpret_steal(result); + } else { + PyErr_Clear(); + return reinterpret_borrow(default_); + } +} + +inline void setattr(handle obj, handle name, handle value) { + if (PyObject_SetAttr(obj.ptr(), name.ptr(), value.ptr()) != 0) { throw error_already_set(); } +} + +inline void setattr(handle obj, const char *name, handle value) { + if (PyObject_SetAttrString(obj.ptr(), name, value.ptr()) != 0) { throw error_already_set(); } +} + +inline ssize_t hash(handle obj) { + auto h = PyObject_Hash(obj.ptr()); + if (h == -1) { throw error_already_set(); } + return h; +} + +/// @} python_builtins + +NAMESPACE_BEGIN(detail) +inline handle get_function(handle value) { + if (value) { +#if PY_MAJOR_VERSION >= 3 + if (PyInstanceMethod_Check(value.ptr())) + value = PyInstanceMethod_GET_FUNCTION(value.ptr()); + else +#endif + if (PyMethod_Check(value.ptr())) + value = PyMethod_GET_FUNCTION(value.ptr()); + } + return value; +} + +// Helper 
aliases/functions to support implicit casting of values given to python accessors/methods. +// When given a pyobject, this simply returns the pyobject as-is; for other C++ type, the value goes +// through pybind11::cast(obj) to convert it to an `object`. +template ::value, int> = 0> +auto object_or_cast(T &&o) -> decltype(std::forward(o)) { return std::forward(o); } +// The following casting version is implemented in cast.h: +template ::value, int> = 0> +object object_or_cast(T &&o); +// Match a PyObject*, which we want to convert directly to handle via its converting constructor +inline handle object_or_cast(PyObject *ptr) { return ptr; } + +template +class accessor : public object_api> { + using key_type = typename Policy::key_type; + +public: + accessor(handle obj, key_type key) : obj(obj), key(std::move(key)) { } + accessor(const accessor &) = default; + accessor(accessor &&) = default; + + // accessor overload required to override default assignment operator (templates are not allowed + // to replace default compiler-generated assignments). + void operator=(const accessor &a) && { std::move(*this).operator=(handle(a)); } + void operator=(const accessor &a) & { operator=(handle(a)); } + + template void operator=(T &&value) && { + Policy::set(obj, key, object_or_cast(std::forward(value))); + } + template void operator=(T &&value) & { + get_cache() = reinterpret_borrow(object_or_cast(std::forward(value))); + } + + template + PYBIND11_DEPRECATED("Use of obj.attr(...) as bool is deprecated in favor of pybind11::hasattr(obj, ...)") + explicit operator enable_if_t::value || + std::is_same::value, bool>() const { + return hasattr(obj, key); + } + template + PYBIND11_DEPRECATED("Use of obj[key] as bool is deprecated in favor of obj.contains(key)") + explicit operator enable_if_t::value, bool>() const { + return obj.contains(key); + } + + operator object() const { return get_cache(); } + PyObject *ptr() const { return get_cache().ptr(); } + template T cast() const { return get_cache().template cast(); } + +private: + object &get_cache() const { + if (!cache) { cache = Policy::get(obj, key); } + return cache; + } + +private: + handle obj; + key_type key; + mutable object cache; +}; + +NAMESPACE_BEGIN(accessor_policies) +struct obj_attr { + using key_type = object; + static object get(handle obj, handle key) { return getattr(obj, key); } + static void set(handle obj, handle key, handle val) { setattr(obj, key, val); } +}; + +struct str_attr { + using key_type = const char *; + static object get(handle obj, const char *key) { return getattr(obj, key); } + static void set(handle obj, const char *key, handle val) { setattr(obj, key, val); } +}; + +struct generic_item { + using key_type = object; + + static object get(handle obj, handle key) { + PyObject *result = PyObject_GetItem(obj.ptr(), key.ptr()); + if (!result) { throw error_already_set(); } + return reinterpret_steal(result); + } + + static void set(handle obj, handle key, handle val) { + if (PyObject_SetItem(obj.ptr(), key.ptr(), val.ptr()) != 0) { throw error_already_set(); } + } +}; + +struct sequence_item { + using key_type = size_t; + + static object get(handle obj, size_t index) { + PyObject *result = PySequence_GetItem(obj.ptr(), static_cast(index)); + if (!result) { throw error_already_set(); } + return reinterpret_steal(result); + } + + static void set(handle obj, size_t index, handle val) { + // PySequence_SetItem does not steal a reference to 'val' + if (PySequence_SetItem(obj.ptr(), static_cast(index), val.ptr()) != 0) { + throw 
error_already_set(); + } + } +}; + +struct list_item { + using key_type = size_t; + + static object get(handle obj, size_t index) { + PyObject *result = PyList_GetItem(obj.ptr(), static_cast(index)); + if (!result) { throw error_already_set(); } + return reinterpret_borrow(result); + } + + static void set(handle obj, size_t index, handle val) { + // PyList_SetItem steals a reference to 'val' + if (PyList_SetItem(obj.ptr(), static_cast(index), val.inc_ref().ptr()) != 0) { + throw error_already_set(); + } + } +}; + +struct tuple_item { + using key_type = size_t; + + static object get(handle obj, size_t index) { + PyObject *result = PyTuple_GetItem(obj.ptr(), static_cast(index)); + if (!result) { throw error_already_set(); } + return reinterpret_borrow(result); + } + + static void set(handle obj, size_t index, handle val) { + // PyTuple_SetItem steals a reference to 'val' + if (PyTuple_SetItem(obj.ptr(), static_cast(index), val.inc_ref().ptr()) != 0) { + throw error_already_set(); + } + } +}; +NAMESPACE_END(accessor_policies) + +/// STL iterator template used for tuple, list, sequence and dict +template +class generic_iterator : public Policy { + using It = generic_iterator; + +public: + using difference_type = ssize_t; + using iterator_category = typename Policy::iterator_category; + using value_type = typename Policy::value_type; + using reference = typename Policy::reference; + using pointer = typename Policy::pointer; + + generic_iterator() = default; + generic_iterator(handle seq, ssize_t index) : Policy(seq, index) { } + + reference operator*() const { return Policy::dereference(); } + reference operator[](difference_type n) const { return *(*this + n); } + pointer operator->() const { return **this; } + + It &operator++() { Policy::increment(); return *this; } + It operator++(int) { auto copy = *this; Policy::increment(); return copy; } + It &operator--() { Policy::decrement(); return *this; } + It operator--(int) { auto copy = *this; Policy::decrement(); return copy; } + It &operator+=(difference_type n) { Policy::advance(n); return *this; } + It &operator-=(difference_type n) { Policy::advance(-n); return *this; } + + friend It operator+(const It &a, difference_type n) { auto copy = a; return copy += n; } + friend It operator+(difference_type n, const It &b) { return b + n; } + friend It operator-(const It &a, difference_type n) { auto copy = a; return copy -= n; } + friend difference_type operator-(const It &a, const It &b) { return a.distance_to(b); } + + friend bool operator==(const It &a, const It &b) { return a.equal(b); } + friend bool operator!=(const It &a, const It &b) { return !(a == b); } + friend bool operator< (const It &a, const It &b) { return b - a > 0; } + friend bool operator> (const It &a, const It &b) { return b < a; } + friend bool operator>=(const It &a, const It &b) { return !(a < b); } + friend bool operator<=(const It &a, const It &b) { return !(a > b); } +}; + +NAMESPACE_BEGIN(iterator_policies) +/// Quick proxy class needed to implement ``operator->`` for iterators which can't return pointers +template +struct arrow_proxy { + T value; + + arrow_proxy(T &&value) : value(std::move(value)) { } + T *operator->() const { return &value; } +}; + +/// Lightweight iterator policy using just a simple pointer: see ``PySequence_Fast_ITEMS`` +class sequence_fast_readonly { +protected: + using iterator_category = std::random_access_iterator_tag; + using value_type = handle; + using reference = const handle; + using pointer = arrow_proxy; + + sequence_fast_readonly(handle 
obj, ssize_t n) : ptr(PySequence_Fast_ITEMS(obj.ptr()) + n) { } + + reference dereference() const { return *ptr; } + void increment() { ++ptr; } + void decrement() { --ptr; } + void advance(ssize_t n) { ptr += n; } + bool equal(const sequence_fast_readonly &b) const { return ptr == b.ptr; } + ssize_t distance_to(const sequence_fast_readonly &b) const { return ptr - b.ptr; } + +private: + PyObject **ptr; +}; + +/// Full read and write access using the sequence protocol: see ``detail::sequence_accessor`` +class sequence_slow_readwrite { +protected: + using iterator_category = std::random_access_iterator_tag; + using value_type = object; + using reference = sequence_accessor; + using pointer = arrow_proxy; + + sequence_slow_readwrite(handle obj, ssize_t index) : obj(obj), index(index) { } + + reference dereference() const { return {obj, static_cast(index)}; } + void increment() { ++index; } + void decrement() { --index; } + void advance(ssize_t n) { index += n; } + bool equal(const sequence_slow_readwrite &b) const { return index == b.index; } + ssize_t distance_to(const sequence_slow_readwrite &b) const { return index - b.index; } + +private: + handle obj; + ssize_t index; +}; + +/// Python's dictionary protocol permits this to be a forward iterator +class dict_readonly { +protected: + using iterator_category = std::forward_iterator_tag; + using value_type = std::pair; + using reference = const value_type; + using pointer = arrow_proxy; + + dict_readonly() = default; + dict_readonly(handle obj, ssize_t pos) : obj(obj), pos(pos) { increment(); } + + reference dereference() const { return {key, value}; } + void increment() { if (!PyDict_Next(obj.ptr(), &pos, &key, &value)) { pos = -1; } } + bool equal(const dict_readonly &b) const { return pos == b.pos; } + +private: + handle obj; + PyObject *key = nullptr, *value = nullptr; + ssize_t pos = -1; +}; +NAMESPACE_END(iterator_policies) + +#if !defined(PYPY_VERSION) +using tuple_iterator = generic_iterator; +using list_iterator = generic_iterator; +#else +using tuple_iterator = generic_iterator; +using list_iterator = generic_iterator; +#endif + +using sequence_iterator = generic_iterator; +using dict_iterator = generic_iterator; + +inline bool PyIterable_Check(PyObject *obj) { + PyObject *iter = PyObject_GetIter(obj); + if (iter) { + Py_DECREF(iter); + return true; + } else { + PyErr_Clear(); + return false; + } +} + +inline bool PyNone_Check(PyObject *o) { return o == Py_None; } +#if PY_MAJOR_VERSION >= 3 +inline bool PyEllipsis_Check(PyObject *o) { return o == Py_Ellipsis; } +#endif + +inline bool PyUnicode_Check_Permissive(PyObject *o) { return PyUnicode_Check(o) || PYBIND11_BYTES_CHECK(o); } + +inline bool PyStaticMethod_Check(PyObject *o) { return o->ob_type == &PyStaticMethod_Type; } + +class kwargs_proxy : public handle { +public: + explicit kwargs_proxy(handle h) : handle(h) { } +}; + +class args_proxy : public handle { +public: + explicit args_proxy(handle h) : handle(h) { } + kwargs_proxy operator*() const { return kwargs_proxy(*this); } +}; + +/// Python argument categories (using PEP 448 terms) +template using is_keyword = std::is_base_of; +template using is_s_unpacking = std::is_same; // * unpacking +template using is_ds_unpacking = std::is_same; // ** unpacking +template using is_positional = satisfies_none_of; +template using is_keyword_or_ds = satisfies_any_of; + +// Call argument collector forward declarations +template +class simple_collector; +template +class unpacking_collector; + +NAMESPACE_END(detail) + +// TODO: After the 
+
+// TODO: After the deprecated constructors are removed, this macro can be simplified by
+//       inheriting ctors: `using Parent::Parent`. It's not an option right now because
+//       the `using` statement triggers the parent deprecation warning even if the ctor
+//       isn't even used.
+#define PYBIND11_OBJECT_COMMON(Name, Parent, CheckFun) \
+    public: \
+        PYBIND11_DEPRECATED("Use reinterpret_borrow<"#Name">() or reinterpret_steal<"#Name">()") \
+        Name(handle h, bool is_borrowed) : Parent(is_borrowed ? Parent(h, borrowed_t{}) : Parent(h, stolen_t{})) { } \
+        Name(handle h, borrowed_t) : Parent(h, borrowed_t{}) { } \
+        Name(handle h, stolen_t) : Parent(h, stolen_t{}) { } \
+        PYBIND11_DEPRECATED("Use py::isinstance<py::python_type>(obj) instead") \
+        bool check() const { return m_ptr != nullptr && (bool) CheckFun(m_ptr); } \
+        static bool check_(handle h) { return h.ptr() != nullptr && CheckFun(h.ptr()); }
+
+#define PYBIND11_OBJECT_CVT(Name, Parent, CheckFun, ConvertFun) \
+    PYBIND11_OBJECT_COMMON(Name, Parent, CheckFun) \
+    /* This is deliberately not 'explicit' to allow implicit conversion from object: */ \
+    Name(const object &o) \
+    : Parent(check_(o) ? o.inc_ref().ptr() : ConvertFun(o.ptr()), stolen_t{}) \
+    { if (!m_ptr) throw error_already_set(); } \
+    Name(object &&o) \
+    : Parent(check_(o) ? o.release().ptr() : ConvertFun(o.ptr()), stolen_t{}) \
+    { if (!m_ptr) throw error_already_set(); } \
+    template <typename Policy_> \
+    Name(const ::pybind11::detail::accessor<Policy_> &a) : Name(object(a)) { }
+
+#define PYBIND11_OBJECT(Name, Parent, CheckFun) \
+    PYBIND11_OBJECT_COMMON(Name, Parent, CheckFun) \
+    /* This is deliberately not 'explicit' to allow implicit conversion from object: */ \
+    Name(const object &o) : Parent(o) { } \
+    Name(object &&o) : Parent(std::move(o)) { }
+
+#define PYBIND11_OBJECT_DEFAULT(Name, Parent, CheckFun) \
+    PYBIND11_OBJECT(Name, Parent, CheckFun) \
+    Name() : Parent() { }
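+
+// Illustrative sketch (added for this document, not upstream pybind11): every
+// wrapper declared through these macros gains a static check_(), which is what
+// py::isinstance<T> consults for the types in this header:
+//
+//     py::object o = py::int_(1);
+//     bool ok = py::isinstance<py::int_>(o);   // -> int_::check_(o) -> PYBIND11_LONG_CHECK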
+
+/// \addtogroup pytypes
+/// @{
+
+/** \rst
+    Wraps a Python iterator so that it can also be used as a C++ input iterator
+
+    Caveat: copying an iterator does not (and cannot) clone the internal
+    state of the Python iterable. This also applies to the post-increment
+    operator. This iterator should only be used to retrieve the current
+    value using ``operator*()``.
+\endrst */
+class iterator : public object {
+public:
+    using iterator_category = std::input_iterator_tag;
+    using difference_type = ssize_t;
+    using value_type = handle;
+    using reference = const handle;
+    using pointer = const handle *;
+
+    PYBIND11_OBJECT_DEFAULT(iterator, object, PyIter_Check)
+
+    iterator& operator++() {
+        advance();
+        return *this;
+    }
+
+    iterator operator++(int) {
+        auto rv = *this;
+        advance();
+        return rv;
+    }
+
+    reference operator*() const {
+        if (m_ptr && !value.ptr()) {
+            auto& self = const_cast<iterator &>(*this);
+            self.advance();
+        }
+        return value;
+    }
+
+    pointer operator->() const { operator*(); return &value; }
+
+    /** \rst
+         The value which marks the end of the iteration. ``it == iterator::sentinel()``
+         is equivalent to catching ``StopIteration`` in Python.
+
+         .. code-block:: cpp
+
+             void foo(py::iterator it) {
+                 while (it != py::iterator::sentinel()) {
+                    // use `*it`
+                    ++it;
+                 }
+             }
+    \endrst */
+    static iterator sentinel() { return {}; }
+
+    friend bool operator==(const iterator &a, const iterator &b) { return a->ptr() == b->ptr(); }
+    friend bool operator!=(const iterator &a, const iterator &b) { return a->ptr() != b->ptr(); }
+
+private:
+    void advance() {
+        value = reinterpret_steal<object>(PyIter_Next(m_ptr));
+        if (PyErr_Occurred()) { throw error_already_set(); }
+    }
+
+private:
+    object value = {};
+};
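+
+// Illustrative sketch (added for this document, not upstream pybind11):
+// draining any iterable from C++ with the sentinel idiom documented above
+// (`process` is a hypothetical callback; py::iter is defined later in this file):
+//
+//     for (py::iterator it = py::iter(obj); it != py::iterator::sentinel(); ++it)
+//         process(*it);   // *it is a py::handle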
(invalid type)"); + return std::string(buffer, (size_t) length); + } + + template + str format(Args &&...args) const { + return attr("format")(std::forward(args)...); + } + +private: + /// Return string representation -- always returns a new reference, even if already a str + static PyObject *raw_str(PyObject *op) { + PyObject *str_value = PyObject_Str(op); +#if PY_MAJOR_VERSION < 3 + if (!str_value) throw error_already_set(); + PyObject *unicode = PyUnicode_FromEncodedObject(str_value, "utf-8", nullptr); + Py_XDECREF(str_value); str_value = unicode; +#endif + return str_value; + } +}; +/// @} pytypes + +inline namespace literals { +/** \rst + String literal version of `str` + \endrst */ +inline str operator"" _s(const char *s, size_t size) { return {s, size}; } +} + +/// \addtogroup pytypes +/// @{ +class bytes : public object { +public: + PYBIND11_OBJECT(bytes, object, PYBIND11_BYTES_CHECK) + + // Allow implicit conversion: + bytes(const char *c = "") + : object(PYBIND11_BYTES_FROM_STRING(c), stolen_t{}) { + if (!m_ptr) pybind11_fail("Could not allocate bytes object!"); + } + + bytes(const char *c, size_t n) + : object(PYBIND11_BYTES_FROM_STRING_AND_SIZE(c, (ssize_t) n), stolen_t{}) { + if (!m_ptr) pybind11_fail("Could not allocate bytes object!"); + } + + // Allow implicit conversion: + bytes(const std::string &s) : bytes(s.data(), s.size()) { } + + explicit bytes(const pybind11::str &s); + + operator std::string() const { + char *buffer; + ssize_t length; + if (PYBIND11_BYTES_AS_STRING_AND_SIZE(m_ptr, &buffer, &length)) + pybind11_fail("Unable to extract bytes contents!"); + return std::string(buffer, (size_t) length); + } +}; + +inline bytes::bytes(const pybind11::str &s) { + object temp = s; + if (PyUnicode_Check(s.ptr())) { + temp = reinterpret_steal(PyUnicode_AsUTF8String(s.ptr())); + if (!temp) + pybind11_fail("Unable to extract string contents! (encoding issue)"); + } + char *buffer; + ssize_t length; + if (PYBIND11_BYTES_AS_STRING_AND_SIZE(temp.ptr(), &buffer, &length)) + pybind11_fail("Unable to extract string contents! (invalid type)"); + auto obj = reinterpret_steal(PYBIND11_BYTES_FROM_STRING_AND_SIZE(buffer, length)); + if (!obj) + pybind11_fail("Could not allocate bytes object!"); + m_ptr = obj.release().ptr(); +} + +inline str::str(const bytes& b) { + char *buffer; + ssize_t length; + if (PYBIND11_BYTES_AS_STRING_AND_SIZE(b.ptr(), &buffer, &length)) + pybind11_fail("Unable to extract bytes contents!"); + auto obj = reinterpret_steal(PyUnicode_FromStringAndSize(buffer, (ssize_t) length)); + if (!obj) + pybind11_fail("Could not allocate string object!"); + m_ptr = obj.release().ptr(); +} + +class none : public object { +public: + PYBIND11_OBJECT(none, object, detail::PyNone_Check) + none() : object(Py_None, borrowed_t{}) { } +}; + +#if PY_MAJOR_VERSION >= 3 +class ellipsis : public object { +public: + PYBIND11_OBJECT(ellipsis, object, detail::PyEllipsis_Check) + ellipsis() : object(Py_Ellipsis, borrowed_t{}) { } +}; +#endif + +class bool_ : public object { +public: + PYBIND11_OBJECT_CVT(bool_, object, PyBool_Check, raw_bool) + bool_() : object(Py_False, borrowed_t{}) { } + // Allow implicit conversion from and to `bool`: + bool_(bool value) : object(value ? 
+
+class bool_ : public object {
+public:
+    PYBIND11_OBJECT_CVT(bool_, object, PyBool_Check, raw_bool)
+    bool_() : object(Py_False, borrowed_t{}) { }
+    // Allow implicit conversion from and to `bool`:
+    bool_(bool value) : object(value ? Py_True : Py_False, borrowed_t{}) { }
+    operator bool() const { return m_ptr && PyLong_AsLong(m_ptr) != 0; }
+
+private:
+    /// Return the truth value of an object -- always returns a new reference
+    static PyObject *raw_bool(PyObject *op) {
+        const auto value = PyObject_IsTrue(op);
+        if (value == -1) return nullptr;
+        return handle(value ? Py_True : Py_False).inc_ref().ptr();
+    }
+};
+
+NAMESPACE_BEGIN(detail)
+// Converts a value to the given unsigned type.  If an error occurs, you get back (Unsigned) -1;
+// otherwise you get back the unsigned long or unsigned long long value cast to (Unsigned).
+// (The distinction is critically important when casting a returned -1 error value to some other
+// unsigned type: (A)-1 != (B)-1 when A and B are unsigned types of different sizes).
+template <typename Unsigned>
+Unsigned as_unsigned(PyObject *o) {
+    if (sizeof(Unsigned) <= sizeof(unsigned long)
+#if PY_VERSION_HEX < 0x03000000
+            || PyInt_Check(o)
+#endif
+    ) {
+        unsigned long v = PyLong_AsUnsignedLong(o);
+        return v == (unsigned long) -1 && PyErr_Occurred() ? (Unsigned) -1 : (Unsigned) v;
+    }
+    else {
+        unsigned long long v = PyLong_AsUnsignedLongLong(o);
+        return v == (unsigned long long) -1 && PyErr_Occurred() ? (Unsigned) -1 : (Unsigned) v;
+    }
+}
+NAMESPACE_END(detail)
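+
+// Illustrative sketch (added for this document, not upstream pybind11): since
+// (Unsigned) -1 doubles as the error marker above, callers must consult
+// PyErr_Occurred() instead of testing the value alone:
+//
+//     auto v = detail::as_unsigned<uint16_t>(o);
+//     if (v == (uint16_t) -1 && PyErr_Occurred()) { /* conversion failed */ }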
+
+class int_ : public object {
+public:
+    PYBIND11_OBJECT_CVT(int_, object, PYBIND11_LONG_CHECK, PyNumber_Long)
+    int_() : object(PyLong_FromLong(0), stolen_t{}) { }
+    // Allow implicit conversion from C++ integral types:
+    template <typename T,
+              detail::enable_if_t<std::is_integral<T>::value, int> = 0>
+    int_(T value) {
+        if (sizeof(T) <= sizeof(long)) {
+            if (std::is_signed<T>::value)
+                m_ptr = PyLong_FromLong((long) value);
+            else
+                m_ptr = PyLong_FromUnsignedLong((unsigned long) value);
+        } else {
+            if (std::is_signed<T>::value)
+                m_ptr = PyLong_FromLongLong((long long) value);
+            else
+                m_ptr = PyLong_FromUnsignedLongLong((unsigned long long) value);
+        }
+        if (!m_ptr) pybind11_fail("Could not allocate int object!");
+    }
+
+    template <typename T,
+              detail::enable_if_t<std::is_integral<T>::value, int> = 0>
+    operator T() const {
+        return std::is_unsigned<T>::value
+            ? detail::as_unsigned<T>(m_ptr)
+            : sizeof(T) <= sizeof(long)
+              ? (T) PyLong_AsLong(m_ptr)
+              : (T) PYBIND11_LONG_AS_LONGLONG(m_ptr);
+    }
+};
+
+class float_ : public object {
+public:
+    PYBIND11_OBJECT_CVT(float_, object, PyFloat_Check, PyNumber_Float)
+    // Allow implicit conversion from float/double:
+    float_(float value) : object(PyFloat_FromDouble((double) value), stolen_t{}) {
+        if (!m_ptr) pybind11_fail("Could not allocate float object!");
+    }
+    float_(double value = .0) : object(PyFloat_FromDouble((double) value), stolen_t{}) {
+        if (!m_ptr) pybind11_fail("Could not allocate float object!");
+    }
+    operator float() const { return (float) PyFloat_AsDouble(m_ptr); }
+    operator double() const { return (double) PyFloat_AsDouble(m_ptr); }
+};
+
+class weakref : public object {
+public:
+    PYBIND11_OBJECT_DEFAULT(weakref, object, PyWeakref_Check)
+    explicit weakref(handle obj, handle callback = {})
+        : object(PyWeakref_NewRef(obj.ptr(), callback.ptr()), stolen_t{}) {
+        if (!m_ptr) pybind11_fail("Could not allocate weak reference!");
+    }
+};
+
+class slice : public object {
+public:
+    PYBIND11_OBJECT_DEFAULT(slice, object, PySlice_Check)
+    slice(ssize_t start_, ssize_t stop_, ssize_t step_) {
+        int_ start(start_), stop(stop_), step(step_);
+        m_ptr = PySlice_New(start.ptr(), stop.ptr(), step.ptr());
+        if (!m_ptr) pybind11_fail("Could not allocate slice object!");
+    }
+    bool compute(size_t length, size_t *start, size_t *stop, size_t *step,
+                 size_t *slicelength) const {
+        return PySlice_GetIndicesEx((PYBIND11_SLICE_OBJECT *) m_ptr,
+                                    (ssize_t) length, (ssize_t *) start,
+                                    (ssize_t *) stop, (ssize_t *) step,
+                                    (ssize_t *) slicelength) == 0;
+    }
+    bool compute(ssize_t length, ssize_t *start, ssize_t *stop, ssize_t *step,
+                 ssize_t *slicelength) const {
+        return PySlice_GetIndicesEx((PYBIND11_SLICE_OBJECT *) m_ptr,
+                                    length, start,
+                                    stop, step,
+                                    slicelength) == 0;
+    }
+};
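+
+// Illustrative sketch (added for this document, not upstream pybind11):
+// normalizing a slice against a known container length with compute():
+//
+//     size_t start, stop, step, slicelength;
+//     py::slice sl(0, 10, 2);
+//     if (sl.compute(10, &start, &stop, &step, &slicelength)) {
+//         // start == 0, stop == 10, step == 2, slicelength == 5
+//     }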
+
+class capsule : public object {
+public:
+    PYBIND11_OBJECT_DEFAULT(capsule, object, PyCapsule_CheckExact)
+    PYBIND11_DEPRECATED("Use reinterpret_borrow<capsule>() or reinterpret_steal<capsule>()")
+    capsule(PyObject *ptr, bool is_borrowed) : object(is_borrowed ? object(ptr, borrowed_t{}) : object(ptr, stolen_t{})) { }
+
+    explicit capsule(const void *value, const char *name = nullptr, void (*destructor)(PyObject *) = nullptr)
+        : object(PyCapsule_New(const_cast<void *>(value), name, destructor), stolen_t{}) {
+        if (!m_ptr)
+            pybind11_fail("Could not allocate capsule object!");
+    }
+
+    PYBIND11_DEPRECATED("Please pass a destructor that takes a void pointer as input")
+    capsule(const void *value, void (*destruct)(PyObject *))
+        : object(PyCapsule_New(const_cast<void *>(value), nullptr, destruct), stolen_t{}) {
+        if (!m_ptr)
+            pybind11_fail("Could not allocate capsule object!");
+    }
+
+    capsule(const void *value, void (*destructor)(void *)) {
+        m_ptr = PyCapsule_New(const_cast<void *>(value), nullptr, [](PyObject *o) {
+            auto destructor = reinterpret_cast<void (*)(void *)>(PyCapsule_GetContext(o));
+            void *ptr = PyCapsule_GetPointer(o, nullptr);
+            destructor(ptr);
+        });
+
+        if (!m_ptr)
+            pybind11_fail("Could not allocate capsule object!");
+
+        if (PyCapsule_SetContext(m_ptr, (void *) destructor) != 0)
+            pybind11_fail("Could not set capsule context!");
+    }
+
+    capsule(void (*destructor)()) {
+        m_ptr = PyCapsule_New(reinterpret_cast<void *>(destructor), nullptr, [](PyObject *o) {
+            auto destructor = reinterpret_cast<void (*)()>(PyCapsule_GetPointer(o, nullptr));
+            destructor();
+        });
+
+        if (!m_ptr)
+            pybind11_fail("Could not allocate capsule object!");
+    }
+
+    template <typename T> operator T *() const {
+        auto name = this->name();
+        T * result = static_cast<T *>(PyCapsule_GetPointer(m_ptr, name));
+        if (!result) pybind11_fail("Unable to extract capsule contents!");
+        return result;
+    }
+
+    const char *name() const { return PyCapsule_GetName(m_ptr); }
+};
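+
+// Illustrative sketch (added for this document, not upstream pybind11): tying
+// the lifetime of a C heap block to a capsule via the void(*)(void *) overload:
+//
+//     void *blob = std::malloc(64);
+//     py::capsule guard(blob, [](void *p) { std::free(p); });
+//     // keep `guard` alive (e.g. as the `base` of a py::array) as long as `blob` is used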
+
+class tuple : public object {
+public:
+    PYBIND11_OBJECT_CVT(tuple, object, PyTuple_Check, PySequence_Tuple)
+    explicit tuple(size_t size = 0) : object(PyTuple_New((ssize_t) size), stolen_t{}) {
+        if (!m_ptr) pybind11_fail("Could not allocate tuple object!");
+    }
+    size_t size() const { return (size_t) PyTuple_Size(m_ptr); }
+    bool empty() const { return size() == 0; }
+    detail::tuple_accessor operator[](size_t index) const { return {*this, index}; }
+    detail::item_accessor operator[](handle h) const { return object::operator[](h); }
+    detail::tuple_iterator begin() const { return {*this, 0}; }
+    detail::tuple_iterator end() const { return {*this, PyTuple_GET_SIZE(m_ptr)}; }
+};
+
+class dict : public object {
+public:
+    PYBIND11_OBJECT_CVT(dict, object, PyDict_Check, raw_dict)
+    dict() : object(PyDict_New(), stolen_t{}) {
+        if (!m_ptr) pybind11_fail("Could not allocate dict object!");
+    }
+    template <typename... Args,
+              typename = detail::enable_if_t<detail::all_of<detail::is_keyword_or_ds<Args>...>::value>,
+              // MSVC workaround: it can't compile an out-of-line definition, so defer the collector
+              typename collector = detail::deferred_t<detail::unpacking_collector<>, Args...>>
+    explicit dict(Args &&...args) : dict(collector(std::forward<Args>(args)...).kwargs()) { }
+
+    size_t size() const { return (size_t) PyDict_Size(m_ptr); }
+    bool empty() const { return size() == 0; }
+    detail::dict_iterator begin() const { return {*this, 0}; }
+    detail::dict_iterator end() const { return {}; }
+    void clear() const { PyDict_Clear(ptr()); }
+    template <typename T> bool contains(T &&key) const {
+        return PyDict_Contains(m_ptr, detail::object_or_cast(std::forward<T>(key)).ptr()) == 1;
+    }
+
+private:
+    /// Call the `dict` Python type -- always returns a new reference
+    static PyObject *raw_dict(PyObject *op) {
+        if (PyDict_Check(op))
+            return handle(op).inc_ref().ptr();
+        return PyObject_CallFunctionObjArgs((PyObject *) &PyDict_Type, op, nullptr);
+    }
+};
+
+class sequence : public object {
+public:
+    PYBIND11_OBJECT_DEFAULT(sequence, object, PySequence_Check)
+    size_t size() const { return (size_t) PySequence_Size(m_ptr); }
+    bool empty() const { return size() == 0; }
+    detail::sequence_accessor operator[](size_t index) const { return {*this, index}; }
+    detail::item_accessor operator[](handle h) const { return object::operator[](h); }
+    detail::sequence_iterator begin() const { return {*this, 0}; }
+    detail::sequence_iterator end() const { return {*this, PySequence_Size(m_ptr)}; }
+};
+
+class list : public object {
+public:
+    PYBIND11_OBJECT_CVT(list, object, PyList_Check, PySequence_List)
+    explicit list(size_t size = 0) : object(PyList_New((ssize_t) size), stolen_t{}) {
+        if (!m_ptr) pybind11_fail("Could not allocate list object!");
+    }
+    size_t size() const { return (size_t) PyList_Size(m_ptr); }
+    bool empty() const { return size() == 0; }
+    detail::list_accessor operator[](size_t index) const { return {*this, index}; }
+    detail::item_accessor operator[](handle h) const { return object::operator[](h); }
+    detail::list_iterator begin() const { return {*this, 0}; }
+    detail::list_iterator end() const { return {*this, PyList_GET_SIZE(m_ptr)}; }
+    template <typename T> void append(T &&val) const {
+        PyList_Append(m_ptr, detail::object_or_cast(std::forward<T>(val)).ptr());
+    }
+    template <typename T> void insert(size_t index, T &&val) const {
+        PyList_Insert(m_ptr, static_cast<ssize_t>(index),
+                      detail::object_or_cast(std::forward<T>(val)).ptr());
+    }
+};
+
+class args : public tuple { PYBIND11_OBJECT_DEFAULT(args, tuple, PyTuple_Check) };
+class kwargs : public dict { PYBIND11_OBJECT_DEFAULT(kwargs, dict, PyDict_Check) };
+
+class set : public object {
+public:
+    PYBIND11_OBJECT_CVT(set, object, PySet_Check, PySet_New)
+    set() : object(PySet_New(nullptr), stolen_t{}) {
+        if (!m_ptr) pybind11_fail("Could not allocate set object!");
+    }
+    size_t size() const { return (size_t) PySet_Size(m_ptr); }
+    bool empty() const { return size() == 0; }
+    template <typename T> bool add(T &&val) const {
+        return PySet_Add(m_ptr, detail::object_or_cast(std::forward<T>(val)).ptr()) == 0;
+    }
+    void clear() const { PySet_Clear(m_ptr); }
+    template <typename T> bool contains(T &&val) const {
+        return PySet_Contains(m_ptr, detail::object_or_cast(std::forward<T>(val)).ptr()) == 1;
+    }
+};
+
+class function : public object {
+public:
+    PYBIND11_OBJECT_DEFAULT(function, object, PyCallable_Check)
+    handle cpp_function() const {
+        handle fun = detail::get_function(m_ptr);
+        if (fun && PyCFunction_Check(fun.ptr()))
+            return fun;
+        return handle();
+    }
+    bool is_cpp_function() const { return (bool) cpp_function(); }
+};
+
+class staticmethod : public object {
+public:
+    PYBIND11_OBJECT_CVT(staticmethod, object, detail::PyStaticMethod_Check, PyStaticMethod_New)
+};
+
+class buffer : public object {
+public:
+    PYBIND11_OBJECT_DEFAULT(buffer, object, PyObject_CheckBuffer)
+
+    buffer_info request(bool writable = false) const {
+        int flags = PyBUF_STRIDES | PyBUF_FORMAT;
+        if (writable) flags |= PyBUF_WRITABLE;
+        Py_buffer *view = new Py_buffer();
+        if (PyObject_GetBuffer(m_ptr, view, flags) != 0) {
+            delete view;
+            throw error_already_set();
+        }
+        return buffer_info(view);
+    }
+};
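+
+// Illustrative sketch (added for this document, not upstream pybind11):
+// requesting a writable view through the buffer-protocol wrapper above:
+//
+//     py::buffer b = ...;                      // e.g. a bytearray or numpy array
+//     py::buffer_info info = b.request(true);  // adds PyBUF_WRITABLE to the flags
+//     // info.ptr / info.shape / info.strides describe the underlying memory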
+
+class memoryview : public object {
+public:
+    explicit memoryview(const buffer_info& info) {
+        static Py_buffer buf { };
+        // Py_buffer uses signed sizes, strides and shape!..
+        static std::vector<Py_ssize_t> py_strides { };
+        static std::vector<Py_ssize_t> py_shape { };
+        buf.buf = info.ptr;
+        buf.itemsize = info.itemsize;
+        buf.format = const_cast<char *>(info.format.c_str());
+        buf.ndim = (int) info.ndim;
+        buf.len = info.size;
+        py_strides.clear();
+        py_shape.clear();
+        for (size_t i = 0; i < (size_t) info.ndim; ++i) {
+            py_strides.push_back(info.strides[i]);
+            py_shape.push_back(info.shape[i]);
+        }
+        buf.strides = py_strides.data();
+        buf.shape = py_shape.data();
+        buf.suboffsets = nullptr;
+        buf.readonly = info.readonly;
+        buf.internal = nullptr;
+
+        m_ptr = PyMemoryView_FromBuffer(&buf);
+        if (!m_ptr)
+            pybind11_fail("Unable to create memoryview from buffer descriptor");
+    }
+
+    PYBIND11_OBJECT_CVT(memoryview, object, PyMemoryView_Check, PyMemoryView_FromObject)
+};
+/// @} pytypes
+
+/// \addtogroup python_builtins
+/// @{
+inline size_t len(handle h) {
+    ssize_t result = PyObject_Length(h.ptr());
+    if (result < 0)
+        pybind11_fail("Unable to compute length of object");
+    return (size_t) result;
+}
+
+inline size_t len_hint(handle h) {
+#if PY_VERSION_HEX >= 0x03040000
+    ssize_t result = PyObject_LengthHint(h.ptr(), 0);
+#else
+    ssize_t result = PyObject_Length(h.ptr());
+#endif
+    if (result < 0) {
+        // Sometimes a length can't be determined at all (eg generators)
+        // In which case simply return 0
+        PyErr_Clear();
+        return 0;
+    }
+    return (size_t) result;
+}
+
+inline str repr(handle h) {
+    PyObject *str_value = PyObject_Repr(h.ptr());
+    if (!str_value) throw error_already_set();
+#if PY_MAJOR_VERSION < 3
+    PyObject *unicode = PyUnicode_FromEncodedObject(str_value, "utf-8", nullptr);
+    Py_XDECREF(str_value); str_value = unicode;
+    if (!str_value) throw error_already_set();
+#endif
+    return reinterpret_steal<str>(str_value);
+}
+
+inline iterator iter(handle obj) {
+    PyObject *result = PyObject_GetIter(obj.ptr());
+    if (!result) { throw error_already_set(); }
+    return reinterpret_steal<iterator>(result);
+}
+/// @} python_builtins
+
+NAMESPACE_BEGIN(detail)
+template <typename D> iterator object_api<D>::begin() const { return iter(derived()); }
+template <typename D> iterator object_api<D>::end() const { return iterator::sentinel(); }
+template <typename D> item_accessor object_api<D>::operator[](handle key) const {
+    return {derived(), reinterpret_borrow<object>(key)};
+}
+template <typename D> item_accessor object_api<D>::operator[](const char *key) const {
+    return {derived(), pybind11::str(key)};
+}
+template <typename D> obj_attr_accessor object_api<D>::attr(handle key) const {
+    return {derived(), reinterpret_borrow<object>(key)};
+}
+template <typename D> str_attr_accessor object_api<D>::attr(const char *key) const {
+    return {derived(), key};
+}
+template <typename D> args_proxy object_api<D>::operator*() const {
+    return args_proxy(derived().ptr());
+}
+template <typename D> template <typename T> bool object_api<D>::contains(T &&item) const {
+    return attr("__contains__")(std::forward<T>(item)).template cast<bool>();
+}
+
+template <typename D>
+pybind11::str object_api<D>::str() const { return pybind11::str(derived()); }
+
+template <typename D>
+str_attr_accessor object_api<D>::doc() const { return attr("__doc__"); }
+
+template <typename D>
+handle object_api<D>::get_type() const { return (PyObject *) Py_TYPE(derived().ptr()); }
+
+template <typename D>
+bool object_api<D>::rich_compare(object_api const &other, int value) const {
+    int rv = PyObject_RichCompareBool(derived().ptr(), other.derived().ptr(), value);
+    if (rv == -1)
+        throw error_already_set();
+    return rv == 1;
+}
+
+#define PYBIND11_MATH_OPERATOR_UNARY(op, fn) \
+    template <typename D> object object_api<D>::op() const { \
+        object result = reinterpret_steal<object>(fn(derived().ptr())); \
+        if (!result.ptr()) \
+            throw error_already_set(); \
+        return result; \
+    }
+
+#define PYBIND11_MATH_OPERATOR_BINARY(op, fn) \
+    template <typename D> \
+    object object_api<D>::op(object_api const &other) const { \
+        object result = reinterpret_steal<object>( \
+            fn(derived().ptr(), other.derived().ptr())); \
+        if (!result.ptr()) \
+            throw error_already_set(); \
+        return result; \
+    }
+
+PYBIND11_MATH_OPERATOR_UNARY (operator~,   PyNumber_Invert)
+PYBIND11_MATH_OPERATOR_UNARY (operator-,   PyNumber_Negative)
+PYBIND11_MATH_OPERATOR_BINARY(operator+,   PyNumber_Add)
+PYBIND11_MATH_OPERATOR_BINARY(operator+=,  PyNumber_InPlaceAdd)
+PYBIND11_MATH_OPERATOR_BINARY(operator-,   PyNumber_Subtract)
+PYBIND11_MATH_OPERATOR_BINARY(operator-=,  PyNumber_InPlaceSubtract)
+PYBIND11_MATH_OPERATOR_BINARY(operator*,   PyNumber_Multiply)
+PYBIND11_MATH_OPERATOR_BINARY(operator*=,  PyNumber_InPlaceMultiply)
+PYBIND11_MATH_OPERATOR_BINARY(operator/,   PyNumber_TrueDivide)
+PYBIND11_MATH_OPERATOR_BINARY(operator/=,  PyNumber_InPlaceTrueDivide)
+PYBIND11_MATH_OPERATOR_BINARY(operator|,   PyNumber_Or)
+PYBIND11_MATH_OPERATOR_BINARY(operator|=,  PyNumber_InPlaceOr)
+PYBIND11_MATH_OPERATOR_BINARY(operator&,   PyNumber_And)
+PYBIND11_MATH_OPERATOR_BINARY(operator&=,  PyNumber_InPlaceAnd)
+PYBIND11_MATH_OPERATOR_BINARY(operator^,   PyNumber_Xor)
+PYBIND11_MATH_OPERATOR_BINARY(operator^=,  PyNumber_InPlaceXor)
+PYBIND11_MATH_OPERATOR_BINARY(operator<<,  PyNumber_Lshift)
+PYBIND11_MATH_OPERATOR_BINARY(operator<<=, PyNumber_InPlaceLshift)
+PYBIND11_MATH_OPERATOR_BINARY(operator>>,  PyNumber_Rshift)
+PYBIND11_MATH_OPERATOR_BINARY(operator>>=, PyNumber_InPlaceRshift)
+
+#undef PYBIND11_MATH_OPERATOR_UNARY
+#undef PYBIND11_MATH_OPERATOR_BINARY
+
+NAMESPACE_END(detail)
+NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/cviruntime/python/include/pybind11/include/pybind11/stl.h b/cviruntime/python/include/pybind11/include/pybind11/stl.h
new file mode 100644
index 000000000..32f8d294a
--- /dev/null
+++ b/cviruntime/python/include/pybind11/include/pybind11/stl.h
@@ -0,0 +1,386 @@
+/*
+    pybind11/stl.h: Transparent conversion for STL data types
+
+    Copyright (c) 2016 Wenzel Jakob
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "pybind11.h"
+#include <set>
+#include <unordered_set>
+#include <map>
+#include <unordered_map>
+#include <iostream>
+#include <list>
+#include <deque>
+#include <valarray>
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
+#endif
+
+#ifdef __has_include
+// std::optional (but including it in c++14 mode isn't allowed)
+#  if defined(PYBIND11_CPP17) && __has_include(<optional>)
+#    include <optional>
+#    define PYBIND11_HAS_OPTIONAL 1
+#  endif
+// std::experimental::optional (but not allowed in c++11 mode)
+#  if defined(PYBIND11_CPP14) && (__has_include(<experimental/optional>) && \
+                                 !__has_include(<optional>))
+#    include <experimental/optional>
+#    define PYBIND11_HAS_EXP_OPTIONAL 1
+#  endif
+// std::variant
+#  if defined(PYBIND11_CPP17) && __has_include(<variant>)
+#    include <variant>
+#    define PYBIND11_HAS_VARIANT 1
+#  endif
+#elif defined(_MSC_VER) && defined(PYBIND11_CPP17)
+#  include <optional>
+#  include <variant>
+#  define PYBIND11_HAS_OPTIONAL 1
+#  define PYBIND11_HAS_VARIANT 1
+#endif
+
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+NAMESPACE_BEGIN(detail)
+
+/// Extracts an const lvalue reference or rvalue reference for U based on the type of T (e.g. for
+/// forwarding a container element).  Typically used indirect via forwarded_type(), below.
+template <typename T, typename U>
+using forwarded_type = conditional_t<
+    std::is_lvalue_reference<T>::value, remove_reference_t<U> &, remove_reference_t<U> &&>;
+
+/// Forwards a value U as rvalue or lvalue according to whether T is rvalue or lvalue; typically
+/// used for forwarding a container's elements.
+template <typename T, typename U>
+forwarded_type<T, U> forward_like(U &&u) {
+    return std::forward<forwarded_type<T, U>>(std::forward<U>(u));
+}
+
+template <typename Type, typename Key> struct set_caster {
+    using type = Type;
+    using key_conv = make_caster<Key>;
+
+    bool load(handle src, bool convert) {
+        if (!isinstance<pybind11::set>(src))
+            return false;
+        auto s = reinterpret_borrow<pybind11::set>(src);
+        value.clear();
+        for (auto entry : s) {
+            key_conv conv;
+            if (!conv.load(entry, convert))
+                return false;
+            value.insert(cast_op<Key &&>(std::move(conv)));
+        }
+        return true;
+    }
+
+    template <typename T>
+    static handle cast(T &&src, return_value_policy policy, handle parent) {
+        if (!std::is_lvalue_reference<T>::value)
+            policy = return_value_policy_override<Key>::policy(policy);
+        pybind11::set s;
+        for (auto &&value : src) {
+            auto value_ = reinterpret_steal<object>(key_conv::cast(forward_like<T>(value), policy, parent));
+            if (!value_ || !s.add(value_))
+                return handle();
+        }
+        return s.release();
+    }
+
+    PYBIND11_TYPE_CASTER(type, _("Set[") + key_conv::name + _("]"));
+};
+
+template <typename Type, typename Key, typename Value> struct map_caster {
+    using key_conv   = make_caster<Key>;
+    using value_conv = make_caster<Value>;
+
+    bool load(handle src, bool convert) {
+        if (!isinstance<dict>(src))
+            return false;
+        auto d = reinterpret_borrow<dict>(src);
+        value.clear();
+        for (auto it : d) {
+            key_conv kconv;
+            value_conv vconv;
+            if (!kconv.load(it.first.ptr(), convert) ||
+                !vconv.load(it.second.ptr(), convert))
+                return false;
+            value.emplace(cast_op<Key &&>(std::move(kconv)), cast_op<Value &&>(std::move(vconv)));
+        }
+        return true;
+    }
+
+    template <typename T>
+    static handle cast(T &&src, return_value_policy policy, handle parent) {
+        dict d;
+        return_value_policy policy_key = policy;
+        return_value_policy policy_value = policy;
+        if (!std::is_lvalue_reference<T>::value) {
+            policy_key = return_value_policy_override<Key>::policy(policy_key);
+            policy_value = return_value_policy_override<Value>::policy(policy_value);
+        }
+        for (auto &&kv : src) {
+            auto key = reinterpret_steal<object>(key_conv::cast(forward_like<T>(kv.first), policy_key, parent));
+            auto value = reinterpret_steal<object>(value_conv::cast(forward_like<T>(kv.second), policy_value, parent));
+            if (!key || !value)
+                return handle();
+            d[key] = value;
+        }
+        return d.release();
+    }
+
+    PYBIND11_TYPE_CASTER(Type, _("Dict[") + key_conv::name + _(", ") + value_conv::name + _("]"));
+};
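+
+// Illustrative sketch (added for this document, not upstream pybind11): with
+// this header included, dict/map conversion is implicit at binding boundaries
+// (module `m` and the function name are hypothetical):
+//
+//     m.def("invert", [](const std::map<std::string, int> &in) {
+//         std::map<int, std::string> out;
+//         for (auto &kv : in) out.emplace(kv.second, kv.first);
+//         return out;                        // returned via map_caster -> Python dict
+//     });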
+
+template <typename Type, typename Value> struct list_caster {
+    using value_conv = make_caster<Value>;
+
+    bool load(handle src, bool convert) {
+        if (!isinstance<sequence>(src) || isinstance<str>(src))
+            return false;
+        auto s = reinterpret_borrow<sequence>(src);
+        value.clear();
+        reserve_maybe(s, &value);
+        for (auto it : s) {
+            value_conv conv;
+            if (!conv.load(it, convert))
+                return false;
+            value.push_back(cast_op<Value &&>(std::move(conv)));
+        }
+        return true;
+    }
+
+private:
+    template <typename T = Type,
+              enable_if_t<std::is_same<decltype(std::declval<T>().reserve(0)), void>::value, int> = 0>
+    void reserve_maybe(sequence s, Type *) { value.reserve(s.size()); }
+    void reserve_maybe(sequence, void *) { }
+
+public:
+    template <typename T>
+    static handle cast(T &&src, return_value_policy policy, handle parent) {
+        if (!std::is_lvalue_reference<T>::value)
+            policy = return_value_policy_override<Value>::policy(policy);
+        list l(src.size());
+        size_t index = 0;
+        for (auto &&value : src) {
+            auto value_ = reinterpret_steal<object>(value_conv::cast(forward_like<T>(value), policy, parent));
+            if (!value_)
+                return handle();
+            PyList_SET_ITEM(l.ptr(), (ssize_t) index++, value_.release().ptr()); // steals a reference
+        }
+        return l.release();
+    }
+
+    PYBIND11_TYPE_CASTER(Type, _("List[") + value_conv::name + _("]"));
+};
+
+template <typename Type, typename Alloc> struct type_caster<std::vector<Type, Alloc>>
+ : list_caster<std::vector<Type, Alloc>, Type> { };
+
+template <typename Type, typename Alloc> struct type_caster<std::deque<Type, Alloc>>
+ : list_caster<std::deque<Type, Alloc>, Type> { };
+
+template <typename Type, typename Alloc> struct type_caster<std::list<Type, Alloc>>
+ : list_caster<std::list<Type, Alloc>, Type> { };
+
+template <typename ArrayType, typename Value, bool Resizable, size_t Size = 0> struct array_caster {
+    using value_conv = make_caster<Value>;
+
+private:
+    template <bool R = Resizable>
+    bool require_size(enable_if_t<R, size_t> size) {
+        if (value.size() != size)
+            value.resize(size);
+        return true;
+    }
+    template <bool R = Resizable>
+    bool require_size(enable_if_t<!R, size_t> size) {
+        return size == Size;
+    }
+
+public:
+    bool load(handle src, bool convert) {
+        if (!isinstance<sequence>(src))
+            return false;
+        auto l = reinterpret_borrow<sequence>(src);
+        if (!require_size(l.size()))
+            return false;
+        size_t ctr = 0;
+        for (auto it : l) {
+            value_conv conv;
+            if (!conv.load(it, convert))
+                return false;
+            value[ctr++] = cast_op<Value &&>(std::move(conv));
+        }
+        return true;
+    }
+
+    template <typename T>
+    static handle cast(T &&src, return_value_policy policy, handle parent) {
+        list l(src.size());
+        size_t index = 0;
+        for (auto &&value : src) {
+            auto value_ = reinterpret_steal<object>(value_conv::cast(forward_like<T>(value), policy, parent));
+            if (!value_)
+                return handle();
+            PyList_SET_ITEM(l.ptr(), (ssize_t) index++, value_.release().ptr()); // steals a reference
+        }
+        return l.release();
+    }
+
+    PYBIND11_TYPE_CASTER(ArrayType, _("List[") + value_conv::name + _<Resizable>(_(""), _("[") + _<Size>() + _("]")) + _("]"));
+};
+
+template <typename Type, size_t Size> struct type_caster<std::array<Type, Size>>
+ : array_caster<std::array<Type, Size>, Type, false, Size> { };
+
+template <typename Type> struct type_caster<std::valarray<Type>>
+ : array_caster<std::valarray<Type>, Type, true> { };
+
+template <typename Key, typename Compare, typename Alloc> struct type_caster<std::set<Key, Compare, Alloc>>
+  : set_caster<std::set<Key, Compare, Alloc>, Key> { };
+
+template <typename Key, typename Hash, typename Equal, typename Alloc> struct type_caster<std::unordered_set<Key, Hash, Equal, Alloc>>
+  : set_caster<std::unordered_set<Key, Hash, Equal, Alloc>, Key> { };
+
+template <typename Key, typename Value, typename Compare, typename Alloc> struct type_caster<std::map<Key, Value, Compare, Alloc>>
+  : map_caster<std::map<Key, Value, Compare, Alloc>, Key, Value> { };
+
+template <typename Key, typename Value, typename Hash, typename Equal, typename Alloc> struct type_caster<std::unordered_map<Key, Value, Hash, Equal, Alloc>>
+  : map_caster<std::unordered_map<Key, Value, Hash, Equal, Alloc>, Key, Value> { };
+
+// This type caster is intended to be used for std::optional and std::experimental::optional
+template<typename T> struct optional_caster {
+    using value_conv = make_caster<typename T::value_type>;
+
+    template <typename T_>
+    static handle cast(T_ &&src, return_value_policy policy, handle parent) {
+        if (!src)
+            return none().inc_ref();
+        policy = return_value_policy_override<typename T::value_type>::policy(policy);
+        return value_conv::cast(*std::forward<T_>(src), policy, parent);
+    }
+
+    bool load(handle src, bool convert) {
+        if (!src) {
+            return false;
+        } else if (src.is_none()) {
+            return true;  // default-constructed value is already empty
+        }
+        value_conv inner_caster;
+        if (!inner_caster.load(src, convert))
+            return false;
+
+        value.emplace(cast_op<typename T::value_type &&>(std::move(inner_caster)));
+        return true;
+    }
+
+    PYBIND11_TYPE_CASTER(T, _("Optional[") + value_conv::name + _("]"));
+};
+
+#if PYBIND11_HAS_OPTIONAL
+template<typename T> struct type_caster<std::optional<T>>
+    : public optional_caster<std::optional<T>> {};
+
+template<> struct type_caster<std::nullopt_t>
+    : public void_caster<std::nullopt_t> {};
+#endif
+
+#if PYBIND11_HAS_EXP_OPTIONAL
+template<typename T> struct type_caster<std::experimental::optional<T>>
+    : public optional_caster<std::experimental::optional<T>> {};
+
+template<> struct type_caster<std::experimental::nullopt_t>
+    : public void_caster<std::experimental::nullopt_t> {};
+#endif
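+
+// Illustrative sketch (added for this document, not upstream pybind11):
+// assuming C++17, None round-trips as an empty optional through the casters
+// above (module `m` and the function name are hypothetical):
+//
+//     m.def("maybe_half", [](std::optional<int> v) -> std::optional<int> {
+//         if (!v) return std::nullopt;   // surfaces in Python as None
+//         return *v / 2;
+//     });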
+
+/// Visit a variant and cast any found type to Python
+struct variant_caster_visitor {
+    return_value_policy policy;
+    handle parent;
+
+    using result_type = handle; // required by boost::variant in C++11
+
+    template <typename T>
+    result_type operator()(T &&src) const {
+        return make_caster<T>::cast(std::forward<T>(src), policy, parent);
+    }
+};
+
+/// Helper class which abstracts away variant's `visit` function. `std::variant` and similar
+/// `namespace::variant` types which provide a `namespace::visit()` function are handled here
+/// automatically using argument-dependent lookup. Users can provide specializations for other
+/// variant-like classes, e.g. `boost::variant` and `boost::apply_visitor`.
+template <template<typename...> class Variant>
+struct visit_helper {
+    template <typename... Args>
+    static auto call(Args &&...args) -> decltype(visit(std::forward<Args>(args)...)) {
+        return visit(std::forward<Args>(args)...);
+    }
+};
+
+/// Generic variant caster
+template <typename Variant> struct variant_caster;
+
+template <template<typename...> class V, typename... Ts>
+struct variant_caster<V<Ts...>> {
+    static_assert(sizeof...(Ts) > 0, "Variant must consist of at least one alternative.");
+
+    template <typename U, typename... Us>
+    bool load_alternative(handle src, bool convert, type_list<U, Us...>) {
+        auto caster = make_caster<U>();
+        if (caster.load(src, convert)) {
+            value = cast_op<U>(caster);
+            return true;
+        }
+        return load_alternative(src, convert, type_list<Us...>{});
+    }
+
+    bool load_alternative(handle, bool, type_list<>) { return false; }
+
+    bool load(handle src, bool convert) {
+        // Do a first pass without conversions to improve constructor resolution.
+        // E.g. `py::int_(1).cast<variant<double, int>>()` needs to fill the `int`
+        // slot of the variant. Without two-pass loading `double` would be filled
+        // because it appears first and a conversion is possible.
+        if (convert && load_alternative(src, false, type_list<Ts...>{}))
+            return true;
+        return load_alternative(src, convert, type_list<Ts...>{});
+    }
+
+    template <typename Variant>
+    static handle cast(Variant &&src, return_value_policy policy, handle parent) {
+        return visit_helper<V>::call(variant_caster_visitor{policy, parent},
+                                     std::forward<Variant>(src));
+    }
+
+    using Type = V<Ts...>;
+    PYBIND11_TYPE_CASTER(Type, _("Union[") + detail::concat(make_caster<Ts>::name...) + _("]"));
+};
+
+#if PYBIND11_HAS_VARIANT
+template <typename... Ts>
+struct type_caster<std::variant<Ts...>> : variant_caster<std::variant<Ts...>> { };
+#endif
+
+NAMESPACE_END(detail)
+
+inline std::ostream &operator<<(std::ostream &os, const handle &obj) {
+    os << (std::string) str(obj);
+    return os;
+}
+
+NAMESPACE_END(PYBIND11_NAMESPACE)
+
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
diff --git a/cviruntime/python/include/pybind11/include/pybind11/stl_bind.h b/cviruntime/python/include/pybind11/include/pybind11/stl_bind.h
new file mode 100644
index 000000000..da233eca9
--- /dev/null
+++ b/cviruntime/python/include/pybind11/include/pybind11/stl_bind.h
@@ -0,0 +1,656 @@
+/*
+    pybind11/std_bind.h: Binding generators for STL data types
+
+    Copyright (c) 2016 Sergey Lyskov and Wenzel Jakob
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "detail/common.h"
+#include "operators.h"
+
+#include <algorithm>
+#include <sstream>
+
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+NAMESPACE_BEGIN(detail)
+
+/* SFINAE helper class used by 'is_comparable */
+template <typename T>  struct container_traits {
+    template <typename T2> static std::true_type test_comparable(decltype(std::declval<const T2 &>() == std::declval<const T2 &>())*);
+    template <typename T2> static std::false_type test_comparable(...);
+    template <typename T2> static std::true_type test_value(typename T2::value_type *);
+    template <typename T2> static std::false_type test_value(...);
+    template <typename T2> static std::true_type test_pair(typename T2::first_type *, typename T2::second_type *);
+    template <typename T2> static std::false_type test_pair(...);
+
+    static constexpr const bool is_comparable = std::is_same<std::true_type, decltype(test_comparable<T>(nullptr))>::value;
+    static constexpr const bool is_pair = std::is_same<std::true_type, decltype(test_pair<T>(nullptr, nullptr))>::value;
+    static constexpr const bool is_vector = std::is_same<std::true_type, decltype(test_value<T>(nullptr))>::value;
+    static constexpr const bool is_element = !is_pair && !is_vector;
+};
+
+/* Default: is_comparable -> std::false_type */
+template <typename T, typename SFINAE = void>
+struct is_comparable : std::false_type { };
+
+/* For non-map data structures, check whether operator== can be instantiated */
+template <typename T>
+struct is_comparable<
+    T, enable_if_t<container_traits<T>::is_element &&
+                   container_traits<T>::is_comparable>>
+    : std::true_type { };
+
+/* For a vector/map data structure, recursively check the value type (which is std::pair for maps) */
+template <typename T>
+struct is_comparable<T, enable_if_t<container_traits<T>::is_vector>> {
+    static constexpr const bool value =
+        is_comparable<typename T::value_type>::value;
+};
+
+/* For pairs, recursively check the two data types */
+template <typename T>
+struct is_comparable<T, enable_if_t<container_traits<T>::is_pair>> {
+    static constexpr const bool value =
+        is_comparable<typename T::first_type>::value &&
+        is_comparable<typename T::second_type>::value;
+};
+
+/* Fallback functions */
+template <typename, typename, typename... Args> void vector_if_copy_constructible(const Args &...) { }
+template <typename, typename, typename... Args> void vector_if_equal_operator(const Args &...) { }
+template <typename, typename, typename... Args> void vector_if_insertion_operator(const Args &...) { }
+template <typename, typename, typename... Args> void vector_modifiers(const Args &...) { }
+
+template <typename Vector, typename Class_>
+void vector_if_copy_constructible(enable_if_t<is_copy_constructible<Vector>::value, Class_> &cl) {
+    cl.def(init<const Vector &>(), "Copy constructor");
+}
+
+template <typename Vector, typename Class_>
+void vector_if_equal_operator(enable_if_t<is_comparable<Vector>::value, Class_> &cl) {
+    using T = typename Vector::value_type;
+
+    cl.def(self == self);
+    cl.def(self != self);
+
+    cl.def("count",
+        [](const Vector &v, const T &x) {
+            return std::count(v.begin(), v.end(), x);
+        },
+        arg("x"),
+        "Return the number of times ``x`` appears in the list"
+    );
+
+    cl.def("remove", [](Vector &v, const T &x) {
+            auto p = std::find(v.begin(), v.end(), x);
+            if (p != v.end())
+                v.erase(p);
+            else
+                throw value_error();
+        },
+        arg("x"),
+        "Remove the first item from the list whose value is x. "
+        "It is an error if there is no such item."
+    );
+
+    cl.def("__contains__",
+        [](const Vector &v, const T &x) {
+            return std::find(v.begin(), v.end(), x) != v.end();
+        },
+        arg("x"),
+        "Return true the container contains ``x``"
+    );
+}
+
+// Vector modifiers -- requires a copyable vector_type:
+// (Technically, some of these (pop and __delitem__) don't actually require copyability, but it seems
+// silly to allow deletion but not insertion, so include them here too.)
+template <typename Vector, typename Class_>
+void vector_modifiers(enable_if_t<is_copy_constructible<typename Vector::value_type>::value, Class_> &cl) {
+    using T = typename Vector::value_type;
+    using SizeType = typename Vector::size_type;
+    using DiffType = typename Vector::difference_type;
+
+    auto wrap_i = [](DiffType i, SizeType n) {
+        if (i < 0)
+            i += n;
+        if (i < 0 || (SizeType)i >= n)
+            throw index_error();
+        return i;
+    };
+
+    cl.def("append",
+           [](Vector &v, const T &value) { v.push_back(value); },
+           arg("x"),
+           "Add an item to the end of the list");
+
+    cl.def(init([](iterable it) {
+        auto v = std::unique_ptr<Vector>(new Vector());
+        v->reserve(len_hint(it));
+        for (handle h : it)
+           v->push_back(h.cast<T>());
+        return v.release();
+    }));
+
+    cl.def("clear",
+        [](Vector &v) {
+            v.clear();
+        },
+        "Clear the contents"
+    );
+
+    cl.def("extend",
+       [](Vector &v, const Vector &src) {
+           v.insert(v.end(), src.begin(), src.end());
+       },
+       arg("L"),
+       "Extend the list by appending all the items in the given list"
+    );
+
+    cl.def("extend",
+       [](Vector &v, iterable it) {
+           const size_t old_size = v.size();
+           v.reserve(old_size + len_hint(it));
+           try {
+               for (handle h : it) {
+                   v.push_back(h.cast<T>());
+               }
+           } catch (const cast_error &) {
+               v.erase(v.begin() + static_cast<DiffType>(old_size), v.end());
+               try {
+                   v.shrink_to_fit();
+               } catch (const std::exception &) {
+                   // Do nothing
+               }
+               throw;
+           }
+       },
+       arg("L"),
+       "Extend the list by appending all the items in the given list"
+    );
+
+    cl.def("insert",
+        [](Vector &v, DiffType i, const T &x) {
+            // Can't use wrap_i; i == v.size() is OK
+            if (i < 0)
+                i += v.size();
+            if (i < 0 || (SizeType)i > v.size())
+                throw index_error();
+            v.insert(v.begin() + i, x);
+        },
+        arg("i") , arg("x"),
+        "Insert an item at a given position."
+    );
+
+    cl.def("pop",
+        [](Vector &v) {
+            if (v.empty())
+                throw index_error();
+            T t = v.back();
+            v.pop_back();
+            return t;
+        },
+        "Remove and return the last item"
+    );
+
+    cl.def("pop",
+        [wrap_i](Vector &v, DiffType i) {
+            i = wrap_i(i, v.size());
+            T t = v[(SizeType) i];
+            v.erase(v.begin() + i);
+            return t;
+        },
+        arg("i"),
+        "Remove and return the item at index ``i``"
+    );
+
+    cl.def("__setitem__",
+        [wrap_i](Vector &v, DiffType i, const T &t) {
+            i = wrap_i(i, v.size());
+            v[(SizeType)i] = t;
+        }
+    );
+
+    /// Slicing protocol
+    cl.def("__getitem__",
+        [](const Vector &v, slice slice) -> Vector * {
+            size_t start, stop, step, slicelength;
+
+            if (!slice.compute(v.size(), &start, &stop, &step, &slicelength))
+                throw error_already_set();
+
+            Vector *seq = new Vector();
+            seq->reserve((size_t) slicelength);
+
+            for (size_t i=0; i<slicelength; ++i) {
+                seq->push_back(v[start]);
+                start += step;
+            }
+            return seq;
+        },
+        arg("s"),
+        "Retrieve list elements using a slice object"
+    );
+
+    cl.def("__setitem__",
+        [](Vector &v, slice slice, const Vector &value) {
+            size_t start, stop, step, slicelength;
+            if (!slice.compute(v.size(), &start, &stop, &step, &slicelength))
+                throw error_already_set();
+
+            if (slicelength != value.size())
+                throw std::runtime_error("Left and right hand size of slice assignment have different sizes!");
+
+            for (size_t i=0; i<slicelength; ++i) {
+                v[start] = value[i];
+                start += step;
+            }
+        },
+        "Assign list elements using a slice object"
+    );
+
+    cl.def("__delitem__",
+        [wrap_i](Vector &v, DiffType i) {
+            i = wrap_i(i, v.size());
+            v.erase(v.begin() + i);
+        },
+        "Delete the list elements at index ``i``"
+    );
+
+    cl.def("__delitem__",
+        [](Vector &v, slice slice) {
+            size_t start, stop, step, slicelength;
+
+            if (!slice.compute(v.size(), &start, &stop, &step, &slicelength))
+                throw error_already_set();
+
+            for (size_t i = 0; i < slicelength; ++i) {
+                v.erase(v.begin() + DiffType(start));
+                start += step - 1;
+            }
+        },
+        "Delete list elements using a slice object"
+    );
+}
+
+// If the type has an operator[] that doesn't return a reference (e.g. std::vector<bool>),
+// we have to access by copying; otherwise we return by reference.
+template <typename Vector> using vector_needs_copy = negation<
+    std::is_same<decltype(std::declval<Vector>()[typename Vector::size_type()]), typename Vector::value_type &>>;
+
+// The usual case: access and iterate by reference
+template <typename Vector, typename Class_>
+void vector_accessor(enable_if_t<!vector_needs_copy<Vector>::value, Class_> &cl) {
+    using T = typename Vector::value_type;
+    using SizeType = typename Vector::size_type;
+    using DiffType = typename Vector::difference_type;
+    using ItType   = typename Vector::iterator;
+
+    auto wrap_i = [](DiffType i, SizeType n) {
+        if (i < 0)
+            i += n;
+        if (i < 0 || (SizeType)i >= n)
+            throw index_error();
+        return i;
+    };
+
+    cl.def("__getitem__",
+        [wrap_i](Vector &v, DiffType i) -> T & {
+            i = wrap_i(i, v.size());
+            return v[(SizeType)i];
+        },
+        return_value_policy::reference_internal // ref + keepalive
+    );
+
+    cl.def("__iter__",
+           [](Vector &v) {
+               return make_iterator<
+                   return_value_policy::reference_internal, ItType, ItType, T&>(
+                   v.begin(), v.end());
+           },
+           keep_alive<0, 1>() /* Essential: keep list alive while iterator exists */
+    );
+}
+
+// The case for special objects, like std::vector<bool>, that have to be returned-by-copy:
+template <typename Vector, typename Class_>
+void vector_accessor(enable_if_t<vector_needs_copy<Vector>::value, Class_> &cl) {
+    using T = typename Vector::value_type;
+    using SizeType = typename Vector::size_type;
+    using DiffType = typename Vector::difference_type;
+    using ItType   = typename Vector::iterator;
+    cl.def("__getitem__",
+        [](const Vector &v, DiffType i) -> T {
+            if (i < 0 && (i += v.size()) < 0)
+                throw index_error();
+            if ((SizeType)i >= v.size())
+                throw index_error();
+            return v[(SizeType)i];
+        }
+    );
+
+    cl.def("__iter__",
+           [](Vector &v) {
+               return make_iterator<
+                   return_value_policy::copy, ItType, ItType, T>(
+                   v.begin(), v.end());
+           },
+           keep_alive<0, 1>() /* Essential: keep list alive while iterator exists */
+    );
+}
+
+template <typename Vector, typename Class_> auto vector_if_insertion_operator(Class_ &cl, std::string const &name)
+    -> decltype(std::declval<std::ostream&>() << std::declval<typename Vector::value_type>(), void()) {
+    using size_type = typename Vector::size_type;
+
+    cl.def("__repr__",
+           [name](Vector &v) {
+            std::ostringstream s;
+            s << name << '[';
+            for (size_type i=0; i < v.size(); ++i) {
+                s << v[i];
+                if (i != v.size() - 1)
+                    s << ", ";
+            }
+            s << ']';
+            return s.str();
+        },
+        "Return the canonical string representation of this list."
+    );
+}
+
+// Provide the buffer interface for vectors if we have data() and we have a format for it
+// GCC seems to have "void std::vector<bool>::data()" - doing SFINAE on the existence of data() is insufficient, we need to check it returns an appropriate pointer
+template <typename Vector, typename = void>
+struct vector_has_data_and_format : std::false_type {};
+template <typename Vector>
+struct vector_has_data_and_format<Vector, enable_if_t<std::is_same<decltype(format_descriptor<typename Vector::value_type>::format(), std::declval<Vector>().data()), typename Vector::value_type*>::value>> : std::true_type {};
+
+// Add the buffer interface to a vector
+template <typename Vector, typename Class_, typename... Args>
+enable_if_t<detail::any_of<std::is_same<Args, buffer_protocol>...>::value>
+vector_buffer(Class_& cl) {
+    using T = typename Vector::value_type;
+
+    static_assert(vector_has_data_and_format<Vector>::value, "There is not an appropriate format descriptor for this vector");
+
+    // numpy.h declares this for arbitrary types, but it may raise an exception and crash hard at runtime if PYBIND11_NUMPY_DTYPE hasn't been called, so check here
+    format_descriptor<T>::format();
+
+    cl.def_buffer([](Vector& v) -> buffer_info {
+        return buffer_info(v.data(), static_cast<ssize_t>(sizeof(T)), format_descriptor<T>::format(), 1, {v.size()}, {sizeof(T)});
+    });
+
+    cl.def(init([](buffer buf) {
+        auto info = buf.request();
+        if (info.ndim != 1 || info.strides[0] % static_cast<ssize_t>(sizeof(T)))
+            throw type_error("Only valid 1D buffers can be copied to a vector");
+        if (!detail::compare_buffer_info<T>::compare(info) || (ssize_t) sizeof(T) != info.itemsize)
+            throw type_error("Format mismatch (Python: " + info.format + " C++: " + format_descriptor<T>::format() + ")");
+
+        auto vec = std::unique_ptr<Vector>(new Vector());
+        vec->reserve((size_t) info.shape[0]);
+        T *p = static_cast<T*>(info.ptr);
+        ssize_t step = info.strides[0] / static_cast<ssize_t>(sizeof(T));
+        T *end = p + info.shape[0] * step;
+        for (; p != end; p += step)
+            vec->push_back(*p);
+        return vec.release();
+    }));
+
+    return;
+}
+
+template <typename Vector, typename Class_, typename... Args>
+enable_if_t<!detail::any_of<std::is_same<Args, buffer_protocol>...>::value> vector_buffer(Class_&) {}
+
+NAMESPACE_END(detail)
+
+//
+// std::vector
+//
+template <typename Vector, typename holder_type = std::unique_ptr<Vector>, typename... Args>
+class_<Vector, holder_type> bind_vector(handle scope, std::string const &name, Args&&... args) {
+    using Class_ = class_<Vector, holder_type>;
+
+    // If the value_type is unregistered (e.g. a converting type) or is itself registered
+    // module-local then make the vector binding module-local as well:
+    using vtype = typename Vector::value_type;
+    auto vtype_info = detail::get_type_info(typeid(vtype));
+    bool local = !vtype_info || vtype_info->module_local;
+
+    Class_ cl(scope, name.c_str(), pybind11::module_local(local), std::forward<Args>(args)...);
+
+    // Declare the buffer interface if a buffer_protocol() is passed in
+    detail::vector_buffer<Vector, Class_, Args...>(cl);
+
+    cl.def(init<>());
+
+    // Register copy constructor (if possible)
+    detail::vector_if_copy_constructible<Vector, Class_>(cl);
+
+    // Register comparison-related operators and functions (if possible)
+    detail::vector_if_equal_operator<Vector, Class_>(cl);
+
+    // Register stream insertion operator (if possible)
+    detail::vector_if_insertion_operator<Vector, Class_>(cl, name);
+
+    // Modifiers require copyable vector value type
+    detail::vector_modifiers<Vector, Class_>(cl);
+
+    // Accessor and iterator; return by value if copyable, otherwise we return by ref + keep-alive
+    detail::vector_accessor<Vector, Class_>(cl);
+
+    cl.def("__bool__",
+        [](const Vector &v) -> bool {
+            return !v.empty();
+        },
+        "Check whether the list is nonempty"
+    );
+
+    cl.def("__len__", &Vector::size);
+
+
+
+
+#if 0
+    // C++ style functions deprecated, leaving it here as an example
+    cl.def(init<size_type>());
+
+    cl.def("resize",
+         (void (Vector::*) (size_type count)) & Vector::resize,
+         "changes the number of elements stored");
+
+    cl.def("erase",
+        [](Vector &v, SizeType i) {
+        if (i >= v.size())
+            throw index_error();
+        v.erase(v.begin() + i);
+    }, "erases element at index ``i``");
+
+    cl.def("empty", &Vector::empty, "checks whether the container is empty");
+    cl.def("size", &Vector::size, "returns the number of elements");
+    cl.def("push_back", (void (Vector::*)(const T&)) &Vector::push_back, "adds an element to the end");
+    cl.def("pop_back", &Vector::pop_back, "removes the last element");
+
+    cl.def("max_size", &Vector::max_size, "returns the maximum possible number of elements");
+    cl.def("reserve", &Vector::reserve, "reserves storage");
+    cl.def("capacity", &Vector::capacity, "returns the number of elements that can be held in currently allocated storage");
+    cl.def("shrink_to_fit", &Vector::shrink_to_fit, "reduces memory usage by freeing unused memory");
+
+    cl.def("clear", &Vector::clear, "clears the contents");
+    cl.def("swap", &Vector::swap, "swaps the contents");
+
+    cl.def("front", [](Vector &v) {
+        if (v.size()) return v.front();
+        else throw index_error();
+    }, "access the first element");
+
+    cl.def("back", [](Vector &v) {
+        if (v.size()) return v.back();
+        else throw index_error();
+    }, "access the last element ");
+
+#endif
+
+    return cl;
+}
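+
+// Illustrative sketch (added for this document, not upstream pybind11): a
+// typical opaque binding built on bind_vector (module name hypothetical):
+//
+//     PYBIND11_MAKE_OPAQUE(std::vector<int>);
+//     PYBIND11_MODULE(example, m) {
+//         py::bind_vector<std::vector<int>>(m, "IntVector", py::buffer_protocol());
+//     }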
+
+
+
+//
+// std::map, std::unordered_map
+//
+
+NAMESPACE_BEGIN(detail)
+
+/* Fallback functions */
+template <typename, typename, typename... Args> void map_if_insertion_operator(const Args &...) { }
+template <typename, typename, typename... Args> void map_assignment(const Args &...) { }
+
+// Map assignment when copy-assignable: just copy the value
+template <typename Map, typename Class_>
+void map_assignment(enable_if_t<is_copy_assignable<typename Map::mapped_type>::value, Class_> &cl) {
+    using KeyType = typename Map::key_type;
+    using MappedType = typename Map::mapped_type;
+
+    cl.def("__setitem__",
+           [](Map &m, const KeyType &k, const MappedType &v) {
+               auto it = m.find(k);
+               if (it != m.end()) it->second = v;
+               else m.emplace(k, v);
+           }
+    );
+}
+
+// Not copy-assignable, but still copy-constructible: we can update the value by erasing and reinserting
+template<typename Map, typename Class_>
+void map_assignment(enable_if_t<
+        !is_copy_assignable<typename Map::mapped_type>::value &&
+        is_copy_constructible<typename Map::mapped_type>::value,
+        Class_> &cl) {
+    using KeyType = typename Map::key_type;
+    using MappedType = typename Map::mapped_type;
+
+    cl.def("__setitem__",
+           [](Map &m, const KeyType &k, const MappedType &v) {
+               // We can't use m[k] = v; because value type might not be default constructable
+               auto r = m.emplace(k, v);
+               if (!r.second) {
+                   // value type is not copy assignable so the only way to insert it is to erase it first...
+                   m.erase(r.first);
+                   m.emplace(k, v);
+               }
+           }
+    );
+}
+
+
+template <typename Map, typename Class_> auto map_if_insertion_operator(Class_ &cl, std::string const &name)
+-> decltype(std::declval<std::ostream&>() << std::declval<typename Map::key_type>() << std::declval<typename Map::mapped_type>(), void()) {
+
+    cl.def("__repr__",
+           [name](Map &m) {
+            std::ostringstream s;
+            s << name << '{';
+            bool f = false;
+            for (auto const &kv : m) {
+                if (f)
+                    s << ", ";
+                s << kv.first << ": " << kv.second;
+                f = true;
+            }
+            s << '}';
+            return s.str();
+        },
+        "Return the canonical string representation of this map."
+    );
+}
+
+
+NAMESPACE_END(detail)
+
+template <typename Map, typename holder_type = std::unique_ptr<Map>, typename... Args>
+class_<Map, holder_type> bind_map(handle scope, const std::string &name, Args&&... args) {
+    using KeyType = typename Map::key_type;
+    using MappedType = typename Map::mapped_type;
+    using Class_ = class_<Map, holder_type>;
+
+    // If either type is a non-module-local bound type then make the map binding non-local as well;
+    // otherwise (e.g. both types are either module-local or converting) the map will be
+    // module-local.
+    auto tinfo = detail::get_type_info(typeid(MappedType));
+    bool local = !tinfo || tinfo->module_local;
+    if (local) {
+        tinfo = detail::get_type_info(typeid(KeyType));
+        local = !tinfo || tinfo->module_local;
+    }
+
+    Class_ cl(scope, name.c_str(), pybind11::module_local(local), std::forward<Args>(args)...);
+
+    cl.def(init<>());
+
+    // Register stream insertion operator (if possible)
+    detail::map_if_insertion_operator<Map, Class_>(cl, name);
+
+    cl.def("__bool__",
+        [](const Map &m) -> bool { return !m.empty(); },
+        "Check whether the map is nonempty"
+    );
+
+    cl.def("__iter__",
+           [](Map &m) { return make_key_iterator(m.begin(), m.end()); },
+           keep_alive<0, 1>() /* Essential: keep list alive while iterator exists */
+    );
+
+    cl.def("items",
+           [](Map &m) { return make_iterator(m.begin(), m.end()); },
+           keep_alive<0, 1>() /* Essential: keep list alive while iterator exists */
+    );
+
+    cl.def("__getitem__",
+        [](Map &m, const KeyType &k) -> MappedType & {
+            auto it = m.find(k);
+            if (it == m.end())
+                throw key_error();
+            return it->second;
+        },
+        return_value_policy::reference_internal // ref + keepalive
+    );
+
+    cl.def("__contains__",
+        [](Map &m, const KeyType &k) -> bool {
+            auto it = m.find(k);
+            if (it == m.end())
+                return false;
+            return true;
+        }
+    );
+
+    // Assignment provided only if the type is copyable
+    detail::map_assignment<Map, Class_>(cl);
+
+    cl.def("__delitem__",
+           [](Map &m, const KeyType &k) {
+               auto it = m.find(k);
+               if (it == m.end())
+                   throw key_error();
+               m.erase(it);
+           }
+    );
+
+    cl.def("__len__", &Map::size);
+
+    return cl;
+}
+
+NAMESPACE_END(PYBIND11_NAMESPACE)
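+
+// Illustrative sketch (added for this document, not upstream pybind11): binding
+// a concrete map type with bind_map (module name hypothetical):
+//
+//     PYBIND11_MAKE_OPAQUE(std::map<std::string, double>);
+//     PYBIND11_MODULE(example, m) {
+//         py::bind_map<std::map<std::string, double>>(m, "StringDoubleMap");
+//     }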
diff --git a/cviruntime/python/include/pybind11/tools/FindCatch.cmake b/cviruntime/python/include/pybind11/tools/FindCatch.cmake
new file mode 100644
index 000000000..9d490c5aa
--- /dev/null
+++ b/cviruntime/python/include/pybind11/tools/FindCatch.cmake
@@ -0,0 +1,57 @@
+# - Find the Catch test framework or download it (single header)
+#
+# This is a quick module for internal use. It assumes that Catch is
+# REQUIRED and that a minimum version is provided (not EXACT). If
+# a suitable version isn't found locally, the single header file
+# will be downloaded and placed in the build dir: PROJECT_BINARY_DIR.
+#
+# This code sets the following variables:
+#  CATCH_INCLUDE_DIR - path to catch.hpp
+#  CATCH_VERSION - version number
+
+if(NOT Catch_FIND_VERSION)
+  message(FATAL_ERROR "A version number must be specified.")
+elseif(Catch_FIND_REQUIRED)
+  message(FATAL_ERROR "This module assumes Catch is not required.")
+elseif(Catch_FIND_VERSION_EXACT)
+  message(FATAL_ERROR "Exact version numbers are not supported, only minimum.")
+endif()
+
+# Extract the version number from catch.hpp
+function(_get_catch_version)
+  file(STRINGS "${CATCH_INCLUDE_DIR}/catch.hpp" version_line REGEX "Catch v.*" LIMIT_COUNT 1)
+  if(version_line MATCHES "Catch v([0-9]+)\\.([0-9]+)\\.([0-9]+)")
+    set(CATCH_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}" PARENT_SCOPE)
+  endif()
+endfunction()
+
+# Download the single-header version of Catch
+function(_download_catch version destination_dir)
+  message(STATUS "Downloading catch v${version}...")
+  set(url https://github.com/philsquared/Catch/releases/download/v${version}/catch.hpp)
+  file(DOWNLOAD ${url} "${destination_dir}/catch.hpp" STATUS status)
+  list(GET status 0 error)
+  if(error)
+    message(FATAL_ERROR "Could not download ${url}")
+  endif()
+  set(CATCH_INCLUDE_DIR "${destination_dir}" CACHE INTERNAL "")
+endfunction()
+
+# Look for catch locally
+find_path(CATCH_INCLUDE_DIR NAMES catch.hpp PATH_SUFFIXES catch)
+if(CATCH_INCLUDE_DIR)
+  _get_catch_version()
+endif()
+
+# Download the header if it wasn't found or if it's outdated
+if(NOT CATCH_VERSION OR CATCH_VERSION VERSION_LESS ${Catch_FIND_VERSION})
+  if(DOWNLOAD_CATCH)
+    _download_catch(${Catch_FIND_VERSION} "${PROJECT_BINARY_DIR}/catch/")
+    _get_catch_version()
+  else()
+    set(CATCH_FOUND FALSE)
+    return()
+  endif()
+endif()
+
+set(CATCH_FOUND TRUE)
diff --git a/cviruntime/python/include/pybind11/tools/FindEigen3.cmake b/cviruntime/python/include/pybind11/tools/FindEigen3.cmake
new file mode 100644
index 000000000..9c546a05d
--- /dev/null
+++ b/cviruntime/python/include/pybind11/tools/FindEigen3.cmake
@@ -0,0 +1,81 @@
+# - Try to find Eigen3 lib
+#
+# This module supports requiring a minimum version, e.g. you can do
+#   find_package(Eigen3 3.1.2)
+# to require version 3.1.2 or newer of Eigen3.
+#
+# Once done this will define
+#
+#  EIGEN3_FOUND - system has eigen lib with correct version
+#  EIGEN3_INCLUDE_DIR - the eigen include directory
+#  EIGEN3_VERSION - eigen version
+
+# Copyright (c) 2006, 2007 Montel Laurent,
+# Copyright (c) 2008, 2009 Gael Guennebaud,
+# Copyright (c) 2009 Benoit Jacob
+# Redistribution and use is allowed according to the terms of the 2-clause BSD license.
+
+if(NOT Eigen3_FIND_VERSION)
+  if(NOT Eigen3_FIND_VERSION_MAJOR)
+    set(Eigen3_FIND_VERSION_MAJOR 2)
+  endif(NOT Eigen3_FIND_VERSION_MAJOR)
+  if(NOT Eigen3_FIND_VERSION_MINOR)
+    set(Eigen3_FIND_VERSION_MINOR 91)
+  endif(NOT Eigen3_FIND_VERSION_MINOR)
+  if(NOT Eigen3_FIND_VERSION_PATCH)
+    set(Eigen3_FIND_VERSION_PATCH 0)
+  endif(NOT Eigen3_FIND_VERSION_PATCH)
+
+  set(Eigen3_FIND_VERSION "${Eigen3_FIND_VERSION_MAJOR}.${Eigen3_FIND_VERSION_MINOR}.${Eigen3_FIND_VERSION_PATCH}")
+endif(NOT Eigen3_FIND_VERSION)
+
+macro(_eigen3_check_version)
+  file(READ "${EIGEN3_INCLUDE_DIR}/Eigen/src/Core/util/Macros.h" _eigen3_version_header)
+
+  string(REGEX MATCH "define[ \t]+EIGEN_WORLD_VERSION[ \t]+([0-9]+)" _eigen3_world_version_match "${_eigen3_version_header}")
+  set(EIGEN3_WORLD_VERSION "${CMAKE_MATCH_1}")
+  string(REGEX MATCH "define[ \t]+EIGEN_MAJOR_VERSION[ \t]+([0-9]+)" _eigen3_major_version_match "${_eigen3_version_header}")
+  set(EIGEN3_MAJOR_VERSION "${CMAKE_MATCH_1}")
+  string(REGEX MATCH "define[ \t]+EIGEN_MINOR_VERSION[ \t]+([0-9]+)" _eigen3_minor_version_match "${_eigen3_version_header}")
+  set(EIGEN3_MINOR_VERSION "${CMAKE_MATCH_1}")
+
+  set(EIGEN3_VERSION ${EIGEN3_WORLD_VERSION}.${EIGEN3_MAJOR_VERSION}.${EIGEN3_MINOR_VERSION})
+  if(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION})
+    set(EIGEN3_VERSION_OK FALSE)
+  else(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION})
+    set(EIGEN3_VERSION_OK TRUE)
+  endif(${EIGEN3_VERSION} VERSION_LESS ${Eigen3_FIND_VERSION})
+
+  if(NOT EIGEN3_VERSION_OK)
+
+    message(STATUS "Eigen3 version ${EIGEN3_VERSION} found in ${EIGEN3_INCLUDE_DIR}, "
+                   "but at least version ${Eigen3_FIND_VERSION} is required")
+  endif(NOT EIGEN3_VERSION_OK)
+endmacro(_eigen3_check_version)
+
+if (EIGEN3_INCLUDE_DIR)
+
+  # in cache already
+  _eigen3_check_version()
+  set(EIGEN3_FOUND ${EIGEN3_VERSION_OK})
+
+else (EIGEN3_INCLUDE_DIR)
+
+  find_path(EIGEN3_INCLUDE_DIR NAMES signature_of_eigen3_matrix_library
+      PATHS
+      ${CMAKE_INSTALL_PREFIX}/include
+      ${KDE4_INCLUDE_DIR}
+      PATH_SUFFIXES eigen3 eigen
+    )
+
+  if(EIGEN3_INCLUDE_DIR)
+    _eigen3_check_version()
+  endif(EIGEN3_INCLUDE_DIR)
+
+  include(FindPackageHandleStandardArgs)
+  find_package_handle_standard_args(Eigen3 DEFAULT_MSG EIGEN3_INCLUDE_DIR EIGEN3_VERSION_OK)
+
+  mark_as_advanced(EIGEN3_INCLUDE_DIR)
+
+endif(EIGEN3_INCLUDE_DIR)
+
diff --git a/cviruntime/python/include/pybind11/tools/FindPythonLibsNew.cmake b/cviruntime/python/include/pybind11/tools/FindPythonLibsNew.cmake
new file mode 100644
index 000000000..9ea6036e3
--- /dev/null
+++ b/cviruntime/python/include/pybind11/tools/FindPythonLibsNew.cmake
@@ -0,0 +1,202 @@
+# - Find python libraries
+# This module finds the libraries corresponding to the Python interpreter
+# FindPythonInterp provides.
+# This code sets the following variables:
+#
+#  PYTHONLIBS_FOUND - have the Python libs been found
+#  PYTHON_PREFIX - path to the Python installation
+#  PYTHON_LIBRARIES - path to the python library
+#  PYTHON_INCLUDE_DIRS - path to where Python.h is found
+#  PYTHON_MODULE_EXTENSION - lib extension, e.g. '.so' or '.pyd'
+#  PYTHON_MODULE_PREFIX - lib name prefix: usually an empty string
+#  PYTHON_SITE_PACKAGES - path to installation site-packages
+#  PYTHON_IS_DEBUG - whether the Python interpreter is a debug build
+#
+# Thanks to talljimbo for the patch adding the 'LDVERSION' config
+# variable usage.
+
+#=============================================================================
+# Copyright 2001-2009 Kitware, Inc.
+# Copyright 2012 Continuum Analytics, Inc.
+# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the names of Kitware, Inc., the Insight Software Consortium, +# nor the names of their contributors may be used to endorse or promote +# products derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#============================================================================= + +# Checking for the extension makes sure that `LibsNew` was found and not just `Libs`. +if(PYTHONLIBS_FOUND AND PYTHON_MODULE_EXTENSION) + return() +endif() + +# Use the Python interpreter to find the libs. +if(PythonLibsNew_FIND_REQUIRED) + find_package(PythonInterp ${PythonLibsNew_FIND_VERSION} REQUIRED) +else() + find_package(PythonInterp ${PythonLibsNew_FIND_VERSION}) +endif() + +if(NOT PYTHONINTERP_FOUND) + set(PYTHONLIBS_FOUND FALSE) + set(PythonLibsNew_FOUND FALSE) + return() +endif() + +# According to http://stackoverflow.com/questions/646518/python-how-to-detect-debug-interpreter +# testing whether sys has the gettotalrefcount function is a reliable, cross-platform +# way to detect a CPython debug interpreter. +# +# The library suffix is from the config var LDVERSION sometimes, otherwise +# VERSION. VERSION will typically be like "2.7" on unix, and "27" on windows. 
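+#
+# For reference, the debug-build check can be reproduced by hand (illustrative,
+# not part of the module):
+#
+#   $ python -c "import sys; print(hasattr(sys, 'gettotalrefcount'))"
+#   False    (a release interpreter prints False, a debug build prints True)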
+execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" + "from distutils import sysconfig as s;import sys;import struct; +print('.'.join(str(v) for v in sys.version_info)); +print(sys.prefix); +print(s.get_python_inc(plat_specific=True)); +print(s.get_python_lib(plat_specific=True)); +print(s.get_config_var('SO')); +print(hasattr(sys, 'gettotalrefcount')+0); +print(struct.calcsize('@P')); +print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION')); +print(s.get_config_var('LIBDIR') or ''); +print(s.get_config_var('MULTIARCH') or ''); +" + RESULT_VARIABLE _PYTHON_SUCCESS + OUTPUT_VARIABLE _PYTHON_VALUES + ERROR_VARIABLE _PYTHON_ERROR_VALUE) + +if(NOT _PYTHON_SUCCESS MATCHES 0) + if(PythonLibsNew_FIND_REQUIRED) + message(FATAL_ERROR + "Python config failure:\n${_PYTHON_ERROR_VALUE}") + endif() + set(PYTHONLIBS_FOUND FALSE) + set(PythonLibsNew_FOUND FALSE) + return() +endif() + +# Convert the process output into a list +if(WIN32) + string(REGEX REPLACE "\\\\" "/" _PYTHON_VALUES ${_PYTHON_VALUES}) +endif() +string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES ${_PYTHON_VALUES}) +string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES}) +list(GET _PYTHON_VALUES 0 _PYTHON_VERSION_LIST) +list(GET _PYTHON_VALUES 1 PYTHON_PREFIX) +list(GET _PYTHON_VALUES 2 PYTHON_INCLUDE_DIR) +list(GET _PYTHON_VALUES 3 PYTHON_SITE_PACKAGES) +list(GET _PYTHON_VALUES 4 PYTHON_MODULE_EXTENSION) +list(GET _PYTHON_VALUES 5 PYTHON_IS_DEBUG) +list(GET _PYTHON_VALUES 6 PYTHON_SIZEOF_VOID_P) +list(GET _PYTHON_VALUES 7 PYTHON_LIBRARY_SUFFIX) +list(GET _PYTHON_VALUES 8 PYTHON_LIBDIR) +list(GET _PYTHON_VALUES 9 PYTHON_MULTIARCH) + +# Make sure the Python has the same pointer-size as the chosen compiler +# Skip if CMAKE_SIZEOF_VOID_P is not defined +if(CMAKE_SIZEOF_VOID_P AND (NOT "${PYTHON_SIZEOF_VOID_P}" STREQUAL "${CMAKE_SIZEOF_VOID_P}")) + if(PythonLibsNew_FIND_REQUIRED) + math(EXPR _PYTHON_BITS "${PYTHON_SIZEOF_VOID_P} * 8") + math(EXPR _CMAKE_BITS "${CMAKE_SIZEOF_VOID_P} * 8") + message(FATAL_ERROR + "Python config failure: Python is ${_PYTHON_BITS}-bit, " + "chosen compiler is ${_CMAKE_BITS}-bit") + endif() + set(PYTHONLIBS_FOUND FALSE) + set(PythonLibsNew_FOUND FALSE) + return() +endif() + +# The built-in FindPython didn't always give the version numbers +string(REGEX REPLACE "\\." ";" _PYTHON_VERSION_LIST ${_PYTHON_VERSION_LIST}) +list(GET _PYTHON_VERSION_LIST 0 PYTHON_VERSION_MAJOR) +list(GET _PYTHON_VERSION_LIST 1 PYTHON_VERSION_MINOR) +list(GET _PYTHON_VERSION_LIST 2 PYTHON_VERSION_PATCH) + +# Make sure all directory separators are '/' +string(REGEX REPLACE "\\\\" "/" PYTHON_PREFIX "${PYTHON_PREFIX}") +string(REGEX REPLACE "\\\\" "/" PYTHON_INCLUDE_DIR "${PYTHON_INCLUDE_DIR}") +string(REGEX REPLACE "\\\\" "/" PYTHON_SITE_PACKAGES "${PYTHON_SITE_PACKAGES}") + +if(CMAKE_HOST_WIN32 AND NOT (MINGW AND DEFINED ENV{MSYSTEM})) + set(PYTHON_LIBRARY + "${PYTHON_PREFIX}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib") + + # when run in a venv, PYTHON_PREFIX points to it. But the libraries remain in the + # original python installation. They may be found relative to PYTHON_INCLUDE_DIR. + if(NOT EXISTS "${PYTHON_LIBRARY}") + get_filename_component(_PYTHON_ROOT ${PYTHON_INCLUDE_DIR} DIRECTORY) + set(PYTHON_LIBRARY + "${_PYTHON_ROOT}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib") + endif() + + # raise an error if the python libs are still not found. 
+ if(NOT EXISTS "${PYTHON_LIBRARY}") + message(FATAL_ERROR "Python libraries not found") + endif() + +else() + if(PYTHON_MULTIARCH) + set(_PYTHON_LIBS_SEARCH "${PYTHON_LIBDIR}/${PYTHON_MULTIARCH}" "${PYTHON_LIBDIR}") + else() + set(_PYTHON_LIBS_SEARCH "${PYTHON_LIBDIR}") + endif() + #message(STATUS "Searching for Python libs in ${_PYTHON_LIBS_SEARCH}") + # Probably this needs to be more involved. It would be nice if the config + # information the python interpreter itself gave us were more complete. + find_library(PYTHON_LIBRARY + NAMES "python${PYTHON_LIBRARY_SUFFIX}" + PATHS ${_PYTHON_LIBS_SEARCH} + NO_DEFAULT_PATH) + + # If all else fails, just set the name/version and let the linker figure out the path. + if(NOT PYTHON_LIBRARY) + set(PYTHON_LIBRARY python${PYTHON_LIBRARY_SUFFIX}) + endif() +endif() + +MARK_AS_ADVANCED( + PYTHON_LIBRARY + PYTHON_INCLUDE_DIR +) + +# We use PYTHON_INCLUDE_DIR, PYTHON_LIBRARY and PYTHON_DEBUG_LIBRARY for the +# cache entries because they are meant to specify the location of a single +# library. We now set the variables listed by the documentation for this +# module. +SET(PYTHON_INCLUDE_DIRS "${PYTHON_INCLUDE_DIR}") +SET(PYTHON_LIBRARIES "${PYTHON_LIBRARY}") +SET(PYTHON_DEBUG_LIBRARIES "${PYTHON_DEBUG_LIBRARY}") + +find_package_message(PYTHON + "Found PythonLibs: ${PYTHON_LIBRARY}" + "${PYTHON_EXECUTABLE}${PYTHON_VERSION}") + +set(PYTHONLIBS_FOUND TRUE) +set(PythonLibsNew_FOUND TRUE) diff --git a/cviruntime/python/include/pybind11/tools/pybind11Config.cmake.in b/cviruntime/python/include/pybind11/tools/pybind11Config.cmake.in new file mode 100644 index 000000000..8a7272ff9 --- /dev/null +++ b/cviruntime/python/include/pybind11/tools/pybind11Config.cmake.in @@ -0,0 +1,104 @@ +# pybind11Config.cmake +# -------------------- +# +# PYBIND11 cmake module. +# This module sets the following variables in your project:: +# +# pybind11_FOUND - true if pybind11 and all required components found on the system +# pybind11_VERSION - pybind11 version in format Major.Minor.Release +# pybind11_INCLUDE_DIRS - Directories where pybind11 and python headers are located. +# pybind11_INCLUDE_DIR - Directory where pybind11 headers are located. +# pybind11_DEFINITIONS - Definitions necessary to use pybind11, namely USING_pybind11. +# pybind11_LIBRARIES - compile flags and python libraries (as needed) to link against. +# pybind11_LIBRARY - empty. +# CMAKE_MODULE_PATH - appends location of accompanying FindPythonLibsNew.cmake and +# pybind11Tools.cmake modules. +# +# +# Available components: None +# +# +# Exported targets:: +# +# If pybind11 is found, this module defines the following :prop_tgt:`IMPORTED` +# interface library targets:: +# +# pybind11::module - for extension modules +# pybind11::embed - for embedding the Python interpreter +# +# Python headers, libraries (as needed by platform), and the C++ standard +# are attached to the target. Set PythonLibsNew variables to influence +# python detection and PYBIND11_CPP_STANDARD (-std=c++11 or -std=c++14) to +# influence standard setting. :: +# +# find_package(pybind11 CONFIG REQUIRED) +# message(STATUS "Found pybind11 v${pybind11_VERSION}: ${pybind11_INCLUDE_DIRS}") +# +# # Create an extension module +# add_library(mylib MODULE main.cpp) +# target_link_libraries(mylib pybind11::module) +# +# # Or embed the Python interpreter into an executable +# add_executable(myexe main.cpp) +# target_link_libraries(myexe pybind11::embed) +# +# Suggested usage:: +# +# find_package with version info is not recommended except for release versions. 
:: +# +# find_package(pybind11 CONFIG) +# find_package(pybind11 2.0 EXACT CONFIG REQUIRED) +# +# +# The following variables can be set to guide the search for this package:: +# +# pybind11_DIR - CMake variable, set to directory containing this Config file +# CMAKE_PREFIX_PATH - CMake variable, set to root directory of this package +# PATH - environment variable, set to bin directory of this package +# CMAKE_DISABLE_FIND_PACKAGE_pybind11 - CMake variable, disables +# find_package(pybind11) when not REQUIRED, perhaps to force internal build + +@PACKAGE_INIT@ + +set(PN pybind11) + +# location of pybind11/pybind11.h +set(${PN}_INCLUDE_DIR "${PACKAGE_PREFIX_DIR}/@CMAKE_INSTALL_INCLUDEDIR@") + +set(${PN}_LIBRARY "") +set(${PN}_DEFINITIONS USING_${PN}) + +check_required_components(${PN}) + +# make detectable the FindPythonLibsNew.cmake module +list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}) + +include(pybind11Tools) + +if(NOT (CMAKE_VERSION VERSION_LESS 3.0)) +#----------------------------------------------------------------------------- +# Don't include targets if this file is being picked up by another +# project which has already built this as a subproject +#----------------------------------------------------------------------------- +if(NOT TARGET ${PN}::pybind11) + include("${CMAKE_CURRENT_LIST_DIR}/${PN}Targets.cmake") + + find_package(PythonLibsNew ${PYBIND11_PYTHON_VERSION} MODULE REQUIRED) + set_property(TARGET ${PN}::pybind11 APPEND PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${PYTHON_INCLUDE_DIRS}) + set_property(TARGET ${PN}::embed APPEND PROPERTY INTERFACE_LINK_LIBRARIES ${PYTHON_LIBRARIES}) + if(WIN32 OR CYGWIN) + set_property(TARGET ${PN}::module APPEND PROPERTY INTERFACE_LINK_LIBRARIES ${PYTHON_LIBRARIES}) + endif() + + if(CMAKE_VERSION VERSION_LESS 3.3) + set_property(TARGET ${PN}::pybind11 APPEND PROPERTY INTERFACE_COMPILE_OPTIONS "${PYBIND11_CPP_STANDARD}") + else() + set_property(TARGET ${PN}::pybind11 APPEND PROPERTY INTERFACE_COMPILE_OPTIONS $<$:${PYBIND11_CPP_STANDARD}>) + endif() + + get_property(_iid TARGET ${PN}::pybind11 PROPERTY INTERFACE_INCLUDE_DIRECTORIES) + get_property(_ill TARGET ${PN}::module PROPERTY INTERFACE_LINK_LIBRARIES) + set(${PN}_INCLUDE_DIRS ${_iid}) + set(${PN}_LIBRARIES ${_ico} ${_ill}) +endif() +endif() diff --git a/cviruntime/python/include/pybind11/tools/pybind11Tools.cmake b/cviruntime/python/include/pybind11/tools/pybind11Tools.cmake new file mode 100644 index 000000000..508e47429 --- /dev/null +++ b/cviruntime/python/include/pybind11/tools/pybind11Tools.cmake @@ -0,0 +1,227 @@ +# tools/pybind11Tools.cmake -- Build system for the pybind11 modules +# +# Copyright (c) 2015 Wenzel Jakob +# +# All rights reserved. Use of this source code is governed by a +# BSD-style license that can be found in the LICENSE file. 
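+#
+# A minimal consuming CMakeLists sketch (target and source names here are
+# placeholders, not part of this file):
+#
+#   find_package(pybind11 CONFIG REQUIRED)    # loads this file via the config
+#   pybind11_add_module(example example.cpp)  # builds example.so / example.pyd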
+ +cmake_minimum_required(VERSION 2.8.12) + +# Add a CMake parameter for choosing a desired Python version +if(NOT PYBIND11_PYTHON_VERSION) + set(PYBIND11_PYTHON_VERSION "" CACHE STRING "Python version to use for compiling modules") +endif() + +set(Python_ADDITIONAL_VERSIONS 3.9 3.8 3.7 3.6 3.5 3.4) +find_package(PythonLibsNew ${PYBIND11_PYTHON_VERSION} REQUIRED) + +include(CheckCXXCompilerFlag) +include(CMakeParseArguments) + +if(NOT PYBIND11_CPP_STANDARD AND NOT CMAKE_CXX_STANDARD) + if(NOT MSVC) + check_cxx_compiler_flag("-std=c++14" HAS_CPP14_FLAG) + + if (HAS_CPP14_FLAG) + set(PYBIND11_CPP_STANDARD -std=c++14) + else() + check_cxx_compiler_flag("-std=c++11" HAS_CPP11_FLAG) + if (HAS_CPP11_FLAG) + set(PYBIND11_CPP_STANDARD -std=c++11) + else() + message(FATAL_ERROR "Unsupported compiler -- pybind11 requires C++11 support!") + endif() + endif() + elseif(MSVC) + set(PYBIND11_CPP_STANDARD /std:c++14) + endif() + + set(PYBIND11_CPP_STANDARD ${PYBIND11_CPP_STANDARD} CACHE STRING + "C++ standard flag, e.g. -std=c++11, -std=c++14, /std:c++14. Defaults to C++14 mode." FORCE) +endif() + +# Checks whether the given CXX/linker flags can compile and link a cxx file. cxxflags and +# linkerflags are lists of flags to use. The result variable is a unique variable name for each set +# of flags: the compilation result will be cached base on the result variable. If the flags work, +# sets them in cxxflags_out/linkerflags_out internal cache variables (in addition to ${result}). +function(_pybind11_return_if_cxx_and_linker_flags_work result cxxflags linkerflags cxxflags_out linkerflags_out) + set(CMAKE_REQUIRED_LIBRARIES ${linkerflags}) + check_cxx_compiler_flag("${cxxflags}" ${result}) + if (${result}) + set(${cxxflags_out} "${cxxflags}" CACHE INTERNAL "" FORCE) + set(${linkerflags_out} "${linkerflags}" CACHE INTERNAL "" FORCE) + endif() +endfunction() + +# Internal: find the appropriate link time optimization flags for this compiler +function(_pybind11_add_lto_flags target_name prefer_thin_lto) + if (NOT DEFINED PYBIND11_LTO_CXX_FLAGS) + set(PYBIND11_LTO_CXX_FLAGS "" CACHE INTERNAL "") + set(PYBIND11_LTO_LINKER_FLAGS "" CACHE INTERNAL "") + + if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang") + set(cxx_append "") + set(linker_append "") + if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND NOT APPLE) + # Clang Gold plugin does not support -Os; append -O3 to MinSizeRel builds to override it + set(linker_append ";$<$:-O3>") + elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU") + set(cxx_append ";-fno-fat-lto-objects") + endif() + + if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND prefer_thin_lto) + _pybind11_return_if_cxx_and_linker_flags_work(HAS_FLTO_THIN + "-flto=thin${cxx_append}" "-flto=thin${linker_append}" + PYBIND11_LTO_CXX_FLAGS PYBIND11_LTO_LINKER_FLAGS) + endif() + + if (NOT HAS_FLTO_THIN) + _pybind11_return_if_cxx_and_linker_flags_work(HAS_FLTO + "-flto${cxx_append}" "-flto${linker_append}" + PYBIND11_LTO_CXX_FLAGS PYBIND11_LTO_LINKER_FLAGS) + endif() + elseif (CMAKE_CXX_COMPILER_ID MATCHES "Intel") + # Intel equivalent to LTO is called IPO + _pybind11_return_if_cxx_and_linker_flags_work(HAS_INTEL_IPO + "-ipo" "-ipo" PYBIND11_LTO_CXX_FLAGS PYBIND11_LTO_LINKER_FLAGS) + elseif(MSVC) + # cmake only interprets libraries as linker flags when they start with a - (otherwise it + # converts /LTCG to \LTCG as if it was a Windows path). 
Luckily MSVC supports passing flags + # with - instead of /, even if it is a bit non-standard: + _pybind11_return_if_cxx_and_linker_flags_work(HAS_MSVC_GL_LTCG + "/GL" "-LTCG" PYBIND11_LTO_CXX_FLAGS PYBIND11_LTO_LINKER_FLAGS) + endif() + + if (PYBIND11_LTO_CXX_FLAGS) + message(STATUS "LTO enabled") + else() + message(STATUS "LTO disabled (not supported by the compiler and/or linker)") + endif() + endif() + + # Enable LTO flags if found, except for Debug builds + if (PYBIND11_LTO_CXX_FLAGS) + target_compile_options(${target_name} PRIVATE "$<$>:${PYBIND11_LTO_CXX_FLAGS}>") + endif() + if (PYBIND11_LTO_LINKER_FLAGS) + target_link_libraries(${target_name} PRIVATE "$<$>:${PYBIND11_LTO_LINKER_FLAGS}>") + endif() +endfunction() + +# Build a Python extension module: +# pybind11_add_module( [MODULE | SHARED] [EXCLUDE_FROM_ALL] +# [NO_EXTRAS] [SYSTEM] [THIN_LTO] source1 [source2 ...]) +# +function(pybind11_add_module target_name) + set(options MODULE SHARED EXCLUDE_FROM_ALL NO_EXTRAS SYSTEM THIN_LTO) + cmake_parse_arguments(ARG "${options}" "" "" ${ARGN}) + + if(ARG_MODULE AND ARG_SHARED) + message(FATAL_ERROR "Can't be both MODULE and SHARED") + elseif(ARG_SHARED) + set(lib_type SHARED) + else() + set(lib_type MODULE) + endif() + + if(ARG_EXCLUDE_FROM_ALL) + set(exclude_from_all EXCLUDE_FROM_ALL) + endif() + + add_library(${target_name} ${lib_type} ${exclude_from_all} ${ARG_UNPARSED_ARGUMENTS}) + + if(ARG_SYSTEM) + set(inc_isystem SYSTEM) + endif() + + target_include_directories(${target_name} ${inc_isystem} + PRIVATE ${PYBIND11_INCLUDE_DIR} # from project CMakeLists.txt + PRIVATE ${pybind11_INCLUDE_DIR} # from pybind11Config + PRIVATE ${PYTHON_INCLUDE_DIRS}) + + # Python debug libraries expose slightly different objects + # https://docs.python.org/3.6/c-api/intro.html#debugging-builds + # https://stackoverflow.com/questions/39161202/how-to-work-around-missing-pymodule-create2-in-amd64-win-python35-d-lib + if(PYTHON_IS_DEBUG) + target_compile_definitions(${target_name} PRIVATE Py_DEBUG) + endif() + + # The prefix and extension are provided by FindPythonLibsNew.cmake + set_target_properties(${target_name} PROPERTIES PREFIX "${PYTHON_MODULE_PREFIX}") + set_target_properties(${target_name} PROPERTIES SUFFIX "${PYTHON_MODULE_EXTENSION}") + + # -fvisibility=hidden is required to allow multiple modules compiled against + # different pybind versions to work properly, and for some features (e.g. + # py::module_local). We force it on everything inside the `pybind11` + # namespace; also turning it on for a pybind module compilation here avoids + # potential warnings or issues from having mixed hidden/non-hidden types. + set_target_properties(${target_name} PROPERTIES CXX_VISIBILITY_PRESET "hidden") + set_target_properties(${target_name} PROPERTIES CUDA_VISIBILITY_PRESET "hidden") + + if(WIN32 OR CYGWIN) + # Link against the Python shared library on Windows + target_link_libraries(${target_name} PRIVATE ${PYTHON_LIBRARIES}) + elseif(APPLE) + # It's quite common to have multiple copies of the same Python version + # installed on one's system. E.g.: one copy from the OS and another copy + # that's statically linked into an application like Blender or Maya. + # If we link our plugin library against the OS Python here and import it + # into Blender or Maya later on, this will cause segfaults when multiple + # conflicting Python instances are active at the same time (even when they + # are of the same version). + + # Windows is not affected by this issue since it handles DLL imports + # differently. 
The solution for Linux and Mac OS is simple: we just don't + # link against the Python library. The resulting shared library will have + # missing symbols, but that's perfectly fine -- they will be resolved at + # import time. + + target_link_libraries(${target_name} PRIVATE "-undefined dynamic_lookup") + + if(ARG_SHARED) + # Suppress CMake >= 3.0 warning for shared libraries + set_target_properties(${target_name} PROPERTIES MACOSX_RPATH ON) + endif() + endif() + + # Make sure C++11/14 are enabled + if(CMAKE_VERSION VERSION_LESS 3.3) + target_compile_options(${target_name} PUBLIC ${PYBIND11_CPP_STANDARD}) + else() + target_compile_options(${target_name} PUBLIC $<$:${PYBIND11_CPP_STANDARD}>) + endif() + + if(ARG_NO_EXTRAS) + return() + endif() + + _pybind11_add_lto_flags(${target_name} ${ARG_THIN_LTO}) + + if (NOT MSVC AND NOT ${CMAKE_BUILD_TYPE} MATCHES Debug|RelWithDebInfo) + # Strip unnecessary sections of the binary on Linux/Mac OS + if(CMAKE_STRIP) + if(APPLE) + add_custom_command(TARGET ${target_name} POST_BUILD + COMMAND ${CMAKE_STRIP} -x $) + else() + add_custom_command(TARGET ${target_name} POST_BUILD + COMMAND ${CMAKE_STRIP} $) + endif() + endif() + endif() + + if(MSVC) + # /MP enables multithreaded builds (relevant when there are many files), /bigobj is + # needed for bigger binding projects due to the limit to 64k addressable sections + target_compile_options(${target_name} PRIVATE /bigobj) + if(CMAKE_VERSION VERSION_LESS 3.11) + target_compile_options(${target_name} PRIVATE $<$>:/MP>) + else() + # Only set these options for C++ files. This is important so that, for + # instance, projects that include other types of source files like CUDA + # .cu files don't get these options propagated to nvcc since that would + # cause the build to fail. + target_compile_options(${target_name} PRIVATE $<$>:$<$:/MP>>) + endif() + endif() +endfunction() diff --git a/cviruntime/python/pyruntime.cpp b/cviruntime/python/pyruntime.cpp new file mode 100644 index 000000000..c39d45f7a --- /dev/null +++ b/cviruntime/python/pyruntime.cpp @@ -0,0 +1,356 @@ +#include +#include +#include +#include +#include "cviruntime.h" +#include +#include +#include "cviruntime_context.h" + +namespace py = pybind11; + +PYBIND11_DECLARE_HOLDER_TYPE(T, std::shared_ptr); + +struct PythonTensor { + PythonTensor(CVI_TENSOR *tensor) { + name = std::string(tensor->name); + qscale = tensor->qscale; + zpoint = tensor->zero_point; + std::vector shape; + for (int i = 0; i < (int)tensor->shape.dim_size; i++) { + shape.push_back(tensor->shape.dim[i]); + } + data = py::array(getDtype(tensor->fmt), shape, (void *)CVI_NN_TensorPtr(tensor), + py::cast(*this)); + } + + std::string name; + float qscale; + int zpoint; + py::array data; + +private: + py::dtype getDtype(CVI_FMT fmt) { + switch (fmt) { + case CVI_FMT_FP32: + return py::dtype("single"); + case CVI_FMT_INT8: + return py::dtype("int8"); + case CVI_FMT_UINT8: + return py::dtype("uint8"); + case CVI_FMT_INT16: + return py::dtype("int16"); + case CVI_FMT_UINT16: + return py::dtype("uint16"); + case CVI_FMT_INT32: + return py::dtype("int32"); + case CVI_FMT_UINT32: + return py::dtype("uint32"); + case CVI_FMT_BF16: + // numpy has no bf16 type, use uint16 instread of bf16. 
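+      // (bf16 keeps the upper 16 bits of an IEEE-754 fp32, so callers can
+      // recover an approximate float by shifting each uint16 into the high
+      // half of a 32-bit word, as the bf16_to_fp32 helper in python/test.py
+      // does.)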
+ return py::dtype("uint16"); + default: + assert(0); + } + } +}; + +struct PythonCviModel { + PythonCviModel(const std::string &model_file, int program_id, bool output_all_tensors) { + int ret = CVI_NN_RegisterModel(model_file.c_str(), &model); + if (ret != 0) { + assert(0); + } + this->config(program_id, output_all_tensors); + } + + ~PythonCviModel() { CVI_NN_CleanupModel(model); } + + py::object clone() { + auto new_cvimodel = new PythonCviModel(); + int ret = CVI_NN_CloneModel(model, &new_cvimodel->model); + if (ret != 0) { + assert(0); + } + return py::cast(new_cvimodel); + } + + void config(int program_id, bool output_all_tensors) { + CVI_NN_SetConfig(model, OPTION_PROGRAM_INDEX, program_id); + CVI_NN_SetConfig(model, OPTION_OUTPUT_ALL_TENSORS, output_all_tensors); + int32_t ret = CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num, + &output_tensors, &output_num); + if (ret != 0) { + assert(0); + } + for (int i = 0; i < input_num; i++) { + inputs.push_back(std::make_shared(&input_tensors[i])); + } + for (int i = 0; i < output_num; i++) { + outputs.push_back(std::make_shared(&output_tensors[i])); + } + } + + void forward() { + int ret = CVI_NN_Forward(model, input_tensors, input_num, output_tensors, output_num); + if (ret != 0) { + assert(0); + } + } + + std::vector> inputs; + std::vector> outputs; + +private: + PythonCviModel() {} + CVI_MODEL_HANDLE model = nullptr; + int32_t input_num = 0; + int32_t output_num = 0; + CVI_TENSOR *input_tensors = nullptr; + CVI_TENSOR *output_tensors = nullptr; +}; + +class PyCvkLmTensor { +public: + PyCvkLmTensor() {} + + PyCvkLmTensor(cvk_context_t *cvk_ctx, cvk_tl_shape_t shape, + cvk_fmt_t fmt, int eu_align) : cvk_ctx(cvk_ctx), fmt(fmt), + eu_align(eu_align) { + + if (!cvk_ctx) + throw std::runtime_error("Expect valid kernel context"); + + lmTensor = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, shape, fmt, eu_align); + if (!lmTensor) + throw std::runtime_error("Fail to allocate tensor in local memory"); + } + + std::vector shapes() { + std::vector shapes = {0, 0, 0, 0}; + + if (lmTensor) { + shapes[0] = lmTensor->shape.n; + shapes[1] = lmTensor->shape.c; + shapes[2] = lmTensor->shape.h; + shapes[3] = lmTensor->shape.w; + } + + return shapes; + } + + int address() { + if (lmTensor) + return static_cast(lmTensor->start_address); + + return 0; + } + + cvk_tl_t *allocated() { + return lmTensor; + } + + cvk_fmt_t format() { + if (lmTensor) + return lmTensor->fmt; + + return CVK_FMT_I8; + } + +private: + cvk_context_t *cvk_ctx = nullptr; + cvk_tl_shape_t shape; + cvk_fmt_t fmt = CVK_FMT_I8; + int eu_align = 0; + cvk_tl_t *lmTensor = nullptr; +}; + +class PyCviKernelContext { +public: + const uint32_t CMDBUF_SIZE = 512 * 1024; + + PyCviKernelContext(const std::string &name) : name(name) { + CVI_RT_Init(&rt_handle); + assert(rt_handle); + submit_handle = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + assert(submit_handle); + cvk_ctx = (cvk_context_t *)submit_handle; + } + + ~PyCviKernelContext() { + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + } + + cvk_fmt_t getCvkDataFormat(std::string format); + + void checkTdmaParameters(py::buffer b, PyCvkLmTensor *lmTensor); + void setupGmTensor(cvk_tg_t &tg, py::buffer_info &info, CVI_RT_MEM mem); + + // Kernel Operations + PyCvkLmTensor lmem_alloc_tensor(py::buffer b, int eu_align); + void tdma_g2l_tensor_copy(PyCvkLmTensor *lmTensor, py::buffer b); + void tdma_l2g_tensor_copy(py::buffer b, PyCvkLmTensor *lmTensor); + +private: + std::string name; + CVI_RT_HANDLE rt_handle = nullptr; 
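+  // (submit_handle doubles as the kernel context: CVI_RT_RegisterKernel
+  // returns a handle that the constructor above casts to cvk_context_t *.)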
+ CVI_RT_KHANDLE submit_handle = nullptr; + cvk_context_t *cvk_ctx = nullptr; +}; + +cvk_fmt_t PyCviKernelContext::getCvkDataFormat(std::string format) { + if (format == "b") + return CVK_FMT_I8; + + return CVK_FMT_I8; +} + +void PyCviKernelContext::checkTdmaParameters(py::buffer b, + PyCvkLmTensor *lmTensor) { + if (!lmTensor) + throw std::runtime_error("Tensor in Local memory not assigned"); + + if (!lmTensor->allocated()) + throw std::runtime_error("Tensor in local memory not allocated yet"); + + py::buffer_info info = b.request(); + if (info.ndim != 4) + throw std::runtime_error("Only support NCHW 4D tensor"); + + if ((info.shape[0] != lmTensor->shapes()[0]) || + (info.shape[1] != lmTensor->shapes()[1]) || + (info.shape[2] != lmTensor->shapes()[2]) || + (info.shape[3] != lmTensor->shapes()[3])) + throw std::runtime_error("Shape mismatched"); +} + +void PyCviKernelContext::setupGmTensor(cvk_tg_t &tg, py::buffer_info &info, + CVI_RT_MEM mem) { + memset(&tg, 0, sizeof(tg)); + cvk_tg_shape_t tg_shape = { + static_cast(info.shape[0]), + static_cast(info.shape[1]), + static_cast(info.shape[2]), + static_cast(info.shape[3])}; + cvk_ctx->ops->gmem_init_tensor(cvk_ctx, &tg, tg_shape, + getCvkDataFormat(info.format)); + tg.start_address = CVI_RT_MemGetPAddr(mem); +} + +PyCvkLmTensor PyCviKernelContext::lmem_alloc_tensor(py::buffer b, + int eu_align) { + py::buffer_info info = b.request(); + + if (info.ndim != 4) + throw std::runtime_error("Local memory only support NCHW"); + + if (!info.shape[0] || !info.shape[1] || !info.shape[2] || !info.shape[3]) + throw std::runtime_error("Shape should not zero"); + + cvk_tl_shape_t shape = { + static_cast(info.shape[0]), + static_cast(info.shape[1]), + static_cast(info.shape[2]), + static_cast(info.shape[3])}; + + cvk_fmt_t fmt = getCvkDataFormat(info.format); + + PyCvkLmTensor lmTensor(cvk_ctx, shape, fmt, eu_align); + + return lmTensor; +} + +void PyCviKernelContext::tdma_g2l_tensor_copy(PyCvkLmTensor *lmTensor, + py::buffer b) { + checkTdmaParameters(b, lmTensor); + + if (!lmTensor) + throw std::runtime_error("Tensor in Local memory not assigned"); + + if (!lmTensor->allocated()) + throw std::runtime_error("Tensor in local memory not allocated yet"); + + py::buffer_info info = b.request(); + + size_t gm_size = info.shape[0] * info.shape[1] * info.shape[2] * + info.shape[3]; + CVI_RT_MEM mem = CVI_RT_MemAlloc(rt_handle, gm_size); + + // Copy from system memory to device memory + CVI_RT_MemCopyS2D(rt_handle, mem, static_cast(info.ptr)); + + // Setup global memory + cvk_tg_t tg; + setupGmTensor(tg, info, mem); + + cvk_tdma_g2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = lmTensor->allocated(); + cvk_ctx->ops->tdma_g2l_tensor_copy(cvk_ctx, &p); + + CVI_RT_Submit(cvk_ctx); + + // free the device memory + CVI_RT_MemFree(rt_handle, mem); +} + +void PyCviKernelContext::tdma_l2g_tensor_copy(py::buffer b, + PyCvkLmTensor *lmTensor) { + checkTdmaParameters(b, lmTensor); + + py::buffer_info info = b.request(); + + size_t gm_size = info.shape[0] * info.shape[1] * info.shape[2] * + info.shape[3]; + CVI_RT_MEM mem = CVI_RT_MemAlloc(rt_handle, gm_size); + + // Setup global memory + cvk_tg_t tg; + setupGmTensor(tg, info, mem); + + cvk_tdma_l2g_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = lmTensor->allocated(); + p.dst = &tg; + cvk_ctx->ops->tdma_l2g_tensor_copy(cvk_ctx, &p); + CVI_RT_Submit(cvk_ctx); + + // Copy from device memory to system memory + CVI_RT_MemCopyD2S(rt_handle, static_cast(info.ptr), mem); + + // Free the 
device memory + CVI_RT_MemFree(rt_handle, mem); +} + +PYBIND11_MODULE(pyruntime, m) { + py::class_>(m, "Tensor") + .def_readonly("name", &PythonTensor::name) + .def_readonly("qscale", &PythonTensor::qscale) + .def_readonly("zpoint", &PythonTensor::zpoint) + .def_readwrite("data", &PythonTensor::data); + + py::class_(m, "Model") + .def(py::init(), + py::arg("cvimodel"), py::arg("program_id") = 0, + py::arg("output_all_tensors") = false) + .def("forward", &PythonCviModel::forward) + .def_readwrite("inputs", &PythonCviModel::inputs) + .def_readwrite("outputs", &PythonCviModel::outputs); + + py::class_(m, "CvkLmTensor") + .def(py::init<>()) + .def("shapes", &PyCvkLmTensor::shapes, "Get shape.") + .def("address", &PyCvkLmTensor::address, "Get address."); + + py::class_(m, "CvkContext") + .def(py::init()) + .def("lmem_alloc_tensor", &PyCviKernelContext::lmem_alloc_tensor, + "Allocate tensor in TPU local memory.") + .def("tdma_g2l_tensor_copy", &PyCviKernelContext::tdma_g2l_tensor_copy, + "Transfer data from host to TPU local memory.") + .def("tdma_l2g_tensor_copy", &PyCviKernelContext::tdma_l2g_tensor_copy, + "Transfer data from TPU local memory to host."); +} diff --git a/cviruntime/python/test.py b/cviruntime/python/test.py new file mode 100644 index 000000000..b6bc91a02 --- /dev/null +++ b/cviruntime/python/test.py @@ -0,0 +1,79 @@ +#!/usr/bin/python3 +import argparse +import pyruntime as rt +import numpy as np + +def bf16_to_fp32(d_bf16): + s = d_bf16.shape + d_bf16 = d_bf16.flatten() + assert d_bf16.dtype == np.uint16 + d_fp32 = np.empty_like(d_bf16, dtype=np.float32) + for i in range(len(d_bf16)): + d_fp32[i] = struct.unpack(' 127.0] = 127.0 + x[x < -128.0] = -128.0 + return x.astype(np.int8) + +def test(input_npz, cvimodel, ref_npz): + # load cvimodel + model = rt.Model(cvimodel, batch_num=1, output_all_tensors=True) + # fill data to inputs + data = model.inputs[0].data + qscale = model.inputs[0].qscale + # load input data and quant to int8 + input_npz = np.load(input_npz) + input = input_npz[input_npz.files[0]] + print(input.shape) + input = quant(input, qscale) + # fill input data to input tensor of model + data[:] = input.reshape(data.shape) + for out in model.outputs: + print(out.name) + print(out.data.dtype) + print(out.data.shape) + # forward + model.forward() + # compare result with reference + refs = np.load(ref_npz) + compare_with_ref(model.outputs, refs) + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description="Test python runtime API.") + parser.add_argument("--cvimodel", type=str, help="cvimodel") + parser.add_argument("--input", type=str, help="input npz") + parser.add_argument("--reference", type=str, help="reference to output npz") + args = parser.parse_args() + test(args.input, args.cvimodel, args.reference) \ No newline at end of file diff --git a/cviruntime/samples/CMakeLists.txt b/cviruntime/samples/CMakeLists.txt new file mode 100644 index 000000000..0f8074fc7 --- /dev/null +++ b/cviruntime/samples/CMakeLists.txt @@ -0,0 +1,24 @@ +cmake_minimum_required(VERSION 2.8.0) + +project(cvitek_samples C CXX) + +add_subdirectory(utils) +add_subdirectory(runner) +add_subdirectory(classifier) +add_subdirectory(classifier_bf16) +add_subdirectory(classifier_fused_preprocess) +add_subdirectory(classifier_multi_batch) +add_subdirectory(samples_extra) + +set(SCRIPT_FILES + run_classifier.sh + run_classifier_bf16.sh + run_classifier_fused_preprocess.sh + run_classifier_multi_batch.sh + ) + +install(FILES ${SCRIPT_FILES} + PERMISSIONS OWNER_EXECUTE OWNER_WRITE 
OWNER_READ GROUP_READ WORLD_READ
+        DESTINATION .)
+
+install(DIRECTORY data/ DESTINATION data FILES_MATCHING PATTERN "*")
\ No newline at end of file
diff --git a/cviruntime/samples/README.md b/cviruntime/samples/README.md
new file mode 100644
index 000000000..128673ab1
--- /dev/null
+++ b/cviruntime/samples/README.md
@@ -0,0 +1,190 @@
+# Samples for CVI TPU SDK
+
+## Catalogue
+| name | description |
+| --------------------------------- | :------------: |
+| classifier | sample without fused preprocessing, quantized to int8 |
+| classifier_bf16 | sample without fused preprocessing, quantized to bf16 |
+| classifier_fused_preprocess | sample with fused preprocessing, quantized to int8 |
+| classifier_multi_batch | sample using a merged multi-batch model |
+
+## Sample introduction
+### classifier_bf16
+When deploying a model, refer to this sample first: convert the model to bf16 and use it to evaluate accuracy in your business scenario. Preprocessing can be implemented with OpenCV, as this sample shows.
+
+### classifier
+Once the bf16 model runs correctly, this sample shows how the same model is quantized to int8. As in the bf16 sample, preprocessing is implemented with OpenCV.
+
+### classifier_fused_preprocess
+If preprocessing takes too long, pass the --fuse_preprocess option when converting the model, which lets the TPU perform part of the preprocessing and reduces preprocessing time and memory copies.
+
+### classifier_multi_batch
+Merging two programs of the same model into one cvimodel lets them share weights and memory, which supports running the same model with different batch sizes. Refer to this sample for details.
+
+## How to compile the image input samples in docker
+
+The following files are required:
+
+* cvitek_tpu_sdk_[cv182x|cv182x_uclibc|cv183x|cv181x_glibc32|cv181x_musl_riscv64].tar.gz
+* cvitek_tpu_samples.tar.gz
+
+**64-bit platform**
+
+``` shell
+tar zxf cvitek_tpu_sdk_cv183x.tar.gz
+export TPU_SDK_PATH=$PWD/cvitek_tpu_sdk
+cd cvitek_tpu_sdk && source ./envs_tpu_sdk.sh && cd ..
+
+tar zxf cvitek_tpu_samples.tar.gz
+cd cvitek_tpu_samples
+mkdir build_soc
+cd build_soc
+cmake -G Ninja \
+    -DCMAKE_BUILD_TYPE=RELEASE \
+    -DCMAKE_C_FLAGS_RELEASE=-O3 -DCMAKE_CXX_FLAGS_RELEASE=-O3 \
+    -DCMAKE_TOOLCHAIN_FILE=$TPU_SDK_PATH/cmake/toolchain-aarch64-linux.cmake \
+    -DTPU_SDK_PATH=$TPU_SDK_PATH \
+    -DOPENCV_PATH=$TPU_SDK_PATH/opencv \
+    -DCMAKE_INSTALL_PREFIX=../install_samples \
+    ..
+cmake --build . --target install
+```
+
+**32-bit platform**
+
+``` shell
+tar zxf cvitek_tpu_sdk_[cv182x|cv181x_glibc32].tar.gz
+export TPU_SDK_PATH=$PWD/cvitek_tpu_sdk
+cd cvitek_tpu_sdk && source ./envs_tpu_sdk.sh && cd ..
+
+tar zxf cvitek_tpu_samples.tar.gz
+cd cvitek_tpu_samples
+mkdir build_soc
+cd build_soc
+cmake -G Ninja \
+    -DCMAKE_BUILD_TYPE=RELEASE \
+    -DCMAKE_C_FLAGS_RELEASE=-O3 -DCMAKE_CXX_FLAGS_RELEASE=-O3 \
+    -DCMAKE_TOOLCHAIN_FILE=$TPU_SDK_PATH/cmake/toolchain-linux-gnueabihf.cmake \
+    -DTPU_SDK_PATH=$TPU_SDK_PATH \
+    -DOPENCV_PATH=$TPU_SDK_PATH/opencv \
+    -DCMAKE_INSTALL_PREFIX=../install_samples \
+    ..
+cmake --build . --target install
+```
+
+**uclibc platform**
+
+``` shell
+tar zxf cvitek_tpu_sdk_cv182x_uclibc.tar.gz
+export TPU_SDK_PATH=$PWD/cvitek_tpu_sdk
+cd cvitek_tpu_sdk && source ./envs_tpu_sdk.sh && cd ..
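+
+# (From here the uclibc flow is identical to the 64-bit one above; only the
+# toolchain file passed to cmake differs.)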
+
+tar zxf cvitek_tpu_samples.tar.gz
+cd cvitek_tpu_samples
+mkdir build_soc
+cd build_soc
+cmake -G Ninja \
+    -DCMAKE_BUILD_TYPE=RELEASE \
+    -DCMAKE_C_FLAGS_RELEASE=-O3 -DCMAKE_CXX_FLAGS_RELEASE=-O3 \
+    -DCMAKE_TOOLCHAIN_FILE=$TPU_SDK_PATH/cmake/toolchain-linux-uclibc.cmake \
+    -DTPU_SDK_PATH=$TPU_SDK_PATH \
+    -DOPENCV_PATH=$TPU_SDK_PATH/opencv \
+    -DCMAKE_INSTALL_PREFIX=../install_samples \
+    ..
+cmake --build . --target install
+```
+
+**cv181x musl riscv64 platform**
+
+``` shell
+tar zxf cvitek_tpu_sdk_cv181x_musl_riscv64.tar.gz
+export TPU_SDK_PATH=$PWD/cvitek_tpu_sdk
+cd cvitek_tpu_sdk && source ./envs_tpu_sdk.sh && cd ..
+
+tar zxf cvitek_tpu_samples.tar.gz
+cd cvitek_tpu_samples
+mkdir build_soc
+cd build_soc
+cmake -G Ninja \
+    -DCMAKE_BUILD_TYPE=RELEASE \
+    -DCMAKE_C_FLAGS_RELEASE=-O3 -DCMAKE_CXX_FLAGS_RELEASE=-O3 \
+    -DCMAKE_TOOLCHAIN_FILE=$TPU_SDK_PATH/cmake/toolchain-riscv64-linux-musl-x86_64.cmake \
+    -DTPU_SDK_PATH=$TPU_SDK_PATH \
+    -DOPENCV_PATH=$TPU_SDK_PATH/opencv \
+    -DCMAKE_INSTALL_PREFIX=../install_samples \
+    ..
+cmake --build . --target install
+```
+
+Finally, copy the install_samples folder to the development board.
+
+## How to compile the VPSS input samples in docker
+
+Compared with the image input samples, just add `-DMW_PATH` (and the `-DCHIP` option shown below) to the cmake invocation.
+
+The following files are required:
+
+* cvitek_tpu_sdk_[cv182x|cv182x_uclibc|cv183x|cv181x_glibc32|cv181x_musl_riscv64].tar.gz
+* cvitek_tpu_samples.tar.gz
+* mw.tar.gz
+
+
+**64-bit platform**
+
+``` shell
+mkdir mw_path
+tar -zxvf mw.tar.gz -C mw_path
+export MW_PATH=$PWD/mw_path
+
+tar zxf cvitek_tpu_sdk_cv183x.tar.gz
+export TPU_SDK_PATH=$PWD/cvitek_tpu_sdk
+cd cvitek_tpu_sdk && source ./envs_tpu_sdk.sh && cd ..
+
+tar zxf cvitek_tpu_samples.tar.gz
+cd cvitek_tpu_samples
+mkdir build_soc
+cd build_soc
+cmake -G Ninja \
+    -DCMAKE_BUILD_TYPE=RELEASE \
+    -DCMAKE_C_FLAGS_RELEASE=-O3 -DCMAKE_CXX_FLAGS_RELEASE=-O3 \
+    -DCMAKE_TOOLCHAIN_FILE=$TPU_SDK_PATH/cmake/toolchain-aarch64-linux.cmake \
+    -DTPU_SDK_PATH=$TPU_SDK_PATH \
+    -DOPENCV_PATH=$TPU_SDK_PATH/opencv \
+    -DMW_PATH=$MW_PATH \
+    -DCHIP=183x \
+    -DCMAKE_INSTALL_PREFIX=../install_samples \
+    ..
+cmake --build . --target install
+```
+
+Finally, copy the install_samples folder to the development board.
+
+**musl platform**
+``` shell
+mkdir mw_path
+tar -zxvf mw.tar.gz -C mw_path
+export MW_PATH=$PWD/mw_path
+
+tar zxf cvitek_tpu_sdk_cv181x_musl_riscv64.tar.gz
+export TPU_SDK_PATH=$PWD/cvitek_tpu_sdk
+cd cvitek_tpu_sdk && source ./envs_tpu_sdk.sh && cd ..
+
+tar zxf cvitek_tpu_samples.tar.gz
+cd cvitek_tpu_samples
+mkdir build_soc
+cd build_soc
+cmake -G Ninja \
+    -DCMAKE_BUILD_TYPE=RELEASE \
+    -DCMAKE_C_FLAGS_RELEASE=-O3 -DCMAKE_CXX_FLAGS_RELEASE=-O3 \
+    -DCMAKE_TOOLCHAIN_FILE=$TPU_SDK_PATH/cmake/toolchain-riscv64-linux-musl-x86_64.cmake \
+    -DTPU_SDK_PATH=$TPU_SDK_PATH \
+    -DSDK_VER=musl_riscv64 \
+    -DOPENCV_PATH=$TPU_SDK_PATH/opencv \
+    -DMW_PATH=$MW_PATH \
+    -DCHIP=mars \
+    -DCMAKE_INSTALL_PREFIX=../install_samples \
+    ..
+cmake --build . --target install
+```
+
+Finally, copy the install_samples folder to the development board.
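+
+As a quick sanity check on the board, run the classifier sample from the
+copied folder (a minimal sketch; it assumes mobilenet_v2.cvimodel has already
+been generated as described in samples/classifier/README.md):
+
+``` shell
+cd install_samples
+./bin/cvi_sample_classifier \
+./mobilenet_v2.cvimodel \
+./data/cat.jpg \
+./data/synset_words.txt
+```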
\ No newline at end of file
diff --git a/cviruntime/samples/classifier/CMakeLists.txt b/cviruntime/samples/classifier/CMakeLists.txt
new file mode 100644
index 000000000..0155d4f96
--- /dev/null
+++ b/cviruntime/samples/classifier/CMakeLists.txt
@@ -0,0 +1,40 @@
+cmake_minimum_required(VERSION 2.8.0)
+
+project(cvi_sample_classifier C CXX)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS ON)
+
+if(NOT DEFINED TPU_SDK_PATH)
+  message(FATAL_ERROR "Please set TPU_SDK_PATH to point to the TPU_SDK installation")
+endif()
+include_directories(${TPU_SDK_PATH}/include)
+link_directories(${TPU_SDK_PATH}/lib)
+
+if(NOT DEFINED OPENCV_PATH)
+  message(FATAL_ERROR "Please set OPENCV_PATH to point to the opencv installation")
+endif()
+include_directories(${OPENCV_PATH}/include)
+link_directories(${OPENCV_PATH}/lib)
+
+set(CVI_LIBS ${CVI_LIBS} cviruntime cvikernel)
+if(NOT CMAKE_CROSSCOMPILING)
+  set(CVI_LIBS ${CVI_LIBS} cvicmodel)
+endif()
+
+set(OPENCV_LIBS ${OPENCV_LIBS} opencv_core opencv_imgcodecs opencv_imgproc)
+if(NOT CMAKE_CROSSCOMPILING)
+  set(OPENCV_LIBS ${OPENCV_LIBS} opencv_highgui)
+endif()
+
+set(EXTRA_LIBS ${EXTRA_LIBS} dl stdc++ pthread z)
+
+add_executable(cvi_sample_classifier
+    classifier.cpp)
+target_link_libraries(cvi_sample_classifier
+    ${CVI_LIBS}
+    ${OPENCV_LIBS}
+    ${EXTRA_LIBS})
+install(TARGETS cvi_sample_classifier
+    DESTINATION bin)
diff --git a/cviruntime/samples/classifier/README.md b/cviruntime/samples/classifier/README.md
new file mode 100644
index 000000000..e3fa7fd0c
--- /dev/null
+++ b/cviruntime/samples/classifier/README.md
@@ -0,0 +1,117 @@
+# Mobilev2 Sample without fused preprocess and quant to int8
+
+Copy the unzipped mobilenet_v2.cvimodel to the EVB board.
+
+### Download the model and convert it under docker (optional)
+The Mobilev2 model can be cloned from: https://github.com/shicai/MobileNet-Caffe
+
+#### For new toolchain guide
+The following files are required:
+* tpu-mlir_xxxx.tar.gz (The release package of tpu-mlir)
+
+Transform cvimodel shell:
+``` shell
+tar zxf tpu-mlir_xxxx.tar.gz
+source tpu-mlir_xxxx/envsetup.sh
+
+mkdir workspace && cd workspace
+cp $TPUC_ROOT/regression/image/cat.jpg .
+cp -rf $TPUC_ROOT/regression/dataset/ILSVRC2012 .
+
+model_transform.py \
+--model_name mobilenet_v2 \
+--model_def ./mobilenet_v2_deploy.prototxt \
+--model_data ./mobilenet_v2.caffemodel \
+--test_input ./cat.jpg \
+--test_result mobilenet_v2_top_output.npz \
+--input_shapes [[1,3,224,224]] \
+--resize_dims 256,256 \
+--mean 103.94,116.78,123.68 \
+--scale 0.017,0.017,0.017 \
+--pixel_format "bgr" \
+--tolerance 0.99,0.99 \
+--excepts prob \
+--mlir mobilenet_v2.mlir
+
+run_calibration.py \
+mobilenet_v2.mlir \
+--dataset=./ILSVRC2012 \
+--input_num=100 \
+-o mobilenet_v2_calibration_table
+
+model_deploy.py \
+--mlir mobilenet_v2.mlir \
+--calibration_table mobilenet_v2_calibration_table \
+--chip cv183x \
+--quantize INT8 \
+--quant_input \
+--test_input mobilenet_v2_in_f32.npz \
+--test_reference mobilenet_v2_top_output.npz \
+--excepts prob \
+--tolerance 0.9,0.6 \
+--model mobilenet_v2.cvimodel
+```
+
+
+
+#### For old toolchain guide
+The following files are required:
+
+* cvitek_mlir_ubuntu-18.04.tar.gz
+
+Transform cvimodel shell:
+``` shell
+tar zxf cvitek_mlir_ubuntu-18.04.tar.gz
+source cvitek_mlir/cvitek_envs.sh
+
+mkdir workspace && cd workspace
+cp $MLIR_PATH/tpuc/regression/data/cat.jpg .
+cp -rf $MLIR_PATH/tpuc/regression/data/images .
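+
+# The flow below mirrors the new toolchain: model_transform.py emits an fp32
+# MLIR file, run_calibration.py derives an int8 calibration table from the
+# sample images, and model_deploy.py generates the final cvimodel.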
+ +model_transform.py \ +--model_type caffe \ +--model_name mobilenet_v2 \ +--model_def ./mobilenet_v2_deploy.prototxt \ +--model_data ./mobilenet_v2.caffemodel \ +--image ./cat.jpg \ +--image_resize_dims 256,256 \ +--net_input_dims 224,224 \ +--mean 103.94,116.78,123.68 \ +--input_scale 0.017 \ +--model_channel_order "bgr" \ +--tolerance 0.99,0.99,0.99 \ +--excepts prob \ +--mlir mobilenet_v2_fp32.mlir + +run_calibration.py \ +mobilenet_v2_fp32.mlir \ +--dataset=./images \ +--input_num=100 \ +-o mobilenet_v2_calibration_table + +model_deploy.py \ +--model_name mobilenet_v2 \ +--mlir mobilenet_v2_fp32.mlir \ +--calibration_table mobilenet_v2_calibration_table \ +--chip cv183x \ +--quantize INT8 \ +--image cat.jpg \ +--excepts prob \ +--tolerance 0.9,0.9,0.6 \ +--correctness 0.95,0.95,0.9 \ +--cvimodel mobilenet_v2.cvimodel +``` +Copy generated mobilenet_v2.cvimodel to EVB board + + +## How To Compile Sample In Docker +View the Top level directory README.md or View the cvitek_tpu_quick_start_guide.md + +## Run Samples In EVB Borad +``` +cd install_samples +./bin/cvi_sample_classifier \ +./mobilenet_v2.cvimodel \ +./data/cat.jpg \ +./data/synset_words.txt +``` \ No newline at end of file diff --git a/cviruntime/samples/classifier/classifier.cpp b/cviruntime/samples/classifier/classifier.cpp new file mode 100644 index 000000000..23c21ebfe --- /dev/null +++ b/cviruntime/samples/classifier/classifier.cpp @@ -0,0 +1,127 @@ +#include +#include +#include +#include +#include +#include + +#define IMG_RESIZE_DIMS 256,256 +#define BGR_MEAN 103.94,116.78,123.68 +#define INPUT_SCALE 0.017 + +static void usage(char **argv) { + printf("Usage:\n"); + printf(" %s cvimodel image.jpg label_file\n", argv[0]); +} + +int main(int argc, char **argv) { + if (argc != 4) { + usage(argv); + exit(-1); + } + + // load model file + const char *model_file = argv[1]; + CVI_MODEL_HANDLE model = nullptr; + int ret = CVI_NN_RegisterModel(model_file, &model); + if (CVI_RC_SUCCESS != ret) { + printf("CVI_NN_RegisterModel failed, err %d\n", ret); + exit(1); + } + printf("CVI_NN_RegisterModel succeeded\n"); + + // get input output tensors + CVI_TENSOR *input_tensors; + CVI_TENSOR *output_tensors; + int32_t input_num; + int32_t output_num; + CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num, &output_tensors, + &output_num); + CVI_TENSOR *input = CVI_NN_GetTensorByName(CVI_NN_DEFAULT_TENSOR, input_tensors, input_num); + assert(input); + printf("input, name:%s\n", input->name); + CVI_TENSOR *output = CVI_NN_GetTensorByName(CVI_NN_DEFAULT_TENSOR, output_tensors, output_num); + assert(output); + + float qscale = CVI_NN_TensorQuantScale(input); + printf("qscale:%f\n", qscale); + CVI_SHAPE shape = CVI_NN_TensorShape(input); + + // nchw + int32_t height = shape.dim[2]; + int32_t width = shape.dim[3]; + + // imread + cv::Mat image; + image = cv::imread(argv[2]); + if (!image.data) { + printf("Could not open or find the image\n"); + return -1; + } + + // resize + cv::resize(image, image, cv::Size(IMG_RESIZE_DIMS)); // linear is default + // crop + cv::Size size = cv::Size(height, width); + cv::Rect crop(cv::Point(0.5 * (image.cols - size.width), + 0.5 * (image.rows - size.height)), size); + image = image(crop); + // split + cv::Mat channels[3]; + for (int i = 0; i < 3; i++) { + channels[i] = cv::Mat(height, width, CV_8SC1); + } + cv::split(image, channels); + // normalize + float mean[] = {BGR_MEAN}; + for (int i = 0; i < 3; ++i) { + channels[i].convertTo(channels[i], CV_8SC1, INPUT_SCALE * qscale, + -1 * mean[i] * 
INPUT_SCALE * qscale); + } + + // fill to input tensor + int8_t *ptr = (int8_t *)CVI_NN_TensorPtr(input); + int channel_size = height * width; + for (int i = 0; i < 3; ++i) { + memcpy(ptr + i * channel_size, channels[i].data, channel_size); + } + + // run inference + CVI_NN_Forward(model, input_tensors, input_num, output_tensors, output_num); + printf("CVI_NN_Forward succeeded\n"); + + // output result + std::vector labels; + std::ifstream file(argv[3]); + if (!file) { + printf("Didn't find synset_words file\n"); + exit(1); + } else { + std::string line; + while (std::getline(file, line)) { + labels.push_back(std::string(line)); + } + } + + int32_t top_num = 5; + float *prob = (float *)CVI_NN_TensorPtr(output); + int32_t count = CVI_NN_TensorCount(output); + + // find top-k prob and cls + std::vector idx(count); + std::iota(idx.begin(), idx.end(), 0); + std::sort(idx.begin(), idx.end(), [&prob](size_t idx_0, size_t idx_1) {return prob[idx_0] > prob[idx_1];}); + // show results. + printf("------\n"); + for (size_t i = 0; i < top_num; i++) { + int top_k_idx = idx[i]; + printf(" %f, idx %d", prob[top_k_idx], top_k_idx); + if (!labels.empty()) + printf(", %s", labels[top_k_idx].c_str()); + printf("\n"); + } + printf("------\n"); + CVI_NN_CleanupModel(model); + printf("CVI_NN_CleanupModel succeeded\n"); + return 0; +} \ No newline at end of file diff --git a/cviruntime/samples/classifier_bf16/CMakeLists.txt b/cviruntime/samples/classifier_bf16/CMakeLists.txt new file mode 100644 index 000000000..e4058d208 --- /dev/null +++ b/cviruntime/samples/classifier_bf16/CMakeLists.txt @@ -0,0 +1,40 @@ +cmake_minimum_required(VERSION 2.8.0) + +project(cvi_sample_classifier_bf16 C CXX) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS ON) + +if(NOT DEFINED TPU_SDK_PATH) + message(FATAL_ERROR "Please set TPU_SDK_PATH to point to the TPU_SDK installation") +endif() +include_directories(${TPU_SDK_PATH}/include) +link_directories(${TPU_SDK_PATH}/lib) + +if(NOT DEFINED OPENCV_PATH) + message(FATAL_ERROR "Please set OPENCV_PATH to point to the opencvn installation") +endif() +include_directories(${OPENCV_PATH}/include) +link_directories(${OPENCV_PATH}/lib) + +set(CVI_LIBS ${CVI_LIBS} cviruntime cvikernel) +if(NOT CMAKE_CROSSCOMPILING) + set(CVI_LIBS ${CVI_LIBS} cvicmodel) +endif() + +set(OPENCV_LIBS ${OPENCV_LIBS} opencv_core opencv_imgcodecs opencv_imgproc) +if(NOT CMAKE_CROSSCOMPILING) + set(OPENCV_LIBS ${OPENCV_LIBS} opencv_highgui) +endif() + +set(EXTRA_LIBS ${EXTRA_LIBS} dl stdc++ pthread z) + +add_executable(cvi_sample_classifier_bf16 + classifier_bf16.cpp) +target_link_libraries(cvi_sample_classifier_bf16 + ${CVI_LIBS} + ${OPENCV_LIBS} + ${EXTRA_LIBS}) +install(TARGETS cvi_sample_classifier_bf16 + cvi_sample_classifier_bf16 DESTINATION bin) diff --git a/cviruntime/samples/classifier_bf16/README.md b/cviruntime/samples/classifier_bf16/README.md new file mode 100644 index 000000000..24f3db190 --- /dev/null +++ b/cviruntime/samples/classifier_bf16/README.md @@ -0,0 +1,98 @@ +# Mobilev2 Sample without fuse proprocess and quant to BF16 + +### Download model and convert the model under docker (optional) +Mobilev2 Model could clone from:https://github.com/shicai/MobileNet-Caffe + +#### For new toolchain guide +The following documents are required: +* tpu-mlir_xxxx.tar.gz (The release package of tpu-mlir) + +Transform cvimodel shell: +``` shell +tar zxf tpu-mlir_xxxx.tar.gz +source tpu-mlir_xxxx/envsetup.sh + +mkdir workspace && cd workspace +cp 
$TPUC_ROOT/regression/image/cat.jpg . + +model_transform.py \ +--model_name mobilenet_v2 \ +--model_def ./mobilenet_v2_deploy.prototxt \ +--model_data ./mobilenet_v2.caffemodel \ +--test_input ./cat.jpg \ +--test_result mobilenet_v2_top_output.npz \ +--input_shapes [[1,3,224,224]] +--resize_dims 256,256 \ +--mean 103.94,116.78,123.68 \ +--scale 0.017,0.017,0.017 \ +--pixel_format "bgr" \ +--tolerance 0.99,0.99 \ +--excepts prob \ +--mlir mobilenet_v2.mlir + + +model_deploy.py \ +--mlir mobilenet_v2.mlir \ +--chip cv183x \ +--quantize BF16 \ +--test_input mobilenet_v2_in_f32.npz \ +--test_reference mobilenet_v2_top_output.npz \ +--excepts prob \ +--tolerance 0.94,0.61 \ +--model mobilenet_v2_bf16.cvimodel +``` + + +#### For old toolchain guide +The following documents are required: + +* cvitek_mlir_ubuntu-18.04.tar.gz + +Transform model shell: +``` shell +tar zxf cvitek_mlir_ubuntu-18.04.tar.gz +source cvitek_mlir/cvitek_envs.sh + +mkdir workspace && cd workspace +cp $MLIR_PATH/tpuc/regression/data/cat.jpg . + +model_transform.py \ +--model_type caffe \ +--model_name mobilenet_v2 \ +--model_def ./mobilenet_v2_deploy.prototxt \ +--model_data ./mobilenet_v2.caffemodel \ +--image ./cat.jpg \ +--image_resize_dims 256,256 \ +--net_input_dims 224,224 \ +--mean 103.94,116.78,123.68 \ +--input_scale 0.017 \ +--model_channel_order "bgr" \ +--tolerance 0.99,0.99,0.99 \ +--excepts prob \ +--mlir mobilenet_v2_fp32.mlir + +model_deploy.py \ +--model_name mobilenet_v2 \ +--mlir mobilenet_v2_fp32.mlir \ +--quantize BF16 \ +--chip cv183x \ +--image cat.jpg \ +--excepts prob \ +--tolerance 0.94,0.94,0.61 \ +--correctness 0.99,0.99,0.96 \ +--cvimodel mobilenet_v2_bf16.cvimodel +``` + +Copy generated mobilenet_v2_bf16.cvimodel to EVB board + +## How To Compile Sample In Docker +View the Top level directory README.md or View the cvitek_tpu_quick_start_guide.md + +## Run Samples In EVB Borad +``` +cd install_samples +./bin/cvi_sample_classifier_bf16 \ +./mobilenet_v2_bf16.cvimodel \ +./data/cat.jpg \ +./data/synset_words.txt +``` diff --git a/cviruntime/samples/classifier_bf16/classifier_bf16.cpp b/cviruntime/samples/classifier_bf16/classifier_bf16.cpp new file mode 100644 index 000000000..65aa1c8a5 --- /dev/null +++ b/cviruntime/samples/classifier_bf16/classifier_bf16.cpp @@ -0,0 +1,140 @@ +#include +#include +#include +#include +#include + +#define IMG_RESIZE_DIMS 256,256 +#define BGR_MEAN 103.94,116.78,123.68 +#define INPUT_SCALE 0.017 + +static void usage(char **argv) { + printf("Usage:\n"); + printf(" %s cvimodel image.jpg label_file\n", argv[0]); +} + +int main(int argc, char **argv) { + if (argc != 4) { + usage(argv); + exit(-1); + } + + // load model file + const char *model_file = argv[1]; + CVI_MODEL_HANDLE model = nullptr; + int ret = CVI_NN_RegisterModel(model_file, &model); + if (CVI_RC_SUCCESS != ret) { + printf("CVI_NN_RegisterModel failed, err %d\n", ret); + exit(1); + } + printf("CVI_NN_RegisterModel succeeded\n"); + + // get input output tensors + CVI_TENSOR *input_tensors; + CVI_TENSOR *output_tensors; + int32_t input_num; + int32_t output_num; + CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num, &output_tensors, + &output_num); + CVI_TENSOR *input = CVI_NN_GetTensorByName(CVI_NN_DEFAULT_TENSOR, input_tensors, input_num); + assert(input); + CVI_TENSOR *output = CVI_NN_GetTensorByName(CVI_NN_DEFAULT_TENSOR, output_tensors, output_num); + assert(output); + CVI_SHAPE shape = CVI_NN_TensorShape(input); + + // nchw + int32_t height = shape.dim[2]; + int32_t width = shape.dim[3]; + + 
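+  // (Note: unlike the int8 classifier sample, this bf16 model takes fp32
+  // input data, so no qscale quantization step is needed below.)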
// imread + cv::Mat image; + image = cv::imread(argv[2]); + if (!image.data) { + printf("Could not open or find the image\n"); + return -1; + } + + // resize + cv::resize(image, image, cv::Size(IMG_RESIZE_DIMS)); // linear is default + // crop + cv::Size size = cv::Size(height, width); + cv::Rect crop(cv::Point(0.5 * (image.cols - size.width), + 0.5 * (image.rows - size.height)), size); + image = image(crop); + + // split + cv::Mat channels[3]; + for (int i = 0; i < 3; i++) { + channels[i] = cv::Mat(height, width, CV_8UC1); + } + cv::split(image, channels); + // normalize + float mean[] = {BGR_MEAN}; + for (int i = 0; i < 3; ++i) { + channels[i].convertTo(channels[i], CV_32FC1, INPUT_SCALE, + -1 * mean[i] * INPUT_SCALE); + } + + // fill to input tensor + float *ptr = (float *)CVI_NN_TensorPtr(input); + int channel_size = height * width; + for (int i = 0; i < 3; ++i) { + memcpy(ptr + i * channel_size, channels[i].data, channel_size*sizeof(float)); + } + + // run inference + CVI_NN_Forward(model, input_tensors, input_num, output_tensors, output_num); + printf("CVI_NN_Forward succeeded\n"); + + // output result + std::vector labels; + std::ifstream file(argv[3]); + if (!file) { + printf("Didn't find synset_words file\n"); + exit(1); + } else { + std::string line; + while (std::getline(file, line)) { + labels.push_back(std::string(line)); + } + } + + int32_t top_num = 5; + float *prob = (float *)CVI_NN_TensorPtr(output); + int32_t count = CVI_NN_TensorCount(output); + + int32_t top_k_idx[top_num] = {-1}; + float top_k[top_num] = {0}; + + // find top-k prob and cls + for (int32_t i = 0; i < count; ++i) { + for (int32_t k = 0; k < top_num; ++k) { + if (prob[i] > top_k[k]) { + top_k[k] = prob[i]; + top_k_idx[k] = i; + break; + } + } + } + // sort + for (int32_t i = 0; i < top_num - 1; ++i) { + for (int32_t j = 0; j < top_num - 1 - i; ++j) { + if (top_k[j] < top_k[j + 1]) { + std::swap(top_k[j], top_k[j + 1]); + std::swap(top_k_idx[j], top_k_idx[j + 1]); + } + } + } + // show results. 
+  printf("------\n");
+  for (int32_t i = 0; i < top_num; i++) {
+    printf(" %f, idx %d", top_k[i], top_k_idx[i]);
+    if (!labels.empty())
+      printf(", %s", labels[top_k_idx[i]].c_str());
+    printf("\n");
+  }
+  printf("------\n");
+  CVI_NN_CleanupModel(model);
+  printf("CVI_NN_CleanupModel succeeded\n");
+  return 0;
+}
\ No newline at end of file
diff --git a/cviruntime/samples/classifier_fused_preprocess/CMakeLists.txt b/cviruntime/samples/classifier_fused_preprocess/CMakeLists.txt
new file mode 100644
index 000000000..10e3b7cdc
--- /dev/null
+++ b/cviruntime/samples/classifier_fused_preprocess/CMakeLists.txt
@@ -0,0 +1,40 @@
+cmake_minimum_required(VERSION 2.8.0)
+
+project(cvi_sample_classifier_fused_preprocess C CXX)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS ON)
+
+if(NOT DEFINED TPU_SDK_PATH)
+  message(FATAL_ERROR "Please set TPU_SDK_PATH to point to the TPU_SDK installation")
+endif()
+include_directories(${TPU_SDK_PATH}/include)
+link_directories(${TPU_SDK_PATH}/lib)
+
+if(NOT DEFINED OPENCV_PATH)
+  message(FATAL_ERROR "Please set OPENCV_PATH to point to the opencv installation")
+endif()
+include_directories(${OPENCV_PATH}/include)
+link_directories(${OPENCV_PATH}/lib)
+
+set(CVI_LIBS ${CVI_LIBS} cviruntime cvikernel)
+if(NOT CMAKE_CROSSCOMPILING)
+  set(CVI_LIBS ${CVI_LIBS} cvicmodel)
+endif()
+
+set(OPENCV_LIBS ${OPENCV_LIBS} opencv_core opencv_imgcodecs opencv_imgproc)
+if(NOT CMAKE_CROSSCOMPILING)
+  set(OPENCV_LIBS ${OPENCV_LIBS} opencv_highgui)
+endif()
+
+set(EXTRA_LIBS ${EXTRA_LIBS} dl stdc++ pthread z)
+
+add_executable(cvi_sample_classifier_fused_preprocess
+    classifier_fused_preprocess.cpp)
+target_link_libraries(cvi_sample_classifier_fused_preprocess
+    ${CVI_LIBS}
+    ${OPENCV_LIBS}
+    ${EXTRA_LIBS})
+install(TARGETS cvi_sample_classifier_fused_preprocess
+    DESTINATION bin)
diff --git a/cviruntime/samples/classifier_fused_preprocess/README.md b/cviruntime/samples/classifier_fused_preprocess/README.md
new file mode 100644
index 000000000..cabfc00d0
--- /dev/null
+++ b/cviruntime/samples/classifier_fused_preprocess/README.md
@@ -0,0 +1,119 @@
+# Mobilev2 Sample with fused preprocess and quant to int8
+
+### Download the mobilev2 model and convert it under docker (optional)
+The model can be cloned from: https://github.com/shicai/MobileNet-Caffe
+
+#### For new toolchain guide
+The following documents are required:
+* tpu-mlir_xxxx.tar.gz (The release package of tpu-mlir)
+
+Transform cvimodel shell:
+``` shell
+tar zxf tpu-mlir_xxxx.tar.gz
+source tpu-mlir_xxxx/envsetup.sh
+
+mkdir workspace && cd workspace
+cp $TPUC_ROOT/regression/image/cat.jpg .
+cp -rf $TPUC_ROOT/regression/dataset/ILSVRC2012 .
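+
+# Note: compared with the plain int8 flow, model_deploy.py below adds
+# --fuse_preprocess and --customization_format BGR_PLANAR, folding the
+# mean/scale preprocessing into the model so the application only feeds raw
+# BGR-planar pixels (see classifier_fused_preprocess.cpp).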
+model_transform.py \
+--model_name mobilenet_v2 \
+--model_def ./mobilenet_v2_deploy.prototxt \
+--model_data ./mobilenet_v2.caffemodel \
+--test_input ./cat.jpg \
+--test_result mobilenet_v2_top_output.npz \
+--input_shapes [[1,3,224,224]] \
+--resize_dims 256,256 \
+--mean 103.94,116.78,123.68 \
+--scale 0.017,0.017,0.017 \
+--pixel_format "bgr" \
+--tolerance 0.99,0.99 \
+--excepts prob \
+--mlir mobilenet_v2.mlir
+
+run_calibration.py \
+mobilenet_v2.mlir \
+--dataset=./ILSVRC2012 \
+--input_num=100 \
+-o mobilenet_v2_calibration_table
+
+model_deploy.py \
+--mlir mobilenet_v2.mlir \
+--calibration_table mobilenet_v2_calibration_table \
+--chip cv183x \
+--quantize INT8 \
+--quant_input \
+--customization_format BGR_PLANAR \
+--test_input ./cat.jpg \
+--test_reference mobilenet_v2_top_output.npz \
+--fuse_preprocess \
+--excepts prob \
+--tolerance 0.9,0.6 \
+--model mobilenet_v2_fused_preprocess.cvimodel
+```
+
+
+
+#### For the old toolchain
+The following file is required:
+
+* cvitek_mlir_ubuntu-18.04.tar.gz
+
+Shell commands to transform the model:
+``` shell
+tar zxf cvitek_mlir_ubuntu-18.04.tar.gz
+source cvitek_mlir/cvitek_envs.sh
+
+mkdir workspace && cd workspace
+cp $MLIR_PATH/tpuc/regression/data/cat.jpg .
+cp -rf $MLIR_PATH/tpuc/regression/data/images .
+
+model_transform.py \
+--model_type caffe \
+--model_name mobilenet_v2 \
+--model_def ./mobilenet_v2_deploy.prototxt \
+--model_data ./mobilenet_v2.caffemodel \
+--image ./cat.jpg \
+--image_resize_dims 256,256 \
+--net_input_dims 224,224 \
+--mean 103.94,116.78,123.68 \
+--input_scale 0.017 \
+--model_channel_order "bgr" \
+--tolerance 0.99,0.99,0.99 \
+--excepts prob \
+--mlir mobilenet_v2_fp32.mlir
+
+run_calibration.py \
+mobilenet_v2_fp32.mlir \
+--dataset=./images \
+--input_num=100 \
+-o mobilenet_v2_calibration_table
+
+model_deploy.py \
+--model_name mobilenet_v2 \
+--mlir mobilenet_v2_fp32.mlir \
+--calibration_table mobilenet_v2_calibration_table \
+--fuse_preprocess \
+--pixel_format BGR_PLANAR \
+--aligned_input false \
+--chip cv183x \
+--quantize INT8 \
+--image cat.jpg \
+--tolerance 0.9,0.9,0.6 \
+--correctness 0.95,0.95,0.9 \
+--cvimodel mobilenet_v2_fused_preprocess.cvimodel
+```
+
+Copy the generated mobilenet_v2_fused_preprocess.cvimodel to the development board.
+
+## How to compile the sample in Docker
+See the top-level README.md or cvitek_tpu_quick_start_guide.md.
+
+## Run the sample on the EVB board
+```
+cd install_samples
+./bin/cvi_sample_classifier_fused_preprocess \
+./mobilenet_v2_fused_preprocess.cvimodel \
+./data/cat.jpg \
+./data/synset_words.txt
+```
diff --git a/cviruntime/samples/classifier_fused_preprocess/classifier_fused_preprocess.cpp b/cviruntime/samples/classifier_fused_preprocess/classifier_fused_preprocess.cpp
new file mode 100644
index 000000000..f8859a236
--- /dev/null
+++ b/cviruntime/samples/classifier_fused_preprocess/classifier_fused_preprocess.cpp
@@ -0,0 +1,113 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <fstream>
+#include <vector>
+#include <numeric>
+#include <algorithm>
+#include <opencv2/opencv.hpp>
+#include "cviruntime.h"
+
+
+#define IMG_RESIZE_DIMS 256
+
+static void usage(char **argv) {
+  printf("Usage:\n");
+  printf("   %s cvimodel image.jpg label_file\n", argv[0]);
+}
+
+int main(int argc, char **argv) {
+  if (argc != 4) {
+    usage(argv);
+    exit(-1);
+  }
+
+  // load model file
+  const char *model_file = argv[1];
+  CVI_MODEL_HANDLE model = nullptr;
+  int ret = CVI_NN_RegisterModel(model_file, &model);
+  if (CVI_RC_SUCCESS != ret) {
+    printf("CVI_NN_RegisterModel failed, err %d\n", ret);
+    exit(1);
+  }
+  printf("CVI_NN_RegisterModel succeeded\n");
+
+  // get input output tensors
+  CVI_TENSOR *input_tensors;
+  CVI_TENSOR *output_tensors;
+  int32_t input_num;
+  int32_t output_num;
+  CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num, &output_tensors,
+                               &output_num);
+  CVI_TENSOR *input = CVI_NN_GetTensorByName(CVI_NN_DEFAULT_TENSOR, input_tensors, input_num);
+  assert(input);
+  CVI_TENSOR *output = CVI_NN_GetTensorByName(CVI_NN_DEFAULT_TENSOR, output_tensors, output_num);
+  assert(output);
+
+  // nchw
+  CVI_SHAPE shape = CVI_NN_TensorShape(input);
+  int32_t height = shape.dim[2];
+  int32_t width = shape.dim[3];
+
+  // imread
+  cv::Mat image;
+  image = cv::imread(argv[2]);
+  if (!image.data) {
+    printf("Could not open or find the image\n");
+    return -1;
+  }
+
+  // resize
+  cv::resize(image, image, cv::Size(IMG_RESIZE_DIMS, IMG_RESIZE_DIMS)); // linear is default
+
+  // Packed2Planar (cv::split reallocates the planes as CV_8UC1)
+  cv::Mat channels[3];
+  for (int i = 0; i < 3; i++) {
+    channels[i] = cv::Mat(image.rows, image.cols, CV_8UC1);
+  }
+  cv::split(image, channels);
+
+  int8_t *ptr = (int8_t *)CVI_NN_TensorPtr(input);
+  int channel_size = height * width;
+  for (int i = 0; i < 3; ++i) {
+    memcpy(ptr + i * channel_size, channels[i].data, channel_size);
+  }
+
+  // run inference
+  CVI_NN_Forward(model, input_tensors, input_num, output_tensors, output_num);
+  printf("CVI_NN_Forward succeeded\n");
+
+  std::vector<std::string> labels;
+  std::ifstream file(argv[3]);
+  if (!file) {
+    printf("Didn't find synset_words file\n");
+    exit(1);
+  } else {
+    std::string line;
+    while (std::getline(file, line)) {
+      labels.push_back(std::string(line));
+    }
+  }
+
+  const int32_t top_num = 5;
+  float *prob = (float *)CVI_NN_TensorPtr(output);
+  int32_t count = CVI_NN_TensorCount(output);
+  // find top-k prob and cls
+  std::vector<size_t> idx(count);
+  std::iota(idx.begin(), idx.end(), 0);
+  std::sort(idx.begin(), idx.end(),
+            [&prob](size_t idx_0, size_t idx_1) { return prob[idx_0] > prob[idx_1]; });
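+  // std::sort orders all `count` indices; std::partial_sort(idx.begin(),
+  // idx.begin() + top_num, idx.end(), <same comparator>) would stop after the
+  // first top_num entries and is cheaper when the class count is large.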
+ printf("------\n"); + for (size_t i = 0; i < top_num; i++) { + int top_k_idx = idx[i]; + printf(" %f, idx %d", prob[top_k_idx], top_k_idx); + if (!labels.empty()) + printf(", %s", labels[top_k_idx].c_str()); + printf("\n"); + } + printf("------\n"); + CVI_NN_CleanupModel(model); + printf("CVI_NN_CleanupModel succeeded\n"); + return 0; +} \ No newline at end of file diff --git a/cviruntime/samples/classifier_multi_batch/CMakeLists.txt b/cviruntime/samples/classifier_multi_batch/CMakeLists.txt new file mode 100644 index 000000000..110d577b0 --- /dev/null +++ b/cviruntime/samples/classifier_multi_batch/CMakeLists.txt @@ -0,0 +1,40 @@ +cmake_minimum_required(VERSION 2.8.0) + +project(cvi_sample_classifier_multi_batch C CXX) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS ON) + +if(NOT DEFINED TPU_SDK_PATH) + message(FATAL_ERROR "Please set TPU_SDK_PATH to point to the TPU_SDK installation") +endif() +include_directories(${TPU_SDK_PATH}/include) +link_directories(${TPU_SDK_PATH}/lib) + +if(NOT DEFINED OPENCV_PATH) + message(FATAL_ERROR "Please set OPENCV_PATH to point to the opencvn installation") +endif() +include_directories(${OPENCV_PATH}/include) +link_directories(${OPENCV_PATH}/lib) + +set(CVI_LIBS ${CVI_LIBS} cviruntime cvikernel) +if(NOT CMAKE_CROSSCOMPILING) + set(CVI_LIBS ${CVI_LIBS} cvicmodel) +endif() + +set(OPENCV_LIBS ${OPENCV_LIBS} opencv_core opencv_imgcodecs opencv_imgproc) +if(NOT CMAKE_CROSSCOMPILING) + set(OPENCV_LIBS ${OPENCV_LIBS} opencv_highgui) +endif() + +set(EXTRA_LIBS ${EXTRA_LIBS} dl stdc++ pthread z) + +add_executable(cvi_sample_classifier_multi_batch + classifier_multi_batch.cpp) +target_link_libraries(cvi_sample_classifier_multi_batch + ${CVI_LIBS} + ${OPENCV_LIBS} + ${EXTRA_LIBS}) +install(TARGETS cvi_sample_classifier_multi_batch + cvi_sample_classifier_multi_batch DESTINATION bin) diff --git a/cviruntime/samples/classifier_multi_batch/README.md b/cviruntime/samples/classifier_multi_batch/README.md new file mode 100644 index 000000000..b534fd1c1 --- /dev/null +++ b/cviruntime/samples/classifier_multi_batch/README.md @@ -0,0 +1,180 @@ +# Sample with Multiple batch fuse cvimodel + +### Download the mobilev2 model and convert the model under docker (optional) +Model could clone from:https://github.com/shicai/MobileNet-Caffe + +#### For new toolchain guide +The following documents are required: +* tpu-mlir_xxxx.tar.gz (The release package of tpu-mlir) + +Transform cvimodel shell: +``` shell +tar zxf tpu-mlir_xxxx.tar.gz +source tpu-mlir_xxxx/envsetup.sh + +mkdir workspace && cd workspace +cp $TPUC_ROOT/regression/image/cat.jpg . +cp -rf $TPUC_ROOT/regression/dataset/ILSVRC2012 . 
+model_transform.py \
+--model_name mobilenet_v2 \
+--model_def ./mobilenet_v2_deploy.prototxt \
+--model_data ./mobilenet_v2.caffemodel \
+--test_input ./cat.jpg \
+--test_result mobilenet_v2_top_output.npz \
+--input_shapes [[1,3,224,224]] \
+--resize_dims 256,256 \
+--mean 103.94,116.78,123.68 \
+--scale 0.017,0.017,0.017 \
+--pixel_format "bgr" \
+--tolerance 0.99,0.99 \
+--excepts prob \
+--mlir mobilenet_v2.mlir
+
+run_calibration.py \
+mobilenet_v2.mlir \
+--dataset=./ILSVRC2012 \
+--input_num=100 \
+-o mobilenet_v2_calibration_table
+
+model_deploy.py \
+--mlir mobilenet_v2.mlir \
+--calibration_table mobilenet_v2_calibration_table \
+--chip cv183x \
+--quantize INT8 \
+--quant_input \
+--test_input mobilenet_v2_in_f32.npz \
+--test_reference mobilenet_v2_top_output.npz \
+--excepts prob \
+--tolerance 0.9,0.6 \
+--model tmp_model_bs1.cvimodel
+
+
+model_transform.py \
+--model_name mobilenet_v2 \
+--model_def ./mobilenet_v2_deploy.prototxt \
+--model_data ./mobilenet_v2.caffemodel \
+--test_input ./cat.jpg \
+--test_result mobilenet_v2_top_output.npz \
+--input_shapes [[4,3,224,224]] \
+--resize_dims 256,256 \
+--mean 103.94,116.78,123.68 \
+--scale 0.017,0.017,0.017 \
+--pixel_format "bgr" \
+--tolerance 0.99,0.99 \
+--excepts prob \
+--mlir mobilenet_v2.mlir
+
+model_deploy.py \
+--mlir mobilenet_v2.mlir \
+--calibration_table mobilenet_v2_calibration_table \
+--chip cv183x \
+--quantize INT8 \
+--quant_input \
+--test_input mobilenet_v2_in_f32.npz \
+--test_reference mobilenet_v2_top_output.npz \
+--excepts prob \
+--tolerance 0.9,0.6 \
+--model tmp_model_bs4.cvimodel
+
+model_tool --combine tmp_model_bs1.cvimodel tmp_model_bs4.cvimodel -o mobilenet_v2_bs1_bs4.cvimodel
+
+```
+
+
+
+#### For the old toolchain
+The following file is required:
+
+* cvitek_mlir_ubuntu-18.04.tar.gz
+
+Shell commands to transform the model:
+``` shell
+tar zxf cvitek_mlir_ubuntu-18.04.tar.gz
+source cvitek_mlir/cvitek_envs.sh
+
+mkdir workspace && cd workspace
+cp $MLIR_PATH/tpuc/regression/data/cat.jpg .
+cp -rf $MLIR_PATH/tpuc/regression/data/images .
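+
+# --merge_weight makes the two builds share weight memory, so the merged
+# cvimodel carries a single copy of the weights.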
+model_transform.py \
+--model_type caffe \
+--model_name mobilenet_v2 \
+--model_def mobilenet_v2_deploy.prototxt \
+--model_data mobilenet_v2.caffemodel \
+--image cat.jpg \
+--image_resize_dims 256,256 \
+--net_input_dims 224,224 \
+--mean 103.94,116.78,123.68 \
+--input_scale 0.017 \
+--model_channel_order bgr \
+--tolerance 0.999,0.999,0.998 \
+--excepts prob \
+--mlir mobilenet_v2_fp32.mlir
+
+run_calibration.py \
+mobilenet_v2_fp32.mlir \
+--dataset=./images \
+--input_num=100 \
+-o mobilenet_v2_calibration_table
+
+model_deploy.py \
+--model_name mobilenet_v2 \
+--mlir mobilenet_v2_fp32.mlir \
+--calibration_table mobilenet_v2_calibration_table \
+--chip cv183x \
+--image cat.jpg \
+--merge_weight \
+--tolerance 0.94,0.94,0.66 \
+--excepts prob \
+--correctness 0.99,0.99,0.99 \
+--cvimodel tmp_model_bs1.cvimodel
+
+model_transform.py \
+--model_type caffe \
+--model_name mobilenet_v2 \
+--model_def mobilenet_v2_deploy.prototxt \
+--model_data mobilenet_v2.caffemodel \
+--image cat.jpg \
+--image_resize_dims 256,256 \
+--net_input_dims 224,224 \
+--mean 103.94,116.78,123.68 \
+--input_scale 0.017 \
+--model_channel_order bgr \
+--batch_size 4 \
+--tolerance 0.999,0.999,0.998 \
+--excepts prob \
+--mlir mobilenet_v2_fp32.mlir
+
+model_deploy.py \
+--model_name mobilenet_v2 \
+--mlir mobilenet_v2_fp32.mlir \
+--calibration_table mobilenet_v2_calibration_table \
+--chip cv183x \
+--image cat.jpg \
+--merge_weight \
+--tolerance 0.94,0.94,0.66 \
+--excepts prob \
+--correctness 0.99,0.99,0.99 \
+--cvimodel tmp_model_bs4.cvimodel
+
+cvimodel_tool \
+-a merge \
+-i tmp_model_bs1.cvimodel tmp_model_bs4.cvimodel \
+-o mobilenet_v2_bs1_bs4.cvimodel
+```
+
+Copy the generated mobilenet_v2_bs1_bs4.cvimodel to the development board.
+
+
+## How to compile the sample in Docker
+See the top-level README.md or cvitek_tpu_quick_start_guide.md.
+
+## Run the sample on the EVB board
+```
+cd install_samples
+./bin/cvi_sample_classifier_multi_batch \
+./mobilenet_v2_bs1_bs4.cvimodel \
+./data/cat.jpg \
+./data/synset_words.txt
+```
diff --git a/cviruntime/samples/classifier_multi_batch/classifier_multi_batch.cpp b/cviruntime/samples/classifier_multi_batch/classifier_multi_batch.cpp
new file mode 100644
index 000000000..05dfd0228
--- /dev/null
+++ b/cviruntime/samples/classifier_multi_batch/classifier_multi_batch.cpp
@@ -0,0 +1,211 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <fstream>
+#include <vector>
+#include <numeric>
+#include <algorithm>
+#include <opencv2/opencv.hpp>
+#include "cviruntime.h"
+
+#define IMG_RESIZE_DIMS 256,256
+#define BGR_MEAN 103.94,116.78,123.68
+#define INPUT_SCALE 0.017
+
+static void usage(char **argv) {
+  printf("Usage:\n");
+  printf("   %s cvimodel image.jpg label_file\n", argv[0]);
+}
+
+static int get_input_output_tensors(CVI_MODEL_HANDLE &model, CVI_TENSOR **input,
+                                    CVI_TENSOR **output, CVI_TENSOR **input_tensors,
+                                    CVI_TENSOR **output_tensors, int32_t &input_num,
+                                    int32_t &output_num, CVI_SHAPE &shape, int32_t &batch,
+                                    float &qscale) {
+  int program_id = 0;
+  if (batch == 1) {
+    program_id = 0;
+  } else if (batch == 4) {
+    program_id = 1;
+  } else {
+    printf("unsupported batch size\n");
+    exit(1);
+  }
+  CVI_NN_SetConfig(model, OPTION_PROGRAM_INDEX, program_id);
+
+  // get input output tensors
+  CVI_NN_GetInputOutputTensors(model, input_tensors, &input_num, output_tensors,
+                               &output_num);
+  *input = CVI_NN_GetTensorByName(CVI_NN_DEFAULT_TENSOR, *input_tensors, input_num);
+  assert(*input);
+  printf("input, name:%s\n", (*input)->name);
+  *output = CVI_NN_GetTensorByName(CVI_NN_DEFAULT_TENSOR, *output_tensors, output_num);
+  assert(*output);
+
+  qscale = CVI_NN_TensorQuantScale(*input);
+  printf("qscale:%f\n", qscale);
+  shape = CVI_NN_TensorShape(*input);
+
+  if (shape.dim[0] != batch) {
+    printf("ERROR : Program id %d is batch %d not batch %d\n", program_id, shape.dim[0], batch);
+    exit(1);
+  }
+
+  return 0;
+}
+
+static void post_process(int top_num, int batch, CVI_TENSOR *output,
+                         std::vector<std::string> &labels) {
+  float *batch_prob = (float *)CVI_NN_TensorPtr(output);
+  int32_t count = CVI_NN_TensorCount(output) / batch;
+
+  for (int b = 0; b < batch; ++b) {
+    // find top-k prob and cls
+    float *prob = batch_prob + b * count;
+    std::vector<size_t> idx(count);
+    std::iota(idx.begin(), idx.end(), 0);
+    std::sort(idx.begin(), idx.end(),
+              [&prob](size_t idx_0, size_t idx_1) { return prob[idx_0] > prob[idx_1]; });
+    // show results.
+    printf("--batch %d----\n", b);
+    for (int i = 0; i < top_num; i++) {
+      int top_k_idx = idx[i];
+      printf(" %f, idx %d", prob[top_k_idx], top_k_idx);
+      if (!labels.empty())
+        printf(", %s", labels[top_k_idx].c_str());
+      printf("\n");
+    }
+    printf("------\n");
+  }
+}
+
+int main(int argc, char **argv) {
+  if (argc != 4) {
+    usage(argv);
+    exit(-1);
+  }
+
+  // load model file
+  const char *model_file = argv[1];
+  CVI_MODEL_HANDLE model = nullptr;
+  int ret = CVI_NN_RegisterModel(model_file, &model);
+  if (CVI_RC_SUCCESS != ret) {
+    printf("CVI_NN_RegisterModel failed, err %d\n", ret);
+    exit(1);
+  }
+  printf("CVI_NN_RegisterModel succeeded\n");
+
+  // batch 1
+  // get input output tensors
+  CVI_TENSOR *input;
+  CVI_TENSOR *output;
+  CVI_TENSOR *input_tensors;
+  CVI_TENSOR *output_tensors;
+  int32_t input_num;
+  int32_t output_num;
+  CVI_SHAPE shape;
+  float qscale;
+
+  int32_t batch = 1;
+  get_input_output_tensors(model, &input, &output, &input_tensors, &output_tensors,
+                           input_num, output_num, shape, batch, qscale);
+
+  // nchw
+  int32_t height = shape.dim[2];
+  int32_t width = shape.dim[3];
+
+  // imread
+  cv::Mat image;
+  image = cv::imread(argv[2]);
+  if (!image.data) {
+    printf("Could not open or find the image\n");
+    return -1;
+  }
+
+  // resize
+  cv::resize(image, image, cv::Size(IMG_RESIZE_DIMS)); // linear is default
+  // crop (note: cv::Size takes width first, then height)
+  cv::Size size = cv::Size(width, height);
+  cv::Rect crop(cv::Point(0.5 * (image.cols - size.width),
+                          0.5 * (image.rows - size.height)), size);
+  image = image(crop);
+  // split
+  cv::Mat channels[3];
+  for (int i = 0; i < 3; i++) {
+    channels[i] = cv::Mat(height, width, CV_8SC1);
+  }
+  cv::split(image, channels);
+  // normalize and quantize (fold the runtime's qscale into the conversion)
+  float mean[] = {BGR_MEAN};
+  for (int i = 0; i < 3; ++i) {
+    channels[i].convertTo(channels[i], CV_8SC1, INPUT_SCALE * qscale,
+                          -1 * mean[i] * INPUT_SCALE * qscale);
+  }
+
+  // fill to input tensor
+  int8_t *ptr = (int8_t *)CVI_NN_TensorPtr(input);
+  int channel_size = height * width;
+  for (int i = 0; i < 3; ++i) {
+    memcpy(ptr + i * channel_size, channels[i].data, channel_size);
+  }
+
+  // run inference
+  CVI_NN_Forward(model, input_tensors, input_num, output_tensors, output_num);
+  printf("CVI_NN_Forward succeeded\n");
+
+  // output result
+  std::vector<std::string> labels;
+  std::ifstream file(argv[3]);
+  if (!file) {
+    printf("Didn't find synset_words file\n");
+    exit(1);
+  } else {
+    std::string line;
+    while (std::getline(file, line)) {
+      labels.push_back(std::string(line));
+    }
+  }
+
+  int32_t top_num = 5;
+  post_process(top_num, batch, output, labels);
+
+  // batch 4
+  CVI_MODEL_HANDLE batch4_model = nullptr;
+  ret = CVI_NN_CloneModel(model, &batch4_model);
+  if (CVI_RC_SUCCESS != ret) {
+    printf("CVI_NN_CloneModel failed, err %d\n", ret);
+    exit(1);
+  }
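+  // CVI_NN_CloneModel shares the model data already loaded by the first
+  // handle, so selecting the batch-4 program does not load the model a
+  // second time.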
printf("CVI_NN_CloneModel succeeded\n"); + + // get input output tensors + batch = 4; + get_input_output_tensors(batch4_model, &input, &output, &input_tensors, &output_tensors, + input_num, output_num, shape, batch, qscale); + + // nchw + height = shape.dim[2]; + width = shape.dim[3]; + + // fill to input tensor + ptr = (int8_t *)CVI_NN_TensorPtr(input); + channel_size = height * width; + int batch_size = height * width * 3; + for (int b = 0; b < batch; ++b) { + for (int i = 0; i < 3; ++i) { + memcpy(ptr + i * channel_size + b * batch_size, channels[i].data, channel_size); + } + } + + // run inference + CVI_NN_Forward(batch4_model, input_tensors, input_num, output_tensors, output_num); + printf("CVI_NN_Forward succeeded\n"); + + // output result + top_num = 5; + post_process(top_num, batch, output, labels); + + CVI_NN_CleanupModel(model); + CVI_NN_CleanupModel(batch4_model); + printf("CVI_NN_CleanupModel succeeded\n"); + return 0; +} \ No newline at end of file diff --git a/cviruntime/samples/data/cat.jpg b/cviruntime/samples/data/cat.jpg new file mode 100644 index 000000000..b4efc6c98 Binary files /dev/null and b/cviruntime/samples/data/cat.jpg differ diff --git a/cviruntime/samples/data/synset_words.txt b/cviruntime/samples/data/synset_words.txt new file mode 100644 index 000000000..a9e8c7f50 --- /dev/null +++ b/cviruntime/samples/data/synset_words.txt @@ -0,0 +1,1000 @@ +n01440764 tench, Tinca tinca +n01443537 goldfish, Carassius auratus +n01484850 great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias +n01491361 tiger shark, Galeocerdo cuvieri +n01494475 hammerhead, hammerhead shark +n01496331 electric ray, crampfish, numbfish, torpedo +n01498041 stingray +n01514668 cock +n01514859 hen +n01518878 ostrich, Struthio camelus +n01530575 brambling, Fringilla montifringilla +n01531178 goldfinch, Carduelis carduelis +n01532829 house finch, linnet, Carpodacus mexicanus +n01534433 junco, snowbird +n01537544 indigo bunting, indigo finch, indigo bird, Passerina cyanea +n01558993 robin, American robin, Turdus migratorius +n01560419 bulbul +n01580077 jay +n01582220 magpie +n01592084 chickadee +n01601694 water ouzel, dipper +n01608432 kite +n01614925 bald eagle, American eagle, Haliaeetus leucocephalus +n01616318 vulture +n01622779 great grey owl, great gray owl, Strix nebulosa +n01629819 European fire salamander, Salamandra salamandra +n01630670 common newt, Triturus vulgaris +n01631663 eft +n01632458 spotted salamander, Ambystoma maculatum +n01632777 axolotl, mud puppy, Ambystoma mexicanum +n01641577 bullfrog, Rana catesbeiana +n01644373 tree frog, tree-frog +n01644900 tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui +n01664065 loggerhead, loggerhead turtle, Caretta caretta +n01665541 leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea +n01667114 mud turtle +n01667778 terrapin +n01669191 box turtle, box tortoise +n01675722 banded gecko +n01677366 common iguana, iguana, Iguana iguana +n01682714 American chameleon, anole, Anolis carolinensis +n01685808 whiptail, whiptail lizard +n01687978 agama +n01688243 frilled lizard, Chlamydosaurus kingi +n01689811 alligator lizard +n01692333 Gila monster, Heloderma suspectum +n01693334 green lizard, Lacerta viridis +n01694178 African chameleon, Chamaeleo chamaeleon +n01695060 Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis +n01697457 African crocodile, Nile crocodile, Crocodylus niloticus +n01698640 American alligator, Alligator mississipiensis +n01704323 triceratops 
+n01728572 thunder snake, worm snake, Carphophis amoenus +n01728920 ringneck snake, ring-necked snake, ring snake +n01729322 hognose snake, puff adder, sand viper +n01729977 green snake, grass snake +n01734418 king snake, kingsnake +n01735189 garter snake, grass snake +n01737021 water snake +n01739381 vine snake +n01740131 night snake, Hypsiglena torquata +n01742172 boa constrictor, Constrictor constrictor +n01744401 rock python, rock snake, Python sebae +n01748264 Indian cobra, Naja naja +n01749939 green mamba +n01751748 sea snake +n01753488 horned viper, cerastes, sand viper, horned asp, Cerastes cornutus +n01755581 diamondback, diamondback rattlesnake, Crotalus adamanteus +n01756291 sidewinder, horned rattlesnake, Crotalus cerastes +n01768244 trilobite +n01770081 harvestman, daddy longlegs, Phalangium opilio +n01770393 scorpion +n01773157 black and gold garden spider, Argiope aurantia +n01773549 barn spider, Araneus cavaticus +n01773797 garden spider, Aranea diademata +n01774384 black widow, Latrodectus mactans +n01774750 tarantula +n01775062 wolf spider, hunting spider +n01776313 tick +n01784675 centipede +n01795545 black grouse +n01796340 ptarmigan +n01797886 ruffed grouse, partridge, Bonasa umbellus +n01798484 prairie chicken, prairie grouse, prairie fowl +n01806143 peacock +n01806567 quail +n01807496 partridge +n01817953 African grey, African gray, Psittacus erithacus +n01818515 macaw +n01819313 sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita +n01820546 lorikeet +n01824575 coucal +n01828970 bee eater +n01829413 hornbill +n01833805 hummingbird +n01843065 jacamar +n01843383 toucan +n01847000 drake +n01855032 red-breasted merganser, Mergus serrator +n01855672 goose +n01860187 black swan, Cygnus atratus +n01871265 tusker +n01872401 echidna, spiny anteater, anteater +n01873310 platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus +n01877812 wallaby, brush kangaroo +n01882714 koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus +n01883070 wombat +n01910747 jellyfish +n01914609 sea anemone, anemone +n01917289 brain coral +n01924916 flatworm, platyhelminth +n01930112 nematode, nematode worm, roundworm +n01943899 conch +n01944390 snail +n01945685 slug +n01950731 sea slug, nudibranch +n01955084 chiton, coat-of-mail shell, sea cradle, polyplacophore +n01968897 chambered nautilus, pearly nautilus, nautilus +n01978287 Dungeness crab, Cancer magister +n01978455 rock crab, Cancer irroratus +n01980166 fiddler crab +n01981276 king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica +n01983481 American lobster, Northern lobster, Maine lobster, Homarus americanus +n01984695 spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish +n01985128 crayfish, crawfish, crawdad, crawdaddy +n01986214 hermit crab +n01990800 isopod +n02002556 white stork, Ciconia ciconia +n02002724 black stork, Ciconia nigra +n02006656 spoonbill +n02007558 flamingo +n02009229 little blue heron, Egretta caerulea +n02009912 American egret, great white heron, Egretta albus +n02011460 bittern +n02012849 crane +n02013706 limpkin, Aramus pictus +n02017213 European gallinule, Porphyrio porphyrio +n02018207 American coot, marsh hen, mud hen, water hen, Fulica americana +n02018795 bustard +n02025239 ruddy turnstone, Arenaria interpres +n02027492 red-backed sandpiper, dunlin, Erolia alpina +n02028035 redshank, Tringa totanus +n02033041 dowitcher +n02037110 oystercatcher, oyster catcher +n02051845 pelican +n02056570 king penguin, 
Aptenodytes patagonica +n02058221 albatross, mollymawk +n02066245 grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus +n02071294 killer whale, killer, orca, grampus, sea wolf, Orcinus orca +n02074367 dugong, Dugong dugon +n02077923 sea lion +n02085620 Chihuahua +n02085782 Japanese spaniel +n02085936 Maltese dog, Maltese terrier, Maltese +n02086079 Pekinese, Pekingese, Peke +n02086240 Shih-Tzu +n02086646 Blenheim spaniel +n02086910 papillon +n02087046 toy terrier +n02087394 Rhodesian ridgeback +n02088094 Afghan hound, Afghan +n02088238 basset, basset hound +n02088364 beagle +n02088466 bloodhound, sleuthhound +n02088632 bluetick +n02089078 black-and-tan coonhound +n02089867 Walker hound, Walker foxhound +n02089973 English foxhound +n02090379 redbone +n02090622 borzoi, Russian wolfhound +n02090721 Irish wolfhound +n02091032 Italian greyhound +n02091134 whippet +n02091244 Ibizan hound, Ibizan Podenco +n02091467 Norwegian elkhound, elkhound +n02091635 otterhound, otter hound +n02091831 Saluki, gazelle hound +n02092002 Scottish deerhound, deerhound +n02092339 Weimaraner +n02093256 Staffordshire bullterrier, Staffordshire bull terrier +n02093428 American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier +n02093647 Bedlington terrier +n02093754 Border terrier +n02093859 Kerry blue terrier +n02093991 Irish terrier +n02094114 Norfolk terrier +n02094258 Norwich terrier +n02094433 Yorkshire terrier +n02095314 wire-haired fox terrier +n02095570 Lakeland terrier +n02095889 Sealyham terrier, Sealyham +n02096051 Airedale, Airedale terrier +n02096177 cairn, cairn terrier +n02096294 Australian terrier +n02096437 Dandie Dinmont, Dandie Dinmont terrier +n02096585 Boston bull, Boston terrier +n02097047 miniature schnauzer +n02097130 giant schnauzer +n02097209 standard schnauzer +n02097298 Scotch terrier, Scottish terrier, Scottie +n02097474 Tibetan terrier, chrysanthemum dog +n02097658 silky terrier, Sydney silky +n02098105 soft-coated wheaten terrier +n02098286 West Highland white terrier +n02098413 Lhasa, Lhasa apso +n02099267 flat-coated retriever +n02099429 curly-coated retriever +n02099601 golden retriever +n02099712 Labrador retriever +n02099849 Chesapeake Bay retriever +n02100236 German short-haired pointer +n02100583 vizsla, Hungarian pointer +n02100735 English setter +n02100877 Irish setter, red setter +n02101006 Gordon setter +n02101388 Brittany spaniel +n02101556 clumber, clumber spaniel +n02102040 English springer, English springer spaniel +n02102177 Welsh springer spaniel +n02102318 cocker spaniel, English cocker spaniel, cocker +n02102480 Sussex spaniel +n02102973 Irish water spaniel +n02104029 kuvasz +n02104365 schipperke +n02105056 groenendael +n02105162 malinois +n02105251 briard +n02105412 kelpie +n02105505 komondor +n02105641 Old English sheepdog, bobtail +n02105855 Shetland sheepdog, Shetland sheep dog, Shetland +n02106030 collie +n02106166 Border collie +n02106382 Bouvier des Flandres, Bouviers des Flandres +n02106550 Rottweiler +n02106662 German shepherd, German shepherd dog, German police dog, alsatian +n02107142 Doberman, Doberman pinscher +n02107312 miniature pinscher +n02107574 Greater Swiss Mountain dog +n02107683 Bernese mountain dog +n02107908 Appenzeller +n02108000 EntleBucher +n02108089 boxer +n02108422 bull mastiff +n02108551 Tibetan mastiff +n02108915 French bulldog +n02109047 Great Dane +n02109525 Saint Bernard, St Bernard +n02109961 Eskimo dog, husky +n02110063 malamute, malemute, Alaskan malamute +n02110185 
Siberian husky +n02110341 dalmatian, coach dog, carriage dog +n02110627 affenpinscher, monkey pinscher, monkey dog +n02110806 basenji +n02110958 pug, pug-dog +n02111129 Leonberg +n02111277 Newfoundland, Newfoundland dog +n02111500 Great Pyrenees +n02111889 Samoyed, Samoyede +n02112018 Pomeranian +n02112137 chow, chow chow +n02112350 keeshond +n02112706 Brabancon griffon +n02113023 Pembroke, Pembroke Welsh corgi +n02113186 Cardigan, Cardigan Welsh corgi +n02113624 toy poodle +n02113712 miniature poodle +n02113799 standard poodle +n02113978 Mexican hairless +n02114367 timber wolf, grey wolf, gray wolf, Canis lupus +n02114548 white wolf, Arctic wolf, Canis lupus tundrarum +n02114712 red wolf, maned wolf, Canis rufus, Canis niger +n02114855 coyote, prairie wolf, brush wolf, Canis latrans +n02115641 dingo, warrigal, warragal, Canis dingo +n02115913 dhole, Cuon alpinus +n02116738 African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus +n02117135 hyena, hyaena +n02119022 red fox, Vulpes vulpes +n02119789 kit fox, Vulpes macrotis +n02120079 Arctic fox, white fox, Alopex lagopus +n02120505 grey fox, gray fox, Urocyon cinereoargenteus +n02123045 tabby, tabby cat +n02123159 tiger cat +n02123394 Persian cat +n02123597 Siamese cat, Siamese +n02124075 Egyptian cat +n02125311 cougar, puma, catamount, mountain lion, painter, panther, Felis concolor +n02127052 lynx, catamount +n02128385 leopard, Panthera pardus +n02128757 snow leopard, ounce, Panthera uncia +n02128925 jaguar, panther, Panthera onca, Felis onca +n02129165 lion, king of beasts, Panthera leo +n02129604 tiger, Panthera tigris +n02130308 cheetah, chetah, Acinonyx jubatus +n02132136 brown bear, bruin, Ursus arctos +n02133161 American black bear, black bear, Ursus americanus, Euarctos americanus +n02134084 ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus +n02134418 sloth bear, Melursus ursinus, Ursus ursinus +n02137549 mongoose +n02138441 meerkat, mierkat +n02165105 tiger beetle +n02165456 ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle +n02167151 ground beetle, carabid beetle +n02168699 long-horned beetle, longicorn, longicorn beetle +n02169497 leaf beetle, chrysomelid +n02172182 dung beetle +n02174001 rhinoceros beetle +n02177972 weevil +n02190166 fly +n02206856 bee +n02219486 ant, emmet, pismire +n02226429 grasshopper, hopper +n02229544 cricket +n02231487 walking stick, walkingstick, stick insect +n02233338 cockroach, roach +n02236044 mantis, mantid +n02256656 cicada, cicala +n02259212 leafhopper +n02264363 lacewing, lacewing fly +n02268443 dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk +n02268853 damselfly +n02276258 admiral +n02277742 ringlet, ringlet butterfly +n02279972 monarch, monarch butterfly, milkweed butterfly, Danaus plexippus +n02280649 cabbage butterfly +n02281406 sulphur butterfly, sulfur butterfly +n02281787 lycaenid, lycaenid butterfly +n02317335 starfish, sea star +n02319095 sea urchin +n02321529 sea cucumber, holothurian +n02325366 wood rabbit, cottontail, cottontail rabbit +n02326432 hare +n02328150 Angora, Angora rabbit +n02342885 hamster +n02346627 porcupine, hedgehog +n02356798 fox squirrel, eastern fox squirrel, Sciurus niger +n02361337 marmot +n02363005 beaver +n02364673 guinea pig, Cavia cobaya +n02389026 sorrel +n02391049 zebra +n02395406 hog, pig, grunter, squealer, Sus scrofa +n02396427 wild boar, boar, Sus scrofa +n02397096 warthog +n02398521 hippopotamus, hippo, river horse, Hippopotamus amphibius +n02403003 
ox +n02408429 water buffalo, water ox, Asiatic buffalo, Bubalus bubalis +n02410509 bison +n02412080 ram, tup +n02415577 bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis +n02417914 ibex, Capra ibex +n02422106 hartebeest +n02422699 impala, Aepyceros melampus +n02423022 gazelle +n02437312 Arabian camel, dromedary, Camelus dromedarius +n02437616 llama +n02441942 weasel +n02442845 mink +n02443114 polecat, fitch, foulmart, foumart, Mustela putorius +n02443484 black-footed ferret, ferret, Mustela nigripes +n02444819 otter +n02445715 skunk, polecat, wood pussy +n02447366 badger +n02454379 armadillo +n02457408 three-toed sloth, ai, Bradypus tridactylus +n02480495 orangutan, orang, orangutang, Pongo pygmaeus +n02480855 gorilla, Gorilla gorilla +n02481823 chimpanzee, chimp, Pan troglodytes +n02483362 gibbon, Hylobates lar +n02483708 siamang, Hylobates syndactylus, Symphalangus syndactylus +n02484975 guenon, guenon monkey +n02486261 patas, hussar monkey, Erythrocebus patas +n02486410 baboon +n02487347 macaque +n02488291 langur +n02488702 colobus, colobus monkey +n02489166 proboscis monkey, Nasalis larvatus +n02490219 marmoset +n02492035 capuchin, ringtail, Cebus capucinus +n02492660 howler monkey, howler +n02493509 titi, titi monkey +n02493793 spider monkey, Ateles geoffroyi +n02494079 squirrel monkey, Saimiri sciureus +n02497673 Madagascar cat, ring-tailed lemur, Lemur catta +n02500267 indri, indris, Indri indri, Indri brevicaudatus +n02504013 Indian elephant, Elephas maximus +n02504458 African elephant, Loxodonta africana +n02509815 lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens +n02510455 giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca +n02514041 barracouta, snoek +n02526121 eel +n02536864 coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch +n02606052 rock beauty, Holocanthus tricolor +n02607072 anemone fish +n02640242 sturgeon +n02641379 gar, garfish, garpike, billfish, Lepisosteus osseus +n02643566 lionfish +n02655020 puffer, pufferfish, blowfish, globefish +n02666196 abacus +n02667093 abaya +n02669723 academic gown, academic robe, judge's robe +n02672831 accordion, piano accordion, squeeze box +n02676566 acoustic guitar +n02687172 aircraft carrier, carrier, flattop, attack aircraft carrier +n02690373 airliner +n02692877 airship, dirigible +n02699494 altar +n02701002 ambulance +n02704792 amphibian, amphibious vehicle +n02708093 analog clock +n02727426 apiary, bee house +n02730930 apron +n02747177 ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin +n02749479 assault rifle, assault gun +n02769748 backpack, back pack, knapsack, packsack, rucksack, haversack +n02776631 bakery, bakeshop, bakehouse +n02777292 balance beam, beam +n02782093 balloon +n02783161 ballpoint, ballpoint pen, ballpen, Biro +n02786058 Band Aid +n02787622 banjo +n02788148 bannister, banister, balustrade, balusters, handrail +n02790996 barbell +n02791124 barber chair +n02791270 barbershop +n02793495 barn +n02794156 barometer +n02795169 barrel, cask +n02797295 barrow, garden cart, lawn cart, wheelbarrow +n02799071 baseball +n02802426 basketball +n02804414 bassinet +n02804610 bassoon +n02807133 bathing cap, swimming cap +n02808304 bath towel +n02808440 bathtub, bathing tub, bath, tub +n02814533 beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon +n02814860 beacon, lighthouse, beacon light, pharos +n02815834 beaker +n02817516 bearskin, busby, shako 
+n02823428 beer bottle +n02823750 beer glass +n02825657 bell cote, bell cot +n02834397 bib +n02835271 bicycle-built-for-two, tandem bicycle, tandem +n02837789 bikini, two-piece +n02840245 binder, ring-binder +n02841315 binoculars, field glasses, opera glasses +n02843684 birdhouse +n02859443 boathouse +n02860847 bobsled, bobsleigh, bob +n02865351 bolo tie, bolo, bola tie, bola +n02869837 bonnet, poke bonnet +n02870880 bookcase +n02871525 bookshop, bookstore, bookstall +n02877765 bottlecap +n02879718 bow +n02883205 bow tie, bow-tie, bowtie +n02892201 brass, memorial tablet, plaque +n02892767 brassiere, bra, bandeau +n02894605 breakwater, groin, groyne, mole, bulwark, seawall, jetty +n02895154 breastplate, aegis, egis +n02906734 broom +n02909870 bucket, pail +n02910353 buckle +n02916936 bulletproof vest +n02917067 bullet train, bullet +n02927161 butcher shop, meat market +n02930766 cab, hack, taxi, taxicab +n02939185 caldron, cauldron +n02948072 candle, taper, wax light +n02950826 cannon +n02951358 canoe +n02951585 can opener, tin opener +n02963159 cardigan +n02965783 car mirror +n02966193 carousel, carrousel, merry-go-round, roundabout, whirligig +n02966687 carpenter's kit, tool kit +n02971356 carton +n02974003 car wheel +n02977058 cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM +n02978881 cassette +n02979186 cassette player +n02980441 castle +n02981792 catamaran +n02988304 CD player +n02992211 cello, violoncello +n02992529 cellular telephone, cellular phone, cellphone, cell, mobile phone +n02999410 chain +n03000134 chainlink fence +n03000247 chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour +n03000684 chain saw, chainsaw +n03014705 chest +n03016953 chiffonier, commode +n03017168 chime, bell, gong +n03018349 china cabinet, china closet +n03026506 Christmas stocking +n03028079 church, church building +n03032252 cinema, movie theater, movie theatre, movie house, picture palace +n03041632 cleaver, meat cleaver, chopper +n03042490 cliff dwelling +n03045698 cloak +n03047690 clog, geta, patten, sabot +n03062245 cocktail shaker +n03063599 coffee mug +n03063689 coffeepot +n03065424 coil, spiral, volute, whorl, helix +n03075370 combination lock +n03085013 computer keyboard, keypad +n03089624 confectionery, confectionary, candy store +n03095699 container ship, containership, container vessel +n03100240 convertible +n03109150 corkscrew, bottle screw +n03110669 cornet, horn, trumpet, trump +n03124043 cowboy boot +n03124170 cowboy hat, ten-gallon hat +n03125729 cradle +n03126707 crane +n03127747 crash helmet +n03127925 crate +n03131574 crib, cot +n03133878 Crock Pot +n03134739 croquet ball +n03141823 crutch +n03146219 cuirass +n03160309 dam, dike, dyke +n03179701 desk +n03180011 desktop computer +n03187595 dial telephone, dial phone +n03188531 diaper, nappy, napkin +n03196217 digital clock +n03197337 digital watch +n03201208 dining table, board +n03207743 dishrag, dishcloth +n03207941 dishwasher, dish washer, dishwashing machine +n03208938 disk brake, disc brake +n03216828 dock, dockage, docking facility +n03218198 dogsled, dog sled, dog sleigh +n03220513 dome +n03223299 doormat, welcome mat +n03240683 drilling platform, offshore rig +n03249569 drum, membranophone, tympan +n03250847 drumstick +n03255030 dumbbell +n03259280 Dutch oven +n03271574 electric fan, blower +n03272010 electric guitar +n03272562 electric locomotive +n03290653 entertainment center +n03291819 envelope +n03297495 espresso maker 
+n03314780 face powder +n03325584 feather boa, boa +n03337140 file, file cabinet, filing cabinet +n03344393 fireboat +n03345487 fire engine, fire truck +n03347037 fire screen, fireguard +n03355925 flagpole, flagstaff +n03372029 flute, transverse flute +n03376595 folding chair +n03379051 football helmet +n03384352 forklift +n03388043 fountain +n03388183 fountain pen +n03388549 four-poster +n03393912 freight car +n03394916 French horn, horn +n03400231 frying pan, frypan, skillet +n03404251 fur coat +n03417042 garbage truck, dustcart +n03424325 gasmask, respirator, gas helmet +n03425413 gas pump, gasoline pump, petrol pump, island dispenser +n03443371 goblet +n03444034 go-kart +n03445777 golf ball +n03445924 golfcart, golf cart +n03447447 gondola +n03447721 gong, tam-tam +n03450230 gown +n03452741 grand piano, grand +n03457902 greenhouse, nursery, glasshouse +n03459775 grille, radiator grille +n03461385 grocery store, grocery, food market, market +n03467068 guillotine +n03476684 hair slide +n03476991 hair spray +n03478589 half track +n03481172 hammer +n03482405 hamper +n03483316 hand blower, blow dryer, blow drier, hair dryer, hair drier +n03485407 hand-held computer, hand-held microcomputer +n03485794 handkerchief, hankie, hanky, hankey +n03492542 hard disc, hard disk, fixed disk +n03494278 harmonica, mouth organ, harp, mouth harp +n03495258 harp +n03496892 harvester, reaper +n03498962 hatchet +n03527444 holster +n03529860 home theater, home theatre +n03530642 honeycomb +n03532672 hook, claw +n03534580 hoopskirt, crinoline +n03535780 horizontal bar, high bar +n03538406 horse cart, horse-cart +n03544143 hourglass +n03584254 iPod +n03584829 iron, smoothing iron +n03590841 jack-o'-lantern +n03594734 jean, blue jean, denim +n03594945 jeep, landrover +n03595614 jersey, T-shirt, tee shirt +n03598930 jigsaw puzzle +n03599486 jinrikisha, ricksha, rickshaw +n03602883 joystick +n03617480 kimono +n03623198 knee pad +n03627232 knot +n03630383 lab coat, laboratory coat +n03633091 ladle +n03637318 lampshade, lamp shade +n03642806 laptop, laptop computer +n03649909 lawn mower, mower +n03657121 lens cap, lens cover +n03658185 letter opener, paper knife, paperknife +n03661043 library +n03662601 lifeboat +n03666591 lighter, light, igniter, ignitor +n03670208 limousine, limo +n03673027 liner, ocean liner +n03676483 lipstick, lip rouge +n03680355 Loafer +n03690938 lotion +n03691459 loudspeaker, speaker, speaker unit, loudspeaker system, speaker system +n03692522 loupe, jeweler's loupe +n03697007 lumbermill, sawmill +n03706229 magnetic compass +n03709823 mailbag, postbag +n03710193 mailbox, letter box +n03710637 maillot +n03710721 maillot, tank suit +n03717622 manhole cover +n03720891 maraca +n03721384 marimba, xylophone +n03724870 mask +n03729826 matchstick +n03733131 maypole +n03733281 maze, labyrinth +n03733805 measuring cup +n03742115 medicine chest, medicine cabinet +n03743016 megalith, megalithic structure +n03759954 microphone, mike +n03761084 microwave, microwave oven +n03763968 military uniform +n03764736 milk can +n03769881 minibus +n03770439 miniskirt, mini +n03770679 minivan +n03773504 missile +n03775071 mitten +n03775546 mixing bowl +n03776460 mobile home, manufactured home +n03777568 Model T +n03777754 modem +n03781244 monastery +n03782006 monitor +n03785016 moped +n03786901 mortar +n03787032 mortarboard +n03788195 mosque +n03788365 mosquito net +n03791053 motor scooter, scooter +n03792782 mountain bike, all-terrain bike, off-roader +n03792972 mountain tent +n03793489 mouse, computer mouse 
+n03794056 mousetrap +n03796401 moving van +n03803284 muzzle +n03804744 nail +n03814639 neck brace +n03814906 necklace +n03825788 nipple +n03832673 notebook, notebook computer +n03837869 obelisk +n03838899 oboe, hautboy, hautbois +n03840681 ocarina, sweet potato +n03841143 odometer, hodometer, mileometer, milometer +n03843555 oil filter +n03854065 organ, pipe organ +n03857828 oscilloscope, scope, cathode-ray oscilloscope, CRO +n03866082 overskirt +n03868242 oxcart +n03868863 oxygen mask +n03871628 packet +n03873416 paddle, boat paddle +n03874293 paddlewheel, paddle wheel +n03874599 padlock +n03876231 paintbrush +n03877472 pajama, pyjama, pj's, jammies +n03877845 palace +n03884397 panpipe, pandean pipe, syrinx +n03887697 paper towel +n03888257 parachute, chute +n03888605 parallel bars, bars +n03891251 park bench +n03891332 parking meter +n03895866 passenger car, coach, carriage +n03899768 patio, terrace +n03902125 pay-phone, pay-station +n03903868 pedestal, plinth, footstall +n03908618 pencil box, pencil case +n03908714 pencil sharpener +n03916031 perfume, essence +n03920288 Petri dish +n03924679 photocopier +n03929660 pick, plectrum, plectron +n03929855 pickelhaube +n03930313 picket fence, paling +n03930630 pickup, pickup truck +n03933933 pier +n03935335 piggy bank, penny bank +n03937543 pill bottle +n03938244 pillow +n03942813 ping-pong ball +n03944341 pinwheel +n03947888 pirate, pirate ship +n03950228 pitcher, ewer +n03954731 plane, carpenter's plane, woodworking plane +n03956157 planetarium +n03958227 plastic bag +n03961711 plate rack +n03967562 plow, plough +n03970156 plunger, plumber's helper +n03976467 Polaroid camera, Polaroid Land camera +n03976657 pole +n03977966 police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria +n03980874 poncho +n03982430 pool table, billiard table, snooker table +n03983396 pop bottle, soda bottle +n03991062 pot, flowerpot +n03992509 potter's wheel +n03995372 power drill +n03998194 prayer rug, prayer mat +n04004767 printer +n04005630 prison, prison house +n04008634 projectile, missile +n04009552 projector +n04019541 puck, hockey puck +n04023962 punching bag, punch bag, punching ball, punchball +n04026417 purse +n04033901 quill, quill pen +n04033995 quilt, comforter, comfort, puff +n04037443 racer, race car, racing car +n04039381 racket, racquet +n04040759 radiator +n04041544 radio, wireless +n04044716 radio telescope, radio reflector +n04049303 rain barrel +n04065272 recreational vehicle, RV, R.V. 
+n04067472 reel +n04069434 reflex camera +n04070727 refrigerator, icebox +n04074963 remote control, remote +n04081281 restaurant, eating house, eating place, eatery +n04086273 revolver, six-gun, six-shooter +n04090263 rifle +n04099969 rocking chair, rocker +n04111531 rotisserie +n04116512 rubber eraser, rubber, pencil eraser +n04118538 rugby ball +n04118776 rule, ruler +n04120489 running shoe +n04125021 safe +n04127249 safety pin +n04131690 saltshaker, salt shaker +n04133789 sandal +n04136333 sarong +n04141076 sax, saxophone +n04141327 scabbard +n04141975 scale, weighing machine +n04146614 school bus +n04147183 schooner +n04149813 scoreboard +n04152593 screen, CRT screen +n04153751 screw +n04154565 screwdriver +n04162706 seat belt, seatbelt +n04179913 sewing machine +n04192698 shield, buckler +n04200800 shoe shop, shoe-shop, shoe store +n04201297 shoji +n04204238 shopping basket +n04204347 shopping cart +n04208210 shovel +n04209133 shower cap +n04209239 shower curtain +n04228054 ski +n04229816 ski mask +n04235860 sleeping bag +n04238763 slide rule, slipstick +n04239074 sliding door +n04243546 slot, one-armed bandit +n04251144 snorkel +n04252077 snowmobile +n04252225 snowplow, snowplough +n04254120 soap dispenser +n04254680 soccer ball +n04254777 sock +n04258138 solar dish, solar collector, solar furnace +n04259630 sombrero +n04263257 soup bowl +n04264628 space bar +n04265275 space heater +n04266014 space shuttle +n04270147 spatula +n04273569 speedboat +n04275548 spider web, spider's web +n04277352 spindle +n04285008 sports car, sport car +n04286575 spotlight, spot +n04296562 stage +n04310018 steam locomotive +n04311004 steel arch bridge +n04311174 steel drum +n04317175 stethoscope +n04325704 stole +n04326547 stone wall +n04328186 stopwatch, stop watch +n04330267 stove +n04332243 strainer +n04335435 streetcar, tram, tramcar, trolley, trolley car +n04336792 stretcher +n04344873 studio couch, day bed +n04346328 stupa, tope +n04347754 submarine, pigboat, sub, U-boat +n04350905 suit, suit of clothes +n04355338 sundial +n04355933 sunglass +n04356056 sunglasses, dark glasses, shades +n04357314 sunscreen, sunblock, sun blocker +n04366367 suspension bridge +n04367480 swab, swob, mop +n04370456 sweatshirt +n04371430 swimming trunks, bathing trunks +n04371774 swing +n04372370 switch, electric switch, electrical switch +n04376876 syringe +n04380533 table lamp +n04389033 tank, army tank, armored combat vehicle, armoured combat vehicle +n04392985 tape player +n04398044 teapot +n04399382 teddy, teddy bear +n04404412 television, television system +n04409515 tennis ball +n04417672 thatch, thatched roof +n04418357 theater curtain, theatre curtain +n04423845 thimble +n04428191 thresher, thrasher, threshing machine +n04429376 throne +n04435653 tile roof +n04442312 toaster +n04443257 tobacco shop, tobacconist shop, tobacconist +n04447861 toilet seat +n04456115 torch +n04458633 totem pole +n04461696 tow truck, tow car, wrecker +n04462240 toyshop +n04465501 tractor +n04467665 trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi +n04476259 tray +n04479046 trench coat +n04482393 tricycle, trike, velocipede +n04483307 trimaran +n04485082 tripod +n04486054 triumphal arch +n04487081 trolleybus, trolley coach, trackless trolley +n04487394 trombone +n04493381 tub, vat +n04501370 turnstile +n04505470 typewriter keyboard +n04507155 umbrella +n04509417 unicycle, monocycle +n04515003 upright, upright piano +n04517823 vacuum, vacuum cleaner +n04522168 vase +n04523525 vault +n04525038 velvet +n04525305 
vending machine +n04532106 vestment +n04532670 viaduct +n04536866 violin, fiddle +n04540053 volleyball +n04542943 waffle iron +n04548280 wall clock +n04548362 wallet, billfold, notecase, pocketbook +n04550184 wardrobe, closet, press +n04552348 warplane, military plane +n04553703 washbasin, handbasin, washbowl, lavabo, wash-hand basin +n04554684 washer, automatic washer, washing machine +n04557648 water bottle +n04560804 water jug +n04562935 water tower +n04579145 whiskey jug +n04579432 whistle +n04584207 wig +n04589890 window screen +n04590129 window shade +n04591157 Windsor tie +n04591713 wine bottle +n04592741 wing +n04596742 wok +n04597913 wooden spoon +n04599235 wool, woolen, woollen +n04604644 worm fence, snake fence, snake-rail fence, Virginia fence +n04606251 wreck +n04612504 yawl +n04613696 yurt +n06359193 web site, website, internet site, site +n06596364 comic book +n06785654 crossword puzzle, crossword +n06794110 street sign +n06874185 traffic light, traffic signal, stoplight +n07248320 book jacket, dust cover, dust jacket, dust wrapper +n07565083 menu +n07579787 plate +n07583066 guacamole +n07584110 consomme +n07590611 hot pot, hotpot +n07613480 trifle +n07614500 ice cream, icecream +n07615774 ice lolly, lolly, lollipop, popsicle +n07684084 French loaf +n07693725 bagel, beigel +n07695742 pretzel +n07697313 cheeseburger +n07697537 hotdog, hot dog, red hot +n07711569 mashed potato +n07714571 head cabbage +n07714990 broccoli +n07715103 cauliflower +n07716358 zucchini, courgette +n07716906 spaghetti squash +n07717410 acorn squash +n07717556 butternut squash +n07718472 cucumber, cuke +n07718747 artichoke, globe artichoke +n07720875 bell pepper +n07730033 cardoon +n07734744 mushroom +n07742313 Granny Smith +n07745940 strawberry +n07747607 orange +n07749582 lemon +n07753113 fig +n07753275 pineapple, ananas +n07753592 banana +n07754684 jackfruit, jak, jack +n07760859 custard apple +n07768694 pomegranate +n07802026 hay +n07831146 carbonara +n07836838 chocolate sauce, chocolate syrup +n07860988 dough +n07871810 meat loaf, meatloaf +n07873807 pizza, pizza pie +n07875152 potpie +n07880968 burrito +n07892512 red wine +n07920052 espresso +n07930864 cup +n07932039 eggnog +n09193705 alp +n09229709 bubble +n09246464 cliff, drop, drop-off +n09256479 coral reef +n09288635 geyser +n09332890 lakeside, lakeshore +n09399592 promontory, headland, head, foreland +n09421951 sandbar, sand bar +n09428293 seashore, coast, seacoast, sea-coast +n09468604 valley, vale +n09472597 volcano +n09835506 ballplayer, baseball player +n10148035 groom, bridegroom +n10565667 scuba diver +n11879895 rapeseed +n11939491 daisy +n12057211 yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum +n12144580 corn +n12267677 acorn +n12620546 hip, rose hip, rosehip +n12768682 buckeye, horse chestnut, conker +n12985857 coral fungus +n12998815 agaric +n13037406 gyromitra +n13040303 stinkhorn, carrion fungus +n13044778 earthstar +n13052670 hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa +n13054560 bolete +n13133613 ear, spike, capitulum +n15075141 toilet tissue, toilet paper, bathroom tissue diff --git a/cviruntime/samples/run_classifier.sh b/cviruntime/samples/run_classifier.sh new file mode 100755 index 000000000..f3db74c72 --- /dev/null +++ b/cviruntime/samples/run_classifier.sh @@ -0,0 +1,22 @@ +#!/bin/sh + +if [ -z $MODEL_PATH ]; then + echo "$0 MODEL_PATH not set" + exit 1 +fi +if [ ! 
-e ./bin/cvi_sample_classifier ]; then + echo "$0 Please run at the same dir as the script" + exit 1 +fi +if [ ! -e $MODEL_PATH/mobilenet_v2.cvimodel ]; then + echo "$0 Model mobilenet_v2.cvimodel not present" + exit 1 +fi + +./bin/cvi_sample_classifier \ + $MODEL_PATH/mobilenet_v2.cvimodel \ + ./data/cat.jpg \ + ./data/synset_words.txt + +test $? -ne 0 && echo "cvi_sample_classifier failed !!" && exit 1 +exit 0 \ No newline at end of file diff --git a/cviruntime/samples/run_classifier_bf16.sh b/cviruntime/samples/run_classifier_bf16.sh new file mode 100755 index 000000000..b3f86a72a --- /dev/null +++ b/cviruntime/samples/run_classifier_bf16.sh @@ -0,0 +1,22 @@ +#!/bin/sh + +if [ -z $MODEL_PATH ]; then + echo "$0 MODEL_PATH not set" + exit 1 +fi +if [ ! -e ./bin/cvi_sample_classifier_bf16 ]; then + echo "$0 Please run at the same dir as the script" + exit 1 +fi +if [ ! -e $MODEL_PATH/mobilenet_v2_bf16.cvimodel ]; then + echo "$0 Model mobilenet_v2_bf16.cvimodel not present" + exit 1 +fi + +./bin/cvi_sample_classifier_bf16 \ + $MODEL_PATH/mobilenet_v2_bf16.cvimodel \ + ./data/cat.jpg \ + ./data/synset_words.txt + +test $? -ne 0 && echo "cvi_sample_classifier_bf16 failed !!" && exit 1 +exit 0 \ No newline at end of file diff --git a/cviruntime/samples/run_classifier_fused_preprocess.sh b/cviruntime/samples/run_classifier_fused_preprocess.sh new file mode 100755 index 000000000..8ec3f895b --- /dev/null +++ b/cviruntime/samples/run_classifier_fused_preprocess.sh @@ -0,0 +1,21 @@ +#!/bin/sh + +if [ -z $MODEL_PATH ]; then + echo "$0 MODEL_PATH not set" + exit 1 +fi +if [ ! -e ./bin/cvi_sample_classifier_fused_preprocess ]; then + echo "$0 Please run at the same dir as the script" + exit 1 +fi +if [ ! -e $MODEL_PATH/mobilenet_v2_fused_preprocess.cvimodel ]; then + echo "$0 Model mobilenet_v2_fused_preprocess.cvimodel not present" + exit 1 +fi + +./bin/cvi_sample_classifier_fused_preprocess \ + $MODEL_PATH/mobilenet_v2_fused_preprocess.cvimodel \ + ./data/cat.jpg \ + ./data/synset_words.txt +test $? -ne 0 && echo "cvi_sample_classifier_fused_preprocess failed !!" && exit 1 +exit 0 \ No newline at end of file diff --git a/cviruntime/samples/run_classifier_multi_batch.sh b/cviruntime/samples/run_classifier_multi_batch.sh new file mode 100755 index 000000000..3f363cff0 --- /dev/null +++ b/cviruntime/samples/run_classifier_multi_batch.sh @@ -0,0 +1,22 @@ +#!/bin/sh + +if [ -z $MODEL_PATH ]; then + echo "$0 MODEL_PATH not set" + exit 1 +fi +if [ ! -e ./bin/cvi_sample_classifier_multi_batch ]; then + echo "$0 Please run at the same dir as the script" + exit 1 +fi +if [ ! -e $MODEL_PATH/mobilenet_v2_bs1_bs4.cvimodel ]; then + echo "$0 Model mobilenet_v2_bs1_bs4.cvimodel not present" + exit 1 +fi + +./bin/cvi_sample_classifier_multi_batch \ + $MODEL_PATH/mobilenet_v2_bs1_bs4.cvimodel \ + ./data/cat.jpg \ + ./data/synset_words.txt + +test $? -ne 0 && echo "cvi_sample_classifier_multi_batch failed !!" 
&& exit 1 +exit 0 \ No newline at end of file diff --git a/cviruntime/samples/runner/CMakeLists.txt b/cviruntime/samples/runner/CMakeLists.txt new file mode 100644 index 000000000..bececead1 --- /dev/null +++ b/cviruntime/samples/runner/CMakeLists.txt @@ -0,0 +1,30 @@ +cmake_minimum_required(VERSION 2.8.0) + +project(cvi_sample_model_runner C CXX) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS ON) + +include_directories(${CMAKE_SYSROOT}/include) + +if(NOT DEFINED TPU_SDK_PATH) + message(FATAL_ERROR "Please set TPU_SDK_PATH to point to the TPU_SDK installation") +endif() +include_directories(${TPU_SDK_PATH}/include) +link_directories(${TPU_SDK_PATH}/lib) +if(DEFINED CNPY_PATH) + include_directories(${CNPY_PATH}/include) + link_directories(${CNPY_PATH}/lib) +endif() + +set(CVI_LIBS ${CVI_LIBS} cviruntime cvikernel cnpy z) +set(EXTRA_LIBS ${EXTRA_LIBS} dl) + +add_executable(cvi_sample_model_runner + model_runner.cpp) +target_link_libraries(cvi_sample_model_runner + ${CVI_LIBS} + ${EXTRA_LIBS}) +install(TARGETS cvi_sample_model_runner + cvi_sample_model_runner DESTINATION bin) diff --git a/cviruntime/samples/runner/argparse.hpp b/cviruntime/samples/runner/argparse.hpp new file mode 100644 index 000000000..322754138 --- /dev/null +++ b/cviruntime/samples/runner/argparse.hpp @@ -0,0 +1,602 @@ +#ifndef ARGPARSE_HPP_ +#define ARGPARSE_HPP_ + +#if __cplusplus >= 201103L +#include +typedef std::unordered_map IndexMap; +#else +#include +typedef std::map IndexMap; +#endif +#include +#include +#include +#include +#include +#include +#include +#include + +namespace argparse { +// Modified from https://github.com/davisking/dlib/blob/master/dlib/algs.h +template +struct is_standard_type { + const static bool value = false; +}; + +template <> +struct is_standard_type { + const static bool value = true; +}; +template <> +struct is_standard_type { + const static bool value = true; +}; +template <> +struct is_standard_type { + const static bool value = true; +}; +template <> +struct is_standard_type { + const static bool value = true; +}; +template <> +struct is_standard_type { + const static bool value = true; +}; +template <> +struct is_standard_type { + const static bool value = true; +}; +template <> +struct is_standard_type { + const static bool value = true; +}; +template <> +struct is_standard_type { + const static bool value = true; +}; +template <> +struct is_standard_type { + const static bool value = true; +}; +template <> +struct is_standard_type { + const static bool value = true; +}; +template <> +struct is_standard_type { + const static bool value = true; +}; +template <> +struct is_standard_type { + const static bool value = true; +}; +template <> +struct is_standard_type { + const static bool value = true; +}; + +// Copied from https://github.com/davisking/dlib/blob/master/dlib/enable_if.h +template +struct enable_if_c { + typedef T type; +}; + +template +struct enable_if_c {}; + +template +struct enable_if : public enable_if_c {}; + +template +struct disable_if_c { + typedef T type; +}; + +template +struct disable_if_c {}; + +template +struct disable_if : public disable_if_c {}; + +template +T castTo(const std::string &item) { + std::istringstream sin(item); + T value; + sin >> value; + return value; +} + +template +std::string toString(const T &item) { + std::ostringstream sout; + sout << item; + return sout.str(); +} + +void remove_space(std::string &str) { + str.erase(std::remove_if(str.begin(), str.end(), + [](unsigned char x) { return 
std::isspace(x); }), + str.end()); +} + +void strip_brackets(std::string &str) { + auto first_bracket = str.find_first_of('['); + if (first_bracket == std::string::npos) { + std::ostringstream sout; + sout << "Could not find a left bracket in " << str; + throw std::runtime_error(sout.str()); + } + str.erase(str.begin() + first_bracket); + + auto last_bracket = str.find_last_of(']'); + if (last_bracket == std::string::npos) { + std::ostringstream sout; + sout << "Could not find a right bracket in " << str; + throw std::runtime_error(sout.str()); + } + str.erase(str.begin() + last_bracket); +} + +/*! @class ArgumentParser + * @brief A simple command-line argument parser based on the design of + * python's parser of the same name. + * + * ArgumentParser is a simple C++ class that can parse arguments from + * the command-line or any array of strings. The syntax is familiar to + * anyone who has used python's ArgumentParser: + * \code + * // create a parser and add the options + * ArgumentParser parser; + * parser.addArgument("-n", "--name"); + * parser.addArgument("--inputs", '+'); + * + * // parse the command-line arguments + * parser.parse(argc, argv); + * + * // get the inputs and iterate over them + * string name = parser.retrieve("name"); + * vector inputs = parser.retrieve>("inputs"); + * \endcode + * https://github.com/jiwoong-choi/argparse/blob/master/argparse.hpp + */ +class ArgumentParser { +private: + class Argument; + typedef std::string String; + typedef std::vector StringVector; + typedef std::vector ArgumentVector; + + // -------------------------------------------------------------------------- + // Argument + // -------------------------------------------------------------------------- + static String delimit(const String &name) { + return String(std::min(name.size(), (size_t)2), '-').append(name); + } + static String strip(const String &name) { + size_t begin = 0; + begin += name.size() > 0 ? name[0] == '-' : 0; + begin += name.size() > 3 ? name[1] == '-' : 0; + return name.substr(begin); + } + static String upper(const String &in) { + String out(in); + std::transform(out.begin(), out.end(), out.begin(), ::toupper); + return out; + } + static String escape(const String &in) { + String out(in); + if (in.find(' ') != std::string::npos) + out = String("\"").append(out).append("\""); + return out; + } + + struct Argument { + Argument() + : short_name(""), name(""), optional(true), fixed_nargs(0), + fixed(true) {} + Argument(const String &_short_name, const String &_name, bool _optional, + char nargs) + : short_name(_short_name), name(_name), optional(_optional) { + if (nargs == '+' || nargs == '*') { + variable_nargs = nargs; + fixed = false; + } else { + fixed_nargs = nargs; + fixed = true; + } + } + String short_name; + String name; + bool optional; + union { + size_t fixed_nargs; + char variable_nargs; + }; + bool fixed; + bool specified = false; + String canonicalName() const { return (name.empty()) ? short_name : name; } + String toString(bool named = true) const { + std::ostringstream s; + String uname = + name.empty() ? 
upper(strip(short_name)) : upper(strip(name)); + if (named && optional) + s << "["; + if (named) + s << canonicalName(); + if (fixed) { + size_t N = std::min((size_t)3, fixed_nargs); + for (size_t n = 0; n < N; ++n) + s << " " << uname; + if (N < fixed_nargs) + s << " ..."; + } + if (!fixed) { + s << " "; + if (variable_nargs == '*') + s << "["; + s << uname << " "; + if (variable_nargs == '+') + s << "["; + s << uname << "...]"; + } + if (named && optional) + s << "]"; + return s.str(); + } + }; + + void insertArgument(const Argument &arg) { + size_t N = arguments_.size(); + arguments_.push_back(arg); + if (arg.fixed && arg.fixed_nargs <= 1) { + variables_.push_back(String()); + } else { + variables_.push_back(String()); + } + if (!arg.short_name.empty()) + index_[arg.short_name] = N; + if (!arg.name.empty()) + index_[arg.name] = N; + if (!arg.optional) + required_++; + } + + // -------------------------------------------------------------------------- + // Error handling + // -------------------------------------------------------------------------- + void argumentError(const std::string &msg, bool show_usage = false) { + if (use_exceptions_) + throw std::invalid_argument(msg); + std::cerr << "ArgumentParser error: " << msg << std::endl; + if (show_usage) + std::cerr << usage() << std::endl; + exit(-5); + } + + // -------------------------------------------------------------------------- + // Member variables + // -------------------------------------------------------------------------- + IndexMap index_; + bool ignore_first_; + bool use_exceptions_; + size_t required_; + String app_name_; + String final_name_; + ArgumentVector arguments_; + StringVector variables_; + +public: + ArgumentParser() + : ignore_first_(true), use_exceptions_(false), required_(0) {} + // -------------------------------------------------------------------------- + // addArgument + // -------------------------------------------------------------------------- + void appName(const String &name) { app_name_ = name; } + void addArgument(const String &name, char nargs = 0, bool optional = true) { + if (name.size() > 2) { + Argument arg("", verify(name), optional, nargs); + insertArgument(arg); + } else { + Argument arg(verify(name), "", optional, nargs); + insertArgument(arg); + } + } + void addArgument(const String &short_name, const String &name, char nargs = 0, + bool optional = true) { + Argument arg(verify(short_name), verify(name), optional, nargs); + insertArgument(arg); + } + void addFinalArgument(const String &name, char nargs = 1, + bool optional = false) { + final_name_ = delimit(name); + Argument arg("", final_name_, optional, nargs); + insertArgument(arg); + } + void ignoreFirstArgument(bool ignore_first) { ignore_first_ = ignore_first; } + String verify(const String &name) { + if (name.empty()) + argumentError("argument names must be non-empty"); + if ((name.size() == 2 && name[0] != '-') || name.size() == 3) + argumentError(String("invalid argument '") + .append(name) + .append("'. Short names must begin with '-'")); + if (name.size() > 3 && (name[0] != '-' || name[1] != '-')) + argumentError( + String("invalid argument '") + .append(name) + .append("'. 
Multi-character names must begin with '--'")); + return name; + } + + // -------------------------------------------------------------------------- + // Parse + // -------------------------------------------------------------------------- + void parse(size_t argc, const char **argv) { + parse(StringVector(argv, argv + argc)); + } + + void parse(const StringVector &argv) { + // check if the app is named + if (app_name_.empty() && ignore_first_ && !argv.empty()) + app_name_ = argv[0]; + + // set up the working set + Argument active; + Argument final = + final_name_.empty() ? Argument() : arguments_[index_[final_name_]]; + size_t consumed = 0; + size_t nrequired = final.optional ? required_ : required_ - 1; + size_t nfinal = final.optional + ? 0 + : (final.fixed ? final.fixed_nargs + : (final.variable_nargs == '+' ? 1 : 0)); + + // iterate over each element of the array + for (StringVector::const_iterator in = argv.begin() + ignore_first_; + in < argv.end() - nfinal; ++in) { + String active_name = active.canonicalName(); + String el = *in; + + // check if the element is a key + if (index_.count(el) == 0) { + // input + // is the current active argument expecting more inputs? + if (active.fixed && active.fixed_nargs <= consumed) + argumentError( + String("attempt to pass too many inputs to ").append(active_name), + true); + if (active.fixed && active.fixed_nargs == 1) { + variables_[index_[active_name]] = el; + } else { + String &variable = variables_[index_[active_name]]; + StringVector value = castTo(variable); + value.push_back(el); + variable = toString(value); + } + consumed++; + } else { + // new key! + arguments_[index_[el]].specified = true; + // has the active argument consumed enough elements? + if ((active.fixed && active.fixed_nargs != consumed) || + (!active.fixed && active.variable_nargs == '+' && consumed < 1)) + argumentError(String("encountered argument ") + .append(el) + .append(" when expecting more inputs to ") + .append(active_name), + true); + active = arguments_[index_[el]]; + // check if we've satisfied the required arguments + /* + if ((!active.optional) && nrequired > 0) + argumentError(String("encountered optional argument ") + .append(el) + .append(" when expecting more required arguments"), + true); + */ + // are there enough arguments for the new argument to consume? 
+ if ((active.fixed && + active.fixed_nargs > (argv.end() - in - nfinal - 1)) || + (!active.fixed && active.variable_nargs == '+' && + !(argv.end() - in - nfinal - 1))) + argumentError(String("too few inputs passed to argument ").append(el), + true); + if (!active.optional) + nrequired--; + consumed = 0; + } + } + + for (StringVector::const_iterator in = + std::max(argv.begin() + ignore_first_, argv.end() - nfinal); + in != argv.end(); ++in) { + String el = *in; + // check if we accidentally find an argument specifier + if (index_.count(el)) + argumentError(String("encountered argument specifier ") + .append(el) + .append(" while parsing final required inputs"), + true); + if (final.fixed && final.fixed_nargs == 1) { + variables_[index_[final_name_]] = el; + } else { + String &variable = variables_[index_[final_name_]]; + StringVector value = castTo(variable); + value.push_back(el); + variable = toString(value); + } + nfinal--; + } + + // check that all of the required arguments have been encountered + if (nrequired > 0 || nfinal > 0) + argumentError( + String("too few required arguments passed to ").append(app_name_), + true); + } + + // -------------------------------------------------------------------------- + // Retrieve + // -------------------------------------------------------------------------- + template + T retrieve(const String &name) { + if (index_.count(delimit(name)) == 0) + throw std::out_of_range("Key not found"); + size_t N = index_[delimit(name)]; + return castTo(variables_[N]); + } + + // -------------------------------------------------------------------------- + // Properties + // -------------------------------------------------------------------------- + String usage() { + // premable app name + std::ostringstream help; + help << "Usage: " << escape(app_name_); + size_t indent = help.str().size(); + size_t linelength = 0; + + // get the required arguments + for (ArgumentVector::const_iterator it = arguments_.begin(); + it != arguments_.end(); ++it) { + Argument arg = *it; + if (arg.optional) + continue; + if (arg.name.compare(final_name_) == 0) + continue; + help << " "; + String argstr = arg.toString(); + if (argstr.size() + linelength > 80) { + help << "\n" << String(indent, ' '); + linelength = 0; + } else { + linelength += argstr.size(); + } + help << argstr; + } + + // get the optional arguments + for (ArgumentVector::const_iterator it = arguments_.begin(); + it != arguments_.end(); ++it) { + Argument arg = *it; + if (!arg.optional) + continue; + if (arg.name.compare(final_name_) == 0) + continue; + help << " "; + String argstr = arg.toString(); + if (argstr.size() + linelength > 80) { + help << "\n" << String(indent, ' '); + linelength = 0; + } else { + linelength += argstr.size(); + } + help << argstr; + } + + // get the final argument + if (!final_name_.empty()) { + Argument arg = arguments_[index_[final_name_]]; + String argstr = arg.toString(false); + if (argstr.size() + linelength > 80) { + help << "\n" << String(indent, ' '); + linelength = 0; + } else { + linelength += argstr.size(); + } + help << argstr; + } + + return help.str(); + } + void useExceptions(bool state) { use_exceptions_ = state; } + bool empty() const { return index_.empty(); } + void clear() { + ignore_first_ = true; + required_ = 0; + index_.clear(); + arguments_.clear(); + variables_.clear(); + } + bool exists(const String &name) const { + return index_.count(delimit(name)) > 0; + } + bool gotArgument(const String &name) { + // check if the name is an argument + if 
(index_.count(delimit(name)) == 0) + return 0; + size_t N = index_[delimit(name)]; + Argument arg = arguments_[N]; + return arg.specified; + } +}; +} // namespace argparse + +template +std::ostream &operator<<(std::ostream &out, const std::vector &v) { + out << "["; + for (unsigned long i = 0; i < v.size(); ++i) { + if (i > 0) + out << ", "; + out << v[i]; + } + out << "]"; + + return out; +} + +template +typename argparse::enable_if, + std::istream &>::type +operator>>(std::istream &in, std::vector &v) { + using namespace argparse; + v.clear(); + + std::string str; + std::getline(in, str, '\n'); + + if (str.empty()) + return in; + remove_space(str); + strip_brackets(str); + + std::istringstream sin(str); + while (sin.good()) { + std::string substr; + std::getline(sin, substr, ','); + if (!substr.empty()) + v.push_back(castTo(substr)); + } + + return in; +} + +template +typename argparse::enable_if, + std::istream &>::type +operator>>(std::istream &in, std::vector> &v) { + using namespace argparse; + static const std::string delimiter = "]"; + v.clear(); + + std::string str; + std::getline(in, str, '\n'); + + if (str.empty()) + return in; + remove_space(str); + strip_brackets(str); + + size_t pos = 0; + while ((pos = str.find(delimiter)) != std::string::npos) { + std::string substr = str.substr(0, pos + 1); + v.push_back(castTo>(substr)); + str.erase(0, pos + delimiter.length()); + } + + return in; +} + +#endif \ No newline at end of file diff --git a/cviruntime/samples/runner/model_runner.cpp b/cviruntime/samples/runner/model_runner.cpp new file mode 100644 index 000000000..531d32c68 --- /dev/null +++ b/cviruntime/samples/runner/model_runner.cpp @@ -0,0 +1,409 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "argparse.hpp" +#include "similarity.hpp" +#include "cnpy.h" +#include "assert.h" + +static std::string optInputFile; +static std::string optModelFile; +static std::string optOutputFile; +static std::string optSetChipName; +static int32_t optProgramId = 0; +static int32_t optInferenceCount = 1; +static int32_t optTimerRunCount = 1; +static bool optDumpAllTensors = false; +static bool optAsyncForward = false; +static bool optEnableTimer = false; +static float optCosineTolerance = 1.0f; +static float optCorrelationTolerance = 1.0f; +static float optEuclideanTolerance = 1.0f; + +#define EXIT_IF_ERROR(cond, statement) \ + if ((cond)) { \ + TPU_LOG_ERROR("%s\n", statement); \ + exit(1); \ + } + +static const char* formatToStr(CVI_FMT fmt) { + switch(fmt) { + case CVI_FMT_FP32: return "fp32"; + case CVI_FMT_INT32: return "i32"; + case CVI_FMT_UINT32: return "u32"; + case CVI_FMT_BF16: return "bf16"; + case CVI_FMT_INT16: return "i16"; + case CVI_FMT_UINT16: return "u16"; + case CVI_FMT_INT8: return "i8"; + case CVI_FMT_UINT8: return "u8"; + default: + TPU_LOG_FATAL("unknown fmt:%d\n", fmt); + } + return nullptr; +} + +static std::ifstream *openFile(const std::string &name, size_t &size) { + auto *f = new std::ifstream(name, std::ios::binary); + if (!f->is_open()) { + TPU_LOG_ERROR("Error, failed to open %s\n", name.c_str()); + return nullptr; + } + f->seekg(0, std::ios::end); + size = f->tellg(); + f->seekg(0, std::ios::beg); + return f; +} + +static bool isNpzFile(const std::string &name) { + std::string extension = name.substr(name.size() - 4); + if (extension == ".npz") + return true; + return false; +} + +static bool compareResultWithNpz(CVI_TENSOR *tensors, int32_t num, cnpy::npz_t &reference) { + float 
euclidean = 0; + float cosine = 0; + float correlation = 0; + for (int i = 0; i < num; i++) { + auto &tensor = tensors[i]; + std::string name(tensor.name); + if (reference.find(name) == reference.end()) { + TPU_LOG_WARNING("Warning, Cannot find %s in reference\n", name.c_str()); + continue; + } + auto &refData = reference[name]; + if (tensor.count != refData.num_vals) { + TPU_LOG_ERROR("%s %zu vs %zu, size are not equal.\n", name.c_str(), tensor.count, refData.num_vals); + return false; + } + + if (refData.type == 'f') { + if (tensor.fmt == CVI_FMT_INT8) { + array_similarity((int8_t *)CVI_NN_TensorPtr(&tensor), refData.data(), + tensor.count, euclidean, cosine, correlation); + } else if (tensor.fmt == CVI_FMT_UINT8) { + array_similarity((uint8_t *)CVI_NN_TensorPtr(&tensor), refData.data(), + tensor.count, euclidean, cosine, correlation); + } else if (tensor.fmt == CVI_FMT_BF16) { + array_similarity((uint16_t *)CVI_NN_TensorPtr(&tensor), refData.data(), + tensor.count, euclidean, cosine, correlation); + } else { + array_similarity((float *)CVI_NN_TensorPtr(&tensor), refData.data(), tensor.count, + euclidean, cosine, correlation); + } + } else if (refData.type == 'u') { + if (tensor.fmt == CVI_FMT_BF16) { + assert(refData.word_size == 2); + array_similarity((uint16_t *)CVI_NN_TensorPtr(&tensor), refData.data(), + tensor.count, euclidean, cosine, correlation); + } else if (tensor.fmt == CVI_FMT_UINT8) { + assert(refData.word_size == 1); + array_similarity((uint8_t *)CVI_NN_TensorPtr(&tensor), refData.data(), + tensor.count, euclidean, cosine, correlation); + } + } else if (refData.type == 'i') { + assert(refData.word_size == 1); + assert(tensor.fmt == CVI_FMT_INT8); + array_similarity((int8_t *)CVI_NN_TensorPtr(&tensor), refData.data(), + tensor.count, euclidean, cosine, correlation); + } + + if (cosine < optCosineTolerance || correlation < optCorrelationTolerance || + euclidean < optEuclideanTolerance) { + + printf("Error, [%s] cosine:%f correlation:%f euclidean:%f\n", name.c_str(), cosine, + correlation, euclidean); + return false; + } + } + TPU_LOG_INFO("Compare pass.\n"); + return true; +} + +static void saveResultToNpz(const std::string &name, CVI_TENSOR *tensors, int32_t num) { + assert(isNpzFile(name) && "output should be a npz file"); + + cnpy::npz_t npz; + for (int i = 0; i < num; i++) { + auto &tensor = tensors[i]; + std::vector shape = {(size_t)tensor.shape.dim[0], (size_t)tensor.shape.dim[1], + (size_t)tensor.shape.dim[2], + (size_t)tensor.shape.dim[3]}; + switch (tensor.fmt) { + case CVI_FMT_FP32: + cnpy::npz_add_array(npz, tensor.name, (float *)CVI_NN_TensorPtr(&tensor), shape); + break; + case CVI_FMT_BF16: // we use uint16_t to represent BF16 + cnpy::npz_add_array(npz, tensor.name, (uint16_t *)CVI_NN_TensorPtr(&tensor), + shape); + break; + case CVI_FMT_INT8: + cnpy::npz_add_array(npz, tensor.name, (int8_t *)CVI_NN_TensorPtr(&tensor), + shape); + break; + case CVI_FMT_UINT8: + cnpy::npz_add_array(npz, tensor.name, (uint8_t *)CVI_NN_TensorPtr(&tensor), + shape); + break; + default: + TPU_LOG_ERROR("Error, Current unsupported type:%d\n", tensor.fmt); + assert(0); + } + } + cnpy::npz_save_all(name, npz); +} + +static void ConvertFp32ToInt8(float *src, int8_t *dst, int count, + float qscale, int zero_point = 0) { + for (int i = 0; i < count; i++) { + int val = std::round((*src++) * qscale) + zero_point; + if (val > 127) { + val = 127; + } else if (val < -128) { + val = -128; + } + *dst++ = (int8_t)val; + } +} + +static void ConvertFp32ToUint8(float *src, uint8_t *dst, int count, + 
float qscale, int zero_point = 0) { + for (int i = 0; i < count; i++) { + int val = std::round((*src++) * qscale) + zero_point; + if (val > 255) { + val = 255; + } + *dst++ = (uint8_t)val; + } +} + +static void ConvertFp32ToBf16(float *src, uint16_t *dst, int count) { + for (int i = 0; i < count; ++i) { + auto fval = src[i]; +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + dst[i] = ((uint16_t *)&fval)[0]; +#else + dst[i] = ((uint16_t *)&fval)[1]; +#endif + } +} + +static void loadInput(std::string &input_file, CVI_TENSOR *tensors, int num) { + assert(isNpzFile(input_file) && "input should be npz file"); + + cnpy::npz_t input_npz = cnpy::npz_load(input_file); + EXIT_IF_ERROR(!input_npz.size(), "cannot open input npz file"); + assert(num == (int)input_npz.size()); + + int idx = 0; + for (auto &npy : input_npz) { + auto &arr = npy.second; + auto &tensor = tensors[idx++]; + if (arr.type == 'f' && tensor.fmt == CVI_FMT_INT8) { + assert(arr.num_vals == tensor.mem_size); + ConvertFp32ToInt8( + arr.data(), + (int8_t *)CVI_NN_TensorPtr(&tensor), + CVI_NN_TensorCount(&tensor), + CVI_NN_TensorQuantScale(&tensor)); + } else if (arr.type == 'f' && tensor.fmt == CVI_FMT_UINT8) { + assert(arr.num_vals == tensor.mem_size); + ConvertFp32ToUint8( + arr.data(), + (uint8_t *)CVI_NN_TensorPtr(&tensor), + CVI_NN_TensorCount(&tensor), + CVI_NN_TensorQuantScale(&tensor)); + } else if (arr.type == 'f' && tensor.fmt == CVI_FMT_BF16) { + assert(arr.num_vals == tensor.count); + ConvertFp32ToBf16( + arr.data(), + (uint16_t *)CVI_NN_TensorPtr(&tensor), + CVI_NN_TensorCount(&tensor)); + } else { + if (arr.num_bytes() != tensor.mem_size){ + std::stringstream err; + err << "arr.num_bytes: (" << arr.num_bytes() + << ")not same as mem.size: (" << tensor.mem_size << ")\n"; + throw std::runtime_error(err.str()); + } + memcpy(CVI_NN_TensorPtr(&tensor), arr.data(), tensor.mem_size); + } + } +} + +int main(int argc, const char **argv) { + argparse::ArgumentParser parser; + parser.addArgument("-i", "--input", 1, false); // required + parser.addArgument("-m", "--model", 1, false); // required + parser.addArgument("-o", "--output", 1, false); // required + parser.addArgument("-p", "--pmu", 1); + parser.addArgument("-s", "--program-id", 1); // select program by id + parser.addArgument("-b", "--batch-num", 1); // deprecated + parser.addArgument("-c", "--count", 1); // inference count + parser.addArgument("-r", "--reference", 1); // must be npz file + parser.addArgument("-t", "--tolerances", 1); // cosine_tol,correlation_tol,euclidean_tol + parser.addArgument("-v", "--verbose", 1); // set verbose level, 0: only error & warning, 1: info, 2: debug + parser.addArgument("--dump-all-tensors"); + parser.addArgument("--async-forward"); + parser.addArgument("--skip-preprocess"); + parser.addArgument("--enable-timer"); + parser.parse(argc, argv); + + optInputFile = parser.retrieve("input"); + optModelFile = parser.retrieve("model"); + optOutputFile = parser.retrieve("output"); + cnpy::npz_t ref_npz; + + if (parser.gotArgument("pmu")) { + std::string pmu = parser.retrieve("pmu"); + setenv("TPU_PMUBUF_OUTPUT_FILE", pmu.c_str(), true); + } + if (parser.gotArgument("dump-all-tensors")) { + optDumpAllTensors = true; + } + if (parser.gotArgument("async-forward")) { + optAsyncForward = true; + } + if (parser.gotArgument("enable-timer")) { + optEnableTimer = true; + } + if (parser.gotArgument("count")) { + if (optEnableTimer) { + optTimerRunCount = parser.retrieve("count"); + } else { + optInferenceCount = parser.retrieve("count"); + } + } + if 
(parser.gotArgument("program-id")) { + optProgramId = parser.retrieve("program-id"); + } + if (parser.gotArgument("reference")) { + auto name = parser.retrieve("reference"); + assert(isNpzFile(name)); + ref_npz = cnpy::npz_load(name); + EXIT_IF_ERROR(!ref_npz.size(), "cannot open reference npz file"); + } + + if (parser.gotArgument("tolerances")) { + std::istringstream option(parser.retrieve("tolerances")); + std::vector tolerances; + std::string tol; + while (std::getline(option, tol, ',')) { + tolerances.push_back(std::move(tol)); + } + assert(tolerances.size() == 3); + optCosineTolerance = std::stof(tolerances[0]); + optCorrelationTolerance = std::stof(tolerances[1]); + optEuclideanTolerance = std::stof(tolerances[2]); + printf("Tolerance, cosine:%f, correlation:%f, euclidean:%f\n", optCosineTolerance, + optCorrelationTolerance, optEuclideanTolerance); + } + + CVI_MODEL_HANDLE model = NULL; + CVI_RC ret = CVI_NN_RegisterModel(optModelFile.c_str(), &model); + EXIT_IF_ERROR(ret != CVI_RC_SUCCESS, "failed to register cvimodel"); + + CVI_NN_SetConfig(model, OPTION_PROGRAM_INDEX, optProgramId); + CVI_NN_SetConfig(model, OPTION_OUTPUT_ALL_TENSORS, optDumpAllTensors); + + CVI_TENSOR *input_tensors, *output_tensors; + int32_t input_num, output_num; + ret = CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num, + &output_tensors, &output_num); + EXIT_IF_ERROR(ret != CVI_RC_SUCCESS, "failed to get inputs & outputs from model"); + + // print the inputs & outputs's information + if (1) { + TPU_LOG_INFO("Inputs:\n"); + for (int i = 0; i < input_num; ++i) { + auto &tensor = input_tensors[i]; + TPU_LOG_INFO(" [%d] %s <%d,%d,%d,%d>,%s\n", + i, tensor.name, tensor.shape.dim[0], tensor.shape.dim[1], tensor.shape.dim[2], + tensor.shape.dim[3], formatToStr(tensor.fmt)); + } + TPU_LOG_INFO("Outputs:\n"); + for (int i = 0; i < output_num; ++i) { + auto &tensor = output_tensors[i]; + TPU_LOG_INFO(" [%d] %s <%d,%d,%d,%d>,%s\n", + i, tensor.name, tensor.shape.dim[0], tensor.shape.dim[1], tensor.shape.dim[2], + tensor.shape.dim[3], formatToStr(tensor.fmt)); + } + } + + loadInput(optInputFile, input_tensors, input_num); + + int fail_cnt = 0; + for (int i = 0; i < optInferenceCount; ++i) { + if (optAsyncForward) { + void *task = nullptr; + ret = CVI_NN_ForwardAsync(model, input_tensors, input_num, output_tensors, + output_num, &task); + EXIT_IF_ERROR(ret != CVI_RC_SUCCESS, "async forward failed"); + + ret = CVI_NN_ForwardWait(model, task); + EXIT_IF_ERROR(ret != CVI_RC_SUCCESS, "forward wait failed"); + + } else { + ret = CVI_NN_Forward(model, input_tensors, input_num, output_tensors, + output_num); + EXIT_IF_ERROR(ret != CVI_RC_SUCCESS, "forward failed"); + } + if (ref_npz.size() && !compareResultWithNpz(output_tensors, output_num, ref_npz)) { + fail_cnt++; + } + } + + if (optEnableTimer) { + struct timeval t0, t1; + long elapsed; + gettimeofday(&t0, NULL); + + for (int i = 0; i < optTimerRunCount; ++i) { + if (optAsyncForward) { + void *task = nullptr; + ret = CVI_NN_ForwardAsync(model, input_tensors, input_num, output_tensors, + output_num, &task); + EXIT_IF_ERROR(ret != CVI_RC_SUCCESS, "async forward failed"); + + ret = CVI_NN_ForwardWait(model, task); + EXIT_IF_ERROR(ret != CVI_RC_SUCCESS, "forward wait failed"); + } else { + ret = CVI_NN_Forward(model, input_tensors, input_num, output_tensors, + output_num); + EXIT_IF_ERROR(ret != CVI_RC_SUCCESS, "forward failed"); + } + } + + gettimeofday(&t1, NULL); + elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; + double ms_per_iter = 
elapsed / optTimerRunCount / 1000.0;
+    double fps = 1000.0 / ms_per_iter;
+    std::cout << "Performance result: "
+              << optTimerRunCount << " runs take "
+              << elapsed / 1000.0 << " ms, each run takes "
+              << std::to_string(ms_per_iter) << " ms, fps "
+              << std::to_string(fps) << std::endl;
+  }
+
+  saveResultToNpz(optOutputFile, output_tensors, output_num);
+
+  CVI_NN_CleanupModel(model);
+
+  if (ref_npz.size()) {
+    std::cout << "Compare result: " << (optInferenceCount - fail_cnt) << "/"
+              << optInferenceCount << " passed.\n";
+    if (fail_cnt)
+      return 1;
+  }
+  return 0;
+}
diff --git a/cviruntime/samples/runner/similarity.hpp b/cviruntime/samples/runner/similarity.hpp
new file mode 100644
index 000000000..a380272f7
--- /dev/null
+++ b/cviruntime/samples/runner/similarity.hpp
@@ -0,0 +1,96 @@
+#ifndef RUNTIME_SIMILARITY_H
+#define RUNTIME_SIMILARITY_H
+
+#include <cmath>
+#include <cstdint>
+#include <typeinfo>
+#include <vector>
+
+static float u16_to_bf16(uint16_t val) {
+  float ret;
+  auto *q = reinterpret_cast<uint16_t *>(&ret);
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+  q[0] = val;
+#else
+  q[1] = val;
+#endif
+  return ret;
+}
+
+template <typename U, typename V>
+static bool array_convert(U *u, V *v, std::vector<float> &uu, std::vector<float> &vv) {
+  size_t equal_cnt = 0;
+  for (size_t i = 0; i < uu.size(); i++) {
+    uu[i] = (typeid(U) == typeid(uint16_t)) ? u16_to_bf16(u[i]) : static_cast<float>(u[i]);
+    vv[i] = (typeid(V) == typeid(uint16_t)) ? u16_to_bf16(v[i]) : static_cast<float>(v[i]);
+    if (uu[i] == vv[i])
+      equal_cnt++;
+  }
+  return equal_cnt == uu.size();
+}
+
+static float array_average(float *u, float *v, size_t size) {
+  double average = 0;
+  for (size_t i = 0; i < size; i++) {
+    average += u[i] * v[i];
+  }
+  return average / size;
+}
+
+static float array_average(float *u, size_t size, int power = 1) {
+  double average = 0;
+  for (size_t i = 0; i < size; i++) {
+    if (power != 1) {
+      average += pow(u[i], power);
+    } else {
+      average += u[i];
+    }
+  }
+  return average / size;
+}
+
+static float euclidean_similarity(float *u, float *v, size_t size) {
+  double distance = 0;
+  double root = 0;
+  for (size_t i = 0; i < size; i++) {
+    distance += pow(u[i] - v[i], 2);
+    root += pow((u[i] + v[i]) / 2, 2);
+  }
+  distance = sqrt(distance);
+  root = sqrt(root);
+  return (float)(1 - distance / root);
+}
+
+static float correlation_similarity(float *u, float *v, size_t size, bool centered) {
+  if (centered) {
+    float umu = array_average(u, size);
+    float vmu = array_average(v, size);
+    for (size_t i = 0; i < size; i++) {
+      u[i] -= umu;
+      v[i] -= vmu;
+    }
+  }
+
+  float uv = array_average(u, v, size);
+  float uu = array_average(u, size, 2);
+  float vv = array_average(v, size, 2);
+  return uv / sqrt(uu * vv);
+}
+
+template <typename U, typename V>
+static void array_similarity(U *u, V *v, size_t size, float &euclidean, float &cosine,
+                             float &correlation) {
+  std::vector<float> uu(size, 0);
+  std::vector<float> vv(size, 0);
+  // identical arrays: report perfect similarity without further math
+  if (array_convert(u, v, uu, vv)) {
+    euclidean = 1;
+    cosine = 1;
+    correlation = 1;
+    return;
+  }
+  euclidean = euclidean_similarity(uu.data(), vv.data(), uu.size());
+  cosine = correlation_similarity(uu.data(), vv.data(), uu.size(), false);
+  correlation = correlation_similarity(uu.data(), vv.data(), uu.size(), true);
+}
+
+#endif
\ No newline at end of file
diff --git a/cviruntime/samples/samples_extra/CMakeLists.txt b/cviruntime/samples/samples_extra/CMakeLists.txt
new file mode 100644
index 000000000..8e1dbc08b
--- /dev/null
+++ b/cviruntime/samples/samples_extra/CMakeLists.txt
@@ -0,0 +1,26 @@
+cmake_minimum_required(VERSION 2.8.0)
+
+project(cvitek_samples C CXX)
+
+add_subdirectory(alphapose_fused_preprocess)
+add_subdirectory(detector_yolov3_fused_preprocess)
+add_subdirectory(detector_yolov5_fused_preprocess)
+add_subdirectory(detector_yolov5-face_fused_preprocess)
+add_subdirectory(detector_yolox_s)
+add_subdirectory(insightface_fused_preprocess)
+add_subdirectory(detector_ppyoloem_fused_preprocess)
+add_subdirectory(detector_yolov8n_fused_preprocess)
+
+set(SCRIPT_FILES
+    run_alphapose_fused_preprocess.sh
+    run_detector_yolov3_fused_preprocess.sh
+    run_detector_yolov5_fused_preprocess.sh
+    run_detector_yolox_s.sh
+    run_insightface_fused_preprocess.sh
+    run_detector_ppyoloem_fused_preprocess.sh)
+
+install(FILES ${SCRIPT_FILES}
+    PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_READ WORLD_READ
+    DESTINATION samples_extra)
+
+install(DIRECTORY data DESTINATION samples_extra)
\ No newline at end of file
diff --git a/cviruntime/samples/samples_extra/README.md b/cviruntime/samples/samples_extra/README.md
new file mode 100644
index 000000000..12418a73c
--- /dev/null
+++ b/cviruntime/samples/samples_extra/README.md
@@ -0,0 +1,19 @@
+# Samples extra for CVI TPU SDK
+
+## Yolo detection series
+
+Sample implementations of yolov3 (without post-processing), yolov5 (with post-processing) and yolox are provided. Please refer to the implementations under the detector_yolov3_fused_preprocess, detector_yolov5_fused_preprocess and detector_yolox_s directories.
+
+## Preprocess classification
+Pre-processing can run either on the TPU or on the VPSS. Taking the classification model as an example:
+
+1. See classifier_tpu_preprocess under the preprocess directory. \
+The advantage is that it saves a memory copy during pre-processing, at the cost of more ion memory.
+
+2. See classifier_vpss_preprocess under the preprocess directory. \
+The advantage is that the VPSS supports more kinds of pre-processing and no extra TPU pre-processing is needed, but a memory copy is required.
+
+## Complex deployment scenario sample
+1. For pose estimation, refer to the alphapose_fused_preprocess directory.
+2. For face detection and recognition, refer to the insightface_fused_preprocess directory (a minimal sketch of the fused-preprocess input path follows below).
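+
+## Fused preprocess input (illustrative sketch)
+
+The fused-preprocess samples all feed raw uint8 RGB planar data straight into the network input. A minimal sketch, assuming a cvimodel compiled with --fuse_preprocess and an input laid out as in the detector samples; the helper name is illustrative and error handling is omitted, so this is not verbatim sample code:
+
+``` cpp
+#include <cstring>
+#include <opencv2/opencv.hpp>
+#include "cviruntime.h"
+
+// Resize a BGR image, convert it to RGB planar, and copy it into the tensor.
+static void fill_fused_input(CVI_TENSOR *input, const cv::Mat &image) {
+  CVI_SHAPE shape = CVI_NN_TensorShape(input);
+  int h = shape.dim[2], w = shape.dim[3];
+  cv::Mat resized;
+  cv::resize(image, resized, cv::Size(w, h));        // scale to model input
+  cv::cvtColor(resized, resized, cv::COLOR_BGR2RGB); // packed BGR -> packed RGB
+  cv::Mat channels[3];
+  cv::split(resized, channels);                      // packed -> planar
+  uint8_t *ptr = (uint8_t *)CVI_NN_TensorPtr(input);
+  for (int i = 0; i < 3; ++i)                        // one plane per channel
+    memcpy(ptr + i * h * w, channels[i].data, (size_t)h * w);
+}
+```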
+
diff --git a/cviruntime/samples/samples_extra/alphapose_fused_preprocess/CMakeLists.txt b/cviruntime/samples/samples_extra/alphapose_fused_preprocess/CMakeLists.txt
new file mode 100644
index 000000000..c54dc67db
--- /dev/null
+++ b/cviruntime/samples/samples_extra/alphapose_fused_preprocess/CMakeLists.txt
@@ -0,0 +1,43 @@
+cmake_minimum_required(VERSION 2.8.0)
+
+project(cvi_sample_detector C CXX)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS ON)
+
+if(NOT DEFINED TPU_SDK_PATH)
+  message(FATAL_ERROR "Please set TPU_SDK_PATH to point to the TPU_SDK installation")
+endif()
+include_directories(${TPU_SDK_PATH}/include)
+link_directories(${TPU_SDK_PATH}/lib)
+
+if(NOT DEFINED OPENCV_PATH)
+  message(FATAL_ERROR "Please set OPENCV_PATH to point to the opencv installation")
+endif()
+include_directories(${OPENCV_PATH}/include)
+link_directories(${OPENCV_PATH}/lib)
+
+set(CVI_LIBS ${CVI_LIBS} cviruntime cvikernel)
+if(NOT CMAKE_CROSSCOMPILING)
+  set(CVI_LIBS ${CVI_LIBS} cvicmodel)
+endif()
+
+set(OPENCV_LIBS ${OPENCV_LIBS} opencv_core opencv_imgcodecs opencv_imgproc)
+if(NOT CMAKE_CROSSCOMPILING)
+  set(OPENCV_LIBS ${OPENCV_LIBS} opencv_highgui)
+endif()
+
+set(EXTRA_LIBS ${EXTRA_LIBS} dl stdc++ pthread z)
+
+add_executable(cvi_sample_alphapose_fused_preprocess
+  alphapose_fused_preprocess.cpp
+  pose_detector.cpp
+  yolo_v3_detector.cpp
+  pose_utils.cpp)
+target_link_libraries(cvi_sample_alphapose_fused_preprocess
+  ${CVI_LIBS}
+  ${OPENCV_LIBS}
+  ${EXTRA_LIBS})
+install(TARGETS cvi_sample_alphapose_fused_preprocess
+  DESTINATION samples_extra/bin)
diff --git a/cviruntime/samples/samples_extra/alphapose_fused_preprocess/README.md b/cviruntime/samples/samples_extra/alphapose_fused_preprocess/README.md
new file mode 100644
index 000000000..35333d428
--- /dev/null
+++ b/cviruntime/samples/samples_extra/alphapose_fused_preprocess/README.md
@@ -0,0 +1,186 @@
+# Alphapose Sample
+
+### Download and convert the model under docker (optional)
+
+#### For the new toolchain
+The following packages are required:
+* tpu-mlir_xxxx.tar.gz (the release package of tpu-mlir)
+
+Shell commands to generate the cvimodels:
+``` shell
+tar zxf tpu-mlir_xxxx.tar.gz
+source tpu-mlir_xxxx/envsetup.sh
+
+mkdir workspace && cd workspace
+cp $TPUC_ROOT/regression/image/dog.jpg .
+cp $TPUC_ROOT/regression/image/pose_256_192.jpg .
+cp -rf $TPUC_ROOT/regression/dataset/COCO2017 .
+cp $TPUC_ROOT/regression/cali_tables/alphapose_res50_cvi_cali_table .
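+
+# (Sanity check added for illustration; it is not part of the original guide.
+# The Caffe/ONNX model files referenced below are assumed to have been
+# downloaded into this workspace beforehand.)
+for f in yolov3_416_with_detection.prototxt yolov3_416.caffemodel \
+         alphapose_resnet50_256x192.onnx; do
+  [ -e "$f" ] || { echo "missing $f, please fetch it first"; exit 1; }
+done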
+
+## yolov3
+model_transform.py \
+--model_name yolov3 \
+--model_def ./yolov3_416_with_detection.prototxt \
+--model_data ./yolov3_416.caffemodel \
+--test_input ./dog.jpg \
+--test_result yolov3_top_output.npz \
+--input_shapes [[1,3,416,416]] \
+--resize_dims 416,416 \
+--keep_aspect_ratio true \
+--mean 0,0,0 \
+--scale 0.00392,0.00392,0.00392 \
+--pixel_format "rgb" \
+--tolerance 0.99,0.99 \
+--excepts output \
+--mlir yolov3.mlir
+
+run_calibration.py \
+yolov3.mlir \
+--dataset=./COCO2017 \
+--input_num=100 \
+-o yolov3_calibration_table
+
+model_deploy.py \
+--mlir yolov3.mlir \
+--calibration_table yolov3_calibration_table \
+--chip cv183x \
+--quantize INT8 \
+--quant_input \
+--test_input ./dog.jpg \
+--test_reference yolov3_top_output.npz \
+--excepts output \
+--tolerance 0.9,0.3 \
+--fuse_preprocess \
+--customization_format RGB_PLANAR \
+--model yolo_v3_416_fused_preprocess_with_detection.cvimodel
+
+## alphapose
+model_transform.py \
+--model_name alphapose \
+--model_def ./alphapose_resnet50_256x192.onnx \
+--test_input ./pose_256_192.jpg \
+--test_result alphapose_top_output.npz \
+--input_shapes [[1,3,256,192]] \
+--resize_dims 256,192 \
+--scale 0.00392,0.00392,0.00392 \
+--mean 103.53,116.535,122.399 \
+--pixel_format "rgb" \
+--tolerance 0.99,0.99 \
+--mlir alphapose.mlir
+
+model_deploy.py \
+--mlir alphapose.mlir \
+--calibration_table alphapose_res50_cvi_cali_table \
+--test_input pose_256_192.jpg \
+--test_reference alphapose_top_output.npz \
+--excepts 404_Relu \
+--chip cv183x \
+--quantize INT8 \
+--quant_input \
+--tolerance 0.89,0.49 \
+--fuse_preprocess \
+--customization_format RGB_PLANAR \
+--model alphapose_fused_preprocess.cvimodel
+```
+
+#### For the old toolchain
+The following packages are required:
+
+* cvitek_mlir_ubuntu-18.04.tar.gz
+
+Shell commands to transform the models:
+``` shell
+tar zxf cvitek_mlir_ubuntu-18.04.tar.gz
+source cvitek_mlir/cvitek_envs.sh
+
+mkdir workspace && cd workspace
+cp $MLIR_PATH/tpuc/regression/data/dog.jpg .
+cp $MLIR_PATH/tpuc/regression/data/pose_256_192.jpg .
+cp -rf $MLIR_PATH/tpuc/regression/data/images .
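+
+# (Illustrative note, not part of the original guide: a generated cvimodel can
+# be smoke-tested on the EVB with the model runner sample built from
+# samples/runner, comparing outputs against a reference npz, e.g.
+#   ./cvi_sample_model_runner -m alphapose_fused_preprocess.cvimodel \
+#     -i input.npz -o out.npz -r ref.npz -t 0.9,0.9,0.6
+# where input.npz and ref.npz are placeholder file names.)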
+ +## yolov3 +model_transform.py \ +--model_type caffe \ +--model_name yolov3_416 \ +--model_def ./yolov3_416_with_detection.prototxt \ +--model_data ./yolov3_416.caffemodel \ +--image ./dog.jpg \ +--image_resize_dims 416,416 \ +--keep_aspect_ratio true \ +--raw_scale 1 \ +--model_channel_order "rgb" \ +--tolerance 0.99,0.99,0.99 \ +--excepts output \ +--mlir yolov3_416_fp32.mlir + +run_calibration.py \ +yolov3_416_fp32.mlir \ +--dataset=./images \ +--input_num=100 \ +-o yolo_v3_calibration_table_autotune + +model_deploy.py \ +--model_name yolov3_416 \ +--mlir yolov3_416_fp32.mlir \ +--calibration_table yolo_v3_calibration_table_autotune \ +--fuse_preprocess \ +--pixel_format RGB_PLANAR \ +--aligned_input false \ +--excepts output \ +--chip cv183x \ +--quantize INT8 \ +--image dog.jpg \ +--tolerance 0.9,0.9,0.3 \ +--correctness 0.95,0.95,0.9 \ +--cvimodel yolo_v3_416_fused_preprocess_with_detection.cvimodel + +## alphapose +model_transform.py \ +--model_type onnx \ +--model_name alphapose \ +--model_def ./alphapose_resnet50_256x192.onnx \ +--image ./pose_256_192.jpg \ +--net_input_dims 256,192 \ +--image_resize_dims 256,192 \ +--raw_scale 1 \ +--mean 0.406,0.457,0.48 \ +--model_channel_order "rgb" \ +--tolerance 0.99,0.99,0.99 \ +--mlir alphapose_fp32.mlir + +run_calibration.py \ +alphapose_fp32.mlir \ +--dataset=./images \ +--input_num=100 \ +-o alphapose_calibration_table + +model_deploy.py \ +--model_name alphapose \ +--mlir alphapose_fp32.mlir \ +--calibration_table alphapose_calibration_table \ +--fuse_preprocess \ +--pixel_format RGB_PLANAR \ +--aligned_input false \ +--excepts 404_Relu \ +--chip cv183x \ +--quantize INT8 \ +--image pose_256_192.jpg \ +--tolerance 0.91,0.89,0.49 \ +--correctness 0.95,0.95,0.9 \ +--cvimodel alphapose_fused_preprocess.cvimodel +``` + +Copy generated yolo_v3_416_fused_preprocess_with_detection.cvimodel and alphapose_fused_preprocess.cvimodel to EVB board + +## How To Compile Sample In Docker +View the Top level directory README.md or View the cvitek_tpu_quick_start_guide.md + +## Run Samples In EVB Borad +``` +cd install_samples/samples_extra +./bin/cvi_sample_alphapose_fused_preprocess \ +yolo_v3_416_fused_preprocess_with_detection.cvimodel \ +alphapose_fused_preprocess.cvimodel \ +./data/pose_demo_2.jpg \ +alphapose_out.jpg +``` \ No newline at end of file diff --git a/cviruntime/samples/samples_extra/alphapose_fused_preprocess/alphapose_fused_preprocess.cpp b/cviruntime/samples/samples_extra/alphapose_fused_preprocess/alphapose_fused_preprocess.cpp new file mode 100644 index 000000000..6a6a59d33 --- /dev/null +++ b/cviruntime/samples/samples_extra/alphapose_fused_preprocess/alphapose_fused_preprocess.cpp @@ -0,0 +1,120 @@ +#include +#include +#include +#include +#include +#include +#include "yolo_v3_detector.h" +#include "pose_detector.h" + +#define MEASURE_TIME +#ifdef MEASURE_TIME +#include +static long total_elapsed = 0; +#endif + +static void usage(char **argv) { + printf("Usage:\n"); + printf(" %s yolo_cvimodel pose_cvimodel image.jpg output_image_pose.jpg [repeat] " + "[max_pose]\n", + argv[0]); +} + +int main(int argc, char **argv) { + int ret = 0; + if (argc != 5 && argc != 6 && argc != 7) { + usage(argv); + exit(-1); + } + int repeat = 1; + if (argc >= 6) { + repeat = atoi(argv[5]); + } + int max_pose = -1; + if (argc >= 7) { + max_pose = atoi(argv[6]); + } + + // imread + cv::Mat image; + image = cv::imread(argv[3]); + if (!image.data) { + printf("Could not open or find the image\n"); + return -1; + } + + // register model + YoloV3Detector 
yolo_detector(argv[1]); + PoseDetector pose_detector(argv[2]); + + while (repeat) { +#ifdef MEASURE_TIME + struct timeval t0, t1; + long elapsed, gross_total_elapsed = 0; + gettimeofday(&t0, NULL); +#endif + + detection dets[MAX_DET]; + yolo_detector.doPreProccess(image); + yolo_detector.doInference(); + int num_det = yolo_detector.doPostProccess(image.rows, image.cols, dets, MAX_DET); + + std::vector pose_list; + std::vector align_bbox_list; + std::vector pose_pred_list; + + for (int i = 0; i < num_det; i++) { + if (i == max_pose) + break; + if (dets[i].cls != 0) { // 0 is person + continue; + } + + pose_detector.doPreProccess_ResizeOnly(image, dets[i], align_bbox_list); + pose_detector.doInference(); + + // post process + auto output_tensor = pose_detector.output; + auto output_shape = CVI_NN_TensorShape(output_tensor); + cv::Mat pose_pred({output_shape.dim[0], output_shape.dim[1], output_shape.dim[2], + output_shape.dim[3]}, + CV_32FC1, cv::Scalar(0)); + memcpy(pose_pred.data, CVI_NN_TensorPtr(output_tensor), + CVI_NN_TensorSize(output_tensor)); + + pose_pred_list.push_back(pose_pred); + } + + pose_list.resize(pose_pred_list.size()); + simple_postprocess(pose_pred_list, align_bbox_list, pose_list); + +#ifdef MEASURE_TIME + gettimeofday(&t1, NULL); + elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; + printf("TIMER: one frame total time %ld us (with %zu pose)\n", elapsed, + pose_list.size()); + gettimeofday(&t0, NULL); +#endif + + cv::Mat draw_img = draw_pose(image, pose_list); + +#ifdef MEASURE_TIME + gettimeofday(&t1, NULL); + elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; + printf("TIMER: draw %ld us\n", elapsed); + t0 = t1; +#endif + + if (repeat == 1) { + cv::imwrite(argv[4], draw_img); + } + + printf("------\n"); + printf(" %d poses are detected\n", pose_list.size()); + printf("------\n"); + + repeat--; + } + + return 0; +} diff --git a/cviruntime/samples/samples_extra/alphapose_fused_preprocess/pose_detector.cpp b/cviruntime/samples/samples_extra/alphapose_fused_preprocess/pose_detector.cpp new file mode 100644 index 000000000..a7a1093c7 --- /dev/null +++ b/cviruntime/samples/samples_extra/alphapose_fused_preprocess/pose_detector.cpp @@ -0,0 +1,80 @@ +#include +#include "pose_detector.h" +#include "pose_utils.h" + +PoseDetector::PoseDetector(const char *model_file) { + int ret = CVI_NN_RegisterModel(model_file, &model); + if (ret != CVI_RC_SUCCESS) { + printf("CVI_NN_RegisterModel failed, err %d\n", ret); + exit(1); + } + + // get input output tensors + CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num, &output_tensors, + &output_num); + + input = CVI_NN_GetTensorByName(CVI_NN_DEFAULT_TENSOR, input_tensors, input_num); + assert(input); + output = &output_tensors[0]; + assert(output); + + shape = CVI_NN_TensorShape(input); + height = shape.dim[2]; + width = shape.dim[3]; + +} + +PoseDetector::~PoseDetector() { + if (model) { + CVI_NN_CleanupModel(model); + } +} + +void PoseDetector::doPreProccess_ResizeOnly(cv::Mat &image, detection &det, + std::vector &align_bbox_list) { + int pose_h = height; + int pose_w = width; + box b = det.bbox; + bbox_t b_pose; + b_pose.x1 = (b.x - b.w / 2); + b_pose.y1 = (b.y - b.h / 2); + b_pose.x2 = (b.x + b.w / 2); + b_pose.y2 = (b.y + b.h / 2); + + float aspect_ratio = (float)pose_w / pose_h; + + float x = b_pose.x1; + float y = b_pose.y1; + float w = b_pose.x2 - b_pose.x1; + float h = b_pose.y2 - b_pose.y1; + + std::vector center(2, 0); + std::vector scale(2, 0); + box_to_center_scale(x, y, w, 
h, aspect_ratio, scale, center); + + cv::Mat trans = get_affine_transform(center, scale, {(float)pose_h, (float)pose_w}); + cv::Mat align_image; + cv::warpAffine(image, align_image, trans, cv::Size(int(pose_w), int(pose_h)), + cv::INTER_LINEAR); + align_bbox_list.push_back(center_scale_to_box(center, scale)); + + cv::cvtColor(align_image, align_image, cv::COLOR_BGR2RGB); + + //Packed2Planar + cv::Mat channels[3]; + for (int i = 0; i < 3; i++) { + channels[i] = cv::Mat(align_image.rows, align_image.cols, CV_8SC1); + } + cv::split(align_image, channels); + + // fill data + int8_t *ptr = (int8_t *)CVI_NN_TensorPtr(input); + int channel_size = height * width; + for (int i = 0; i < 3; ++i) { + memcpy(ptr + i * channel_size, channels[i].data, channel_size); + } +} + +void PoseDetector::doInference() { + CVI_NN_Forward(model, input_tensors, input_num, output_tensors, output_num); +} diff --git a/cviruntime/samples/samples_extra/alphapose_fused_preprocess/pose_detector.h b/cviruntime/samples/samples_extra/alphapose_fused_preprocess/pose_detector.h new file mode 100644 index 000000000..6e8166909 --- /dev/null +++ b/cviruntime/samples/samples_extra/alphapose_fused_preprocess/pose_detector.h @@ -0,0 +1,39 @@ +#ifndef POSE_DETECTOR_H +#define POSE_DETECTOR_H + +#include +#include +#include +#include +#include +#include +#include +#include "cviruntime.h" +#include "yolo_v3_detector.h" +#include "pose_utils.h" + +class PoseDetector { +public: + PoseDetector(const char *model_file); + ~PoseDetector(); + + void doPreProccess_ResizeOnly(cv::Mat &image, detection &det, + std::vector &align_bbox_list); + void doInference(); + +public: + CVI_TENSOR *input; + CVI_TENSOR *output; + +private: + CVI_MODEL_HANDLE model = nullptr; + CVI_TENSOR *input_tensors; + CVI_TENSOR *output_tensors; + int32_t input_num; + int32_t output_num; + CVI_SHAPE shape; + int32_t height; + int32_t width; +}; + +#endif diff --git a/cviruntime/samples/samples_extra/alphapose_fused_preprocess/pose_utils.cpp b/cviruntime/samples/samples_extra/alphapose_fused_preprocess/pose_utils.cpp new file mode 100644 index 000000000..b1263a433 --- /dev/null +++ b/cviruntime/samples/samples_extra/alphapose_fused_preprocess/pose_utils.cpp @@ -0,0 +1,252 @@ +#include "math.h" +#include "stdio.h" +#include "assert.h" +#include +#include "pose_utils.h" + +using namespace std; +using namespace cv; + +static std::vector> l_pair = {{0, 1}, {0, 2}, {1, 3}, {2, 4}, {5, 6}, + {5, 7}, {7, 9}, {6, 8}, {8, 10}, {17, 11}, + {17, 12}, {11, 13}, {12, 14}, {13, 15}, {14, 16}}; + +static std::vector p_color = { + {0, 255, 255}, {0, 191, 255}, {0, 255, 102}, {0, 77, 255}, {0, 255, 0}, + {77, 255, 255}, {77, 255, 204}, {77, 204, 255}, {191, 255, 77}, {77, 191, 255}, + {191, 255, 77}, {204, 77, 255}, {77, 255, 204}, {191, 77, 255}, {77, 255, 191}, + {127, 77, 255}, {77, 255, 127}, {0, 255, 255}}; + +static std::vector line_color = { + {0, 215, 255}, {0, 255, 204}, {0, 134, 255}, {0, 255, 50}, {77, 255, 222}, + {77, 196, 255}, {77, 135, 255}, {191, 255, 77}, {77, 255, 77}, {77, 222, 255}, + {255, 156, 127}, {0, 127, 255}, {255, 127, 77}, {0, 77, 255}, {255, 77, 36}}; + +std::vector yolo_detect() { + std::vector yolo_bbox_list(5); + yolo_bbox_list[0] = bbox_t(137.53755547, 114.67451232, 78.44751164, 182.63019933); + yolo_bbox_list[1] = bbox_t(49.44112301, 134.16529877, 73.67089391, 159.10105168); + yolo_bbox_list[2] = bbox_t(464.84487668, 77.33579667, 98.59243602, 229.26176143); + yolo_bbox_list[3] = bbox_t(353.29705574, 90.37665582, 79.01046082, 211.99965902); + 
yolo_bbox_list[4] = bbox_t(228.44323056, 108.68967827, 107.76577916, 190.25104091); + // x, y, w, h to x1, y1, x2, y2 + for (int i = 0; i < 5; ++i) { + yolo_bbox_list[i].x2 += yolo_bbox_list[i].x1; + yolo_bbox_list[i].y2 += yolo_bbox_list[i].y1; + } + + return std::move(yolo_bbox_list); +} + +cv::Point2f get_3rd_point(cv::Point2f a, cv::Point2f b) { + Point2f direct; + direct.x = b.x - (a - b).y; + direct.y = b.y + (a - b).x; + + return direct; +} + +std::vector get_dir(float src_w) { + + float sn = sin(0); + float cs = cos(0); + + std::vector src_result(2, 0); + src_result[0] = -src_w * sn; + src_result[1] = src_w * cs; + + return src_result; +} + +cv::Mat get_affine_transform(const std::vector ¢er, const std::vector &scale, + const std::vector &output_size, bool inv) { + std::vector shift(2, 0); + float src_w = scale[0]; + int dst_h = output_size[0]; + int dst_w = output_size[1]; + + std::vector src_dir = get_dir(src_w * -0.5); + std::vector dst_dir(2, 0); + dst_dir[1] = dst_w * -0.5; + + cv::Point2f src[3]; + cv::Point2f dst[3]; + + src[0] = Point2f(center[0], center[1]); + src[1] = Point2f(center[0] + src_dir[0], center[1] + src_dir[1]); + src[2] = get_3rd_point(src[0], src[1]); + dst[0] = Point2f(dst_w * 0.5, dst_h * 0.5); + dst[1] = Point2f(dst_w * 0.5 + dst_dir[0], dst_h * 0.5 + dst_dir[1]); + dst[2] = get_3rd_point(dst[0], dst[1]); + + if (inv) + return cv::getAffineTransform(dst, src); + else + return cv::getAffineTransform(src, dst); +} + +bbox_t center_scale_to_box(const std::vector ¢er, const std::vector &scale) { + float w = scale[0] * 1.0; + float h = scale[1] * 1.0; + bbox_t bbox; + + bbox.x1 = center[0] - w * 0.5; + bbox.y1 = center[1] - h * 0.5; + bbox.x2 = bbox.x1 + w; + bbox.y2 = bbox.y1 + h; + + return bbox; +} + +void box_to_center_scale(float x, float y, float w, float h, float aspect_ratio, + std::vector &scale, std::vector ¢er) { + float pixel_std = 1; + float scale_mult = 1.25; + + center[0] = x + w * 0.5; + center[1] = y + h * 0.5; + + if (w > aspect_ratio * h) { + h = w / aspect_ratio; + } else if (w < aspect_ratio * h) { + w = h * aspect_ratio; + } + + scale[0] = w * 1.0 / pixel_std; + scale[1] = h * 1.0 / pixel_std; + if (center[0] != -1) { + scale[0] = scale[0] * scale_mult; + scale[1] = scale[1] * scale_mult; + } +} + +void get_max_pred(const Mat &pose_pred, pose_t &dst_pose) { + int inner_size = pose_pred.size[2] * pose_pred.size[3]; + float *ptr = (float *)pose_pred.data; + for (int c = 0; c < POSE_PTS_NUM; ++c) { + dst_pose.score[c] = 0; + dst_pose.x[c] = 0; + dst_pose.y[c] = 0; + // for (int h = 0; h < pose_pred.size[2]; ++h) { + // for (int w = 0; w < pose_pred.size[3]; ++w) { + // float current_score = blob_to_val(pose_pred, 0, c, h, w); + // if (current_score > dst_pose.score[c]) { + // dst_pose.score[c] = current_score; + // dst_pose.x[c] = w; + // dst_pose.y[c] = h; + // } + // } + //} + int max_idx = 0; + for (int i = 0; i < inner_size; ++i) { + if (ptr[i] > dst_pose.score[c]) { + dst_pose.score[c] = ptr[i]; + max_idx = i; + } + } + dst_pose.x[c] = max_idx % pose_pred.size[3]; + dst_pose.y[c] = max_idx / pose_pred.size[3]; + ptr += inner_size; + } +} + +void simple_postprocess(const std::vector &pose_pred_list, + const std::vector &align_bbox_list, + std::vector &dst_pose_list) { + for (int i = 0; i < pose_pred_list.size(); ++i) { + float x = align_bbox_list[i].x1; + float y = align_bbox_list[i].y1; + float w = align_bbox_list[i].x2 - align_bbox_list[i].x1; + float h = align_bbox_list[i].y2 - align_bbox_list[i].y1; + std::vector center = {(float)(x 
+ w * 0.5), (float)(y + h * 0.5)}; + std::vector scale = {w, h}; + + get_max_pred(pose_pred_list[i], dst_pose_list[i]); + cv::Mat trans = get_affine_transform( + center, scale, + {(float)pose_pred_list[i].size[2], (float)pose_pred_list[i].size[3]}, true); + for (int c = 0; c < POSE_PTS_NUM; ++c) { + dst_pose_list[i].x[c] = trans.at(0) * dst_pose_list[i].x[c] + + trans.at(1) * dst_pose_list[i].y[c] + + trans.at(2); + dst_pose_list[i].y[c] = trans.at(3) * dst_pose_list[i].x[c] + + trans.at(4) * dst_pose_list[i].y[c] + + trans.at(5); + } + } +} + +Mat draw_pose(cv::Mat &image, std::vector &pose_list) { + int height = image.rows; + int width = image.cols; + + cv::Mat img = image.clone(); + + for (pose_t pose : pose_list) { + std::vector kp_preds(POSE_PTS_NUM); + std::vector kp_scores(POSE_PTS_NUM); + + for (int i = 0; i < POSE_PTS_NUM; ++i) { + kp_preds[i].x = pose.x[i]; + kp_preds[i].y = pose.y[i]; + kp_scores[i] = pose.score[i]; + } + + Point2f extra_pred; + extra_pred.x = (kp_preds[5].x + kp_preds[6].x) / 2; + extra_pred.y = (kp_preds[5].y + kp_preds[6].y) / 2; + kp_preds.push_back(extra_pred); + + float extra_score = (kp_scores[5] + kp_scores[6]) / 2; + kp_scores.push_back(extra_score); + + // Draw keypoints + unordered_map> part_line; + for (int n = 0; n < kp_scores.size(); n++) { + if (kp_scores[n] <= 0.35) + continue; + + int cor_x = kp_preds[n].x; + int cor_y = kp_preds[n].y; + part_line[n] = make_pair(cor_x, cor_y); + + cv::Mat bg; + img.copyTo(bg); + cv::circle(bg, cv::Size(cor_x, cor_y), 2, p_color[n], -1); + float transparency = max(float(0.0), min(float(1.0), kp_scores[n])); + cv::addWeighted(bg, transparency, img, 1 - transparency, 0, img); + } + + // Draw limbs + for (int i = 0; i < l_pair.size(); i++) { + int start_p = l_pair[i].first; + int end_p = l_pair[i].second; + if (part_line.count(start_p) > 0 && part_line.count(end_p) > 0) { + pair start_xy = part_line[start_p]; + pair end_xy = part_line[end_p]; + + float mX = (start_xy.first + end_xy.first) / 2; + float mY = (start_xy.second + end_xy.second) / 2; + float length = sqrt(pow((start_xy.second - end_xy.second), 2) + + pow((start_xy.first - end_xy.first), 2)); + float angle = + (atan2(start_xy.second - end_xy.second, start_xy.first - end_xy.first)) * + 180.0 / M_PI; + float stickwidth = (kp_scores[start_p] + kp_scores[end_p]) + 1; + std::vector polygon; + cv::ellipse2Poly(cv::Point(int(mX), int(mY)), + cv::Size(int(length / 2), stickwidth), int(angle), 0, 360, 1, + polygon); + + cv::Mat bg; + img.copyTo(bg); + cv::fillConvexPoly(bg, polygon, line_color[i]); + float transparency = + max(float(0.0), + min(float(1.0), float(0.5) * (kp_scores[start_p] + kp_scores[end_p]))); + cv::addWeighted(bg, transparency, img, 1 - transparency, 0, img); + } + } + } + return img; +} diff --git a/cviruntime/samples/samples_extra/alphapose_fused_preprocess/pose_utils.h b/cviruntime/samples/samples_extra/alphapose_fused_preprocess/pose_utils.h new file mode 100644 index 000000000..5329ef254 --- /dev/null +++ b/cviruntime/samples/samples_extra/alphapose_fused_preprocess/pose_utils.h @@ -0,0 +1,39 @@ +#ifndef _POSE_UTILS_H_ +#define _POSE_UTILS_H_ +#include +#include + +#define POSE_PTS_NUM 17 + +struct bbox_t { + bbox_t(float _x1 = 0, float _y1 = 0, float _x2 = 0, float _y2 = 0) + : x1(_x1), y1(_y1), x2(_x2), y2(_y2) {} + + float x1; + float y1; + float x2; + float y2; +}; + +struct pose_t { + float x[POSE_PTS_NUM]; + float y[POSE_PTS_NUM]; + float score[POSE_PTS_NUM]; +}; + +cv::Mat get_affine_transform(const std::vector ¢er, + const 
std::vector &scale, + const std::vector &output_size, bool inv = false); + +bbox_t center_scale_to_box(const std::vector ¢er, + const std::vector &scale); + +void box_to_center_scale(float x, float y, float w, float h, float aspect_ratio, + std::vector &scale, std::vector ¢er); + +void simple_postprocess(const std::vector &pose_pred_list, + const std::vector &align_bbox_list, + std::vector &dst_pose_list); +cv::Mat draw_pose(cv::Mat &image, std::vector &pose_list); + +#endif // _POSE_UTILS_H_ diff --git a/cviruntime/samples/samples_extra/alphapose_fused_preprocess/yolo_v3_detector.cpp b/cviruntime/samples/samples_extra/alphapose_fused_preprocess/yolo_v3_detector.cpp new file mode 100644 index 000000000..379442f2c --- /dev/null +++ b/cviruntime/samples/samples_extra/alphapose_fused_preprocess/yolo_v3_detector.cpp @@ -0,0 +1,125 @@ +#include "yolo_v3_detector.h" + +YoloV3Detector::YoloV3Detector(const char *model_file) { + int ret = CVI_NN_RegisterModel(model_file, &model); + if (ret != CVI_RC_SUCCESS) { + printf("CVI_NN_RegisterModel failed, err %d\n", ret); + exit(1); + } + + // get input output tensors + CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num, &output_tensors, + &output_num); + + input = CVI_NN_GetTensorByName(CVI_NN_DEFAULT_TENSOR, input_tensors, input_num); + assert(input); + output = CVI_NN_GetTensorByName("output", output_tensors, output_num); + assert(output); + + qscale = CVI_NN_TensorQuantScale(input); + shape = CVI_NN_TensorShape(input); + + for (int i = 0; i < 3; i++) { + channels[i] = cv::Mat(shape.dim[2], shape.dim[3], CV_8SC1); + } +} + +YoloV3Detector::~YoloV3Detector() { + if (model) { + CVI_NN_CleanupModel(model); + } +} + +void YoloV3Detector::doPreProccess(cv::Mat &image) { + cv::Mat resized_image; + // resize & letterbox + int ih = image.rows; + int iw = image.cols; + int oh = shape.dim[2]; + int ow = shape.dim[3]; + double resize_scale = std::min((double)oh / ih, (double)ow / iw); + int nh = (int)(ih * resize_scale); + int nw = (int)(iw * resize_scale); + cv::resize(image, resized_image, cv::Size(nw, nh)); + int top = (oh - nh) / 2; + int bottom = (oh - nh) - top; + int left = (ow - nw) / 2; + int right = (ow - nw) - left; + cv::copyMakeBorder(resized_image, resized_image, top, bottom, left, right, + cv::BORDER_CONSTANT, cv::Scalar::all(0)); + + cv::cvtColor(resized_image, resized_image, cv::COLOR_BGR2RGB); + //Packed2Planar + cv::Mat channels[3]; + for (int i = 0; i < 3; i++) { + channels[i] = cv::Mat(resized_image.rows, resized_image.cols, CV_8SC1); + } + cv::split(resized_image, channels); + + // fill data + int8_t *ptr = (int8_t *)CVI_NN_TensorPtr(input); + int channel_size = oh * ow; + for (int i = 0; i < 3; ++i) { + memcpy(ptr + i * channel_size, channels[i].data, channel_size); + } + +} + +void YoloV3Detector::doInference() { + // run inference + CVI_NN_Forward(model, input_tensors, input_num, output_tensors, output_num); +} + +int32_t YoloV3Detector::doPostProccess(int32_t image_h, int32_t image_w, detection dets[], + int32_t max_det_num) { + int32_t det_num = 0; + float *output_ptr = (float *)CVI_NN_TensorPtr(output); + for (int i = 0; i < max_det_num; ++i) { + // filter real det with score > 0 + if (output_ptr[i * 6 + 5] > 0) { + // output: [x,y,w,h,cls,score] + dets[det_num].bbox.x = output_ptr[i * 6 + 0]; + dets[det_num].bbox.y = output_ptr[i * 6 + 1]; + dets[det_num].bbox.w = output_ptr[i * 6 + 2]; + dets[det_num].bbox.h = output_ptr[i * 6 + 3]; + dets[det_num].cls = output_ptr[i * 6 + 4]; + dets[det_num].score = output_ptr[i * 6 + 
+void YoloV3Detector::doInference() {
+  // run inference
+  CVI_NN_Forward(model, input_tensors, input_num, output_tensors, output_num);
+}
+
+int32_t YoloV3Detector::doPostProccess(int32_t image_h, int32_t image_w, detection dets[],
+                                       int32_t max_det_num) {
+  int32_t det_num = 0;
+  float *output_ptr = (float *)CVI_NN_TensorPtr(output);
+  for (int i = 0; i < max_det_num; ++i) {
+    // keep only real detections (score > 0)
+    if (output_ptr[i * 6 + 5] > 0) {
+      // output layout: [x,y,w,h,cls,score]
+      dets[det_num].bbox.x = output_ptr[i * 6 + 0];
+      dets[det_num].bbox.y = output_ptr[i * 6 + 1];
+      dets[det_num].bbox.w = output_ptr[i * 6 + 2];
+      dets[det_num].bbox.h = output_ptr[i * 6 + 3];
+      dets[det_num].cls = (int)output_ptr[i * 6 + 4];
+      dets[det_num].score = output_ptr[i * 6 + 5];
+      det_num++;
+    }
+  }
+  printf("get detection num: %d\n", det_num);
+  // correct boxes against the original image size
+  correctYoloBoxes(dets, det_num, image_h, image_w, false);
+
+  return det_num;
+}
+
+void YoloV3Detector::correctYoloBoxes(detection *dets, int det_num, int image_h,
+                                      int image_w, bool relative_position) {
+  int i;
+  int restored_w = 0;
+  int restored_h = 0;
+  // undo the letterbox: recover the resized image extent inside the network input
+  if (((float)shape.dim[3] / image_w) < ((float)shape.dim[2] / image_h)) {
+    restored_w = shape.dim[3];
+    restored_h = (image_h * shape.dim[3]) / image_w;
+  } else {
+    restored_h = shape.dim[2];
+    restored_w = (image_w * shape.dim[2]) / image_h;
+  }
+  for (i = 0; i < det_num; ++i) {
+    box b = dets[i].bbox;
+    b.x = (b.x - (shape.dim[3] - restored_w) / 2. / shape.dim[3]) /
+          ((float)restored_w / shape.dim[3]);
+    b.y = (b.y - (shape.dim[2] - restored_h) / 2. / shape.dim[2]) /
+          ((float)restored_h / shape.dim[2]);
+    b.w *= (float)shape.dim[3] / restored_w;
+    b.h *= (float)shape.dim[2] / restored_h;
+    if (!relative_position) {
+      b.x *= image_w;
+      b.w *= image_w;
+      b.y *= image_h;
+      b.h *= image_h;
+    }
+    dets[i].bbox = b;
+  }
+}
diff --git a/cviruntime/samples/samples_extra/alphapose_fused_preprocess/yolo_v3_detector.h b/cviruntime/samples/samples_extra/alphapose_fused_preprocess/yolo_v3_detector.h
new file mode 100644
index 000000000..916170ac5
--- /dev/null
+++ b/cviruntime/samples/samples_extra/alphapose_fused_preprocess/yolo_v3_detector.h
@@ -0,0 +1,53 @@
+#ifndef YOLO_V3_DETECTOR_H
+#define YOLO_V3_DETECTOR_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <cmath>
+#include <algorithm>
+#include <vector>
+#include <opencv2/opencv.hpp>
+#include "cviruntime.h"
+
+#define MAX_DET 200
+
+typedef struct {
+  float x, y, w, h;
+} box;
+
+typedef struct {
+  box bbox;
+  int cls;
+  float score;
+} detection;
+
+class YoloV3Detector {
+public:
+  YoloV3Detector(const char *model_file);
+  ~YoloV3Detector();
+
+  void doPreProccess(cv::Mat &image);
+  void doInference();
+  int32_t doPostProccess(int32_t image_h, int32_t image_w, detection det[],
+                         int32_t max_det_num);
+
+private:
+  void correctYoloBoxes(detection *dets, int det_num, int image_h, int image_w,
+                        bool relative_position);
+
+public:
+  CVI_TENSOR *input;
+  CVI_TENSOR *output;
+  cv::Mat channels[3];
+
+private:
+  CVI_MODEL_HANDLE model = nullptr;
+  CVI_TENSOR *input_tensors;
+  CVI_TENSOR *output_tensors;
+  int32_t input_num;
+  int32_t output_num;
+  CVI_SHAPE shape;
+  float qscale;
+};
+
+#endif
diff --git a/cviruntime/samples/samples_extra/classifier_vpss_yuv/CMakeLists.txt b/cviruntime/samples/samples_extra/classifier_vpss_yuv/CMakeLists.txt
new file mode 100644
index 000000000..4c915a0da
--- /dev/null
+++ b/cviruntime/samples/samples_extra/classifier_vpss_yuv/CMakeLists.txt
@@ -0,0 +1,80 @@
+cmake_minimum_required(VERSION 2.8.0)
+
+project(cvi_sample_classifier_vpss_yuv C CXX)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS ON)
+
+if(NOT DEFINED TPU_SDK_PATH)
+  message(FATAL_ERROR "Please set TPU_SDK_PATH to point to the TPU_SDK installation")
+endif()
+include_directories(${TPU_SDK_PATH}/include)
+link_directories(${TPU_SDK_PATH}/lib)
+
+if(NOT DEFINED OPENCV_PATH)
+  message(FATAL_ERROR "Please set OPENCV_PATH to point to the opencv installation")
+endif()
+include_directories(${OPENCV_PATH}/include)
+link_directories(${OPENCV_PATH}/lib)
+
+if(NOT DEFINED CHIP)
+  message(FATAL_ERROR "Please set CHIP to 183x or 182x or 181x or mars")
+endif()
+
+if(NOT DEFINED MW_PATH)
+  message(FATAL_ERROR "Please set MW_PATH to point to the middleware installation")
+endif()
+
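+# The middleware (MW) tree supplies the VB/VPSS/ISP libraries used by this
+# sample; the ISP header location and the library set differ per chip.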
+include_directories(${MW_PATH}/include)
+include_directories(${MW_PATH}/sample/common)
+
+if(CHIP STREQUAL "mars")
+  include_directories(${MW_PATH}/include/isp/${CHIP})
+else()
+  include_directories(${MW_PATH}/include/isp/cv${CHIP})
+endif()
+
+if("${SDK_VER}" STREQUAL "musl_riscv64")
+  link_directories(${MW_PATH}/lib_musl_riscv64)
+  link_directories(${MW_PATH}/lib_musl_riscv64/3rd)
+elseif("${SDK_VER}" STREQUAL "glibc_riscv64")
+  link_directories(${MW_PATH}/lib_glibc_riscv64)
+  link_directories(${MW_PATH}/lib_glibc_riscv64/3rd)
+else()
+  link_directories(${MW_PATH}/lib)
+  link_directories(${MW_PATH}/lib/3rd)
+endif()
+
+if(CHIP STREQUAL "mars")
+  set(MW_LIB sample isp cvi_bin_isp cvi_bin isp_algo vpu venc vdec af ae awb sns_full misc ini sys atomic)
+elseif(CHIP STREQUAL "181x")
+  set(MW_LIB sample isp cvi_bin_isp cvi_bin isp_algo vpu venc vdec af ae awb sns_full misc ini sys atomic)
+elseif(CHIP STREQUAL "182x")
+  set(MW_LIB sample isp cvi_bin_isp cvi_bin isp_algo vpu venc vdec cvi_vcodec sys awb ae af sns_full ini cvitracer)
+elseif(CHIP STREQUAL "183x")
+  set(MW_LIB sample isp cvi_bin_isp cvi_bin vpu venc vdec cvi_vcodec sys awb ae af sns_full ini)
+endif()
+
+set(CVI_LIBS ${CVI_LIBS} cviruntime cvikernel)
+if(NOT CMAKE_CROSSCOMPILING)
+  set(CVI_LIBS ${CVI_LIBS} cvicmodel)
+endif()
+
+set(OPENCV_LIBS ${OPENCV_LIBS} opencv_core opencv_imgcodecs opencv_imgproc)
+if(NOT CMAKE_CROSSCOMPILING)
+  set(OPENCV_LIBS ${OPENCV_LIBS} opencv_highgui)
+endif()
+
+set(EXTRA_LIBS ${EXTRA_LIBS} dl stdc++ pthread z)
+
+add_executable(classifier_vpss_yuv
+  classifier_vpss_yuv.cpp)
+
+target_link_libraries(classifier_vpss_yuv
+  ${CVI_LIBS}
+  ${OPENCV_LIBS}
+  ${EXTRA_LIBS}
+  ${MW_LIB})
+install(TARGETS classifier_vpss_yuv DESTINATION samples_extra/bin)
\ No newline at end of file
diff --git a/cviruntime/samples/samples_extra/classifier_vpss_yuv/README.md b/cviruntime/samples/samples_extra/classifier_vpss_yuv/README.md
new file mode 100644
index 000000000..75b3d1235
--- /dev/null
+++ b/cviruntime/samples/samples_extra/classifier_vpss_yuv/README.md
@@ -0,0 +1,120 @@
+# Classifier VPSS YUV Input Sample
+
+### Download and convert the model under docker (optional)
+The MobileNet-v2 model can be cloned from: https://github.com/shicai/MobileNet-Caffe
+
+#### For the new toolchain
+The following package is required:
+* tpu-mlir_xxxx.tar.gz (the release package of tpu-mlir)
+
+Shell commands to convert the model into a cvimodel:
+``` shell
+tar zxf tpu-mlir_xxxx.tar.gz
+source tpu-mlir_xxxx/envsetup.sh
+
+mkdir workspace && cd workspace
+cp $TPUC_ROOT/regression/image/cat.jpg .
+cp -rf $TPUC_ROOT/regression/dataset/ILSVRC2012 .
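+
+# model_transform.py imports the Caffe model and verifies it on a test image,
+# run_calibration.py derives an INT8 calibration table from ILSVRC2012 images,
+# and model_deploy.py emits an INT8 cvimodel with YUV420 preprocessing fused
+# into the model input.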
+model_transform.py \
+--model_name mobilenet_v2 \
+--model_def ./mobilenet_v2_deploy.prototxt \
+--model_data ./mobilenet_v2.caffemodel \
+--test_input ./cat.jpg \
+--test_result mobilenet_v2_top_output.npz \
+--input_shapes [[1,3,224,224]] \
+--resize_dims 256,256 \
+--mean 103.94,116.78,123.68 \
+--scale 0.017,0.017,0.017 \
+--pixel_format "bgr" \
+--tolerance 0.99,0.99 \
+--excepts prob \
+--mlir mobilenet_v2.mlir
+
+run_calibration.py \
+mobilenet_v2.mlir \
+--dataset=./ILSVRC2012 \
+--input_num=100 \
+-o mobilenet_v2_calibration_table
+
+model_deploy.py \
+--mlir mobilenet_v2.mlir \
+--calibration_table mobilenet_v2_calibration_table \
+--chip cv183x \
+--quantize INT8 \
+--quant_input \
+--test_input ./cat.jpg \
+--test_reference mobilenet_v2_top_output.npz \
+--excepts prob \
+--tolerance 0.9,0.6 \
+--fuse_preprocess \
+--customization_format YUV420_PLANAR \
+--aligned_input \
+--model mobilenet_v2_int8_yuv420.cvimodel
+```
+
+#### For the old toolchain
+The following package is required:
+
+* cvitek_mlir_ubuntu-18.04.tar.gz
+
+Shell commands to convert the model:
+``` shell
+tar zxf cvitek_mlir_ubuntu-18.04.tar.gz
+source cvitek_mlir/cvitek_envs.sh
+
+mkdir workspace && cd workspace
+cp $MLIR_PATH/tpuc/regression/data/cat.jpg .
+cp -rf $MLIR_PATH/tpuc/regression/data/images .
+
+model_transform.py \
+--model_type caffe \
+--model_name mobilenet_v2 \
+--model_def ./mobilenet_v2_deploy.prototxt \
+--model_data ./mobilenet_v2.caffemodel \
+--image ./cat.jpg \
+--image_resize_dims 256,256 \
+--net_input_dims 224,224 \
+--mean 103.94,116.78,123.68 \
+--input_scale 0.017 \
+--model_channel_order "bgr" \
+--tolerance 0.99,0.99,0.99 \
+--excepts prob \
+--mlir mobilenet_v2_fp32.mlir
+
+run_calibration.py \
+mobilenet_v2_fp32.mlir \
+--dataset=./images \
+--input_num=100 \
+-o mobilenet_v2_calibration_table
+
+model_deploy.py \
+--model_name mobilenet_v2 \
+--mlir mobilenet_v2_fp32.mlir \
+--calibration_table mobilenet_v2_calibration_table \
+--chip cv183x \
+--quantize INT8 \
+--fuse_preprocess \
+--aligned_input true \
+--pixel_format YUV420_PLANAR \
+--image cat.jpg \
+--tolerance 0.9,0.9,0.6 \
+--correctness 0.95,0.95,0.9 \
+--cvimodel mobilenet_v2_int8_yuv420.cvimodel
+```
+
+Copy the generated mobilenet_v2_int8_yuv420.cvimodel to the EVB board.
+
+## How To Compile The VPSS Input Sample In Docker
+See the top-level README.md.
+
+## Run Samples On The EVB Board
+```
+cd install_samples/samples_extra
+./bin/classifier_vpss_yuv \
+./mobilenet_v2_int8_yuv420.cvimodel \
+./cat.jpg \
+./synset_words.txt \
+YUV420_PLANAR
+```
\ No newline at end of file
diff --git a/cviruntime/samples/samples_extra/classifier_vpss_yuv/classifier_vpss_yuv.cpp b/cviruntime/samples/samples_extra/classifier_vpss_yuv/classifier_vpss_yuv.cpp
new file mode 100644
index 000000000..eb7fe5892
--- /dev/null
+++ b/cviruntime/samples/samples_extra/classifier_vpss_yuv/classifier_vpss_yuv.cpp
@@ -0,0 +1,184 @@
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <fstream>
+#include <numeric>
+#include <algorithm>
+#include <vector>
+#include <string>
+#include "cviruntime.h"
+#include "mapi.hpp"
+
+#include "opencv2/opencv.hpp"
+
+#ifndef SAMPLE_CHECK_RET
+#define SAMPLE_CHECK_RET(express)               \
+  do {                                          \
+    int rc = express;                           \
+    if (rc != 0) {                              \
+      printf("\nFailed at %s: %d (rc: %#x)\n",  \
+             __FILE__, __LINE__, rc);           \
+      return rc;                                \
+    }                                           \
+  } while (0)
+#endif
+
+static void usage(char **argv) {
+  printf("Usage:\n");
+  printf("   %s cvimodel image.jpg label_file yuv_format\n", argv[0]);
+}
+
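+// Wrap a cv::Mat into a VIDEO_FRAME_INFO_S: allocate a VB frame, map it into
+// user space, copy the BGR image row by row (the frame stride can be wider
+// than the image row), then flush the CPU cache so the hardware sees the data.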
+static void get_frame_from_mat(VIDEO_FRAME_INFO_S &in_frame, const cv::Mat &mat) {
+  CVI_MAPI_AllocateFrame(&in_frame, mat.cols, mat.rows, PIXEL_FORMAT_BGR_888);
+  CVI_MAPI_FrameMmap(&in_frame, true);
+  uint8_t *src_ptr = mat.data;
+  uint8_t *dst_ptr = in_frame.stVFrame.pu8VirAddr[0];
+  for (int h = 0; h < mat.rows; ++h) {
+    memcpy(dst_ptr, src_ptr, mat.cols * mat.elemSize());
+    src_ptr += mat.step[0];
+    dst_ptr += in_frame.stVFrame.u32Stride[0];
+  }
+  CVI_MAPI_FrameFlushCache(&in_frame);
+  CVI_MAPI_FrameMunmap(&in_frame);
+}
+
+int main(int argc, char **argv) {
+  if (argc != 5) {
+    usage(argv);
+    exit(-1);
+  }
+
+  YuvType yuv_type = YUV_UNKNOWN;
+  if (strcmp(argv[4], "YUV420_PLANAR") == 0) {
+    yuv_type = YUV420_PLANAR;
+    printf("YUV420_PLANAR mode\n");
+  } else if (strcmp(argv[4], "YUV_NV12") == 0) {
+    yuv_type = YUV_NV12;
+    printf("YUV_NV12 mode\n");
+  } else if (strcmp(argv[4], "YUV_NV21") == 0) {
+    yuv_type = YUV_NV21;
+    printf("YUV_NV21 mode\n");
+  } else {
+    assert(0 && "unsupported yuv type");
+  }
+
+  // load model file
+  const char *model_file = argv[1];
+  CVI_MODEL_HANDLE model = nullptr;
+  int ret = CVI_NN_RegisterModel(model_file, &model);
+  if (CVI_RC_SUCCESS != ret) {
+    printf("CVI_NN_RegisterModel failed, err %d\n", ret);
+    exit(1);
+  }
+  printf("CVI_NN_RegisterModel succeeded\n");
+
+  // get input output tensors
+  CVI_TENSOR *input_tensors;
+  CVI_TENSOR *output_tensors;
+  int32_t input_num;
+  int32_t output_num;
+  CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num, &output_tensors,
+                               &output_num);
+  CVI_TENSOR *input = CVI_NN_GetTensorByName(CVI_NN_DEFAULT_TENSOR, input_tensors, input_num);
+  assert(input);
+  printf("input, name:%s\n", input->name);
+  CVI_TENSOR *output = CVI_NN_GetTensorByName(CVI_NN_DEFAULT_TENSOR, output_tensors, output_num);
+  assert(output);
+
+  float qscale = CVI_NN_TensorQuantScale(input);
+  printf("qscale:%f\n", qscale);
+  CVI_SHAPE shape = CVI_NN_TensorShape(input);
+
+  // nchw
+  int32_t height = shape.dim[2];
+  int32_t width = shape.dim[3];
+
+  // imread
+  cv::Mat image;
+  image = cv::imread(argv[2]);
+  if (!image.data) {
+    printf("Could not open or find the image\n");
+    return -1;
+  }
+
+  // init vb
+  SAMPLE_CHECK_RET(CVI_MAPI_Media_Init(image.cols, image.rows, 2));
+
+  // init vpss
+  PreprocessArg arg;
+  arg.width = width;
+  arg.height = height;
+  arg.yuv_type = yuv_type;
+  init_vpss(image.cols, image.rows, &arg);
+
+  VIDEO_FRAME_INFO_S frame_in;
+  VIDEO_FRAME_INFO_S frame_preprocessed;
+  memset(&frame_in, 0x00, sizeof(frame_in));
+  get_frame_from_mat(frame_in, image);
+
+  if (CVI_SUCCESS != CVI_VPSS_SendFrame(0, &frame_in, -1)) {
+    printf("send frame failed\n");
+    return -1;
+  }
+  if (CVI_SUCCESS != CVI_VPSS_GetChnFrame(0, 0, &frame_preprocessed, 1000)) {
+    printf("get frame failed\n");
+    return -1;
+  }
+  for (int i = 0; i < 3; ++i) {
+    printf("YUV Frame height[%d] width[%d] stride[%d] addr[%llx]\n",
+           frame_preprocessed.stVFrame.u32Height,
+           frame_preprocessed.stVFrame.u32Width,
+           frame_preprocessed.stVFrame.u32Stride[i],
+           frame_preprocessed.stVFrame.u64PhyAddr[i]);
+  }
+
+  // zero copy: point the input tensor directly at the VPSS output frame
+  CVI_NN_SetTensorPhysicalAddr(input, (uint64_t)frame_preprocessed.stVFrame.u64PhyAddr[0]);
+
+  CVI_MAPI_ReleaseFrame(&frame_in);
+
+  // run inference
+  CVI_NN_Forward(model, input_tensors, input_num, output_tensors, output_num);
+  printf("CVI_NN_Forward succeeded\n");
+
+  if (CVI_SUCCESS != CVI_VPSS_ReleaseChnFrame(0, 0, &frame_preprocessed)) {
+    printf("release frame failed!\n");
+    return -1;
+  }
+
+  // output result
+  std::vector<std::string> labels;
+  std::ifstream file(argv[3]);
+  if (!file) {
+    printf("Didn't find synset_words file\n");
+    exit(1);
+  } else {
+    std::string line;
+    while (std::getline(file, line)) {
+      labels.push_back(line);
+    }
+  }
+
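+  // find the top-k classes by argsort: sort all class indices by descending
+  // probability and print the first top_num entries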
+  int32_t top_num = 5;
+  float *prob = (float *)CVI_NN_TensorPtr(output);
+  int32_t count = CVI_NN_TensorCount(output);
+
+  std::vector<size_t> idx(count);
+  std::iota(idx.begin(), idx.end(), 0);
+  std::sort(idx.begin(), idx.end(),
+            [&prob](size_t idx_0, size_t idx_1) { return prob[idx_0] > prob[idx_1]; });
+  // show results
+  printf("------\n");
+  for (int32_t i = 0; i < top_num; i++) {
+    int top_k_idx = (int)idx[i];
+    printf("  %f, idx %d", prob[top_k_idx], top_k_idx);
+    if (!labels.empty())
+      printf(", %s", labels[top_k_idx].c_str());
+    printf("\n");
+  }
+  printf("------\n");
+
+  CVI_NN_CleanupModel(model);
+  printf("CVI_NN_CleanupModel succeeded\n");
+  SAMPLE_CHECK_RET(CVI_MAPI_Media_Deinit());
+  vproc_deinit();
+  return 0;
+}
diff --git a/cviruntime/samples/samples_extra/classifier_vpss_yuv/mapi.hpp b/cviruntime/samples/samples_extra/classifier_vpss_yuv/mapi.hpp
new file mode 100644
index 000000000..5b32f6f55
--- /dev/null
+++ b/cviruntime/samples/samples_extra/classifier_vpss_yuv/mapi.hpp
@@ -0,0 +1,462 @@
+#pragma once
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include "cvi_sys.h"
+#include "cvi_type.h"
+#include "sample_comm.h"
+
+#define CVI_MAPI_SUCCESS     ((int)(0))
+#define CVI_MAPI_ERR_FAILURE ((int)(-1001))
+#define CVI_MAPI_ERR_NOMEM   ((int)(-1002))
+#define CVI_MAPI_ERR_TIMEOUT ((int)(-1003))
+#define CVI_MAPI_ERR_INVALID ((int)(-1004))
+
+enum YuvType {
+  YUV_UNKNOWN = 0,
+  YUV420_PLANAR = 1,
+  YUV_NV12 = 2,
+  YUV_NV21 = 3
+};
+
+struct PreprocessArg {
+  int width;
+  int height;
+  YuvType yuv_type;
+};
+
+static void _SYS_HandleSig(int nSignal, siginfo_t *si, void *arg)
+{
+  SAMPLE_COMM_SYS_Exit();
+  exit(1);
+}
+
+int CVI_MAPI_Media_Init(uint32_t img_w, uint32_t img_h, uint32_t blk_cnt)
+{
+  SIZE_S stSize;
+  stSize.u32Width = img_w;
+  stSize.u32Height = img_h;
+
+  VB_CONFIG_S stVbConf;
+  CVI_U32 u32BlkSize, u32BlkRotSize;
+  COMPRESS_MODE_E enCompressMode = COMPRESS_MODE_NONE;
+
+  struct sigaction sa;
+  memset(&sa, 0, sizeof(struct sigaction));
+  sigemptyset(&sa.sa_mask);
+  sa.sa_sigaction = _SYS_HandleSig;
+  sa.sa_flags = SA_SIGINFO | SA_RESETHAND; // Reset signal handler to system default after signal triggered
+  sigaction(SIGINT, &sa, NULL);
+  sigaction(SIGTERM, &sa, NULL);
+
+  memset(&stVbConf, 0, sizeof(VB_CONFIG_S));
+  stVbConf.u32MaxPoolCnt = 2;
+
+  u32BlkSize = COMMON_GetPicBufferSize(stSize.u32Width, stSize.u32Height, SAMPLE_PIXEL_FORMAT,
+                                       DATA_BITWIDTH_8, enCompressMode, DEFAULT_ALIGN);
+  u32BlkRotSize = COMMON_GetPicBufferSize(stSize.u32Height, stSize.u32Width, SAMPLE_PIXEL_FORMAT,
+                                          DATA_BITWIDTH_8, enCompressMode, DEFAULT_ALIGN);
+  u32BlkSize = u32BlkSize > u32BlkRotSize ?
u32BlkSize : u32BlkRotSize; + + stVbConf.astCommPool[0].u32BlkSize = u32BlkSize; + stVbConf.astCommPool[0].u32BlkCnt = blk_cnt; // 10 + stVbConf.astCommPool[0].enRemapMode = VB_REMAP_MODE_CACHED; + printf("common pool[0] BlkSize %d\n", u32BlkSize); + + u32BlkSize = COMMON_GetPicBufferSize(1920, 1080, PIXEL_FORMAT_RGB_888_PLANAR, + DATA_BITWIDTH_8, enCompressMode, DEFAULT_ALIGN); + + stVbConf.astCommPool[1].u32BlkSize = u32BlkSize; + stVbConf.astCommPool[1].u32BlkCnt = 2; + printf("common pool[1] BlkSize %d\n", u32BlkSize); + + int ret = CVI_MAPI_SUCCESS; + CVI_S32 rc = SAMPLE_COMM_SYS_Init(&stVbConf); + if (rc != CVI_SUCCESS) { + printf("SAMPLE_COMM_SYS_Init fail, rc = %#x\n", rc); + ret = CVI_MAPI_ERR_FAILURE; + goto error; + } + + return ret; + +error: + SAMPLE_COMM_SYS_Exit(); + return ret; +} + +int CVI_MAPI_Media_Deinit(void) +{ + SAMPLE_COMM_SYS_Exit(); + return CVI_MAPI_SUCCESS; +} + +int CVI_MAPI_AllocateFrame(VIDEO_FRAME_INFO_S *frm, + uint32_t width, uint32_t height, PIXEL_FORMAT_E fmt) { + VB_BLK blk; + VB_CAL_CONFIG_S stVbCalConfig; + COMMON_GetPicBufferConfig(width, height, fmt, DATA_BITWIDTH_8, + COMPRESS_MODE_NONE, DEFAULT_ALIGN, &stVbCalConfig); + + frm->stVFrame.enCompressMode = COMPRESS_MODE_NONE; + frm->stVFrame.enPixelFormat = fmt; + frm->stVFrame.enVideoFormat = VIDEO_FORMAT_LINEAR; + frm->stVFrame.enColorGamut = COLOR_GAMUT_BT709; + frm->stVFrame.u32Width = width; + frm->stVFrame.u32Height = height; + frm->stVFrame.u32Stride[0] = stVbCalConfig.u32MainStride; + frm->stVFrame.u32Stride[1] = stVbCalConfig.u32CStride; + frm->stVFrame.u32Stride[2] = stVbCalConfig.u32CStride; + frm->stVFrame.u32TimeRef = 0; + frm->stVFrame.u64PTS = 0; + frm->stVFrame.enDynamicRange = DYNAMIC_RANGE_SDR8; + + printf("Allocate VB block with size %d\n", stVbCalConfig.u32VBSize); + + blk = CVI_VB_GetBlock(VB_INVALID_POOLID, stVbCalConfig.u32VBSize); + if (blk == (unsigned long)CVI_INVALID_HANDLE) { + printf("Can't acquire VB block for size %d\n", + stVbCalConfig.u32VBSize); + return CVI_MAPI_ERR_FAILURE; + } + + frm->u32PoolId = CVI_VB_Handle2PoolId(blk); + frm->stVFrame.u32Length[0] = ALIGN(stVbCalConfig.u32MainYSize, + stVbCalConfig.u16AddrAlign); + frm->stVFrame.u32Length[1] = frm->stVFrame.u32Length[2] + = ALIGN(stVbCalConfig.u32MainCSize, + stVbCalConfig.u16AddrAlign); + + frm->stVFrame.u64PhyAddr[0] = CVI_VB_Handle2PhysAddr(blk); + frm->stVFrame.u64PhyAddr[1] = frm->stVFrame.u64PhyAddr[0] + + frm->stVFrame.u32Length[0]; + frm->stVFrame.u64PhyAddr[2] = frm->stVFrame.u64PhyAddr[1] + + frm->stVFrame.u32Length[1]; + for (int i = 0; i < 3; ++i) { + frm->stVFrame.pu8VirAddr[i] = NULL; + } + + return CVI_MAPI_SUCCESS; +} + +static void get_frame_plane_num_and_mem_size(VIDEO_FRAME_INFO_S *frm, + int *plane_num, size_t *mem_size) +{ + if (frm->stVFrame.enPixelFormat == PIXEL_FORMAT_RGB_888_PLANAR + || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_BGR_888_PLANAR + || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_YUV_PLANAR_422 + || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_YUV_PLANAR_420 + || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_YUV_PLANAR_444 + || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_HSV_888_PLANAR + || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_FP32_C3_PLANAR + || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_INT32_C3_PLANAR + || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_UINT32_C3_PLANAR + || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_BF16_C3_PLANAR + || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_INT16_C3_PLANAR + || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_UINT16_C3_PLANAR + || 
frm->stVFrame.enPixelFormat == PIXEL_FORMAT_INT8_C3_PLANAR + || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_UINT8_C3_PLANAR) { + *plane_num = 3; + // check phyaddr + assert(frm->stVFrame.u64PhyAddr[1] - frm->stVFrame.u64PhyAddr[0] + == frm->stVFrame.u32Length[0] && + "phy addr not continue 0"); + assert(frm->stVFrame.u64PhyAddr[2] - frm->stVFrame.u64PhyAddr[1] + == frm->stVFrame.u32Length[1] && + "phy addr not continue 1"); + } else { + *plane_num = 1; + } + + *mem_size = 0; + for (int i = 0; i < *plane_num; ++i) { + *mem_size += frm->stVFrame.u32Length[i]; + } +} + +int CVI_MAPI_FrameMmap(VIDEO_FRAME_INFO_S *frm, bool enable_cache) { + int plane_num = 0; + size_t mem_size = 0; + get_frame_plane_num_and_mem_size(frm, &plane_num, &mem_size); + + void *vir_addr = NULL; + if (enable_cache) { + vir_addr = CVI_SYS_MmapCache(frm->stVFrame.u64PhyAddr[0], mem_size); + } else { + vir_addr = CVI_SYS_Mmap(frm->stVFrame.u64PhyAddr[0], mem_size); + } + assert(vir_addr && "mmap failed\n"); + + //CVI_SYS_IonInvalidateCache(frm->stVFrame.u64PhyAddr[0], vir_addr, mem_size); + uint64_t plane_offset = 0; + for (int i = 0; i < plane_num; ++i) { + frm->stVFrame.pu8VirAddr[i] = (uint8_t *)vir_addr + plane_offset; + plane_offset += frm->stVFrame.u32Length[i]; + } + + return CVI_MAPI_SUCCESS; +} + +int CVI_MAPI_FrameMunmap(VIDEO_FRAME_INFO_S *frm) +{ + int plane_num = 0; + size_t mem_size = 0; + get_frame_plane_num_and_mem_size(frm, &plane_num, &mem_size); + + void *vir_addr = (void *)frm->stVFrame.pu8VirAddr[0]; + CVI_SYS_Munmap(vir_addr, mem_size); + + for (int i = 0; i < plane_num; ++i) { + frm->stVFrame.pu8VirAddr[i] = NULL; + } + + return CVI_MAPI_SUCCESS; +} + +int CVI_MAPI_FrameFlushCache(VIDEO_FRAME_INFO_S *frm) +{ + int plane_num = 0; + size_t mem_size = 0; + get_frame_plane_num_and_mem_size(frm, &plane_num, &mem_size); + + void *vir_addr = (void *)frm->stVFrame.pu8VirAddr[0]; + uint64_t phy_addr = frm->stVFrame.u64PhyAddr[0]; + + CVI_SYS_IonFlushCache(phy_addr, vir_addr, mem_size); + return CVI_MAPI_SUCCESS; +} + +int CVI_MAPI_ReleaseFrame(VIDEO_FRAME_INFO_S *frm) +{ + VB_BLK blk = CVI_VB_PhysAddr2Handle(frm->stVFrame.u64PhyAddr[0]); + CVI_VB_ReleaseBlock(blk); + return CVI_MAPI_SUCCESS; +} + +int CVI_MAPI_SaveFramePixelData(VIDEO_FRAME_INFO_S *frm, const char *name) +{ + #define FILENAME_MAX_LEN (128) + char filename[FILENAME_MAX_LEN] = {0}; + char const *extension = NULL; + if (frm->stVFrame.enPixelFormat == PIXEL_FORMAT_YUV_PLANAR_420) { + extension = "yuv"; + } else if (frm->stVFrame.enPixelFormat == PIXEL_FORMAT_RGB_888_PLANAR + || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_BGR_888_PLANAR) { + extension = "chw"; + } else if (frm->stVFrame.enPixelFormat == PIXEL_FORMAT_RGB_888) { + extension = "rgb"; + } else if (frm->stVFrame.enPixelFormat == PIXEL_FORMAT_BGR_888) { + extension = "bgr"; + } else { + assert(0 && "Invalid frm pixel format"); + } + snprintf(filename, FILENAME_MAX_LEN, "%s_%dX%d.%s", name, + frm->stVFrame.u32Width, + frm->stVFrame.u32Height, + extension); + + FILE *output; + output = fopen(filename, "wb"); + assert(output && "file open failed"); + + printf("Save %s, w*h(%d*%d)\n", + filename, + frm->stVFrame.u32Width, + frm->stVFrame.u32Height); + + CVI_MAPI_FrameMmap(frm, false); + + if (frm->stVFrame.enPixelFormat == PIXEL_FORMAT_YUV_PLANAR_420) { + for (int i = 0; i < 3; ++i) { + printf(" plane(%d): paddr(0x%lx) vaddr(%p) stride(%d) length(%d)\n", + i, + frm->stVFrame.u64PhyAddr[i], + frm->stVFrame.pu8VirAddr[i], + frm->stVFrame.u32Stride[i], + frm->stVFrame.u32Length[i]); + 
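// write each plane row by row so stride padding bytes are not dumped
+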
//TODO: test unaligned image
+      uint32_t length = (i == 0 ? frm->stVFrame.u32Height : frm->stVFrame.u32Height / 2);
+      uint32_t step = (i == 0 ? frm->stVFrame.u32Width : frm->stVFrame.u32Width / 2);
+      uint8_t *ptr = (uint8_t *)frm->stVFrame.pu8VirAddr[i];
+      for (uint32_t j = 0; j < length; ++j) {
+        fwrite(ptr, step, 1, output);
+        ptr += frm->stVFrame.u32Stride[i];
+      }
+    }
+  } else if (frm->stVFrame.enPixelFormat == PIXEL_FORMAT_RGB_888_PLANAR
+             || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_BGR_888_PLANAR) {
+    for (int i = 0; i < 3; i++) {
+      printf("  plane(%d): paddr(0x%lx) vaddr(%p) stride(%d) length(%d)\n",
+             i,
+             frm->stVFrame.u64PhyAddr[i],
+             frm->stVFrame.pu8VirAddr[i],
+             frm->stVFrame.u32Stride[i],
+             frm->stVFrame.u32Length[i]);
+      unsigned char *ptr = frm->stVFrame.pu8VirAddr[i];
+      for (uint32_t j = 0; j < frm->stVFrame.u32Height; ++j) {
+        fwrite(ptr, frm->stVFrame.u32Width, 1, output);
+        ptr += frm->stVFrame.u32Stride[i];
+      }
+    }
+  } else if (frm->stVFrame.enPixelFormat == PIXEL_FORMAT_RGB_888
+             || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_BGR_888) {
+    printf("  packed: paddr(0x%lx) vaddr(%p) stride(%d) length(%d)\n",
+           frm->stVFrame.u64PhyAddr[0],
+           frm->stVFrame.pu8VirAddr[0],
+           frm->stVFrame.u32Stride[0],
+           frm->stVFrame.u32Length[0]);
+    uint8_t *ptr = frm->stVFrame.pu8VirAddr[0];
+    for (uint32_t j = 0; j < frm->stVFrame.u32Height; ++j) {
+      fwrite(ptr, frm->stVFrame.u32Width * 3, 1, output);
+      ptr += frm->stVFrame.u32Stride[0];
+    }
+  } else {
+    assert(0 && "Invalid frm pixel format");
+  }
+
+  CVI_MAPI_FrameMunmap(frm);
+
+  fclose(output);
+  return CVI_MAPI_SUCCESS;
+}
+
+static CVI_S32 vproc_init(VPSS_GRP VpssGrp, CVI_BOOL *pabChnEnable, VPSS_GRP_ATTR_S *pstVpssGrpAttr,
+                          VPSS_CHN_ATTR_S *pastVpssChnAttr)
+{
+  VPSS_CHN VpssChn = 0;
+  CVI_S32 s32Ret;
+  CVI_S32 j;
+
+  s32Ret = CVI_VPSS_CreateGrp(VpssGrp, pstVpssGrpAttr);
+  if (s32Ret != CVI_SUCCESS) {
+    printf("CVI_VPSS_CreateGrp(grp:%d) failed with %#x!\n", VpssGrp, s32Ret);
+    return s32Ret;
+  }
+
+  s32Ret = CVI_VPSS_ResetGrp(VpssGrp);
+  if (s32Ret != CVI_SUCCESS) {
+    printf("CVI_VPSS_ResetGrp(grp:%d) failed with %#x!\n", VpssGrp, s32Ret);
+    goto exit1;
+  }
+
+  for (j = 0; j < VPSS_MAX_PHY_CHN_NUM; j++) {
+    if (pabChnEnable[j]) {
+      VpssChn = j;
+      s32Ret = CVI_VPSS_SetChnAttr(VpssGrp, VpssChn, &pastVpssChnAttr[VpssChn]);
+      if (s32Ret != CVI_SUCCESS) {
+        printf("CVI_VPSS_SetChnAttr failed with %#x\n", s32Ret);
+        goto exit2;
+      }
+
+      s32Ret = CVI_VPSS_EnableChn(VpssGrp, VpssChn);
+      if (s32Ret != CVI_SUCCESS) {
+        printf("CVI_VPSS_EnableChn failed with %#x\n", s32Ret);
+        goto exit2;
+      }
+    }
+  }
+
+  s32Ret = CVI_VPSS_StartGrp(VpssGrp);
+  if (s32Ret != CVI_SUCCESS) {
+    printf("CVI_VPSS_StartGrp failed with %#x\n", s32Ret);
+    goto exit2;
+  }
+  return CVI_SUCCESS;
+
+exit2:
+  for (j = 0; j < VpssChn; j++) {
+    if (CVI_VPSS_DisableChn(VpssGrp, j) != CVI_SUCCESS) {
+      printf("CVI_VPSS_DisableChn failed!\n");
+    }
+  }
+exit1:
+  if (CVI_VPSS_DestroyGrp(VpssGrp) != CVI_SUCCESS) {
+    printf("CVI_VPSS_DestroyGrp(grp:%d) failed!\n", VpssGrp);
+  }
+
+  return s32Ret;
+}
+
+static CVI_S32 set_vpss_config(VPSS_GRP VpssGrp, VPSS_GRP_ATTR_S *stVpssGrpAttr, PreprocessArg *arg)
+{
+  VPSS_CHN VpssChn = VPSS_CHN0;
+  CVI_BOOL abChnEnable[VPSS_MAX_PHY_CHN_NUM] = { 0 };
+  VPSS_CHN_ATTR_S astVpssChnAttr[VPSS_MAX_PHY_CHN_NUM];
+  CVI_S32 s32Ret = CVI_SUCCESS;
+
+  if (VpssGrp == 0) {
+    // channel0
+    abChnEnable[VpssChn] = CVI_TRUE;
+    astVpssChnAttr[VpssChn].u32Width = arg->width;
+    astVpssChnAttr[VpssChn].u32Height = arg->height;
+    astVpssChnAttr[VpssChn].enVideoFormat =
VIDEO_FORMAT_LINEAR; + if (arg->yuv_type == YUV420_PLANAR) { + astVpssChnAttr[VpssChn].enPixelFormat = PIXEL_FORMAT_YUV_PLANAR_420; + } else if (arg->yuv_type == YUV_NV12) { + astVpssChnAttr[VpssChn].enPixelFormat = PIXEL_FORMAT_NV12; + } else { + astVpssChnAttr[VpssChn].enPixelFormat = PIXEL_FORMAT_NV21; + } + astVpssChnAttr[VpssChn].stFrameRate.s32SrcFrameRate = 30; + astVpssChnAttr[VpssChn].stFrameRate.s32DstFrameRate = 30; + astVpssChnAttr[VpssChn].u32Depth = 1; + astVpssChnAttr[VpssChn].bMirror = false; + astVpssChnAttr[VpssChn].bFlip = false; + + astVpssChnAttr[VpssChn].stAspectRatio.enMode = ASPECT_RATIO_AUTO; + astVpssChnAttr[VpssChn].stAspectRatio.bEnableBgColor = CVI_TRUE; + astVpssChnAttr[VpssChn].stAspectRatio.u32BgColor = COLOR_RGB_BLACK; + astVpssChnAttr[VpssChn].stNormalize.bEnable = CVI_FALSE; + } else { + return -1; + } + CVI_SYS_SetVPSSMode(VPSS_MODE_SINGLE); + + /*start vpss*/ + s32Ret = vproc_init(VpssGrp, abChnEnable, stVpssGrpAttr, astVpssChnAttr); + if (s32Ret != CVI_SUCCESS) { + printf("init vpss group failed. s32Ret: 0x%x !\n", s32Ret); + return s32Ret; + } + return s32Ret; +} + +int init_vpss(int in_width, int in_height, PreprocessArg *arg) { + CVI_S32 s32Ret = CVI_SUCCESS; + VPSS_GRP_ATTR_S stVpssGrpAttr; + CVI_S32 vpssgrp_width = in_width; + CVI_S32 vpssgrp_height = in_height; + stVpssGrpAttr.stFrameRate.s32SrcFrameRate = -1; + stVpssGrpAttr.stFrameRate.s32DstFrameRate = -1; + stVpssGrpAttr.enPixelFormat = PIXEL_FORMAT_BGR_888; + stVpssGrpAttr.u32MaxW = vpssgrp_width; + stVpssGrpAttr.u32MaxH = vpssgrp_height; + // only for test here. u8VpssDev should be decided by VPSS_MODE and usage. + stVpssGrpAttr.u8VpssDev = 0; + s32Ret = set_vpss_config(0, &stVpssGrpAttr, arg); + if (s32Ret != CVI_SUCCESS) { + printf("CVI_Init_Video_Process Grp0 failed with %d\n", s32Ret); + return s32Ret; + } + return s32Ret; +} + + +static CVI_VOID vproc_deinit() +{ + CVI_S32 j; + CVI_S32 s32Ret = CVI_SUCCESS; + + s32Ret = CVI_VPSS_DisableChn(0, 0); + if (s32Ret != CVI_SUCCESS) { + printf("failed with %#x!\n", s32Ret); + } + s32Ret = CVI_VPSS_StopGrp(0); + if (s32Ret != CVI_SUCCESS) { + printf("failed with %#x!\n", s32Ret); + } + s32Ret = CVI_VPSS_DestroyGrp(0); + if (s32Ret != CVI_SUCCESS) { + printf("failed with %#x!\n", s32Ret); + } +} diff --git a/cviruntime/samples/samples_extra/data/cat.jpg b/cviruntime/samples/samples_extra/data/cat.jpg new file mode 100644 index 000000000..b4efc6c98 Binary files /dev/null and b/cviruntime/samples/samples_extra/data/cat.jpg differ diff --git a/cviruntime/samples/samples_extra/data/dog.jpg b/cviruntime/samples/samples_extra/data/dog.jpg new file mode 100644 index 000000000..77b038122 Binary files /dev/null and b/cviruntime/samples/samples_extra/data/dog.jpg differ diff --git a/cviruntime/samples/samples_extra/data/obama1.jpg b/cviruntime/samples/samples_extra/data/obama1.jpg new file mode 100644 index 000000000..bd419927c Binary files /dev/null and b/cviruntime/samples/samples_extra/data/obama1.jpg differ diff --git a/cviruntime/samples/samples_extra/data/obama2.jpg b/cviruntime/samples/samples_extra/data/obama2.jpg new file mode 100644 index 000000000..c2342eb61 Binary files /dev/null and b/cviruntime/samples/samples_extra/data/obama2.jpg differ diff --git a/cviruntime/samples/samples_extra/data/obama3.jpg b/cviruntime/samples/samples_extra/data/obama3.jpg new file mode 100644 index 000000000..fbf0f1082 Binary files /dev/null and b/cviruntime/samples/samples_extra/data/obama3.jpg differ diff --git 
a/cviruntime/samples/samples_extra/data/pose_demo_2.jpg b/cviruntime/samples/samples_extra/data/pose_demo_2.jpg new file mode 100644 index 000000000..69cad75ba Binary files /dev/null and b/cviruntime/samples/samples_extra/data/pose_demo_2.jpg differ diff --git a/cviruntime/samples/samples_extra/data/synset_words.txt b/cviruntime/samples/samples_extra/data/synset_words.txt new file mode 100644 index 000000000..a9e8c7f50 --- /dev/null +++ b/cviruntime/samples/samples_extra/data/synset_words.txt @@ -0,0 +1,1000 @@ +n01440764 tench, Tinca tinca +n01443537 goldfish, Carassius auratus +n01484850 great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias +n01491361 tiger shark, Galeocerdo cuvieri +n01494475 hammerhead, hammerhead shark +n01496331 electric ray, crampfish, numbfish, torpedo +n01498041 stingray +n01514668 cock +n01514859 hen +n01518878 ostrich, Struthio camelus +n01530575 brambling, Fringilla montifringilla +n01531178 goldfinch, Carduelis carduelis +n01532829 house finch, linnet, Carpodacus mexicanus +n01534433 junco, snowbird +n01537544 indigo bunting, indigo finch, indigo bird, Passerina cyanea +n01558993 robin, American robin, Turdus migratorius +n01560419 bulbul +n01580077 jay +n01582220 magpie +n01592084 chickadee +n01601694 water ouzel, dipper +n01608432 kite +n01614925 bald eagle, American eagle, Haliaeetus leucocephalus +n01616318 vulture +n01622779 great grey owl, great gray owl, Strix nebulosa +n01629819 European fire salamander, Salamandra salamandra +n01630670 common newt, Triturus vulgaris +n01631663 eft +n01632458 spotted salamander, Ambystoma maculatum +n01632777 axolotl, mud puppy, Ambystoma mexicanum +n01641577 bullfrog, Rana catesbeiana +n01644373 tree frog, tree-frog +n01644900 tailed frog, bell toad, ribbed toad, tailed toad, Ascaphus trui +n01664065 loggerhead, loggerhead turtle, Caretta caretta +n01665541 leatherback turtle, leatherback, leathery turtle, Dermochelys coriacea +n01667114 mud turtle +n01667778 terrapin +n01669191 box turtle, box tortoise +n01675722 banded gecko +n01677366 common iguana, iguana, Iguana iguana +n01682714 American chameleon, anole, Anolis carolinensis +n01685808 whiptail, whiptail lizard +n01687978 agama +n01688243 frilled lizard, Chlamydosaurus kingi +n01689811 alligator lizard +n01692333 Gila monster, Heloderma suspectum +n01693334 green lizard, Lacerta viridis +n01694178 African chameleon, Chamaeleo chamaeleon +n01695060 Komodo dragon, Komodo lizard, dragon lizard, giant lizard, Varanus komodoensis +n01697457 African crocodile, Nile crocodile, Crocodylus niloticus +n01698640 American alligator, Alligator mississipiensis +n01704323 triceratops +n01728572 thunder snake, worm snake, Carphophis amoenus +n01728920 ringneck snake, ring-necked snake, ring snake +n01729322 hognose snake, puff adder, sand viper +n01729977 green snake, grass snake +n01734418 king snake, kingsnake +n01735189 garter snake, grass snake +n01737021 water snake +n01739381 vine snake +n01740131 night snake, Hypsiglena torquata +n01742172 boa constrictor, Constrictor constrictor +n01744401 rock python, rock snake, Python sebae +n01748264 Indian cobra, Naja naja +n01749939 green mamba +n01751748 sea snake +n01753488 horned viper, cerastes, sand viper, horned asp, Cerastes cornutus +n01755581 diamondback, diamondback rattlesnake, Crotalus adamanteus +n01756291 sidewinder, horned rattlesnake, Crotalus cerastes +n01768244 trilobite +n01770081 harvestman, daddy longlegs, Phalangium opilio +n01770393 scorpion +n01773157 black and gold garden 
spider, Argiope aurantia +n01773549 barn spider, Araneus cavaticus +n01773797 garden spider, Aranea diademata +n01774384 black widow, Latrodectus mactans +n01774750 tarantula +n01775062 wolf spider, hunting spider +n01776313 tick +n01784675 centipede +n01795545 black grouse +n01796340 ptarmigan +n01797886 ruffed grouse, partridge, Bonasa umbellus +n01798484 prairie chicken, prairie grouse, prairie fowl +n01806143 peacock +n01806567 quail +n01807496 partridge +n01817953 African grey, African gray, Psittacus erithacus +n01818515 macaw +n01819313 sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita +n01820546 lorikeet +n01824575 coucal +n01828970 bee eater +n01829413 hornbill +n01833805 hummingbird +n01843065 jacamar +n01843383 toucan +n01847000 drake +n01855032 red-breasted merganser, Mergus serrator +n01855672 goose +n01860187 black swan, Cygnus atratus +n01871265 tusker +n01872401 echidna, spiny anteater, anteater +n01873310 platypus, duckbill, duckbilled platypus, duck-billed platypus, Ornithorhynchus anatinus +n01877812 wallaby, brush kangaroo +n01882714 koala, koala bear, kangaroo bear, native bear, Phascolarctos cinereus +n01883070 wombat +n01910747 jellyfish +n01914609 sea anemone, anemone +n01917289 brain coral +n01924916 flatworm, platyhelminth +n01930112 nematode, nematode worm, roundworm +n01943899 conch +n01944390 snail +n01945685 slug +n01950731 sea slug, nudibranch +n01955084 chiton, coat-of-mail shell, sea cradle, polyplacophore +n01968897 chambered nautilus, pearly nautilus, nautilus +n01978287 Dungeness crab, Cancer magister +n01978455 rock crab, Cancer irroratus +n01980166 fiddler crab +n01981276 king crab, Alaska crab, Alaskan king crab, Alaska king crab, Paralithodes camtschatica +n01983481 American lobster, Northern lobster, Maine lobster, Homarus americanus +n01984695 spiny lobster, langouste, rock lobster, crawfish, crayfish, sea crawfish +n01985128 crayfish, crawfish, crawdad, crawdaddy +n01986214 hermit crab +n01990800 isopod +n02002556 white stork, Ciconia ciconia +n02002724 black stork, Ciconia nigra +n02006656 spoonbill +n02007558 flamingo +n02009229 little blue heron, Egretta caerulea +n02009912 American egret, great white heron, Egretta albus +n02011460 bittern +n02012849 crane +n02013706 limpkin, Aramus pictus +n02017213 European gallinule, Porphyrio porphyrio +n02018207 American coot, marsh hen, mud hen, water hen, Fulica americana +n02018795 bustard +n02025239 ruddy turnstone, Arenaria interpres +n02027492 red-backed sandpiper, dunlin, Erolia alpina +n02028035 redshank, Tringa totanus +n02033041 dowitcher +n02037110 oystercatcher, oyster catcher +n02051845 pelican +n02056570 king penguin, Aptenodytes patagonica +n02058221 albatross, mollymawk +n02066245 grey whale, gray whale, devilfish, Eschrichtius gibbosus, Eschrichtius robustus +n02071294 killer whale, killer, orca, grampus, sea wolf, Orcinus orca +n02074367 dugong, Dugong dugon +n02077923 sea lion +n02085620 Chihuahua +n02085782 Japanese spaniel +n02085936 Maltese dog, Maltese terrier, Maltese +n02086079 Pekinese, Pekingese, Peke +n02086240 Shih-Tzu +n02086646 Blenheim spaniel +n02086910 papillon +n02087046 toy terrier +n02087394 Rhodesian ridgeback +n02088094 Afghan hound, Afghan +n02088238 basset, basset hound +n02088364 beagle +n02088466 bloodhound, sleuthhound +n02088632 bluetick +n02089078 black-and-tan coonhound +n02089867 Walker hound, Walker foxhound +n02089973 English foxhound +n02090379 redbone +n02090622 borzoi, Russian wolfhound +n02090721 Irish wolfhound +n02091032 Italian greyhound 
+n02091134 whippet +n02091244 Ibizan hound, Ibizan Podenco +n02091467 Norwegian elkhound, elkhound +n02091635 otterhound, otter hound +n02091831 Saluki, gazelle hound +n02092002 Scottish deerhound, deerhound +n02092339 Weimaraner +n02093256 Staffordshire bullterrier, Staffordshire bull terrier +n02093428 American Staffordshire terrier, Staffordshire terrier, American pit bull terrier, pit bull terrier +n02093647 Bedlington terrier +n02093754 Border terrier +n02093859 Kerry blue terrier +n02093991 Irish terrier +n02094114 Norfolk terrier +n02094258 Norwich terrier +n02094433 Yorkshire terrier +n02095314 wire-haired fox terrier +n02095570 Lakeland terrier +n02095889 Sealyham terrier, Sealyham +n02096051 Airedale, Airedale terrier +n02096177 cairn, cairn terrier +n02096294 Australian terrier +n02096437 Dandie Dinmont, Dandie Dinmont terrier +n02096585 Boston bull, Boston terrier +n02097047 miniature schnauzer +n02097130 giant schnauzer +n02097209 standard schnauzer +n02097298 Scotch terrier, Scottish terrier, Scottie +n02097474 Tibetan terrier, chrysanthemum dog +n02097658 silky terrier, Sydney silky +n02098105 soft-coated wheaten terrier +n02098286 West Highland white terrier +n02098413 Lhasa, Lhasa apso +n02099267 flat-coated retriever +n02099429 curly-coated retriever +n02099601 golden retriever +n02099712 Labrador retriever +n02099849 Chesapeake Bay retriever +n02100236 German short-haired pointer +n02100583 vizsla, Hungarian pointer +n02100735 English setter +n02100877 Irish setter, red setter +n02101006 Gordon setter +n02101388 Brittany spaniel +n02101556 clumber, clumber spaniel +n02102040 English springer, English springer spaniel +n02102177 Welsh springer spaniel +n02102318 cocker spaniel, English cocker spaniel, cocker +n02102480 Sussex spaniel +n02102973 Irish water spaniel +n02104029 kuvasz +n02104365 schipperke +n02105056 groenendael +n02105162 malinois +n02105251 briard +n02105412 kelpie +n02105505 komondor +n02105641 Old English sheepdog, bobtail +n02105855 Shetland sheepdog, Shetland sheep dog, Shetland +n02106030 collie +n02106166 Border collie +n02106382 Bouvier des Flandres, Bouviers des Flandres +n02106550 Rottweiler +n02106662 German shepherd, German shepherd dog, German police dog, alsatian +n02107142 Doberman, Doberman pinscher +n02107312 miniature pinscher +n02107574 Greater Swiss Mountain dog +n02107683 Bernese mountain dog +n02107908 Appenzeller +n02108000 EntleBucher +n02108089 boxer +n02108422 bull mastiff +n02108551 Tibetan mastiff +n02108915 French bulldog +n02109047 Great Dane +n02109525 Saint Bernard, St Bernard +n02109961 Eskimo dog, husky +n02110063 malamute, malemute, Alaskan malamute +n02110185 Siberian husky +n02110341 dalmatian, coach dog, carriage dog +n02110627 affenpinscher, monkey pinscher, monkey dog +n02110806 basenji +n02110958 pug, pug-dog +n02111129 Leonberg +n02111277 Newfoundland, Newfoundland dog +n02111500 Great Pyrenees +n02111889 Samoyed, Samoyede +n02112018 Pomeranian +n02112137 chow, chow chow +n02112350 keeshond +n02112706 Brabancon griffon +n02113023 Pembroke, Pembroke Welsh corgi +n02113186 Cardigan, Cardigan Welsh corgi +n02113624 toy poodle +n02113712 miniature poodle +n02113799 standard poodle +n02113978 Mexican hairless +n02114367 timber wolf, grey wolf, gray wolf, Canis lupus +n02114548 white wolf, Arctic wolf, Canis lupus tundrarum +n02114712 red wolf, maned wolf, Canis rufus, Canis niger +n02114855 coyote, prairie wolf, brush wolf, Canis latrans +n02115641 dingo, warrigal, warragal, Canis dingo +n02115913 dhole, Cuon alpinus 
+n02116738 African hunting dog, hyena dog, Cape hunting dog, Lycaon pictus +n02117135 hyena, hyaena +n02119022 red fox, Vulpes vulpes +n02119789 kit fox, Vulpes macrotis +n02120079 Arctic fox, white fox, Alopex lagopus +n02120505 grey fox, gray fox, Urocyon cinereoargenteus +n02123045 tabby, tabby cat +n02123159 tiger cat +n02123394 Persian cat +n02123597 Siamese cat, Siamese +n02124075 Egyptian cat +n02125311 cougar, puma, catamount, mountain lion, painter, panther, Felis concolor +n02127052 lynx, catamount +n02128385 leopard, Panthera pardus +n02128757 snow leopard, ounce, Panthera uncia +n02128925 jaguar, panther, Panthera onca, Felis onca +n02129165 lion, king of beasts, Panthera leo +n02129604 tiger, Panthera tigris +n02130308 cheetah, chetah, Acinonyx jubatus +n02132136 brown bear, bruin, Ursus arctos +n02133161 American black bear, black bear, Ursus americanus, Euarctos americanus +n02134084 ice bear, polar bear, Ursus Maritimus, Thalarctos maritimus +n02134418 sloth bear, Melursus ursinus, Ursus ursinus +n02137549 mongoose +n02138441 meerkat, mierkat +n02165105 tiger beetle +n02165456 ladybug, ladybeetle, lady beetle, ladybird, ladybird beetle +n02167151 ground beetle, carabid beetle +n02168699 long-horned beetle, longicorn, longicorn beetle +n02169497 leaf beetle, chrysomelid +n02172182 dung beetle +n02174001 rhinoceros beetle +n02177972 weevil +n02190166 fly +n02206856 bee +n02219486 ant, emmet, pismire +n02226429 grasshopper, hopper +n02229544 cricket +n02231487 walking stick, walkingstick, stick insect +n02233338 cockroach, roach +n02236044 mantis, mantid +n02256656 cicada, cicala +n02259212 leafhopper +n02264363 lacewing, lacewing fly +n02268443 dragonfly, darning needle, devil's darning needle, sewing needle, snake feeder, snake doctor, mosquito hawk, skeeter hawk +n02268853 damselfly +n02276258 admiral +n02277742 ringlet, ringlet butterfly +n02279972 monarch, monarch butterfly, milkweed butterfly, Danaus plexippus +n02280649 cabbage butterfly +n02281406 sulphur butterfly, sulfur butterfly +n02281787 lycaenid, lycaenid butterfly +n02317335 starfish, sea star +n02319095 sea urchin +n02321529 sea cucumber, holothurian +n02325366 wood rabbit, cottontail, cottontail rabbit +n02326432 hare +n02328150 Angora, Angora rabbit +n02342885 hamster +n02346627 porcupine, hedgehog +n02356798 fox squirrel, eastern fox squirrel, Sciurus niger +n02361337 marmot +n02363005 beaver +n02364673 guinea pig, Cavia cobaya +n02389026 sorrel +n02391049 zebra +n02395406 hog, pig, grunter, squealer, Sus scrofa +n02396427 wild boar, boar, Sus scrofa +n02397096 warthog +n02398521 hippopotamus, hippo, river horse, Hippopotamus amphibius +n02403003 ox +n02408429 water buffalo, water ox, Asiatic buffalo, Bubalus bubalis +n02410509 bison +n02412080 ram, tup +n02415577 bighorn, bighorn sheep, cimarron, Rocky Mountain bighorn, Rocky Mountain sheep, Ovis canadensis +n02417914 ibex, Capra ibex +n02422106 hartebeest +n02422699 impala, Aepyceros melampus +n02423022 gazelle +n02437312 Arabian camel, dromedary, Camelus dromedarius +n02437616 llama +n02441942 weasel +n02442845 mink +n02443114 polecat, fitch, foulmart, foumart, Mustela putorius +n02443484 black-footed ferret, ferret, Mustela nigripes +n02444819 otter +n02445715 skunk, polecat, wood pussy +n02447366 badger +n02454379 armadillo +n02457408 three-toed sloth, ai, Bradypus tridactylus +n02480495 orangutan, orang, orangutang, Pongo pygmaeus +n02480855 gorilla, Gorilla gorilla +n02481823 chimpanzee, chimp, Pan troglodytes +n02483362 gibbon, Hylobates lar 
+n02483708 siamang, Hylobates syndactylus, Symphalangus syndactylus +n02484975 guenon, guenon monkey +n02486261 patas, hussar monkey, Erythrocebus patas +n02486410 baboon +n02487347 macaque +n02488291 langur +n02488702 colobus, colobus monkey +n02489166 proboscis monkey, Nasalis larvatus +n02490219 marmoset +n02492035 capuchin, ringtail, Cebus capucinus +n02492660 howler monkey, howler +n02493509 titi, titi monkey +n02493793 spider monkey, Ateles geoffroyi +n02494079 squirrel monkey, Saimiri sciureus +n02497673 Madagascar cat, ring-tailed lemur, Lemur catta +n02500267 indri, indris, Indri indri, Indri brevicaudatus +n02504013 Indian elephant, Elephas maximus +n02504458 African elephant, Loxodonta africana +n02509815 lesser panda, red panda, panda, bear cat, cat bear, Ailurus fulgens +n02510455 giant panda, panda, panda bear, coon bear, Ailuropoda melanoleuca +n02514041 barracouta, snoek +n02526121 eel +n02536864 coho, cohoe, coho salmon, blue jack, silver salmon, Oncorhynchus kisutch +n02606052 rock beauty, Holocanthus tricolor +n02607072 anemone fish +n02640242 sturgeon +n02641379 gar, garfish, garpike, billfish, Lepisosteus osseus +n02643566 lionfish +n02655020 puffer, pufferfish, blowfish, globefish +n02666196 abacus +n02667093 abaya +n02669723 academic gown, academic robe, judge's robe +n02672831 accordion, piano accordion, squeeze box +n02676566 acoustic guitar +n02687172 aircraft carrier, carrier, flattop, attack aircraft carrier +n02690373 airliner +n02692877 airship, dirigible +n02699494 altar +n02701002 ambulance +n02704792 amphibian, amphibious vehicle +n02708093 analog clock +n02727426 apiary, bee house +n02730930 apron +n02747177 ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin +n02749479 assault rifle, assault gun +n02769748 backpack, back pack, knapsack, packsack, rucksack, haversack +n02776631 bakery, bakeshop, bakehouse +n02777292 balance beam, beam +n02782093 balloon +n02783161 ballpoint, ballpoint pen, ballpen, Biro +n02786058 Band Aid +n02787622 banjo +n02788148 bannister, banister, balustrade, balusters, handrail +n02790996 barbell +n02791124 barber chair +n02791270 barbershop +n02793495 barn +n02794156 barometer +n02795169 barrel, cask +n02797295 barrow, garden cart, lawn cart, wheelbarrow +n02799071 baseball +n02802426 basketball +n02804414 bassinet +n02804610 bassoon +n02807133 bathing cap, swimming cap +n02808304 bath towel +n02808440 bathtub, bathing tub, bath, tub +n02814533 beach wagon, station wagon, wagon, estate car, beach waggon, station waggon, waggon +n02814860 beacon, lighthouse, beacon light, pharos +n02815834 beaker +n02817516 bearskin, busby, shako +n02823428 beer bottle +n02823750 beer glass +n02825657 bell cote, bell cot +n02834397 bib +n02835271 bicycle-built-for-two, tandem bicycle, tandem +n02837789 bikini, two-piece +n02840245 binder, ring-binder +n02841315 binoculars, field glasses, opera glasses +n02843684 birdhouse +n02859443 boathouse +n02860847 bobsled, bobsleigh, bob +n02865351 bolo tie, bolo, bola tie, bola +n02869837 bonnet, poke bonnet +n02870880 bookcase +n02871525 bookshop, bookstore, bookstall +n02877765 bottlecap +n02879718 bow +n02883205 bow tie, bow-tie, bowtie +n02892201 brass, memorial tablet, plaque +n02892767 brassiere, bra, bandeau +n02894605 breakwater, groin, groyne, mole, bulwark, seawall, jetty +n02895154 breastplate, aegis, egis +n02906734 broom +n02909870 bucket, pail +n02910353 buckle +n02916936 bulletproof vest +n02917067 bullet train, bullet +n02927161 butcher shop, 
meat market +n02930766 cab, hack, taxi, taxicab +n02939185 caldron, cauldron +n02948072 candle, taper, wax light +n02950826 cannon +n02951358 canoe +n02951585 can opener, tin opener +n02963159 cardigan +n02965783 car mirror +n02966193 carousel, carrousel, merry-go-round, roundabout, whirligig +n02966687 carpenter's kit, tool kit +n02971356 carton +n02974003 car wheel +n02977058 cash machine, cash dispenser, automated teller machine, automatic teller machine, automated teller, automatic teller, ATM +n02978881 cassette +n02979186 cassette player +n02980441 castle +n02981792 catamaran +n02988304 CD player +n02992211 cello, violoncello +n02992529 cellular telephone, cellular phone, cellphone, cell, mobile phone +n02999410 chain +n03000134 chainlink fence +n03000247 chain mail, ring mail, mail, chain armor, chain armour, ring armor, ring armour +n03000684 chain saw, chainsaw +n03014705 chest +n03016953 chiffonier, commode +n03017168 chime, bell, gong +n03018349 china cabinet, china closet +n03026506 Christmas stocking +n03028079 church, church building +n03032252 cinema, movie theater, movie theatre, movie house, picture palace +n03041632 cleaver, meat cleaver, chopper +n03042490 cliff dwelling +n03045698 cloak +n03047690 clog, geta, patten, sabot +n03062245 cocktail shaker +n03063599 coffee mug +n03063689 coffeepot +n03065424 coil, spiral, volute, whorl, helix +n03075370 combination lock +n03085013 computer keyboard, keypad +n03089624 confectionery, confectionary, candy store +n03095699 container ship, containership, container vessel +n03100240 convertible +n03109150 corkscrew, bottle screw +n03110669 cornet, horn, trumpet, trump +n03124043 cowboy boot +n03124170 cowboy hat, ten-gallon hat +n03125729 cradle +n03126707 crane +n03127747 crash helmet +n03127925 crate +n03131574 crib, cot +n03133878 Crock Pot +n03134739 croquet ball +n03141823 crutch +n03146219 cuirass +n03160309 dam, dike, dyke +n03179701 desk +n03180011 desktop computer +n03187595 dial telephone, dial phone +n03188531 diaper, nappy, napkin +n03196217 digital clock +n03197337 digital watch +n03201208 dining table, board +n03207743 dishrag, dishcloth +n03207941 dishwasher, dish washer, dishwashing machine +n03208938 disk brake, disc brake +n03216828 dock, dockage, docking facility +n03218198 dogsled, dog sled, dog sleigh +n03220513 dome +n03223299 doormat, welcome mat +n03240683 drilling platform, offshore rig +n03249569 drum, membranophone, tympan +n03250847 drumstick +n03255030 dumbbell +n03259280 Dutch oven +n03271574 electric fan, blower +n03272010 electric guitar +n03272562 electric locomotive +n03290653 entertainment center +n03291819 envelope +n03297495 espresso maker +n03314780 face powder +n03325584 feather boa, boa +n03337140 file, file cabinet, filing cabinet +n03344393 fireboat +n03345487 fire engine, fire truck +n03347037 fire screen, fireguard +n03355925 flagpole, flagstaff +n03372029 flute, transverse flute +n03376595 folding chair +n03379051 football helmet +n03384352 forklift +n03388043 fountain +n03388183 fountain pen +n03388549 four-poster +n03393912 freight car +n03394916 French horn, horn +n03400231 frying pan, frypan, skillet +n03404251 fur coat +n03417042 garbage truck, dustcart +n03424325 gasmask, respirator, gas helmet +n03425413 gas pump, gasoline pump, petrol pump, island dispenser +n03443371 goblet +n03444034 go-kart +n03445777 golf ball +n03445924 golfcart, golf cart +n03447447 gondola +n03447721 gong, tam-tam +n03450230 gown +n03452741 grand piano, grand +n03457902 greenhouse, nursery, glasshouse 
+n03459775 grille, radiator grille +n03461385 grocery store, grocery, food market, market +n03467068 guillotine +n03476684 hair slide +n03476991 hair spray +n03478589 half track +n03481172 hammer +n03482405 hamper +n03483316 hand blower, blow dryer, blow drier, hair dryer, hair drier +n03485407 hand-held computer, hand-held microcomputer +n03485794 handkerchief, hankie, hanky, hankey +n03492542 hard disc, hard disk, fixed disk +n03494278 harmonica, mouth organ, harp, mouth harp +n03495258 harp +n03496892 harvester, reaper +n03498962 hatchet +n03527444 holster +n03529860 home theater, home theatre +n03530642 honeycomb +n03532672 hook, claw +n03534580 hoopskirt, crinoline +n03535780 horizontal bar, high bar +n03538406 horse cart, horse-cart +n03544143 hourglass +n03584254 iPod +n03584829 iron, smoothing iron +n03590841 jack-o'-lantern +n03594734 jean, blue jean, denim +n03594945 jeep, landrover +n03595614 jersey, T-shirt, tee shirt +n03598930 jigsaw puzzle +n03599486 jinrikisha, ricksha, rickshaw +n03602883 joystick +n03617480 kimono +n03623198 knee pad +n03627232 knot +n03630383 lab coat, laboratory coat +n03633091 ladle +n03637318 lampshade, lamp shade +n03642806 laptop, laptop computer +n03649909 lawn mower, mower +n03657121 lens cap, lens cover +n03658185 letter opener, paper knife, paperknife +n03661043 library +n03662601 lifeboat +n03666591 lighter, light, igniter, ignitor +n03670208 limousine, limo +n03673027 liner, ocean liner +n03676483 lipstick, lip rouge +n03680355 Loafer +n03690938 lotion +n03691459 loudspeaker, speaker, speaker unit, loudspeaker system, speaker system +n03692522 loupe, jeweler's loupe +n03697007 lumbermill, sawmill +n03706229 magnetic compass +n03709823 mailbag, postbag +n03710193 mailbox, letter box +n03710637 maillot +n03710721 maillot, tank suit +n03717622 manhole cover +n03720891 maraca +n03721384 marimba, xylophone +n03724870 mask +n03729826 matchstick +n03733131 maypole +n03733281 maze, labyrinth +n03733805 measuring cup +n03742115 medicine chest, medicine cabinet +n03743016 megalith, megalithic structure +n03759954 microphone, mike +n03761084 microwave, microwave oven +n03763968 military uniform +n03764736 milk can +n03769881 minibus +n03770439 miniskirt, mini +n03770679 minivan +n03773504 missile +n03775071 mitten +n03775546 mixing bowl +n03776460 mobile home, manufactured home +n03777568 Model T +n03777754 modem +n03781244 monastery +n03782006 monitor +n03785016 moped +n03786901 mortar +n03787032 mortarboard +n03788195 mosque +n03788365 mosquito net +n03791053 motor scooter, scooter +n03792782 mountain bike, all-terrain bike, off-roader +n03792972 mountain tent +n03793489 mouse, computer mouse +n03794056 mousetrap +n03796401 moving van +n03803284 muzzle +n03804744 nail +n03814639 neck brace +n03814906 necklace +n03825788 nipple +n03832673 notebook, notebook computer +n03837869 obelisk +n03838899 oboe, hautboy, hautbois +n03840681 ocarina, sweet potato +n03841143 odometer, hodometer, mileometer, milometer +n03843555 oil filter +n03854065 organ, pipe organ +n03857828 oscilloscope, scope, cathode-ray oscilloscope, CRO +n03866082 overskirt +n03868242 oxcart +n03868863 oxygen mask +n03871628 packet +n03873416 paddle, boat paddle +n03874293 paddlewheel, paddle wheel +n03874599 padlock +n03876231 paintbrush +n03877472 pajama, pyjama, pj's, jammies +n03877845 palace +n03884397 panpipe, pandean pipe, syrinx +n03887697 paper towel +n03888257 parachute, chute +n03888605 parallel bars, bars +n03891251 park bench +n03891332 parking meter +n03895866 passenger car, 
coach, carriage +n03899768 patio, terrace +n03902125 pay-phone, pay-station +n03903868 pedestal, plinth, footstall +n03908618 pencil box, pencil case +n03908714 pencil sharpener +n03916031 perfume, essence +n03920288 Petri dish +n03924679 photocopier +n03929660 pick, plectrum, plectron +n03929855 pickelhaube +n03930313 picket fence, paling +n03930630 pickup, pickup truck +n03933933 pier +n03935335 piggy bank, penny bank +n03937543 pill bottle +n03938244 pillow +n03942813 ping-pong ball +n03944341 pinwheel +n03947888 pirate, pirate ship +n03950228 pitcher, ewer +n03954731 plane, carpenter's plane, woodworking plane +n03956157 planetarium +n03958227 plastic bag +n03961711 plate rack +n03967562 plow, plough +n03970156 plunger, plumber's helper +n03976467 Polaroid camera, Polaroid Land camera +n03976657 pole +n03977966 police van, police wagon, paddy wagon, patrol wagon, wagon, black Maria +n03980874 poncho +n03982430 pool table, billiard table, snooker table +n03983396 pop bottle, soda bottle +n03991062 pot, flowerpot +n03992509 potter's wheel +n03995372 power drill +n03998194 prayer rug, prayer mat +n04004767 printer +n04005630 prison, prison house +n04008634 projectile, missile +n04009552 projector +n04019541 puck, hockey puck +n04023962 punching bag, punch bag, punching ball, punchball +n04026417 purse +n04033901 quill, quill pen +n04033995 quilt, comforter, comfort, puff +n04037443 racer, race car, racing car +n04039381 racket, racquet +n04040759 radiator +n04041544 radio, wireless +n04044716 radio telescope, radio reflector +n04049303 rain barrel +n04065272 recreational vehicle, RV, R.V. +n04067472 reel +n04069434 reflex camera +n04070727 refrigerator, icebox +n04074963 remote control, remote +n04081281 restaurant, eating house, eating place, eatery +n04086273 revolver, six-gun, six-shooter +n04090263 rifle +n04099969 rocking chair, rocker +n04111531 rotisserie +n04116512 rubber eraser, rubber, pencil eraser +n04118538 rugby ball +n04118776 rule, ruler +n04120489 running shoe +n04125021 safe +n04127249 safety pin +n04131690 saltshaker, salt shaker +n04133789 sandal +n04136333 sarong +n04141076 sax, saxophone +n04141327 scabbard +n04141975 scale, weighing machine +n04146614 school bus +n04147183 schooner +n04149813 scoreboard +n04152593 screen, CRT screen +n04153751 screw +n04154565 screwdriver +n04162706 seat belt, seatbelt +n04179913 sewing machine +n04192698 shield, buckler +n04200800 shoe shop, shoe-shop, shoe store +n04201297 shoji +n04204238 shopping basket +n04204347 shopping cart +n04208210 shovel +n04209133 shower cap +n04209239 shower curtain +n04228054 ski +n04229816 ski mask +n04235860 sleeping bag +n04238763 slide rule, slipstick +n04239074 sliding door +n04243546 slot, one-armed bandit +n04251144 snorkel +n04252077 snowmobile +n04252225 snowplow, snowplough +n04254120 soap dispenser +n04254680 soccer ball +n04254777 sock +n04258138 solar dish, solar collector, solar furnace +n04259630 sombrero +n04263257 soup bowl +n04264628 space bar +n04265275 space heater +n04266014 space shuttle +n04270147 spatula +n04273569 speedboat +n04275548 spider web, spider's web +n04277352 spindle +n04285008 sports car, sport car +n04286575 spotlight, spot +n04296562 stage +n04310018 steam locomotive +n04311004 steel arch bridge +n04311174 steel drum +n04317175 stethoscope +n04325704 stole +n04326547 stone wall +n04328186 stopwatch, stop watch +n04330267 stove +n04332243 strainer +n04335435 streetcar, tram, tramcar, trolley, trolley car +n04336792 stretcher +n04344873 studio couch, day bed 
+n04346328 stupa, tope +n04347754 submarine, pigboat, sub, U-boat +n04350905 suit, suit of clothes +n04355338 sundial +n04355933 sunglass +n04356056 sunglasses, dark glasses, shades +n04357314 sunscreen, sunblock, sun blocker +n04366367 suspension bridge +n04367480 swab, swob, mop +n04370456 sweatshirt +n04371430 swimming trunks, bathing trunks +n04371774 swing +n04372370 switch, electric switch, electrical switch +n04376876 syringe +n04380533 table lamp +n04389033 tank, army tank, armored combat vehicle, armoured combat vehicle +n04392985 tape player +n04398044 teapot +n04399382 teddy, teddy bear +n04404412 television, television system +n04409515 tennis ball +n04417672 thatch, thatched roof +n04418357 theater curtain, theatre curtain +n04423845 thimble +n04428191 thresher, thrasher, threshing machine +n04429376 throne +n04435653 tile roof +n04442312 toaster +n04443257 tobacco shop, tobacconist shop, tobacconist +n04447861 toilet seat +n04456115 torch +n04458633 totem pole +n04461696 tow truck, tow car, wrecker +n04462240 toyshop +n04465501 tractor +n04467665 trailer truck, tractor trailer, trucking rig, rig, articulated lorry, semi +n04476259 tray +n04479046 trench coat +n04482393 tricycle, trike, velocipede +n04483307 trimaran +n04485082 tripod +n04486054 triumphal arch +n04487081 trolleybus, trolley coach, trackless trolley +n04487394 trombone +n04493381 tub, vat +n04501370 turnstile +n04505470 typewriter keyboard +n04507155 umbrella +n04509417 unicycle, monocycle +n04515003 upright, upright piano +n04517823 vacuum, vacuum cleaner +n04522168 vase +n04523525 vault +n04525038 velvet +n04525305 vending machine +n04532106 vestment +n04532670 viaduct +n04536866 violin, fiddle +n04540053 volleyball +n04542943 waffle iron +n04548280 wall clock +n04548362 wallet, billfold, notecase, pocketbook +n04550184 wardrobe, closet, press +n04552348 warplane, military plane +n04553703 washbasin, handbasin, washbowl, lavabo, wash-hand basin +n04554684 washer, automatic washer, washing machine +n04557648 water bottle +n04560804 water jug +n04562935 water tower +n04579145 whiskey jug +n04579432 whistle +n04584207 wig +n04589890 window screen +n04590129 window shade +n04591157 Windsor tie +n04591713 wine bottle +n04592741 wing +n04596742 wok +n04597913 wooden spoon +n04599235 wool, woolen, woollen +n04604644 worm fence, snake fence, snake-rail fence, Virginia fence +n04606251 wreck +n04612504 yawl +n04613696 yurt +n06359193 web site, website, internet site, site +n06596364 comic book +n06785654 crossword puzzle, crossword +n06794110 street sign +n06874185 traffic light, traffic signal, stoplight +n07248320 book jacket, dust cover, dust jacket, dust wrapper +n07565083 menu +n07579787 plate +n07583066 guacamole +n07584110 consomme +n07590611 hot pot, hotpot +n07613480 trifle +n07614500 ice cream, icecream +n07615774 ice lolly, lolly, lollipop, popsicle +n07684084 French loaf +n07693725 bagel, beigel +n07695742 pretzel +n07697313 cheeseburger +n07697537 hotdog, hot dog, red hot +n07711569 mashed potato +n07714571 head cabbage +n07714990 broccoli +n07715103 cauliflower +n07716358 zucchini, courgette +n07716906 spaghetti squash +n07717410 acorn squash +n07717556 butternut squash +n07718472 cucumber, cuke +n07718747 artichoke, globe artichoke +n07720875 bell pepper +n07730033 cardoon +n07734744 mushroom +n07742313 Granny Smith +n07745940 strawberry +n07747607 orange +n07749582 lemon +n07753113 fig +n07753275 pineapple, ananas +n07753592 banana +n07754684 jackfruit, jak, jack +n07760859 custard apple +n07768694 
pomegranate +n07802026 hay +n07831146 carbonara +n07836838 chocolate sauce, chocolate syrup +n07860988 dough +n07871810 meat loaf, meatloaf +n07873807 pizza, pizza pie +n07875152 potpie +n07880968 burrito +n07892512 red wine +n07920052 espresso +n07930864 cup +n07932039 eggnog +n09193705 alp +n09229709 bubble +n09246464 cliff, drop, drop-off +n09256479 coral reef +n09288635 geyser +n09332890 lakeside, lakeshore +n09399592 promontory, headland, head, foreland +n09421951 sandbar, sand bar +n09428293 seashore, coast, seacoast, sea-coast +n09468604 valley, vale +n09472597 volcano +n09835506 ballplayer, baseball player +n10148035 groom, bridegroom +n10565667 scuba diver +n11879895 rapeseed +n11939491 daisy +n12057211 yellow lady's slipper, yellow lady-slipper, Cypripedium calceolus, Cypripedium parviflorum +n12144580 corn +n12267677 acorn +n12620546 hip, rose hip, rosehip +n12768682 buckeye, horse chestnut, conker +n12985857 coral fungus +n12998815 agaric +n13037406 gyromitra +n13040303 stinkhorn, carrion fungus +n13044778 earthstar +n13052670 hen-of-the-woods, hen of the woods, Polyporus frondosus, Grifola frondosa +n13054560 bolete +n13133613 ear, spike, capitulum +n15075141 toilet tissue, toilet paper, bathroom tissue
diff --git a/cviruntime/samples/samples_extra/data/trump1.jpg b/cviruntime/samples/samples_extra/data/trump1.jpg
new file mode 100644 index 000000000..592ff2de6
Binary files /dev/null and b/cviruntime/samples/samples_extra/data/trump1.jpg differ
diff --git a/cviruntime/samples/samples_extra/data/trump2.jpg b/cviruntime/samples/samples_extra/data/trump2.jpg
new file mode 100644 index 000000000..20e517a76
Binary files /dev/null and b/cviruntime/samples/samples_extra/data/trump2.jpg differ
diff --git a/cviruntime/samples/samples_extra/data/trump3.jpg b/cviruntime/samples/samples_extra/data/trump3.jpg
new file mode 100644 index 000000000..49642c584
Binary files /dev/null and b/cviruntime/samples/samples_extra/data/trump3.jpg differ
diff --git a/cviruntime/samples/samples_extra/detector_ppyoloem_fused_preprocess/CMakeLists.txt b/cviruntime/samples/samples_extra/detector_ppyoloem_fused_preprocess/CMakeLists.txt
new file mode 100644 index 000000000..f8d32cbd4
--- /dev/null
+++ b/cviruntime/samples/samples_extra/detector_ppyoloem_fused_preprocess/CMakeLists.txt
@@ -0,0 +1,39 @@
+cmake_minimum_required(VERSION 2.8.0)
+
+project(cvi_sample_detector C CXX)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS ON)
+
+if(NOT DEFINED TPU_SDK_PATH)
+  message(FATAL_ERROR "Please set TPU_SDK_PATH to point to the TPU_SDK installation")
+endif()
+include_directories(${TPU_SDK_PATH}/include)
+link_directories(${TPU_SDK_PATH}/lib)
+
+if(NOT DEFINED OPENCV_PATH)
+  message(FATAL_ERROR "Please set OPENCV_PATH to point to the opencv installation")
+endif()
+include_directories(${OPENCV_PATH}/include)
+link_directories(${OPENCV_PATH}/lib)
+
+set(CVI_LIBS ${CVI_LIBS} cviruntime cvikernel)
+if(NOT CMAKE_CROSSCOMPILING)
+  set(CVI_LIBS ${CVI_LIBS} cvicmodel)
+endif()
+
+set(OPENCV_LIBS ${OPENCV_LIBS} opencv_core opencv_imgcodecs opencv_imgproc)
+if(NOT CMAKE_CROSSCOMPILING)
+  set(OPENCV_LIBS ${OPENCV_LIBS} opencv_highgui)
+endif()
+
+set(EXTRA_LIBS ${EXTRA_LIBS} dl stdc++ pthread z)
+
+add_executable(cvi_sample_detector_ppyoloem_fused_preprocess
+    detector_ppyoloem_fused_preprocess.cpp)
+target_link_libraries(cvi_sample_detector_ppyoloem_fused_preprocess
+    ${CVI_LIBS}
+    ${OPENCV_LIBS}
+    ${EXTRA_LIBS})
+install(TARGETS cvi_sample_detector_ppyoloem_fused_preprocess DESTINATION samples_extra/bin)
\ No newline at end of file
diff --git a/cviruntime/samples/samples_extra/detector_ppyoloem_fused_preprocess/README.md b/cviruntime/samples/samples_extra/detector_ppyoloem_fused_preprocess/README.md
new file mode 100644 index 000000000..697586f15
--- /dev/null
+++ b/cviruntime/samples/samples_extra/detector_ppyoloem_fused_preprocess/README.md
@@ -0,0 +1,117 @@
+# PPYOLOE_M Sample with post_process
+
+### Download the model and convert the model under docker (optional)
+#### For new toolchain guide
+The following documents are required:
+* tpu-mlir_xxxx.tar.gz (The release package of tpu-mlir)
+
+Transform cvimodel shell:
+``` shell
+tar zxf tpu-mlir_xxxx.tar.gz
+source tpu-mlir_xxxx/envsetup.sh
+
+mkdir workspace && cd workspace
+cp $TPUC_ROOT/regression/image/dog.jpg .
+cp -rf $TPUC_ROOT/regression/dataset/COCO2017 .
+
+model_transform.py \
+  --model_name ppyoloe_m \
+  --model_def ./ppyoloe_m_new.onnx \
+  --input_shapes [[1,3,640,640]] \
+  --pixel_format "rgb" \
+  --mean 123.675,116.28,103.53 \
+  --scale 0.017125,0.017507,0.01743 \
+  --test_input ./dog.jpg \
+  --test_result ppyoloe_m_top_outputs.npz \
+  --mlir ppyoloe_m.mlir
+
+run_calibration.py \
+  ppyoloe_m.mlir \
+  --dataset=./COCO2017 \
+  --input_num=100 \
+  -o ppyoloe_m_calibration_table
+
+model_deploy.py \
+  --mlir ppyoloe_m.mlir \
+  --quantize INT8 \
+  --calibration_table ./ppyoloe_m_calibration_table \
+  --chip cv183x \
+  --test_input ./dog.jpg \
+  --test_reference ppyoloe_m_top_outputs.npz \
+  --compare_all \
+  --quant_input \
+  --tolerance 0.8,0.4 \
+  --fuse_preprocess \
+  --model ppyoloe_m_int8.cvimodel
+```
+#### For old toolchain guide
+The following documents are required:
+
+* cvitek_mlir_ubuntu-18.04.tar.gz
+
+Transform model shell:
+
+```shell
+tar zxf cvitek_mlir_ubuntu-18.04.tar.gz
+source cvitek_mlir/cvitek_envs.sh
+
+mkdir workspace && cd workspace
+cp $MLIR_PATH/tpuc/regression/data/dog.jpg .
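+# Note (assumption, inferred from how the sibling samples use it):
+# assign_output.py truncates the ONNX graph at the named tensors, so the
+# TPU-unfriendly tail of the network stays out of the compiled model and is
+# re-implemented in the C++ sample; the tensor names below are specific to
+# this ppyoloe_m export.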
+
+assign_output.py --model ./ppyoloe_m.onnx --output p2o.Transpose.12,p2o.Concat.30
+
+model_transform.py \
+  --model_type onnx \
+  --model_name ppyoloe_m \
+  --model_def ./ppyoloe_m_new.onnx \
+  --image ./dog.jpg \
+  --image_resize_dims 640,640 \
+  --keep_aspect_ratio false \
+  --net_input_dims 640,640 \
+  --raw_scale 1.0 \
+  --mean 0.485,0.456,0.406 \
+  --std 0.229,0.224,0.225 \
+  --input_scale 1.0 \
+  --model_channel_order "rgb" \
+  --tolerance 0.99,0.99,0.99 \
+  --mlir ppyoloe_m_fp32.mlir
+
+run_calibration.py \
+  ppyoloe_m_fp32.mlir \
+  --dataset=/data/dataset/coco/val2017 \
+  --input_num=100 \
+  --calibration_table ppyoloe_m_calibration_table
+
+model_deploy.py \
+  --model_name ppyoloe_m \
+  --mlir ppyoloe_m_fp32.mlir \
+  --calibration_table ppyoloe_m_calibration_table \
+  --quantize INT8 \
+  --pixel_format RGB_PLANAR \
+  --chip cv183x \
+  --image ./dog.jpg \
+  --tolerance 0.9,0.9,0.6 \
+  --correctness 0.9,0.9,0.93 \
+  --cvimodel ppyoloe_m_int8.cvimodel \
+  --fuse_preprocess
+```
+
+**Attention**: After run_calibration.py, we advise manually setting the thresholds of "p2o.Concat.29_Conca", "p2o.Concat.31_Concat" and "transpose_3.tmp_0_Transpose" to their max_value, like this:
+
+![](cali.png)
+
+Copy generated ppyoloe_m_int8.cvimodel to EVB board
+
+## How To Compile Vpss input Sample In Docker
+
+View the Top level directory README.md
+
+## Run Samples In EVB Board
+
+```shell
+cd install_samples/samples_extra
+./bin/cvi_sample_detector_ppyoloem_fused_preprocess \
+./ppyoloe_m_int8.cvimodel \
+./data/dog.jpg \
+ppyoloe_ms_out.jpg
+```
\ No newline at end of file
diff --git a/cviruntime/samples/samples_extra/detector_ppyoloem_fused_preprocess/cali.png b/cviruntime/samples/samples_extra/detector_ppyoloem_fused_preprocess/cali.png
new file mode 100644 index 000000000..287c851fc
Binary files /dev/null and b/cviruntime/samples/samples_extra/detector_ppyoloem_fused_preprocess/cali.png differ
diff --git a/cviruntime/samples/samples_extra/detector_ppyoloem_fused_preprocess/detector_ppyoloem_fused_preprocess.cpp b/cviruntime/samples/samples_extra/detector_ppyoloem_fused_preprocess/detector_ppyoloem_fused_preprocess.cpp
new file mode 100644 index 000000000..b716af8e5
--- /dev/null
+++ b/cviruntime/samples/samples_extra/detector_ppyoloem_fused_preprocess/detector_ppyoloem_fused_preprocess.cpp
@@ -0,0 +1,334 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <vector>
+#include <algorithm>
+#include <opencv2/opencv.hpp>
+#include "cviruntime.h"
+
+
+typedef struct {
+  float x, y, w, h;
+} box;
+
+typedef struct {
+  box bbox;
+  int cls;
+  float score;
+  int batch_idx;
+} detection;
+
+
+static const char *coco_names[] = {
+    "person", "bicycle", "car", "motorbike",
+    "aeroplane", "bus", "train", "truck",
+    "boat", "traffic light", "fire hydrant", "stop sign",
+    "parking meter", "bench", "bird", "cat",
+    "dog", "horse", "sheep", "cow",
+    "elephant", "bear", "zebra", "giraffe",
+    "backpack", "umbrella", "handbag", "tie",
+    "suitcase", "frisbee", "skis", "snowboard",
+    "sports ball", "kite", "baseball bat", "baseball glove",
+    "skateboard", "surfboard", "tennis racket", "bottle",
+    "wine glass", "cup", "fork", "knife",
+    "spoon", "bowl", "banana", "apple",
+    "sandwich", "orange", "broccoli", "carrot",
+    "hot dog", "pizza", "donut", "cake",
+    "chair", "sofa", "pottedplant", "bed",
+    "diningtable", "toilet", "tvmonitor", "laptop",
+    "mouse", "remote", "keyboard", "cell phone",
+    "microwave", "oven", "toaster", "sink",
+    "refrigerator", "book", "clock", "vase",
+    "scissors", "teddy bear", "hair drier", "toothbrush"};
+
+
+static void
usage(char **argv) {
+  printf("Usage:\n");
+  printf("   %s cvimodel image.jpg image_detected.jpg\n", argv[0]);
+}
+
+template <typename T>
+int argmax(const T *data,
+           size_t len,
+           size_t stride = 1)
+{
+  int maxIndex = 0;
+  for (size_t i = 1; i < len; i++)
+  {
+    int idx = i * stride;
+    if (data[maxIndex * stride] < data[idx])
+    {
+      maxIndex = i;
+    }
+  }
+  return maxIndex;
+}
+
+float calIou(box a, box b)
+{
+  float area1 = a.w * a.h;
+  float area2 = b.w * b.h;
+  float wi = std::min((a.x + a.w / 2), (b.x + b.w / 2)) - std::max((a.x - a.w / 2), (b.x - b.w / 2));
+  float hi = std::min((a.y + a.h / 2), (b.y + b.h / 2)) - std::max((a.y - a.h / 2), (b.y - b.h / 2));
+  float area_i = std::max(wi, 0.0f) * std::max(hi, 0.0f);
+  return area_i / (area1 + area2 - area_i);
+}
+
+void correctYoloBoxes(std::vector<detection> &dets,
+                      int det_num,
+                      int image_h,
+                      int image_w,
+                      int input_height,
+                      int input_width) {
+  int restored_w;
+  int restored_h;
+  float resize_ratio;
+  if (((float)input_width / image_w) < ((float)input_height / image_h)) {
+    restored_w = input_width;
+    restored_h = (image_h * input_width) / image_w;
+  } else {
+    restored_h = input_height;
+    restored_w = (image_w * input_height) / image_h;
+  }
+  resize_ratio = ((float)image_w / restored_w);
+
+  for (int i = 0; i < det_num; ++i) {
+    box bbox = dets[i].bbox;
+    int b = dets[i].batch_idx;
+    bbox.x = (bbox.x - (input_width - restored_w) / 2.) * resize_ratio;
+    bbox.y = (bbox.y - (input_height - restored_h) / 2.) * resize_ratio;
+    bbox.w *= resize_ratio;
+    bbox.h *= resize_ratio;
+    dets[i].bbox = bbox;
+  }
+}
+
+static void NMS(std::vector<detection> &dets, int *total, float thresh)
+{
+  if (*total){
+    std::sort(dets.begin(), dets.end(), [](detection &a, detection &b)
+              { return b.score < a.score; });
+    int new_count = *total;
+    for (int i = 0; i < *total; ++i)
+    {
+      detection &a = dets[i];
+      if (a.score == 0)
+        continue;
+      for (int j = i + 1; j < *total; ++j)
+      {
+        detection &b = dets[j];
+        if (dets[i].batch_idx == dets[j].batch_idx &&
+            b.score != 0 && dets[i].cls == dets[j].cls &&
+            calIou(a.bbox, b.bbox) > thresh)
+        {
+          b.score = 0;
+          new_count--;
+        }
+      }
+    }
+    std::vector<detection>::iterator it = dets.begin();
+    while (it != dets.end()) {
+      if (it->score == 0) {
+        it = dets.erase(it);  // erase() invalidates it; use the returned iterator
+      } else {
+        it++;
+      }
+    }
+    *total = new_count;
+  }
+}
+
+/**
+ * @brief
+ *
+ * @note work as long as output shape [n, a, h, w, cls + 5]
+ * @param layer
+ * @param input_height
+ * @param input_width
+ * @param classes_num
+ * @param conf_thresh
+ * @param dets
+ * @return int
+ */
+int getDetections(CVI_TENSOR *output,
+                  int32_t input_height,
+                  int32_t input_width,
+                  int classes_num,
+                  CVI_SHAPE output_shape,
+                  float conf_thresh,
+                  std::vector<detection> &dets) {
+
+  float *scores_ptr = (float *)CVI_NN_TensorPtr(&output[1]);
+  float *dets_ptr = (float *)CVI_NN_TensorPtr(&output[0]);
+  float stride[3] = {32, 16, 8};
+  int count = 0;
+  int batch = output_shape.dim[0];
+  int total_box_num = output_shape.dim[1];
+  // int max_loc = argmax(scores_ptr, classes_num * 8400);
+  // float max_score = scores_ptr[max_loc];
+  for (int b = 0; b < batch; b++) {
+    for (int i = 0; i < 3; i++) {
+      int nh = input_height / stride[i], nw = input_width / stride[i];
+      int box_num = nh * nw;
+      for (int j = 0; j < box_num; j++) {
+        float anchor_x = (float)(j % nw) + 0.5, anchor_y = (float)(j / nw) + 0.5;
+        int category = argmax(scores_ptr, classes_num);
+        if (scores_ptr[category] <= conf_thresh) {
+          scores_ptr = scores_ptr + classes_num;
+          dets_ptr = dets_ptr + 4;
+          continue;
+        }
+        detection det;
+        det.score = scores_ptr[category];
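+        // PPYOLOE decodes anchor-free: dets_ptr holds the (left, top, right,
+        // bottom) distances from the cell center (anchor_x, anchor_y) in
+        // feature-map units; the lines below turn them into corner points,
+        // then into a center/size box scaled to input pixels by this level's
+        // stride.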
+        det.cls = category;
+        det.batch_idx = b;
+        float x1 = anchor_x - dets_ptr[0];
+        float y1 = anchor_y - dets_ptr[1];
+        float x2 = anchor_x + dets_ptr[2];
+        float y2 = anchor_y + dets_ptr[3];
+        det.bbox.h = (y2 - y1) * stride[i];
+        det.bbox.w = (x2 - x1) * stride[i];
+        det.bbox.x = x1 * stride[i] + det.bbox.w / 2.0;
+        det.bbox.y = y1 * stride[i] + det.bbox.h / 2.0;
+        count++;
+        dets.emplace_back(det);
+        scores_ptr = scores_ptr + classes_num;
+        dets_ptr = dets_ptr + 4;
+      }
+    }
+  }
+  return count;
+}
+
+
+int main(int argc, char **argv) {
+  int ret = 0;
+  CVI_MODEL_HANDLE model;
+
+  if (argc != 4) {
+    usage(argv);
+    exit(-1);
+  }
+  CVI_TENSOR *input;
+  CVI_TENSOR *output;
+  CVI_TENSOR *input_tensors;
+  CVI_TENSOR *output_tensors;
+  int32_t input_num;
+  int32_t output_num;
+  CVI_SHAPE input_shape;
+  CVI_SHAPE* output_shape;
+  int32_t height;
+  int32_t width;
+  float qscale;
+  //int bbox_len = 84; // classes num + 4
+  int classes_num = 80;
+  float conf_thresh = 0.5;
+  float iou_thresh = 0.5;
+  ret = CVI_NN_RegisterModel(argv[1], &model);
+  if (ret != CVI_RC_SUCCESS) {
+    printf("CVI_NN_RegisterModel failed, err %d\n", ret);
+    exit(1);
+  }
+  printf("CVI_NN_RegisterModel succeeded\n");
+
+  // get input output tensors
+  CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num, &output_tensors,
+                               &output_num);
+
+  input = CVI_NN_GetTensorByName(CVI_NN_DEFAULT_TENSOR, input_tensors, input_num);
+  assert(input);
+  output = output_tensors;
+  printf("CVI_NN_GetTensorByName succeeded.\n");
+  output_shape = reinterpret_cast<CVI_SHAPE *>(calloc(output_num, sizeof(CVI_SHAPE)));
+  for (int i = 0; i < output_num; i++)
+  {
+    output_shape[i] = CVI_NN_TensorShape(&output[i]);
+  }
+
+  // nchw
+  input_shape = CVI_NN_TensorShape(input);
+  height = input_shape.dim[2];
+  width = input_shape.dim[3];
+  assert(height % 32 == 0 && width % 32 == 0);
+  // imread
+  cv::Mat image;
+  image = cv::imread(argv[2]);
+  if (!image.data) {
+    printf("Could not open or find the image\n");
+    return -1;
+  }
+  cv::Mat cloned = image.clone();
+
+  // resize & letterbox
+  int ih = image.rows;
+  int iw = image.cols;
+  int oh = height;
+  int ow = width;
+  double resize_scale = std::min((double)oh / ih, (double)ow / iw);
+  int nh = (int)(ih * resize_scale);
+  int nw = (int)(iw * resize_scale);
+  cv::resize(image, image, cv::Size(nw, nh));
+  int top = (oh - nh) / 2;
+  int bottom = (oh - nh) - top;
+  int left = (ow - nw) / 2;
+  int right = (ow - nw) - left;
+  cv::copyMakeBorder(image, image, top, bottom, left, right, cv::BORDER_CONSTANT,
+                     cv::Scalar::all(0));
+  cv::cvtColor(image, image, cv::COLOR_BGR2RGB);
+
+  //Packed2Planar
+  cv::Mat channels[3];
+  for (int i = 0; i < 3; i++) {
+    channels[i] = cv::Mat(image.rows, image.cols, CV_8SC1);
+  }
+  cv::split(image, channels);
+
+  // fill data
+  int8_t *ptr = (int8_t *)CVI_NN_TensorPtr(input);
+  int channel_size = height * width;
+  for (int i = 0; i < 3; ++i) {
+    memcpy(ptr + i * channel_size, channels[i].data, channel_size);
+  }
+
+  // run inference
+  CVI_NN_Forward(model, input_tensors, input_num, output_tensors, output_num);
+  printf("CVI_NN_Forward succeeded\n");
+  // do post process
+  int det_num = 0;
+  std::vector<detection> dets;
+  det_num = getDetections(output, height, width, classes_num, output_shape[0],
+                          conf_thresh, dets);
+  // correct box with origin image size
+  NMS(dets, &det_num, iou_thresh);
+  correctYoloBoxes(dets, det_num, cloned.rows, cloned.cols, height, width);
+  printf("get detection num: %d\n", det_num);
+
+  // draw bbox on image
+  for (int i = 0; i < det_num; i++) {
+    box b = dets[i].bbox;
+    // xywh2xyxy
+    int x1 = (b.x - b.w / 2);
+    int y1 = (b.y - b.h / 2);
+    int x2 = (b.x + b.w / 2);
+    int y2 = (b.y + b.h / 2);
+    cv::rectangle(cloned, cv::Point(x1, y1), cv::Point(x2, y2), cv::Scalar(255, 255, 0),
+                  3, 8, 0);
+    char content[100];
+    sprintf(content, "%s %0.3f", coco_names[dets[i].cls], dets[i].score);
+    cv::putText(cloned, content, cv::Point(x1, y1),
+                cv::FONT_HERSHEY_DUPLEX, 1.0, cv::Scalar(0, 0, 255), 2);
+  }
+
+  // save or show picture
+  cv::imwrite(argv[3], cloned);
+
+  printf("------\n");
+  printf("%d objects are detected\n", det_num);
+  printf("------\n");
+
+  CVI_NN_CleanupModel(model);
+  printf("CVI_NN_CleanupModel succeeded\n");
+  free(output_shape);
+  return 0;
+}
diff --git a/cviruntime/samples/samples_extra/detector_yolov3_fused_preprocess/CMakeLists.txt b/cviruntime/samples/samples_extra/detector_yolov3_fused_preprocess/CMakeLists.txt
new file mode 100644 index 000000000..0c26e5867
--- /dev/null
+++ b/cviruntime/samples/samples_extra/detector_yolov3_fused_preprocess/CMakeLists.txt
@@ -0,0 +1,40 @@
+cmake_minimum_required(VERSION 2.8.0)
+
+project(cvi_sample_detector C CXX)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS ON)
+
+if(NOT DEFINED TPU_SDK_PATH)
+  message(FATAL_ERROR "Please set TPU_SDK_PATH to point to the TPU_SDK installation")
+endif()
+include_directories(${TPU_SDK_PATH}/include)
+link_directories(${TPU_SDK_PATH}/lib)
+
+if(NOT DEFINED OPENCV_PATH)
+  message(FATAL_ERROR "Please set OPENCV_PATH to point to the opencv installation")
+endif()
+include_directories(${OPENCV_PATH}/include)
+link_directories(${OPENCV_PATH}/lib)
+
+set(CVI_LIBS ${CVI_LIBS} cviruntime cvikernel)
+if(NOT CMAKE_CROSSCOMPILING)
+  set(CVI_LIBS ${CVI_LIBS} cvicmodel)
+endif()
+
+set(OPENCV_LIBS ${OPENCV_LIBS} opencv_core opencv_imgcodecs opencv_imgproc)
+if(NOT CMAKE_CROSSCOMPILING)
+  set(OPENCV_LIBS ${OPENCV_LIBS} opencv_highgui)
+endif()
+
+set(EXTRA_LIBS ${EXTRA_LIBS} dl stdc++ pthread z)
+
+add_executable(cvi_sample_detector_yolo_v3_fused_preprocess
+    detector_yolov3_fused_preprocess.cpp)
+target_link_libraries(cvi_sample_detector_yolo_v3_fused_preprocess
+    ${CVI_LIBS}
+    ${OPENCV_LIBS}
+    ${EXTRA_LIBS})
+install(TARGETS cvi_sample_detector_yolo_v3_fused_preprocess
+        DESTINATION samples_extra/bin)
\ No newline at end of file
diff --git a/cviruntime/samples/samples_extra/detector_yolov3_fused_preprocess/README.md b/cviruntime/samples/samples_extra/detector_yolov3_fused_preprocess/README.md
new file mode 100644 index 000000000..f0b7163be
--- /dev/null
+++ b/cviruntime/samples/samples_extra/detector_yolov3_fused_preprocess/README.md
@@ -0,0 +1,118 @@
+# Yolov3 Sample without post_process
+
+### Download the model and convert the model under docker (optional)
+#### For new toolchain guide
+The following documents are required:
+* tpu-mlir_xxxx.tar.gz (The release package of tpu-mlir)
+
+Transform cvimodel shell:
+``` shell
+tar zxf tpu-mlir_xxxx.tar.gz
+source tpu-mlir_xxxx/envsetup.sh
+
+mkdir workspace && cd workspace
+cp $TPUC_ROOT/regression/image/dog.jpg .
+cp -rf $TPUC_ROOT/regression/dataset/COCO2017 .
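+# Note: yolov3_416_with_detection.prototxt already embeds the YOLO detection
+# layer, so the compiled cvimodel emits final boxes in its "output" tensor and
+# the C++ sample only keeps rows whose score is > 0 (no anchor decode or NMS
+# on the CPU side).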
+
+model_transform.py \
+--model_name yolov3 \
+--model_def ./yolov3_416_with_detection.prototxt \
+--model_data ./yolov3_416.caffemodel \
+--test_input ./dog.jpg \
+--test_result yolov3_top_output.npz \
+--input_shapes [[1,3,416,416]] \
+--resize_dims 416,416 \
+--keep_aspect_ratio true \
+--mean 0,0,0 \
+--scale 0.00392,0.00392,0.00392 \
+--pixel_format "rgb" \
+--tolerance 0.99,0.99 \
+--excepts output \
+--mlir yolov3.mlir
+
+run_calibration.py \
+yolov3.mlir \
+--dataset=./COCO2017 \
+--input_num=100 \
+-o yolov3_calibration_table
+
+model_deploy.py \
+--mlir yolov3.mlir \
+--calibration_table yolov3_calibration_table \
+--chip cv183x \
+--quantize INT8 \
+--quant_input \
+--test_input ./dog.jpg \
+--test_reference yolov3_top_output.npz \
+--excepts output \
+--tolerance 0.9,0.3 \
+--fuse_preprocess \
+--customization_format RGB_PLANAR \
+--model yolo_v3_416_fused_preprocess_with_detection.cvimodel
+```
+
+
+
+#### For old toolchain guide
+The following documents are required:
+
+* cvitek_mlir_ubuntu-18.04.tar.gz
+
+Transform model shell:
+``` shell
+tar zxf cvitek_mlir_ubuntu-18.04.tar.gz
+source cvitek_mlir/cvitek_envs.sh
+
+mkdir workspace && cd workspace
+cp $MLIR_PATH/tpuc/regression/data/dog.jpg .
+cp -rf $MLIR_PATH/tpuc/regression/data/images .
+
+model_transform.py \
+  --model_type caffe \
+  --model_name yolov3_416 \
+  --model_def ./yolov3_416_with_detection.prototxt \
+  --model_data ./yolov3_416.caffemodel \
+  --image ./dog.jpg \
+  --image_resize_dims 416,416 \
+  --keep_aspect_ratio true \
+  --raw_scale 1 \
+  --model_channel_order "rgb" \
+  --tolerance 0.99,0.99,0.99 \
+  --excepts output \
+  --mlir yolov3_416_fp32.mlir
+
+run_calibration.py \
+yolov3_416_fp32.mlir \
+--dataset=./images \
+--input_num=100 \
+-o yolo_v3_calibration_table_autotune
+
+model_deploy.py \
+--model_name yolov3_416 \
+--mlir yolov3_416_fp32.mlir \
+--calibration_table yolo_v3_calibration_table_autotune \
+--fuse_preprocess \
+--pixel_format RGB_PLANAR \
+--aligned_input false \
+--excepts output \
+--chip cv183x \
+--quantize INT8 \
+--image dog.jpg \
+--tolerance 0.9,0.9,0.3 \
+--correctness 0.95,0.95,0.9 \
+--cvimodel yolo_v3_416_fused_preprocess_with_detection.cvimodel
+```
+
+Copy generated yolo_v3_416_fused_preprocess_with_detection.cvimodel to EVB board
+
+## How To Compile Vpss input Sample In Docker
+View the Top level directory README.md
+
+## Run Samples In EVB Board
+```
+cd install_samples/samples_extra
+./bin/cvi_sample_detector_yolo_v3_fused_preprocess \
+./yolo_v3_416_fused_preprocess_with_detection.cvimodel \
+./data/dog.jpg \
+yolo_v3_out.jpg
+```
\ No newline at end of file
diff --git a/cviruntime/samples/samples_extra/detector_yolov3_fused_preprocess/detector_yolov3_fused_preprocess.cpp b/cviruntime/samples/samples_extra/detector_yolov3_fused_preprocess/detector_yolov3_fused_preprocess.cpp
new file mode 100644 index 000000000..2101d2c97
--- /dev/null
+++ b/cviruntime/samples/samples_extra/detector_yolov3_fused_preprocess/detector_yolov3_fused_preprocess.cpp
@@ -0,0 +1,208 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+#include <algorithm>
+#include <opencv2/opencv.hpp>
+#include "cviruntime.h"
+
+// #define SAVE_FILE_FOR_DEBUG
+// #define DO_IMSHOW
+
+#define MAX_DET 200
+
+typedef struct {
+  float x, y, w, h;
+} box;
+
+typedef struct {
+  box bbox;
+  int cls;
+  float score;
+} detection;
+
+static const char *coco_names[] = {
+    "person", "bicycle", "car", "motorbike",
+    "aeroplane", "bus", "train", "truck",
+    "boat", "traffic light", "fire hydrant", "stop sign",
+    "parking meter", "bench", "bird", "cat",
"dog", "horse", "sheep", "cow", + "elephant", "bear", "zebra", "giraffe", + "backpack", "umbrella", "handbag", "tie", + "suitcase", "frisbee", "skis", "snowboard", + "sports ball", "kite", "baseball bat", "baseball glove", + "skateboard", "surfboard", "tennis racket", "bottle", + "wine glass", "cup", "fork", "knife", + "spoon", "bowl", "banana", "apple", + "sandwich", "orange", "broccoli", "carrot", + "hot dog", "pizza", "donut", "cake", + "chair", "sofa", "pottedplant", "bed", + "diningtable", "toilet", "tvmonitor", "laptop", + "mouse", "remote", "keyboard", "cell phone", + "microwave", "oven", "toaster", "sink", + "refrigerator", "book", "clock", "vase", + "scissors", "teddy bear", "hair drier", "toothbrush"}; + +static void usage(char **argv) { + printf("Usage:\n"); + printf(" %s cvimodel image.jpg image_detected.jpg\n", argv[0]); +} + +int main(int argc, char **argv) { + int ret = 0; + CVI_MODEL_HANDLE model; + + if (argc != 4) { + usage(argv); + exit(-1); + } + CVI_TENSOR *input; + CVI_TENSOR *output; + CVI_TENSOR *input_tensors; + CVI_TENSOR *output_tensors; + int32_t input_num; + int32_t output_num; + CVI_SHAPE shape; + int32_t height; + int32_t width; + + ret = CVI_NN_RegisterModel(argv[1], &model); + if (ret != CVI_RC_SUCCESS) { + printf("CVI_NN_RegisterModel failed, err %d\n", ret); + exit(1); + } + printf("CVI_NN_RegisterModel succeeded\n"); + + // get input output tensors + CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num, &output_tensors, + &output_num); + + input = CVI_NN_GetTensorByName(CVI_NN_DEFAULT_TENSOR, input_tensors, input_num); + assert(input); + output = CVI_NN_GetTensorByName("output", output_tensors, output_num); + assert(output); + + // nchw + shape = CVI_NN_TensorShape(input); + height = shape.dim[2]; + width = shape.dim[3]; + + // imread + cv::Mat image; + image = cv::imread(argv[2]); + if (!image.data) { + printf("Could not open or find the image\n"); + return -1; + } + cv::Mat cloned = image.clone(); + + detection dets[MAX_DET]; + int32_t det_num = 0; + + /* preprocess */ + int ih = image.rows; + int iw = image.cols; + int oh = height; + int ow = width; + double scale = std::min((double)oh / ih, (double)ow / iw); + int nh = (int)(ih * scale); + int nw = (int)(iw * scale); + // resize & letterbox + cv::resize(image, image, cv::Size(nw, nh)); + int top = (oh - nh) / 2; + int bottom = (oh - nh) - top; + int left = (ow - nw) / 2; + int right = (ow - nw) - left; + cv::copyMakeBorder(image, image, top, bottom, left, right, cv::BORDER_CONSTANT, + cv::Scalar::all(0)); + cv::cvtColor(image, image, cv::COLOR_BGR2RGB); + + //Packed2Planar + cv::Mat channels[3]; + for (int i = 0; i < 3; i++) { + channels[i] = cv::Mat(image.rows, image.cols, CV_8SC1); + } + cv::split(image, channels); + + // fill data + int8_t *ptr = (int8_t *)CVI_NN_TensorPtr(input); + int channel_size = height * width; + for (int i = 0; i < 3; ++i) { + memcpy(ptr + i * channel_size, channels[i].data, channel_size); + } + + /* run inference */ + CVI_NN_Forward(model, input_tensors, input_num, output_tensors, output_num); + printf("CVI_NN_Forward succeeded\n"); + + /* post process */ + float *output_ptr = (float *)CVI_NN_TensorPtr(output); + for (int i = 0; i < MAX_DET; ++i) { + // filter real det with score > 0 + if (output_ptr[i * 6 + 5] > 0) { + // output: [x,y,w,h,cls,score] + dets[det_num].bbox.x = output_ptr[i * 6 + 0]; + dets[det_num].bbox.y = output_ptr[i * 6 + 1]; + dets[det_num].bbox.w = output_ptr[i * 6 + 2]; + dets[det_num].bbox.h = output_ptr[i * 6 + 3]; + dets[det_num].cls = 
output_ptr[i * 6 + 4];
+      dets[det_num].score = output_ptr[i * 6 + 5];
+      det_num++;
+    }
+  }
+  printf("get detection num: %d\n", det_num);
+
+  // correct box with origin image size
+  int restored_w = 0;
+  int restored_h = 0;
+  bool relative_position = false;
+  if (((float)width / cloned.cols) < ((float)height / cloned.rows)) {
+    restored_w = width;
+    restored_h = (cloned.rows * width) / cloned.cols;
+  } else {
+    restored_h = height;
+    restored_w = (cloned.cols * height) / cloned.rows;
+  }
+  for (int i = 0; i < det_num; ++i) {
+    box b = dets[i].bbox;
+    b.x = (b.x - (width - restored_w) / 2. / width) /
+          ((float)restored_w / width);
+    b.y = (b.y - (height - restored_h) / 2. / height) /
+          ((float)restored_h / height);
+    b.w *= (float)width / restored_w;
+    b.h *= (float)height / restored_h;
+    if (!relative_position) {
+      b.x *= cloned.cols;
+      b.w *= cloned.cols;
+      b.y *= cloned.rows;
+      b.h *= cloned.rows;
+    }
+    dets[i].bbox = b;
+  }
+
+  /* draw bbox on image */
+  for (int i = 0; i < det_num; i++) {
+    box b = dets[i].bbox;
+    // xywh2xyxy
+    int x1 = (b.x - b.w / 2);
+    int y1 = (b.y - b.h / 2);
+    int x2 = (b.x + b.w / 2);
+    int y2 = (b.y + b.h / 2);
+    cv::rectangle(cloned, cv::Point(x1, y1), cv::Point(x2, y2), cv::Scalar(255, 255, 0),
+                  3, 8, 0);
+    cv::putText(cloned, coco_names[dets[i].cls], cv::Point(x1, y1),
+                cv::FONT_HERSHEY_DUPLEX, 1.0, cv::Scalar(0, 0, 255), 2);
+  }
+
+  // save or show picture
+  cv::imwrite(argv[3], cloned);
+
+  printf("------\n");
+  printf("%d objects are detected\n", det_num);
+  printf("------\n");
+
+  CVI_NN_CleanupModel(model);
+  printf("CVI_NN_CleanupModel succeeded\n");
+  return 0;
+}
diff --git a/cviruntime/samples/samples_extra/detector_yolov5-face_fused_preprocess/CMakeLists.txt b/cviruntime/samples/samples_extra/detector_yolov5-face_fused_preprocess/CMakeLists.txt
new file mode 100644 index 000000000..7893bed7e
--- /dev/null
+++ b/cviruntime/samples/samples_extra/detector_yolov5-face_fused_preprocess/CMakeLists.txt
@@ -0,0 +1,39 @@
+cmake_minimum_required(VERSION 2.8.0)
+
+project(cvi_sample_detector C CXX)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS ON)
+
+if(NOT DEFINED TPU_SDK_PATH)
+  message(FATAL_ERROR "Please set TPU_SDK_PATH to point to the TPU_SDK installation")
+endif()
+include_directories(${TPU_SDK_PATH}/include)
+link_directories(${TPU_SDK_PATH}/lib)
+
+if(NOT DEFINED OPENCV_PATH)
+  message(FATAL_ERROR "Please set OPENCV_PATH to point to the opencv installation")
+endif()
+include_directories(${OPENCV_PATH}/include)
+link_directories(${OPENCV_PATH}/lib)
+
+set(CVI_LIBS ${CVI_LIBS} cviruntime cvikernel)
+if(NOT CMAKE_CROSSCOMPILING)
+  set(CVI_LIBS ${CVI_LIBS} cvicmodel)
+endif()
+
+set(OPENCV_LIBS ${OPENCV_LIBS} opencv_core opencv_imgcodecs opencv_imgproc)
+if(NOT CMAKE_CROSSCOMPILING)
+  set(OPENCV_LIBS ${OPENCV_LIBS} opencv_highgui)
+endif()
+
+set(EXTRA_LIBS ${EXTRA_LIBS} dl stdc++ pthread z)
+
+add_executable(cvi_sample_detector_yolov5-face_fused_preprocess
+    detector_yolov5-face_fused_preprocess.cpp)
+target_link_libraries(cvi_sample_detector_yolov5-face_fused_preprocess
+    ${CVI_LIBS}
+    ${OPENCV_LIBS}
+    ${EXTRA_LIBS})
+install(TARGETS cvi_sample_detector_yolov5-face_fused_preprocess DESTINATION samples_extra/bin)
diff --git a/cviruntime/samples/samples_extra/detector_yolov5-face_fused_preprocess/README.md b/cviruntime/samples/samples_extra/detector_yolov5-face_fused_preprocess/README.md
new file mode 100644 index 000000000..92461db8a
--- /dev/null
+++
b/cviruntime/samples/samples_extra/detector_yolov5-face_fused_preprocess/README.md
@@ -0,0 +1,111 @@
+# Yolov5s-face Sample with post_process
+
+### Download the model and convert the model under docker (optional)
+#### For new toolchain guide
+The following documents are required:
+* tpu-mlir_xxxx.tar.gz (The release package of tpu-mlir)
+
+Transform cvimodel shell:
+``` shell
+tar zxf tpu-mlir_xxxx.tar.gz
+source tpu-mlir_xxxx/envsetup.sh
+
+mkdir workspace && cd workspace
+cp $TPUC_ROOT/regression/image/parade.jpg .
+cp -rf $TPUC_ROOT/regression/dataset/WIDER .
+
+model_transform.py \
+--model_name yolov5s-face \
+--model_def ./yolov5s-face.onnx \
+--test_input ./parade.jpg \
+--test_result yolov5s-face_top_output.npz \
+--input_shapes [[1,3,640,640]] \
+--resize_dims 640,640 \
+--mean 0,0,0 \
+--scale 0.00392,0.00392,0.00392 \
+--pixel_format "rgb" \
+--tolerance 0.99,0.99 \
+--mlir yolov5s-face.mlir
+
+run_calibration.py \
+yolov5s-face.mlir \
+--dataset=./WIDER \
+--input_num=100 \
+-o yolov5s-face_calibration_table
+
+model_deploy.py \
+--mlir yolov5s-face.mlir \
+--calibration_table yolov5s-face_calibration_table \
+--chip cv183x \
+--quantize INT8 \
+--quant_input \
+--test_input ./parade.jpg \
+--test_reference yolov5s-face_top_output.npz \
+--tolerance 0.9,0.6 \
+--fuse_preprocess \
+--model yolov5s-face_fused_preprocess.cvimodel
+```
+
+#### For old toolchain guide
+The following documents are required:
+
+* cvitek_mlir_ubuntu-18.04.tar.gz
+
+Transform model shell:
+``` shell
+tar zxf cvitek_mlir_ubuntu-18.04.tar.gz
+source cvitek_mlir/cvitek_envs.sh
+
+mkdir workspace && cd workspace
+cp $MLIR_PATH/tpuc/regression/data/parade.jpg .
+# set your own calibration dataset, this is an example
+cp -rf $MLIR_PATH/tpuc/regression/data/images .
+
+model_transform.py \
+  --model_type onnx \
+  --model_name yolov5s-face \
+  --model_def yolov5s-face.onnx \
+  --image parade.jpg \
+  --image_resize_dims 640,640 \
+  --net_input_dims 640,640 \
+  --keep_aspect_ratio true \
+  --raw_scale 1.0 \
+  --mean 0.,0.,0. \
+  --std 1.,1.,1.
\
+  --input_scale 1.0 \
+  --model_channel_order "rgb" \
+  --tolerance 0.99,0.99,0.99 \
+  --mlir yolov5s-face.mlir
+
+run_calibration.py \
+yolov5s-face.mlir \
+--dataset=./images \
+--input_num=100 \
+-o yolov5s-face_calibration_table
+
+model_deploy.py \
+  --model_name yolov5s-face \
+  --mlir yolov5s-face.mlir \
+  --calibration_table yolov5s-face_calibration_table \
+  --quantize INT8 \
+  --chip cv183x \
+  --image parade.jpg \
+  --fuse_preprocess \
+  --tolerance 0.9,0.9,0.7 \
+  --correctness 0.99,0.99,0.93 \
+  --cvimodel yolov5s-face_fused_preprocess.cvimodel
+```
+
+Copy generated yolov5s-face_fused_preprocess.cvimodel to EVB board
+
+## How To Compile Vpss input Sample In Docker
+View the Top level directory README.md
+
+## Run Samples In EVB Board
+```
+cd install_samples/samples_extra
+./bin/cvi_sample_detector_yolov5-face_fused_preprocess \
+./yolov5s-face_fused_preprocess.cvimodel \
+./data/parade.jpg \
+out.jpg
+```
\ No newline at end of file
diff --git a/cviruntime/samples/samples_extra/detector_yolov5-face_fused_preprocess/detector_yolov5-face_fused_preprocess.cpp b/cviruntime/samples/samples_extra/detector_yolov5-face_fused_preprocess/detector_yolov5-face_fused_preprocess.cpp
new file mode 100644 index 000000000..544738d8d
--- /dev/null
+++ b/cviruntime/samples/samples_extra/detector_yolov5-face_fused_preprocess/detector_yolov5-face_fused_preprocess.cpp
@@ -0,0 +1,351 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+#include <vector>
+#include <algorithm>
+#include <utility>
+#include <opencv2/opencv.hpp>
+#include "cviruntime.h"
+
+#define MAX_DET 20000
+
+typedef struct {
+  float x, y, w, h;
+} box;
+
+typedef struct {
+  box bbox;
+  int cls;
+  float score;
+  int batch_idx;
+  std::vector<std::pair<float, float>> vlandmark;
+} detection;
+
+typedef struct {
+  CVI_TENSOR *output;
+  int num_anchors, h, w, bbox_len, batch = 1, layer_idx;
+} detectLayer;
+
+static float anchors_[3][3][2] = {{{4, 5}, {8, 10}, {13, 16}},
+                                  {{23, 29}, {43, 55}, {73, 105}},
+                                  {{146, 217}, {231, 300}, {335, 433}}};
+
+static void usage(char **argv) {
+  printf("Usage:\n");
+  printf("   %s cvimodel image.jpg image_detected.jpg\n", argv[0]);
+}
+
+static float sigmoid(float x)
+{
+  return 1.0 / (1 + expf(-x));
+}
+
+float calIou(box a, box b)
+{
+  float area1 = a.w * a.h;
+  float area2 = b.w * b.h;
+  float wi = std::min((a.x + a.w / 2), (b.x + b.w / 2)) - std::max((a.x - a.w / 2), (b.x - b.w / 2));
+  float hi = std::min((a.y + a.h / 2), (b.y + b.h / 2)) - std::max((a.y - a.h / 2), (b.y - b.h / 2));
+  float area_i = std::max(wi, 0.0f) * std::max(hi, 0.0f);
+  return area_i / (area1 + area2 - area_i);
+}
+
+void correctYoloBoxes(detection *dets,
+                      int det_num,
+                      int image_h,
+                      int image_w,
+                      int input_height,
+                      int input_width) {
+  int restored_w;
+  int restored_h;
+  float resize_ratio;
+  if (((float)input_width / image_w) < ((float)input_height / image_h)) {
+    restored_w = input_width;
+    restored_h = (image_h * input_width) / image_w;
+  } else {
+    restored_h = input_height;
+    restored_w = (image_w * input_height) / image_h;
+  }
+  resize_ratio = ((float)image_w / restored_w);
+
+  for (int i = 0; i < det_num; ++i) {
+    box bbox = dets[i].bbox;
+    int b = dets[i].batch_idx;
+    std::vector<std::pair<float, float>> &vlandmark = dets[i].vlandmark;
+    bbox.x = (bbox.x - (input_width - restored_w) / 2.) * resize_ratio;
+    bbox.y = (bbox.y - (input_height - restored_h) / 2.) * resize_ratio;
+    bbox.w *= resize_ratio;
+    bbox.h *= resize_ratio;
+    dets[i].bbox = bbox;
+    for (uint32_t j = 0; j < vlandmark.size(); ++j) {
+      vlandmark[j].first =
+          (vlandmark[j].first - (input_width - restored_w) / 2.) * resize_ratio;
+      vlandmark[j].second =
+          (vlandmark[j].second - (input_height - restored_h) / 2.) * resize_ratio;
+    }
+  }
+}
+
+void NMS(detection *dets, int *total, float thresh)
+{
+  if (*total){
+    std::sort(dets, dets + *total, [](detection &a, detection &b)
+              { return b.score < a.score; });
+    int new_count = *total;
+    for (int i = 0; i < *total; ++i)
+    {
+      detection &a = dets[i];
+      if (a.score == 0)
+        continue;
+      for (int j = i + 1; j < *total; ++j)
+      {
+        detection &b = dets[j];
+        if (dets[i].batch_idx == dets[j].batch_idx &&
+            b.score != 0 && dets[i].cls == dets[j].cls &&
+            calIou(a.bbox, b.bbox) > thresh)
+        {
+          b.score = 0;
+          new_count--;
+        }
+      }
+    }
+    for (int i = 0, j = 0 ; i < *total && j < new_count; ++i) {
+      detection &a = dets[i];
+      if (a.score == 0)
+        continue;
+      dets[j] = dets[i];
+      ++j;
+    }
+    *total = new_count;
+  }
+}
+
+/**
+ * @brief
+ *
+ * @note work as long as output shape [n, a, h, w, cls + 5]
+ * @param layer
+ * @param input_height
+ * @param input_width
+ * @param classes_num
+ * @param conf_thresh
+ * @param dets
+ * @return int
+ */
+int getDetections(detectLayer *layer,
+                  int32_t input_height,
+                  int32_t input_width,
+                  int classes_num,
+                  float conf_thresh,
+                  detection *dets) {
+  CVI_TENSOR *output = layer->output;
+  float *output_ptr = (float *)CVI_NN_TensorPtr(output);
+  int count = 0;
+  int w_stride = layer->bbox_len;
+  int h_stride = layer->w * w_stride;
+  int a_stride = layer->h * h_stride;
+  int b_stride = layer->num_anchors * a_stride;
+  float down_stride = input_width / layer->w;
+  for (int b = 0; b < layer->batch; b++) {
+    for (int a = 0; a < layer->num_anchors; ++a) {
+      for (int i = 0; i < layer->w * layer->h; ++i) {
+        int col = i % layer->w;
+        int row = i / layer->w;
+        float *obj = output_ptr + b * b_stride + a * a_stride + row * h_stride + col * w_stride + 4;
+        // -4,-3,-2,-1 box_xywh
+        // 0     obj_conf
+        // 1,2   landmark x1 y1
+        // 3,4   landmark x2 y2
+        // 5,6   landmark x3 y3
+        // 7,8   landmark x4 y4
+        // 9,10  landmark x5 y5
+        // 11    cls_conf
+        float objectness = sigmoid(obj[0]);
+        if (objectness <= conf_thresh) {
+          continue;
+        }
+        objectness *= sigmoid(obj[11]);
+
+        float x = *(obj - 4);
+        float y = *(obj - 3);
+        float w = *(obj - 2);
+        float h = *(obj - 1);
+
+        dets[count].score = objectness;
+        dets[count].cls = 0;
+        dets[count].batch_idx = b;
+
+        dets[count].bbox.x = (sigmoid(x) * 2 + col - 0.5) * down_stride;
+        dets[count].bbox.y = (sigmoid(y) * 2 + row - 0.5) * down_stride;
+        dets[count].bbox.w = pow(sigmoid(w) * 2, 2) * anchors_[layer->layer_idx][a][0];
+        dets[count].bbox.h = pow(sigmoid(h) * 2, 2) * anchors_[layer->layer_idx][a][1];
+        // store landmark
+        std::vector<std::pair<float, float>> vlandmark;
+        for (int i = 1; i < 6; ++i) {
+          float lx = *(obj + i * 2 - 1) * anchors_[layer->layer_idx][a][0] + col * down_stride;
+          float ly = *(obj + i * 2) * anchors_[layer->layer_idx][a][1] + row * down_stride;
+          vlandmark.push_back(std::make_pair(lx, ly));
+        }
+        dets[count].vlandmark = vlandmark;
+        ++count;
+      }
+    }
+  }
+  return count;
+}
+
+
+int main(int argc, char **argv) {
+  int ret = 0;
+  CVI_MODEL_HANDLE model;
+
+  if (argc != 4) {
+    usage(argv);
+    exit(-1);
+  }
+  CVI_TENSOR *input;
+  CVI_TENSOR *output;
+  CVI_TENSOR *input_tensors;
+  CVI_TENSOR *output_tensors;
+  int32_t input_num;
+  int32_t output_num;
+  CVI_SHAPE input_shape;
+  CVI_SHAPE* output_shape;
+  int32_t height;
+  int32_t width;
+  float qscale;
+  int bbox_len = 16;
+  int classes_num = 1;
+  float conf_thresh = 0.6;
+  float iou_thresh = 0.5;
+
+  ret = CVI_NN_RegisterModel(argv[1], &model);
+  if (ret != CVI_RC_SUCCESS) {
+    printf("CVI_NN_RegisterModel failed, err %d\n", ret);
+    exit(1);
+  }
+  printf("CVI_NN_RegisterModel succeeded\n");
+
+  // get input output tensors
+  CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num, &output_tensors,
+                               &output_num);
+
+  input = CVI_NN_GetTensorByName(CVI_NN_DEFAULT_TENSOR, input_tensors, input_num);
+  assert(input);
+  output = output_tensors;
+
+  output_shape = reinterpret_cast<CVI_SHAPE *>(calloc(output_num, sizeof(CVI_SHAPE)));
+  for (int i = 0; i < output_num; i++)
+  {
+    output_shape[i] = CVI_NN_TensorShape(&output[i]);
+  }
+
+  // nchw
+  input_shape = CVI_NN_TensorShape(input);
+  height = input_shape.dim[2];
+  width = input_shape.dim[3];
+
+  // imread
+  cv::Mat image;
+  image = cv::imread(argv[2]);
+  if (!image.data) {
+    printf("Could not open or find the image\n");
+    return -1;
+  }
+  cv::Mat cloned = image.clone();
+
+  // resize & letterbox
+  int ih = image.rows;
+  int iw = image.cols;
+  int oh = height;
+  int ow = width;
+  double resize_scale = std::min((double)oh / ih, (double)ow / iw);
+  int nh = (int)(ih * resize_scale);
+  int nw = (int)(iw * resize_scale);
+  cv::resize(image, image, cv::Size(nw, nh));
+  int top = (oh - nh) / 2;
+  int bottom = (oh - nh) - top;
+  int left = (ow - nw) / 2;
+  int right = (ow - nw) - left;
+  cv::copyMakeBorder(image, image, top, bottom, left, right, cv::BORDER_CONSTANT,
+                     cv::Scalar::all(0));
+  cv::cvtColor(image, image, cv::COLOR_BGR2RGB);
+
+  //Packed2Planar
+  cv::Mat channels[3];
+  for (int i = 0; i < 3; i++) {
+    channels[i] = cv::Mat(image.rows, image.cols, CV_8SC1);
+  }
+  cv::split(image, channels);
+
+  // fill data
+  int8_t *ptr = (int8_t *)CVI_NN_TensorPtr(input);
+  int channel_size = height * width;
+  for (int i = 0; i < 3; ++i) {
+    memcpy(ptr + i * channel_size, channels[i].data, channel_size);
+  }
+
+  // run inference
+  CVI_NN_Forward(model, input_tensors, input_num, output_tensors, output_num);
+
+  // do post process
+  int det_num = 0;
+  int count = 0;
+  detection dets[MAX_DET];
+  std::vector<detectLayer> layers;
+  detection * dets_ptr = dets;
+
+  int stride[3] = {8, 16, 32};
+  // for each detect layer
+  for (int i = 0; i < output_num; i++)
+  {
+    // layer init
+    detectLayer layer;
+    layer.output = &output[i];
+    layer.bbox_len = bbox_len;
+    layer.num_anchors = output_shape[i].dim[1];
+    layer.h = output_shape[i].dim[2];
+    layer.w = (int)(output_shape[i].dim[3] / bbox_len);
+    layer.layer_idx = i;
+    layers.push_back(layer);
+
+    count = getDetections(&layer, height, width,
+                          classes_num, conf_thresh, dets_ptr);
+    det_num += count;
+    dets_ptr += count;
+    float *output_ptr = (float *)CVI_NN_TensorPtr(&output[i]);
+  }
+  // correct box with origin image size
+  NMS(dets, &det_num, iou_thresh);
+  correctYoloBoxes(dets, det_num, cloned.rows, cloned.cols, height, width);
+  printf("get detection num: %d\n", det_num);
+
+  // draw bbox on image
+  for (int i = 0; i < det_num; i++) {
+    box b = dets[i].bbox;
+    // xywh2xyxy
+    int x1 = (b.x - b.w / 2);
+    int y1 = (b.y - b.h / 2);
+    int x2 = (b.x + b.w / 2);
+    int y2 = (b.y + b.h / 2);
+    cv::rectangle(cloned, cv::Point(x1, y1), cv::Point(x2, y2), cv::Scalar(255, 255, 0),
+                  3, 8, 0);
+    std::vector<std::pair<float, float>> vlandmark = dets[i].vlandmark;
+    for (uint32_t j = 0; j < vlandmark.size(); ++j) {
+      cv::circle(cloned, cv::Point((int)vlandmark[j].first, (int)vlandmark[j].second), 1,
+                 cv::Scalar(0, 255, 255), 8, 0);
+    }
+  }
+
+  // save or show picture
+  cv::imwrite(argv[3], cloned);
+
+  printf("------\n");
+  printf("%d objects are detected\n", det_num);
+  printf("------\n");
+
+  CVI_NN_CleanupModel(model);
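+  // output_shape was calloc'ed by this sample and is freed below; the tensor
+  // arrays come from CVI_NN_GetInputOutputTensors and, as in the other
+  // samples, are left to the runtime to release.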
+  printf("CVI_NN_CleanupModel succeeded\n");
+  free(output_shape);
+  return 0;
+}
diff --git a/cviruntime/samples/samples_extra/detector_yolov5_fused_preprocess/CMakeLists.txt b/cviruntime/samples/samples_extra/detector_yolov5_fused_preprocess/CMakeLists.txt
new file mode 100644 index 000000000..01506f4f7
--- /dev/null
+++ b/cviruntime/samples/samples_extra/detector_yolov5_fused_preprocess/CMakeLists.txt
@@ -0,0 +1,39 @@
+cmake_minimum_required(VERSION 2.8.0)
+
+project(cvi_sample_detector C CXX)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS ON)
+
+if(NOT DEFINED TPU_SDK_PATH)
+  message(FATAL_ERROR "Please set TPU_SDK_PATH to point to the TPU_SDK installation")
+endif()
+include_directories(${TPU_SDK_PATH}/include)
+link_directories(${TPU_SDK_PATH}/lib)
+
+if(NOT DEFINED OPENCV_PATH)
+  message(FATAL_ERROR "Please set OPENCV_PATH to point to the opencv installation")
+endif()
+include_directories(${OPENCV_PATH}/include)
+link_directories(${OPENCV_PATH}/lib)
+
+set(CVI_LIBS ${CVI_LIBS} cviruntime cvikernel)
+if(NOT CMAKE_CROSSCOMPILING)
+  set(CVI_LIBS ${CVI_LIBS} cvicmodel)
+endif()
+
+set(OPENCV_LIBS ${OPENCV_LIBS} opencv_core opencv_imgcodecs opencv_imgproc)
+if(NOT CMAKE_CROSSCOMPILING)
+  set(OPENCV_LIBS ${OPENCV_LIBS} opencv_highgui)
+endif()
+
+set(EXTRA_LIBS ${EXTRA_LIBS} dl stdc++ pthread z)
+
+add_executable(cvi_sample_detector_yolo_v5_fused_preprocess
+    detector_yolov5_fused_preprocess.cpp)
+target_link_libraries(cvi_sample_detector_yolo_v5_fused_preprocess
+    ${CVI_LIBS}
+    ${OPENCV_LIBS}
+    ${EXTRA_LIBS})
+install(TARGETS cvi_sample_detector_yolo_v5_fused_preprocess DESTINATION samples_extra/bin)
\ No newline at end of file
diff --git a/cviruntime/samples/samples_extra/detector_yolov5_fused_preprocess/README.md b/cviruntime/samples/samples_extra/detector_yolov5_fused_preprocess/README.md
new file mode 100644 index 000000000..44381463c
--- /dev/null
+++ b/cviruntime/samples/samples_extra/detector_yolov5_fused_preprocess/README.md
@@ -0,0 +1,111 @@
+# Yolov5s Sample with post_process
+
+### Download the model and convert the model under docker (optional)
+
+#### For new toolchain guide
+The following documents are required:
+* tpu-mlir_xxxx.tar.gz (The release package of tpu-mlir)
+
+Transform cvimodel shell:
+``` shell
+tar zxf tpu-mlir_xxxx.tar.gz
+source tpu-mlir_xxxx/envsetup.sh
+
+mkdir workspace && cd workspace
+cp $TPUC_ROOT/regression/image/dog.jpg .
+cp -rf $TPUC_ROOT/regression/dataset/COCO2017 .
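+# Note (assumption about this particular yolov5s export): 326, 474 and 622
+# below are the three raw detect-head tensors; cutting the graph there leaves
+# the sigmoid/anchor decode and NMS to the C++ sample.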
+
+model_transform.py \
+--model_name yolov5s \
+--model_def ./yolov5s.onnx \
+--test_input ./dog.jpg \
+--test_result yolov5s_top_output.npz \
+--input_shapes [[1,3,640,640]] \
+--output_names 326,474,622 \
+--resize_dims 640,640 \
+--mean 0,0,0 \
+--scale 0.00392,0.00392,0.00392 \
+--pixel_format "rgb" \
+--tolerance 0.99,0.99 \
+--mlir yolov5s.mlir
+
+run_calibration.py \
+yolov5s.mlir \
+--dataset=./COCO2017 \
+--input_num=100 \
+-o yolov5s_calibration_table
+
+model_deploy.py \
+--mlir yolov5s.mlir \
+--calibration_table yolov5s_calibration_table \
+--chip cv183x \
+--quantize INT8 \
+--quant_input \
+--test_input ./dog.jpg \
+--test_reference yolov5s_top_output.npz \
+--tolerance 0.9,0.6 \
+--fuse_preprocess \
+--model yolov5s_fused_preprocess.cvimodel
+```
+
+#### For old toolchain guide
+The following documents are required:
+
+* cvitek_mlir_ubuntu-18.04.tar.gz
+
+Transform model shell:
+``` shell
+tar zxf cvitek_mlir_ubuntu-18.04.tar.gz
+source cvitek_mlir/cvitek_envs.sh
+
+mkdir workspace && cd workspace
+cp $MLIR_PATH/tpuc/regression/data/dog.jpg .
+cp -rf $MLIR_PATH/tpuc/regression/data/images .
+
+model_transform.py \
+  --model_type onnx \
+  --model_name yolov5s \
+  --model_def ./yolov5s_new.onnx \
+  --image ./dog.jpg \
+  --image_resize_dims 640,640 \
+  --keep_aspect_ratio true \
+  --raw_scale 1.0 \
+  --model_channel_order "rgb" \
+  --tolerance 0.99,0.99,0.99 \
+  --mlir yolov5s_fp32.mlir
+
+run_calibration.py \
+yolov5s_fp32.mlir \
+--dataset=./images \
+--input_num=100 \
+-o yolov5s_calibration_table
+
+model_deploy.py \
+--model_name yolov5s \
+--mlir yolov5s_fp32.mlir \
+--calibration_table yolov5s_calibration_table \
+--fuse_preprocess \
+--pixel_format RGB_PLANAR \
+--aligned_input false \
+--excepts output \
+--chip cv183x \
+--quantize INT8 \
+--image ./dog.jpg \
+--tolerance 0.9,0.9,0.5 \
+--correctness 0.95,0.95,0.9 \
+--cvimodel yolov5s_fused_preprocess.cvimodel
+```
+
+Copy generated yolov5s_fused_preprocess.cvimodel to EVB board
+
+## How To Compile Vpss input Sample In Docker
+View the Top level directory README.md
+
+## Run Samples In EVB Board
+```
+cd install_samples/samples_extra
+./bin/cvi_sample_detector_yolo_v5_fused_preprocess \
+./yolov5s_fused_preprocess.cvimodel \
+./data/dog.jpg \
+yolo_v5_out.jpg
+```
\ No newline at end of file
diff --git a/cviruntime/samples/samples_extra/detector_yolov5_fused_preprocess/detector_yolov5_fused_preprocess.cpp b/cviruntime/samples/samples_extra/detector_yolov5_fused_preprocess/detector_yolov5_fused_preprocess.cpp
new file mode 100644 index 000000000..3b52e41af
--- /dev/null
+++ b/cviruntime/samples/samples_extra/detector_yolov5_fused_preprocess/detector_yolov5_fused_preprocess.cpp
@@ -0,0 +1,354 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <math.h>
+#include <vector>
+#include <algorithm>
+
+#include <opencv2/opencv.hpp>
+#include "cviruntime.h"
+
+typedef struct {
+  float x, y, w, h;
+} box;
+
+typedef struct {
+  box bbox;
+  int cls;
+  float score;
+  int batch_idx;
+} detection;
+
+typedef struct {
+  CVI_TENSOR *output;
+  int num_anchors, h, w, bbox_len, batch = 1, layer_idx;
+} detectLayer;
+
+static const char *coco_names[] = {
+    "person", "bicycle", "car", "motorbike",
+    "aeroplane", "bus", "train", "truck",
+    "boat", "traffic light", "fire hydrant", "stop sign",
+    "parking meter", "bench", "bird", "cat",
+    "dog", "horse", "sheep", "cow",
+    "elephant", "bear", "zebra", "giraffe",
+    "backpack", "umbrella", "handbag", "tie",
+    "suitcase", "frisbee", "skis", "snowboard",
+    "sports ball", "kite", "baseball bat", "baseball glove",
+    "skateboard", "surfboard", "tennis racket", "bottle",
+    "wine glass", "cup", "fork", "knife",
+    "spoon", "bowl", "banana", "apple",
+    "sandwich", "orange", "broccoli", "carrot",
+    "hot dog", "pizza", "donut", "cake",
+    "chair", "sofa", "pottedplant", "bed",
+    "diningtable", "toilet", "tvmonitor", "laptop",
+    "mouse", "remote", "keyboard", "cell phone",
+    "microwave", "oven", "toaster", "sink",
+    "refrigerator", "book", "clock", "vase",
+    "scissors", "teddy bear", "hair drier", "toothbrush"};
+
+static float anchors_[3][3][2] = {{{10, 13}, {16, 30}, {33, 23}},
+                                  {{30, 61}, {62, 45}, {59, 119}},
+                                  {{116, 90}, {156, 198}, {373, 326}}};
+
+static void usage(char **argv) {
+  printf("Usage:\n");
+  printf("   %s cvimodel image.jpg image_detected.jpg\n", argv[0]);
+}
+
+template <typename T> int argmax(const T *data, size_t len, size_t stride = 1) {
+  int maxIndex = 0;
+  for (size_t i = 1; i < len; i++) {
+    int idx = i * stride;
+    if (data[maxIndex * stride] < data[idx]) {
+      maxIndex = i;
+    }
+  }
+  return maxIndex;
+}
+
+static float sigmoid(float x) { return 1.0 / (1 + expf(-x)); }
+
+float calIou(box a, box b) {
+  float area1 = a.w * a.h;
+  float area2 = b.w * b.h;
+  float wi = std::min((a.x + a.w / 2), (b.x + b.w / 2)) -
+             std::max((a.x - a.w / 2), (b.x - b.w / 2));
+  float hi = std::min((a.y + a.h / 2), (b.y + b.h / 2)) -
+             std::max((a.y - a.h / 2), (b.y - b.h / 2));
+  float area_i = std::max(wi, 0.0f) * std::max(hi, 0.0f);
+  return area_i / (area1 + area2 - area_i);
+}
+
+void correctYoloBoxes(detection *dets, int det_num, int image_h, int image_w,
+                      int input_height, int input_width) {
+  int restored_w;
+  int restored_h;
+  float resize_ratio;
+  if (((float)input_width / image_w) < ((float)input_height / image_h)) {
+    restored_w = input_width;
+    restored_h = (image_h * input_width) / image_w;
+  } else {
+    restored_h = input_height;
+    restored_w = (image_w * input_height) / image_h;
+  }
+  resize_ratio = ((float)image_w / restored_w);
+
+  for (int i = 0; i < det_num; ++i) {
+    box bbox = dets[i].bbox;
+    int b = dets[i].batch_idx;
+    bbox.x = (bbox.x - (input_width - restored_w) / 2.) * resize_ratio;
+    bbox.y = (bbox.y - (input_height - restored_h) / 2.) * resize_ratio;
+    bbox.w *= resize_ratio;
+    bbox.h *= resize_ratio;
+    dets[i].bbox = bbox;
+  }
+}
+
+void NMS(detection *dets, int *total, float thresh) {
+  if (*total) {
+    std::sort(dets, dets + *total,
+              [](detection &a, detection &b) { return b.score < a.score; });
+    int new_count = *total;
+    for (int i = 0; i < *total; ++i) {
+      detection &a = dets[i];
+      if (a.score == 0)
+        continue;
+      for (int j = i + 1; j < *total; ++j) {
+        detection &b = dets[j];
+        if (dets[i].batch_idx == dets[j].batch_idx && b.score != 0 &&
+            dets[i].cls == dets[j].cls && calIou(a.bbox, b.bbox) > thresh) {
+          b.score = 0;
+          new_count--;
+        }
+      }
+    }
+    for (int i = 0, j = 0; i < *total && j < new_count; ++i) {
+      detection &a = dets[i];
+      if (a.score == 0)
+        continue;
+      dets[j] = dets[i];
+      ++j;
+    }
+    *total = new_count;
+  }
+}
+
+/**
+ * @brief
+ *
+ * @note work as long as output shape [n, a * (cls + 5), h, w]
+ * @param layer
+ * @param input_height
+ * @param input_width
+ * @param classes_num
+ * @param conf_thresh
+ * @param dets
+ * @return int
+ */
+int getDetections(detectLayer *layer, int32_t input_height, int32_t input_width,
+                  int classes_num, float conf_thresh, std::vector<detection> &dets) {
+  CVI_TENSOR *output = layer->output;
+  float *output_ptr = (float *)CVI_NN_TensorPtr(output);
+  int count = 0;
+  int w_stride = 1;
+  int h_stride = layer->w * w_stride;
+  int o_stride = layer->h * h_stride;
+  int a_stride = layer->bbox_len * o_stride;
+  int b_stride = layer->num_anchors * a_stride;
+  float down_stride = input_width / layer->w;
+  for (int b = 0; b < layer->batch; b++) {
+    for (int a = 0; a < layer->num_anchors; ++a) {
+      for (int i = 0; i < layer->w * layer->h; ++i) {
+        int col = i % layer->w;
+        int row = i / layer->w;
+        float *obj = output_ptr + b * b_stride + a * a_stride + row * h_stride +
+                     col * w_stride + 4 * o_stride;
+        float objectness = sigmoid(obj[0]);
+        if (objectness <= conf_thresh) {
+          continue;
+        }
+        float *scores = obj + 1 * o_stride;
+        int category = argmax(scores, classes_num, o_stride);
+        objectness *= sigmoid(scores[category * o_stride]);
+
+        if (objectness <= conf_thresh) {
+          continue;
+        }
+        // printf("objectness:%f, score:%f\n", sigmoid(obj[0]), sigmoid(scores[category]));
+
+        float x = *(obj - 4 * o_stride);
+        float y = *(obj - 3 * o_stride);
+        float w = *(obj - 2 * o_stride);
+        float h = *(obj - 1 * o_stride);
+        detection det_obj;
+        det_obj.score = objectness;
+        det_obj.cls = category;
+        det_obj.batch_idx = b;
+
+        det_obj.bbox.x = (sigmoid(x) * 2 + col - 0.5) * down_stride;
+        det_obj.bbox.y = (sigmoid(y) * 2 + row - 0.5) * down_stride;
+        det_obj.bbox.w =
+            pow(sigmoid(w) * 2, 2) * anchors_[layer->layer_idx][a][0];
+        det_obj.bbox.h =
+            pow(sigmoid(h) * 2, 2) * anchors_[layer->layer_idx][a][1];
+        dets.emplace_back(std::move(det_obj));
+
+        ++count;
+      }
+    }
+  }
+  return count;
+}
+
+int main(int argc, char **argv) {
+  int ret = 0;
+  CVI_MODEL_HANDLE model;
+
+  if (argc != 4) {
+    usage(argv);
+    exit(-1);
+  }
+  CVI_TENSOR *input;
+  CVI_TENSOR *output;
+  CVI_TENSOR *input_tensors;
+  CVI_TENSOR *output_tensors;
+  int32_t input_num;
+  int32_t output_num;
+  CVI_SHAPE input_shape;
+  CVI_SHAPE *output_shape;
+  int32_t height;
+  int32_t width;
+  float qscale;
+  int bbox_len = 85; // classes num + 5
+  int classes_num = 80;
+  float conf_thresh = 0.5;
+  float iou_thresh = 0.5;
+  float obj_thresh = 0.5;
+
+  ret = CVI_NN_RegisterModel(argv[1], &model);
+  if (ret != CVI_RC_SUCCESS) {
+    printf("CVI_NN_RegisterModel failed, err %d\n", ret);
+    exit(1);
+  }
+  printf("CVI_NN_RegisterModel succeeded\n");
+
+  // get input output tensors
+  // get input output tensors
+  CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num,
+                               &output_tensors, &output_num);
+
+  input =
+      CVI_NN_GetTensorByName(CVI_NN_DEFAULT_TENSOR, input_tensors, input_num);
+  assert(input);
+  output = output_tensors;
+
+  output_shape =
+      reinterpret_cast<CVI_SHAPE *>(calloc(output_num, sizeof(CVI_SHAPE)));
+  for (int i = 0; i < output_num; i++) {
+    output_shape[i] = CVI_NN_TensorShape(&output[i]);
+  }
+
+  // nchw
+  input_shape = CVI_NN_TensorShape(input);
+  height = input_shape.dim[2];
+  width = input_shape.dim[3];
+
+  // imread
+  cv::Mat image;
+  image = cv::imread(argv[2]);
+  if (!image.data) {
+    printf("Could not open or find the image\n");
+    return -1;
+  }
+  cv::Mat cloned = image.clone();
+
+  // resize & letterbox
+  int ih = image.rows;
+  int iw = image.cols;
+  int oh = height;
+  int ow = width;
+  double resize_scale = std::min((double)oh / ih, (double)ow / iw);
+  int nh = (int)(ih * resize_scale);
+  int nw = (int)(iw * resize_scale);
+  cv::resize(image, image, cv::Size(nw, nh));
+  int top = (oh - nh) / 2;
+  int bottom = (oh - nh) - top;
+  int left = (ow - nw) / 2;
+  int right = (ow - nw) - left;
+  cv::copyMakeBorder(image, image, top, bottom, left, right,
+                     cv::BORDER_CONSTANT, cv::Scalar::all(0));
+  cv::cvtColor(image, image, cv::COLOR_BGR2RGB);
+
+  // Packed2Planar
+  cv::Mat channels[3];
+  for (int i = 0; i < 3; i++) {
+    channels[i] = cv::Mat(image.rows, image.cols, CV_8SC1);
+  }
+  cv::split(image, channels);
+
+  // fill data
+  int8_t *ptr = (int8_t *)CVI_NN_TensorPtr(input);
+  int channel_size = height * width;
+  for (int i = 0; i < 3; ++i) {
+    memcpy(ptr + i * channel_size, channels[i].data, channel_size);
+  }
+
+  // run inference
+  CVI_NN_Forward(model, input_tensors, input_num, output_tensors, output_num);
+
+  // do post process
+  int det_num = 0;
+  int count = 0;
+  std::vector<detectLayer> layers;
+  std::vector<detection> dets;
+
+  int stride[3] = {8, 16, 32};
+  // for each detect layer
+  for (int i = 0; i < output_num; i++) {
+    // layer init
+    detectLayer layer;
+    layer.output = &output[i];
+    layer.bbox_len = bbox_len;
+    layer.num_anchors = 3;
+    layer.batch = output_shape[i].dim[0]; // was left uninitialized; getDetections reads it
+    layer.h = output_shape[i].dim[2];
+    layer.w = output_shape[i].dim[3];
+    layer.layer_idx = i;
+    layers.push_back(layer);
+
+    count = getDetections(&layer, height, width, classes_num, conf_thresh,
+                          dets);
+    det_num += count;
+  }
+  // NMS, then correct boxes back to the origin image size
+  NMS(dets.data(), &det_num, iou_thresh);
+  correctYoloBoxes(dets.data(), det_num, cloned.rows, cloned.cols, height, width);
+  printf("get detection num: %d\n", det_num);
+
+  // draw bbox on image
+  for (int i = 0; i < det_num; i++) {
+    printf("obj %d: [%f %f %f %f] score:%f cls:%s \n", i, dets[i].bbox.x,
+           dets[i].bbox.y, dets[i].bbox.w, dets[i].bbox.h, dets[i].score,
+           coco_names[dets[i].cls]);
+    box b = dets[i].bbox;
+    // xywh2xyxy
+    int x1 = (b.x - b.w / 2);
+    int y1 = (b.y - b.h / 2);
+    int x2 = (b.x + b.w / 2);
+    int y2 = (b.y + b.h / 2);
+    cv::rectangle(cloned, cv::Point(x1, y1), cv::Point(x2, y2),
+                  cv::Scalar(255, 255, 0), 3, 8, 0);
+    cv::putText(cloned, coco_names[dets[i].cls], cv::Point(x1, y1),
+                cv::FONT_HERSHEY_DUPLEX, 1.0, cv::Scalar(0, 0, 255), 2);
+  }
+
+  // save or show picture
+  cv::imwrite(argv[3], cloned);
+
+  printf("------\n");
+  printf("%d objects are detected\n", det_num);
+  printf("------\n");
+
+  CVI_NN_CleanupModel(model);
+  printf("CVI_NN_CleanupModel succeeded\n");
+  free(output_shape);
+  return 0;
+}
diff --git a/cviruntime/samples/samples_extra/detector_yolov8n_fused_preprocess/CMakeLists.txt b/cviruntime/samples/samples_extra/detector_yolov8n_fused_preprocess/CMakeLists.txt
new file mode 100644
index 000000000..721e12e86
--- /dev/null
+++ b/cviruntime/samples/samples_extra/detector_yolov8n_fused_preprocess/CMakeLists.txt
@@ -0,0 +1,39 @@
+cmake_minimum_required(VERSION 2.8.0)
+
+project(cvi_sample_detector C CXX)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS ON)
+
+if(NOT DEFINED TPU_SDK_PATH)
+  message(FATAL_ERROR "Please set TPU_SDK_PATH to point to the TPU_SDK installation")
+endif()
+include_directories(${TPU_SDK_PATH}/include)
+link_directories(${TPU_SDK_PATH}/lib)
+
+if(NOT DEFINED OPENCV_PATH)
+  message(FATAL_ERROR "Please set OPENCV_PATH to point to the opencv installation")
+endif()
+include_directories(${OPENCV_PATH}/include)
+link_directories(${OPENCV_PATH}/lib)
+
+set(CVI_LIBS ${CVI_LIBS} cviruntime cvikernel)
+if(NOT CMAKE_CROSSCOMPILING)
+  set(CVI_LIBS ${CVI_LIBS} cvicmodel)
+endif()
+
+set(OPENCV_LIBS ${OPENCV_LIBS} opencv_core opencv_imgcodecs opencv_imgproc)
+if(NOT CMAKE_CROSSCOMPILING)
+  set(OPENCV_LIBS ${OPENCV_LIBS} opencv_highgui)
+endif()
+
+set(EXTRA_LIBS ${EXTRA_LIBS} dl stdc++ pthread z)
+
+add_executable(cvi_sample_detector_yolov8n_fused_preprocess
+  detector_yolov8n_fused_preprocess.cpp)
+target_link_libraries(cvi_sample_detector_yolov8n_fused_preprocess
+  ${CVI_LIBS}
+  ${OPENCV_LIBS}
+  ${EXTRA_LIBS})
+install(TARGETS cvi_sample_detector_yolov8n_fused_preprocess DESTINATION samples_extra/bin)
\ No newline at end of file
diff --git a/cviruntime/samples/samples_extra/detector_yolov8n_fused_preprocess/README.md b/cviruntime/samples/samples_extra/detector_yolov8n_fused_preprocess/README.md
new file mode 100644
index 000000000..f4670eacb
--- /dev/null
+++ b/cviruntime/samples/samples_extra/detector_yolov8n_fused_preprocess/README.md
@@ -0,0 +1,112 @@
+# YOLOv8n Sample with post_process
+
+### Download the model and convert the model under docker (optional)
+#### For new toolchain guide
+The following documents are required:
+* tpu-mlir_xxxx.tar.gz (The release package of tpu-mlir)
+
+Transform cvimodel shell:
+``` shell
+tar zxf tpu-mlir_xxxx.tar.gz
+source tpu-mlir_xxxx/envsetup.sh
+
+mkdir workspace && cd workspace
+cp $TPUC_ROOT/regression/image/dog.jpg .
+cp -rf $TPUC_ROOT/regression/dataset/COCO2017 .
+
+assign_output.py --model yolov8n.onnx --output /model.22/dfl/conv/Conv,/model.22/Sigmoid
+
+model_transform.py \
+    --model_name yolov8n \
+    --model_def yolov8n_new.onnx \
+    --input_shapes [[1,3,640,640]] \
+    --keep_aspect_ratio \
+    --pixel_format "rgb" \
+    --mean 0,0,0 \
+    --scale 0.0039216,0.0039216,0.0039216 \
+    --test_input dog.jpg \
+    --test_result yolov8n_top_outputs.npz \
+    --mlir yolov8n.mlir
+
+run_calibration.py yolov8n.mlir \
+    --dataset ./COCO2017 \
+    --input_num 100 \
+    -o yolov8n_cali_table
+
+model_deploy.py \
+    --mlir yolov8n.mlir \
+    --quantize INT8 \
+    --calibration_table yolov8n_cali_table \
+    --chip cv183x \
+    --test_input dog.jpg \
+    --test_reference yolov8n_top_outputs.npz \
+    --compare_all \
+    --tolerance 0.94,0.67 \
+    --fuse_preprocess \
+    --model yolov8n_int8_fuse_preprocess.cvimodel
+```
+#### For old toolchain guide
+The following documents are required:
+
+* cvitek_mlir_ubuntu-18.04.tar.gz
+
+Transform model shell:
+
+```shell
+tar zxf cvitek_mlir_ubuntu-18.04.tar.gz
+source cvitek_mlir/cvitek_envs.sh
+
+mkdir workspace && cd workspace
+cp $MLIR_PATH/tpuc/regression/data/dog.jpg .
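+# (Editor's note) assign_output.py appears to cut the ONNX graph at the two
+# named tensors and write yolov8n_new.onnx; a quick sanity check of the new
+# graph outputs (assumes the python onnx package is available):
+#   python3 -c "import onnx; m = onnx.load('yolov8n_new.onnx'); print([o.name for o in m.graph.output])"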
+
+assign_output.py --model yolov8n.onnx --output /model.22/dfl/conv/Conv,/model.22/Sigmoid
+
+model_transform.py \
+--model_type onnx \
+--model_name yolov8n \
+--model_def yolov8n_new.onnx \
+--keep_aspect_ratio True \
+--image dog.jpg \
+--image_resize_dims 640,640 \
+--net_input_dims 640,640 \
+--raw_scale 1.0 \
+--mean 0.0,0.0,0.0 \
+--std 1.0,1.0,1.0 \
+--input_scale 1.0 \
+--pixel_format RGB_PLANAR \
+--model_channel_order "rgb" \
+--tolerance 0.99,0.99,0.99 \
+--mlir yolov8n.mlir
+
+run_calibration.py yolov8n.mlir \
+    --dataset /data/dataset/coco/val2017 \
+    --input_num 100 \
+    -o yolov8n_cali_table
+
+model_deploy.py \
+--model_name yolov8n \
+--mlir yolov8n.mlir \
+--calibration_table yolov8n_cali_table \
+--quantize int8 \
+--tolerance 0.94,0.94,0.67 \
+--chip cv183x \
+--fuse_preprocess \
+--pixel_format RGB_PLANAR \
+--image dog.jpg \
+--cvimodel yolov8n_int8_fuse_preprocess.cvimodel
+```
+
+
+## How To Compile the Sample In Docker
+
+View the Top level directory README.md
+
+## Run Samples On EVB Board
+
+```shell
+cd install_samples/samples_extra
+./bin/cvi_sample_detector_yolov8n_fused_preprocess \
+./yolov8n_int8_fuse_preprocess.cvimodel \
+./data/dog.jpg \
+yolov8n_out.jpg
+```
\ No newline at end of file
diff --git a/cviruntime/samples/samples_extra/detector_yolov8n_fused_preprocess/detector_yolov8n_fused_preprocess.cpp b/cviruntime/samples/samples_extra/detector_yolov8n_fused_preprocess/detector_yolov8n_fused_preprocess.cpp
new file mode 100644
index 000000000..4a1e440c8
--- /dev/null
+++ b/cviruntime/samples/samples_extra/detector_yolov8n_fused_preprocess/detector_yolov8n_fused_preprocess.cpp
@@ -0,0 +1,322 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <cmath>
+#include <vector>
+#include <algorithm>
+#include <opencv2/opencv.hpp>
+#include "cviruntime.h"
+
+
+typedef struct {
+  float x, y, w, h;
+} box;
+
+typedef struct {
+  box bbox;
+  int cls;
+  float score;
+  int batch_idx;
+} detection;
+
+static const char *coco_names[] = {
+    "person", "bicycle", "car", "motorbike",
+    "aeroplane", "bus", "train", "truck",
+    "boat", "traffic light", "fire hydrant", "stop sign",
+    "parking meter", "bench", "bird", "cat",
+    "dog", "horse", "sheep", "cow",
+    "elephant", "bear", "zebra", "giraffe",
+    "backpack", "umbrella", "handbag", "tie",
+    "suitcase", "frisbee", "skis", "snowboard",
+    "sports ball", "kite", "baseball bat", "baseball glove",
+    "skateboard", "surfboard", "tennis racket", "bottle",
+    "wine glass", "cup", "fork", "knife",
+    "spoon", "bowl", "banana", "apple",
+    "sandwich", "orange", "broccoli", "carrot",
+    "hot dog", "pizza", "donut", "cake",
+    "chair", "sofa", "pottedplant", "bed",
+    "diningtable", "toilet", "tvmonitor", "laptop",
+    "mouse", "remote", "keyboard", "cell phone",
+    "microwave", "oven", "toaster", "sink",
+    "refrigerator", "book", "clock", "vase",
+    "scissors", "teddy bear", "hair drier", "toothbrush"};
+
+static void usage(char **argv) {
+  printf("Usage:\n");
+  printf("   %s cvimodel image.jpg image_detected.jpg\n", argv[0]);
+}
+
+template <typename T>
+int argmax(const T *data,
+           size_t len,
+           size_t stride = 1)
+{
+  int maxIndex = 0;
+  for (size_t i = stride; i < len; i += stride)
+  {
+    if (data[maxIndex] < data[i])
+    {
+      maxIndex = i;
+    }
+  }
+  return maxIndex;
+}
+
+float calIou(box a, box b)
+{
+  float area1 = a.w * a.h;
+  float area2 = b.w * b.h;
+  float wi = std::min((a.x + a.w / 2), (b.x + b.w / 2)) - std::max((a.x - a.w / 2), (b.x - b.w / 2));
+  float hi = std::min((a.y + a.h / 2), (b.y + b.h / 2)) - std::max((a.y - a.h / 2), (b.y - b.h / 2));
+  float area_i = std::max(wi, 0.0f) * std::max(hi, 0.0f);
+  return area_i / (area1 + area2 - area_i);
+}
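+// (Editor's note, illustrative:) with boxes stored as center x/y plus w/h,
+// two unit squares whose centers sit half a box apart intersect in 0.5, so
+// calIou({0, 0, 1, 1}, {0.5f, 0, 1, 1}) = 0.5 / (1 + 1 - 0.5) ≈ 0.333.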
+
+static void NMS(std::vector<detection> &dets, int *total, float thresh)
+{
+  if (*total){
+    std::sort(dets.begin(), dets.end(), [](detection &a, detection &b)
+              { return b.score < a.score; });
+    int new_count = *total;
+    for (int i = 0; i < *total; ++i)
+    {
+      detection &a = dets[i];
+      if (a.score == 0)
+        continue;
+      for (int j = i + 1; j < *total; ++j)
+      {
+        detection &b = dets[j];
+        if (dets[i].batch_idx == dets[j].batch_idx &&
+            b.score != 0 && dets[i].cls == dets[j].cls &&
+            calIou(a.bbox, b.bbox) > thresh)
+        {
+          b.score = 0;
+          new_count--;
+        }
+      }
+    }
+    // erase suppressed detections; erase() invalidates the iterator, so take
+    // the returned iterator instead of incrementing past the erased element
+    std::vector<detection>::iterator it = dets.begin();
+    while (it != dets.end()) {
+      if (it->score == 0) {
+        it = dets.erase(it);
+      } else {
+        it++;
+      }
+    }
+    *total = new_count;
+  }
+}
+
+void correctYoloBoxes(std::vector<detection> &dets,
+                      int det_num,
+                      int image_h,
+                      int image_w,
+                      int input_height,
+                      int input_width) {
+  int restored_w;
+  int restored_h;
+  float resize_ratio;
+  if (((float)input_width / image_w) < ((float)input_height / image_h)) {
+    restored_w = input_width;
+    restored_h = (image_h * input_width) / image_w;
+  } else {
+    restored_h = input_height;
+    restored_w = (image_w * input_height) / image_h;
+  }
+  resize_ratio = ((float)image_w / restored_w);
+
+  for (int i = 0; i < det_num; ++i) {
+    box bbox = dets[i].bbox;
+    bbox.x = (bbox.x - (input_width - restored_w) / 2.) * resize_ratio;
+    bbox.y = (bbox.y - (input_height - restored_h) / 2.) * resize_ratio;
+    bbox.w *= resize_ratio;
+    bbox.h *= resize_ratio;
+    dets[i].bbox = bbox;
+  }
+}
+
+/**
+ * @brief decode the two yolov8n output tensors into detections
+ * @param output pair of output tensors: boxes first, class scores second
+ * @note scores_shape : [batch, class_num, det_num, 1]
+ * @note dets_shape: [batch, 1, 4, det_num]
+ * @return int number of detections appended to dets
+ */
+int getDetections(CVI_TENSOR *output,
+                  int32_t input_height,
+                  int32_t input_width,
+                  int classes_num,
+                  CVI_SHAPE output_shape,
+                  float conf_thresh,
+                  std::vector<detection> &dets) {
+  float *scores_ptr = (float *)CVI_NN_TensorPtr(&output[1]);
+  float *dets_ptr = (float *)CVI_NN_TensorPtr(&output[0]);
+  float stride[3] = {8, 16, 32};
+  int count = 0;
+  int batch = output_shape.dim[0];
+  int total_box_num = output_shape.dim[3];
+  for (int b = 0; b < batch; b++) {
+    int score_index = 0;
+    for (int i = 0; i < 3; i++) {
+      int nh = input_height / stride[i], nw = input_width / stride[i];
+      int box_num = nh * nw;
+      for (int j = 0; j < box_num; j++) {
+        float anchor_x = (float)(j % nw) + 0.5, anchor_y = (float)(j / nw) + 0.5;
+        int maxIndex = argmax(scores_ptr, classes_num * total_box_num - score_index, total_box_num);
+        if (scores_ptr[maxIndex] <= conf_thresh) {
+          scores_ptr = scores_ptr + 1;
+          score_index++;
+          dets_ptr = dets_ptr + 1;
+          continue;
+        }
+        detection det;
+        det.score = scores_ptr[maxIndex];
+        det.cls = (maxIndex + score_index) / total_box_num;
+        det.batch_idx = b;
+        // distance-to-anchor (ltrb) values decoded back to center x/y plus w/h
+        float x1 = anchor_x - dets_ptr[0 * total_box_num];
+        float y1 = anchor_y - dets_ptr[1 * total_box_num];
+        float x2 = anchor_x + dets_ptr[2 * total_box_num];
+        float y2 = anchor_y + dets_ptr[3 * total_box_num];
+        det.bbox.h = (y2 - y1) * stride[i];
+        det.bbox.w = (x2 - x1) * stride[i];
+        det.bbox.x = x1 * stride[i] + det.bbox.w / 2.0;
+        det.bbox.y = y1 * stride[i] + det.bbox.h / 2.0;
+        count++;
+        dets.emplace_back(det);
+        scores_ptr = scores_ptr + 1;
+        score_index++;
+        dets_ptr = dets_ptr + 1;
+      }
+    }
+  }
+  return count;
+}
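+// (Editor's note, illustrative:) for a 640x640 input the three strides give
+// 80*80 + 40*40 + 20*20 = 8400 candidate boxes, so with 80 classes the score
+// tensor is [1, 80, 8400, 1] and the box tensor is [1, 1, 4, 8400]; the
+// per-box walk above advances both pointers one box at a time.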
+
+int main(int argc, char **argv) {
+  int ret = 0;
+  CVI_MODEL_HANDLE model;
+
+  if (argc != 4) {
+    usage(argv);
+    exit(-1);
+  }
+  CVI_TENSOR *input;
+  CVI_TENSOR *output;
+  CVI_TENSOR *input_tensors;
+  CVI_TENSOR *output_tensors;
+  int32_t input_num;
+  int32_t output_num;
+  CVI_SHAPE input_shape;
+  CVI_SHAPE *output_shape;
+  int32_t height;
+  int32_t width;
+  //int bbox_len = 84; // classes num + 4
+  int classes_num = 80;
+  float conf_thresh = 0.5;
+  float iou_thresh = 0.5;
+  ret = CVI_NN_RegisterModel(argv[1], &model);
+  if (ret != CVI_RC_SUCCESS) {
+    printf("CVI_NN_RegisterModel failed, err %d\n", ret);
+    exit(1);
+  }
+  printf("CVI_NN_RegisterModel succeeded\n");
+
+  // get input output tensors
+  CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num, &output_tensors,
+                               &output_num);
+
+  input = CVI_NN_GetTensorByName(CVI_NN_DEFAULT_TENSOR, input_tensors, input_num);
+  assert(input);
+  output = output_tensors;
+  output_shape = reinterpret_cast<CVI_SHAPE *>(calloc(output_num, sizeof(CVI_SHAPE)));
+  for (int i = 0; i < output_num; i++)
+  {
+    output_shape[i] = CVI_NN_TensorShape(&output[i]);
+  }
+
+  // nchw
+  input_shape = CVI_NN_TensorShape(input);
+  height = input_shape.dim[2];
+  width = input_shape.dim[3];
+  assert(height % 32 == 0 && width % 32 == 0);
+  // imread
+  cv::Mat image;
+  image = cv::imread(argv[2]);
+  if (!image.data) {
+    printf("Could not open or find the image\n");
+    return -1;
+  }
+  cv::Mat cloned = image.clone();
+
+  // resize & letterbox
+  int ih = image.rows;
+  int iw = image.cols;
+  int oh = height;
+  int ow = width;
+  double resize_scale = std::min((double)oh / ih, (double)ow / iw);
+  int nh = (int)(ih * resize_scale);
+  int nw = (int)(iw * resize_scale);
+  cv::resize(image, image, cv::Size(nw, nh));
+  int top = (oh - nh) / 2;
+  int bottom = (oh - nh) - top;
+  int left = (ow - nw) / 2;
+  int right = (ow - nw) - left;
+  cv::copyMakeBorder(image, image, top, bottom, left, right, cv::BORDER_CONSTANT,
+                     cv::Scalar::all(0));
+  cv::cvtColor(image, image, cv::COLOR_BGR2RGB);
+
+  // Packed2Planar
+  cv::Mat channels[3];
+  for (int i = 0; i < 3; i++) {
+    channels[i] = cv::Mat(image.rows, image.cols, CV_8SC1);
+  }
+  cv::split(image, channels);
+
+  // fill data
+  int8_t *ptr = (int8_t *)CVI_NN_TensorPtr(input);
+  int channel_size = height * width;
+  for (int i = 0; i < 3; ++i) {
+    memcpy(ptr + i * channel_size, channels[i].data, channel_size);
+  }
+
+  // run inference
+  CVI_NN_Forward(model, input_tensors, input_num, output_tensors, output_num);
+  printf("CVI_NN_Forward Succeed...\n");
+  // do post process
+  int det_num = 0;
+  std::vector<detection> dets;
+  det_num = getDetections(output, height, width, classes_num, output_shape[0],
+                          conf_thresh, dets);
+  // NMS, then correct boxes back to the origin image size
+  NMS(dets, &det_num, iou_thresh);
+  correctYoloBoxes(dets, det_num, cloned.rows, cloned.cols, height, width);
+
+  // draw bbox on image
+  for (int i = 0; i < det_num; i++) {
+    box b = dets[i].bbox;
+    // xywh2xyxy
+    int x1 = (b.x - b.w / 2);
+    int y1 = (b.y - b.h / 2);
+    int x2 = (b.x + b.w / 2);
+    int y2 = (b.y + b.h / 2);
+    cv::rectangle(cloned, cv::Point(x1, y1), cv::Point(x2, y2), cv::Scalar(255, 255, 0),
+                  3, 8, 0);
+    char content[100];
+    sprintf(content, "%s %0.3f", coco_names[dets[i].cls], dets[i].score);
+    cv::putText(cloned, content, cv::Point(x1, y1),
+                cv::FONT_HERSHEY_DUPLEX, 1.0, cv::Scalar(0, 0, 255), 2);
+  }
+
+  // save or show picture
+  cv::imwrite(argv[3], cloned);
+
+  printf("------\n");
+  printf("%d objects are detected\n", det_num);
+  printf("------\n");
+
+  CVI_NN_CleanupModel(model);
+  printf("CVI_NN_CleanupModel succeeded\n");
+  free(output_shape);
+  return 0;
+}
\ No newline at end of file
diff --git a/cviruntime/samples/samples_extra/detector_yolox_s/CMakeLists.txt b/cviruntime/samples/samples_extra/detector_yolox_s/CMakeLists.txt
new file mode 100644
index 000000000..198244637
--- /dev/null
+++ b/cviruntime/samples/samples_extra/detector_yolox_s/CMakeLists.txt
@@ -0,0 +1,40 @@
+cmake_minimum_required(VERSION 2.8.0)
+
+project(cvi_sample_detector_yolox_s C CXX)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS ON)
+
+if(NOT DEFINED TPU_SDK_PATH)
+  message(FATAL_ERROR "Please set TPU_SDK_PATH to point to the TPU_SDK installation")
+endif()
+include_directories(${TPU_SDK_PATH}/include)
+link_directories(${TPU_SDK_PATH}/lib)
+
+if(NOT DEFINED OPENCV_PATH)
+  message(FATAL_ERROR "Please set OPENCV_PATH to point to the opencv installation")
+endif()
+include_directories(${OPENCV_PATH}/include)
+link_directories(${OPENCV_PATH}/lib)
+
+set(CVI_LIBS ${CVI_LIBS} cviruntime cvikernel)
+if(NOT CMAKE_CROSSCOMPILING)
+  set(CVI_LIBS ${CVI_LIBS} cvicmodel)
+endif()
+
+set(OPENCV_LIBS ${OPENCV_LIBS} opencv_core opencv_imgcodecs opencv_imgproc)
+if(NOT CMAKE_CROSSCOMPILING)
+  set(OPENCV_LIBS ${OPENCV_LIBS} opencv_highgui)
+endif()
+
+set(EXTRA_LIBS ${EXTRA_LIBS} dl stdc++ pthread z)
+
+add_executable(cvi_sample_detector_yolox_s
+  detector_yolox_s.cpp)
+target_link_libraries(cvi_sample_detector_yolox_s
+  ${CVI_LIBS}
+  ${OPENCV_LIBS}
+  ${EXTRA_LIBS})
+install(TARGETS cvi_sample_detector_yolox_s DESTINATION samples_extra/bin)
\ No newline at end of file
diff --git a/cviruntime/samples/samples_extra/detector_yolox_s/README.md b/cviruntime/samples/samples_extra/detector_yolox_s/README.md
new file mode 100644
index 000000000..42d12a8c3
--- /dev/null
+++ b/cviruntime/samples/samples_extra/detector_yolox_s/README.md
@@ -0,0 +1,106 @@
+# Yolox_s Sample with post_process and without fuse_preprocess
+
+### Download the model and convert the model under docker (optional)
+#### For new toolchain guide
+The following documents are required:
+* tpu-mlir_xxxx.tar.gz (The release package of tpu-mlir)
+
+Transform cvimodel shell:
+``` shell
+tar zxf tpu-mlir_xxxx.tar.gz
+source tpu-mlir_xxxx/envsetup.sh
+
+mkdir workspace && cd workspace
+cp $TPUC_ROOT/regression/image/dog.jpg .
+cp -rf $TPUC_ROOT/regression/dataset/COCO2017 .
+
+model_transform.py \
+--model_name yolox_s \
+--model_def ./yolox_s.onnx \
+--test_input ./dog.jpg \
+--test_result yolox_s_top_output.npz \
+--input_shapes [[1,3,640,640]] \
+--resize_dims 640,640 \
+--mean 0,0,0 \
+--scale 1.0,1.0,1.0 \
+--pixel_format "bgr" \
+--tolerance 0.99,0.99 \
+--excepts 796_Sigmoid \
+--mlir yolox_s.mlir
+
+run_calibration.py \
+yolox_s.mlir \
+--dataset=./COCO2017 \
+--input_num=100 \
+-o yolox_s_calibration_table
+
+model_deploy.py \
+--mlir yolox_s.mlir \
+--calibration_table yolox_s_calibration_table \
+--chip cv183x \
+--quantize INT8 \
+--quant_input \
+--test_input ./dog.jpg \
+--test_reference yolox_s_top_output.npz \
+--excepts 796_Sigmoid \
+--tolerance 0.8,0.5 \
+--model yolox_s.cvimodel
+```
+
+#### For old toolchain guide
+The following documents are required:
+
+* cvitek_mlir_ubuntu-18.04.tar.gz
+
+Transform model shell:
+``` shell
+tar zxf cvitek_mlir_ubuntu-18.04.tar.gz
+source cvitek_mlir/cvitek_envs.sh
+
+mkdir workspace && cd workspace
+cp $MLIR_PATH/tpuc/regression/data/dog.jpg .
+cp -rf $MLIR_PATH/tpuc/regression/data/images .
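+# (Editor's note) --excepts "796_Sigmoid" below excludes that tensor from the
+# accuracy comparison during deploy, as it is hard to match within tolerance
+# after INT8 quantization.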
+
+model_transform.py \
+    --model_type onnx \
+    --model_name yolox_s \
+    --model_def ./yolox_s.onnx \
+    --image ./dog.jpg \
+    --image_resize_dims 640,640 \
+    --keep_aspect_ratio true \
+    --model_channel_order "bgr" \
+    --tolerance 0.99,0.99,0.99 \
+    --mlir yolox_s_fp32.mlir
+
+run_calibration.py \
+yolox_s_fp32.mlir \
+--dataset=./images \
+--input_num=100 \
+-o yolox_s_calib.txt
+
+model_deploy.py \
+--model_name yolox \
+--mlir yolox_s_fp32.mlir \
+--calibration_table yolox_s_calib.txt \
+--pixel_format BGR_PLANAR \
+--excepts "796_Sigmoid" \
+--chip cv183x \
+--quantize INT8 \
+--image dog.jpg \
+--tolerance 0.85,0.85,0.4 \
+--correctness 0.95,0.95,0.9 \
+--cvimodel yolox_s.cvimodel
+```
+Copy the generated yolox_s.cvimodel to the EVB board
+
+## How To Compile the Sample In Docker
+View the Top level directory README.md
+
+## Run Samples On EVB Board
+```
+cd install_samples/samples_extra
+./bin/cvi_sample_detector_yolox_s \
+./yolox_s.cvimodel \
+./data/dog.jpg \
+yolox_s_out.jpg
+```
\ No newline at end of file
diff --git a/cviruntime/samples/samples_extra/detector_yolox_s/detector_yolox_s.cpp b/cviruntime/samples/samples_extra/detector_yolox_s/detector_yolox_s.cpp
new file mode 100644
index 000000000..51b585702
--- /dev/null
+++ b/cviruntime/samples/samples_extra/detector_yolox_s/detector_yolox_s.cpp
@@ -0,0 +1,387 @@
+#include <cmath>
+#include <opencv2/opencv.hpp>
+#include "cviruntime.h"
+
+#define MAX_DET 200
+#define YOLOX_NMS_THRESH 0.45   // nms threshold
+#define YOLOX_CONF_THRESH 0.25  // threshold of bounding box prob
+
+typedef struct {
+  float x, y, w, h;
+} box;
+
+typedef struct {
+  box bbox;
+  int cls;
+  float score;
+} detection;
+
+struct Object
+{
+  cv::Rect_<float> rect;
+  int label;
+  float prob;
+};
+
+struct GridAndStride
+{
+  int grid0;
+  int grid1;
+  int stride;
+};
+
+
+static const char *coco_names[] = {
+    "person", "bicycle", "car", "motorbike",
+    "aeroplane", "bus", "train", "truck",
+    "boat", "traffic light", "fire hydrant", "stop sign",
+    "parking meter", "bench", "bird", "cat",
+    "dog", "horse", "sheep", "cow",
+    "elephant", "bear", "zebra", "giraffe",
+    "backpack", "umbrella", "handbag", "tie",
+    "suitcase", "frisbee", "skis", "snowboard",
+    "sports ball", "kite", "baseball bat", "baseball glove",
+    "skateboard", "surfboard", "tennis racket", "bottle",
+    "wine glass", "cup", "fork", "knife",
+    "spoon", "bowl", "banana", "apple",
+    "sandwich", "orange", "broccoli", "carrot",
+    "hot dog", "pizza", "donut", "cake",
+    "chair", "sofa", "pottedplant", "bed",
+    "diningtable", "toilet", "tvmonitor", "laptop",
+    "mouse", "remote", "keyboard", "cell phone",
+    "microwave", "oven", "toaster", "sink",
+    "refrigerator", "book", "clock", "vase",
+    "scissors", "teddy bear", "hair drier", "toothbrush"};
+
+static void usage(char **argv) {
+  printf("Usage:\n");
+  printf("   %s cvimodel image.jpg image_detected.jpg\n", argv[0]);
+}
+
+static inline float intersection_area(const Object &a, const Object &b) {
+  cv::Rect_<float> inter = a.rect & b.rect;
+  return inter.area();
+}
+
+static void
+generate_grids_and_stride(const int target_size_w, const int target_size_h,
+                          std::vector<int> &strides,
+                          std::vector<GridAndStride> &grid_strides) {
+  for (int i = 0; i < (int)strides.size(); i++) {
+    int stride = strides[i];
+    int num_grid_w = target_size_w / stride;
+    int num_grid_h = target_size_h / stride;
+    for (int g1 = 0; g1 < num_grid_h; g1++) {
+      for (int g0 = 0; g0 < num_grid_w; g0++) {
+        GridAndStride gs;
+        gs.grid0 = g0;
+        gs.grid1 = g1;
+        gs.stride = stride;
+        grid_strides.push_back(gs);
+      }
+    }
+  }
+}
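+// (Editor's note, illustrative:) for a 640x640 input and strides {8, 16, 32}
+// this yields 80*80 + 40*40 + 20*20 = 8400 grid cells, one anchor point each,
+// matching the 8400 rows of the flattened YOLOX output tensor.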
+
+static void generate_yolox_proposals(std::vector<GridAndStride> grid_strides,
+                                     CVI_TENSOR *out_tensor,
+                                     float prob_threshold,
+                                     std::vector<Object> &objects) {
+  CVI_SHAPE shape = CVI_NN_TensorShape(out_tensor);
+  const int num_grid = shape.dim[1];
+  const int num_class = shape.dim[2] - 5;
+  const int num_anchors = grid_strides.size();
+
+  const float *feat_ptr = (float *)CVI_NN_TensorPtr(out_tensor);
+  for (int anchor_idx = 0; anchor_idx < num_anchors; anchor_idx++) {
+    int max_class_idx = 0;
+    float max_prob = 0.0f;
+    float box_objectness = feat_ptr[4];
+    for (int class_idx = 0; class_idx < num_class; class_idx++) {
+      float box_cls_score = feat_ptr[5 + class_idx];
+      float box_prob = box_objectness * box_cls_score;
+      if (box_prob > max_prob) {
+        max_class_idx = class_idx;
+        max_prob = box_prob;
+      }
+    } // class loop
+    if (max_prob > prob_threshold) {
+      const int grid0 = grid_strides[anchor_idx].grid0;
+      const int grid1 = grid_strides[anchor_idx].grid1;
+      const int stride = grid_strides[anchor_idx].stride;
+
+      // yolox/models/yolo_head.py decode logic
+      //   outputs[..., :2] = (outputs[..., :2] + grids) * strides
+      //   outputs[..., 2:4] = torch.exp(outputs[..., 2:4]) * strides
+      float x_center = (feat_ptr[0] + grid0) * stride;
+      float y_center = (feat_ptr[1] + grid1) * stride;
+      float w = exp(feat_ptr[2]) * stride;
+      float h = exp(feat_ptr[3]) * stride;
+      float x0 = x_center - w * 0.5f;
+      float y0 = y_center - h * 0.5f;
+      Object obj;
+      obj.rect.x = x0;
+      obj.rect.y = y0;
+      obj.rect.width = w;
+      obj.rect.height = h;
+      obj.label = max_class_idx;
+      obj.prob = max_prob;
+
+      objects.push_back(obj);
+    }
+
+    feat_ptr += shape.dim[2];
+
+  } // point anchor loop
+}
+
+static void qsort_descent_inplace(std::vector<Object> &faceobjects, int left,
+                                  int right) {
+  int i = left;
+  int j = right;
+  float p = faceobjects[(left + right) / 2].prob;
+
+  while (i <= j) {
+    while (faceobjects[i].prob > p)
+      i++;
+
+    while (faceobjects[j].prob < p)
+      j--;
+
+    if (i <= j) {
+      // swap
+      std::swap(faceobjects[i], faceobjects[j]);
+
+      i++;
+      j--;
+    }
+  }
+
+#pragma omp parallel sections
+  {
+#pragma omp section
+    {
+      if (left < j)
+        qsort_descent_inplace(faceobjects, left, j);
+    }
+#pragma omp section
+    {
+      if (i < right)
+        qsort_descent_inplace(faceobjects, i, right);
+    }
+  }
+}
+
+static void qsort_descent_inplace(std::vector<Object> &objects) {
+  if (objects.empty())
+    return;
+
+  qsort_descent_inplace(objects, 0, objects.size() - 1);
+}
+
+static void nms_sorted_bboxes(const std::vector<Object> &faceobjects,
+                              std::vector<int> &picked, float nms_threshold) {
+  picked.clear();
+
+  const int n = faceobjects.size();
+
+  std::vector<float> areas(n);
+  for (int i = 0; i < n; i++) {
+    areas[i] = faceobjects[i].rect.area();
+  }
+
+  for (int i = 0; i < n; i++) {
+    const Object &a = faceobjects[i];
+
+    int keep = 1;
+    for (int j = 0; j < (int)picked.size(); j++) {
+      const Object &b = faceobjects[picked[j]];
+
+      // intersection over union
+      float inter_area = intersection_area(a, b);
+      float union_area = areas[i] + areas[picked[j]] - inter_area;
+      // float IoU = inter_area / union_area
+      if (inter_area / union_area > nms_threshold)
+        keep = 0;
+    }
+
+    if (keep)
+      picked.push_back(i);
+  }
+}
+
+int post_process(CVI_TENSOR *out_tensor, int img_w, int img_h, int width, int height, float scale,
+                 std::vector<Object> &objects) {
+  std::vector<Object> proposals;
+  static const int stride_arr[] = {8, 16, 32}; // might have stride=64 in YOLOX
+  std::vector<int> strides(stride_arr, stride_arr + sizeof(stride_arr) /
+                                                        sizeof(stride_arr[0]));
+  std::vector<GridAndStride> grid_strides;
+  generate_grids_and_stride(width, height, strides, grid_strides);
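+  // (Editor's note) the proposals decoded next are in the letterboxed
+  // network-input coordinate system; they are mapped back to the original
+  // image by the division by `scale` further below.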
+  generate_yolox_proposals(grid_strides, out_tensor, YOLOX_CONF_THRESH, proposals);
+  // sort all proposals by score from highest to lowest
+  qsort_descent_inplace(proposals);
+
+  // apply nms with nms_threshold
+  std::vector<int> picked;
+  nms_sorted_bboxes(proposals, picked, YOLOX_NMS_THRESH);
+
+  int count = picked.size();
+
+  objects.resize(count);
+  for (int i = 0; i < count; i++) {
+    objects[i] = proposals[picked[i]];
+
+    // adjust offset to original unpadded
+    float x0 = (objects[i].rect.x) / scale;
+    float y0 = (objects[i].rect.y) / scale;
+    float x1 = (objects[i].rect.x + objects[i].rect.width) / scale;
+    float y1 = (objects[i].rect.y + objects[i].rect.height) / scale;
+
+    // clip
+    x0 = std::max(std::min(x0, (float)(img_w - 1)), 0.f);
+    y0 = std::max(std::min(y0, (float)(img_h - 1)), 0.f);
+    x1 = std::max(std::min(x1, (float)(img_w - 1)), 0.f);
+    y1 = std::max(std::min(y1, (float)(img_h - 1)), 0.f);
+
+    objects[i].rect.x = x0;
+    objects[i].rect.y = y0;
+    objects[i].rect.width = x1 - x0;
+    objects[i].rect.height = y1 - y0;
+  }
+  return 0;
+}
+
+int main(int argc, char *argv[]) {
+  CVI_MODEL_HANDLE model = nullptr;
+  CVI_TENSOR *input_tensors;
+  CVI_TENSOR *output_tensors;
+  int32_t input_num;
+  int32_t output_num;
+  CVI_SHAPE shape;
+  int32_t height;
+  int32_t width;
+  float qscale;
+  CVI_TENSOR *input;
+  CVI_TENSOR *output;
+
+  if (argc != 4) {
+    usage(argv);
+    exit(-1);
+  }
+
+  const char *model_file = argv[1];
+  cv::Mat image = cv::imread(argv[2]);
+  if (!image.data) {
+    printf("Could not open or find the image:%s\n", argv[2]);
+    return -1;
+  }
+  cv::Mat cloned = image.clone();
+
+  int ret = CVI_NN_RegisterModel(model_file, &model);
+  if (ret != CVI_RC_SUCCESS) {
+    printf("CVI_NN_RegisterModel failed, err %d\n", ret);
+    exit(1);
+  }
+  printf("CVI_NN_RegisterModel succeeded\n");
+
+  // get input output tensors
+  CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num, &output_tensors,
+                               &output_num);
+
+  input = CVI_NN_GetTensorByName(CVI_NN_DEFAULT_TENSOR, input_tensors, input_num);
+  assert(input);
+  output = CVI_NN_GetTensorByName(CVI_NN_DEFAULT_TENSOR, output_tensors, output_num);
+  assert(output);
+
+  qscale = CVI_NN_TensorQuantScale(input);
+  shape = CVI_NN_TensorShape(input);
+  height = shape.dim[2];
+  width = shape.dim[3];
+
+  // do preprocess
+  // resize & letterbox (yolox pads to the bottom-right corner)
+  int ih = image.rows;
+  int iw = image.cols;
+  int oh = height;
+  int ow = width;
+  double resize_scale = std::min((double)oh / ih, (double)ow / iw);
+  int nh = (int)(ih * resize_scale);
+  int nw = (int)(iw * resize_scale);
+  cv::resize(image, image, cv::Size(nw, nh));
+  int top = 0;
+  int bottom = oh - nh;
+  int left = 0;
+  int right = ow - nw;
+  cv::copyMakeBorder(image, image, top, bottom, left, right, cv::BORDER_CONSTANT,
+                     cv::Scalar::all(0));
+  // split
+  cv::Mat channels[3];
+  for (int i = 0; i < 3; i++) {
+    channels[i] = cv::Mat(height, width, CV_8SC1);
+  }
+  cv::split(image, channels);
+  // normalize
+  float scale = qscale;
+  float mean = 0.0f;
+  for (int i = 0; i < 3; i++) {
+    channels[i].convertTo(channels[i], CV_8SC1, scale, mean);
+  }
+  // fill data (the model was converted with bgr pixel format, so keep BGR order)
+  int8_t *ptr = (int8_t *)CVI_NN_TensorPtr(input);
+  int channel_size = height * width;
+  memcpy(ptr, channels[0].data, channel_size);
+  memcpy(ptr + channel_size, channels[1].data, channel_size);
+  memcpy(ptr + 2 * channel_size, channels[2].data, channel_size);
+
+  // run inference
+  CVI_NN_Forward(model, input_tensors, input_num, output_tensors, output_num);
+
+  std::vector<Object> objects;
+
+  // post process
+  ret = post_process(output,
iw, ih, width, height, resize_scale, objects); + + // draw bbox on image + for (size_t i = 0; i < objects.size(); i++) { + const Object &obj = objects[i]; + + fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob, + obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height); + + cv::rectangle(cloned, obj.rect, cv::Scalar(255, 0, 0)); + + char text[256]; + sprintf(text, "%s %.1f%%", coco_names[obj.label], obj.prob * 100); + + int baseLine = 0; + cv::Size label_size = + cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine); + + int x = obj.rect.x; + int y = obj.rect.y - label_size.height - baseLine; + if (y < 0) + y = 0; + if (x + label_size.width > image.cols) + x = image.cols - label_size.width; + + cv::rectangle( + cloned, + cv::Rect(cv::Point(x, y), + cv::Size(label_size.width, label_size.height + baseLine)), + cv::Scalar(255, 255, 255), -1); + + cv::putText(cloned, text, cv::Point(x, y + label_size.height), + cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0)); + } + + // save or show picture + cv::imwrite(argv[3], cloned); + + printf("------\n"); + printf("%zu objects are detected\n", objects.size()); + printf("------\n"); + + CVI_NN_CleanupModel(model); + printf("CVI_NN_CleanupModel succeeded\n"); + return 0; +} \ No newline at end of file diff --git a/cviruntime/samples/samples_extra/insightface_fused_preprocess/CMakeLists.txt b/cviruntime/samples/samples_extra/insightface_fused_preprocess/CMakeLists.txt new file mode 100644 index 000000000..f5b192b14 --- /dev/null +++ b/cviruntime/samples/samples_extra/insightface_fused_preprocess/CMakeLists.txt @@ -0,0 +1,43 @@ +cmake_minimum_required(VERSION 2.8.0) + +project(cvi_sample_fd_fr_fused_preprocess C CXX) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS ON) + +if(NOT DEFINED TPU_SDK_PATH) + message(FATAL_ERROR "Please set TPU_SDK_PATH to point to the TPU_SDK installation") +endif() +include_directories(${TPU_SDK_PATH}/include) +link_directories(${TPU_SDK_PATH}/lib) + +if(NOT DEFINED OPENCV_PATH) + message(FATAL_ERROR "Please set OPENCV_PATH to point to the opencvn installation") +endif() +include_directories(${OPENCV_PATH}/include) +link_directories(${OPENCV_PATH}/lib) + +set(CVI_LIBS ${CVI_LIBS} cviruntime cvikernel) +if(NOT CMAKE_CROSSCOMPILING) + set(CVI_LIBS ${CVI_LIBS} cvicmodel) +endif() + +set(OPENCV_LIBS ${OPENCV_LIBS} opencv_core opencv_imgcodecs opencv_imgproc) +if(NOT CMAKE_CROSSCOMPILING) + set(OPENCV_LIBS ${OPENCV_LIBS} opencv_highgui) +endif() + +set(EXTRA_LIBS ${EXTRA_LIBS} dl stdc++ pthread z) + +add_executable(cvi_sample_fd_fr_fused_preprocess + face_detector.cpp + face_recognizer.cpp + face_transform.cpp + fd_fr_compare_similarity_fused_preprocess.cpp) +target_link_libraries(cvi_sample_fd_fr_fused_preprocess + ${CVI_LIBS} + ${OPENCV_LIBS} + ${EXTRA_LIBS}) +install(TARGETS cvi_sample_fd_fr_fused_preprocess + cvi_sample_fd_fr_fused_preprocess DESTINATION samples_extra/bin) diff --git a/cviruntime/samples/samples_extra/insightface_fused_preprocess/README.md b/cviruntime/samples/samples_extra/insightface_fused_preprocess/README.md new file mode 100644 index 000000000..0b09f6504 --- /dev/null +++ b/cviruntime/samples/samples_extra/insightface_fused_preprocess/README.md @@ -0,0 +1,197 @@ +# Face detection and recognition Sample + +### Download the model and convert the model under docker (optional) +#### For new toolchain guide +The following documents are required: +* tpu-mlir_xxxx.tar.gz (The release package of tpu-mlir) + +Transform cvimodel 
shell:
+``` shell
+tar zxf tpu-mlir_xxxx.tar.gz
+source tpu-mlir_xxxx/envsetup.sh
+
+mkdir workspace && cd workspace
+cp $TPUC_ROOT/regression/image/parade.jpg .
+cp $TPUC_ROOT/regression/image/Aaron_Eckhart_0001.jpg .
+cp -rf $TPUC_ROOT/regression/dataset/LFW .
+cp -rf $TPUC_ROOT/regression/dataset/WIDER .
+
+## retinaface fuse_post_process
+model_transform.py \
+--model_name mnet \
+--model_def ./mnet_600_with_detection.prototxt \
+--model_data ./mnet.caffemodel \
+--test_input ./parade.jpg \
+--test_result mnet_top_output.npz \
+--input_shapes [[1,3,600,600]] \
+--resize_dims 600,600 \
+--mean 0,0,0 \
+--scale 1,1,1 \
+--pixel_format "rgb" \
+--tolerance 0.99,0.99 \
+--excepts data \
+--mlir mnet.mlir
+
+run_calibration.py \
+mnet.mlir \
+--dataset=./WIDER \
+--input_num=100 \
+-o mnet_calibration_table
+
+model_deploy.py \
+--mlir mnet.mlir \
+--calibration_table mnet_calibration_table \
+--chip cv183x \
+--quantize INT8 \
+--quant_input \
+--customization_format RGB_PLANAR \
+--test_input ./parade.jpg \
+--test_reference mnet_top_output.npz \
+--fuse_preprocess \
+--excepts data \
+--tolerance 0.8,0.5 \
+--model retinaface_mnet25_600_fused_preprocess_with_detection.cvimodel
+
+## arcface
+model_transform.py \
+--model_name arcface_res50 \
+--model_def ./arcface_res50.prototxt \
+--model_data ./arcface_res50.caffemodel \
+--test_input ./Aaron_Eckhart_0001.jpg \
+--test_result arcface_res50_top_output.npz \
+--input_shapes [[1,3,112,112]] \
+--resize_dims 112,112 \
+--mean 127.5,127.5,127.5 \
+--scale 0.0078125,0.0078125,0.0078125 \
+--pixel_format "rgb" \
+--tolerance 0.99,0.99 \
+--excepts data \
+--mlir arcface_res50.mlir
+
+run_calibration.py \
+arcface_res50.mlir \
+--dataset=./LFW \
+--input_num=100 \
+-o arcface_res50_calibration_table
+
+model_deploy.py \
+--mlir arcface_res50.mlir \
+--calibration_table arcface_res50_calibration_table \
+--chip cv183x \
+--quantize INT8 \
+--quant_input \
+--customization_format RGB_PLANAR \
+--test_input ./Aaron_Eckhart_0001.jpg \
+--test_reference arcface_res50_top_output.npz \
+--fuse_preprocess \
+--excepts data \
+--tolerance 0.9,0.6 \
+--model arcface_res50_fused_preprocess.cvimodel
+```
+
+
+#### For old toolchain guide
+The following documents are required:
+
+* cvitek_mlir_ubuntu-18.04.tar.gz
+
+Transform model shell:
+``` shell
+tar zxf cvitek_mlir_ubuntu-18.04.tar.gz
+source cvitek_mlir/cvitek_envs.sh
+
+mkdir workspace && cd workspace
+cp $MLIR_PATH/tpuc/regression/data/parade.jpg .
+cp $MLIR_PATH/tpuc/regression/data/Aaron_Eckhart_0001.jpg .
+cp -rf $MLIR_PATH/tpuc/regression/data/images .
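+# (Editor's note) --fuse_preprocess folds the mean/scale normalization and
+# channel-order conversion into the cvimodel itself, so the application only
+# feeds raw resized RGB bytes, as the sample code does.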
+ +## retinaface fuse_post_process +model_transform.py \ +--model_type caffe \ +--model_name mnet \ +--model_def ./mnet_600_with_detection.prototxt \ +--model_data ./mnet.caffemodel \ +--image ./parade.jpg \ +--image_resize_dims 600,600 \ +--model_channel_order "rgb" \ +--tolerance 0.99,0.99,0.99 \ +--mlir mnet_416_fp32.mlir + +run_calibration.py \ +mnet_416_fp32.mlir \ +--dataset=./images \ +--input_num=100 \ +-o retinaface_mnet25_calibration_table + +model_deploy.py \ +--model_name mnet \ +--mlir mnet_416_fp32.mlir \ +--calibration_table retinaface_mnet25_calibration_table \ +--fuse_preprocess \ +--pixel_format RGB_PLANAR \ +--aligned_input false \ +--chip cv183x \ +--quantize INT8 \ +--image parade.jpg \ +--tolerance 0.90,0.85,0.54 \ +--correctness 0.95,0.95,0.9 \ +--cvimodel retinaface_mnet25_600_fused_preprocess_with_detection.cvimodel + +## arcface +model_transform.py \ +--model_type caffe \ +--model_name mnet \ +--model_def ./arcface_res50.prototxt \ +--model_data ./arcface_res50.caffemodel \ +--image ./Aaron_Eckhart_0001.jpg \ +--image_resize_dims 112,112 \ +--input_scale 0.0078125 \ +--mean 127.5,127.5,127.5 \ +--model_channel_order "rgb" \ +--tolerance 0.99,0.99,0.99 \ +--mlir arcface_res50_fp32.mlir + +run_calibration.py \ +arcface_res50_fp32.mlir \ +--dataset=./images \ +--input_num=100 \ +-o arcface_res50_calibration_table + +model_deploy.py \ +--model_name arcface_res50 \ +--mlir arcface_res50_fp32.mlir \ +--calibration_table arcface_res50_calibration_table \ +--fuse_preprocess \ +--pixel_format RGB_PLANAR \ +--aligned_input false \ +--chip cv183x \ +--quantize INT8 \ +--image pose_256_192.jpg \ +--excepts stage1_unit1_sc_scale \ +--tolerance 0.6,0.6,0 \ +--correctness 0.95,0.95,0.9 \ +--cvimodel arcface_res50_fused_preprocess.cvimodel +``` + +Copy generated retinaface_mnet25_600_fused_preprocess_with_detection.cvimodel and arcface_res50_fused_preprocess.cvimodel to Development board + +## How To Compile Vpss input Sample In Docker +View the Top level directory README.md + +## Run Samples In EVB Borad +``` +cd install_samples/samples_extra +## test sample people +./bin/cvi_sample_fd_fr_fused_preprocess \ +retinaface_mnet25_600_fused_preprocess_with_detection.cvimodel \ +arcface_res50_fused_preprocess.cvimodel \ +./data/obama1.jpg \ +./data/obama2.jpg + +## test different people +./bin/cvi_sample_fd_fr_fused_preprocess \ +retinaface_mnet25_600_fused_preprocess_with_detection.cvimodel \ +arcface_res50_fused_preprocess.cvimodel \ +./data/obama1.jpg \ +./data/trump1.jpg +``` \ No newline at end of file diff --git a/cviruntime/samples/samples_extra/insightface_fused_preprocess/face_detector.cpp b/cviruntime/samples/samples_extra/insightface_fused_preprocess/face_detector.cpp new file mode 100644 index 000000000..9e9384783 --- /dev/null +++ b/cviruntime/samples/samples_extra/insightface_fused_preprocess/face_detector.cpp @@ -0,0 +1,75 @@ +#include "face_detector.h" + +FaceDetector::FaceDetector(const char *model_file) { + int ret = CVI_NN_RegisterModel(model_file, &model); + if (ret != CVI_RC_SUCCESS) { + printf("CVI_NN_RegisterModel failed, err %d\n", ret); + exit(1); + } + + // get input output tensors + CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num, &output_tensors, + &output_num); + input = &input_tensors[0]; + output = &output_tensors[0]; + + shape = CVI_NN_TensorShape(input); + height = shape.dim[2]; + width = shape.dim[3]; + scale_w = scale_h = 1.0; +} + +FaceDetector::~FaceDetector() { + if (model) { + CVI_NN_CleanupModel(model); + } +} + +void 
FaceDetector::doPreProccess_ResizeOnly(cv::Mat &image) { + cv::Mat resized_image = image.clone(); + scale_w = 1.0 * width / image.cols; + scale_h = 1.0 * height / image.rows; + cv::resize(image, resized_image, cv::Size(), scale_w, scale_h); + cv::cvtColor(resized_image, resized_image, cv::COLOR_BGR2RGB); + + //Packed2Planar + cv::Mat channels[3]; + for (int i = 0; i < 3; i++) { + channels[i] = cv::Mat(resized_image.rows, resized_image.cols, CV_8SC1); + } + cv::split(resized_image, channels); + + // fill data + int8_t *ptr = (int8_t *)CVI_NN_TensorPtr(input); + int channel_size = height * width; + for (int i = 0; i < 3; ++i) { + memcpy(ptr + i * channel_size, channels[i].data, channel_size); + } +} + +void FaceDetector::doInference() { + CVI_NN_Forward(model, input_tensors, input_num, output_tensors, output_num); +} + +cv::Mat FaceDetector::doPostProccess(void) { + int32_t output_h = output_tensors[0].shape.dim[2]; + int32_t output_w = output_tensors[0].shape.dim[3]; + + printf("output_h = %d, output_w = %d, CVI_NN_TensorSize(output) = %d\n", output_h, output_w, CVI_NN_TensorSize(output)); + cv::Mat dets(output_h, output_w, CV_32FC1); + memcpy(dets.data, CVI_NN_TensorPtr(output), CVI_NN_TensorSize(output)); + + // multiply scale to origin image size + for (int i = 0; i < output_h; ++i) { + dets.at(i, 0) = dets.at(i, 0) / scale_w; + dets.at(i, 1) = dets.at(i, 1) / scale_h; + dets.at(i, 2) = dets.at(i, 2) / scale_w; + dets.at(i, 3) = dets.at(i, 3) / scale_h; + + for (int j = 0; j < 10; j = j + 2) { + dets.at(i, 5 + j) = dets.at(i, 5 + j) / scale_w; + dets.at(i, 6 + j) = dets.at(i, 6 + j) / scale_h; + } + } + return dets; +} diff --git a/cviruntime/samples/samples_extra/insightface_fused_preprocess/face_detector.h b/cviruntime/samples/samples_extra/insightface_fused_preprocess/face_detector.h new file mode 100644 index 000000000..0db4089c5 --- /dev/null +++ b/cviruntime/samples/samples_extra/insightface_fused_preprocess/face_detector.h @@ -0,0 +1,36 @@ +#ifndef FACE_DETECTOR_H +#define FACE_DETECTOR_H + +#include +#include +#include +#include +#include "cviruntime.h" + +class FaceDetector { +public: + FaceDetector(const char *model_file); + ~FaceDetector(); + + void doPreProccess_ResizeOnly(cv::Mat &image); + void doInference(); + cv::Mat doPostProccess(); + +public: + CVI_TENSOR *input; + CVI_TENSOR *output; + +private: + CVI_MODEL_HANDLE model = nullptr; + CVI_TENSOR *input_tensors; + CVI_TENSOR *output_tensors; + int32_t input_num; + int32_t output_num; + CVI_SHAPE shape; + int32_t height; + int32_t width; + float scale_w; + float scale_h; +}; + +#endif diff --git a/cviruntime/samples/samples_extra/insightface_fused_preprocess/face_recognizer.cpp b/cviruntime/samples/samples_extra/insightface_fused_preprocess/face_recognizer.cpp new file mode 100644 index 000000000..d6354117f --- /dev/null +++ b/cviruntime/samples/samples_extra/insightface_fused_preprocess/face_recognizer.cpp @@ -0,0 +1,80 @@ +#include "face_recognizer.h" + + +FaceRecognizer::FaceRecognizer(const char *model_file) { + int ret = CVI_NN_RegisterModel(model_file, &model); + if (ret != CVI_RC_SUCCESS) { + printf("CVI_NN_RegisterModel failed, err %d\n", ret); + exit(1); + } + + // get input output tensors + CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num, + &output_tensors, &output_num); + input = &input_tensors[0]; + output = &output_tensors[0]; + + shape = CVI_NN_TensorShape(input); + height = shape.dim[2]; + width = shape.dim[3]; + +} + +FaceRecognizer::~FaceRecognizer() { + if (model) { + 
CVI_NN_CleanupModel(model); + } +} + +void FaceRecognizer::doPreProccess_ResizeOnly(cv::Mat &image, cv::Mat &det) { + cv::Mat aligned_face = image.clone(); + + float ref_pts[5][2] = { + { 30.2946f, 51.6963f }, + { 65.5318f, 51.5014f }, + { 48.0252f, 71.7366f }, + { 33.5493f, 92.3655f }, + { 62.7299f, 92.2041f } + }; + + cv::Mat ref(5, 2, CV_32FC1, ref_pts); + + float dst_pts[5][2] = { + det.at(0, 5), det.at(0, 6), + det.at(0, 7), det.at(0, 8), + det.at(0, 9), det.at(0, 10), + det.at(0, 11), det.at(0, 12), + det.at(0, 13), det.at(0, 14) + }; + + cv::Mat dst(5, 2, CV_32FC1, dst_pts); + auto m = similarTransform(dst, ref); + cv::warpPerspective(image, aligned_face, m, cv::Size(96, 112), cv::INTER_LINEAR); + cv::resize(aligned_face, aligned_face, cv::Size(112, 112), 0, 0, cv::INTER_LINEAR); + cv::cvtColor(aligned_face, aligned_face, cv::COLOR_BGR2RGB); + + //Packed2Planar + cv::Mat channels[3]; + for (int i = 0; i < 3; i++) { + channels[i] = cv::Mat(aligned_face.rows, aligned_face.cols, CV_8SC1); + } + cv::split(aligned_face, channels); + + // fill data + int8_t *ptr = (int8_t *)CVI_NN_TensorPtr(input); + int channel_size = height * width; + for (int i = 0; i < 3; ++i) { + memcpy(ptr + i * channel_size, channels[i].data, channel_size); + } +} + +void FaceRecognizer::doInference() { + // run inference + CVI_NN_Forward(model, input_tensors, input_num, output_tensors, output_num); +} + +cv::Mat FaceRecognizer::doPostProccess() { + cv::Mat feature(512, 1, CV_32FC1); + memcpy(feature.data, CVI_NN_TensorPtr(output), CVI_NN_TensorSize(output)); + return feature; +} \ No newline at end of file diff --git a/cviruntime/samples/samples_extra/insightface_fused_preprocess/face_recognizer.h b/cviruntime/samples/samples_extra/insightface_fused_preprocess/face_recognizer.h new file mode 100644 index 000000000..0015bf646 --- /dev/null +++ b/cviruntime/samples/samples_extra/insightface_fused_preprocess/face_recognizer.h @@ -0,0 +1,37 @@ +#ifndef FACE_RECOGNIZER_H +#define FACE_RECOGNIZER_H + +#include +#include +#include +#include +#include "cviruntime.h" + + +cv::Mat similarTransform(cv::Mat src, cv::Mat dst); + +class FaceRecognizer { +public: + FaceRecognizer(const char *model_file); + ~FaceRecognizer(); + + void doPreProccess_ResizeOnly(cv::Mat &image, cv::Mat &det); + void doInference(); + cv::Mat doPostProccess(); + +public: + CVI_TENSOR *input; + CVI_TENSOR *output; + +private: + CVI_MODEL_HANDLE model = nullptr; + CVI_TENSOR *input_tensors; + CVI_TENSOR *output_tensors; + int32_t input_num; + int32_t output_num; + CVI_SHAPE shape; + int32_t height; + int32_t width; +}; + +#endif diff --git a/cviruntime/samples/samples_extra/insightface_fused_preprocess/face_transform.cpp b/cviruntime/samples/samples_extra/insightface_fused_preprocess/face_transform.cpp new file mode 100644 index 000000000..06884e2d8 --- /dev/null +++ b/cviruntime/samples/samples_extra/insightface_fused_preprocess/face_transform.cpp @@ -0,0 +1,126 @@ +//================================================================ +// This file from +// https://github.com/deepinsight/insightface/tree/master/cpp-align +//================================================================ + +// Created by Jack Yu on 23/03/2018. 
+// +#include + +static cv::Mat meanAxis0(const cv::Mat &src) { + int num = src.rows; + int dim = src.cols; + + // x1 y1 + // x2 y2 + + cv::Mat output(1, dim, CV_32F); + for (int i = 0; i < dim; i++) { + float sum = 0; + for (int j = 0; j < num; j++) { + sum += src.at(j, i); + } + output.at(0, i) = sum / num; + } + + return output; +} + +static cv::Mat elementwiseMinus(const cv::Mat &A, const cv::Mat &B) { + cv::Mat output(A.rows, A.cols, A.type()); + + assert(B.cols == A.cols); + if (B.cols == A.cols) { + for (int i = 0; i < A.rows; i++) { + for (int j = 0; j < B.cols; j++) { + output.at(i, j) = A.at(i, j) - B.at(0, j); + } + } + } + return output; +} + +static cv::Mat varAxis0(const cv::Mat &src) { + cv::Mat temp_ = elementwiseMinus(src, meanAxis0(src)); + cv::multiply(temp_, temp_, temp_); + return meanAxis0(temp_); +} + +static int MatrixRank(cv::Mat M) { + cv::Mat w, u, vt; + cv::SVD::compute(M, w, u, vt); + cv::Mat1b nonZeroSingularValues = w > 0.0001; + int rank = cv::countNonZero(nonZeroSingularValues); + return rank; +} + +// References +// ---------- +// .. [1] "Least-squares estimation of transformation parameters between two +// point patterns", Shinji Umeyama, PAMI 1991, DOI: 10.1109/34.88573 +// +// """ +// +// Anthor:Jack Yu +cv::Mat similarTransform(cv::Mat src, cv::Mat dst) { + int num = src.rows; + int dim = src.cols; + cv::Mat src_mean = meanAxis0(src); + cv::Mat dst_mean = meanAxis0(dst); + cv::Mat src_demean = elementwiseMinus(src, src_mean); + cv::Mat dst_demean = elementwiseMinus(dst, dst_mean); + cv::Mat A = (dst_demean.t() * src_demean) / static_cast(num); + cv::Mat d(dim, 1, CV_32F); + d.setTo(1.0f); + if (cv::determinant(A) < 0) { + d.at(dim - 1, 0) = -1; + } + cv::Mat T = cv::Mat::eye(dim + 1, dim + 1, CV_32F); + cv::Mat U, S, V; + cv::SVD::compute(A, S, U, V); + + // the SVD function in opencv differ from scipy . 
+
+  int rank = MatrixRank(A);
+  if (rank == 0) {
+    assert(rank == 0);
+
+  } else if (rank == dim - 1) {
+    if (cv::determinant(U) * cv::determinant(V) > 0) {
+      T.rowRange(0, dim).colRange(0, dim) = U * V;
+    } else {
+      // s = d[dim - 1]
+      // d[dim - 1] = -1
+      // T[:dim, :dim] = np.dot(U, np.dot(np.diag(d), V))
+      // d[dim - 1] = s
+      int s = d.at<float>(dim - 1, 0) = -1;
+      d.at<float>(dim - 1, 0) = -1;
+
+      T.rowRange(0, dim).colRange(0, dim) = U * V;
+      cv::Mat diag_ = cv::Mat::diag(d);
+      cv::Mat twp = diag_ * V; // np.dot(np.diag(d), V.T)
+      cv::Mat B = cv::Mat::zeros(3, 3, CV_8UC1);
+      cv::Mat C = B.diag(0);
+      T.rowRange(0, dim).colRange(0, dim) = U * twp;
+      d.at<float>(dim - 1, 0) = s;
+    }
+  } else {
+    cv::Mat diag_ = cv::Mat::diag(d);
+    cv::Mat twp = diag_ * V.t(); // np.dot(np.diag(d), V.T)
+    cv::Mat res = U * twp;       // U
+    T.rowRange(0, dim).colRange(0, dim) = -U.t() * twp;
+  }
+  cv::Mat var_ = varAxis0(src_demean);
+  float val = cv::sum(var_).val[0];
+  cv::Mat res;
+  cv::multiply(d, S, res);
+  float scale = 1.0 / val * cv::sum(res).val[0];
+  T.rowRange(0, dim).colRange(0, dim) = -T.rowRange(0, dim).colRange(0, dim).t();
+  cv::Mat temp1 = T.rowRange(0, dim).colRange(0, dim); // T[:dim, :dim]
+  cv::Mat temp2 = src_mean.t();                        // src_mean.T
+  cv::Mat temp3 = temp1 * temp2; // np.dot(T[:dim, :dim], src_mean.T)
+  cv::Mat temp4 = scale * temp3;
+  T.rowRange(0, dim).colRange(dim, dim + 1) = -(temp4 - dst_mean.t());
+  T.rowRange(0, dim).colRange(0, dim) *= scale;
+  return T;
+}
diff --git a/cviruntime/samples/samples_extra/insightface_fused_preprocess/fd_fr_compare_similarity_fused_preprocess.cpp b/cviruntime/samples/samples_extra/insightface_fused_preprocess/fd_fr_compare_similarity_fused_preprocess.cpp
new file mode 100644
index 000000000..d0411e8b0
--- /dev/null
+++ b/cviruntime/samples/samples_extra/insightface_fused_preprocess/fd_fr_compare_similarity_fused_preprocess.cpp
@@ -0,0 +1,55 @@
+#include <stdio.h>
+#include <iostream>
+#include <opencv2/opencv.hpp>
+#include "face_detector.h"
+#include "face_recognizer.h"
+
+static float cal_similarity(cv::Mat feature1, cv::Mat feature2) {
+  return feature1.dot(feature2) / (cv::norm(feature1) * cv::norm(feature2));
+}
+
+int main(int argc, char *argv[]) {
+  // Assume test jpg only has one face per image
+  if (argc != 5) {
+    std::cout << "Usage: " << argv[0] << " fd.cvimodel fr.cvimodel image1.jpg image2.jpg"
+              << std::endl;
+    return -1;
+  }
+
+  cv::Mat image1 = cv::imread(argv[3]);
+  if (!image1.data) {
+    std::cout << "Can not find or open image: " << argv[3] << std::endl;
+    return -1;
+  }
+
+  cv::Mat image2 = cv::imread(argv[4]);
+  if (!image2.data) {
+    std::cout << "Can not find or open image: " << argv[4] << std::endl;
+    return -1;
+  }
+
+  FaceDetector fd(argv[1]);
+  FaceRecognizer fr(argv[2]);
+
+  fd.doPreProccess_ResizeOnly(image1);
+  fd.doInference();
+  auto det1 = fd.doPostProccess();
+
+  fr.doPreProccess_ResizeOnly(image1, det1);
+  fr.doInference();
+  auto feature1 = fr.doPostProccess();
+
+  fd.doPreProccess_ResizeOnly(image2);
+  fd.doInference();
+  auto det2 = fd.doPostProccess();
+  fr.doPreProccess_ResizeOnly(image2, det2);
+  fr.doInference();
+  auto feature2 = fr.doPostProccess();
+
+  float similarity = cal_similarity(feature1, feature2);
+
+  printf("------\n");
+  printf("Similarity: %f\n", similarity);
+  printf("------\n");
+
+  return 0;
+}
diff --git a/cviruntime/samples/samples_extra/insightface_mem_cb/CMakeLists.txt b/cviruntime/samples/samples_extra/insightface_mem_cb/CMakeLists.txt
new file mode 100644
index 000000000..5ffae4c77
--- /dev/null
+++
b/cviruntime/samples/samples_extra/insightface_mem_cb/CMakeLists.txt @@ -0,0 +1,43 @@ +cmake_minimum_required(VERSION 2.8.0) + +project(cvi_sample_mem_cb C CXX) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS ON) + +if(NOT DEFINED TPU_SDK_PATH) + message(FATAL_ERROR "Please set TPU_SDK_PATH to point to the TPU_SDK installation") +endif() +include_directories(${TPU_SDK_PATH}/include) +link_directories(${TPU_SDK_PATH}/lib) + +if(NOT DEFINED OPENCV_PATH) + message(FATAL_ERROR "Please set OPENCV_PATH to point to the opencvn installation") +endif() +include_directories(${OPENCV_PATH}/include) +link_directories(${OPENCV_PATH}/lib) + +set(CVI_LIBS ${CVI_LIBS} cviruntime cvikernel) +if(NOT CMAKE_CROSSCOMPILING) + set(CVI_LIBS ${CVI_LIBS} cvicmodel) +endif() + +set(OPENCV_LIBS ${OPENCV_LIBS} opencv_core opencv_imgcodecs opencv_imgproc) +if(NOT CMAKE_CROSSCOMPILING) + set(OPENCV_LIBS ${OPENCV_LIBS} opencv_highgui) +endif() + +set(EXTRA_LIBS ${EXTRA_LIBS} dl stdc++ pthread z) + +add_executable(cvi_sample_mem_cb + face_detector.cpp + face_recognizer.cpp + face_transform.cpp + fd_fr_compare_similarity_with_mem_cb.cpp) +target_link_libraries(cvi_sample_mem_cb + ${CVI_LIBS} + ${OPENCV_LIBS} + ${EXTRA_LIBS}) +install(TARGETS cvi_sample_mem_cb + cvi_sample_mem_cb DESTINATION samples_extra/bin) diff --git a/cviruntime/samples/samples_extra/insightface_mem_cb/face_detector.cpp b/cviruntime/samples/samples_extra/insightface_mem_cb/face_detector.cpp new file mode 100644 index 000000000..3f505a14b --- /dev/null +++ b/cviruntime/samples/samples_extra/insightface_mem_cb/face_detector.cpp @@ -0,0 +1,89 @@ +#include "face_detector.h" + +FaceDetector::FaceDetector(const char *model_file, bool nhwc) { + int ret = CVI_NN_RegisterModel(model_file, &model); + if (ret != CVI_RC_SUCCESS) { + printf("CVI_NN_RegisterModel failed, err %d\n", ret); + exit(1); + } + + // get input output tensors + CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num, &output_tensors, + &output_num); + input = &input_tensors[0]; + output = &output_tensors[0]; + + qscale = CVI_NN_TensorQuantScale(input); + shape = CVI_NN_TensorShape(input); + if (nhwc) { + height = shape.dim[1]; + width = shape.dim[2]; + } else { + height = shape.dim[2]; + width = shape.dim[3]; + } + scale_w = scale_h = 1.0; +} + +FaceDetector::~FaceDetector() { + if (model) { + CVI_NN_CleanupModel(model); + } +} + +void FaceDetector::doPreProccess(cv::Mat &image) { + cv::Mat resized_image; + scale_w = 1.0 * width / image.cols; + scale_h = 1.0 * height / image.rows; + cv::resize(image, resized_image, cv::Size(), scale_w, scale_h); + // split + cv::Mat channels[3]; + for (int i = 0; i < 3; i++) { + channels[i] = cv::Mat(height, width, CV_8SC1); + } + cv::split(resized_image, channels); + // normalize + for (int i = 0; i < 3; i++) { + channels[i].convertTo(channels[i], CV_8SC1, qscale); + } + // BGR -> RGB & fill data + int8_t *ptr = (int8_t *)CVI_NN_TensorPtr(input); + int channel_size = height * width; + memcpy(ptr + 2 * channel_size, channels[0].data, channel_size); + memcpy(ptr + channel_size, channels[1].data, channel_size); + memcpy(ptr, channels[2].data, channel_size); +} + +void FaceDetector::doPreProccess_ResizeOnly(cv::Mat &image) { + cv::Mat resized_image; + scale_w = 1.0 * width / image.cols; + scale_h = 1.0 * height / image.rows; + cv::resize(image, resized_image, cv::Size(), scale_w, scale_h); + memcpy(CVI_NN_TensorPtr(input), resized_image.data, CVI_NN_TensorSize(input)); +} + +void FaceDetector::doInference() { 
+  CVI_NN_Forward(model, input_tensors, input_num, output_tensors, output_num);
+}
+
+cv::Mat FaceDetector::doPostProccess(cv::Mat &image) {
+  int32_t output_h = output_tensors[0].shape.dim[2];
+  int32_t output_w = output_tensors[0].shape.dim[3];
+
+  cv::Mat dets(output_h, output_w, CV_32FC1);
+  memcpy(dets.data, CVI_NN_TensorPtr(output), CVI_NN_TensorSize(output));
+
+  // map boxes and landmarks back to the original image size
+  for (int i = 0; i < output_h; ++i) {
+    dets.at<float>(i, 0) = dets.at<float>(i, 0) / scale_w;
+    dets.at<float>(i, 1) = dets.at<float>(i, 1) / scale_h;
+    dets.at<float>(i, 2) = dets.at<float>(i, 2) / scale_w;
+    dets.at<float>(i, 3) = dets.at<float>(i, 3) / scale_h;
+
+    for (int j = 0; j < 10; j = j + 2) {
+      dets.at<float>(i, 5 + j) = dets.at<float>(i, 5 + j) / scale_w;
+      dets.at<float>(i, 6 + j) = dets.at<float>(i, 6 + j) / scale_h;
+    }
+  }
+  return dets;
+}
diff --git a/cviruntime/samples/samples_extra/insightface_mem_cb/face_detector.h b/cviruntime/samples/samples_extra/insightface_mem_cb/face_detector.h
new file mode 100644
index 000000000..6d2b140e5
--- /dev/null
+++ b/cviruntime/samples/samples_extra/insightface_mem_cb/face_detector.h
@@ -0,0 +1,38 @@
+#ifndef FACE_DETECTOR_H
+#define FACE_DETECTOR_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <opencv2/opencv.hpp>
+#include "cviruntime.h"
+
+class FaceDetector {
+public:
+  FaceDetector(const char *model_file, bool nhwc = false);
+  ~FaceDetector();
+
+  void doPreProccess(cv::Mat &image);
+  void doPreProccess_ResizeOnly(cv::Mat &image);
+  void doInference();
+  cv::Mat doPostProccess(cv::Mat &image);
+
+public:
+  CVI_TENSOR *input;
+  CVI_TENSOR *output;
+
+private:
+  CVI_MODEL_HANDLE model = nullptr;
+  CVI_TENSOR *input_tensors;
+  CVI_TENSOR *output_tensors;
+  int32_t input_num;
+  int32_t output_num;
+  CVI_SHAPE shape;
+  int32_t height;
+  int32_t width;
+  float scale_w;
+  float scale_h;
+  float qscale;
+};
+
+#endif
diff --git a/cviruntime/samples/samples_extra/insightface_mem_cb/face_recognizer.cpp b/cviruntime/samples/samples_extra/insightface_mem_cb/face_recognizer.cpp
new file mode 100644
index 000000000..e5e47812d
--- /dev/null
+++ b/cviruntime/samples/samples_extra/insightface_mem_cb/face_recognizer.cpp
@@ -0,0 +1,116 @@
+#include "face_recognizer.h"
+
+
+FaceRecognizer::FaceRecognizer(const char *model_file, bool nhwc) {
+  int ret = CVI_NN_RegisterModel(model_file, &model);
+  if (ret != CVI_RC_SUCCESS) {
+    printf("CVI_NN_RegisterModel failed, err %d\n", ret);
+    exit(1);
+  }
+
+  // get input output tensors
+  CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num,
+                               &output_tensors, &output_num);
+  input = &input_tensors[0];
+  output = &output_tensors[0];
+
+  qscale = CVI_NN_TensorQuantScale(input);
+  shape = CVI_NN_TensorShape(input);
+  if (nhwc) {
+    height = shape.dim[1];
+    width = shape.dim[2];
+  } else {
+    height = shape.dim[2];
+    width = shape.dim[3];
+  }
+}
+
+FaceRecognizer::~FaceRecognizer() {
+  if (model) {
+    CVI_NN_CleanupModel(model);
+  }
+}
+
+void FaceRecognizer::doPreProccess(cv::Mat &image, cv::Mat &det) {
+  cv::Mat aligned_face = image.clone();
+  float ref_pts[5][2] = {
+    { 30.2946f, 51.6963f },
+    { 65.5318f, 51.5014f },
+    { 48.0252f, 71.7366f },
+    { 33.5493f, 92.3655f },
+    { 62.7299f, 92.2041f }
+  };
+
+  cv::Mat ref(5, 2, CV_32FC1, ref_pts);
+
+  float dst_pts[5][2] = {
+    det.at<float>(0, 5),  det.at<float>(0, 6),
+    det.at<float>(0, 7),  det.at<float>(0, 8),
+    det.at<float>(0, 9),  det.at<float>(0, 10),
+    det.at<float>(0, 11), det.at<float>(0, 12),
+    det.at<float>(0, 13), det.at<float>(0, 14)
+  };
+  cv::Mat dst(5, 2, CV_32FC1, dst_pts);
+
+  auto m = similarTransform(dst, ref);
+  cv::warpPerspective(image, aligned_face, m, cv::Size(96, 112), cv::INTER_LINEAR);
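+  // the 5-point template above is defined on a 96x112 crop; scale up to the
+  // 112x112 input expected by the recognizer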
+  cv::resize(aligned_face, aligned_face, cv::Size(112, 112), 0, 0, cv::INTER_LINEAR);
+  // split
+  cv::Mat channels[3];
+  for (int i = 0; i < 3; i++) {
+    channels[i] = cv::Mat(height, width, CV_8SC1);
+  }
+  cv::split(aligned_face, channels);
+  // normalize
+  float input_scale = 1 / 128.0;
+  float scale = input_scale * qscale;
+  float mean = -127.5 * scale;
+  for (int i = 0; i < 3; i++) {
+    channels[i].convertTo(channels[i], CV_8SC1, scale, mean);
+  }
+  // BGR fill data
+  int8_t *ptr = (int8_t *)CVI_NN_TensorPtr(input);
+  int channel_size = height * width;
+  memcpy(ptr + 2 * channel_size, channels[0].data, channel_size);
+  memcpy(ptr + channel_size, channels[1].data, channel_size);
+  memcpy(ptr, channels[2].data, channel_size);
+}
+
+void FaceRecognizer::doPreProccess_ResizeOnly(cv::Mat &image, cv::Mat &det) {
+  cv::Mat aligned_face = image.clone();
+  float ref_pts[5][2] = {
+    { 30.2946f, 51.6963f },
+    { 65.5318f, 51.5014f },
+    { 48.0252f, 71.7366f },
+    { 33.5493f, 92.3655f },
+    { 62.7299f, 92.2041f }
+  };
+
+  cv::Mat ref(5, 2, CV_32FC1, ref_pts);
+
+  float dst_pts[5][2] = {
+    det.at<float>(0, 5),  det.at<float>(0, 6),
+    det.at<float>(0, 7),  det.at<float>(0, 8),
+    det.at<float>(0, 9),  det.at<float>(0, 10),
+    det.at<float>(0, 11), det.at<float>(0, 12),
+    det.at<float>(0, 13), det.at<float>(0, 14)
+  };
+
+  cv::Mat dst(5, 2, CV_32FC1, dst_pts);
+
+  auto m = similarTransform(dst, ref);
+  cv::warpPerspective(image, aligned_face, m, cv::Size(96, 112), cv::INTER_LINEAR);
+  cv::resize(aligned_face, aligned_face, cv::Size(112, 112), 0, 0, cv::INTER_LINEAR);
+  memcpy(CVI_NN_TensorPtr(input), aligned_face.data, CVI_NN_TensorSize(input));
+}
+
+void FaceRecognizer::doInference() {
+  // run inference
+  CVI_NN_Forward(model, input_tensors, input_num, output_tensors, output_num);
+}
+
+cv::Mat FaceRecognizer::doPostProccess() {
+  cv::Mat feature(512, 1, CV_32FC1);
+  memcpy(feature.data, CVI_NN_TensorPtr(output), CVI_NN_TensorSize(output));
+  return feature;
+}
diff --git a/cviruntime/samples/samples_extra/insightface_mem_cb/face_recognizer.h b/cviruntime/samples/samples_extra/insightface_mem_cb/face_recognizer.h
new file mode 100644
index 000000000..28b00d427
--- /dev/null
+++ b/cviruntime/samples/samples_extra/insightface_mem_cb/face_recognizer.h
@@ -0,0 +1,39 @@
+#ifndef FACE_RECOGNIZER_H
+#define FACE_RECOGNIZER_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <opencv2/opencv.hpp>
+#include "cviruntime.h"
+
+
+cv::Mat similarTransform(cv::Mat src, cv::Mat dst);
+
+class FaceRecognizer {
+public:
+  FaceRecognizer(const char *model_file, bool nhwc = false);
+  ~FaceRecognizer();
+
+  void doPreProccess(cv::Mat &image, cv::Mat &det);
+  void doPreProccess_ResizeOnly(cv::Mat &image, cv::Mat &det);
+  void doInference();
+  cv::Mat doPostProccess();
+
+public:
+  CVI_TENSOR *input;
+  CVI_TENSOR *output;
+
+private:
+  CVI_MODEL_HANDLE model = nullptr;
+  CVI_TENSOR *input_tensors;
+  CVI_TENSOR *output_tensors;
+  int32_t input_num;
+  int32_t output_num;
+  CVI_SHAPE shape;
+  int32_t height;
+  int32_t width;
+  float qscale;
+};
+
+#endif
diff --git a/cviruntime/samples/samples_extra/insightface_mem_cb/face_transform.cpp b/cviruntime/samples/samples_extra/insightface_mem_cb/face_transform.cpp
new file mode 100644
index 000000000..06884e2d8
--- /dev/null
+++ b/cviruntime/samples/samples_extra/insightface_mem_cb/face_transform.cpp
@@ -0,0 +1,126 @@
+//================================================================
+// This file from
+// https://github.com/deepinsight/insightface/tree/master/cpp-align
+//================================================================
+
+// Created by Jack Yu on 23/03/2018.
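+// Implements the Umeyama similarity transform used to map the detected
+// 5-point landmarks onto the canonical template before recognition.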
+//
+#include <opencv2/opencv.hpp>
+
+static cv::Mat meanAxis0(const cv::Mat &src) {
+  int num = src.rows;
+  int dim = src.cols;
+
+  // x1 y1
+  // x2 y2
+
+  cv::Mat output(1, dim, CV_32F);
+  for (int i = 0; i < dim; i++) {
+    float sum = 0;
+    for (int j = 0; j < num; j++) {
+      sum += src.at<float>(j, i);
+    }
+    output.at<float>(0, i) = sum / num;
+  }
+
+  return output;
+}
+
+static cv::Mat elementwiseMinus(const cv::Mat &A, const cv::Mat &B) {
+  cv::Mat output(A.rows, A.cols, A.type());
+
+  assert(B.cols == A.cols);
+  if (B.cols == A.cols) {
+    for (int i = 0; i < A.rows; i++) {
+      for (int j = 0; j < B.cols; j++) {
+        output.at<float>(i, j) = A.at<float>(i, j) - B.at<float>(0, j);
+      }
+    }
+  }
+  return output;
+}
+
+static cv::Mat varAxis0(const cv::Mat &src) {
+  cv::Mat temp_ = elementwiseMinus(src, meanAxis0(src));
+  cv::multiply(temp_, temp_, temp_);
+  return meanAxis0(temp_);
+}
+
+static int MatrixRank(cv::Mat M) {
+  cv::Mat w, u, vt;
+  cv::SVD::compute(M, w, u, vt);
+  cv::Mat1b nonZeroSingularValues = w > 0.0001;
+  int rank = cv::countNonZero(nonZeroSingularValues);
+  return rank;
+}
+
+// References
+// ----------
+// .. [1] "Least-squares estimation of transformation parameters between two
+//        point patterns", Shinji Umeyama, PAMI 1991, DOI: 10.1109/34.88573
+//
+// """
+//
+// Author: Jack Yu
+cv::Mat similarTransform(cv::Mat src, cv::Mat dst) {
+  int num = src.rows;
+  int dim = src.cols;
+  cv::Mat src_mean = meanAxis0(src);
+  cv::Mat dst_mean = meanAxis0(dst);
+  cv::Mat src_demean = elementwiseMinus(src, src_mean);
+  cv::Mat dst_demean = elementwiseMinus(dst, dst_mean);
+  cv::Mat A = (dst_demean.t() * src_demean) / static_cast<float>(num);
+  cv::Mat d(dim, 1, CV_32F);
+  d.setTo(1.0f);
+  if (cv::determinant(A) < 0) {
+    d.at<float>(dim - 1, 0) = -1;
+  }
+  cv::Mat T = cv::Mat::eye(dim + 1, dim + 1, CV_32F);
+  cv::Mat U, S, V;
+  cv::SVD::compute(A, S, U, V);
+
+  // the SVD function in OpenCV differs from scipy.
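+  // (cv::SVD::compute returns the right singular vectors already transposed,
+  // so V here plays the role of numpy's vh; U * V below therefore matches the
+  // np.dot(U, V) of the numpy/scikit-image reference implementation)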
+
+  int rank = MatrixRank(A);
+  if (rank == 0) {
+    assert(rank == 0);
+
+  } else if (rank == dim - 1) {
+    if (cv::determinant(U) * cv::determinant(V) > 0) {
+      T.rowRange(0, dim).colRange(0, dim) = U * V;
+    } else {
+      // s = d[dim - 1]
+      // d[dim - 1] = -1
+      // T[:dim, :dim] = np.dot(U, np.dot(np.diag(d), V))
+      // d[dim - 1] = s
+      int s = d.at<float>(dim - 1, 0) = -1;
+      d.at<float>(dim - 1, 0) = -1;
+
+      T.rowRange(0, dim).colRange(0, dim) = U * V;
+      cv::Mat diag_ = cv::Mat::diag(d);
+      cv::Mat twp = diag_ * V; // np.dot(np.diag(d), V.T)
+      cv::Mat B = cv::Mat::zeros(3, 3, CV_8UC1);
+      cv::Mat C = B.diag(0);
+      T.rowRange(0, dim).colRange(0, dim) = U * twp;
+      d.at<float>(dim - 1, 0) = s;
+    }
+  } else {
+    cv::Mat diag_ = cv::Mat::diag(d);
+    cv::Mat twp = diag_ * V.t(); // np.dot(np.diag(d), V.T)
+    cv::Mat res = U * twp; // U
+    T.rowRange(0, dim).colRange(0, dim) = -U.t() * twp;
+  }
+  cv::Mat var_ = varAxis0(src_demean);
+  float val = cv::sum(var_).val[0];
+  cv::Mat res;
+  cv::multiply(d, S, res);
+  float scale = 1.0 / val * cv::sum(res).val[0];
+  T.rowRange(0, dim).colRange(0, dim) = -T.rowRange(0, dim).colRange(0, dim).t();
+  cv::Mat temp1 = T.rowRange(0, dim).colRange(0, dim); // T[:dim, :dim]
+  cv::Mat temp2 = src_mean.t(); // src_mean.T
+  cv::Mat temp3 = temp1 * temp2; // np.dot(T[:dim, :dim], src_mean.T)
+  cv::Mat temp4 = scale * temp3;
+  T.rowRange(0, dim).colRange(dim, dim + 1) = -(temp4 - dst_mean.t());
+  T.rowRange(0, dim).colRange(0, dim) *= scale;
+  return T;
+}
diff --git a/cviruntime/samples/samples_extra/insightface_mem_cb/fd_fr_compare_similarity_with_mem_cb.cpp b/cviruntime/samples/samples_extra/insightface_mem_cb/fd_fr_compare_similarity_with_mem_cb.cpp
new file mode 100644
index 000000000..1d32ef11c
--- /dev/null
+++ b/cviruntime/samples/samples_extra/insightface_mem_cb/fd_fr_compare_similarity_with_mem_cb.cpp
@@ -0,0 +1,65 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+#include "face_detector.h"
+#include "face_recognizer.h"
+#include "mem_alloc.hpp"
+
+static float cal_similarity(cv::Mat feature1, cv::Mat feature2) {
+  return feature1.dot(feature2) / (cv::norm(feature1) * cv::norm(feature2));
+}
+
+int main(int argc, char *argv[]) {
+  // Assume each test jpg contains only one face
+  if (argc < 6) {
+    std::cout << "Usage: " << argv[0] << " fd.cvimodel fr.cvimodel image1.jpg image2.jpg use_mem_cb"
+              << std::endl;
+    return -1;
+  }
+
+  cv::Mat image1 = cv::imread(argv[3]);
+  if (!image1.data) {
+    std::cout << "Can not find or open image: " << argv[3] << std::endl;
+    return -1;
+  }
+
+  cv::Mat image2 = cv::imread(argv[4]);
+  if (!image2.data) {
+    std::cout << "Can not find or open image: " << argv[4] << std::endl;
+    return -1;
+  }
+
+  int use_mem_cb = atoi(argv[5]);
+  if (1 == use_mem_cb) {
+    // route all runtime device-memory allocations through our pool
+    if (0 != CVI_RT_Global_SetMemAllocCallback(mem_alloc, mem_free)) {
+      printf("bind alloc func failed\n");
+      return -1;
+    }
+  }
+
+  FaceDetector fd(argv[1]);
+  FaceRecognizer fr(argv[2]);
+  print_mem();
+
+  fd.doPreProccess(image1);
+  fd.doInference();
+  auto det1 = fd.doPostProccess(image1);
+  fr.doPreProccess(image1, det1);
+  fr.doInference();
+  auto feature1 = fr.doPostProccess();
+
+  fd.doPreProccess(image2);
+  fd.doInference();
+  auto det2 = fd.doPostProccess(image2);
+  fr.doPreProccess(image2, det2);
+  fr.doInference();
+  auto feature2 = fr.doPostProccess();
+
+  float similarity = cal_similarity(feature1, feature2);
+
+  printf("------\n");
+  printf("Similarity: %f\n", similarity);
+  printf("------\n");
+
+  return 0;
+}
diff --git a/cviruntime/samples/samples_extra/insightface_mem_cb/mem_alloc.hpp b/cviruntime/samples/samples_extra/insightface_mem_cb/mem_alloc.hpp
new file mode 100644
index 000000000..547333937
--- /dev/null
+++ b/cviruntime/samples/samples_extra/insightface_mem_cb/mem_alloc.hpp
@@ -0,0 +1,132 @@
+#pragma once
+#include <stdio.h>
+#include <iostream>
+#include <list>
+#include <mutex>
+#include <string>
+#include "cviruntime_context.h"
+
+struct Node {
+  CVI_RT_MEM mem;
+  uint64_t size;
+  std::string name;
+  CVI_ALLOC_TYPE type;
+  bool used;
+};
+
+#define MEM_ALLOC_SIZE (120 * 1024 * 1024)
+#define MEM_MIN_SIZE 1024
+
+class MemAllocate {
+public:
+  MemAllocate() : init_(false),
+                  size_(MEM_ALLOC_SIZE),
+                  min_size_(MEM_MIN_SIZE),
+                  mem_buf_(nullptr),
+                  ctx_(nullptr) {}
+
+  ~MemAllocate() {
+    for (auto &node : mem_list_) {
+      CVI_RT_MemFree(ctx_, node.mem);
+    }
+    mem_list_.clear();
+    if (mem_buf_) {
+      CVI_RT_MemFree(ctx_, mem_buf_);
+      mem_buf_ = nullptr;
+    }
+    if (ctx_) {
+      CVI_RT_DeInit(ctx_);
+      ctx_ = nullptr;
+    }
+  }
+
+  int init() {
+    if (init_) {
+      return 0;
+    }
+    if (0 != CVI_RT_Init(&ctx_)) {
+      printf("CVI_RT_Init failed\n");
+      return -1;
+    }
+    // reserve one large buffer, then pre-split it into a list of blocks,
+    // each half the remaining size, to serve later allocations from the pool
+    mem_buf_ = CVI_RT_MemAlloc(ctx_, size_);
+    if (nullptr == mem_buf_) {
+      printf("CVI_RT_MemAlloc failed\n");
+      return -1;
+    }
+    uint64_t remain_size = size_;
+    int offset = 0;
+    while (remain_size > min_size_) {
+      uint64_t cur_size = remain_size >> 1;
+      CVI_RT_MEM node_mem = CVI_RT_MemPreAlloc(mem_buf_, offset, cur_size);
+      Node node;
+      node.mem = node_mem;
+      node.size = CVI_RT_MemGetSize(node_mem);
+      node.used = false;
+      mem_list_.emplace_front(node);
+      offset += cur_size;
+      remain_size -= cur_size;
+      std::cout << "alloc mem block:" << cur_size << std::endl;
+    }
+    init_ = true;
+    return 0;
+  }
+  CVI_RT_MEM alloc(CVI_RT_HANDLE ctx, uint64_t size, CVI_ALLOC_TYPE type, const char *name) {
+    std::unique_lock<std::mutex> lk(mutex_);
+    init();
+    // first-fit search over the pre-split blocks
+    for (auto iter = mem_list_.begin(); iter != mem_list_.end(); ++iter) {
+      auto &node = *iter;
+      if (!node.used && node.size >= size) {
+        node.used = true;
+        node.type = type;
+        node.name = name;
+        std::cout << "alloc buf size:" << size << " node size:" << node.size << std::endl;
+        return node.mem;
+      }
+    }
+    return nullptr;
+  }
+  void free(CVI_RT_HANDLE ctx, CVI_RT_MEM mem) {
+    std::unique_lock<std::mutex> lk(mutex_);
+    for (auto &node : mem_list_) {
+      if (node.mem == mem) {
+        node.used = false;
+        node.type = CVI_ALLOC_UNKNOWN;
+        node.name = "";
+        std::cout << "free buf size:" << node.size << std::endl;
+        return;
+      }
+    }
+    // not one of our pooled blocks; hand it back to the runtime
+    return CVI_RT_MemFree(ctx, mem);
+  }
+
+  void print_stat() {
+    std::cout << "*************print mem***************" << std::endl;
+    std::cout << "name\t\t\t\t\t type\t\t\t\t\t size\t\t\t" << std::endl;
+    std::unique_lock<std::mutex> lk(mutex_);
+    for (auto &node : mem_list_) {
+      if (node.used) {
+        std::cout << node.name << "\t\t\t\t" << node.type << "\t\t\t\t" << node.size << std::endl;
+      }
+    }
+    std::cout << "*************end print mem***************" << std::endl;
+  }
+private:
+  bool init_;
+  uint64_t size_;
+  uint64_t min_size_;
+  std::list<Node> mem_list_;
+  CVI_RT_MEM mem_buf_;
+  CVI_RT_HANDLE ctx_;
+  std::mutex mutex_;
+};
+
+static MemAllocate gMemAllocate;
+CVI_RT_MEM mem_alloc(CVI_RT_HANDLE ctx, uint64_t size, CVI_ALLOC_TYPE type, const char *name) {
+  return gMemAllocate.alloc(ctx, size, type, name);
+}
+
+void mem_free(CVI_RT_HANDLE ctx, CVI_RT_MEM mem) {
+  return gMemAllocate.free(ctx, mem);
+}
+
+void print_mem() {
+  return gMemAllocate.print_stat();
+}
\ No newline at end of file
diff --git a/cviruntime/samples/samples_extra/insightface_vpss/CMakeLists.txt b/cviruntime/samples/samples_extra/insightface_vpss/CMakeLists.txt
new file mode 100644
index 000000000..bba897092
--- /dev/null
+++ b/cviruntime/samples/samples_extra/insightface_vpss/CMakeLists.txt
@@ -0,0 +1,66 @@
+cmake_minimum_required(VERSION 2.8.0)
+
+project(insightface_vpss C CXX)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS ON)
+
+if(NOT DEFINED TPU_SDK_PATH)
+  message(FATAL_ERROR "Please set TPU_SDK_PATH to point to the TPU_SDK installation")
+endif()
+include_directories(${TPU_SDK_PATH}/include)
+link_directories(${TPU_SDK_PATH}/lib)
+
+if(NOT DEFINED OPENCV_PATH)
+  message(FATAL_ERROR "Please set OPENCV_PATH to point to the OpenCV installation")
+endif()
+include_directories(${OPENCV_PATH}/include)
+link_directories(${OPENCV_PATH}/lib)
+
+if(NOT DEFINED CHIP)
+  message(FATAL_ERROR "Please set CHIP to 183x or 182x")
+endif()
+
+if(NOT DEFINED MW_PATH)
+  message(FATAL_ERROR "Please set MW_PATH to point to the middleware installation")
+endif()
+include_directories(${MW_PATH}/include)
+include_directories(${MW_PATH}/sample/common)
+include_directories(${MW_PATH}/include/isp/cv${CHIP})
+link_directories(${MW_PATH}/lib)
+link_directories(${MW_PATH}/lib/3rd)
+
+if (CHIP STREQUAL "182x")
+  add_definitions(-DCHIP_182x)
+  set(MW_LIB sample isp cvi_bin cvi_bin_isp isp_algo vpu venc vdec cvi_vcodec sys awb ae af sns_full ini cvitracer)
+else()
+  set(MW_LIB sample isp cvi_bin cvi_bin_isp vpu venc vdec cvi_vcodec sys awb ae af sns_full ini)
+endif()
+
+set(CVI_LIBS ${CVI_LIBS} cviruntime cvikernel)
+if(NOT CMAKE_CROSSCOMPILING)
+  set(CVI_LIBS ${CVI_LIBS} cvicmodel)
+endif()
+
+set(OPENCV_LIBS ${OPENCV_LIBS} opencv_core opencv_imgcodecs opencv_imgproc)
+if(NOT CMAKE_CROSSCOMPILING)
+  set(OPENCV_LIBS ${OPENCV_LIBS} opencv_highgui)
+endif()
+
+set(EXTRA_LIBS ${EXTRA_LIBS} dl stdc++ pthread z)
+
+
+add_executable(insightface_vpss
+  fd_fr_demo_mpi.cpp
+  cvi_media_sdk.cpp
+  affine_hw.cpp)
+
+target_link_libraries(insightface_vpss
+  ${CVI_LIBS}
+  ${OPENCV_LIBS}
+  ${EXTRA_LIBS}
+  ${MW_LIB})
+
+install(TARGETS insightface_vpss
+  DESTINATION samples_extra/bin)
diff --git a/cviruntime/samples/samples_extra/insightface_vpss/FacePreprocess.h b/cviruntime/samples/samples_extra/insightface_vpss/FacePreprocess.h
new file mode 100644
index 000000000..010575ecd
--- /dev/null
+++ b/cviruntime/samples/samples_extra/insightface_vpss/FacePreprocess.h
@@ -0,0 +1,151 @@
+//================================================================
+// This file from
+// https://github.com/deepinsight/insightface/tree/master/cpp-align
+//================================================================
+
+// Created by Jack Yu on 23/03/2018.
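+// Header-only variant of the Umeyama alignment also used by the mem_cb
+// sample (see face_transform.cpp there).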
+//
+
+#ifndef FACE_DEMO_FACEPREPROCESS_H
+#define FACE_DEMO_FACEPREPROCESS_H
+
+#include <opencv2/opencv.hpp>
+
+using namespace cv;
+
+namespace FacePreprocess {
+
+cv::Mat meanAxis0(const cv::Mat &src)
+{
+  int num = src.rows;
+  int dim = src.cols;
+
+  // x1 y1
+  // x2 y2
+
+  cv::Mat output(1, dim, CV_32F);
+  for (int i = 0; i < dim; i++)
+  {
+    float sum = 0;
+    for (int j = 0; j < num; j++)
+    {
+      sum += src.at<float>(j, i);
+    }
+    output.at<float>(0, i) = sum / num;
+  }
+
+  return output;
+}
+
+cv::Mat elementwiseMinus(const cv::Mat &A, const cv::Mat &B)
+{
+  cv::Mat output(A.rows, A.cols, A.type());
+
+  assert(B.cols == A.cols);
+  if (B.cols == A.cols)
+  {
+    for (int i = 0; i < A.rows; i++)
+    {
+      for (int j = 0; j < B.cols; j++)
+      {
+        output.at<float>(i, j) = A.at<float>(i, j) - B.at<float>(0, j);
+      }
+    }
+  }
+  return output;
+}
+
+
+cv::Mat varAxis0(const cv::Mat &src)
+{
+  cv::Mat temp_ = elementwiseMinus(src, meanAxis0(src));
+  cv::multiply(temp_, temp_, temp_);
+  return meanAxis0(temp_);
+}
+
+int MatrixRank(cv::Mat M)
+{
+  Mat w, u, vt;
+  SVD::compute(M, w, u, vt);
+  Mat1b nonZeroSingularValues = w > 0.0001;
+  int rank = countNonZero(nonZeroSingularValues);
+  return rank;
+}
+
+// References
+// ----------
+// .. [1] "Least-squares estimation of transformation parameters between two
+//        point patterns", Shinji Umeyama, PAMI 1991, DOI: 10.1109/34.88573
+//
+// """
+//
+// Author: Jack Yu
+cv::Mat similarTransform(cv::Mat src, cv::Mat dst) {
+  int num = src.rows;
+  int dim = src.cols;
+  cv::Mat src_mean = meanAxis0(src);
+  cv::Mat dst_mean = meanAxis0(dst);
+  cv::Mat src_demean = elementwiseMinus(src, src_mean);
+  cv::Mat dst_demean = elementwiseMinus(dst, dst_mean);
+  cv::Mat A = (dst_demean.t() * src_demean) / static_cast<float>(num);
+  cv::Mat d(dim, 1, CV_32F);
+  d.setTo(1.0f);
+  if (cv::determinant(A) < 0) {
+    d.at<float>(dim - 1, 0) = -1;
+  }
+  Mat T = cv::Mat::eye(dim + 1, dim + 1, CV_32F);
+  cv::Mat U, S, V;
+  SVD::compute(A, S, U, V);
+
+  // the SVD function in OpenCV differs from scipy.
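+  // (as noted in face_transform.cpp: cv::SVD returns V already transposed)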
+
+
+  int rank = MatrixRank(A);
+  if (rank == 0) {
+    assert(rank == 0);
+
+  } else if (rank == dim - 1) {
+    if (cv::determinant(U) * cv::determinant(V) > 0) {
+      T.rowRange(0, dim).colRange(0, dim) = U * V;
+    } else {
+      // s = d[dim - 1]
+      // d[dim - 1] = -1
+      // T[:dim, :dim] = np.dot(U, np.dot(np.diag(d), V))
+      // d[dim - 1] = s
+      int s = d.at<float>(dim - 1, 0) = -1;
+      d.at<float>(dim - 1, 0) = -1;
+
+      T.rowRange(0, dim).colRange(0, dim) = U * V;
+      cv::Mat diag_ = cv::Mat::diag(d);
+      cv::Mat twp = diag_ * V; // np.dot(np.diag(d), V.T)
+      Mat B = Mat::zeros(3, 3, CV_8UC1);
+      Mat C = B.diag(0);
+      T.rowRange(0, dim).colRange(0, dim) = U * twp;
+      d.at<float>(dim - 1, 0) = s;
+    }
+  }
+  else {
+    cv::Mat diag_ = cv::Mat::diag(d);
+    cv::Mat twp = diag_ * V.t(); // np.dot(np.diag(d), V.T)
+    cv::Mat res = U * twp; // U
+    T.rowRange(0, dim).colRange(0, dim) = -U.t() * twp;
+  }
+  cv::Mat var_ = varAxis0(src_demean);
+  float val = cv::sum(var_).val[0];
+  cv::Mat res;
+  cv::multiply(d, S, res);
+  float scale = 1.0 / val * cv::sum(res).val[0];
+  T.rowRange(0, dim).colRange(0, dim) = -T.rowRange(0, dim).colRange(0, dim).t();
+  cv::Mat temp1 = T.rowRange(0, dim).colRange(0, dim); // T[:dim, :dim]
+  cv::Mat temp2 = src_mean.t(); // src_mean.T
+  cv::Mat temp3 = temp1 * temp2; // np.dot(T[:dim, :dim], src_mean.T)
+  cv::Mat temp4 = scale * temp3;
+  T.rowRange(0, dim).colRange(dim, dim + 1) = -(temp4 - dst_mean.t());
+  T.rowRange(0, dim).colRange(0, dim) *= scale;
+  return T;
+}
+
+
+}
+#endif //FACE_DEMO_FACEPREPROCESS_H
diff --git a/cviruntime/samples/samples_extra/insightface_vpss/README.md b/cviruntime/samples/samples_extra/insightface_vpss/README.md
new file mode 100644
index 000000000..088420d7e
--- /dev/null
+++ b/cviruntime/samples/samples_extra/insightface_vpss/README.md
@@ -0,0 +1,190 @@
+# Face Detection and Recognition Sample
+
+### Download and convert the models under docker (optional)
+#### For the new toolchain
+The following package is required:
+* tpu-mlir_xxxx.tar.gz (the release package of tpu-mlir)
+
+Shell commands to transform the cvimodels:
+``` shell
+tar zxf tpu-mlir_xxxx.tar.gz
+source tpu-mlir_xxxx/envsetup.sh
+
+mkdir workspace && cd workspace
+cp $TPUC_ROOT/regression/image/parade.jpg .
+cp $TPUC_ROOT/regression/image/Aaron_Eckhart_0001.jpg .
+cp -rf $TPUC_ROOT/regression/dataset/LFW .
+cp -rf $TPUC_ROOT/regression/dataset/WIDER .
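+
+# Each model below goes through the same three steps:
+#   1) model_transform.py : import the caffe model into MLIR and verify it on a test image
+#   2) run_calibration.py : collect INT8 calibration statistics from ~100 dataset images
+#   3) model_deploy.py    : quantize for cv183x and emit the final .cvimodel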
+
+## retinaface fuse_post_process
+model_transform.py \
+--model_name mnet \
+--model_def ./mnet_600_with_detection.prototxt \
+--model_data ./mnet.caffemodel \
+--test_input ./parade.jpg \
+--test_result mnet_top_output.npz \
+--input_shapes [[1,3,600,600]] \
+--resize_dims 600,600 \
+--mean 0,0,0 \
+--scale 1,1,1 \
+--pixel_format "rgb" \
+--tolerance 0.99,0.99 \
+--excepts data \
+--mlir mnet.mlir
+
+run_calibration.py \
+mnet.mlir \
+--dataset=./WIDER \
+--input_num=100 \
+-o mnet_calibration_table
+
+model_deploy.py \
+--mlir mnet.mlir \
+--calibration_table mnet_calibration_table \
+--chip cv183x \
+--quantize INT8 \
+--quant_input \
+--customization_format RGB_PLANAR \
+--test_input ./parade.jpg \
+--test_reference mnet_top_output.npz \
+--fuse_preprocess \
+--aligned_input \
+--excepts data \
+--tolerance 0.8,0.5 \
+--model retinaface_mnet25_600_fused_preprocess_aligned_input.cvimodel
+
+## arcface
+model_transform.py \
+--model_name arcface_res50 \
+--model_def ./arcface_res50.prototxt \
+--model_data ./arcface_res50.caffemodel \
+--test_input ./Aaron_Eckhart_0001.jpg \
+--test_result arcface_res50_top_output.npz \
+--input_shapes [[1,3,112,112]] \
+--resize_dims 112,112 \
+--mean 127.5,127.5,127.5 \
+--scale 0.0078125,0.0078125,0.0078125 \
+--pixel_format "rgb" \
+--tolerance 0.99,0.99 \
+--excepts data \
+--mlir arcface_res50.mlir
+
+run_calibration.py \
+arcface_res50.mlir \
+--dataset=./LFW \
+--input_num=100 \
+-o arcface_res50_calibration_table
+
+model_deploy.py \
+--mlir arcface_res50.mlir \
+--calibration_table arcface_res50_calibration_table \
+--chip cv183x \
+--quantize INT8 \
+--quant_input \
+--customization_format RGB_PLANAR \
+--test_input ./Aaron_Eckhart_0001.jpg \
+--test_reference arcface_res50_top_output.npz \
+--fuse_preprocess \
+--aligned_input \
+--excepts data \
+--tolerance 0.9,0.6 \
+--model arcface_res50_fused_preprocess_aligned_input.cvimodel
+```
+
+
+#### For the old toolchain
+The following package is required:
+
+* cvitek_mlir_ubuntu-18.04.tar.gz
+
+Shell commands to transform the models:
+``` shell
+tar zxf cvitek_mlir_ubuntu-18.04.tar.gz
+source cvitek_mlir/cvitek_envs.sh
+
+mkdir workspace && cd workspace
+cp $MLIR_PATH/tpuc/regression/data/parade.jpg .
+cp $MLIR_PATH/tpuc/regression/data/Aaron_Eckhart_0001.jpg .
+cp -rf $MLIR_PATH/tpuc/regression/data/images .
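+
+# Same transform -> calibrate -> deploy flow, using the older cvitek_mlir option names.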
+
+## retinaface fuse_post_process
+model_transform.py \
+--model_type caffe \
+--model_name mnet \
+--model_def ./mnet_600_with_detection.prototxt \
+--model_data ./mnet.caffemodel \
+--image ./parade.jpg \
+--image_resize_dims 600,600 \
+--model_channel_order "rgb" \
+--tolerance 0.99,0.99,0.99 \
+--mlir mnet_416_fp32.mlir
+
+run_calibration.py \
+mnet_416_fp32.mlir \
+--dataset=./images \
+--input_num=100 \
+-o retinaface_mnet25_calibration_table
+
+model_deploy.py \
+--model_name mnet \
+--mlir mnet_416_fp32.mlir \
+--calibration_table retinaface_mnet25_calibration_table \
+--fuse_preprocess \
+--pixel_format RGB_PLANAR \
+--aligned_input true \
+--chip cv183x \
+--quantize INT8 \
+--image parade.jpg \
+--tolerance 0.90,0.85,0.54 \
+--correctness 0.95,0.95,0.9 \
+--cvimodel retinaface_mnet25_600_fused_preprocess_aligned_input.cvimodel
+
+## arcface
+model_transform.py \
+--model_type caffe \
+--model_name arcface_res50 \
+--model_def ./arcface_res50.prototxt \
+--model_data ./arcface_res50.caffemodel \
+--image ./Aaron_Eckhart_0001.jpg \
+--image_resize_dims 112,112 \
+--input_scale 0.0078125 \
+--mean 127.5,127.5,127.5 \
+--model_channel_order "rgb" \
+--tolerance 0.99,0.99,0.99 \
+--mlir arcface_res50_fp32.mlir
+
+run_calibration.py \
+arcface_res50_fp32.mlir \
+--dataset=./images \
+--input_num=100 \
+-o arcface_res50_calibration_table
+
+model_deploy.py \
+--model_name arcface_res50 \
+--mlir arcface_res50_fp32.mlir \
+--calibration_table arcface_res50_calibration_table \
+--fuse_preprocess \
+--pixel_format RGB_PLANAR \
+--aligned_input true \
+--chip cv183x \
+--quantize INT8 \
+--image Aaron_Eckhart_0001.jpg \
+--excepts stage1_unit1_sc_scale \
+--tolerance 0.6,0.6,0 \
+--correctness 0.95,0.95,0.9 \
+--cvimodel arcface_res50_fused_preprocess_aligned_input.cvimodel
+```
+
+Copy the generated retinaface_mnet25_600_fused_preprocess_aligned_input.cvimodel and arcface_res50_fused_preprocess_aligned_input.cvimodel to the development board.
+
+## How To Compile The VPSS Input Sample In Docker
+See the README.md in the top-level directory.
+
+## Run Samples On The EVB Board
+```
+cd install_samples/samples_extra
+./bin/insightface_vpss \
+retinaface_mnet25_600_fused_preprocess_aligned_input.cvimodel \
+arcface_res50_fused_preprocess_aligned_input.cvimodel \
+./data/obama1.jpg
+```
\ No newline at end of file
diff --git a/cviruntime/samples/samples_extra/insightface_vpss/affine_hw.cpp b/cviruntime/samples/samples_extra/insightface_vpss/affine_hw.cpp
new file mode 100644
index 000000000..44a0c8366
--- /dev/null
+++ b/cviruntime/samples/samples_extra/insightface_vpss/affine_hw.cpp
@@ -0,0 +1,164 @@
+#include "affine_hw.h"
+#include "opencv2/imgproc.hpp"
+
+#include <assert.h>
+#include <stdio.h>
+#include <vector>
+
+using namespace std;
+
+cv::Mat tformfwd(const cv::Mat &trans, const cv::Mat &uv) {
+  cv::Mat uv_h = cv::Mat::ones(uv.rows, 3, CV_64FC1);
+  uv.copyTo(uv_h(cv::Rect(0, 0, 2, uv.rows)));
+  cv::Mat xv_h = uv_h * trans;
+  return xv_h(cv::Rect(0, 0, 2, uv.rows));
+}
+
+static cv::Mat find_none_flectives_similarity(const cv::Mat &uv, const cv::Mat &xy) {
+  cv::Mat A = cv::Mat::zeros(2 * xy.rows, 4, CV_64FC1);
+  cv::Mat b = cv::Mat::zeros(2 * xy.rows, 1, CV_64FC1);
+  cv::Mat x = cv::Mat::zeros(4, 1, CV_64FC1);
+
+  xy(cv::Rect(0, 0, 1, xy.rows)).copyTo(A(cv::Rect(0, 0, 1, xy.rows))); // x
+  xy(cv::Rect(1, 0, 1, xy.rows)).copyTo(A(cv::Rect(1, 0, 1, xy.rows))); // y
+  A(cv::Rect(2, 0, 1, xy.rows)).setTo(1.);
+
+  xy(cv::Rect(1, 0, 1, xy.rows)).copyTo(A(cv::Rect(0, xy.rows, 1, xy.rows))); // y
+  (xy(cv::Rect(0, 0, 1, xy.rows))).copyTo(A(cv::Rect(1, xy.rows, 1, xy.rows))); // -x
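+  // least-squares system: for each landmark, a*x + b*y + tx = u and
+  // a*y - b*x + ty = v, i.e. a 4-DOF similarity without reflection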
+  A(cv::Rect(1, xy.rows, 1, xy.rows)) *= -1;
+  A(cv::Rect(3, xy.rows, 1, xy.rows)).setTo(1.);
+
+  uv(cv::Rect(0, 0, 1, uv.rows)).copyTo(b(cv::Rect(0, 0, 1, uv.rows)));
+  uv(cv::Rect(1, 0, 1, uv.rows)).copyTo(b(cv::Rect(0, uv.rows, 1, uv.rows)));
+
+  cv::solve(A, b, x, cv::DECOMP_SVD);
+  cv::Mat trans_inv = (cv::Mat_<double>(3, 3) << x.at<double>(0), -x.at<double>(1), 0,
+                       x.at<double>(1), x.at<double>(0), 0, x.at<double>(2), x.at<double>(3), 1);
+  cv::Mat trans = trans_inv.inv(cv::DECOMP_SVD);
+  trans.at<double>(0, 2) = 0;
+  trans.at<double>(1, 2) = 0;
+  trans.at<double>(2, 2) = 1;
+
+  return trans;
+}
+
+static cv::Mat find_similarity(const cv::Mat &uv, const cv::Mat &xy) {
+  cv::Mat trans1 = find_none_flectives_similarity(uv, xy);
+  cv::Mat xy_reflect = xy;
+  xy_reflect(cv::Rect(0, 0, 1, xy.rows)) *= -1;
+  cv::Mat trans2r = find_none_flectives_similarity(uv, xy_reflect);
+  cv::Mat reflect = (cv::Mat_<double>(3, 3) << -1, 0, 0, 0, 1, 0, 0, 0, 1);
+
+  // keep whichever candidate reprojects the landmarks with smaller error
+  cv::Mat trans2 = trans2r * reflect;
+  cv::Mat xy1 = tformfwd(trans1, uv);
+
+  double norm1 = cv::norm(xy1 - xy);
+
+  cv::Mat xy2 = tformfwd(trans2, uv);
+  double norm2 = cv::norm(xy2 - xy);
+
+  cv::Mat trans;
+  if (norm1 < norm2) {
+    trans = trans1;
+  } else {
+    trans = trans2;
+  }
+  return trans;
+}
+
+static cv::Mat get_similarity_transform(const vector<cv::Point2f> &src_pts,
+                                        const vector<cv::Point2f> &dest_pts, bool reflective) {
+  cv::Mat src((int)src_pts.size(), 2, CV_32FC1, (void *)(&src_pts[0].x));
+  src.convertTo(src, CV_64FC1);
+
+  cv::Mat dst((int)dest_pts.size(), 2, CV_32FC1, (void *)(&dest_pts[0].x));
+  dst.convertTo(dst, CV_64FC1);
+
+  cv::Mat trans = reflective ? find_similarity(src, dst) : find_none_flectives_similarity(src, dst);
+  return trans(cv::Rect(0, 0, 2, trans.rows)).t();
+}
+
+inline int getTfmFromFaceInfo(const face_rect_t &face_info, const int width, const int height,
+                              cv::Mat *tfm) {
+  assert(width == 96 || width == 112);
+  assert(height == 112);
+  if ((width != 96 && width != 112) || height != 112) {
+    return -1;
+  }
+
+  int ref_width = width;
+  int ref_height = height;
+
+  vector<cv::Point2f> detect_points;
+  for (int j = 0; j < 5; ++j) {
+    cv::Point2f e;
+    e.x = face_info.landmarks[j][0];
+    e.y = face_info.landmarks[j][1];
+    detect_points.emplace_back(e);
+  }
+
+  vector<cv::Point2f> reference_points;
+  if (96 == width) {
+    reference_points = {{30.29459953, 51.69630051},
+                        {65.53179932, 51.50139999},
+                        {48.02519989, 71.73660278},
+                        {33.54930115, 92.3655014},
+                        {62.72990036, 92.20410156}};
+  } else {
+    reference_points = {{38.29459953, 51.69630051},
+                        {73.53179932, 51.50139999},
+                        {56.02519989, 71.73660278},
+                        {41.54930115, 92.3655014},
+                        {70.72990036, 92.20410156}};
+  }
+
+  for (auto &e : reference_points) {
+    e.x += (width - ref_width) / 2.0f;
+    e.y += (height - ref_height) / 2.0f;
+  }
+  *tfm = get_similarity_transform(detect_points, reference_points, true);
+  return 0;
+}
+
+int face_align_gdc(const VIDEO_FRAME_INFO_S *inFrame, VIDEO_FRAME_INFO_S *outFrame,
+                   const face_rect_t &face_info) {
+  cv::Mat tfm;
+  if (getTfmFromFaceInfo(face_info, outFrame->stVFrame.u32Width, outFrame->stVFrame.u32Height,
+                         &tfm) != 0) {
+    return -1;
+  }
+  // invert the 2x3 similarity analytically to map output corners back into the input frame
+  double t =
+      (tfm.at<double>(0, 0) / tfm.at<double>(0, 1) - tfm.at<double>(1, 0) / tfm.at<double>(1, 1));
+  double a = 1 / tfm.at<double>(0, 1) / t;
+  double b = -1 / tfm.at<double>(1, 1) / t;
+  double c =
+      (-tfm.at<double>(0, 2) / tfm.at<double>(0, 1) + tfm.at<double>(1, 2) / tfm.at<double>(1, 1)) /
+      t;
+  vector<cv::Point2f> search_points;
+  const float sp_x = outFrame->stVFrame.u32Width - 1;
+  const float sp_y = outFrame->stVFrame.u32Height - 1;
+  search_points = {{0.0, 0.0}, {sp_x, 0.0}, {0.0, sp_y}, {sp_x, sp_y}};
+  AFFINE_ATTR_S stAffineAttr;
+  stAffineAttr.u32RegionNum = 1;
+  POINT2F_S *face_box = stAffineAttr.astRegionAttr[0];
+  int i = 0;
+  for (auto &e : search_points) {
+    face_box[i].x = e.x * a + e.y * b + c;
+    face_box[i].y =
+        (e.x - tfm.at<double>(0, 2) - face_box[i].x * tfm.at<double>(0, 0)) / tfm.at<double>(0, 1);
+    ++i;
+  }
+  stAffineAttr.stDestSize = {outFrame->stVFrame.u32Width, outFrame->stVFrame.u32Height};
+
+  GDC_HANDLE hHandle;
+  GDC_TASK_ATTR_S stTask;
+  stTask.stImgIn = *inFrame;
+  stTask.stImgOut = *outFrame;
+  CVI_GDC_BeginJob(&hHandle);
+  CVI_GDC_AddAffineTask(hHandle, &stTask, &stAffineAttr);
+  if (CVI_GDC_EndJob(hHandle) != CVI_SUCCESS) {
+    printf("Affine failed.\n");
+    return -1;
+  }
+  return 0;
+}
\ No newline at end of file
diff --git a/cviruntime/samples/samples_extra/insightface_vpss/affine_hw.h b/cviruntime/samples/samples_extra/insightface_vpss/affine_hw.h
new file mode 100644
index 000000000..5bc724162
--- /dev/null
+++ b/cviruntime/samples/samples_extra/insightface_vpss/affine_hw.h
@@ -0,0 +1,13 @@
+#ifndef _AFFINE_HW_H_
+#define _AFFINE_HW_H_
+
+#include "opencv2/opencv.hpp"
+
+#include "cvi_type.h"
+#include "cvi_sys.h"
+
+#include "type_define.h"
+
+int face_align_gdc(const VIDEO_FRAME_INFO_S *inFrame, VIDEO_FRAME_INFO_S *outFrame,
+                   const face_rect_t &face_info);
+#endif
diff --git a/cviruntime/samples/samples_extra/insightface_vpss/cvi_media_sdk.cpp b/cviruntime/samples/samples_extra/insightface_vpss/cvi_media_sdk.cpp
new file mode 100644
index 000000000..88a56b6e5
--- /dev/null
+++ b/cviruntime/samples/samples_extra/insightface_vpss/cvi_media_sdk.cpp
@@ -0,0 +1,1234 @@
+#include "cvi_media_sdk.h"
+#include <signal.h>
+///
+/// SYS
+///
+
+static int iproc_grp = 1;
+
+static void _SYS_HandleSig(int nSignal, siginfo_t *si, void *arg) {
+  SAMPLE_COMM_SYS_Exit();
+
+  exit(1);
+}
+
+static uint32_t get_frame_size(uint32_t w, uint32_t h, PIXEL_FORMAT_E fmt) {
+  // try rotate and non-rotate, choose the larger one
+  uint32_t sz_0 = COMMON_GetPicBufferSize(w, h, fmt,
+      DATA_BITWIDTH_8, COMPRESS_MODE_NONE, DEFAULT_ALIGN);
+  uint32_t sz_1 = COMMON_GetPicBufferSize(h, w, fmt,
+      DATA_BITWIDTH_8, COMPRESS_MODE_NONE, DEFAULT_ALIGN);
+  return (sz_0 > sz_1) ?
sz_0 : sz_1; +} + +static int COMM_SYS_Init(VB_CONFIG_S *pstVbConfig) +{ + CVI_S32 s32Ret = CVI_FAILURE; + + CVI_SYS_Exit(); + CVI_VB_Exit(); + + if (pstVbConfig == NULL) { + CVI_LOGE("input parameter is null, it is invaild!\n"); + return CVI_MAPI_ERR_FAILURE; + } + + s32Ret = CVI_VB_SetConfig(pstVbConfig); + if (s32Ret != CVI_SUCCESS) { + CVI_LOGE("CVI_VB_SetConf failed!\n"); + return s32Ret; + } + + s32Ret = CVI_VB_Init(); + if (s32Ret != CVI_SUCCESS) { + CVI_LOGE("CVI_VB_Init failed!\n"); + return s32Ret; + } + + s32Ret = CVI_SYS_Init(); + if (s32Ret != CVI_SUCCESS) { + CVI_LOGE("CVI_SYS_Init failed!\n"); + CVI_VB_Exit(); + return s32Ret; + } + + return CVI_MAPI_SUCCESS; +} + +static void COMM_SYS_Exit(void) +{ + //CVI_VB_ExitModCommPool(VB_UID_VDEC); + CVI_VB_Exit(); + CVI_SYS_Exit(); +} + +int CVI_MAPI_Media_Init(CVI_MAPI_MEDIA_SYS_ATTR_T *attr) { + + //struct sigaction sa; + //memset(&sa, 0, sizeof(struct sigaction)); + //sigemptyset(&sa.sa_mask); + //sa.sa_sigaction = _SYS_HandleSig; + //sa.sa_flags = SA_SIGINFO|SA_RESETHAND; // Reset signal handler to system default after signal triggered + //sigaction(SIGINT, &sa, NULL); + //sigaction(SIGTERM, &sa, NULL); + + VB_CONFIG_S stVbConf; + memset(&stVbConf, 0, sizeof(VB_CONFIG_S)); + + for (unsigned i = 0; i < attr->vb_pool_num; i++) { + uint32_t blk_size; + if (attr->vb_pool[i].is_frame) { + blk_size = get_frame_size( + attr->vb_pool[i].vb_blk_size.frame.width, + attr->vb_pool[i].vb_blk_size.frame.height, + attr->vb_pool[i].vb_blk_size.frame.fmt); + } else { + blk_size = attr->vb_pool[i].vb_blk_size.size; + } + uint32_t blk_num = attr->vb_pool[i].vb_blk_num; + + stVbConf.astCommPool[i].u32BlkSize = blk_size; + stVbConf.astCommPool[i].u32BlkCnt = blk_num; + stVbConf.astCommPool[i].enRemapMode = VB_REMAP_MODE_CACHED; + CVI_LOGI("VB pool[%d] BlkSize %d BlkCnt %d\n", i, blk_size, blk_num); + } + stVbConf.u32MaxPoolCnt = attr->vb_pool_num; + + int ret = CVI_MAPI_SUCCESS; + CVI_S32 rc = COMM_SYS_Init(&stVbConf); + if (rc != CVI_MAPI_SUCCESS) { + CVI_LOGE("COMM_SYS_Init fail, rc = %#x\n", rc); + ret = CVI_MAPI_ERR_FAILURE; + goto error; + } + + rc = CVI_SYS_SetVIVPSSMode(&attr->stVIVPSSMode); + if (rc != CVI_SUCCESS) { + CVI_LOGE("CVI_SYS_SetVIVPSSMode failed with %#x\n", rc); + ret = CVI_MAPI_ERR_FAILURE; + goto error; + } + + rc = CVI_SYS_SetVPSSModeEx(&attr->stVPSSMode); + if (rc != CVI_SUCCESS) { + CVI_LOGE("CVI_SYS_SetVPSSModeEx failed with %#x\n", rc); + ret = CVI_MAPI_ERR_FAILURE; + goto error; + } + + return ret; + +error: + COMM_SYS_Exit(); + return ret; +} + +int CVI_MAPI_Media_Deinit(void) { + COMM_SYS_Exit(); + return CVI_MAPI_SUCCESS; +} + +// +// VB Frame helper functions +// +int CVI_MAPI_ReleaseFrame(VIDEO_FRAME_INFO_S *frm) { + VB_BLK blk = CVI_VB_PhysAddr2Handle(frm->stVFrame.u64PhyAddr[0]); + CVI_VB_ReleaseBlock(blk); + return CVI_MAPI_SUCCESS; +} + +int CVI_MAPI_GetFrameFromMemory_YUV(VIDEO_FRAME_INFO_S *frm, + uint32_t width, uint32_t height, PIXEL_FORMAT_E fmt, void *data) { + CVI_LOG_ASSERT(fmt == PIXEL_FORMAT_YUV_PLANAR_420, + "Not support fmt %d yet\n", fmt); + CVI_LOG_ASSERT(width % 2 == 0, "width not align\n"); + CVI_LOG_ASSERT(height % 2 == 0, "height not align\n"); + + VB_BLK blk; + VB_CAL_CONFIG_S stVbCalConfig; + + COMMON_GetPicBufferConfig(width, height, fmt, DATA_BITWIDTH_8, + COMPRESS_MODE_NONE, DEFAULT_ALIGN, &stVbCalConfig); + + frm->stVFrame.enCompressMode = COMPRESS_MODE_NONE; + frm->stVFrame.enPixelFormat = fmt; + frm->stVFrame.enVideoFormat = VIDEO_FORMAT_LINEAR; + frm->stVFrame.enColorGamut = 
COLOR_GAMUT_BT709; + frm->stVFrame.u32Width = width; + frm->stVFrame.u32Height = height; + frm->stVFrame.u32Stride[0] = stVbCalConfig.u32MainStride; + frm->stVFrame.u32Stride[1] = stVbCalConfig.u32CStride; + frm->stVFrame.u32Stride[2] = stVbCalConfig.u32CStride; + frm->stVFrame.u32TimeRef = 0; + frm->stVFrame.u64PTS = 0; + frm->stVFrame.enDynamicRange = DYNAMIC_RANGE_SDR8; + + blk = CVI_VB_GetBlock(VB_INVALID_POOLID, stVbCalConfig.u32VBSize); + if (blk == VB_INVALID_HANDLE) { + SAMPLE_PRT("Can't acquire vb block\n"); + return CVI_FAILURE; + } + + frm->u32PoolId = CVI_VB_Handle2PoolId(blk); + frm->stVFrame.u32Length[0] = ALIGN(stVbCalConfig.u32MainYSize, + stVbCalConfig.u16AddrAlign); + frm->stVFrame.u32Length[1] = frm->stVFrame.u32Length[2] = ALIGN(stVbCalConfig.u32MainCSize, + stVbCalConfig.u16AddrAlign); + + frm->stVFrame.u64PhyAddr[0] = CVI_VB_Handle2PhysAddr(blk); + frm->stVFrame.u64PhyAddr[1] = frm->stVFrame.u64PhyAddr[0] + frm->stVFrame.u32Length[0]; + frm->stVFrame.u64PhyAddr[2] = frm->stVFrame.u64PhyAddr[1] + frm->stVFrame.u32Length[1]; + uint32_t image_size = frm->stVFrame.u32Length[0] + frm->stVFrame.u32Length[1] + frm->stVFrame.u32Length[2]; + frm->stVFrame.pu8VirAddr[0] = (uint8_t *)CVI_SYS_MmapCache( + frm->stVFrame.u64PhyAddr[0], image_size); + frm->stVFrame.pu8VirAddr[1] = frm->stVFrame.pu8VirAddr[0] + frm->stVFrame.u32Length[0]; + frm->stVFrame.pu8VirAddr[2] = frm->stVFrame.pu8VirAddr[1] + frm->stVFrame.u32Length[1]; + + uint8_t *data_ptr = (uint8_t *)data; + for (int i = 0; i < 3; ++i) { + if (frm->stVFrame.u32Length[i] == 0) + continue; + + uint32_t height_step = (i == 0) ? frm->stVFrame.u32Height : frm->stVFrame.u32Height / 2; + uint32_t width_step = (i == 0) ? frm->stVFrame.u32Width : frm->stVFrame.u32Width / 2; + uint8_t *frm_ptr = frm->stVFrame.pu8VirAddr[i]; + for (uint32_t j = 0; j < height_step; ++j) { + memcpy(frm_ptr, data_ptr, width_step); + frm_ptr += frm->stVFrame.u32Stride[i]; + data_ptr += width_step; + } + CVI_SYS_IonFlushCache(frm->stVFrame.u64PhyAddr[i], + frm->stVFrame.pu8VirAddr[i], + frm->stVFrame.u32Length[i]); + } + CVI_SYS_Munmap(frm->stVFrame.pu8VirAddr[0], image_size); + for (int i = 0; i < 3; ++i) { + frm->stVFrame.pu8VirAddr[i] = NULL; + } + + return CVI_MAPI_SUCCESS; +} + +static uint32_t getFrameSize_YUV(uint32_t w, uint32_t h, PIXEL_FORMAT_E fmt) { + if (fmt == PIXEL_FORMAT_YUV_PLANAR_420) { + return (w * h * 3) >> 1; + } else if (fmt == PIXEL_FORMAT_YUV_PLANAR_422) { + return (w * h * 2); + } else { + CVI_LOG_ASSERT(0, "Unsupported fmt %d\n", fmt); + } + return 0; +} + +int CVI_MAPI_GetFrameFromFile_YUV(VIDEO_FRAME_INFO_S *frame, + uint32_t width, uint32_t height, PIXEL_FORMAT_E fmt, + const char *filaneme, uint32_t frame_no) { + FILE *fp = fopen(filaneme, "rb"); + if (fp == NULL) { + CVI_LOGE("Input file %s open failed !\n", filaneme); + return CVI_MAPI_ERR_FAILURE; + } + + uint32_t frame_size = getFrameSize_YUV(width, height, fmt); + char *data = (char *)malloc(frame_size); + if (!data) { + CVI_LOGE("malloc frame buffer failed\n"); + fclose(fp); + return CVI_MAPI_ERR_FAILURE; + } + + fseek(fp, 0, SEEK_END); + unsigned int file_size = ftell(fp); + unsigned int num_frames = file_size / frame_size; + if (num_frames < (frame_no + 1)) { + CVI_LOGE("file %s size %d to small, frame_size %d, no. 
%d\n", + filaneme, file_size, frame_size, frame_no); + free(data); + fclose(fp); + return CVI_MAPI_ERR_FAILURE; + } + rewind(fp); + + fseek(fp, frame_size * frame_no, SEEK_SET); + fread((void *)data, 1, frame_size, fp); + int ret = CVI_MAPI_GetFrameFromMemory_YUV(frame, + width, height, fmt, data); + + free(data); + fclose(fp); + return ret; +} + +/// utils function +int CVI_MAPI_SaveFramePixelData(VIDEO_FRAME_INFO_S *frm, const char *name) { +#define FILENAME_MAX_LEN (128) + char filename[FILENAME_MAX_LEN] = {0}; + const char *extension = NULL; + if (frm->stVFrame.enPixelFormat == PIXEL_FORMAT_YUV_PLANAR_420) { + extension = "yuv"; + } else if (frm->stVFrame.enPixelFormat == PIXEL_FORMAT_NV12) { + extension = "nv12"; + } else if (frm->stVFrame.enPixelFormat == PIXEL_FORMAT_NV21) { + extension = "nv21"; + } else if (frm->stVFrame.enPixelFormat == PIXEL_FORMAT_RGB_888_PLANAR || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_BGR_888_PLANAR) { + extension = "chw"; + } else if (frm->stVFrame.enPixelFormat == PIXEL_FORMAT_RGB_888) { + extension = "rgb"; + } else if (frm->stVFrame.enPixelFormat == PIXEL_FORMAT_BGR_888) { + extension = "bgr"; + } else { + CVI_LOG_ASSERT(0, "Invalid frm pixel format %d\n", + frm->stVFrame.enPixelFormat); + } + snprintf(filename, FILENAME_MAX_LEN, "%s_%dX%d.%s", name, + frm->stVFrame.u32Width, + frm->stVFrame.u32Height, + extension); + + FILE *output; + output = fopen(filename, "wb"); + CVI_LOG_ASSERT(output, "file open failed\n"); + + CVI_LOGI("Save %s, w*h(%d*%d)\n", + filename, + frm->stVFrame.u32Width, + frm->stVFrame.u32Height); + + //CVI_LOG_ASSERT((frm->stVFrame.pu8VirAddr[0] == NULL), "frame VirAddr failed\n"); + if (CVI_MAPI_FrameMmap(frm, true) != CVI_MAPI_SUCCESS) { + CVI_LOGE("CVI_MAPI_FrameMmap failed\n"); + return CVI_MAPI_ERR_FAILURE; + } + + if (frm->stVFrame.enPixelFormat == PIXEL_FORMAT_YUV_PLANAR_420) { + for (int i = 0; i < 3; ++i) { + CVI_LOGV(" plane(%d): paddr(0x%" PRIx64 ") vaddr(%p) stride(%d) length(%d)\n", + i, + frm->stVFrame.u64PhyAddr[i], + frm->stVFrame.pu8VirAddr[i], + frm->stVFrame.u32Stride[i], + frm->stVFrame.u32Length[i]); + //TODO: test unaligned image + uint32_t length = (i == 0 ? frm->stVFrame.u32Height : frm->stVFrame.u32Height / 2); + uint32_t step = (i == 0 ? frm->stVFrame.u32Width : frm->stVFrame.u32Width / 2); + uint8_t *ptr = (uint8_t *)frm->stVFrame.pu8VirAddr[i]; + for (unsigned j = 0; j < length; ++j) { + fwrite(ptr, step, 1, output); + ptr += frm->stVFrame.u32Stride[i]; + } + } + } else if (frm->stVFrame.enPixelFormat == PIXEL_FORMAT_NV12 || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_NV21) { + for (int i = 0; i < 2; ++i) { + CVI_LOGV(" plane(%d): paddr(0x%" PRIx64 ") vaddr(%p) stride(%d) length(%d)\n", + i, + frm->stVFrame.u64PhyAddr[i], + frm->stVFrame.pu8VirAddr[i], + frm->stVFrame.u32Stride[i], + frm->stVFrame.u32Length[i]); + //TODO: test unaligned image + uint32_t length = (i == 0 ? 
frm->stVFrame.u32Height : frm->stVFrame.u32Height / 2); + uint32_t step = frm->stVFrame.u32Width; + uint8_t *ptr = (uint8_t *)frm->stVFrame.pu8VirAddr[i]; + for (unsigned j = 0; j < length; ++j) { + fwrite(ptr, step, 1, output); + ptr += frm->stVFrame.u32Stride[i]; + } + } + } else if (frm->stVFrame.enPixelFormat == PIXEL_FORMAT_RGB_888_PLANAR || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_BGR_888_PLANAR) { + for (int i = 0; i < 3; i++) { + CVI_LOGV(" plane(%d): paddr(0x%" PRIx64 ") vaddr(%p) stride(%d) length(%d)\n", + i, + frm->stVFrame.u64PhyAddr[i], + frm->stVFrame.pu8VirAddr[i], + frm->stVFrame.u32Stride[i], + frm->stVFrame.u32Length[i]); + uint8_t *ptr = frm->stVFrame.pu8VirAddr[i]; + for (unsigned j = 0; j < frm->stVFrame.u32Height; ++j) { + fwrite(ptr, frm->stVFrame.u32Width, 1, output); + ptr += frm->stVFrame.u32Stride[i]; + } + } + } else if (frm->stVFrame.enPixelFormat == PIXEL_FORMAT_RGB_888 || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_BGR_888) { + CVI_LOGV(" packed: paddr(0x%" PRIx64 ") vaddr(%p) stride(%d) length(%d)\n", + frm->stVFrame.u64PhyAddr[0], + frm->stVFrame.pu8VirAddr[0], + frm->stVFrame.u32Stride[0], + frm->stVFrame.u32Length[0]); + uint8_t *ptr = frm->stVFrame.pu8VirAddr[0]; + for (unsigned j = 0; j < frm->stVFrame.u32Height; ++j) { + fwrite(ptr, frm->stVFrame.u32Width * 3, 1, output); + ptr += frm->stVFrame.u32Stride[0]; + } + } else { + CVI_LOG_ASSERT(0, "Invalid frm pixel format %d\n", + frm->stVFrame.enPixelFormat); + } + + CVI_MAPI_FrameMunmap(frm); + + fclose(output); + return CVI_MAPI_SUCCESS; +} + +int CVI_MAPI_AllocateFrame(VIDEO_FRAME_INFO_S *frm, + uint32_t width, uint32_t height, PIXEL_FORMAT_E fmt) { + VB_BLK blk; + VB_CAL_CONFIG_S stVbCalConfig; + COMMON_GetPicBufferConfig(width, height, fmt, DATA_BITWIDTH_8, + COMPRESS_MODE_NONE, DEFAULT_ALIGN, &stVbCalConfig); + + memset(frm, 0x00, sizeof(*frm)); + frm->stVFrame.enCompressMode = COMPRESS_MODE_NONE; + frm->stVFrame.enPixelFormat = fmt; + frm->stVFrame.enVideoFormat = VIDEO_FORMAT_LINEAR; + frm->stVFrame.enColorGamut = COLOR_GAMUT_BT709; + frm->stVFrame.u32Width = width; + frm->stVFrame.u32Height = height; + frm->stVFrame.u32Stride[0] = stVbCalConfig.u32MainStride; + frm->stVFrame.u32Stride[1] = stVbCalConfig.u32CStride; + frm->stVFrame.u32Stride[2] = stVbCalConfig.u32CStride; + frm->stVFrame.u32TimeRef = 0; + frm->stVFrame.u64PTS = 0; + frm->stVFrame.enDynamicRange = DYNAMIC_RANGE_SDR8; + + CVI_LOGV("Allocate VB block with size %d\n", stVbCalConfig.u32VBSize); + + blk = CVI_VB_GetBlock(VB_INVALID_POOLID, stVbCalConfig.u32VBSize); + if (blk == (unsigned long)CVI_INVALID_HANDLE) { + SAMPLE_PRT("Can't acquire cv block\n"); + return CVI_FAILURE; + } + + frm->u32PoolId = CVI_VB_Handle2PoolId(blk); + frm->stVFrame.u32Length[0] = ALIGN(stVbCalConfig.u32MainYSize, + stVbCalConfig.u16AddrAlign); + frm->stVFrame.u32Length[1] = frm->stVFrame.u32Length[2] = ALIGN(stVbCalConfig.u32MainCSize, + stVbCalConfig.u16AddrAlign); + + frm->stVFrame.u64PhyAddr[0] = CVI_VB_Handle2PhysAddr(blk); + frm->stVFrame.u64PhyAddr[1] = frm->stVFrame.u64PhyAddr[0] + frm->stVFrame.u32Length[0]; + frm->stVFrame.u64PhyAddr[2] = frm->stVFrame.u64PhyAddr[1] + frm->stVFrame.u32Length[1]; + + return CVI_MAPI_SUCCESS; +} + +static void get_frame_plane_num_and_mem_size(VIDEO_FRAME_INFO_S *frm, + int *plane_num, size_t *mem_size) { + if (frm->stVFrame.enPixelFormat == PIXEL_FORMAT_RGB_888_PLANAR || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_BGR_888_PLANAR || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_YUV_PLANAR_422 || 
frm->stVFrame.enPixelFormat == PIXEL_FORMAT_YUV_PLANAR_420 || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_YUV_PLANAR_444 || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_HSV_888_PLANAR || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_FP32_C3_PLANAR || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_INT32_C3_PLANAR || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_UINT32_C3_PLANAR || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_BF16_C3_PLANAR || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_INT16_C3_PLANAR || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_UINT16_C3_PLANAR || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_INT8_C3_PLANAR || frm->stVFrame.enPixelFormat == PIXEL_FORMAT_UINT8_C3_PLANAR) { + *plane_num = 3; + // check phyaddr + CVI_LOG_ASSERT(frm->stVFrame.u64PhyAddr[1] - frm->stVFrame.u64PhyAddr[0] == frm->stVFrame.u32Length[0], + "phy addr not continue 0, fmt = %d\n", + frm->stVFrame.enPixelFormat); + CVI_LOG_ASSERT(frm->stVFrame.u64PhyAddr[2] - frm->stVFrame.u64PhyAddr[1] == frm->stVFrame.u32Length[1], + "phy addr not continue 1, fmt = %d\n", + frm->stVFrame.enPixelFormat); + } else if (frm->stVFrame.enPixelFormat == PIXEL_FORMAT_NV12 || + frm->stVFrame.enPixelFormat == PIXEL_FORMAT_NV21) { + *plane_num = 2; + CVI_LOG_ASSERT(frm->stVFrame.u64PhyAddr[1] - frm->stVFrame.u64PhyAddr[0] == frm->stVFrame.u32Length[0], + "phy addr not continue 0, fmt = %d\n", + frm->stVFrame.enPixelFormat); + } else { + *plane_num = 1; + } + + *mem_size = 0; + for (int i = 0; i < *plane_num; ++i) { + *mem_size += frm->stVFrame.u32Length[i]; + } +} + +int CVI_MAPI_FrameMmap(VIDEO_FRAME_INFO_S *frm, bool enable_cache) { + int plane_num = 0; + size_t mem_size = 0; + get_frame_plane_num_and_mem_size(frm, &plane_num, &mem_size); + + void *vir_addr = NULL; + if (enable_cache) { + vir_addr = CVI_SYS_MmapCache(frm->stVFrame.u64PhyAddr[0], mem_size); + } else { + vir_addr = CVI_SYS_Mmap(frm->stVFrame.u64PhyAddr[0], mem_size); + } + CVI_LOG_ASSERT(vir_addr, "mmap failed\n"); + + //CVI_SYS_IonInvalidateCache(frm->stVFrame.u64PhyAddr[0], vir_addr, mem_size); + uint64_t plane_offset = 0; + for (int i = 0; i < plane_num; ++i) { + frm->stVFrame.pu8VirAddr[i] = (uint8_t *)vir_addr + plane_offset; + plane_offset += frm->stVFrame.u32Length[i]; + } + + return CVI_MAPI_SUCCESS; +} + +int CVI_MAPI_FrameMunmap(VIDEO_FRAME_INFO_S *frm) { + int plane_num = 0; + size_t mem_size = 0; + get_frame_plane_num_and_mem_size(frm, &plane_num, &mem_size); + + void *vir_addr = (void *)frm->stVFrame.pu8VirAddr[0]; + CVI_SYS_Munmap(vir_addr, mem_size); + + for (int i = 0; i < plane_num; ++i) { + frm->stVFrame.pu8VirAddr[i] = NULL; + } + + return CVI_MAPI_SUCCESS; +} + +int CVI_MAPI_FrameFlushCache(VIDEO_FRAME_INFO_S *frm) { + int plane_num = 0; + size_t mem_size = 0; + get_frame_plane_num_and_mem_size(frm, &plane_num, &mem_size); + + void *vir_addr = (void *)frm->stVFrame.pu8VirAddr[0]; + uint64_t phy_addr = frm->stVFrame.u64PhyAddr[0]; + + CVI_SYS_IonFlushCache(phy_addr, vir_addr, mem_size); + return CVI_MAPI_SUCCESS; +} + +int CVI_MAPI_FrameInvalidateCache(VIDEO_FRAME_INFO_S *frm) { + int plane_num = 0; + size_t mem_size = 0; + get_frame_plane_num_and_mem_size(frm, &plane_num, &mem_size); + + void *vir_addr = (void *)frm->stVFrame.pu8VirAddr[0]; + uint64_t phy_addr = frm->stVFrame.u64PhyAddr[0]; + + CVI_SYS_IonInvalidateCache(phy_addr, vir_addr, mem_size); + + return CVI_MAPI_SUCCESS; +} + + +/// +/// Preprocess +/// + +void CVI_MAPI_PREPROCESS_ENABLE(VPSS_CHN_ATTR_S *attr_chn, + CVI_MAPI_PREPROCESS_ATTR_T *preprocess) { + attr_chn->stNormalize.bEnable = 
CVI_TRUE; + if (preprocess->is_rgb) { + CVI_LOG_ASSERT(attr_chn->enPixelFormat == PIXEL_FORMAT_RGB_888_PLANAR, + "Preprocess (RGB) enabled, fmt_out needs to be " + "PIXEL_FORMAT_RGB_888_PLANAR\n"); + } else { + CVI_LOG_ASSERT(attr_chn->enPixelFormat == PIXEL_FORMAT_BGR_888_PLANAR, + "Preprocess (BGR) enabled, fmt_out needs to be " + "PIXEL_FORMAT_BGR_888_PLANAR\n"); + } + + float quant_scale = preprocess->qscale; + float factor[3]; + float mean[3]; + for (int i = 0; i < 3; i++) { + factor[i] = preprocess->raw_scale / 255.0f; + factor[i] *= preprocess->input_scale[i] * quant_scale; + if (factor[i] < 1.0f / 8192) { + factor[i] = 1.0f / 8192; + } + if (factor[i] > 8191.0f / 8192) { + factor[i] = 8191.0f / 8192; + } + + mean[i] = preprocess->mean[i]; + mean[i] *= preprocess->input_scale[i] * quant_scale; + } + // mean and factor are supposed to be in BGR, swap R&B if RGB + if (preprocess->is_rgb) { + float tmp; + tmp = factor[0]; + factor[0] = factor[2]; + factor[2] = tmp; + tmp = mean[0]; + mean[0] = mean[2]; + mean[2] = tmp; + } + for (int i = 0; i < 3; i++) { + attr_chn->stNormalize.factor[i] = factor[i]; + attr_chn->stNormalize.mean[i] = mean[i]; + } + attr_chn->stNormalize.rounding = VPSS_ROUNDING_TO_EVEN; +} + +/// +/// VPROC +/// + +CVI_MAPI_VPROC_ATTR_T CVI_MAPI_VPROC_DefaultAttr_OneChn( + uint32_t width_in, + uint32_t height_in, + PIXEL_FORMAT_E pixel_format_in, + uint32_t width_out, + uint32_t height_out, + PIXEL_FORMAT_E pixel_format_out) { + CVI_MAPI_VPROC_ATTR_T attr; + memset((void *)&attr, 0, sizeof(attr)); + + attr.attr_inp.stFrameRate.s32SrcFrameRate = -1; + attr.attr_inp.stFrameRate.s32DstFrameRate = -1; + attr.attr_inp.enPixelFormat = pixel_format_in; + attr.attr_inp.u32MaxW = width_in; + attr.attr_inp.u32MaxH = height_in; + attr.attr_inp.u8VpssDev = 0; + + attr.chn_num = 1; + + attr.attr_chn[0].u32Width = width_out; + attr.attr_chn[0].u32Height = height_out; + attr.attr_chn[0].enVideoFormat = VIDEO_FORMAT_LINEAR; + attr.attr_chn[0].enPixelFormat = pixel_format_out; + attr.attr_chn[0].stFrameRate.s32SrcFrameRate = -1; + attr.attr_chn[0].stFrameRate.s32DstFrameRate = -1; + attr.attr_chn[0].u32Depth = 1; // output buffer queue size + attr.attr_chn[0].bMirror = CVI_FALSE; + attr.attr_chn[0].bFlip = CVI_FALSE; + attr.attr_chn[0].stAspectRatio.enMode = ASPECT_RATIO_NONE; + attr.attr_chn[0].stNormalize.bEnable = CVI_FALSE; + + return attr; +} + +CVI_MAPI_VPROC_ATTR_T CVI_MAPI_VPROC_DefaultAttr_TwoChn( + uint32_t width_in, + uint32_t height_in, + PIXEL_FORMAT_E pixel_format_in, + uint32_t width_out0, + uint32_t height_out0, + PIXEL_FORMAT_E pixel_format_out0, + uint32_t width_out1, + uint32_t height_out1, + PIXEL_FORMAT_E pixel_format_out1) { + CVI_MAPI_VPROC_ATTR_T attr; + memset((void *)&attr, 0, sizeof(attr)); + + attr.attr_inp.stFrameRate.s32SrcFrameRate = -1; + attr.attr_inp.stFrameRate.s32DstFrameRate = -1; + attr.attr_inp.enPixelFormat = pixel_format_in; + attr.attr_inp.u32MaxW = width_in; + attr.attr_inp.u32MaxH = height_in; + attr.attr_inp.u8VpssDev = 0; + + attr.chn_num = 2; + + attr.attr_chn[0].u32Width = width_out0; + attr.attr_chn[0].u32Height = height_out0; + attr.attr_chn[0].enVideoFormat = VIDEO_FORMAT_LINEAR; + attr.attr_chn[0].enPixelFormat = pixel_format_out0; + attr.attr_chn[0].stFrameRate.s32SrcFrameRate = -1; + attr.attr_chn[0].stFrameRate.s32DstFrameRate = -1; + attr.attr_chn[0].u32Depth = 1; // output buffer queue size + attr.attr_chn[0].bMirror = CVI_FALSE; + attr.attr_chn[0].bFlip = CVI_FALSE; + attr.attr_chn[0].stAspectRatio.enMode = ASPECT_RATIO_NONE; + 
attr.attr_chn[0].stNormalize.bEnable = CVI_FALSE; + + attr.attr_chn[1].u32Width = width_out1; + attr.attr_chn[1].u32Height = height_out1; + attr.attr_chn[1].enVideoFormat = VIDEO_FORMAT_LINEAR; + attr.attr_chn[1].enPixelFormat = pixel_format_out1; + attr.attr_chn[1].stFrameRate.s32SrcFrameRate = -1; + attr.attr_chn[1].stFrameRate.s32DstFrameRate = -1; + attr.attr_chn[1].u32Depth = 1; // output buffer queue size + attr.attr_chn[1].bMirror = CVI_FALSE; + attr.attr_chn[1].bFlip = CVI_FALSE; + attr.attr_chn[1].stAspectRatio.enMode = ASPECT_RATIO_NONE; + attr.attr_chn[1].stNormalize.bEnable = CVI_FALSE; + + return attr; +} + +static pthread_mutex_t vproc_mutex = PTHREAD_MUTEX_INITIALIZER; +static bool g_grp_used[MAX_VPSS_GRP_NUM] = {0}; +static bool g_vproc_initialized = false; + +CVI_S32 Vproc_Init(VPSS_GRP VpssGrp, CVI_BOOL *pabChnEnable, VPSS_GRP_ATTR_S *pstVpssGrpAttr, + VPSS_CHN_ATTR_S *pastVpssChnAttr) +{ + VPSS_CHN VpssChn = 0; + CVI_S32 s32Ret; + + s32Ret = CVI_VPSS_CreateGrp(VpssGrp, pstVpssGrpAttr); + if (s32Ret != CVI_SUCCESS) { + CVI_LOGE("CVI_VPSS_CreateGrp(grp:%d) failed with %#x!\n", VpssGrp, s32Ret); + return s32Ret; + } + + s32Ret = CVI_VPSS_ResetGrp(VpssGrp); + if (s32Ret != CVI_SUCCESS) { + CVI_LOGE("CVI_VPSS_ResetGrp(grp:%d) failed with %#x!\n", VpssGrp, s32Ret); + goto exit1; + } + + for (unsigned j = 0; j < VPSS_MAX_PHY_CHN_NUM; j++) { + if (pabChnEnable[j]) { + VpssChn = j; + s32Ret = CVI_VPSS_SetChnAttr(VpssGrp, VpssChn, &pastVpssChnAttr[VpssChn]); + if (s32Ret != CVI_SUCCESS) { + CVI_LOGE("CVI_VPSS_SetChnAttr failed with %#x\n", s32Ret); + goto exit2; + } + + s32Ret = CVI_VPSS_EnableChn(VpssGrp, VpssChn); + if (s32Ret != CVI_SUCCESS) { + CVI_LOGE("CVI_VPSS_EnableChn failed with %#x\n", s32Ret); + goto exit2; + } + } + } + + s32Ret = CVI_VPSS_StartGrp(VpssGrp); + if (s32Ret != CVI_SUCCESS) { + CVI_LOGE("CVI_VPSS_StartGrp failed with %#x\n", s32Ret); + goto exit2; + } + return CVI_SUCCESS; + +exit2: + for(signed j = 0; j < VpssChn; j++){ + if (CVI_VPSS_DisableChn(VpssGrp, j) != CVI_SUCCESS) { + CVI_LOGE("CVI_VPSS_DisableChn failed!\n"); + } + } +exit1: + if (CVI_VPSS_DestroyGrp(VpssGrp) != CVI_SUCCESS) { + CVI_LOGE("CVI_VPSS_DestroyGrp(grp:%d) failed!\n", VpssGrp); + } + + return s32Ret; +} + +CVI_VOID Vproc_Deinit(VPSS_GRP VpssGrp, CVI_BOOL *pabChnEnable) +{ + CVI_S32 j; + CVI_S32 s32Ret = CVI_SUCCESS; + VPSS_CHN VpssChn; + + for (j = 0; j < VPSS_MAX_PHY_CHN_NUM; j++) { + if (pabChnEnable[j]) { + VpssChn = j; + s32Ret = CVI_VPSS_DisableChn(VpssGrp, VpssChn); + if (s32Ret != CVI_SUCCESS) { + CVI_LOGE("failed with %#x!\n", s32Ret); + } + } + } + s32Ret = CVI_VPSS_StopGrp(VpssGrp); + if (s32Ret != CVI_SUCCESS) { + CVI_LOGE("failed with %#x!\n", s32Ret); + } + s32Ret = CVI_VPSS_DestroyGrp(VpssGrp); + if (s32Ret != CVI_SUCCESS) { + CVI_LOGE("failed with %#x!\n", s32Ret); + } +} + +int find_valid_vpss_grp(bool ExtChnEn) +{ + int i = 0; + int grp_id = -1; + + pthread_mutex_lock(&vproc_mutex); + // find a valid grp slot + VI_VPSS_MODE_S stVIVPSSMode; + CVI_SYS_GetVIVPSSMode(&stVIVPSSMode); + if ((stVIVPSSMode.aenMode[0] == VI_OFFLINE_VPSS_ONLINE) || + (stVIVPSSMode.aenMode[0] == VI_ONLINE_VPSS_ONLINE) || + (ExtChnEn == true)) + i = 2; + + for (; i < MAX_VPSS_GRP_NUM; i++) { + if (!g_grp_used[i]) { + break; + } + } + if (i >= MAX_VPSS_GRP_NUM) { + CVI_LOGE("no empty grp_id left\n"); + pthread_mutex_unlock(&vproc_mutex); + return grp_id; + } else { + grp_id = i; + } + + // g_grp_used[grp_id] = true; + pthread_mutex_unlock(&vproc_mutex); + + return grp_id; +} + +int 
CVI_MAPI_VPROC_Init(CVI_MAPI_VPROC_HANDLE_T *vproc_hdl, + int grp_id, CVI_MAPI_VPROC_ATTR_T *attr) +{ + VPROC_CHECK_NULL_PTR(attr); + VPROC_CHECK_NULL_PTR(vproc_hdl); + + if(attr->chn_num > CVI_MAPI_VPROC_MAX_CHN_NUM){ + CVI_LOGE("attr->chn_num = %d, Exceed the maximum %d\n", attr->chn_num, CVI_MAPI_VPROC_MAX_CHN_NUM); + return CVI_MAPI_ERR_INVALID; + } + + if (grp_id == -1) + grp_id = find_valid_vpss_grp(false); + + if (grp_id >= 0 && grp_id < MAX_VPSS_GRP_NUM) { + pthread_mutex_lock(&vproc_mutex); + if (g_grp_used[grp_id]) { + CVI_LOGE("grp_id %d has been used\n", grp_id); + pthread_mutex_unlock(&vproc_mutex); + return CVI_MAPI_ERR_INVALID; + } + g_grp_used[grp_id] = true; + pthread_mutex_unlock(&vproc_mutex); + } else { + CVI_LOGE("Invalid grp_id %d\n", grp_id); + return CVI_MAPI_ERR_INVALID; + } + + CVI_LOGI("Create VPROC with vpss grp id %d chn_num:%d\n", grp_id,attr->chn_num); + + VPSS_GRP VpssGrp = grp_id; + CVI_BOOL abChnEnable[VPSS_MAX_PHY_CHN_NUM] = {0}; + for (int i = 0; i < attr->chn_num; i++) { + abChnEnable[i] = CVI_TRUE; + } + + CVI_S32 rc = CVI_MAPI_SUCCESS; + /*start vpss*/ + rc = Vproc_Init(VpssGrp, abChnEnable, &attr->attr_inp, attr->attr_chn); + if (rc != CVI_SUCCESS) { + CVI_LOGE("Vproc_Init failed. rc: 0x%x !\n", rc); + rc = CVI_MAPI_ERR_FAILURE; + goto err1; + } + + CVI_MAPI_VPROC_CTX_T *pt; + pt = (CVI_MAPI_VPROC_CTX_T *)malloc(sizeof(CVI_MAPI_VPROC_CTX_T)); + if (!pt) { + CVI_LOGE("malloc failed\n"); + rc = CVI_MAPI_ERR_NOMEM; + goto err2; + } + memset(pt, 0, sizeof(CVI_MAPI_VPROC_CTX_T)); + pt->VpssGrp = VpssGrp; + for (int i = 0; i < VPSS_MAX_PHY_CHN_NUM; i++) { + pt->abChnEnable[i] = abChnEnable[i]; + } + pt->attr = *attr; + + *vproc_hdl = (CVI_MAPI_VPROC_HANDLE_T)pt; + return CVI_MAPI_SUCCESS; + +err2: + Vproc_Deinit(VpssGrp, abChnEnable); + +err1: + pthread_mutex_lock(&vproc_mutex); + g_grp_used[grp_id] = false; + pthread_mutex_unlock(&vproc_mutex); + + return rc; +} + +int CVI_MAPI_VPROC_Deinit(CVI_MAPI_VPROC_HANDLE_T vproc_hdl) +{ + VPROC_CHECK_NULL_PTR(vproc_hdl); + CVI_MAPI_VPROC_CTX_T *pt = (CVI_MAPI_VPROC_CTX_T *)vproc_hdl; + CHECK_VPROC_GRP(pt->VpssGrp); + int grp_id = pt->VpssGrp; + + pthread_mutex_lock(&vproc_mutex); + g_grp_used[grp_id] = false; + pthread_mutex_unlock(&vproc_mutex); + + CVI_LOGI("Destroy VPROC with vpss grp id %d\n", grp_id); + int i = 0; + for(i = 0; i < VPSS_MAX_PHY_CHN_NUM; i++) { + if (pt->ExtChn[i].ExtChnEnable == true) { + MMF_CHN_S stSrcChn; + MMF_CHN_S stDestChn; + + stSrcChn.enModId = CVI_ID_VPSS; + stSrcChn.s32DevId = pt->VpssGrp; + stSrcChn.s32ChnId = pt->ExtChn[i].ExtChnAttr.BindVprocChnId; + + stDestChn.enModId = CVI_ID_VPSS; + stDestChn.s32DevId = pt->ExtChn[i].ExtChnGrp; + stDestChn.s32ChnId = 0; + + CVI_S32 rc = CVI_SYS_UnBind(&stSrcChn, &stDestChn); + if (rc != CVI_SUCCESS) { + CVI_LOGE("CVI_SYS_UnBind, rc: 0x%x !\n", rc); + return CVI_MAPI_ERR_FAILURE; + } + + } + } + + for(i = 0; i < VPSS_MAX_PHY_CHN_NUM; i++) { + if (pt->ExtChn[i].ExtChnEnable == true) { + CVI_BOOL ChnEnable[VPSS_MAX_PHY_CHN_NUM] = {0}; + ChnEnable[0] = true; + g_grp_used[pt->ExtChn[i].ExtChnGrp] = false; + Vproc_Deinit(pt->ExtChn[i].ExtChnGrp, ChnEnable); + } + } + + Vproc_Deinit(pt->VpssGrp, pt->abChnEnable); + free(pt); + + return CVI_MAPI_SUCCESS; +} + +int CVI_MAPI_VPROC_GetGrp(CVI_MAPI_VPROC_HANDLE_T vproc_hdl) { + CVI_MAPI_VPROC_CTX_T *pt = (CVI_MAPI_VPROC_CTX_T *)vproc_hdl; + return pt->VpssGrp; +} + + +int CVI_MAPI_VPROC_SendFrame(CVI_MAPI_VPROC_HANDLE_T vproc_hdl, + VIDEO_FRAME_INFO_S *frame) { + CVI_MAPI_VPROC_CTX_T *pt = 
(CVI_MAPI_VPROC_CTX_T *)vproc_hdl; + CHECK_VPROC_GRP(pt->VpssGrp); + VPROC_CHECK_NULL_PTR(frame); + + if (CVI_VPSS_SendFrame(pt->VpssGrp, frame, -1) != CVI_SUCCESS) { + CVI_LOGE("CVI_VPSS_SendFrame failed\n"); + return CVI_MAPI_ERR_FAILURE; + } + + return CVI_MAPI_SUCCESS; +} + +int CVI_MAPI_VPROC_GetChnFrame(CVI_MAPI_VPROC_HANDLE_T vproc_hdl, + uint32_t chn_idx, VIDEO_FRAME_INFO_S *frame) { + CVI_MAPI_VPROC_CTX_T *pt = (CVI_MAPI_VPROC_CTX_T *)vproc_hdl; + CHECK_VPROC_GRP(pt->VpssGrp); + CHECK_VPROC_CHN(chn_idx); + VPROC_CHECK_NULL_PTR(frame); + if (!g_grp_used[pt->VpssGrp]) { + CVI_LOGE("vproc grp %d uninitialized\n", pt->VpssGrp); + return CVI_MAPI_ERR_FAILURE; + } + + if (CVI_VPSS_GetChnFrame(pt->VpssGrp, chn_idx, frame, -1) != CVI_SUCCESS) { + CVI_LOGE("CVI_VPSS_GetChnFrame failed\n"); + return CVI_MAPI_ERR_FAILURE; + } + + return CVI_MAPI_SUCCESS; +} + +int CVI_MAPI_VPROC_SendChnFrame(CVI_MAPI_VPROC_HANDLE_T vproc_hdl, + uint32_t chn_idx, VIDEO_FRAME_INFO_S *frame) { + CVI_MAPI_VPROC_CTX_T *pt = (CVI_MAPI_VPROC_CTX_T *)vproc_hdl; + CHECK_VPROC_GRP(pt->VpssGrp); + CHECK_VPROC_CHN(chn_idx); + VPROC_CHECK_NULL_PTR(frame); + if (!g_grp_used[pt->VpssGrp]) { + CVI_LOGE("vproc grp %d uninitialized\n", pt->VpssGrp); + return CVI_MAPI_ERR_FAILURE; + } + + if (CVI_VPSS_SendChnFrame(pt->VpssGrp, chn_idx, frame, -1) != CVI_SUCCESS) { + CVI_LOGE("CVI_VPSS_SendChnFrame failed\n"); + return CVI_MAPI_ERR_FAILURE; + } + + return CVI_MAPI_SUCCESS; +} + +int CVI_MAPI_VPROC_ReleaseFrame(CVI_MAPI_VPROC_HANDLE_T vproc_hdl, + uint32_t chn_idx, VIDEO_FRAME_INFO_S *frame) { + CVI_MAPI_VPROC_CTX_T *pt = (CVI_MAPI_VPROC_CTX_T *)vproc_hdl; + CHECK_VPROC_GRP(pt->VpssGrp); + CHECK_VPROC_CHN(chn_idx); + VPROC_CHECK_NULL_PTR(frame); + if (!g_grp_used[pt->VpssGrp]) { + CVI_LOGE("vproc grp %d uninitialized\n", pt->VpssGrp); + return CVI_MAPI_ERR_FAILURE; + } + + if (CVI_VPSS_ReleaseChnFrame(pt->VpssGrp, chn_idx, frame) != CVI_SUCCESS) { + CVI_LOGE("CVI_VPSS_ReleaseChnFrame failed\n"); + return CVI_MAPI_ERR_FAILURE; + } + + return CVI_MAPI_SUCCESS; +} + +/// +/// IPROC +/// + +static pthread_mutex_t iproc_mutex = PTHREAD_MUTEX_INITIALIZER; +static bool iproc_initialized = false; + +int CVI_MAPI_IPROC_Resize(VIDEO_FRAME_INFO_S *frame_in, + VIDEO_FRAME_INFO_S *frame_out, + uint32_t resize_width, + uint32_t resize_height, + PIXEL_FORMAT_E fmt_out, + bool keep_aspect_ratio, + CVI_MAPI_IPROC_RESIZE_CROP_ATTR_T *crop_in, + CVI_MAPI_IPROC_RESIZE_CROP_ATTR_T *crop_out, + CVI_MAPI_PREPROCESS_ATTR_T *preprocess) { + pthread_mutex_lock(&iproc_mutex); + + if (!iproc_initialized) { + // always dual mode: VPSS dev 0 serves VPROC, dev 1 serves IPROC + CVI_SYS_SetVPSSMode(VPSS_MODE_DUAL); + } + + VPSS_GRP VpssGrp = iproc_grp; + VPSS_CHN VpssChn = 0; + CVI_U8 VpssDev = 1; + + CVI_MAPI_IPROC_RESIZE_CROP_ATTR_T crop_out_adjusted; + uint32_t resize_width_adjusted = resize_width; + uint32_t resize_height_adjusted = resize_height; + if (crop_out) { + uint32_t in_w = frame_in->stVFrame.u32Width; + uint32_t in_h = frame_in->stVFrame.u32Height; + if (crop_in) { + in_w = crop_in->w; + in_h = crop_in->h; + } + double x_scale = 1.0 * in_w / resize_width; + double y_scale = 1.0 * in_h / resize_height; + crop_out_adjusted.x = crop_out->x * x_scale; + crop_out_adjusted.y = crop_out->y * y_scale; + crop_out_adjusted.w = crop_out->w * x_scale; + crop_out_adjusted.h = crop_out->h * y_scale; + resize_width_adjusted = crop_out->w; + resize_height_adjusted = crop_out->h; + } + + CVI_BOOL abChnEnable[VPSS_MAX_PHY_CHN_NUM] = {0}; + abChnEnable[0] = CVI_TRUE; + 
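// Editor's illustration (added comment, not in the original source): crop_out is + // specified in resize-output coordinates and was mapped back to input coordinates + // above. For example, with a 1920-wide input, resize_width = 960 and + // crop_out->x = 100: x_scale = 1920 / 960 = 2.0, so crop_out_adjusted.x = 200, + // while the channel output size stays crop_out->w x crop_out->h. + 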
VPSS_GRP_ATTR_S attr_inp; + VPSS_CHN_ATTR_S attr_chn[VPSS_MAX_PHY_CHN_NUM]; + memset((void *)&attr_inp, 0, sizeof(attr_inp)); + memset((void *)&attr_chn[0], 0, sizeof(attr_chn)); + + attr_inp.stFrameRate.s32SrcFrameRate = -1; + attr_inp.stFrameRate.s32DstFrameRate = -1; + attr_inp.enPixelFormat = frame_in->stVFrame.enPixelFormat; + attr_inp.u32MaxW = frame_in->stVFrame.u32Width; + attr_inp.u32MaxH = frame_in->stVFrame.u32Height; + attr_inp.u8VpssDev = VpssDev; + + attr_chn[0].u32Width = resize_width_adjusted; + attr_chn[0].u32Height = resize_height_adjusted; + attr_chn[0].enVideoFormat = VIDEO_FORMAT_LINEAR; + attr_chn[0].enPixelFormat = fmt_out; + attr_chn[0].stFrameRate.s32SrcFrameRate = -1; + attr_chn[0].stFrameRate.s32DstFrameRate = -1; + attr_chn[0].u32Depth = 1; // chn output queue size + attr_chn[0].bMirror = CVI_FALSE; + attr_chn[0].bFlip = CVI_FALSE; + attr_chn[0].stAspectRatio.enMode = keep_aspect_ratio ? ASPECT_RATIO_AUTO : ASPECT_RATIO_NONE; + if (keep_aspect_ratio) { + attr_chn[0].stAspectRatio.bEnableBgColor = CVI_TRUE; + attr_chn[0].stAspectRatio.u32BgColor = 0x00000000; + } + attr_chn[0].stNormalize.bEnable = CVI_FALSE; + + // preprocess + if (preprocess) { + CVI_MAPI_PREPROCESS_ENABLE(&attr_chn[0], preprocess); + } + + CVI_S32 rc; + /*start vpss*/ + if (!iproc_initialized) { + rc = Vproc_Init(VpssGrp, abChnEnable, &attr_inp, attr_chn); + if (rc != CVI_SUCCESS) { + CVI_LOGE("Vproc_Init failed. rc: 0x%x !\n", rc); + goto err; + } + } else { + rc = CVI_VPSS_SetGrpAttr(VpssGrp, &attr_inp); + if (rc != CVI_SUCCESS) { + CVI_LOGE("CVI_VPSS_SetGrpAttr failed. rc: 0x%x !\n", rc); + goto err; + } + for (int i = 0; i < VPSS_MAX_PHY_CHN_NUM; i++) { + if (abChnEnable[i]) { + VpssChn = i; + rc = CVI_VPSS_SetChnAttr(VpssGrp, VpssChn, &attr_chn[VpssChn]); + if (rc != CVI_SUCCESS) { + CVI_LOGE("CVI_VPSS_SetChnAttr failed. rc: 0x%x !\n", rc); + goto err; + } + } + } + } + if (!iproc_initialized) { + iproc_initialized = true; + } + + if (crop_in && crop_out) { + VPSS_CROP_INFO_S stCropInInfo; + stCropInInfo.bEnable = CVI_TRUE; + stCropInInfo.enCropCoordinate = VPSS_CROP_ABS_COOR; + stCropInInfo.stCropRect.s32X = crop_in->x; + stCropInInfo.stCropRect.s32Y = crop_in->y; + stCropInInfo.stCropRect.u32Width = crop_in->w; + stCropInInfo.stCropRect.u32Height = crop_in->h; + CVI_LOGV("Crop IN, %d %d %d %d\n", + stCropInInfo.stCropRect.s32X, + stCropInInfo.stCropRect.s32Y, + stCropInInfo.stCropRect.u32Width, + stCropInInfo.stCropRect.u32Height); + rc = CVI_VPSS_SetGrpCrop(VpssGrp, &stCropInInfo); + if (rc != CVI_SUCCESS) { + CVI_LOGE("CVI_VPSS_SetGrpCrop failed. rc: 0x%x !\n", rc); + goto err; + } + + VPSS_CROP_INFO_S stCropOutInfo; + stCropOutInfo.bEnable = CVI_TRUE; + stCropOutInfo.enCropCoordinate = VPSS_CROP_ABS_COOR; + stCropOutInfo.stCropRect.s32X = crop_out_adjusted.x; + stCropOutInfo.stCropRect.s32Y = crop_out_adjusted.y; + stCropOutInfo.stCropRect.u32Width = crop_out_adjusted.w; + stCropOutInfo.stCropRect.u32Height = crop_out_adjusted.h; + CVI_LOGV("Crop OUT, %d %d %d %d\n", + stCropOutInfo.stCropRect.s32X, + stCropOutInfo.stCropRect.s32Y, + stCropOutInfo.stCropRect.u32Width, + stCropOutInfo.stCropRect.u32Height); + rc = CVI_VPSS_SetChnCrop(VpssGrp, VpssChn, &stCropOutInfo); + if (rc != CVI_SUCCESS) { + CVI_LOGE("CVI_VPSS_SetChnCrop failed. 
rc: 0x%x !\n", rc); + goto err; + } + } else if (crop_in) { + VPSS_CROP_INFO_S stCropInInfo; + stCropInInfo.bEnable = CVI_TRUE; + stCropInInfo.enCropCoordinate = VPSS_CROP_ABS_COOR; + stCropInInfo.stCropRect.s32X = crop_in->x; + stCropInInfo.stCropRect.s32Y = crop_in->y; + stCropInInfo.stCropRect.u32Width = crop_in->w; + stCropInInfo.stCropRect.u32Height = crop_in->h; + CVI_LOGV("Crop IN, %d %d %d %d\n", + stCropInInfo.stCropRect.s32X, + stCropInInfo.stCropRect.s32Y, + stCropInInfo.stCropRect.u32Width, + stCropInInfo.stCropRect.u32Height); + rc = CVI_VPSS_SetGrpCrop(VpssGrp, &stCropInInfo); + if (rc != CVI_SUCCESS) { + CVI_LOGE("CVI_VPSS_SetGrpCrop failed. rc: 0x%x !\n", rc); + goto err; + } + + VPSS_CROP_INFO_S stCropDisableInfo; + stCropDisableInfo.bEnable = CVI_FALSE; + rc = CVI_VPSS_SetChnCrop(VpssGrp, VpssChn, &stCropDisableInfo); + if (rc != CVI_SUCCESS) { + CVI_LOGE("CVI_VPSS_SetChnCrop failed. rc: 0x%x !\n", rc); + goto err; + } + + } else if (crop_out) { + VPSS_CROP_INFO_S stCropDisableInfo; + stCropDisableInfo.bEnable = CVI_FALSE; + rc = CVI_VPSS_SetGrpCrop(VpssGrp, &stCropDisableInfo); + if (rc != CVI_SUCCESS) { + CVI_LOGE("CVI_VPSS_SetGrpCrop failed. rc: 0x%x !\n", rc); + goto err; + } + + VPSS_CROP_INFO_S stCropOutInfo; + stCropOutInfo.bEnable = CVI_TRUE; + stCropOutInfo.enCropCoordinate = VPSS_CROP_ABS_COOR; + stCropOutInfo.stCropRect.s32X = crop_out_adjusted.x; + stCropOutInfo.stCropRect.s32Y = crop_out_adjusted.y; + stCropOutInfo.stCropRect.u32Width = crop_out_adjusted.w; + stCropOutInfo.stCropRect.u32Height = crop_out_adjusted.h; + CVI_LOGV("Crop OUT, %d %d %d %d\n", + stCropOutInfo.stCropRect.s32X, + stCropOutInfo.stCropRect.s32Y, + stCropOutInfo.stCropRect.u32Width, + stCropOutInfo.stCropRect.u32Height); + rc = CVI_VPSS_SetChnCrop(VpssGrp, VpssChn, &stCropOutInfo); + if (rc != CVI_SUCCESS) { + CVI_LOGE("CVI_VPSS_SetChnCrop failed. rc: 0x%x !\n", rc); + goto err; + } + + } else { + VPSS_CROP_INFO_S stCropDisableInfo; + stCropDisableInfo.bEnable = CVI_FALSE; + rc = CVI_VPSS_SetGrpCrop(VpssGrp, &stCropDisableInfo); + if (rc != CVI_SUCCESS) { + CVI_LOGE("CVI_VPSS_SetGrpCrop failed. rc: 0x%x !\n", rc); + goto err; + } + rc = CVI_VPSS_SetChnCrop(VpssGrp, VpssChn, &stCropDisableInfo); + if (rc != CVI_SUCCESS) { + CVI_LOGE("CVI_VPSS_SetChnCrop failed. rc: 0x%x !\n", rc); + goto err; + } + } + + rc = CVI_VPSS_SendFrame(VpssGrp, frame_in, -1); + if (rc != CVI_SUCCESS) { + CVI_LOGE("CVI_VPSS_SendFrame failed. 
rc: 0x%x !\n", rc); + goto err; + } + + rc = CVI_VPSS_GetChnFrame(VpssGrp, VpssChn, frame_out, -1); + if (rc != CVI_SUCCESS) { + CVI_LOGE("CVI_VPSS_GetChnFrame failed with %#x\n", rc); + goto err; + } + + pthread_mutex_unlock(&iproc_mutex); + return CVI_MAPI_SUCCESS; +err: + pthread_mutex_unlock(&iproc_mutex); + return CVI_MAPI_ERR_FAILURE; + +} + +int CVI_MAPI_IPROC_ReleaseFrame(VIDEO_FRAME_INFO_S *frm) { + // IPROC always uses its dedicated group (iproc_grp), channel 0 + VPSS_GRP VpssGrp = iproc_grp; + VPSS_CHN VpssChn = 0; + + CVI_S32 rc; + rc = CVI_VPSS_ReleaseChnFrame(VpssGrp, VpssChn, frm); + if (rc != CVI_SUCCESS) { + CVI_LOGE("CVI_VPSS_ReleaseChnFrame failed with %#x\n", rc); + return CVI_MAPI_ERR_FAILURE; + } + + return CVI_MAPI_SUCCESS; +} + +int CVI_MAPI_IPROC_Deinit() +{ + CVI_LOGI("Destroy IPROC with vpss grp id %d\n", iproc_grp); + CVI_S32 s32Ret = CVI_SUCCESS; + + s32Ret = CVI_VPSS_DisableChn(iproc_grp, 0); + if (s32Ret != CVI_SUCCESS) { + CVI_LOGE("CVI_VPSS_DisableChn failed with %#x!\n", s32Ret); + } + s32Ret = CVI_VPSS_StopGrp(iproc_grp); + if (s32Ret != CVI_SUCCESS) { + CVI_LOGE("CVI_VPSS_StopGrp failed with %#x!\n", s32Ret); + } + s32Ret = CVI_VPSS_DestroyGrp(iproc_grp); + if (s32Ret != CVI_SUCCESS) { + CVI_LOGE("CVI_VPSS_DestroyGrp failed with %#x!\n", s32Ret); + } + + // reset so a later CVI_MAPI_IPROC_Resize() re-creates the group + pthread_mutex_lock(&iproc_mutex); + iproc_initialized = false; + pthread_mutex_unlock(&iproc_mutex); + + return CVI_MAPI_SUCCESS; +} diff --git a/cviruntime/samples/samples_extra/insightface_vpss/cvi_media_sdk.h b/cviruntime/samples/samples_extra/insightface_vpss/cvi_media_sdk.h new file mode 100644 index 000000000..b511575ec --- /dev/null +++ b/cviruntime/samples/samples_extra/insightface_vpss/cvi_media_sdk.h @@ -0,0 +1,227 @@ +#ifndef _CVI_MEDIA_SDK_H_ +#define _CVI_MEDIA_SDK_H_ + +#include "stdint.h" +#include "stdbool.h" +#include "stddef.h" +#include <stdio.h> +#include <stdlib.h> +#include <pthread.h> + +#include "sample_comm.h" +#include "cvi_sys.h" +#include "cvi_type.h" + + +#define CVI_MAPI_SUCCESS ((int)(0)) +#define CVI_MAPI_ERR_FAILURE ((int)(-1001)) +#define CVI_MAPI_ERR_NOMEM ((int)(-1002)) +#define CVI_MAPI_ERR_TIMEOUT ((int)(-1003)) +#define CVI_MAPI_ERR_INVALID ((int)(-1004)) +#define MAX_VPSS_GRP_NUM (16) +#define CVI_MAPI_VPROC_MAX_CHN_NUM (3) + +#ifndef SAMPLE_CHECK_RET +#define SAMPLE_CHECK_RET(express) \ + do { \ + int rc = express; \ + if (rc != 0) { \ + printf("\nFailed at %s: %d (rc: %#x!)\n", \ + __FILE__, __LINE__, rc); \ + return rc; \ + } \ + } while (0) +#endif + + +#ifndef UNUSED +# define UNUSED(x) (void)(x) +#endif + +#ifndef CVI_LOG_ASSERT +#define CVI_LOG_ASSERT(x, ...) \ + do { \ + if (!(x)) { \ + printf(__VA_ARGS__); \ + abort(); \ + } \ + } while(0) +#endif +#define CVI_LOGE(...) printf(__VA_ARGS__) +#define CVI_LOGI(...) printf(__VA_ARGS__) +#define CVI_LOGV(...) printf(__VA_ARGS__) +#define CVI_LOGV_MEM(...) 
printf(__VA_ARGS__) + +#define CVI_MAPI_VB_POOL_MAX_NUM (16) + +#define CHECK_VPROC_GRP(grp) do { \ + if (grp >= MAX_VPSS_GRP_NUM) { \ + CVI_LOGE("VprocGrp(%d) exceeds Max(%d)\n", grp, MAX_VPSS_GRP_NUM); \ + return CVI_MAPI_ERR_INVALID; \ + } \ + } while (0) + +#define CHECK_VPROC_CHN(chn) do { \ + if (chn >= CVI_MAPI_VPROC_MAX_CHN_NUM) { \ + CVI_LOGE("VprocChn(%d) exceeds Max(%d)\n", chn, CVI_MAPI_VPROC_MAX_CHN_NUM); \ + return CVI_MAPI_ERR_INVALID; \ + } \ +} while (0) + +#define VPROC_CHECK_NULL_PTR(ptr) \ + do { \ + if (!(ptr)) { \ + CVI_LOGE("NULL pointer\n"); \ + return CVI_MAPI_ERR_INVALID; \ + } \ + } while (0) + +typedef struct CVI_MAPI_MEDIA_SYS_VB_POOL_S { + union cvi_vb_blk_size { + uint32_t size; + struct cvi_vb_blk_frame_s { + uint32_t width; + uint32_t height; + PIXEL_FORMAT_E fmt; + } frame; + } vb_blk_size; + bool is_frame; + uint32_t vb_blk_num; +} CVI_MAPI_MEDIA_SYS_VB_POOL_T; + + +typedef struct CVI_MAPI_MEDIA_SYS_ATTR_S { + CVI_MAPI_MEDIA_SYS_VB_POOL_T vb_pool[CVI_MAPI_VB_POOL_MAX_NUM]; + uint32_t vb_pool_num; + VI_VPSS_MODE_S stVIVPSSMode; + VPSS_MODE_S stVPSSMode; +} CVI_MAPI_MEDIA_SYS_ATTR_T; + +typedef void * CVI_MAPI_HANDLE_T; + + +typedef struct CVI_MAPI_PREPROCESS_ATTR_S { + bool is_rgb; // default false + float raw_scale; // default 255.0 means no raw_scale + float mean[3]; // in BGR order + float input_scale[3]; // in BGR order, combined input_scale and std[3] + float qscale; +} CVI_MAPI_PREPROCESS_ATTR_T; + +typedef CVI_MAPI_HANDLE_T CVI_MAPI_VPROC_HANDLE_T; + + +typedef struct CVI_MAPI_VPROC_ATTR_S { + VPSS_GRP_ATTR_S attr_inp; + int chn_num; + VPSS_CHN_ATTR_S attr_chn[CVI_MAPI_VPROC_MAX_CHN_NUM]; +} CVI_MAPI_VPROC_ATTR_T; + +typedef int (*PFN_VPROC_FrameDataProc)(uint32_t Grp, + uint32_t Chn, VIDEO_FRAME_INFO_S *pFrame, void *pPrivateData); + +typedef struct CVI_DUMP_FRAME_CALLBACK_FUNC_S { + PFN_VPROC_FrameDataProc pfunFrameProc; + void *pPrivateData; +} CVI_DUMP_FRAME_CALLBACK_FUNC_T; + +typedef struct VPROC_DUMP_CTX_S { + CVI_BOOL bStart; + CVI_U32 Grp; + CVI_S32 Chn; + CVI_S32 s32Count; + CVI_DUMP_FRAME_CALLBACK_FUNC_T stCallbackFun; + pthread_t pthreadDump; +} VPROC_DUMP_CTX_T; + +typedef struct CVI_MAPI_EXTCHN_ATTR_S { + uint32_t ChnId; + uint32_t BindVprocChnId; + VPSS_CHN_ATTR_S VpssChnAttr; +} CVI_MAPI_EXTCHN_ATTR_T; + +typedef struct EXT_VPROC_CHN_CTX_S { + CVI_U32 ExtChnGrp; + CVI_BOOL ExtChnEnable; + CVI_MAPI_EXTCHN_ATTR_T ExtChnAttr; +} EXT_VPROC_CHN_CTX_T; + +typedef struct CVI_MAPI_VPROC_CTX_S { + VPSS_GRP VpssGrp; + CVI_BOOL abChnEnable[VPSS_MAX_PHY_CHN_NUM]; + CVI_MAPI_VPROC_ATTR_T attr; + EXT_VPROC_CHN_CTX_T ExtChn[VPSS_MAX_PHY_CHN_NUM]; + VPROC_DUMP_CTX_T stVprocDumpCtx; +} CVI_MAPI_VPROC_CTX_T; + +typedef struct CVI_MAPI_IPROC_RECT_S { + uint32_t x; + uint32_t y; + uint32_t w; + uint32_t h; +} CVI_MAPI_IPROC_RECT_T, CVI_MAPI_IPROC_RESIZE_CROP_ATTR_T; + + +int CVI_MAPI_Media_Init(CVI_MAPI_MEDIA_SYS_ATTR_T *attr); +int CVI_MAPI_Media_Deinit(void); +int CVI_MAPI_ReleaseFrame(VIDEO_FRAME_INFO_S *frm); +int CVI_MAPI_GetFrameFromMemory_YUV(VIDEO_FRAME_INFO_S *frm, + uint32_t width, uint32_t height, PIXEL_FORMAT_E fmt, void *data); +int CVI_MAPI_GetFrameFromFile_YUV(VIDEO_FRAME_INFO_S *frame, + uint32_t width, uint32_t height, PIXEL_FORMAT_E fmt, + const char *filename, uint32_t frame_no); +int CVI_MAPI_SaveFramePixelData(VIDEO_FRAME_INFO_S *frm, const char *name); +int CVI_MAPI_AllocateFrame(VIDEO_FRAME_INFO_S *frm, + 
uint32_t width, uint32_t height, PIXEL_FORMAT_E fmt); +int CVI_MAPI_FrameMmap(VIDEO_FRAME_INFO_S *frm, bool enable_cache); +int CVI_MAPI_FrameMunmap(VIDEO_FRAME_INFO_S *frm); +int CVI_MAPI_FrameFlushCache(VIDEO_FRAME_INFO_S *frm); +int CVI_MAPI_FrameInvalidateCache(VIDEO_FRAME_INFO_S *frm); + +void CVI_MAPI_PREPROCESS_ENABLE(VPSS_CHN_ATTR_S *attr_chn, + CVI_MAPI_PREPROCESS_ATTR_T *preprocess); +CVI_MAPI_VPROC_ATTR_T CVI_MAPI_VPROC_DefaultAttr_OneChn( + uint32_t width_in, + uint32_t height_in, + PIXEL_FORMAT_E pixel_format_in, + uint32_t width_out, + uint32_t height_out, + PIXEL_FORMAT_E pixel_format_out); + +CVI_MAPI_VPROC_ATTR_T CVI_MAPI_VPROC_DefaultAttr_TwoChn( + uint32_t width_in, + uint32_t height_in, + PIXEL_FORMAT_E pixel_format_in, + uint32_t width_out0, + uint32_t height_out0, + PIXEL_FORMAT_E pixel_format_out0, + uint32_t width_out1, + uint32_t height_out1, + PIXEL_FORMAT_E pixel_format_out1); + +int CVI_MAPI_VPROC_Init(CVI_MAPI_VPROC_HANDLE_T *vproc_hdl, + int grp_id, CVI_MAPI_VPROC_ATTR_T *attr); +int CVI_MAPI_VPROC_Deinit(CVI_MAPI_VPROC_HANDLE_T vproc_hdl); +int CVI_MAPI_VPROC_GetGrp(CVI_MAPI_VPROC_HANDLE_T vproc_hdl); +int CVI_MAPI_VPROC_SendFrame(CVI_MAPI_VPROC_HANDLE_T vproc_hdl, + VIDEO_FRAME_INFO_S *frame); +int CVI_MAPI_VPROC_GetChnFrame(CVI_MAPI_VPROC_HANDLE_T vproc_hdl, + uint32_t chn_idx, VIDEO_FRAME_INFO_S *frame); +int CVI_MAPI_VPROC_SendChnFrame(CVI_MAPI_VPROC_HANDLE_T vproc_hdl, + uint32_t chn_idx, VIDEO_FRAME_INFO_S *frame); +int CVI_MAPI_VPROC_ReleaseFrame(CVI_MAPI_VPROC_HANDLE_T vproc_hdl, + uint32_t chn_idx, VIDEO_FRAME_INFO_S *frame); +int CVI_MAPI_IPROC_Resize(VIDEO_FRAME_INFO_S *frame_in, + VIDEO_FRAME_INFO_S *frame_out, + uint32_t resize_width, + uint32_t resize_height, + PIXEL_FORMAT_E fmt_out, + bool keep_aspect_ratio, + CVI_MAPI_IPROC_RESIZE_CROP_ATTR_T *crop_in, + CVI_MAPI_IPROC_RESIZE_CROP_ATTR_T *crop_out, + CVI_MAPI_PREPROCESS_ATTR_T *preprocess); +int CVI_MAPI_IPROC_ReleaseFrame(VIDEO_FRAME_INFO_S *frm); +int CVI_MAPI_IPROC_Deinit(); +#endif diff --git a/cviruntime/samples/samples_extra/insightface_vpss/fd_fr_demo_mpi.cpp b/cviruntime/samples/samples_extra/insightface_vpss/fd_fr_demo_mpi.cpp new file mode 100644 index 000000000..c0878eafe --- /dev/null +++ b/cviruntime/samples/samples_extra/insightface_vpss/fd_fr_demo_mpi.cpp @@ -0,0 +1,562 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "cvi_media_sdk.h" + +#include "cviruntime.h" + +#include "FacePreprocess.h" +#include "affine_hw.h" +#include "type_define.h" + +#define IMAGE_WIDTH (1920) +#define IMAGE_HEIGHT (1080) +#define DISP_WIDTH (1280) +#define DISP_HEIGHT (720) +#define DISP_FMT (PIXEL_FORMAT_YUV_PLANAR_420) + +#define FD_IMG_RESIZE_WIDTH (600) +#define FD_IMG_RESIZE_HEIGHT (600) + +#define FD_THRESHOLD (0.1) + +#define FR_IMG_RESIZE_WIDTH (112) +#define FR_IMG_RESIZE_HEIGHT (112) + +#define HW_AFFINE + +#define INPUT_FMT PIXEL_FORMAT_RGB_888_PLANAR + +static int face_detect(CVI_MODEL_HANDLE model, + VIDEO_FRAME_INFO_S *frame_in, + face_rect_t *dets, uint32_t *det_num, + bool dump) { + VIDEO_FRAME_INFO_S *frame_preprocessed; + VIDEO_FRAME_INFO_S frame_preprocessed_local; + + // get input and output size + CVI_TENSOR *input_tensors, *output_tensors; + int32_t input_num, output_num; + CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num, + &output_tensors, &output_num); + CVI_TENSOR *input = CVI_NN_GetTensorByName(CVI_NN_DEFAULT_TENSOR, + input_tensors, input_num); + + // 
retinaface model + // export MODEL_CHANNEL_ORDER = "rgb" + // export RAW_SCALE = 255.0 + // export MEAN = 0, 0, 0 + // export INPUT_SCALE = 1.0 + // Threshold data 128.000000 + // CVI_MAPI_PREPROCESS_ATTR_T pp_attr; + // pp_attr.is_rgb = true; + // pp_attr.raw_scale = 255.0f; + // pp_attr.mean[0] = 0.0f; + // pp_attr.mean[1] = 0.0f; + // pp_attr.mean[2] = 0.0f; + // pp_attr.input_scale[0] = 1.0f; + // pp_attr.input_scale[1] = 1.0f; + // pp_attr.input_scale[2] = 1.0f; + // pp_attr.qscale = CVI_NN_TensorQuantScale(input); + SAMPLE_CHECK_RET(CVI_MAPI_IPROC_Resize(frame_in, &frame_preprocessed_local, + FD_IMG_RESIZE_WIDTH, FD_IMG_RESIZE_HEIGHT, + INPUT_FMT, + false, NULL, NULL, NULL)); + + if (dump) { + SAMPLE_CHECK_RET(CVI_MAPI_SaveFramePixelData(&frame_preprocessed_local, + "face_resized")); + } + frame_preprocessed = &frame_preprocessed_local; + + CVI_NN_SetTensorPhysicalAddr(input, frame_preprocessed->stVFrame.u64PhyAddr[0]); + + // run inference + CVI_NN_Forward(model, input_tensors, input_num, + output_tensors, output_num); + + SAMPLE_CHECK_RET(CVI_MAPI_IPROC_ReleaseFrame(&frame_preprocessed_local)); + + // get output + // output: [x1,y1,x2,y2,score,landmarks] + CVI_TENSOR *output = CVI_NN_GetTensorByName("output", + output_tensors, output_num); + int32_t num = output->shape.dim[2]; + int32_t dim = output->shape.dim[3]; + //printf("num %d, dim %d\n", num, dim); + + // process output + int face_idx = 0; + float *det = (float *)CVI_NN_TensorPtr(output); + memset(dets, 0, sizeof(face_rect_t) * FD_MAX_DET_NUM); + for (int i = 0; i < num; ++i) { + if (det[4] != 0) { + dets[face_idx].score = det[4]; + dets[face_idx].x1 = det[0]; + dets[face_idx].y1 = det[1]; + dets[face_idx].x2 = det[2]; + dets[face_idx].y2 = det[3]; + memcpy(dets[face_idx].landmarks, &det[5], sizeof(float) * 10); + + face_idx++; + + if (face_idx >= FD_MAX_DET_NUM) + break; + } + det += dim; + } + //printf("%d faces detected\n", face_idx); + + float sw = 1.0f * FD_IMG_RESIZE_WIDTH / frame_in->stVFrame.u32Width; + float sh = 1.0f * FD_IMG_RESIZE_HEIGHT / frame_in->stVFrame.u32Height; + + // map detection coordinates back to the original image size + for (int i = 0; i < face_idx; ++i) { + dets[i].x1 /= sw; + dets[i].y1 /= sh; + dets[i].x2 /= sw; + dets[i].y2 /= sh; + + for (int j = 0; j < 5; j++) { + dets[i].landmarks[j][0] /= sw; + dets[i].landmarks[j][1] /= sh; + } + } + + *det_num = face_idx; + return 0; +} + +#ifdef HW_AFFINE +static int face_align_by_hw(VIDEO_FRAME_INFO_S *frame_in, face_rect_t *det, + VIDEO_FRAME_INFO_S *frame_aligned, bool dump) { + //init frame_aligned + uint32_t width = FR_IMG_RESIZE_WIDTH; + uint32_t height = FR_IMG_RESIZE_HEIGHT; + SAMPLE_CHECK_RET(CVI_MAPI_AllocateFrame(frame_aligned, width, height, frame_in->stVFrame.enPixelFormat)); + SAMPLE_CHECK_RET(face_align_gdc(frame_in, frame_aligned, *det)); + + if (dump) { + SAMPLE_CHECK_RET(CVI_MAPI_SaveFramePixelData(frame_aligned, "face_aligned")); + } + return 0; +} +#else +static int copy_mat_to_frame(cv::Mat& mat, void * frame_data, + int32_t stride) { + CVI_LOG_ASSERT(frame_data != NULL, "Null Frame!\n"); + CVI_LOG_ASSERT(mat.cols <= stride, "Error param!\n"); + if (stride == mat.cols) { + memcpy(frame_data, mat.data, mat.cols * mat.rows); + return 0; + } + uint8_t * frame_ptr = (uint8_t *)frame_data; + uint8_t * mat_ptr = mat.data; + uint32_t frame_step = stride; + uint32_t mat_step = mat.cols; + for (int i = 0; i < mat.rows; ++i) { + memcpy(frame_ptr, mat_ptr, mat_step); + frame_ptr += frame_step; + mat_ptr += mat_step; + } + return 0; +} + +static int 
face_align_by_sw(VIDEO_FRAME_INFO_S *frame_in, face_rect_t *det, + VIDEO_FRAME_INFO_S *frame_aligned, bool dump) { + int x = det->x1; + int y = det->y1; + int w = det->x2 - x; + int h = det->y2 - y; + + //expand the face's rect + float pad_scale = 0.6; + cv::Rect frame_rect(0, 0, + frame_in->stVFrame.u32Width, + frame_in->stVFrame.u32Height); + cv::Rect face_pad_rect( + x - w * pad_scale / 2, + y - h * pad_scale / 2, + w * (1 + pad_scale), + h * (1 + pad_scale)); + face_pad_rect.x = ALIGN(face_pad_rect.x, 2); + face_pad_rect.y = ALIGN(face_pad_rect.y, 2); + face_pad_rect.width = ALIGN(face_pad_rect.width, 2); + face_pad_rect.height = ALIGN(face_pad_rect.height, 2); + face_pad_rect &= frame_rect; + + //adjust the coordinate + det->x1 -= face_pad_rect.x; + det->y1 -= face_pad_rect.y; + det->x2 -= face_pad_rect.x; + det->y2 -= face_pad_rect.y; + + for (int i = 0; i < 5; i++) { + det->landmarks[i][0] = std::max(0.0f, det->landmarks[i][0] - face_pad_rect.x); + det->landmarks[i][1] = std::max(0.0f, det->landmarks[i][1] - face_pad_rect.y); + } + + //crop face + CVI_MAPI_IPROC_RESIZE_CROP_ATTR_T crop_in_attr; + crop_in_attr.x = face_pad_rect.x; + crop_in_attr.y = face_pad_rect.y; + crop_in_attr.w = face_pad_rect.width; + crop_in_attr.h = face_pad_rect.height; + + //printf(" crop [%d, %d, %d, %d]\n", + // crop_in_attr.x, crop_in_attr.y, + // crop_in_attr.w, crop_in_attr.h); + + VIDEO_FRAME_INFO_S frame_crop; + SAMPLE_CHECK_RET(CVI_MAPI_IPROC_Resize(frame_in, &frame_crop, + crop_in_attr.w, crop_in_attr.h, + PIXEL_FORMAT_RGB_888_PLANAR, + false, &crop_in_attr, NULL, NULL)); + + if (dump) { + SAMPLE_CHECK_RET(CVI_MAPI_SaveFramePixelData(&frame_crop, "face_crop")); + } + + // affine + // TODO: use hardware warp engine + // + float ref_pts[5][2] = { + { 30.2946f, 51.6963f }, + { 65.5318f, 51.5014f }, + { 48.0252f, 71.7366f }, + { 33.5493f, 92.3655f }, + { 62.7299f, 92.2041f } + }; + cv::Mat ref(5, 2, CV_32FC1, ref_pts); + cv::Mat dst(5, 2, CV_32FC1, det->landmarks); + + auto m = FacePreprocess::similarTransform(dst, ref); + + //std::cout << "ref =" << std::endl << ref << std::endl; + //std::cout << "dst =" << std::endl << dst << std::endl; + //std::cout << "m =" << std::endl << m << std::endl; + + uint32_t width = FR_IMG_RESIZE_WIDTH; + uint32_t height = FR_IMG_RESIZE_HEIGHT; + SAMPLE_CHECK_RET(CVI_MAPI_AllocateFrame(frame_aligned, width, height, + PIXEL_FORMAT_RGB_888_PLANAR)); + CVI_MAPI_FrameMmap(frame_aligned, true); + + CVI_MAPI_FrameMmap(&frame_crop, true); + CVI_MAPI_FrameInvalidateCache(&frame_crop); + + for (int i = 0; i < 3; ++i) { + // each channel do warp alone + cv::Mat channel_in = cv::Mat(frame_crop.stVFrame.u32Height, + frame_crop.stVFrame.u32Width, + CV_8UC1, + frame_crop.stVFrame.pu8VirAddr[i], + frame_crop.stVFrame.u32Stride[i]); + cv::Mat channel_out; + cv::warpPerspective(channel_in, channel_out, m, + cv::Size(96, 112), cv::INTER_LINEAR); + cv::resize(channel_out, channel_out, + cv::Size(FR_IMG_RESIZE_WIDTH, FR_IMG_RESIZE_HEIGHT), + 0, 0, cv::INTER_LINEAR); + copy_mat_to_frame(channel_out, + frame_aligned->stVFrame.pu8VirAddr[i], + frame_aligned->stVFrame.u32Stride[i]); + } + // Unmap + CVI_MAPI_FrameMunmap(&frame_crop); + + // Flush cache + CVI_MAPI_FrameFlushCache(frame_aligned); + CVI_MAPI_FrameMunmap(frame_aligned); + + if (dump) { + SAMPLE_CHECK_RET(CVI_MAPI_SaveFramePixelData(frame_aligned, "face_aligned")); + } + + CVI_MAPI_ReleaseFrame(&frame_crop); + + return 0; +} +#endif + +static cv::Mat face_extract(CVI_MODEL_HANDLE model, + VIDEO_FRAME_INFO_S *frame_in) { + cv::Mat 
feature(512, 1, CV_32FC1); + + CVI_TENSOR *input_tensors, *output_tensors; + int32_t input_num, output_num; + CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num, &output_tensors, &output_num); + CVI_TENSOR *input = CVI_NN_GetTensorByName(CVI_NN_DEFAULT_TENSOR, input_tensors, input_num); + //printf("fr input tensor size:%ld\n", CVI_NN_TensorSize(input)); + //printf("fr quant scale:%lf\n", CVI_NN_TensorQuantScale(input)); + + // IPROC PREPROCESS + VIDEO_FRAME_INFO_S frame_out; + // e.g. model + // export MODEL_CHANNEL_ORDER = "rgb" + // export RAW_SCALE = 255 + // export MEAN = 127.5, 127.5, 127.5 + // export INPUT_SCALE = 1 + // Threshold data 128.0 + // CVI_MAPI_PREPROCESS_ATTR_T pp_attr; + // pp_attr.is_rgb = true; + // pp_attr.raw_scale = 255.0f; + // pp_attr.mean[0] = 127.5f; + // pp_attr.mean[1] = 127.5f; + // pp_attr.mean[2] = 127.5f; + // pp_attr.input_scale[0] = 0.0078125f; + // pp_attr.input_scale[1] = 0.0078125f; + // pp_attr.input_scale[2] = 0.0078125f; + // pp_attr.qscale = CVI_NN_TensorQuantScale(input); + + int ret = 0; + ret = CVI_MAPI_IPROC_Resize(frame_in, &frame_out, + frame_in->stVFrame.u32Width, frame_in->stVFrame.u32Height, + INPUT_FMT, + false, NULL, NULL, NULL); + if (ret != 0) { + printf("CVI_MAPI_IPROC_Resize failed, err %d\n", ret); + exit(-1); + } + + // use device memory + CVI_NN_SetTensorPhysicalAddr(input, frame_out.stVFrame.u64PhyAddr[0]); + + // run inference + ret = CVI_NN_Forward(model, input_tensors, input_num, + output_tensors, output_num); + if (ret != CVI_RC_SUCCESS) { + printf("CVI_NN_Forward failed, err %d\n", ret); + return cv::Mat(); + } + printf("CVI_NN_Forward succeeded\n"); + + CVI_TENSOR *output = CVI_NN_GetTensorByName("fc1_scale_dequant*", output_tensors, output_num); + memcpy(feature.data, (float*)CVI_NN_TensorPtr(output), CVI_NN_TensorSize(output)); + CVI_MAPI_IPROC_ReleaseFrame(&frame_out); + + return feature; +} + +static float cal_similarity(cv::Mat& feature1, cv::Mat& feature2) { + return feature1.dot(feature2) / (cv::norm(feature1) * cv::norm(feature2)); +} + +static void save_feature_bin(cv::Mat& feature, const char *filename) { + FILE *output; + output = fopen(filename, "wb"); + int len = feature.total() * feature.elemSize(); + fwrite(feature.data, len, 1, output); + fclose(output); +} + +static void save_feature_txt(cv::Mat& feature, const char *filename) { + FILE *output; + output = fopen(filename, "w"); + for (int i = 0; i < feature.rows; ++i) { + for (int j = 0; j < feature.cols; ++j) { + fprintf(output, "%f\n", feature.at<float>(i, j)); + } + } + fclose(output); +} + +static int get_boot_time(uint64_t *time_us) +{ + struct timespec timeo; + clock_gettime(CLOCK_MONOTONIC, &timeo); + *time_us = (uint64_t)timeo.tv_sec * 1000000 + timeo.tv_nsec / 1000; + return 0; +} + +static inline uint64_t align_up(uint64_t x, uint64_t n) +{ + return (x + n - 1) / n * n; +} + +static int load_image_to_frame(VIDEO_FRAME_INFO_S *in_frame, cv::Mat &mat) { + VIDEO_FRAME_INFO_S bgr_frame; + CVI_MAPI_AllocateFrame(&bgr_frame, mat.cols, mat.rows, PIXEL_FORMAT_BGR_888); + CVI_MAPI_FrameMmap(&bgr_frame, true); + uint8_t *src_ptr = mat.data; + uint8_t *dst_ptr = bgr_frame.stVFrame.pu8VirAddr[0]; + for (int h = 0; h < mat.rows; ++h) { + memcpy(dst_ptr, src_ptr, 3 * mat.cols); + src_ptr += 3 * mat.cols; + dst_ptr += bgr_frame.stVFrame.u32Stride[0]; + } + CVI_MAPI_FrameFlushCache(&bgr_frame); + CVI_MAPI_FrameMunmap(&bgr_frame); + + SAMPLE_CHECK_RET(CVI_MAPI_IPROC_Resize(&bgr_frame, in_frame, + mat.cols, mat.rows, + INPUT_FMT, + false, NULL, NULL, NULL)); + 
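// Editor's note (added comment, not in the original source): the 1:1 "resize" + // above is used purely as a hardware colour-space conversion, turning the packed + // BGR frame filled from the cv::Mat into the planar INPUT_FMT layout the models + // expect; width and height are unchanged. + 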
CVI_MAPI_ReleaseFrame(&bgr_frame); + return 0; +} + +static void usage(char **argv) { + printf("Usage:\n"); + printf(" %s cvimodel_det cvimodel_rec image_file [frame_count]\n", argv[0]); +} + + +int main(int argc, char* argv[]) { + if (argc != 4 && argc != 5) { + usage(argv); + exit(-1); + } + + char *modelfile_det = argv[1]; + char *modelfile_rec = argv[2]; + const char *inputfile = argv[3]; + cv::Mat image = cv::imread(inputfile); + if (image.empty()) { + printf("load image failed! image:%s\n", inputfile); + return -1; + } + int frame_count = 1; + if (argc >= 5) { + frame_count = atoi(argv[4]); + } + + // register model + CVI_MODEL_HANDLE model_fd, model_fr; + CVI_NN_RegisterModel(modelfile_det, &model_fd); + CVI_NN_RegisterModel(modelfile_rec, &model_fr); + + CVI_MAPI_MEDIA_SYS_ATTR_T sys_attr = {0}; + sys_attr.vb_pool[0].is_frame = true; + sys_attr.vb_pool[0].vb_blk_size.frame.width = IMAGE_WIDTH; + sys_attr.vb_pool[0].vb_blk_size.frame.height = IMAGE_HEIGHT; + sys_attr.vb_pool[0].vb_blk_size.frame.fmt = INPUT_FMT; + sys_attr.vb_pool[0].vb_blk_num = 6; + sys_attr.vb_pool[1].is_frame = true; + sys_attr.vb_pool[1].vb_blk_size.frame.width = image.cols; + sys_attr.vb_pool[1].vb_blk_size.frame.height = image.rows; + sys_attr.vb_pool[1].vb_blk_size.frame.fmt = PIXEL_FORMAT_BGR_888; + sys_attr.vb_pool[1].vb_blk_num = 4; + sys_attr.vb_pool_num = 2; + SAMPLE_CHECK_RET(CVI_MAPI_Media_Init(&sys_attr)); + + VIDEO_FRAME_INFO_S frame_input; + load_image_to_frame(&frame_input, image); + + // init vproc/vpss + CVI_MAPI_VPROC_HANDLE_T vproc; + CVI_MAPI_VPROC_ATTR_T vproc_attr = CVI_MAPI_VPROC_DefaultAttr_OneChn( + image.cols, image.rows, INPUT_FMT, + DISP_WIDTH, DISP_HEIGHT, DISP_FMT); + SAMPLE_CHECK_RET(CVI_MAPI_VPROC_Init(&vproc, -1, &vproc_attr)); + int vproc_chn_id_disp = 0; + + while (frame_count) { + bool do_dump = (frame_count == 1); + uint64_t t0, t1, t_detect, t_align = 0, t_extract = 0; + + get_boot_time(&t0); + face_rect_t dets[FD_MAX_DET_NUM]; + uint32_t det_num; + SAMPLE_CHECK_RET(face_detect(model_fd, &frame_input, dets, + &det_num, do_dump)); + get_boot_time(&t1); + t_detect = t1 - t0; + + printf("detected %d faces\n", det_num); + for (uint32_t i = 0; i < det_num; i++) { + printf("[%d]: [%.2f, %.2f] -> [%.2f, %.2f], score %.2f\n", + i, dets[i].x1, dets[i].y1, dets[i].x2, dets[i].y2, + dets[i].score); + + if (dets[i].score > FD_THRESHOLD) { + // align frame + + get_boot_time(&t0); + + VIDEO_FRAME_INFO_S frame_aligned; +#ifdef HW_AFFINE +#ifdef CHIP_182x + assert(0 && "cv182x does not support hardware affine\n"); +#endif + SAMPLE_CHECK_RET(face_align_by_hw(&frame_input, &dets[i], &frame_aligned, do_dump)); +#else + SAMPLE_CHECK_RET(face_align_by_sw(&frame_input, &dets[i], &frame_aligned, do_dump)); +#endif + + get_boot_time(&t1); + t_align = t1 - t0; + get_boot_time(&t0); + + auto feature = face_extract(model_fr, &frame_aligned); + + get_boot_time(&t1); + t_extract = t1 - t0; + + printf("face feature extract done\n"); + //std::cout << "feature: " << std::endl << feature << std::endl; + + if (do_dump) { + save_feature_bin(feature, "face_feature.bin"); + save_feature_txt(feature, "face_feature.txt"); + } + + // compare with the last saved feature + static cv::Mat last_feature; + if (!last_feature.empty()) { + float similarity = cal_similarity(feature, last_feature); + printf("Similarity (against last face): %.2f\n", similarity); + } + last_feature = feature; + + SAMPLE_CHECK_RET(CVI_MAPI_ReleaseFrame(&frame_aligned)); + } + } + + /* send frame to vo + 
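(editor's note: this display path is kept only as an illustration -- the vproc handle + created above would feed it, but CVI_MAPI_DISP_SendFrame and the disp handle are not + provided by this sample, so the block stays disabled) + 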
SAMPLE_CHECK_RET(CVI_MAPI_VPROC_SendFrame(vproc, &frame_input)); + + VIDEO_FRAME_INFO_S vproc_frame_disp; + SAMPLE_CHECK_RET(CVI_MAPI_VPROC_GetChnFrame(vproc, 0, &vproc_frame_disp)); + + SAMPLE_CHECK_RET(CVI_MAPI_DISP_SendFrame(disp, &vproc_frame_disp)); + SAMPLE_CHECK_RET(CVI_MAPI_ReleaseFrame(&vproc_frame_disp)); + */ + + + if (frame_count % 30 == 0) { + printf("\nPerf:\n"); + printf(" Detect %2.2f ms\n", (t_detect)/1000.0); + if (det_num) { + printf(" Align %2.2f ms\n", (t_align)/1000.0); + printf(" Extract %2.2f ms\n", (t_extract)/1000.0); + } + } + + frame_count--; + } + + // clean up + SAMPLE_CHECK_RET(CVI_MAPI_ReleaseFrame(&frame_input)); + SAMPLE_CHECK_RET(CVI_MAPI_VPROC_Deinit(vproc)); + SAMPLE_CHECK_RET(CVI_MAPI_IPROC_Deinit()); + SAMPLE_CHECK_RET(CVI_MAPI_Media_Deinit()); + + CVI_NN_CleanupModel(model_fr); + CVI_NN_CleanupModel(model_fd); + + return 0; +} diff --git a/cviruntime/samples/samples_extra/insightface_vpss/type_define.h b/cviruntime/samples/samples_extra/insightface_vpss/type_define.h new file mode 100644 index 000000000..3219653c4 --- /dev/null +++ b/cviruntime/samples/samples_extra/insightface_vpss/type_define.h @@ -0,0 +1,18 @@ +#ifndef _TYPE_DEFINE_H_ +#define _TYPE_DEFINE_H_ + +#define FD_MAX_DET_NUM (10) +#ifndef ALIGN +#define ALIGN(x, y) (((x) + (y) - 1) & ~((y) - 1)) +#endif + +typedef struct face_rect_s { + float x1; + float y1; + float x2; + float y2; + float score; + float landmarks[5][2]; +} face_rect_t; + +#endif \ No newline at end of file diff --git a/cviruntime/samples/samples_extra/matchTemplate/README.md b/cviruntime/samples/samples_extra/matchTemplate/README.md new file mode 100755 index 000000000..eee5445c2 --- /dev/null +++ b/cviruntime/samples/samples_extra/matchTemplate/README.md @@ -0,0 +1,4 @@ +This sample is for tpu-mlir. + +Run sample: +python test.py diff --git a/cviruntime/samples/samples_extra/matchTemplate/convert.sh b/cviruntime/samples/samples_extra/matchTemplate/convert.sh new file mode 100755 index 000000000..0a3ce4a01 --- /dev/null +++ b/cviruntime/samples/samples_extra/matchTemplate/convert.sh @@ -0,0 +1,23 @@ +#!/bin/bash +set -xe + +export SET_CHIP_NAME="cv182x" +mkdir -p tmp + +pushd tmp + +tpuc-opt ../match_template.mlir \ + --shape-infer \ + --canonicalize \ + --extra-optimize \ + -o ccoeff_normed_fp32.mlir + +model_deploy.py \ + --mlir ccoeff_normed_fp32.mlir \ + --quantize BF16 \ + --quant_input \ + --chip cv182x \ + --test_input ../input.npz \ + --compare_all \ + --model ccoeff_normed.cvimodel +popd diff --git a/cviruntime/samples/samples_extra/matchTemplate/gen_mlir.py b/cviruntime/samples/samples_extra/matchTemplate/gen_mlir.py new file mode 100755 index 000000000..3d398cef7 --- /dev/null +++ b/cviruntime/samples/samples_extra/matchTemplate/gen_mlir.py @@ -0,0 +1,62 @@ +import argparse +from utils.misc import * + +def gen_match_template_mlir(input_shape: list, template_shape: list, + mlir_file: str = "match_template.mlir", + mode: str = "TM_CCOEFF_NORMED") -> bool: + try: + assert(len(input_shape) == 2 and len(template_shape) == 2) + ih, iw = input_shape + th, tw = template_shape + oh, ow = ih - th + 1, iw - tw + 1 + assert(oh > 0 and ow > 0) + + with open(mlir_file, "w") as f: + f.write('#loc = loc(unknown)\n' + + 'module attributes {module.chip = "ALL", module.name = "MatchTemplate", ' + + 'module.platform = "ONNX", module.state = "TOP_F32", module.weight_file = ' + + '"match_template_top_f32_all_origin_weight.npz"} {\n') + f.write(' func.func @main(%arg0: tensor<{}x{}xf32> loc(unknown), %arg1: tensor<{}x{}xf32> \ + loc(unknown)) 
-> (tensor<1xf32>, tensor<1xf32>)'.format(ih, iw, th, tw) + '{\n') + f.write(' %0 = "top.None"() : () -> none loc(#loc)\n') + f.write(' %1 = "top.Input"(%arg0) : (tensor<{}x{}xf32>) -> tensor<{}x{}xf32> loc(#loc1)\n'.format(ih, iw, ih, iw)) + f.write(' %2 = "top.Input"(%arg1) : (tensor<{}x{}xf32>) -> tensor<{}x{}xf32> loc(#loc2)\n'.format(th, tw, th, tw)) + f.write(' %3 = "top.MatchTemplate"(%1, %2) {' + 'mode = "{}"'.format(mode) + '} : ' + + '(tensor<{}x{}xf32>, tensor<{}x{}xf32>) -> tensor<{}x{}xf32> loc(#loc3)\n'.format(ih, iw, th, tw, oh, ow)) + f.write(' %4 = "top.Reshape"(%3) : (tensor<{}x{}xf32>) -> tensor<{}xf32> loc(#loc4)\n'.format(oh, ow, oh * ow)) + f.write(' %5:2 = "top.Arg"(%4) {axis = 0 : i64, keepdims = true, mode = "ArgMax", select_last_index = true} :' + + '(tensor<{}xf32>) -> (tensor<1xf32>, tensor<1xf32>) loc(#loc7)\n'.format(oh * ow)) + f.write(' return %5#0, %5#1 : tensor<1xf32>, tensor<1xf32> loc(#loc)\n') + f.write(' } loc(#loc)\n' + + '} loc(#loc)\n' + + '#loc1 = loc("input")\n' + + '#loc2 = loc("template")\n' + + '#loc3 = loc("match")\n' + + '#loc4 = loc("6_Reshape")\n' + + '#loc5 = loc("output_ArgMax")\n' + + '#loc6 = loc("output_values")\n' + + '#loc7 = loc(fused[#loc5, #loc6])') + return True + except: + return False + +if __name__ == '__main__': + # python gen_mlir.py --input_shape [109,77] --template_shape [80,49] + parser = argparse.ArgumentParser() + parser.add_argument("--input_shape", type=str2shape, required=True, + help="list of input shape, like:[109, 77]") + parser.add_argument("--template_shape", type=str2shape, required=True, + help="list of template shape, like:[80, 49]") + parser.add_argument("--mode", default="TM_CCOEFF_NORMED", type=str.upper, + choices=['TM_CCOEFF_NORMED', 'TM_SQDIFF'], + help="MatchTemplate mode") + parser.add_argument("--mlir", default="match_template.mlir", + help="output mlir model file") + args = parser.parse_args() + + status = gen_match_template_mlir(args.input_shape[0], args.template_shape[0], args.mlir, args.mode) + + if status: + print("======== success gen mlir file ========") + else: + print("======== failed gen mlir file ========") diff --git a/cviruntime/samples/samples_extra/matchTemplate/input.png b/cviruntime/samples/samples_extra/matchTemplate/input.png new file mode 100755 index 000000000..12d8e8b41 Binary files /dev/null and b/cviruntime/samples/samples_extra/matchTemplate/input.png differ diff --git a/cviruntime/samples/samples_extra/matchTemplate/template.png b/cviruntime/samples/samples_extra/matchTemplate/template.png new file mode 100755 index 000000000..919903983 Binary files /dev/null and b/cviruntime/samples/samples_extra/matchTemplate/template.png differ diff --git a/cviruntime/samples/samples_extra/matchTemplate/test.py b/cviruntime/samples/samples_extra/matchTemplate/test.py new file mode 100755 index 000000000..e3a41c16c --- /dev/null +++ b/cviruntime/samples/samples_extra/matchTemplate/test.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +import os, cv2 +import numpy as np +import pyruntime_cvi as pyruntime +from gen_mlir import gen_match_template_mlir + +# read input +image = cv2.imread("input.png", cv2.IMREAD_GRAYSCALE) +template = cv2.imread("template.png", cv2.IMREAD_GRAYSCALE) +image = cv2.resize(image, (77, 109)) +template = cv2.resize(template, (49, 80)) +data = {} +data["input"] = image.astype(np.float32) +data["template"] = template.astype(np.float32) +np.savez("input.npz", **data) + +ih, iw = image.shape +th, tw = template.shape +oh, ow = (ih - th + 1), (iw - tw + 1) + +# ======= gen mlir 
file ======= +status = gen_match_template_mlir(image.shape, template.shape, 'match_template.mlir', + 'TM_CCOEFF_NORMED') +if not status: + raise Exception("generate match template mlir file failed.") + +# ======= gen cvimodel ======= +os.system('./convert.sh') + +# ======= by cvimodel ======= +model = pyruntime.Model("tmp/ccoeff_normed.cvimodel", 0, False) +if model is None: + raise Exception("cannot load cvimodel") + +# fill data to inputs +data0 = model.inputs[0].data +data1 = model.inputs[1].data +data0[:] = image.reshape(data0.shape) +data1[:] = template.reshape(data1.shape) +# forward +model.forward() +print(len(model.outputs)) +for o in model.outputs: + if o.name == "output_ArgMax": + max_loc = o.data + elif o.name == "output_values": + max_value = o.data + else: + assert (0) +model_loc = (int(max_loc % ow), int(max_loc // ow)) +print("model location(x,y):{} {}".format(model_loc, max_loc)) + +# ======== by opencv ============== +res = cv2.matchTemplate(image, template, cv2.TM_CCOEFF_NORMED) +_, ref_max_value, _, ref_max_loc = cv2.minMaxLoc(res) +print("opencv location(x,y):{}".format(ref_max_loc)) + +if model_loc == ref_max_loc: + print("match success \n" + "model_conf: {} ---> ref-opencv_conf: {}"\ + .format(max_value, ref_max_value)) +else: + print("match failed \n" + "model_conf: {} ---> ref-opencv_conf: {}".format(max_value, ref_max_value)) diff --git a/cviruntime/samples/samples_extra/run_alphapose_fused_preprocess.sh b/cviruntime/samples/samples_extra/run_alphapose_fused_preprocess.sh new file mode 100755 index 000000000..be1e5dee1 --- /dev/null +++ b/cviruntime/samples/samples_extra/run_alphapose_fused_preprocess.sh @@ -0,0 +1,27 @@ +#!/bin/sh + +if [ -z $MODEL_PATH ]; then + echo "$0 MODEL_PATH not set" + exit 1 +fi +if [ ! -e ./bin/cvi_sample_alphapose_fused_preprocess ]; then + echo "$0 Please run at the same dir as the script" + exit 1 +fi +if [ ! -e $MODEL_PATH/yolo_v3_416_fused_preprocess_with_detection.cvimodel ]; then + echo "$0 Model yolo_v3_416_fused_preprocess_with_detection.cvimodel not present" + exit 1 +fi +if [ ! -e $MODEL_PATH/alphapose_fused_preprocess.cvimodel ]; then + echo "$0 Model alphapose_fused_preprocess.cvimodel not present" + exit 1 +fi + +./bin/cvi_sample_alphapose_fused_preprocess \ + $MODEL_PATH/yolo_v3_416_fused_preprocess_with_detection.cvimodel \ + $MODEL_PATH/alphapose_fused_preprocess.cvimodel \ + ./data/pose_demo_2.jpg \ + alphapose_out.jpg $1 $2 + +test $? -ne 0 && echo "cvi_sample_alphapose_fused_preprocess failed !!" && exit 1 +exit 0 \ No newline at end of file diff --git a/cviruntime/samples/samples_extra/run_classifier_vpss_yuv.sh b/cviruntime/samples/samples_extra/run_classifier_vpss_yuv.sh new file mode 100755 index 000000000..1797c65d4 --- /dev/null +++ b/cviruntime/samples/samples_extra/run_classifier_vpss_yuv.sh @@ -0,0 +1,23 @@ +#!/bin/sh + +if [ -z $MODEL_PATH ]; then + echo "$0 MODEL_PATH not set" + exit 1 +fi +if [ ! -e ./bin/cvi_sample_classifier_vpss_yuv ]; then + echo "$0 Please run at the same dir as the script" + exit 1 +fi +if [ ! -e $MODEL_PATH/mobilenet_v2_int8_yuv420.cvimodel ]; then + echo "$0 Model mobilenet_v2_int8_yuv420.cvimodel not present" + exit 1 +fi + +./bin/cvi_sample_classifier_vpss_yuv \ + $MODEL_PATH/mobilenet_v2_int8_yuv420.cvimodel \ + ./data/cat.jpg \ + ./data/synset_words.txt \ + YUV420_PLANAR + +test $? -ne 0 && echo "cvi_sample_classifier_vpss_yuv failed !!" 
&& exit 1 +exit 0 \ No newline at end of file diff --git a/cviruntime/samples/samples_extra/run_detector_ppyoloem_fused_preprocess.sh b/cviruntime/samples/samples_extra/run_detector_ppyoloem_fused_preprocess.sh new file mode 100644 index 000000000..d4e03a490 --- /dev/null +++ b/cviruntime/samples/samples_extra/run_detector_ppyoloem_fused_preprocess.sh @@ -0,0 +1,22 @@ +#!/bin/sh + +if [ -z $MODEL_PATH ]; then + echo "$0 MODEL_PATH not set" + exit 1 +fi +if [ ! -e ./bin/cvi_sample_detector_ppyoloem_fused_preprocess ]; then + echo "$0 Please run at the same dir as the script" + exit 1 +fi +if [ ! -e $MODEL_PATH/ppyoloe_m_int8.cvimodel ]; then + echo "$0 Model ppyoloe_m_int8.cvimodel not present" + exit 1 +fi + +./bin/cvi_sample_detector_ppyoloem_fused_preprocess \ + $MODEL_PATH/ppyoloe_m_int8.cvimodel \ + ./data/dog.jpg \ + ppyoloem_out.jpg + +test $? -ne 0 && echo "cvi_sample_detector_ppyoloem_fused_preprocess failed !!" && exit 1 +exit 0 \ No newline at end of file diff --git a/cviruntime/samples/samples_extra/run_detector_yolov3_fused_preprocess.sh b/cviruntime/samples/samples_extra/run_detector_yolov3_fused_preprocess.sh new file mode 100755 index 000000000..0fbb5889a --- /dev/null +++ b/cviruntime/samples/samples_extra/run_detector_yolov3_fused_preprocess.sh @@ -0,0 +1,22 @@ +#!/bin/sh + +if [ -z $MODEL_PATH ]; then + echo "$0 MODEL_PATH not set" + exit 1 +fi +if [ ! -e ./bin/cvi_sample_detector_yolo_v3_fused_preprocess ]; then + echo "$0 Please run at the same dir as the script" + exit 1 +fi +if [ ! -e $MODEL_PATH/yolo_v3_416_fused_preprocess_with_detection.cvimodel ]; then + echo "$0 Model yolo_v3_416_fused_preprocess_with_detection.cvimodel not present" + exit 1 +fi + +./bin/cvi_sample_detector_yolo_v3_fused_preprocess \ + $MODEL_PATH/yolo_v3_416_fused_preprocess_with_detection.cvimodel \ + ./data/dog.jpg \ + yolo_v3_out.jpg + +test $? -ne 0 && echo "cvi_sample_detector_yolo_v3_fused_preprocess failed !!" && exit 1 +exit 0 \ No newline at end of file diff --git a/cviruntime/samples/samples_extra/run_detector_yolov5-face_fused_preprocess.sh b/cviruntime/samples/samples_extra/run_detector_yolov5-face_fused_preprocess.sh new file mode 100644 index 000000000..303348d32 --- /dev/null +++ b/cviruntime/samples/samples_extra/run_detector_yolov5-face_fused_preprocess.sh @@ -0,0 +1,22 @@ +#!/bin/sh + +if [ -z $MODEL_PATH ]; then + echo "$0 MODEL_PATH not set" + exit 1 +fi +if [ ! -e ./bin/cvi_sample_detector_yolov5-face_fused_preprocess ]; then + echo "$0 Please run at the same dir as the script" + exit 1 +fi +if [ ! -e $MODEL_PATH/yolov5-face_fused_preprocess.cvimodel ]; then + echo "$0 Model yolov5-face_fused_preprocess.cvimodel not present" + exit 1 +fi + +./bin/cvi_sample_detector_yolov5-face_fused_preprocess \ + $MODEL_PATH/yolov5-face_fused_preprocess.cvimodel \ + ./data/dog.jpg \ + yolov5-face_out.jpg + +test $? -ne 0 && echo "cvi_sample_detector_yolov5-face_fused_preprocess failed !!" && exit 1 +exit 0 \ No newline at end of file diff --git a/cviruntime/samples/samples_extra/run_detector_yolov5_fused_preprocess.sh b/cviruntime/samples/samples_extra/run_detector_yolov5_fused_preprocess.sh new file mode 100755 index 000000000..703f91809 --- /dev/null +++ b/cviruntime/samples/samples_extra/run_detector_yolov5_fused_preprocess.sh @@ -0,0 +1,22 @@ +#!/bin/sh + +if [ -z $MODEL_PATH ]; then + echo "$0 MODEL_PATH not set" + exit 1 +fi +if [ ! -e ./bin/cvi_sample_detector_yolo_v5_fused_preprocess ]; then + echo "$0 Please run at the same dir as the script" + exit 1 +fi +if [ ! 
-e $MODEL_PATH/yolov5s_fused_preprocess.cvimodel ]; then + echo "$0 Model yolov5s_fused_preprocess.cvimodel not present" + exit 1 +fi + +./bin/cvi_sample_detector_yolo_v5_fused_preprocess \ + $MODEL_PATH/yolov5s_fused_preprocess.cvimodel \ + ./data/dog.jpg \ + yolo_v5_out.jpg + +test $? -ne 0 && echo "cvi_sample_detector_yolo_v5_fused_preprocess failed !!" && exit 1 +exit 0 \ No newline at end of file diff --git a/cviruntime/samples/samples_extra/run_detector_yolox_s.sh b/cviruntime/samples/samples_extra/run_detector_yolox_s.sh new file mode 100755 index 000000000..94257a68d --- /dev/null +++ b/cviruntime/samples/samples_extra/run_detector_yolox_s.sh @@ -0,0 +1,22 @@ +#!/bin/sh + +if [ -z $MODEL_PATH ]; then + echo "$0 MODEL_PATH not set" + exit 1 +fi +if [ ! -e ./bin/cvi_sample_detector_yolox_s ]; then + echo "$0 Please run at the same dir as the script" + exit 1 +fi +if [ ! -e $MODEL_PATH/yolox_s.cvimodel ]; then + echo "$0 Model yolox_s.cvimodel not present" + exit 1 +fi + +./bin/cvi_sample_detector_yolox_s \ + $MODEL_PATH/yolox_s.cvimodel \ + ./data/dog.jpg \ + yolox_s_out.jpg + +test $? -ne 0 && echo "cvi_sample_detector_yolox failed !!" && exit 1 +exit 0 \ No newline at end of file diff --git a/cviruntime/samples/samples_extra/run_insightface_fused_preprocess.sh b/cviruntime/samples/samples_extra/run_insightface_fused_preprocess.sh new file mode 100755 index 000000000..a1a038588 --- /dev/null +++ b/cviruntime/samples/samples_extra/run_insightface_fused_preprocess.sh @@ -0,0 +1,57 @@ +#!/bin/sh + +if [ -z $MODEL_PATH ]; then + echo "$0 MODEL_PATH not set" + exit 1 +fi +if [ ! -e ./bin/cvi_sample_fd_fr_fused_preprocess ]; then + echo "$0 Please run at the same dir as the script" + exit 1 +fi +if [ ! -e $MODEL_PATH/retinaface_mnet25_600_fused_preprocess_with_detection.cvimodel ]; then + echo "$0 Model retinaface_mnet25_600_fused_preprocess_with_detection.cvimodel not present" + exit 1 +fi +if [ ! -e $MODEL_PATH/arcface_res50_fused_preprocess.cvimodel ]; then + echo "$0 Model arcface_res50_fused_preprocess.cvimodel not present" + exit 1 +fi + +# mnet25 +./bin/cvi_sample_fd_fr_fused_preprocess \ + $MODEL_PATH/retinaface_mnet25_600_fused_preprocess_with_detection.cvimodel \ + $MODEL_PATH/arcface_res50_fused_preprocess.cvimodel \ + ./data/obama1.jpg \ + ./data/obama2.jpg +test $? -ne 0 && echo "cvi_sample_fd_fr_fused_preprocess failed !!" && exit 1 +./bin/cvi_sample_fd_fr_fused_preprocess \ + $MODEL_PATH/retinaface_mnet25_600_fused_preprocess_with_detection.cvimodel \ + $MODEL_PATH/arcface_res50_fused_preprocess.cvimodel \ + ./data/obama1.jpg \ + ./data/obama3.jpg +test $? -ne 0 && echo "cvi_sample_fd_fr_fused_preprocess failed !!" && exit 1 +./bin/cvi_sample_fd_fr_fused_preprocess \ + $MODEL_PATH/retinaface_mnet25_600_fused_preprocess_with_detection.cvimodel \ + $MODEL_PATH/arcface_res50_fused_preprocess.cvimodel \ + ./data/obama2.jpg \ + ./data/obama3.jpg +test $? -ne 0 && echo "cvi_sample_fd_fr_fused_preprocess failed !!" && exit 1 +./bin/cvi_sample_fd_fr_fused_preprocess \ + $MODEL_PATH/retinaface_mnet25_600_fused_preprocess_with_detection.cvimodel \ + $MODEL_PATH/arcface_res50_fused_preprocess.cvimodel \ + ./data/obama1.jpg \ + ./data/trump1.jpg +test $? -ne 0 && echo "cvi_sample_fd_fr_fused_preprocess failed !!" && exit 1 +./bin/cvi_sample_fd_fr_fused_preprocess \ + $MODEL_PATH/retinaface_mnet25_600_fused_preprocess_with_detection.cvimodel \ + $MODEL_PATH/arcface_res50_fused_preprocess.cvimodel \ + ./data/obama1.jpg \ + ./data/trump2.jpg +test $? 
-ne 0 && echo "cvi_sample_fd_fr_fused_preprocess failed !!" && exit 1 +./bin/cvi_sample_fd_fr_fused_preprocess \ + $MODEL_PATH/retinaface_mnet25_600_fused_preprocess_with_detection.cvimodel \ + $MODEL_PATH/arcface_res50_fused_preprocess.cvimodel \ + ./data/obama1.jpg \ + ./data/trump3.jpg +test $? -ne 0 && echo "cvi_sample_fd_fr_fused_preprocess failed !!" && exit 1 +exit 0 \ No newline at end of file diff --git a/cviruntime/samples/utils/CMakeLists.txt b/cviruntime/samples/utils/CMakeLists.txt new file mode 100644 index 000000000..37bd2a4e3 --- /dev/null +++ b/cviruntime/samples/utils/CMakeLists.txt @@ -0,0 +1,24 @@ +cmake_minimum_required(VERSION 2.8.0) + +project(cvi_sample_model_info C CXX) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS ON) + +if(NOT DEFINED TPU_SDK_PATH) + message(FATAL_ERROR "Please set TPU_SDK_PATH to point to the TPU_SDK installation") +endif() +include_directories(${TPU_SDK_PATH}/include) +link_directories(${TPU_SDK_PATH}/lib) + +set(CVI_LIBS ${CVI_LIBS} cviruntime cvikernel) +set(EXTRA_LIBS ${EXTRA_LIBS} dl) + +add_executable(cvi_sample_model_info + model_info.c) +target_link_libraries(cvi_sample_model_info + ${CVI_LIBS} + ${EXTRA_LIBS}) +install(TARGETS cvi_sample_model_info DESTINATION bin) diff --git a/cviruntime/samples/utils/model_info.c b/cviruntime/samples/utils/model_info.c new file mode 100644 index 000000000..7e94c1784 --- /dev/null +++ b/cviruntime/samples/utils/model_info.c @@ -0,0 +1,77 @@ +#include +#include +#include +#include +#include +#include +#include +#include "cviruntime.h" + +static void dump_tensors(CVI_TENSOR *tensors, int32_t num) { + for (int32_t i = 0; i < num; i++) { + printf(" [%d] %s, shape (%d,%d,%d,%d), count %zu, fmt %d\n", + i, + tensors[i].name, + tensors[i].shape.dim[0], + tensors[i].shape.dim[1], + tensors[i].shape.dim[2], + tensors[i].shape.dim[3], + tensors[i].count, + tensors[i].fmt); + } +} + +static void usage(char **argv) { + printf("Usage:\n"); + printf(" %s cvimodel [1|0]\n", argv[0]); +} + +int main(int argc, char **argv) { + int ret = 0; + CVI_MODEL_HANDLE model; + + if (argc != 2 && argc != 3) { + usage(argv); + exit(-1); + } + bool output_all_tensors = false; + if (argc >= 3) { + output_all_tensors = atoi(argv[2]) > 0; + } + + // normal mode + ret = CVI_NN_RegisterModel(argv[1], &model); + if (ret != CVI_RC_SUCCESS) { + printf("CVI_NN_RegisterModel failed, err %d\n", ret); + return -1; + } + printf("CVI_NN_RegisterModel succeeded\n"); + + // retrieve input / output tensor struct + CVI_TENSOR *input_tensors, *output_tensors; + int32_t input_num, output_num; + ret = CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num, + &output_tensors, &output_num); + if (ret != CVI_RC_SUCCESS) { + CVI_NN_CleanupModel(model); + return -1; + } + + // dump param + printf("Model Param for %s:\n", argv[1]); + printf(" Input Tensor Number : %d\n", input_num); + dump_tensors(input_tensors, input_num); + printf(" Output Tensor Number : %d\n", output_num); + dump_tensors(output_tensors, output_num); + printf(" Output All Tensors For Debug: [%s]\n", output_all_tensors ? 
"YES" : "NO"); + + // clean up + ret = CVI_NN_CleanupModel(model); + if (ret != CVI_RC_SUCCESS) { + printf("CVI_NN_CleanupModel failed, err %d\n", ret); + return -1; + } + printf("CVI_NN_CleanupModel succeeded\n"); + + return 0; +} diff --git a/cviruntime/samples_inner/light/CMakeLists.txt b/cviruntime/samples_inner/light/CMakeLists.txt new file mode 100644 index 000000000..7a86d73b0 --- /dev/null +++ b/cviruntime/samples_inner/light/CMakeLists.txt @@ -0,0 +1,30 @@ +cmake_minimum_required(VERSION 2.8.0) + +project(t2s C CXX) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS ON) + +if(NOT DEFINED TPU_SDK_PATH) + message(FATAL_ERROR "Please set TPU_SDK_PATH to point to the TPU_SDK installation") +endif() +include_directories(${TPU_SDK_PATH}/include) +link_directories(${TPU_SDK_PATH}/lib) + +if(NOT DEFINED OPENCV_PATH) + message(FATAL_ERROR "Please set OPENCV_PATH to point to the opencvn installation") +endif() +include_directories(${OPENCV_PATH}/include) +link_directories(${OPENCV_PATH}/lib) + +set(CVI_LIBS ${CVI_LIBS} cviruntime cvikernel cnpy) +set(OPENCV_LIBS ${OPENCV_LIBS} opencv_core opencv_imgcodecs opencv_imgproc) +set(EXTRA_LIBS ${EXTRA_LIBS} dl stdc++ pthread z) + +add_executable(light light.cpp) +target_link_libraries(light + ${CVI_LIBS} + ${OPENCV_LIBS} + ${EXTRA_LIBS}) +install(TARGETS light light DESTINATION bin) diff --git a/cviruntime/samples_inner/light/README b/cviruntime/samples_inner/light/README new file mode 100644 index 000000000..b35c5b3c6 --- /dev/null +++ b/cviruntime/samples_inner/light/README @@ -0,0 +1,3 @@ +1. ./build.sh +2. call all data and bin to soc +3. ./light light_x.npz diff --git a/cviruntime/samples_inner/light/build.sh b/cviruntime/samples_inner/light/build.sh new file mode 100755 index 000000000..af64f23cc --- /dev/null +++ b/cviruntime/samples_inner/light/build.sh @@ -0,0 +1,22 @@ +#!/bin/bash +set -xe + +TPU_SDK_PATH=/work/sdk/install/soc_cv1826_wevb_0005a_spinand/tpu_32/cvitek_tpu_sdk +echo "TPU_SDK_PATH=$TPU_SDK_PATH" + +if [ -z $TPU_SDK_PATH ]; then + echo "please set TPU_SDK_PATH" +fi + +mkdir -p build +cd build +cmake .. 
+  -DCMAKE_BUILD_TYPE=RELEASE \
+  -DCMAKE_C_FLAGS_RELEASE=-O3 \
+  -DCMAKE_CXX_FLAGS_RELEASE=-O3 \
+  -DCMAKE_TOOLCHAIN_FILE=$TPU_SDK_PATH/cmake/toolchain-linux-gnueabihf.cmake \
+  -DTPU_SDK_PATH=$TPU_SDK_PATH \
+  -DOPENCV_PATH=$TPU_SDK_PATH/opencv \
+  -DCMAKE_INSTALL_PREFIX=./
+make install
+
diff --git a/cviruntime/samples_inner/light/light.cpp b/cviruntime/samples_inner/light/light.cpp
new file mode 100644
index 000000000..9381ab17d
--- /dev/null
+++ b/cviruntime/samples_inner/light/light.cpp
@@ -0,0 +1,177 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <assert.h>
+#include <time.h> // include names reconstructed from usage
+#include <chrono>
+#include <iostream>
+#include "cviruntime.h"
+#include "cviruntime_extra.h"
+#include "cnpy.h"
+
+using system_clock = std::chrono::system_clock;
+using duration = std::chrono::duration<double, std::milli>;
+
+static int index_get(int h, int w1, int w2) {
+  return h * w1 + w2;
+}
+
+static void fill_pad_fmap_int8(
+    const int8_t *before, int8_t **pafter, int val,
+    int pad_l, int pad_r, int pad_t, int pad_b,
+    int ins_h, int ins_w, int ins_h_last, int ins_w_last,
+    int h_before, int w_before)
+{
+  int w_after = (w_before - 1) * (ins_w + 1) + ins_w_last + 1 + pad_l + pad_r;
+  int h_after = (h_before - 1) * (ins_h + 1) + ins_h_last + 1 + pad_t + pad_b;
+  int8_t *after = *pafter;
+
+  if (!after) {
+    after = (int8_t *)malloc(sizeof(int8_t) * w_after * h_after);
+    assert(after);
+  }
+
+  memset(after, val, w_after * h_after);
+  for (int h = 0; h < h_before; h++) {
+    for (int w = 0; w < w_before; w++) {
+      int i = (h * (ins_h + 1) + pad_t) * w_after + w * (ins_w + 1) + pad_l;
+      after[i] = before[h * w_before + w];
+    }
+  }
+  *pafter = after;
+}
+
+static void max_pooling(
+    const int8_t* i_fmap,
+    int8_t* o_fmap,
+    int input_n, int input_c, int input_h, int input_w,
+    int output_h, int output_w, int kh, int kw,
+    int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r,
+    int stride_h, int stride_w,
+    int input_sign)
+{
+  const int max_init = input_sign ? -128 : 0;
+  int8_t *i_fmap_pad = NULL;
+  for (int nc = 0; nc < input_n * input_c; nc++) {
+    fill_pad_fmap_int8(i_fmap, &i_fmap_pad, max_init,
+        pad_w_l, pad_w_r, pad_h_t, pad_h_b,
+        0, 0, 0, 0, input_h, input_w);
+
+    for (int ph = 0; ph < output_h; ++ph) {
+      for (int pw = 0; pw < output_w; ++pw) {
+        int hstart = ph * stride_h;
+        int wstart = pw * stride_w;
+        int pool_index = index_get(ph, output_w, pw);
+        int max = max_init;
+        for (int h = 0; h < kh; h++) {
+          for (int w = 0; w < kw; w++) {
+            int index = index_get((hstart + h), (input_w + pad_w_l + pad_w_r),
+                                  (w + wstart));
+            int val = input_sign ? i_fmap_pad[index] : (uint8_t)i_fmap_pad[index];
+            max = (val > max) ? val : max;
+          }
+        }
+        o_fmap[pool_index] = max;
+      }
+    }
+    i_fmap += input_w * input_h;
+    o_fmap += output_w * output_h;
+  }
+  free(i_fmap_pad);
+}
+
+class Light {
+public:
+  Light(int32_t ih, int32_t iw, int32_t kernel_size)
+      : input_h(ih), input_w(iw) {
+    CVI_RT_Init(&ctx);
+    mem_x = CVI_RT_MemAlloc(ctx, ih * iw);
+    mem_y = CVI_RT_MemAlloc(ctx, ih * iw);
+    kfn = CVI_NN_PrepareGrayImageLightKernelFunc(ctx, ih, iw, kernel_size);
+  }
+
+  ~Light() {
+    CVI_RT_MemFree(ctx, mem_x);
+    CVI_RT_MemFree(ctx, mem_y);
+    CVI_NN_DestroyKernelFunc(kfn);
+    CVI_RT_DeInit(ctx);
+  }
+
+  uint8_t* run(uint8_t *input) {
+    auto vptr_x = (uint8_t *)CVI_RT_MemGetVAddr(mem_x);
+    // copy data and flush cache
+    memcpy(vptr_x, input, input_h * input_w);
+    CVI_RT_MemFlush(ctx, mem_x);
+
+    CVI_NN_RunKernelFunc(kfn, 2,
+                         CVI_RT_MemGetPAddr(mem_x),
+                         CVI_RT_MemGetPAddr(mem_y));
+    // invalidate cpu cache
+    CVI_RT_MemInvld(ctx, mem_y);
+    // get result pointer
+    return (uint8_t *)CVI_RT_MemGetVAddr(mem_y);
+  }
+
+private:
+  CVI_RT_HANDLE ctx;
+  CVI_KFUNC_HANDLE kfn;
+  CVI_RT_MEM mem_x;
+  CVI_RT_MEM mem_y;
+  int32_t input_h;
+  int32_t input_w;
+};
+
+int main(int argc, char **argv) {
+  if (argc < 4) {
+    printf("Usage: %s h w k\n", argv[0]);
+    return 1;
+  }
+  srand(100);
+
+  int ih = atoi(argv[1]);
+  int iw = atoi(argv[2]);
+  int k = atoi(argv[3]);
+  int pad = (k - 1) / 2;
+  int s = 1;
+  int oh = (ih + 2*pad - k) / s + 1;
+  int ow = (iw + 2*pad - k) / s + 1;
+  assert(ih == oh);
+  assert(iw == ow);
+
+  uint8_t *x = (uint8_t *)malloc(ih * iw);
+  for (int i = 0; i < (int)(ih * iw); i++) {
+    x[i] = rand() % 256;
+  }
+  uint8_t *bkg = (uint8_t *)malloc(ih * iw);
+  uint8_t *ref = (uint8_t *)malloc(ih * iw);
+
+  max_pooling((int8_t *)x, (int8_t *)bkg, 1, 1, ih, iw, oh, ow,
+              k, k, pad, pad, pad, pad, s, s, 0);
+  for (int i = 0; i < oh * ow; i++) {
+    int8_t mask = (x[i] >= bkg[i]) ? 0 : 1;
+    ref[i] = mask * (x[i] - bkg[i]) + 255;
+  }
+
+  Light light(ih, iw, k);
+
+  auto start = system_clock::now();
+
+  auto y = light.run(x);
+
+  auto end = system_clock::now();
+  duration d = end - start;
+  std::cout << "run duration: " << d.count() << "(ms)\n";
+
+  // get result and compare with reference
+  for (int i = 0; i < ih * iw; i++) {
+    if (y[i] != ref[i]) {
+      std::cout << "compare failed [" << i << "] " << (int)y[i]
+                << " vs " << (int)ref[i] << "\n";
+      assert(0);
+    }
+  }
+
+  printf("test passed!\n");
+  return 0;
+}
\ No newline at end of file
diff --git a/cviruntime/samples_inner/light/light.py b/cviruntime/samples_inner/light/light.py
new file mode 100644
index 000000000..7507168d4
--- /dev/null
+++ b/cviruntime/samples_inner/light/light.py
@@ -0,0 +1,25 @@
+import torch
+import torch.nn as nn
+import numpy as np
+
+class Light(nn.Module):
+    def __init__(self, kernel_size=15):
+        super().__init__()
+        self.pool = nn.MaxPool2d(kernel_size=kernel_size, stride=1, padding=(kernel_size - 1) // 2)
+
+    def forward(self, x):
+        bkg = self.pool(x)
+        x = torch.where(bkg > x, 255 - (bkg - x), torch.tensor(255.))
+        return x
+
+
+if __name__ == '__main__':
+    net = Light()
+    shape = (1, 1, 40, 70)
+    x = torch.randint(255, shape)
+    x = x.to(torch.float32)
+    np.savez("light_x.npz", x=x.numpy().astype(np.uint8))
+    print("x", x.shape, x)
+    out = net(x)
+    np.savez("light_out.npz", out=out.numpy().astype(np.uint8))
+    print("out", out.shape, out)
\ No newline at end of file
diff --git a/cviruntime/samples_inner/megvii_euc_op/CMakeLists.txt b/cviruntime/samples_inner/megvii_euc_op/CMakeLists.txt
new file mode 100644
index 000000000..44c3f98e7
--- /dev/null
+++ b/cviruntime/samples_inner/megvii_euc_op/CMakeLists.txt
@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 2.8.0)
+
+project(megvii_euc C CXX)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS ON)
+
+if(NOT DEFINED TPU_SDK_PATH)
+  message(FATAL_ERROR "Please set TPU_SDK_PATH to point to the TPU_SDK installation")
+endif()
+include_directories(${TPU_SDK_PATH}/include)
+link_directories(${TPU_SDK_PATH}/lib)
+
+set(CVI_LIBS ${CVI_LIBS} cviruntime cvikernel)
+if(NOT CMAKE_CROSSCOMPILING)
+  set(CVI_LIBS ${CVI_LIBS} cvicmodel)
+endif()
+
+set(EXTRA_LIBS ${EXTRA_LIBS} dl stdc++ pthread z)
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+
+add_executable(megvii_euc
+  main.cpp)
+
+target_link_libraries(megvii_euc
+  ${CVI_LIBS}
+  ${EXTRA_LIBS})
diff --git a/cviruntime/samples_inner/megvii_euc_op/README.md b/cviruntime/samples_inner/megvii_euc_op/README.md
new file mode 100644
index 000000000..9205f2e7c
--- /dev/null
+++ b/cviruntime/samples_inner/megvii_euc_op/README.md
@@ -0,0 +1,12 @@
+cmdbuf param:
+input1 : [1, k][uint8]
+input2 : [n, k][uint8]
+output : [n] [float32]
+
+```
+export MLIR_PATH=xxx
+./build.sh
+
+export SET_CHIP_NAME=cv183x
+./build/megvii_euc
+```
\ No newline at end of file
diff --git a/cviruntime/samples_inner/megvii_euc_op/build.sh b/cviruntime/samples_inner/megvii_euc_op/build.sh
new file mode 100755
index 000000000..8e3e35dba
--- /dev/null
+++ b/cviruntime/samples_inner/megvii_euc_op/build.sh
@@ -0,0 +1,11 @@
+mkdir -p build
+cd build
+cmake -G Ninja \
+  -DCMAKE_BUILD_TYPE=RELEASE \
+  -DCMAKE_C_FLAGS_RELEASE=-g3 -DCMAKE_CXX_FLAGS_RELEASE=-g3 \
+  -DTPU_SDK_PATH=$MLIR_PATH/tpuc \
+  -DCMAKE_INSTALL_PREFIX=../install_samples \
+  ..
+cmake --build .
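The `euc_backend.hpp` hunk that follows assembles the cmdbuf described in the README above: it broadcasts the 1×k query across the lanes, subtracts it from every record with `tiu_sub`, then runs a depthwise convolution whose weight is the ifmap itself, so each channel accumulates a sum of squares in ps32 partial sums. For orientation, a host-side sketch of the arithmetic being offloaded — `euclidean_ref` is a hypothetical helper, not part of the SDK, and assumes only what `check_output()` in `main.cpp` also assumes:

```
#include <cstdint>
#include <vector>

// dist[i] = sum_j (x[j] - y[i*k + j])^2 -- uint8 inputs, float32 outputs,
// the same per-record values the generated cmdbuf writes back.
static std::vector<float> euclidean_ref(const uint8_t *x, const uint8_t *y,
                                        int n, int k) {
  std::vector<float> dist(n, 0.0f);
  for (int i = 0; i < n; ++i) {
    float sum = 0.0f;
    for (int j = 0; j < k; ++j) {
      float d = float(x[j]) - float(y[i * k + j]);
      sum += d * d; // subtract, then ifmap==weight depthwise conv == sum of squares
    }
    dist[i] = sum;
  }
  return dist;
}
```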
+
+
diff --git a/cviruntime/samples_inner/megvii_euc_op/euc_backend.hpp b/cviruntime/samples_inner/megvii_euc_op/euc_backend.hpp
new file mode 100644
index 000000000..0a8520f58
--- /dev/null
+++ b/cviruntime/samples_inner/megvii_euc_op/euc_backend.hpp
@@ -0,0 +1,221 @@
+#pragma once
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <vector>
+#include <algorithm>
+
+#include <cvikernel/cvikernel.h> // kernel API header; exact path assumed
+
+static void load_and_convert_to_bf16(cvk_context_t *cvk_ctx, cvk_tl_t *tl_mem,
+                                     cvk_tl_shape_t &shape,
+                                     cvk_tg_stride_t &stride, int x_base_ga_idx,
+                                     uint64_t x_ga) {
+  assert(tl_mem);
+  cvk_tdma_g2l_tensor_copy_param_t p1 = {0};
+  cvk_tg_t tg_x;
+  tg_x.start_address = x_ga;
+  tg_x.base_reg_index = x_base_ga_idx;
+  tg_x.int8_rnd_mode = 0;
+  tg_x.fmt = CVK_FMT_U8;
+  tg_x.shape = {shape.n, shape.c, shape.h, shape.w};
+  tg_x.stride = stride;
+  p1.src = &tg_x;
+  p1.dst = tl_mem;
+  cvk_ctx->ops->tdma_g2l_bf16_tensor_copy(cvk_ctx, &p1);
+
+  return;
+}
+
+static void convert_ps32_to_fp32(cvk_context_t *cvk_ctx, cvk_tl_t *output) {
+  assert(output->shape.n == 2); // Exclude lower part
+  assert((output->shape.h == 1) && (output->shape.w == 1) && "Only support h=1, w=1");
+
+  uint32_t la_high = output->start_address;
+  cvk_tl_t tl_src;
+  tl_src.start_address = la_high;
+  tl_src.fmt = CVK_FMT_BF16;
+  tl_src.shape = output->shape;
+  tl_src.shape.n = 1;
+  tl_src.stride = cvk_ctx->ops->tl_default_stride(cvk_ctx, tl_src.shape, tl_src.fmt, 1);
+  tl_src.stride.n = output->stride.n;
+
+  uint32_t la_low = output->start_address + tl_src.stride.n;
+  cvk_tl_t tl_dst;
+  tl_dst.start_address = la_low + sizeof(uint16_t); // concat higher part
+  tl_dst.fmt = CVK_FMT_BF16;
+  tl_dst.shape = output->shape;
+  tl_dst.shape.n = 1;
+  tl_dst.stride = cvk_ctx->ops->tl_default_stride(cvk_ctx, tl_dst.shape, tl_dst.fmt, 1);
+  tl_dst.stride.n = output->stride.n;
+
+  cvk_tiu_copy_param_t param = {0};
+  param.src = &tl_src;
+  param.dst = &tl_dst;
+  param.layer_id = 0;
+  cvk_ctx->ops->tiu_copy(cvk_ctx, &param);
+}
+
+static void store_fp32(cvk_context_t *cvk_ctx, int base_ga_idx, uint64_t ga_dst, cvk_tl_t *output) {
+
+  assert(output->shape.n == 2); // Exclude lower part
+  assert(output->shape.h == 1 && output->shape.w == 1);
+
+  cvk_tl_t src;
+  src.fmt = CVK_FMT_BF16;
+  src.shape = output->shape;
+  src.shape.n = 1;
+  src.shape.w = 2;
+  src.stride = cvk_ctx->ops->tl_default_stride(cvk_ctx, src.shape, src.fmt, 1);
+  src.stride.n = output->stride.n;
+  src.start_address = output->start_address + src.stride.n;
+  src.eu_align = 1;
+
+  cvk_tg_t dst;
+  dst.fmt = CVK_FMT_BF16;
+  dst.shape.n = 1;
+  dst.shape.c = output->shape.c;
+  dst.shape.h = output->shape.h;
+  dst.shape.w = 2;
+  dst.stride = cvk_ctx->ops->tg_default_stride(cvk_ctx, dst.shape, dst.fmt);
+  dst.base_reg_index = base_ga_idx;
+  dst.start_address = ga_dst;
+
+  cvk_tdma_l2g_tensor_copy_param_t param = {0};
+  param.src = &src;
+  param.dst = &dst;
+  param.layer_id = 0;
+  param.intra_cmd_paral = 0;
+  cvk_ctx->ops->tdma_l2g_bf16_tensor_copy(cvk_ctx, &param);
+}
+
+void runtimeJitEuclideanDistance(void *cvk_ctx, uint32_t records,
+                                 uint32_t feature_size,
+                                 std::vector<uint8_t> &cmdbuf_data) {
+  // tile
+  auto cvk = (cvk_context_t *)cvk_ctx;
+  cvk->ops->set_layer_id(cvk, 0);
+  uint32_t lane_num = cvk->info.npu_num;
+  uint32_t c_step = 0;
+  cvk_tl_shape_t input_x_shape = {1, lane_num, 1, feature_size};
+  uint32_t in_x_size =
+      cvk->ops->lmem_tensor_to_size(cvk, input_x_shape, CVK_FMT_BF16, 1);
+  for (c_step = records; c_step > 0; --c_step) {
+    uint32_t total_size = 0;
+    cvk_tl_shape_t in_y_shape = {1, c_step, 1, feature_size};
+    uint32_t in_y_size = cvk->ops->lmem_tensor_to_size(cvk, in_y_shape, CVK_FMT_BF16, 1);
+    cvk_tl_shape_t output_shape = {2, c_step, 1, 2};
+    uint32_t output_size =
+        cvk->ops->lmem_tensor_to_size(cvk, output_shape, CVK_FMT_BF16, 1);
+    total_size += in_x_size;
+    total_size += in_y_size;
+    total_size += output_size;
+    if (total_size < cvk->info.lmem_size) {
+      break;
+    }
+  }
+  assert(c_step);
+  //c_step = 8 * lane_num;
+  printf("c_step :%d\n", c_step);
+
+  int x_ga_base_reg_idx = 0;
+  int y_ga_base_reg_idx = 2;
+  int o_ga_base_reg_idx = 3;
+
+  uint64_t x_ga = 0;
+  uint64_t y_ga = 0;
+  uint64_t o_ga = 0;
+
+  // alloc lmem
+  cvk_tl_t *input_x = cvk->ops->lmem_alloc_tensor(cvk, input_x_shape, CVK_FMT_BF16, 1);
+  cvk_tl_shape_t input_y_shape = {1, c_step, 1, feature_size};
+  cvk_tl_t *input_y = cvk->ops->lmem_alloc_tensor(cvk, input_y_shape, CVK_FMT_BF16, 1);
+  cvk_tl_shape_t output_shape = {2, c_step, 1, 1};
+  cvk_tl_t *output =
+      cvk->ops->lmem_alloc_tensor(cvk, output_shape, CVK_FMT_BF16, 1);
+  assert(input_x);
+  assert(input_y);
+  assert(output);
+
+  // load input_x
+  cvk_tg_shape_t tg_input_x_shape = {1, lane_num, 1, feature_size};
+  cvk_tg_stride_t tg_input_x_stride = cvk->ops->tg_default_stride(cvk, tg_input_x_shape, CVK_FMT_U8);
+  tg_input_x_stride.c = 0;
+  tg_input_x_stride.n = 0;
+  load_and_convert_to_bf16(cvk, input_x, input_x_shape, tg_input_x_stride, x_ga_base_reg_idx, x_ga);
+
+  for (uint32_t c_pos = 0; c_pos < records;) {
+    uint32_t tile_c = std::min(c_step, records - c_pos);
+
+    //load input_y
+    cvk_tl_shape_t input_y_shape = {1, tile_c, 1, feature_size};
+    cvk_tg_shape_t tg_input_y_shape = {input_y_shape.n, input_y_shape.c, input_y_shape.h, input_y_shape.w};
+    cvk_tg_stride_t tg_input_y_stride = cvk->ops->tg_default_stride(cvk, tg_input_y_shape, CVK_FMT_U8);
+    input_y->shape.c = tile_c;
+    load_and_convert_to_bf16(cvk, input_y, input_y_shape, tg_input_y_stride, y_ga_base_reg_idx, y_ga + c_pos * feature_size);
+
+    cvk_tl_t b;
+    b.start_address = input_x->start_address;
+    b.shape = input_y->shape;
+    b.stride = input_y->stride;
+    b.stride.c = 0;
+    b.stride.n = 0;
+    b.fmt = input_x->fmt;
+
+    cvk_tiu_sub_param_t p1 = {0};
+    p1.res_high = 0;
+    p1.res_low = input_y;
+    p1.a_high = 0;
+    p1.a_low = input_y;
+    p1.b_high = 0;
+    p1.b_low = &b;
+    p1.rshift_bits = 0;
+    p1.layer_id = 0;
+    cvk->ops->tiu_sub(cvk, &p1);
+
+    output->shape.n = 1;
+    output->shape.c = tile_c;
+
+    cvk_tiu_depthwise_pt_convolution_param_t p2 = {0};
+    p2.ofmap = output;
+    p2.ifmap = input_y;
+    p2.weight = input_y;
+    p2.bias = nullptr;
+    p2.ins_h = 0;
+    p2.ins_w = 0;
+    p2.ins_last_h = 0;
+    p2.ins_last_w = 0;
+    p2.pad_top = 0;
+    p2.pad_bottom = 0;
+    p2.pad_left = 0;
+    p2.pad_right = 0;
+    p2.stride_h = 1;
+    p2.stride_w = 1;
+    p2.dilation_h = 1;
+    p2.dilation_w = 1;
+    p2.relu_enable = false;
+    p2.rshift_bits = 0;
+    p2.ps32_mode = 2;
+    p2.layer_id = 0;
+    cvk->ops->tiu_pt_depthwise_convolution(cvk, &p2);
+
+    output->shape.n = 2;
+    convert_ps32_to_fp32(cvk, output);
+
+    store_fp32(cvk, o_ga_base_reg_idx, o_ga + c_pos * sizeof(float), output);
+
+    c_pos += tile_c;
+  }
+
+  cvk->ops->lmem_free_tensor(cvk, output);
+  cvk->ops->lmem_free_tensor(cvk, input_y);
+  cvk->ops->lmem_free_tensor(cvk, input_x);
+
+  uint32_t size;
+  auto cmdbuf = cvk->ops->acquire_cmdbuf(cvk, &size);
+  cmdbuf_data.resize(size);
+  memcpy(cmdbuf_data.data(), cmdbuf, size);
+  cvk->ops->reset(cvk);
+  return;
+}
diff --git a/cviruntime/samples_inner/megvii_euc_op/main.cpp b/cviruntime/samples_inner/megvii_euc_op/main.cpp
new file mode 100644
index 000000000..7158aeaa2
--- /dev/null
+++ b/cviruntime/samples_inner/megvii_euc_op/main.cpp
@@ -0,0 +1,113 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <cmath>
+
+#include <vector>
+#include <fstream>
+#include <cviruntime.h>
+
+#include "euc_backend.hpp" // include names reconstructed from usage
+
+using namespace std;
+
+void check_output(uint8_t *in1, uint8_t *in2, float *output, int m, int k, int n) {
+  std::vector<float> out;
+  float sum = 0;
+  for (int i = 0; i < n; ++i) {
+    float sum = 0;
+    for (int j = 0; j < k; ++j) {
+      sum += std::pow(in1[j] - in2[i * k + j], 2);
+    }
+    out.emplace_back(sum);
+  }
+
+#ifdef __DUMP__
+  std::ofstream o_f1("sim.text");
+  std::ofstream o_f2("cmdbuf_out.text");
+#endif
+  for (int i = 0; i < n; ++i) {
+#ifdef __DUMP__
+    o_f1 << out[i] << "\n";
+    o_f2 << output[i] << "\n";
+#endif
+    if (out[i] != output[i]) {
+      printf("check failed idx:%d [%f vs %f]\n", i, out[i], output[i]);
+      //return;
+    }
+  }
+#ifdef __DUMP__
+  o_f1.close();
+  o_f2.close();
+#endif
+  printf("check success\n");
+}
+
+int main(int argc, char *argv[]) {
+  CVI_RT_HANDLE ctx;
+  CVI_RT_Init(&ctx);
+  CVI_RT_KHANDLE k_ctx = CVI_RT_RegisterKernel(ctx, 200000);
+  int m = 1;
+  int k = 256;
+  int n = 4096;
+
+  CVI_RT_MEM input1 = CVI_RT_MemAlloc(ctx, k);
+  CVI_RT_MEM input2 = CVI_RT_MemAlloc(ctx, k * n);
+  CVI_RT_MEM output = CVI_RT_MemAlloc(ctx, n * 4);
+
+  uint8_t* in_ptr1 = CVI_RT_MemGetVAddr(input1);
+  uint8_t* in_ptr2 = CVI_RT_MemGetVAddr(input2);
+
+  for (int i = 0; i < k; ++i) {
+    in_ptr1[i] = 2;
+  }
+  int value = 0;
+
+  for (int i = 0; i < n; ++i) {
+    value++;
+    value %= 10;
+    for (int j = 0; j < k; ++j) {
+      in_ptr2[i * k + j] = value;
+    }
+  }
+  CVI_RT_MemFlush(ctx, input1);
+  CVI_RT_MemFlush(ctx, input2);
+
+  std::vector<uint8_t> cmdbuf;
+  #if 1
+  runtimeJitEuclideanDistance(k_ctx, n, k, cmdbuf);
+  {
+    printf("cmdbuf size:%zu\n", cmdbuf.size());
+    std::ofstream o_f("search_256_182x.bin", std::ios::binary);
+    o_f.write((char *)cmdbuf.data(), cmdbuf.size());
+    o_f.close();
+  }
+  #else
+  {
+    std::ifstream i_f("cmdbuf/search_256_182x.bin", std::ios::binary);
+    i_f.seekg(0, i_f.end);
+    size_t length = i_f.tellg();
+    cmdbuf.resize(length);
+    i_f.seekg(0, i_f.beg);
+    i_f.read((char *)cmdbuf.data(), cmdbuf.size());
+  }
+  #endif
+
+  CVI_RT_MEM cmdbuf_mem;
+  int ret = CVI_RT_LoadCmdbuf(ctx, (uint8_t*)cmdbuf.data(), cmdbuf.size(), CVI_RT_MemGetPAddr(input1), 0, false, &cmdbuf_mem);
+
+  ret = CVI_RT_RunCmdbuf(ctx, cmdbuf_mem, CVI_RT_MemGetPAddr(input2), CVI_RT_MemGetPAddr(output));
+  CVI_RT_MemInvld(ctx, output);
+
+  check_output(in_ptr1, in_ptr2, (float *)CVI_RT_MemGetVAddr(output), m, k, n);
+
+  CVI_RT_MemFree(ctx, input1);
+  CVI_RT_MemFree(ctx, input2);
+  CVI_RT_MemFree(ctx, output);
+  CVI_RT_MemFree(ctx, cmdbuf_mem);
+  CVI_RT_UnRegisterKernel(k_ctx);
+  CVI_RT_DeInit(ctx);
+  return 0;
+
+}
diff --git a/cviruntime/samples_inner/megvii_euc_op/main_matmul.cpp b/cviruntime/samples_inner/megvii_euc_op/main_matmul.cpp
new file mode 100644
index 000000000..6940aeaa5
--- /dev/null
+++ b/cviruntime/samples_inner/megvii_euc_op/main_matmul.cpp
@@ -0,0 +1,106 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <vector>
+#include <fstream>
+#include <cviruntime.h>
+#include <cviruntime_extra.h> // include names reconstructed from usage
+
+using namespace std;
+
+void check_output(uint8_t *in1, uint8_t *in2, int32_t *output) {
+  std::vector<int32_t> out;
+  int32_t sum = 0;
+#if 0
+  for (int i = 0; i < 4096; ++i) {
+    int32_t sum = 0;
+    for (int j = 0; j < 512; ++j) {
+      sum += in1[j] * in2[i * 512 + j];
+    }
+    out.emplace_back(sum);
+    printf("sim without transpose\n");
+  }
+#else
+  for (int i = 0; i < 4096; ++i) {
+    int32_t sum = 0;
+    for (int j = 0; j < 512; ++j) {
+      sum += in1[j] * in2[j * 4096 + i];
+    }
+    out.emplace_back(sum);
+  }
+  printf("sim with transpose\n");
+
+#endif
transpose\n"); + +#endif + + std::ofstream o_f1("sim.text"); + std::ofstream o_f2("model_out.text"); + for (int i = 0; i < 4096; ++i) { + o_f1 << out[i] << "\n"; + o_f2 << output[i] << "\n"; + if (out[i] != output[i]) { + printf("check failed\n"); + } + } + o_f1.close(); + o_f2.close(); + printf("check success\n"); +} + +int main(int argc, char *argv[]) { + CVI_RT_HANDLE ctx; + CVI_RT_Init(&ctx); + int m = 1; + int k = 512; + int n = 4096; + + CVI_KFUNC_HANDLE dmabuf = CVI_NN_PrepareMatrixMulKernelFunc(ctx, CVI_FMT_UINT8, 1, 512, 4096); + + CVI_RT_MEM input1 = CVI_RT_MemAlloc(ctx, 512); + CVI_RT_MEM input2 = CVI_RT_MemAlloc(ctx, 512 * 4096); + CVI_RT_MEM output = CVI_RT_MemAlloc(ctx, 4096 * 4); + + uint8_t* in_ptr1 = CVI_RT_MemGetVAddr(input1); + uint8_t* in_ptr2 = CVI_RT_MemGetVAddr(input2); + + for (int i = 0; i < 512; ++i) { + in_ptr1[i] = 2; + } + int value = 0; + +#define NOT_T +#ifdef NOT_T + for (int i = 0; i < 4096; ++i) { + value++; + value %= 10; + for (int j = 0; j < 512; ++j) { + in_ptr2[i * 512 + j] = value; + } + } + printf("fill without transpose\n"); +#else + for (int i = 0; i < 4096; ++i) { + value++; + value %= 10; + for (int j = 0; j < 512; ++j) { + in_ptr2[j * 4096 + i] = value; + } + } + printf("fill with transpose\n"); + +#endif + CVI_RT_MemFlush(ctx, input1); + CVI_RT_MemFlush(ctx, input2); + + int ret = CVI_NN_RunKernelFunc(dmabuf, 3, CVI_RT_MemGetPAddr(input1), CVI_RT_MemGetPAddr(input2), CVI_RT_MemGetPAddr(output)); + CVI_RT_MemInvld(ctx, output); + + check_output(in_ptr1, in_ptr2, (int32_t *)CVI_RT_MemGetVAddr(output)); + + CVI_RT_MemFree(ctx, input1); + CVI_RT_MemFree(ctx, input2); + CVI_RT_MemFree(ctx, output); + CVI_NN_DestroyKernelFunc(dmabuf); + CVI_RT_DeInit(ctx); + return 0; + +} diff --git a/cviruntime/samples_inner/mt/CMakeLists.txt b/cviruntime/samples_inner/mt/CMakeLists.txt new file mode 100644 index 000000000..701269a6b --- /dev/null +++ b/cviruntime/samples_inner/mt/CMakeLists.txt @@ -0,0 +1,30 @@ +cmake_minimum_required(VERSION 2.8.0) + +project(cvi_sample_detector C CXX) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS ON) + +if(NOT DEFINED TPU_SDK_PATH) + message(FATAL_ERROR "Please set TPU_SDK_PATH to point to the TPU_SDK installation") +endif() +include_directories(${TPU_SDK_PATH}/include) +link_directories(${TPU_SDK_PATH}/lib) + +if(NOT DEFINED OPENCV_PATH) + message(FATAL_ERROR "Please set OPENCV_PATH to point to the opencvn installation") +endif() +include_directories(${OPENCV_PATH}/include) +link_directories(${OPENCV_PATH}/lib) + +set(CVI_LIBS ${CVI_LIBS} cviruntime cvikernel cnpy) +set(OPENCV_LIBS ${OPENCV_LIBS} opencv_core opencv_imgcodecs opencv_imgproc) +set(EXTRA_LIBS ${EXTRA_LIBS} dl stdc++ pthread z) + +add_executable(mt mt.cpp mt_model.cpp) +target_link_libraries(mt + ${CVI_LIBS} + ${OPENCV_LIBS} + ${EXTRA_LIBS}) +install(TARGETS mt mt DESTINATION bin) diff --git a/cviruntime/samples_inner/mt/README b/cviruntime/samples_inner/mt/README new file mode 100644 index 000000000..3f910da4b --- /dev/null +++ b/cviruntime/samples_inner/mt/README @@ -0,0 +1,8 @@ +1. build mt binary in src directory. +2. cp binary, npz and cvimodel to ecb: +3. cp cvitek_tpu_sdk.tar.gz to evb and extract it to /mnt/data/ +4. 
+   # bash
+   # cd cvitek_tpu_sdk && source envs_tpu_sdk.sh
+   # ./mt mt_encoder_decoder_0_10_20_30_39_mix_2.cvimodel mt_input_x.npz
+
diff --git a/cviruntime/samples_inner/mt/build.sh b/cviruntime/samples_inner/mt/build.sh
new file mode 100755
index 000000000..af64f23cc
--- /dev/null
+++ b/cviruntime/samples_inner/mt/build.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+set -xe
+
+TPU_SDK_PATH=/work/sdk/install/soc_cv1826_wevb_0005a_spinand/tpu_32/cvitek_tpu_sdk
+echo "TPU_SDK_PATH=$TPU_SDK_PATH"
+
+if [ -z $TPU_SDK_PATH ]; then
+  echo "please set TPU_SDK_PATH"
+fi
+
+mkdir -p build
+cd build
+cmake .. \
+  -DCMAKE_BUILD_TYPE=RELEASE \
+  -DCMAKE_C_FLAGS_RELEASE=-O3 \
+  -DCMAKE_CXX_FLAGS_RELEASE=-O3 \
+  -DCMAKE_TOOLCHAIN_FILE=$TPU_SDK_PATH/cmake/toolchain-linux-gnueabihf.cmake \
+  -DTPU_SDK_PATH=$TPU_SDK_PATH \
+  -DOPENCV_PATH=$TPU_SDK_PATH/opencv \
+  -DCMAKE_INSTALL_PREFIX=./
+make install
+
diff --git a/cviruntime/samples_inner/mt/mt.cpp b/cviruntime/samples_inner/mt/mt.cpp
new file mode 100644
index 000000000..495f8f1ee
--- /dev/null
+++ b/cviruntime/samples_inner/mt/mt.cpp
@@ -0,0 +1,120 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/time.h>
+#include <algorithm>
+#include <string>
+#include "cviruntime.h"
+#include "cnpy.h"
+#include "mt_model.hpp"
+
+/*
+static void load_input(int16_t *seq) {
+  memset(seq, 0, INFER_FIX_LEN * sizeof(seq));
+  seq[0] = 4;
+  seq[1] = 3201;
+  seq[2] = 6;
+  seq[3] = 150;
+  seq[4] = 1121;
+  seq[5] = 2;
+
+  printf("src_seq:\n");
+  for (int i = 0; i < INFER_FIX_LEN; i++) {
+    printf("%d ", seq[i]);
+  }
+  printf("\n");
+}
+*/
+
+static int levenshtein_distance(const uint16_t* s, int n, const uint16_t* t, int m) {
+  ++n; ++m;
+  int* d = new int[n * m];
+  memset(d, 0, sizeof(int) * n * m);
+  for (int i = 1, im = 0; i < m; ++i, ++im) {
+    for (int j = 1, jn = 0; j < n; ++j, ++jn) {
+      if (s[jn] == t[im]) {
+        d[(i * n) + j] = d[((i - 1) * n) + (j - 1)];
+      } else {
+        d[(i * n) + j] = std::min(d[(i - 1) * n + j] + 1,         /* A deletion. */
+                         std::min(d[i * n + (j - 1)] + 1,         /* An insertion. */
+                                  d[(i - 1) * n + (j - 1)] + 1)); /* A substitution. */
+      }
+    }
+  }
+  int r = d[n * m - 1];
+  delete [] d;
+  return r;
+}
+
+int main(int argc, char **argv) {
+  int ret = 0;
+  if (argc < 4) {
+    printf("Usage:\n");
+    printf("   %s mt-cvimodel input_npz ref_npz\n", argv[0]);
+    exit(1);
+  }
+
+  MTrans trans(argv[1]);
+  int16_t src_seq[INFER_FIX_LEN];
+  //load_input(argv[2], src_seq);
+
+  cnpy::npz_t seq_npz = cnpy::npz_load(argv[2]);
+  if (seq_npz.size() == 0) {
+    printf("Failed to load input npz\n");
+  }
+  cnpy::npz_t ref_npz = cnpy::npz_load(argv[3]);
+  if (ref_npz.size() == 0) {
+    printf("Failed to load ref npz\n");
+  }
+
+  int err_sum = 0;
+  for (int i = 0; i < (int)seq_npz.size(); i++) {
+    auto name = std::to_string(i);
+    auto &input_data = seq_npz[name];
+    auto in_ptr = input_data.data<uint16_t>();
+    memcpy(src_seq, in_ptr, sizeof(uint16_t) * INFER_FIX_LEN);
+    printf("src_seq: ");
+    for (int j = 0; j < INFER_FIX_LEN; j++) {
+      printf("%d ", src_seq[j]);
+    }
+    printf("\n");
+
+
+    struct timeval t0, t1;
+    gettimeofday(&t0, NULL);
+
+    int16_t gen_seq[INFER_FIX_LEN];
+    trans.run(src_seq, INFER_FIX_LEN, gen_seq, INFER_FIX_LEN);
+
+    gettimeofday(&t1, NULL);
+    long elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
+
+    printf("gen_seq: ");
+    for (int j = 0; j < INFER_FIX_LEN; j++) {
+      printf("%d ", gen_seq[j]);
+    }
+    printf("\n");
+
+    auto& data = ref_npz[name];
+    auto ptr = data.data<uint16_t>();
+    printf("ref_seq: ");
+    for (int j = 0; j < INFER_FIX_LEN; j++) {
+      printf("%d ", ptr[j]);
+    }
+    printf("\n");
+
+    int err = levenshtein_distance((uint16_t *)gen_seq, 40, ptr, 40);
+    // int err = 0;
+    // for (int i = 0; i < INFER_FIX_LEN; i++) {
+    //   if (gen_seq[i] != ptr[i]) {
+    //     err += 1;
+    //   }
+    // }
+    err_sum += err;
+    printf("%d => error:%d, sum:%d, performance: %fms\n", i, err, err_sum, elapsed / 1000.0);
+  }
+
+  printf("accuracy: %f%%\n", 100 - (err_sum * 100.0f / (seq_npz.size() * 40)));
+  return 0;
+}
\ No newline at end of file
diff --git a/cviruntime/samples_inner/mt/mt_model.cpp b/cviruntime/samples_inner/mt/mt_model.cpp
new file mode 100644
index 000000000..f0ada08da
--- /dev/null
+++ b/cviruntime/samples_inner/mt/mt_model.cpp
@@ -0,0 +1,219 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <assert.h>
+#include <iostream>
+#include <string>
+#include "cviruntime.h"
+#include "cnpy.h"
+#include "mt_model.hpp"
+
+static bf16_t mask_val() {
+  float val = -50;
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+  return ((bf16_t *)(&val))[0];
+#else
+  return ((bf16_t *)(&val))[1];
+#endif
+}
+
+/*
+static void store_result(std::string name, CVI_TENSOR *tensor) {
+  std::vector<size_t> shape = {
+      (size_t)tensor->shape.dim[0], (size_t)tensor->shape.dim[1],
+      (size_t)tensor->shape.dim[2], (size_t)tensor->shape.dim[3]};
+  cnpy::npz_t npz;
+  cnpy::npz_add_array(npz, tensor->name,
+                      (uint16_t *)CVI_NN_TensorPtr(tensor), shape);
+  cnpy::npz_save_all(name, npz);
+}
+*/
+
+Encoder::Encoder(const char *model_file) {
+  int ret = CVI_NN_RegisterModel(model_file, &model);
+  if (ret != CVI_RC_SUCCESS) {
+    printf("CVI_NN_RegisterModel failed, err %d\n", ret);
+    exit(1);
+  }
+  ret = CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num,
+                                     &output_tensors, &output_num);
+  if (ret != CVI_RC_SUCCESS) {
+    printf("CVI_NN_GetInputOutputTensors failed, err %d\n", ret);
+    exit(1);
+  }
+  assert(input_num == 2);
+  assert(output_num == 1);
+  src_seq = &input_tensors[0];
+  src_mask = &input_tensors[1];
+  enc_output = &output_tensors[0];
+  for (int i = 0; i < input_num; i++) {
+    std::cout << "input => " << input_tensors[i].name << "\n";
+  }
+}
+
+void Encoder::gen_src_mask(int16_t *seq, int32_t size) {
+  auto ptr = (bf16_t *)CVI_NN_TensorPtr(src_mask);
+  assert(CVI_NN_TensorCount(src_mask) == size);
+  auto filled_val = mask_val();
+  for (int i = 0; i < size; i++) {
+    ptr[i] = (seq[i] == 0) ? filled_val : 0;
+  }
+}
+
+bf16_t *Encoder::get_mask() {
+  return (bf16_t *)CVI_NN_TensorPtr(src_mask);
+}
+
+bf16_t* Encoder::run(int16_t *seq, int32_t size) {
+  // fill src_seq to tensor 0
+  CVI_NN_SetTensorPtr(src_seq, seq);
+  // generate src mask to tensor 1
+  gen_src_mask(seq, size);
+
+  /*
+  printf("src_seq:");
+  for (int i = 0; i < (int)CVI_NN_TensorCount(src_seq); i++) {
+    printf("%d ", ((int16_t *)CVI_NN_TensorPtr(src_seq))[i]);
+  }
+  printf("\n");
+  printf("src_mask:");
+  for (int i = 0; i < (int)CVI_NN_TensorCount(src_mask); i++) {
+    printf("%d ", ((int16_t *)CVI_NN_TensorPtr(src_mask))[i]);
+  }
+  printf("\n");
+  */
+  // run inference
+  CVI_NN_Forward(model, input_tensors, input_num,
+                 output_tensors, output_num);
+  //store_result("xx_enc_output.npz", enc_output);
+
+  return (bf16_t *)CVI_NN_TensorPtr(enc_output);
+}
+
+Decoder::Decoder(CVI_MODEL_HANDLE main_model, int32_t max_step)
+    : max_step(max_step) {
+  int ret = CVI_NN_CloneModel(main_model, &model);
+  if (ret != CVI_RC_SUCCESS) {
+    printf("CVI_NN_CloneModel failed, err %d\n", ret);
+    exit(1);
+  }
+  CVI_NN_SetConfig(model, OPTION_BATCH_SIZE, 1);
+  switch(max_step) {
+    case 0:
+      CVI_NN_SetConfig(model, OPTION_PROGRAM_INDEX, 1);
+      break;
+    case 10:
+      CVI_NN_SetConfig(model, OPTION_PROGRAM_INDEX, 2);
+      break;
+    case 20:
+      CVI_NN_SetConfig(model, OPTION_PROGRAM_INDEX, 3);
+      break;
+    case 30:
+      CVI_NN_SetConfig(model, OPTION_PROGRAM_INDEX, 4);
+      break;
+    case 39:
+      CVI_NN_SetConfig(model, OPTION_PROGRAM_INDEX, 5);
+      break;
+  }
+  ret = CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num,
+                                     &output_tensors, &output_num);
+  if (ret != CVI_RC_SUCCESS) {
+    printf("CVI_NN_GetInputOutputTensors failed, err %d\n", ret);
+    exit(1);
+  }
+  assert(input_num == 4);
+  assert(output_num == 1);
+  trg_seq = &input_tensors[0];
+  enc_output = &input_tensors[1];
+  src_mask = &input_tensors[2];
+  trg_mask = &input_tensors[3];
+  dec_output = &output_tensors[0];
+  width = dec_output->shape.dim[2];
+  for (int i = 0; i < input_num; i++) {
+    std::cout << "input => " << input_tensors[i].name << "\n";
+  }
+  std::cout << max_step << "- Decoder: tensors: "
+            << trg_seq->name << ", "
+            << trg_mask->name << ", "
+            << enc_output->name << ", "
+            << src_mask->name << ", "
+            << dec_output->name << ", width:"
+            << width << "\n";
+  // generate default trg mask
+  gen_trg_mask();
+}
+
+void Decoder::gen_trg_mask() {
+  auto filled_val = mask_val();
+  auto ptr = (bf16_t *)CVI_NN_TensorPtr(trg_mask);
+  for (int i = 0; i < max_step; i++) {
+    for (int j = 0; j < max_step; j++) {
+      ptr[i * max_step + j] = (j > i) ? filled_val : 0;
+    }
+  }
+}
+
+int16_t Decoder::argmax(int step) {
+  step = (step == 0) ? 0 : (step - 1);
+  auto ptr = (int8_t *)CVI_NN_TensorPtr(dec_output);
+  ptr += step * width;
+  int idx = 0;
+  int8_t max_value = 0;
+  for (int j = 0; j < width; j++) {
+    int8_t val = ptr[j];
+    if (val < 0) {
+      continue;
+    }
+    if (val > max_value) {
+      idx = j;
+      max_value = val;
+    }
+  }
+  return idx;
+}
+
+int16_t Decoder::run(int step, int16_t *seq, bf16_t *enc, bf16_t *mask) {
+  // fill data to input tensor
+  CVI_NN_SetTensorPtr(trg_seq, seq);
+  CVI_NN_SetTensorPtr(enc_output, enc);
+  CVI_NN_SetTensorPtr(src_mask, mask);
+  // run inference
+  CVI_NN_Forward(model, input_tensors, input_num,
+                 output_tensors, output_num);
+  // std::string name = "xx_decode_" + std::to_string(step) + "_out.npz";
+  // store_result(name, dec_output);
+  return argmax(step);
+}
+
+void MTrans::run(int16_t *seq, int32_t seq_sz, int16_t *gen_seq, int32_t gen_seq_sz) {
+  // clean gen_seq array.
+  memset(gen_seq, 0, gen_seq_sz * sizeof(int16_t));
+
+  auto enc_output = encoder->run(seq, seq_sz);
+  auto src_mask = encoder->get_mask();
+
+  int16_t trg_seq = 1;
+  auto best_idx = decoder_0->run(0, &trg_seq, enc_output, src_mask);
+  gen_seq[0] = SOS_IDX;
+  gen_seq[1] = best_idx;
+
+  int seq_len = 0;
+  for (int step = 2; step < INFER_FIX_LEN; step++) {
+    if (step <= 10) {
+      best_idx = decoder_10->run(step, gen_seq, enc_output, src_mask);
+    } else if (step <= 20) {
+      best_idx = decoder_20->run(step, gen_seq, enc_output, src_mask);
+    } else if (step <= 30) {
+      best_idx = decoder_30->run(step, gen_seq, enc_output, src_mask);
+    } else {
+      best_idx = decoder_39->run(step, gen_seq, enc_output, src_mask);
+    }
+    gen_seq[step] = best_idx;
+    seq_len = step + 1;
+    // if (gen_seq[39] == EOS_IDX) {
+    if (best_idx == EOS_IDX) {
+      break;
+    }
+  }
+}
diff --git a/cviruntime/samples_inner/mt/mt_model.hpp b/cviruntime/samples_inner/mt/mt_model.hpp
new file mode 100644
index 000000000..adfc0a4bd
--- /dev/null
+++ b/cviruntime/samples_inner/mt/mt_model.hpp
@@ -0,0 +1,114 @@
+#ifndef __SAMPLES_MT_MODEL_HPP
+#define __SAMPLES_MT_MODEL_HPP
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <assert.h>
+#include <iostream>
+#include <vector>
+#include "cviruntime.h"
+#include "cnpy.h"
+
+// special token ids and fixed inference length
+#define LEXICON_SIZE 16002
+#define PAD_IDX 0
+#define SOS_IDX 1
+#define EOS_IDX 2
+#define INFER_FIX_LEN 40
+typedef uint16_t bf16_t;
+
+class Encoder {
+public:
+  Encoder(const char *model_file);
+  ~Encoder() {
+    if (model) {
+      CVI_NN_CleanupModel(model);
+    }
+  }
+
+  bf16_t* run(int16_t *seq, int32_t size);
+  bf16_t* get_mask();
+
+public:
+  CVI_MODEL_HANDLE model = nullptr;
+  CVI_TENSOR *src_seq;
+  CVI_TENSOR *src_mask;
+  CVI_TENSOR *enc_output;
+
+private:
+  void gen_src_mask(int16_t *src_seq, int32_t size);
+
+  CVI_TENSOR *input_tensors;
+  CVI_TENSOR *output_tensors;
+  int32_t input_num;
+  int32_t output_num;
+};
+
+class Decoder {
+public:
+  Decoder(CVI_MODEL_HANDLE model, int32_t max_step);
+  ~Decoder() {
+    if (model) {
+      CVI_NN_CleanupModel(model);
+    }
+  }
+
+  int16_t run(int step, int16_t *seq,
+              bf16_t *enc, bf16_t *mask);
+
+public:
+  CVI_TENSOR *trg_seq;
+  CVI_TENSOR *trg_mask;
+  CVI_TENSOR *enc_output;
+  CVI_TENSOR *src_mask;
+  CVI_TENSOR *dec_output;
+  int32_t max_step;
+  int32_t width;
+
+
+private:
+  void gen_trg_mask();
+  int16_t argmax(int32_t step);
+  CVI_MODEL_HANDLE model = nullptr;
+  CVI_TENSOR *input_tensors;
+  CVI_TENSOR *output_tensors;
+  int32_t input_num;
+  int32_t output_num;
+};
+
+class MTrans {
+public:
+  MTrans(const char *cvimodel) {
+    encoder = new Encoder(cvimodel);
+    decoder_0 = new Decoder(encoder->model, 0);
+    decoder_10 = new Decoder(encoder->model, 10);
+    decoder_20 = new Decoder(encoder->model, 20);
+    decoder_30 = new Decoder(encoder->model, 30);
+    decoder_39 = new Decoder(encoder->model, 39);
+  }
+
+  ~MTrans() {
+    delete encoder;
+    delete decoder_0;
+    delete decoder_10;
+    delete decoder_20;
+    delete decoder_30;
+    delete decoder_39;
+  }
+
+  void run(int16_t *seq, int32_t seq_sz,
+           int16_t *gen_seq, int32_t gen_seq_sz);
+
+private:
+  Encoder *encoder;
+  Decoder *decoder_0;
+  Decoder *decoder_10;
+  Decoder *decoder_20;
+  Decoder *decoder_30;
+  Decoder *decoder_39;
+};
+
+
+#endif
\ No newline at end of file
diff --git a/cviruntime/samples_inner/mt_bidir/CMakeLists.txt b/cviruntime/samples_inner/mt_bidir/CMakeLists.txt
new file mode 100644
index 000000000..701269a6b
--- /dev/null
+++ b/cviruntime/samples_inner/mt_bidir/CMakeLists.txt
@@ -0,0 +1,30 @@
+cmake_minimum_required(VERSION 2.8.0)
+
+project(mt C CXX)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS ON)
+
+if(NOT DEFINED TPU_SDK_PATH)
+  message(FATAL_ERROR "Please set TPU_SDK_PATH to point to the TPU_SDK installation")
+endif()
+include_directories(${TPU_SDK_PATH}/include)
+link_directories(${TPU_SDK_PATH}/lib)
+
+if(NOT DEFINED OPENCV_PATH)
+  message(FATAL_ERROR "Please set OPENCV_PATH to point to the opencv installation")
+endif()
+include_directories(${OPENCV_PATH}/include)
+link_directories(${OPENCV_PATH}/lib)
+
+set(CVI_LIBS ${CVI_LIBS} cviruntime cvikernel cnpy)
+set(OPENCV_LIBS ${OPENCV_LIBS} opencv_core opencv_imgcodecs opencv_imgproc)
+set(EXTRA_LIBS ${EXTRA_LIBS} dl stdc++ pthread z)
+
+add_executable(mt mt.cpp mt_model.cpp)
+target_link_libraries(mt
+  ${CVI_LIBS}
+  ${OPENCV_LIBS}
+  ${EXTRA_LIBS})
+install(TARGETS mt DESTINATION bin)
diff --git a/cviruntime/samples_inner/mt_bidir/README b/cviruntime/samples_inner/mt_bidir/README
new file mode 100644
index 000000000..a66d7ef6d
--- /dev/null
+++ b/cviruntime/samples_inner/mt_bidir/README
@@ -0,0 +1,8 @@
+1. build mt binary in src directory.
+2. cp binary, npz and cvimodel to evb:
+3. cp cvitek_tpu_sdk.tar.gz to evb and extract it to /mnt/data/
+4. execute cmds:
+   # bash
+   # cd cvitek_tpu_sdk && source envs_tpu_sdk.sh
+   # ./mt mt_encoder_decoder_0_10_20_30_39_mix_2.cvimodel mt_input_x.npz mt_output_ref.npz s2t
+
diff --git a/cviruntime/samples_inner/mt_bidir/build.sh b/cviruntime/samples_inner/mt_bidir/build.sh
new file mode 100755
index 000000000..af64f23cc
--- /dev/null
+++ b/cviruntime/samples_inner/mt_bidir/build.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+set -xe
+
+TPU_SDK_PATH=/work/sdk/install/soc_cv1826_wevb_0005a_spinand/tpu_32/cvitek_tpu_sdk
+echo "TPU_SDK_PATH=$TPU_SDK_PATH"
+
+if [ -z $TPU_SDK_PATH ]; then
+  echo "please set TPU_SDK_PATH"
+fi
+
+mkdir -p build
+cd build
+cmake .. \
+  -DCMAKE_BUILD_TYPE=RELEASE \
+  -DCMAKE_C_FLAGS_RELEASE=-O3 \
+  -DCMAKE_CXX_FLAGS_RELEASE=-O3 \
+  -DCMAKE_TOOLCHAIN_FILE=$TPU_SDK_PATH/cmake/toolchain-linux-gnueabihf.cmake \
+  -DTPU_SDK_PATH=$TPU_SDK_PATH \
+  -DOPENCV_PATH=$TPU_SDK_PATH/opencv \
+  -DCMAKE_INSTALL_PREFIX=./
+make install
+
diff --git a/cviruntime/samples_inner/mt_bidir/mt.cpp b/cviruntime/samples_inner/mt_bidir/mt.cpp
new file mode 100644
index 000000000..e3ddb3847
--- /dev/null
+++ b/cviruntime/samples_inner/mt_bidir/mt.cpp
@@ -0,0 +1,111 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/time.h>
+#include <algorithm>
+#include <string>
+#include "cviruntime.h"
+#include "cnpy.h"
+#include "mt_model.hpp"
+
+static int levenshtein_distance(const uint16_t* s, int n, const uint16_t* t, int m) {
+  ++n; ++m;
+  int* d = new int[n * m];
+  memset(d, 0, sizeof(int) * n * m);
+  for (int i = 1, im = 0; i < m; ++i, ++im) {
+    for (int j = 1, jn = 0; j < n; ++j, ++jn) {
+      if (s[jn] == t[im]) {
+        d[(i * n) + j] = d[((i - 1) * n) + (j - 1)];
+      } else {
+        d[(i * n) + j] = std::min(d[(i - 1) * n + j] + 1,         /* A deletion. */
+                         std::min(d[i * n + (j - 1)] + 1,         /* An insertion. */
+                                  d[(i - 1) * n + (j - 1)] + 1)); /* A substitution. */
+      }
+    }
+  }
+  int r = d[n * m - 1];
+  delete [] d;
+  return r;
+}
+
+int main(int argc, char **argv) {
+  int ret = 0;
+  if (argc < 4) {
+    printf("Usage:\n");
+    printf("   %s mt-cvimodel input_npz ref_npz\n", argv[0]);
+    exit(1);
+  }
+
+  MTrans trans(argv[1]);
+  int16_t src_seq[INFER_FIX_LEN];
+
+  cnpy::npz_t seq_npz = cnpy::npz_load(argv[2]);
+  if (seq_npz.size() == 0) {
+    printf("Failed to load input npz\n");
+  }
+  cnpy::npz_t ref_npz = cnpy::npz_load(argv[3]);
+  if (ref_npz.size() == 0) {
+    printf("Failed to load ref npz\n");
+  }
+
+  int err_sum = 0;
+  int64_t elapsed_max = 0, elapsed_min = 0, elapsed_total = 0;
+  int num_seq = (int)seq_npz.size();
+  for (int i = 0; i < num_seq; i++) {
+    auto name = std::to_string(i);
+    auto &input_data = seq_npz[name];
+    auto in_ptr = input_data.data<uint16_t>();
+    memcpy(src_seq, in_ptr, sizeof(uint16_t) * INFER_FIX_LEN);
+    printf("src_seq: ");
+    for (int j = 0; j < INFER_FIX_LEN; j++) {
+      printf("%d ", src_seq[j]);
+    }
+    printf("\n");
+
+
+    struct timeval t0, t1;
+    gettimeofday(&t0, NULL);
+
+    int16_t gen_seq[INFER_FIX_LEN];
+    trans.run(src_seq, INFER_FIX_LEN, gen_seq, INFER_FIX_LEN);
+
+    gettimeofday(&t1, NULL);
+    long elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
+    if (elapsed_max < elapsed) {
+      elapsed_max = elapsed;
+    }
+    if (elapsed_min == 0 || elapsed_min > elapsed) {
+      elapsed_min = elapsed;
+    }
+    elapsed_total += elapsed;
+
+    printf("gen_seq: ");
+    for (int j = 0; j < INFER_FIX_LEN; j++) {
+      printf("%d ", gen_seq[j]);
+    }
+    printf("\n");
+
+    auto& data = ref_npz[name];
+    auto ptr = data.data<uint16_t>();
+    printf("ref_seq: ");
+    for (int j = 0; j < INFER_FIX_LEN; j++) {
+      printf("%d ", ptr[j]);
+    }
+    printf("\n");
+
+    int err = levenshtein_distance((uint16_t *)gen_seq, 40, ptr, 40);
+    // int err = 0;
+    // for (int i = 0; i < INFER_FIX_LEN; i++) {
+    //   if (gen_seq[i] != ptr[i]) {
+    //     err += 1;
+    //   }
+    // }
+    err_sum += err;
+    printf("%d => error:%d, sum:%d, performance: %fms\n", i, err, err_sum, elapsed / 1000.0);
+  }
+
+  printf("accuracy: %f%%, max/min/avg:%fms / %fms / %fms\n", 100 - (err_sum * 100.0f / (seq_npz.size() * 40)),
+         elapsed_max / 1000.0, elapsed_min / 1000.0, elapsed_total / num_seq / 1000.0);
+  return 0;
+}
\ No newline at end of file
diff --git a/cviruntime/samples_inner/mt_bidir/mt_model.cpp b/cviruntime/samples_inner/mt_bidir/mt_model.cpp
new file mode 100644
index 000000000..1bf8e1405
--- /dev/null
+++ b/cviruntime/samples_inner/mt_bidir/mt_model.cpp
@@ -0,0 +1,227 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <assert.h>
+#include <iostream>
+#include <string>
+#include "cviruntime.h"
+#include "cnpy.h"
+#include "mt_model.hpp"
+
+static bf16_t mask_val() {
+  float val = -50;
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+  return ((bf16_t *)(&val))[0];
+#else
+  return ((bf16_t *)(&val))[1];
+#endif
+}
+
+static inline float BF16(const bf16_t & data) {
+  float data_f32 = 0.0f;
+  uint16_t *p_data_bf16 = (uint16_t*)(&data_f32);
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+  p_data_bf16[0] = data;
+#else
+  p_data_bf16[1] = data;
+#endif
+  return data_f32;
+}
+
+Encoder::Encoder(const char *model_file) {
+  int ret = CVI_NN_RegisterModel(model_file, &model);
+  if (ret != CVI_RC_SUCCESS) {
+    printf("CVI_NN_RegisterModel failed, err %d\n", ret);
+    exit(1);
+  }
+  ret = CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num,
+                                     &output_tensors, &output_num);
+  if (ret != CVI_RC_SUCCESS) {
+    printf("CVI_NN_GetInputOutputTensors failed, err %d\n", ret);
+    exit(1);
+  }
+  assert(input_num == 2);
+  assert(output_num == 1);
+  src_seq = &input_tensors[0];
+  src_mask = &input_tensors[1];
+  enc_output = &output_tensors[0];
+  for (int i = 0; i < input_num; i++) {
+    std::cout << "input => " << input_tensors[i].name << "\n";
+  }
+}
+
+Encoder::Encoder(CVI_MODEL_HANDLE main_model) {
+  int ret = CVI_NN_CloneModel(main_model, &model);
+  if (ret != CVI_RC_SUCCESS) {
+    printf("CVI_NN_CloneModel failed, err %d\n", ret);
+    exit(1);
+  }
+  CVI_NN_SetConfig(model, OPTION_PROGRAM_INDEX, 1);
+  ret = CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num,
+                                     &output_tensors, &output_num);
+  if (ret != CVI_RC_SUCCESS) {
+    printf("CVI_NN_GetInputOutputTensors failed, err %d\n", ret);
+    exit(1);
+  }
+  assert(input_num == 2);
+  assert(output_num == 1);
+  src_seq = &input_tensors[0];
+  src_mask = &input_tensors[1];
+  enc_output = &output_tensors[0];
+  for (int i = 0; i < input_num; i++) {
+    std::cout << "input => " << input_tensors[i].name << "\n";
+  }
+}
+
+void Encoder::gen_src_mask(int16_t *seq, int32_t size) {
+  auto ptr = (bf16_t *)CVI_NN_TensorPtr(src_mask);
+  assert(CVI_NN_TensorCount(src_mask) == size);
+  auto filled_val = mask_val();
+  for (int i = 0; i < size; i++) {
+    ptr[i] = (seq[i] == 0) ? filled_val : 0;
+  }
+}
+
+bf16_t *Encoder::get_mask() {
+  return (bf16_t *)CVI_NN_TensorPtr(src_mask);
+}
+
+bf16_t* Encoder::run(int16_t *seq, int32_t size) {
+  // fill src_seq to tensor 0
+  CVI_NN_SetTensorPtr(src_seq, seq);
+  // generate src mask to tensor 1
+  gen_src_mask(seq, size);
+
+  // run inference
+  CVI_NN_Forward(model, input_tensors, input_num,
+                 output_tensors, output_num);
+
+  return (bf16_t *)CVI_NN_TensorPtr(enc_output);
+}
+
+Decoder::Decoder(CVI_MODEL_HANDLE main_model, int32_t program_index, int32_t max_step)
+    : max_step(max_step) {
+  int ret = CVI_NN_CloneModel(main_model, &model);
+  if (ret != CVI_RC_SUCCESS) {
+    printf("CVI_NN_CloneModel failed, err %d\n", ret);
+    exit(1);
+  }
+  CVI_NN_SetConfig(model, OPTION_PROGRAM_INDEX, program_index);
+  ret = CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num,
+                                     &output_tensors, &output_num);
+  if (ret != CVI_RC_SUCCESS) {
+    printf("CVI_NN_GetInputOutputTensors failed, err %d\n", ret);
+    exit(1);
+  }
+  assert(input_num == 5);
+  assert(output_num == 1);
+  trg_seq = &input_tensors[0];
+  enc_output = &input_tensors[1];
+  src_mask = &input_tensors[2];
+  trg_mask = &input_tensors[3];
+  trg_step = &input_tensors[4];
+  dec_output = &output_tensors[0];
+  width = dec_output->shape.dim[1];
+  for (int i = 0; i < input_num; i++) {
+    std::cout << "input => " << input_tensors[i].name << "\n";
+  }
+  std::cout << max_step << "- Decoder: tensors: "
+            << trg_seq->name << ", "
+            << trg_mask->name << ", "
+            << enc_output->name << ", "
+            << src_mask->name << ", "
+            << trg_step->name << ", "
+            << dec_output->name << ", width:"
+            << width << "\n";
+  // generate default trg mask
+  gen_trg_mask();
+  is_fix8b = (dec_output->fmt != CVI_FMT_BF16);
+}
+
+void Decoder::gen_trg_mask() {
+  auto filled_val = mask_val();
+  auto ptr = (bf16_t *)CVI_NN_TensorPtr(trg_mask);
+  for (int i = 0; i < max_step; i++) {
+    for (int j = 0; j < max_step; j++) {
+      ptr[i * max_step + j] = (j > i) ? filled_val : 0;
+    }
+  }
+}
+
+int16_t Decoder::argmax_int8() {
+  auto ptr = (int8_t *)CVI_NN_TensorPtr(dec_output);
+  int idx = 0;
+  auto max_value = ptr[0];
+  for (int j = 1; j < width; j++) {
+    auto val = ptr[j];
+    if (val > max_value) {
+      idx = j;
+      max_value = val;
+    }
+  }
+  return idx;
+}
+
+int16_t Decoder::argmax_bf16() {
+  auto ptr = (bf16_t *)CVI_NN_TensorPtr(dec_output);
+  int idx = 0;
+  auto max_value = BF16(ptr[0]);
+  for (int j = 1; j < width; j++) {
+    auto val = BF16(ptr[j]);
+    if (val > max_value) {
+      idx = j;
+      max_value = val;
+    }
+  }
+  return idx;
+}
+
+int16_t Decoder::argmax() {
+  if (is_fix8b) {
+    return argmax_int8();
+  } else {
+    return argmax_bf16();
+  }
+}
+
+int16_t Decoder::run(int16_t *step, int16_t *seq, bf16_t *enc, bf16_t *mask) {
+  // fill data to input tensor
+  CVI_NN_SetTensorPtr(trg_seq, seq);
+  CVI_NN_SetTensorPtr(enc_output, enc);
+  CVI_NN_SetTensorPtr(src_mask, mask);
+  CVI_NN_SetTensorPtr(trg_step, step);
+  // run inference
+  CVI_NN_Forward(model, input_tensors, input_num,
+                 output_tensors, output_num);
+  return argmax();
+}
+
+void MTrans::run(int16_t *seq, int32_t seq_sz, int16_t *gen_seq, int32_t gen_seq_sz) {
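+  // Each step below is dispatched to the smallest decoder program whose
+  // max_step covers the current position (1/10/20/30/39 map to program
+  // indices 1..5 in the MTrans constructor), so short outputs run the
+  // cheapest graph.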
+  // clean gen_seq array.
+  memset(gen_seq, 0, gen_seq_sz * sizeof(int16_t));
+  auto enc_output = encoder->run(seq, seq_sz);
+  auto src_mask = encoder->get_mask();
+
+  uint32_t best_idx = SOS_IDX;
+  gen_seq[0] = best_idx;
+
+  for (int16_t step = 1; step < seq_sz; step++) {
+    int16_t idx = step - 1;
+    if (step <= 1) {
+      best_idx = decoder_1->run(&idx, gen_seq, enc_output, src_mask);
+    } else if (step <= 10) {
+      best_idx = decoder_10->run(&idx, gen_seq, enc_output, src_mask);
+    } else if (step <= 20) {
+      best_idx = decoder_20->run(&idx, gen_seq, enc_output, src_mask);
+    } else if (step <= 30) {
+      best_idx = decoder_30->run(&idx, gen_seq, enc_output, src_mask);
+    } else {
+      best_idx = decoder_39->run(&idx, gen_seq, enc_output, src_mask);
+    }
+    gen_seq[step] = best_idx;
+    if (best_idx == EOS_IDX) {
+      break;
+    }
+  }
+}
diff --git a/cviruntime/samples_inner/mt_bidir/mt_model.hpp b/cviruntime/samples_inner/mt_bidir/mt_model.hpp
new file mode 100644
index 000000000..5217143ee
--- /dev/null
+++ b/cviruntime/samples_inner/mt_bidir/mt_model.hpp
@@ -0,0 +1,125 @@
+#ifndef __SAMPLES_MT_MODEL_HPP
+#define __SAMPLES_MT_MODEL_HPP
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <assert.h>
+#include <iostream>
+#include <vector>
+#include "cviruntime.h"
+#include "cnpy.h"
+
+// special token ids and fixed inference length
+#define LEXICON_SIZE 16002
+#define PAD_IDX 0
+#define SOS_IDX 1
+#define EOS_IDX 2
+#define INFER_FIX_LEN 40
+typedef uint16_t bf16_t;
+
+class Encoder
+{
+public:
+  Encoder(const char *model_file);
+  Encoder(CVI_MODEL_HANDLE model);
+  ~Encoder()
+  {
+    if (model)
+    {
+      CVI_NN_CleanupModel(model);
+    }
+  }
+
+  bf16_t *run(int16_t *seq, int32_t size);
+  bf16_t *get_mask();
+
+public:
+  CVI_MODEL_HANDLE model = nullptr;
+  CVI_TENSOR *src_seq;
+  CVI_TENSOR *src_mask;
+  CVI_TENSOR *enc_output;
+
+private:
+  void gen_src_mask(int16_t *src_seq, int32_t size);
+
+  CVI_TENSOR *input_tensors;
+  CVI_TENSOR *output_tensors;
+  int32_t input_num;
+  int32_t output_num;
+};
+
+class Decoder
+{
+public:
+  Decoder(CVI_MODEL_HANDLE model, int32_t program_idx, int32_t max_step);
+  ~Decoder()
+  {
+    if (model)
+    {
+      CVI_NN_CleanupModel(model);
+    }
+  }
+
+  int16_t run(int16_t *step, int16_t *seq, bf16_t *enc, bf16_t *mask);
+
+public:
+  CVI_TENSOR *trg_seq;
+  CVI_TENSOR *trg_mask;
+  CVI_TENSOR *trg_step;
+  CVI_TENSOR *enc_output;
+  CVI_TENSOR *src_mask;
+  CVI_TENSOR *dec_output;
+  int32_t max_step;
+  int32_t width;
+
+private:
+  void gen_trg_mask();
+  int16_t argmax();
+  int16_t argmax_int8();
+  int16_t argmax_bf16();
+  CVI_MODEL_HANDLE model = nullptr;
+  CVI_TENSOR *input_tensors;
+  CVI_TENSOR *output_tensors;
+  int32_t input_num;
+  int32_t output_num;
+  bool is_fix8b;
+};
+
+class MTrans
+{
+public:
+  MTrans(const char *cvimodel)
+  {
+    encoder = new Encoder(cvimodel);
+    decoder_1 = new Decoder(encoder->model, 1, 1);
+    decoder_10 = new Decoder(encoder->model, 2, 10);
+    decoder_20 = new Decoder(encoder->model, 3, 20);
+    decoder_30 = new Decoder(encoder->model, 4, 30);
+    decoder_39 = new Decoder(encoder->model, 5, 39);
+  }
+
+  ~MTrans()
+  {
+    delete encoder;
+    delete decoder_1;
+    delete decoder_10;
+    delete decoder_20;
+    delete decoder_30;
+    delete decoder_39;
+  }
+
+  void run(int16_t *seq, int32_t seq_sz,
+           int16_t *gen_seq, int32_t gen_seq_sz);
+
+private:
+  Encoder *encoder;
+  Decoder *decoder_1;
+  Decoder *decoder_10;
+  Decoder *decoder_20;
+  Decoder *decoder_30;
+  Decoder *decoder_39;
+};
+
+#endif
\ No newline at end of file
diff --git a/cviruntime/samples_inner/ocr/CMakeLists.txt b/cviruntime/samples_inner/ocr/CMakeLists.txt
new file mode 100644
index 000000000..8face1782
--- /dev/null
+++ b/cviruntime/samples_inner/ocr/CMakeLists.txt
@@ -0,0 +1,30 @@
+cmake_minimum_required(VERSION 2.8.0)
+
+project(ocr C CXX)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS ON)
+
+if(NOT DEFINED TPU_SDK_PATH)
+  message(FATAL_ERROR "Please set TPU_SDK_PATH to point to the TPU_SDK installation")
+endif()
+include_directories(${TPU_SDK_PATH}/include)
+link_directories(${TPU_SDK_PATH}/lib)
+
+if(NOT DEFINED OPENCV_PATH)
+  message(FATAL_ERROR "Please set OPENCV_PATH to point to the opencv installation")
+endif()
+include_directories(${OPENCV_PATH}/include)
+link_directories(${OPENCV_PATH}/lib)
+
+set(CVI_LIBS ${CVI_LIBS} cviruntime cvikernel cnpy)
+set(OPENCV_LIBS ${OPENCV_LIBS} opencv_core opencv_imgcodecs opencv_imgproc)
+set(EXTRA_LIBS ${EXTRA_LIBS} dl stdc++ pthread z)
+
+add_executable(ocr ocr.cpp)
+target_link_libraries(ocr
+  ${CVI_LIBS}
+  ${OPENCV_LIBS}
+  ${EXTRA_LIBS})
+install(TARGETS ocr DESTINATION bin)
diff --git a/cviruntime/samples_inner/ocr/build.sh b/cviruntime/samples_inner/ocr/build.sh
new file mode 100755
index 000000000..cc0b73a75
--- /dev/null
+++ b/cviruntime/samples_inner/ocr/build.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+set -xe
+
+echo "TPU_SDK_PATH=$TPU_SDK_PATH"
+
+if [ -z $TPU_SDK_PATH ]; then
+  echo "please set TPU_SDK_PATH"
+fi
+
+mkdir -p build
+cd build
+cmake .. \
+  -DCMAKE_BUILD_TYPE=RELEASE \
+  -DCMAKE_C_FLAGS_RELEASE=-O3 \
+  -DCMAKE_CXX_FLAGS_RELEASE=-O3 \
+  -DCMAKE_TOOLCHAIN_FILE=$TPU_SDK_PATH/cmake/toolchain-aarch64-linux.cmake \
+  -DTPU_SDK_PATH=$TPU_SDK_PATH \
+  -DOPENCV_PATH=$TPU_SDK_PATH/opencv \
+  -DCMAKE_INSTALL_PREFIX=./
+make install
\ No newline at end of file
diff --git a/cviruntime/samples_inner/ocr/ocr.cpp b/cviruntime/samples_inner/ocr/ocr.cpp
new file mode 100644
index 000000000..9fe78ecfd
--- /dev/null
+++ b/cviruntime/samples_inner/ocr/ocr.cpp
@@ -0,0 +1,125 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <assert.h>
+#include <algorithm>
+#include <string>
+#include "cviruntime.h"
+#include "cnpy.h"
+
+class OcrTool {
+public:
+  OcrTool(const char *model_file);
+  ~OcrTool();
+
+  float* run(uint8_t *image);
+
+public:
+  CVI_TENSOR *input;
+  CVI_TENSOR *output;
+
+private:
+  CVI_MODEL_HANDLE model = nullptr;
+  CVI_TENSOR *input_tensors;
+  CVI_TENSOR *output_tensors;
+  int32_t input_num;
+  int32_t output_num;
+};
+
+OcrTool::OcrTool(const char *model_file) {
+  int ret = CVI_NN_RegisterModel(model_file, &model);
+  if (ret != CVI_RC_SUCCESS) {
+    printf("CVI_NN_RegisterModel failed, err %d\n", ret);
+    exit(1);
+  }
+  ret = CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num,
+                                     &output_tensors, &output_num);
+  if (ret != CVI_RC_SUCCESS) {
+    printf("CVI_NN_GetInputOutputTensors failed, err %d\n", ret);
+    exit(1);
+  }
+  assert(input_num == 1);
+  assert(output_num == 1);
+  input = &input_tensors[0];
+  output = &output_tensors[0];
+}
+
+OcrTool::~OcrTool() {
+  if (model) {
+    CVI_NN_CleanupModel(model);
+  }
+}
+
+static int levenshtein_distance(const float* s, int n, const int* t, int m) {
+  ++n; ++m;
+  int* d = new int[n * m];
+  memset(d, 0, sizeof(int) * n * m);
+  for (int i = 1, im = 0; i < m; ++i, ++im) {
+    for (int j = 1, jn = 0; j < n; ++j, ++jn) {
+      if ((int)(s[jn]) == t[im]) {
+        d[(i * n) + j] = d[((i - 1) * n) + (j - 1)];
+      } else {
+        d[(i * n) + j] = std::min(d[(i - 1) * n + j] + 1,         /* A deletion. */
+                         std::min(d[i * n + (j - 1)] + 1,         /* An insertion. */
+                                  d[(i - 1) * n + (j - 1)] + 1)); /* A substitution. */
+      }
+    }
+  }
+  int r = d[n * m - 1];
+  delete [] d;
+  return r;
+}
+
+
+float* OcrTool::run(uint8_t *image) {
+  // fill data to input tensor
+  CVI_NN_SetTensorPtr(input, image);
+  // run inference
+  CVI_NN_Forward(model, input_tensors, input_num, output_tensors, output_num);
+  return (float *)CVI_NN_TensorPtr(output);
+}
+
+int main(int argc, char **argv) {
+  int ret = 0;
+  if (argc < 4) {
+    printf("Usage:\n");
+    printf("   %s images_npz references_npz cvimodel\n", argv[0]);
+    exit(1);
+  }
+  const char *images_npz = argv[1];
+  const char *references_npz = argv[2];
+  const char *cvimodel = argv[3];
+
+  cnpy::npz_t images = cnpy::npz_load(images_npz);
+  if (images.size() == 0) {
+    printf("Failed to load images npz\n");
+  }
+  cnpy::npz_t references = cnpy::npz_load(references_npz);
+  if (references.size() == 0) {
+    printf("Failed to load references npz\n");
+  }
+  assert(images.size() == references.size());
+
+  OcrTool tool(cvimodel);
+
+  int all_cnt = 0;
+  int correct_cnt = 0;
+  for (auto &npy : images) {
+    auto &name = npy.first;
+    auto &image = npy.second;
+    auto *ptr = image.data<uint8_t>();
+
+    float* out = tool.run(ptr);
+
+    auto &refer_vec = references[name];
+    const int* refer = refer_vec.data<int>();
+    int num = (int)refer_vec.num_vals;
+    int dist = levenshtein_distance(out, num, refer, num);
+    correct_cnt += std::max(num - dist, 0);
+    all_cnt += num;
+  }
+
+  printf("acc: %f\n", 100.0 * correct_cnt / all_cnt);
+  return 0;
+}
\ No newline at end of file
diff --git a/cviruntime/samples_inner/tts/CMakeLists.txt b/cviruntime/samples_inner/tts/CMakeLists.txt
new file mode 100644
index 000000000..c8c57a8b2
--- /dev/null
+++ b/cviruntime/samples_inner/tts/CMakeLists.txt
@@ -0,0 +1,30 @@
+cmake_minimum_required(VERSION 2.8.0)
+
+project(t2s C CXX)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS ON)
+
+if(NOT DEFINED TPU_SDK_PATH)
+  message(FATAL_ERROR "Please set TPU_SDK_PATH to point to the TPU_SDK installation")
+endif()
+include_directories(${TPU_SDK_PATH}/include)
+link_directories(${TPU_SDK_PATH}/lib)
+
+if(NOT DEFINED OPENCV_PATH)
+  message(FATAL_ERROR "Please set OPENCV_PATH to point to the opencv installation")
+endif()
+include_directories(${OPENCV_PATH}/include)
+link_directories(${OPENCV_PATH}/lib)
+
+set(CVI_LIBS ${CVI_LIBS} cviruntime cvikernel cnpy)
+set(OPENCV_LIBS ${OPENCV_LIBS} opencv_core opencv_imgcodecs opencv_imgproc)
+set(EXTRA_LIBS ${EXTRA_LIBS} dl stdc++ pthread z)
+
+add_executable(t2s t2s.cpp t2s_model.cpp)
+target_link_libraries(t2s
+  ${CVI_LIBS}
+  ${OPENCV_LIBS}
+  ${EXTRA_LIBS})
+install(TARGETS t2s DESTINATION bin)
diff --git a/cviruntime/samples_inner/tts/README b/cviruntime/samples_inner/tts/README
new file mode 100644
index 000000000..f9ce442a4
--- /dev/null
+++ b/cviruntime/samples_inner/tts/README
@@ -0,0 +1,7 @@
+1. build tts binary in src directory.
+2. cp binary, npz and cvimodel to evb:
+3. execute cmds:
+   # bash
+   # cd cvitek_tpu_sdk && source envs_tpu_sdk.sh
+   # ./tts mt_encoder_cv1829.cvimodel mt_decoder_600_cv1829.cvimodel tts_input_x.npz
+
diff --git a/cviruntime/samples_inner/tts/build.sh b/cviruntime/samples_inner/tts/build.sh
new file mode 100755
index 000000000..af64f23cc
--- /dev/null
+++ b/cviruntime/samples_inner/tts/build.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+set -xe
+
+TPU_SDK_PATH=/work/sdk/install/soc_cv1826_wevb_0005a_spinand/tpu_32/cvitek_tpu_sdk
+echo "TPU_SDK_PATH=$TPU_SDK_PATH"
+
+if [ -z $TPU_SDK_PATH ]; then
+  echo "please set TPU_SDK_PATH"
+fi
+
+mkdir -p build
+cd build
+cmake .. \
+cmake .. \
+  -DCMAKE_BUILD_TYPE=RELEASE \
+  -DCMAKE_C_FLAGS_RELEASE=-O3 \
+  -DCMAKE_CXX_FLAGS_RELEASE=-O3 \
+  -DCMAKE_TOOLCHAIN_FILE=$TPU_SDK_PATH/cmake/toolchain-linux-gnueabihf.cmake \
+  -DTPU_SDK_PATH=$TPU_SDK_PATH \
+  -DOPENCV_PATH=$TPU_SDK_PATH/opencv \
+  -DCMAKE_INSTALL_PREFIX=./
+make install
+
diff --git a/cviruntime/samples_inner/tts/t2s.cpp b/cviruntime/samples_inner/tts/t2s.cpp
new file mode 100644
index 000000000..996c01249
--- /dev/null
+++ b/cviruntime/samples_inner/tts/t2s.cpp
@@ -0,0 +1,64 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <vector>
+#include <string>
+#include <sys/time.h>
+#include "cviruntime.h"
+#include "cnpy.h"
+#include "t2s_model.hpp"
+
+using namespace t2s;
+
+static void load_input(const char *input_npz, uint16_t *text, uint16_t *lang, uint16_t *speaker, int32_t &text_sz) {
+  cnpy::npz_t npz = cnpy::npz_load(input_npz);
+  if (npz.size() == 0) {
+    printf("Failed to load input npz\n");
+    exit(1);
+  }
+  memcpy(text, npz["text"].data<uint16_t>(), 200 * sizeof(uint16_t));
+  memcpy(lang, npz["lang"].data<uint16_t>(), 200 * sizeof(uint16_t));
+  memcpy(speaker, npz["speaker"].data<uint16_t>(), 200 * sizeof(uint16_t));
+  text_sz = npz["text_len"].data<int32_t>()[0];
+}
+
+static void saveToNpz(const std::string &file, float *data, size_t size) {
+  cnpy::npz_t npz;
+  std::vector<size_t> shape = {1, size};
+  cnpy::npz_add_array(npz, "mel_out", data, shape);
+  cnpy::npz_save_all(file, npz);
+}
+
+int main(int argc, char **argv) {
+  if (argc < 4) {
+    printf("Usage:\n");
+    printf("   %s enc_cvimodel dec_cvimodel input_npz\n", argv[0]);
+    exit(1);
+  }
+  uint16_t text[200];
+  uint16_t lang[200];
+  uint16_t speaker[200];
+  int32_t text_sz;
+  load_input(argv[3], text, lang, speaker, text_sz);
+  printf("load input\n");
+
+  T2SModel tts(argv[1], argv[2]);
+
+  struct timeval t0, t1;
+  gettimeofday(&t0, NULL);
+
+  int32_t duration = 0;
+  auto mel_out = tts.run(text, text_sz, lang, speaker, duration);
+
+  gettimeofday(&t1, NULL);
+  long elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
+
+  printf("Performance: %f ms\n", elapsed/1000.0);
+
+  printf("duration:%d\n", duration);
+  printf("dump mel_out.npz\n");
+  saveToNpz("t2s_mel_out.npz", mel_out, 80 * MAX_DECODE_LEN);
+
+  return 0;
+}
\ No newline at end of file
diff --git a/cviruntime/samples_inner/tts/t2s_model.cpp b/cviruntime/samples_inner/tts/t2s_model.cpp
new file mode 100644
index 000000000..c65043ca7
--- /dev/null
+++ b/cviruntime/samples_inner/tts/t2s_model.cpp
@@ -0,0 +1,121 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <cmath>
+#include "cviruntime.h"
+#include "t2s_model.hpp"
+
+namespace t2s {
+
+static inline bf16_t BF16(float val) {
+  return ((bf16_t *)(&val))[1];
+}
+
+static inline float FP32(bf16_t val) {
+  float ret = 0;
+  ((bf16_t *)(&ret))[1] = val;
+  return ret;
+}
+
+static void fill_mask(CVI_TENSOR *tensor, int32_t text_sz) {
+  bf16_t *text_mask = (bf16_t *)CVI_NN_TensorPtr(tensor);
+  size_t mask_sz = CVI_NN_TensorCount(tensor);
+  bf16_t zero = BF16(0);
+  bf16_t one = BF16(1);
+  for (size_t i = 0; i < (size_t)text_sz; i++) {
+    text_mask[i] = one;
+  }
+  for (size_t i = text_sz; i < mask_sz; i++) {
+    text_mask[i] = zero;
+  }
+}
+
+T2SEncoder::T2SEncoder(const char *model_file) {
+  int ret = CVI_NN_RegisterModel(model_file, &model);
+  if (ret != CVI_RC_SUCCESS) {
+    printf("CVI_NN_RegisterModel failed, err %d\n", ret);
+    exit(1);
+  }
+  ret = CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num,
+                                     &output_tensors, &output_num);
+  if (ret != CVI_RC_SUCCESS) {
+    printf("CVI_NN_GetInputOutputTensors failed, err %d\n", ret);
+    exit(1);
+  }
+  assert(input_num == 4);
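+  // assumed cvimodel interface (matches run() below): inputs are
+  // {text, text_mask, lang, speaker}; outputs are {hiddens, durations}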
+  assert(output_num == 2);
+  durations = new int32_t[MAX_TEXT_SIZE];
+  hiddens = (bf16_t *)CVI_NN_TensorPtr(&output_tensors[0]);
+}
+
+int32_t T2SEncoder::run(uint16_t *text, int32_t text_sz, uint16_t *lang, uint16_t *speaker) {
+  CVI_NN_SetTensorPtr(&input_tensors[0], text);
+  CVI_NN_SetTensorPtr(&input_tensors[2], lang);
+  CVI_NN_SetTensorPtr(&input_tensors[3], speaker);
+  fill_mask(&input_tensors[1], text_sz);
+  // run inference
+  CVI_NN_Forward(model, input_tensors, input_num,
+                 output_tensors, output_num);
+  return regulate_durations(text_sz);
+}
+
+int32_t T2SEncoder::regulate_durations(int32_t text_sz) {
+  bf16_t *ptr = (bf16_t *)CVI_NN_TensorPtr(&output_tensors[1]);
+  int32_t total_duration = 0;
+  for (int32_t i = 0; i < text_sz; i++) {
+    int32_t d = (int32_t)std::round(FP32(ptr[i]));
+    durations[i] = (d <= 0) ? 1 : d;
+    total_duration += durations[i];
+  }
+  for (int32_t i = text_sz; i < MAX_TEXT_SIZE; i++) {
+    durations[i] = 0;
+  }
+  return total_duration;
+}
+
+T2SDecoder::T2SDecoder(const char *model_file) {
+  int ret = CVI_NN_RegisterModel(model_file, &model);
+  if (ret != CVI_RC_SUCCESS) {
+    printf("CVI_NN_RegisterModel failed, err %d\n", ret);
+    exit(1);
+  }
+  ret = CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num,
+                                     &output_tensors, &output_num);
+  if (ret != CVI_RC_SUCCESS) {
+    printf("CVI_NN_GetInputOutputTensors failed, err %d\n", ret);
+    exit(1);
+  }
+  assert(input_num == 4);
+  assert(output_num == 1);
+}
+
+void T2SDecoder::expand_hidden_states(CVI_TENSOR *tensor, bf16_t *src, int32_t *durations) {
+  auto dst = (bf16_t *)CVI_NN_TensorPtr(tensor);
+  // expand encoding
+  int offset_dst = 0;
+  int offset_src = 0;
+  for (int i = 0; i < MAX_TEXT_SIZE; i++) {
+    for (int j = 0; j < durations[i]; j++) {
+      memcpy(dst + offset_dst, src + offset_src, 256 * sizeof(bf16_t));
+      offset_dst += 256;
+    }
+    offset_src += 256;
+  }
+  memset(dst + offset_dst, 0, (200 * 256 - offset_dst) * sizeof(bf16_t));
+}
+
+float* T2SDecoder::run(bf16_t *hidden_states, int32_t duration, int32_t *durations) {
+  expand_hidden_states(&input_tensors[0], hidden_states, durations);
+  fill_mask(&input_tensors[1], duration);
+  fill_mask(&input_tensors[2], duration * 2);
+  fill_mask(&input_tensors[3], duration * 4);
+  // run inference
+  CVI_NN_Forward(model, input_tensors, input_num,
+                 output_tensors, output_num);
+  return (float *)CVI_NN_TensorPtr(&output_tensors[0]);
+}
+
+}
+
diff --git a/cviruntime/samples_inner/tts/t2s_model.hpp b/cviruntime/samples_inner/tts/t2s_model.hpp
new file mode 100644
index 000000000..db6ce5f93
--- /dev/null
+++ b/cviruntime/samples_inner/tts/t2s_model.hpp
@@ -0,0 +1,99 @@
+#ifndef __SAMPLES_TTS_MODEL_HPP
+#define __SAMPLES_TTS_MODEL_HPP
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <stdint.h>
+#include <cmath>
+#include <vector>
+#include <string>
+#include "cviruntime.h"
+#include "cnpy.h"
+
+namespace t2s {
+
+#define MAX_TEXT_SIZE 200
+#define MAX_DECODE_LEN 800
+typedef uint16_t bf16_t;
+
+class T2SEncoder {
+public:
+  T2SEncoder(const char *cvimodel);
+  ~T2SEncoder() {
+    if (model)
+      CVI_NN_CleanupModel(model);
+    if (durations)
+      delete[] durations;
+  }
+
+  // forward and get total durations and sequence.
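+  // returns the summed per-token durations; hidden states and the
+  // regulated durations are exposed through the public members below.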
+  int32_t run(uint16_t *text, int32_t text_sz, uint16_t *lang, uint16_t *speaker);
+
+private:
+  int32_t regulate_durations(int32_t text_sz);
+
+public:
+  CVI_MODEL_HANDLE model = nullptr;
+  bf16_t *hiddens;
+  int32_t *durations;
+
+private:
+  CVI_TENSOR *input_tensors;
+  CVI_TENSOR *output_tensors;
+  int32_t input_num;
+  int32_t output_num;
+};
+
+class T2SDecoder {
+public:
+  T2SDecoder(const char *model_file);
+  ~T2SDecoder() {
+    if (model) {
+      CVI_NN_CleanupModel(model);
+    }
+  }
+
+  float* run(bf16_t *hidden_states, int32_t duration, int32_t *durations);
+
+private:
+  void expand_hidden_states(CVI_TENSOR *tensor, bf16_t *hidden_states, int32_t *durations);
+
+private:
+  CVI_MODEL_HANDLE model = nullptr;
+  CVI_TENSOR *input_tensors;
+  CVI_TENSOR *output_tensors;
+  int32_t input_num;
+  int32_t output_num;
+};
+
+
+class T2SModel {
+public:
+  T2SModel(const char *enc_cvimodel, const char *dec_cvimodel) {
+    encoder = new T2SEncoder(enc_cvimodel);
+    decoder = new T2SDecoder(dec_cvimodel);
+  }
+
+  ~T2SModel() {
+    if (encoder)
+      delete encoder;
+    if (decoder)
+      delete decoder;
+  }
+
+  float* run(uint16_t *text, int32_t text_sz, uint16_t *lang, uint16_t *speaker, int32_t &duration) {
+    duration = encoder->run(text, text_sz, lang, speaker);
+    auto mel_out = decoder->run(encoder->hiddens, duration, encoder->durations);
+    duration *= 4;
+    return mel_out; // shape is (1x80x800)
+  }
+
+private:
+  T2SEncoder *encoder = nullptr;
+  T2SDecoder *decoder = nullptr;
+};
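+
+// Typical call sequence (see t2s.cpp): construct T2SModel with the encoder
+// and decoder cvimodels, then call run() once per utterance; `duration`
+// returns the mel frame count and the buffer holds 80 x MAX_DECODE_LEN floats.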
+
+}
+
+#endif
\ No newline at end of file
diff --git a/cviruntime/samples_inner/vocoder/CMakeLists.txt b/cviruntime/samples_inner/vocoder/CMakeLists.txt
new file mode 100644
index 000000000..e8cf01dee
--- /dev/null
+++ b/cviruntime/samples_inner/vocoder/CMakeLists.txt
@@ -0,0 +1,30 @@
+cmake_minimum_required(VERSION 2.8.0)
+
+project(cvi_sample_vocoder C CXX)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS ON)
+
+if(NOT DEFINED TPU_SDK_PATH)
+  message(FATAL_ERROR "Please set TPU_SDK_PATH to point to the TPU_SDK installation")
+endif()
+include_directories(${TPU_SDK_PATH}/include)
+link_directories(${TPU_SDK_PATH}/lib)
+
+if(NOT DEFINED OPENCV_PATH)
+  message(FATAL_ERROR "Please set OPENCV_PATH to point to the opencv installation")
+endif()
+include_directories(${OPENCV_PATH}/include)
+link_directories(${OPENCV_PATH}/lib)
+
+set(CVI_LIBS ${CVI_LIBS} cviruntime cvikernel cnpy)
+set(OPENCV_LIBS ${OPENCV_LIBS} opencv_core opencv_imgcodecs opencv_imgproc)
+set(EXTRA_LIBS ${EXTRA_LIBS} dl stdc++ pthread z)
+
+add_executable(vocoder vocoder.cpp vocoder_model.cpp)
+target_link_libraries(vocoder
+    ${CVI_LIBS}
+    ${OPENCV_LIBS}
+    ${EXTRA_LIBS})
+install(TARGETS vocoder DESTINATION bin)
diff --git a/cviruntime/samples_inner/vocoder/README b/cviruntime/samples_inner/vocoder/README
new file mode 100644
index 000000000..3f910da4b
--- /dev/null
+++ b/cviruntime/samples_inner/vocoder/README
@@ -0,0 +1,8 @@
+1. build vocoder binary in src directory.
+2. cp binary, npz and cvimodel to evb:
+3. cp cvitek_tpu_sdk.tar.gz to evb and extract it to /mnt/data/
+4. execute cmds:
+   # bash
+   # cd cvitek_tpu_sdk && source envs_tpu_sdk.sh
+   # ./vocoder mt_encoder_decoder_0_10_20_30_39_mix_2.cvimodel mt_input_x.npz
+
diff --git a/cviruntime/samples_inner/vocoder/build.sh b/cviruntime/samples_inner/vocoder/build.sh
new file mode 100755
index 000000000..af64f23cc
--- /dev/null
+++ b/cviruntime/samples_inner/vocoder/build.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+set -xe
+
+TPU_SDK_PATH=${TPU_SDK_PATH:-/work/sdk/install/soc_cv1826_wevb_0005a_spinand/tpu_32/cvitek_tpu_sdk}
+echo "TPU_SDK_PATH=$TPU_SDK_PATH"
+
+if [ -z "$TPU_SDK_PATH" ]; then
+  echo "please set TPU_SDK_PATH"
+  exit 1
+fi
+
+mkdir -p build
+cd build
+cmake .. \
+  -DCMAKE_BUILD_TYPE=RELEASE \
+  -DCMAKE_C_FLAGS_RELEASE=-O3 \
+  -DCMAKE_CXX_FLAGS_RELEASE=-O3 \
+  -DCMAKE_TOOLCHAIN_FILE=$TPU_SDK_PATH/cmake/toolchain-linux-gnueabihf.cmake \
+  -DTPU_SDK_PATH=$TPU_SDK_PATH \
+  -DOPENCV_PATH=$TPU_SDK_PATH/opencv \
+  -DCMAKE_INSTALL_PREFIX=./
+make install
+
diff --git a/cviruntime/samples_inner/vocoder/vocoder.cpp b/cviruntime/samples_inner/vocoder/vocoder.cpp
new file mode 100644
index 000000000..304df5277
--- /dev/null
+++ b/cviruntime/samples_inner/vocoder/vocoder.cpp
@@ -0,0 +1,69 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <cmath>
+#include <vector>
+#include <string>
+#include "cviruntime.h"
+#include "cnpy.h"
+#include "vocoder_model.hpp"
+#include <iostream>
+#include <fstream>
+
+void dump_info(const char *path) {
+  std::string line;
+  std::ifstream file(path);
+  std::cout << "dump " << path << "\n";
+  while (getline(file, line)) {
+    std::cout << line << "\n";
+  }
+  file.close();
+  std::cout << "=======\n";
+}
+
+int main(int argc, char **argv) {
+  if (argc < 4) {
+    printf("Usage:\n");
+    printf("   %s cvimodel input_npz ref_npz\n", argv[0]);
+    exit(1);
+  }
+  cnpy::npz_t input_npz = cnpy::npz_load(argv[2]);
+  if (input_npz.size() == 0) {
+    printf("Failed to load input npz\n");
+    exit(1);
+  }
+  auto &input_arr = input_npz["input"];
+
+  cnpy::npz_t ref_npz = cnpy::npz_load(argv[3]);
+  if (ref_npz.size() == 0) {
+    printf("Failed to load ref npz\n");
+    exit(1);
+  }
+  auto &ref_arr = ref_npz["522_Mul_dequant"];
+
+  dump_info("/proc/meminfo");
+  dump_info("/sys/kernel/debug/ion/cvi_carveout_heap_dump/alloc_mem");
+  dump_info("/sys/kernel/debug/ion/cvi_carveout_heap_dump/summary");
+  Vocoder vocoder(argv[1]);
+  dump_info("/proc/meminfo");
+  dump_info("/sys/kernel/debug/ion/cvi_carveout_heap_dump/alloc_mem");
+  dump_info("/sys/kernel/debug/ion/cvi_carveout_heap_dump/summary");
+
+  int32_t out_size;
+  float *out = vocoder.run(input_arr.data<float>(),
+                           (int32_t)input_arr.num_vals, out_size);
+  assert((int32_t)ref_arr.num_vals == out_size);
+
+  int cnt = 10;
+  float *ref = ref_arr.data<float>();
+  for (int i = 0; i < (int)out_size; i++) {
+    float ref_val = ref[i];
+    if (std::fabs(out[i] - ref_val) > 0.0001) {
+      printf("compare failed at %d, %f vs %f\n", i, out[i], ref_val);
+      if (cnt-- < 0)
+        assert(0);
+    }
+  }
+  printf("compare passed\n");
+  return 0;
+}
\ No newline at end of file
diff --git a/cviruntime/samples_inner/vocoder/vocoder_model.cpp b/cviruntime/samples_inner/vocoder/vocoder_model.cpp
new file mode 100644
index 000000000..ac5eafc23
--- /dev/null
+++ b/cviruntime/samples_inner/vocoder/vocoder_model.cpp
@@ -0,0 +1,114 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <cmath>
+#include <vector>
+#include <iostream>
+#include "cviruntime.h"
+#include "cnpy.h"
+#include "vocoder_model.hpp"
+
+VocoderModel::VocoderModel(const char *model_file) {
+  int ret = CVI_NN_RegisterModel(model_file, &model);
+  if (ret != CVI_RC_SUCCESS) {
+    printf("CVI_NN_RegisterModel failed, err %d\n", ret);
+    exit(1);
+  }
+  CVI_NN_SetConfig(model, OPTION_BATCH_SIZE, 1);
+  CVI_NN_SetConfig(model, OPTION_PROGRAM_INDEX, 5);
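+  // program index 5 selects the largest (600-frame) program of the
+  // multi-program cvimodel; Vocoder below clones indices 0-4 for 100-500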
+  ret = CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num,
+                                     &output_tensors, &output_num);
+  if (ret != CVI_RC_SUCCESS) {
+    printf("CVI_NN_GetInputOutputTensors failed, err %d\n", ret);
+    exit(1);
+  }
+  src = &input_tensors[0];
+  output = &output_tensors[0];
+  std::cout << "Vocoder tensors: " << src->name
+            << ", " << output->name << "\n";
+}
+
+VocoderModel::VocoderModel(CVI_MODEL_HANDLE main_model, int32_t pidx) {
+  int ret = CVI_NN_CloneModel(main_model, &model);
+  if (ret != CVI_RC_SUCCESS) {
+    printf("CVI_NN_CloneModel failed, err %d\n", ret);
+    exit(1);
+  }
+  CVI_NN_SetConfig(model, OPTION_BATCH_SIZE, 1);
+  CVI_NN_SetConfig(model, OPTION_PROGRAM_INDEX, pidx);
+  ret = CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num,
+                                     &output_tensors, &output_num);
+  if (ret != CVI_RC_SUCCESS) {
+    printf("CVI_NN_GetInputOutputTensors failed, err %d\n", ret);
+    exit(1);
+  }
+  src = &input_tensors[0];
+  output = &output_tensors[0];
+  std::cout << "Vocoder tensors: " << src->name
+            << ", " << output->name << "\n";
+}
+
+float *VocoderModel::run(float *data, int32_t src_size, int32_t &out_size) {
+  // fill src_seq to tensor 0
+  assert((int32_t)CVI_NN_TensorCount(src) == src_size);
+  CVI_NN_SetTensorPtr(src, data);
+  // memcpy(CVI_NN_TensorPtr(src), data, CVI_NN_TensorSize(src));
+  // run inference
+  CVI_NN_Forward(model, input_tensors, input_num,
+                 output_tensors, output_num);
+  out_size = CVI_NN_TensorCount(output);
+  return (float *)CVI_NN_TensorPtr(output);
+}
+
+Vocoder::Vocoder(const char *model_file) {
+  vc_600 = new VocoderModel(model_file);
+  assert(vc_600);
+  vc_500 = new VocoderModel(vc_600->model, 4);
+  assert(vc_500);
+  vc_400 = new VocoderModel(vc_600->model, 3);
+  assert(vc_400);
+  vc_300 = new VocoderModel(vc_600->model, 2);
+  assert(vc_300);
+  vc_200 = new VocoderModel(vc_600->model, 1);
+  assert(vc_200);
+  vc_100 = new VocoderModel(vc_600->model, 0);
+  assert(vc_100);
+}
+
+Vocoder::~Vocoder() {
+  delete vc_100;
+  delete vc_200;
+  delete vc_300;
+  delete vc_400;
+  delete vc_500;
+  delete vc_600;
+}
+
+float *Vocoder::run(float *data, int32_t src_size,
+                    int32_t &out_size) {
+  VocoderModel *m;
+  switch(src_size) {
+  case 80 * 100:
+    m = vc_100;
+    break;
+  case 80 * 200:
+    m = vc_200;
+    break;
+  case 80 * 300:
+    m = vc_300;
+    break;
+  case 80 * 400:
+    m = vc_400;
+    break;
+  case 80 * 500:
+    m = vc_500;
+    break;
+  case 80 * 600:
+    m = vc_600;
+    break;
+  default:
+    assert(0 && "unsupported input size");
+    return nullptr;
+  }
+  return m->run(data, src_size, out_size);
+}
diff --git a/cviruntime/samples_inner/vocoder/vocoder_model.hpp b/cviruntime/samples_inner/vocoder/vocoder_model.hpp
new file mode 100644
index 000000000..86167c3d1
--- /dev/null
+++ b/cviruntime/samples_inner/vocoder/vocoder_model.hpp
@@ -0,0 +1,55 @@
+#ifndef __SAMPLES_VOCODER_MODEL_HPP
+#define __SAMPLES_VOCODER_MODEL_HPP
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <stdint.h>
+#include <cmath>
+#include <vector>
+#include <string>
+#include "cviruntime.h"
+#include "cnpy.h"
+
+typedef uint16_t bf16_t;
+
+class VocoderModel {
+public:
+  VocoderModel(const char *model_file);
+  VocoderModel(CVI_MODEL_HANDLE main_model, int32_t pidx);
+  ~VocoderModel() {
+    if (model) {
+      CVI_NN_CleanupModel(model);
+    }
+  }
+
+  float* run(float *data, int32_t src_size, int32_t &out_size);
+
+public:
+  CVI_MODEL_HANDLE model = nullptr;
+  CVI_TENSOR *src;
+  CVI_TENSOR *output;
+
+private:
+  CVI_TENSOR *input_tensors;
+  CVI_TENSOR *output_tensors;
+  int32_t input_num;
+  int32_t output_num;
+};
+
+class Vocoder {
+public:
+  Vocoder(const char *model_file);
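+  // run() dispatches to one of the six cloned programs by input size;
+  // src_size must be 80 * {100,200,300,400,500,600} mel values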
+  ~Vocoder();
+  float *run(float *data, int32_t src_size, int32_t &out_size);
+
+private:
+  VocoderModel *vc_100;
+  VocoderModel *vc_200;
+  VocoderModel *vc_300;
+  VocoderModel *vc_400;
+  VocoderModel *vc_500;
+  VocoderModel *vc_600;
+};
+
+#endif
\ No newline at end of file
diff --git a/cviruntime/samples_kernel/add/CMakeLists.txt b/cviruntime/samples_kernel/add/CMakeLists.txt
new file mode 100644
index 000000000..da1c9470b
--- /dev/null
+++ b/cviruntime/samples_kernel/add/CMakeLists.txt
@@ -0,0 +1,15 @@
+cmake_minimum_required(VERSION 2.8.0)
+project(add CXX)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS ON)
+include_directories(${TPU_SDK_PATH}/include)
+link_directories(${TPU_SDK_PATH}/lib)
+
+set(CVI_LIBS cviruntime cvikernel)
+add_executable(add add.cpp)
+target_link_libraries(add ${CVI_LIBS})
+install(TARGETS add DESTINATION bin)
+
+
diff --git a/cviruntime/samples_kernel/add/add.cpp b/cviruntime/samples_kernel/add/add.cpp
new file mode 100644
index 000000000..321a29946
--- /dev/null
+++ b/cviruntime/samples_kernel/add/add.cpp
@@ -0,0 +1,264 @@
+// This demo demonstrates how to add two bf16 tensors on the tpu and
+// verify the result against a cpu reference
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <vector>
+#include <numeric>
+#include <functional>
+#include <algorithm>
+
+#include "add.h"
+#include "cviruntime.h"
+#include "cvikernel/cvikernel.h"
+
+static constexpr int NPU_NUM = 32;
+static constexpr int EU_NUM = 16;
+static constexpr int LOCAL_MEM_SIZE = 1 << 15;
+#define MAX_TIU_NUM (4096 - 32)
+
+typedef uint16_t bf16_t;
+
+static inline int ceiling_func(int numerator, int denominator)
+{
+  return (numerator + denominator - 1) / denominator;
+}
+
+void MyAddOp::tiling(int64_t total)
+{
+  // split `total` bf16 elements into (n,c,h,w) tiles small enough that
+  // two tensors fit in local memory at once
+  tiling_info_t tile;
+  memset(&tile, 0, sizeof(tile));
+  tile.n = 1;
+  tile.c = NPU_NUM;
+  tile.w = EU_NUM;
+  tile.h = std::min(ceiling_func(total, tile.c * tile.w), MAX_TIU_NUM);
+  bool lmem_ok = false;
+  tiles.clear();
+  while (total > 0)
+  {
+    int64_t count = tile.n * tile.c * tile.h * tile.w;
+    cvk_tl_shape_t tl_shape = {
+        .n = tile.n, .c = tile.c, .h = tile.h, .w = tile.w};
+    if (lmem_ok == false)
+    {
+      uint32_t lsize = 2 * ctx->ops->lmem_tensor_to_size(ctx, tl_shape,
+                                                         CVK_FMT_BF16, 1);
+      lmem_ok = (lsize <= (uint32_t)LOCAL_MEM_SIZE);
+    }
+    if (count > total || lmem_ok == false)
+    {
+      if (tile.h > 1)
+      {
+        tile.h--;
+      }
+      else if (tile.w > 1)
+      {
+        tile.w--;
+      }
+      else if (tile.c > 1)
+      {
+        tile.c--;
+      }
+      else
+      {
+        assert(0 && "lmem is not enough");
+      }
+    }
+    else
+    {
+      tiles.emplace_back(tile);
+      total -= count;
+      tile.offset += count * 2;
+    }
+  }
+  assert(total == 0 && "tiling error");
+  return;
+}
+
+void MyAddOp::codeGenBf16(std::vector<int> shape)
+{
+
+  int64_t total = std::accumulate(shape.begin(), shape.end(), (int64_t)1,
+                                  std::multiplies<int64_t>());
+  uint64_t ga_input0 = 0;
+  uint64_t ga_input1 = total * sizeof(bf16_t);
+  uint64_t ga_output = ga_input1 + total * sizeof(bf16_t);
+  tiling(total);
+  for (auto &tile : tiles)
+  {
+    cvk_tl_shape_t tl_shape = {
+        .n = tile.n, .c = tile.c, .h = tile.h, .w = tile.w};
+    auto tl_input0 =
+        ctx->ops->lmem_alloc_tensor(ctx, tl_shape, CVK_FMT_BF16, 1);
+    auto tl_input1 =
+        ctx->ops->lmem_alloc_tensor(ctx, tl_shape, CVK_FMT_BF16, 1);
+    // load input 0
+    cvk_tg_t tg_i0 = {0};
+    tg_i0.fmt = CVK_FMT_BF16;
+    tg_i0.start_address = ga_input0 + tile.offset;
+    tg_i0.base_reg_index = 2;
+    tg_i0.shape = {tile.n, tile.c, tile.h, tile.w};
+    tg_i0.stride =
+        ctx->ops->tg_default_stride(ctx, tg_i0.shape, CVK_FMT_BF16);
+
+    cvk_tdma_g2l_tensor_copy_param_t p0 = {0};
+    p0.src = &tg_i0;
+    p0.dst = tl_input0;
+    p0.layer_id = 0;
+    ctx->ops->tdma_g2l_bf16_tensor_copy(ctx, &p0);
+
+    // load input 1
+    cvk_tg_t tg_i1 = {0};
+    tg_i1.fmt = CVK_FMT_BF16;
+    tg_i1.start_address = ga_input1 + tile.offset;
+    tg_i1.base_reg_index = 2;
+    tg_i1.shape = {tile.n, tile.c, tile.h, tile.w};
+    tg_i1.stride =
+        ctx->ops->tg_default_stride(ctx, tg_i1.shape, CVK_FMT_BF16);
+
+    cvk_tdma_g2l_tensor_copy_param_t p1 = {0};
+    p1.src = &tg_i1;
+    p1.dst = tl_input1;
+    p1.layer_id = 0;
+    ctx->ops->tdma_g2l_bf16_tensor_copy(ctx, &p1);
+
+    // add input 0 and input 1 => input0
+    cvk_tiu_add_param_t p2 = {0};
+    p2.res_low = tl_input0;
+    p2.a_low = tl_input0;
+    p2.b.low = tl_input1;
+    p2.layer_id = 0;
+    ctx->ops->tiu_add(ctx, &p2);
+
+    // store
+    cvk_tg_t tg_dst = {0};
+    tg_dst.fmt = CVK_FMT_BF16;
+    tg_dst.start_address = ga_output + tile.offset;
+    tg_dst.base_reg_index = 2;
+    tg_dst.shape = {tile.n, tile.c, tile.h, tile.w};
+    tg_dst.stride =
+        ctx->ops->tg_default_stride(ctx, tg_dst.shape, CVK_FMT_BF16);
+
+    cvk_tdma_l2g_tensor_copy_param_t p3 = {0};
+    p3.src = tl_input0;
+    p3.dst = &tg_dst;
+    p3.layer_id = 0;
+    ctx->ops->tdma_l2g_bf16_tensor_copy(ctx, &p3);
+
+    ctx->ops->lmem_free_tensor(ctx, tl_input1);
+    ctx->ops->lmem_free_tensor(ctx, tl_input0);
+  }
+}
+
+static void jit_compile(uint8_t **cmdbuf, uint32_t &size, std::vector<int> &shape)
+{
+  cvk_reg_info_t req_info;
+
+  memset(&req_info, 0, sizeof(cvk_reg_info_t));
+  strncpy(req_info.chip_ver_str, "cv183x", sizeof(req_info.chip_ver_str) - 1);
+  req_info.cmdbuf_size = 300000;
+  req_info.cmdbuf = (uint8_t *)malloc(req_info.cmdbuf_size);
+  auto cvk_ctx = cvikernel_register(&req_info);
+  MyAddOp add(cvk_ctx);
+  add.codeGenBf16(shape);
+  *cmdbuf = cvk_ctx->ops->acquire_cmdbuf(cvk_ctx, &size);
+}
+
+void add_by_tpu(bf16_t *input0, bf16_t *input1, bf16_t *output, std::vector<int> &shape)
+{
+  int64_t total = std::accumulate(shape.begin(), shape.end(), (int64_t)1, std::multiplies<int64_t>());
+  int64_t tensor_size = total * sizeof(bf16_t);
+  // runtime init
+  CVI_RT_HANDLE ctx = nullptr;
+  CVI_RT_Init(&ctx);
+
+  uint8_t *cmdbuf = nullptr;
+  uint32_t cmdbuf_size = 0;
+
+  // generate cmdbuf
+  jit_compile(&cmdbuf, cmdbuf_size, shape);
+
+  // Alloc device memory for input + output + cmdbuf
+  CVI_RT_MEM shared_mem = CVI_RT_MemAlloc(ctx, tensor_size * 3);
+  CVI_RT_MEM input0_mem = CVI_RT_MemPreAlloc(shared_mem, 0, tensor_size);
+  CVI_RT_MEM input1_mem = CVI_RT_MemPreAlloc(shared_mem, tensor_size, tensor_size);
+  CVI_RT_MEM output_mem = CVI_RT_MemPreAlloc(shared_mem, tensor_size * 2, tensor_size);
+
+  CVI_RT_MEM cmdbuf_mem = nullptr;
+  // Load cmdbuf
+  CVI_RT_LoadCmdbuf(ctx, cmdbuf, cmdbuf_size, CVI_RT_MemGetPAddr(shared_mem), 0, false, &cmdbuf_mem);
+
+  // Get input tensor virtual address
+  bf16_t *input0_ptr = (bf16_t *)CVI_RT_MemGetVAddr(input0_mem);
+  bf16_t *input1_ptr = (bf16_t *)CVI_RT_MemGetVAddr(input1_mem);
+  memcpy(input0_ptr, input0, tensor_size);
+  memcpy(input1_ptr, input1, tensor_size);
+  // Flush cache
+  CVI_RT_MemFlush(ctx, input0_mem);
+  CVI_RT_MemFlush(ctx, input1_mem);
+
+  // Run cmdbuf
+  CVI_RC ret = CVI_RT_RunCmdbuf(ctx, cmdbuf_mem, CVI_RT_MemGetPAddr(shared_mem), 0);
+  assert(ret == 0);
+  // Invalidate cache before reading the output
+  CVI_RT_MemInvld(ctx, output_mem);
+
+  // Get output tensor virtual address
+  bf16_t *output_ptr = (bf16_t *)CVI_RT_MemGetVAddr(output_mem);
+  memcpy(output, output_ptr, tensor_size);
+
+  // Release device memory
+  CVI_RT_MemFree(ctx, cmdbuf_mem);
+  CVI_RT_MemFree(ctx, output_mem);
+  CVI_RT_MemFree(ctx, input1_mem);
+  CVI_RT_MemFree(ctx, input0_mem);
+  CVI_RT_MemFree(ctx, shared_mem);
+  CVI_RT_DeInit(ctx);
+}
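+
+// bf16 below is the upper half of an IEEE-754 float's bit pattern; the
+// [1] indexing in these helpers assumes a little-endian host.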
+static inline bf16_t BF16(const float &data)
+{
+  uint16_t *p_data_bf16 = (uint16_t *)(&data);
+  return p_data_bf16[1];
+}
+
+static inline float FP32(const bf16_t &data)
+{
+  float data_f32 = 0.0f;
+  uint16_t *p_data_bf16 = (uint16_t *)(&data_f32);
+  p_data_bf16[1] = data;
+  return data_f32;
+}
+
+int main(int argc, char *argv[])
+{
+  std::vector<int> shape = {1, 3, 1600, 2500};
+  int64_t total = std::accumulate(shape.begin(), shape.end(), (int64_t)1, std::multiplies<int64_t>());
+  std::vector<bf16_t> input0(total);
+  std::vector<bf16_t> input1(total);
+  std::vector<bf16_t> output(total);
+  std::vector<float> output_cpu(total);
+  for (int64_t i = 0; i < total; i++)
+  {
+    float data0 = (float)(i % 255);
+    float data1 = (float)((i + 128) % 255);
+    input0[i] = BF16(data0);
+    input1[i] = BF16(data1);
+    output_cpu[i] = data0 + data1;
+  }
+
+  add_by_tpu(input0.data(), input1.data(), output.data(), shape);
+  printf(">> cpu output: ");
+  for (int64_t i = 0; i < 16; i++)
+  {
+    printf("%f ", output_cpu[i]);
+  }
+
+  printf("\n>> tpu output: ");
+  for (int64_t i = 0; i < 16; i++)
+  {
+    printf("%f ", FP32(output[i]));
+  }
+  printf("\n");
+
+  return 0;
+}
diff --git a/cviruntime/samples_kernel/add/add.h b/cviruntime/samples_kernel/add/add.h
new file mode 100644
index 000000000..a703cd8a0
--- /dev/null
+++ b/cviruntime/samples_kernel/add/add.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) Cvitek Co., Ltd. 2019-2020. All rights reserved.
+ */
+#ifndef MYADD_OP_H_
+#define MYADD_OP_H_
+
+#include <vector>
+#include <cvikernel/cvikernel.h>
+
+class MyAddOp {
+public:
+  MyAddOp(cvk_context_t *ctx) : ctx(ctx) {}
+  void codeGenBf16(std::vector<int> shape);
+
+private:
+  typedef struct tiling_info {
+    uint32_t n;
+    uint32_t c;
+    uint32_t h;
+    uint32_t w;
+    uint64_t offset; // gmem offset
+  } tiling_info_t;
+  cvk_context_t *ctx;
+  std::vector<tiling_info_t> tiles;
+  void tiling(int64_t total);
+
+};
+
+#endif
diff --git a/cviruntime/samples_kernel/tensor_copy/cmd.txt b/cviruntime/samples_kernel/tensor_copy/cmd.txt
new file mode 100644
index 000000000..e4f132e00
--- /dev/null
+++ b/cviruntime/samples_kernel/tensor_copy/cmd.txt
@@ -0,0 +1,4 @@
+aarch64-linux-gnu-gcc -I../cvitek_tpu_sdk/include -L../cvitek_tpu_sdk/lib -lcvikernel -lcviruntime -O3 -o tensor_copy tensor_copy.c
+
+gcc -I../cvitek_mlir/include -L../cvitek_mlir/lib -O3 -o tensor_copy tensor_copy.c -lcvicmodel -lcvikernel -lcviruntime
+
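+# the aarch64 command cross-compiles against the evb tpu sdk; the gcc
+# command builds an x86 binary that runs on the cvicmodel simulator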
diff --git a/cviruntime/samples_kernel/tensor_copy/tensor_copy.c b/cviruntime/samples_kernel/tensor_copy/tensor_copy.c
new file mode 100644
index 000000000..3b31cc653
--- /dev/null
+++ b/cviruntime/samples_kernel/tensor_copy/tensor_copy.c
@@ -0,0 +1,246 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <stdint.h>
+#include <inttypes.h>
+
+#include "cvikernel/cvikernel.h"
+#include "cviruntime_context.h"
+
+static void test_tdma_g2l(
+    CVI_RT_HANDLE rt_handle,
+    cvk_context_t *cvk_ctx,
+    const cvk_tl_t *tl,
+    uint64_t phy_addr,
+    int nc_transpose)
+{
+  cvk_tg_shape_t s;
+  if (nc_transpose) {
+    s.n = tl->shape.c;
+    s.c = tl->shape.n;
+  } else {
+    s.n = tl->shape.n;
+    s.c = tl->shape.c;
+  }
+  s.h = tl->shape.h;
+  s.w = tl->shape.w;
+
+  // setup tg
+  cvk_tg_t tg;
+  tg.base_reg_index = 0;
+  tg.start_address = phy_addr;
+  tg.fmt = CVK_FMT_I8;
+  tg.shape = s;
+  tg.stride = cvk_ctx->ops->tg_default_stride(cvk_ctx, s, CVK_FMT_I8);
+
+  // apply tdma
+  if (nc_transpose) {
+    cvk_tdma_g2l_tensor_copy_nc_transposed_param_t p;
+    memset(&p, 0, sizeof(p));
+    p.src = &tg;
+    p.dst = tl;
+    cvk_ctx->ops->tdma_g2l_tensor_copy_nc_transposed(cvk_ctx, &p);
+  } else {
+    cvk_tdma_g2l_tensor_copy_param_t p;
+    memset(&p, 0, sizeof(p));
+    p.src = &tg;
+    p.dst = tl;
+    cvk_ctx->ops->tdma_g2l_tensor_copy(cvk_ctx, &p);
+  }
+
+}
+
+static void test_tdma_l2g(
+    CVI_RT_HANDLE rt_handle,
+    cvk_context_t *cvk_ctx,
+    const cvk_tl_t *tl,
+    uint64_t phy_addr)
+{
+  cvk_tg_shape_t s;
+  s.n = tl->shape.n;
+  s.c = tl->shape.c;
+  s.h = tl->shape.h;
+  s.w = tl->shape.w;
+
+  // setup tg
+  cvk_tg_t tg;
+  tg.base_reg_index = 0;
+  tg.start_address = phy_addr;
+  tg.fmt = CVK_FMT_I8;
+  tg.shape = s;
+  tg.stride = cvk_ctx->ops->tg_default_stride(cvk_ctx, s, CVK_FMT_I8);
+
+  // apply tdma
+  cvk_tdma_l2g_tensor_copy_param_t p;
+  memset(&p, 0, sizeof(p));
+  p.src = tl;
+  p.dst = &tg;
+  cvk_ctx->ops->tdma_l2g_tensor_copy(cvk_ctx, &p);
+}
+
+static void tl_copy_ref(int8_t *a, int8_t *res, uint64_t size)
+{
+  for (uint64_t i = 0; i < size; i++)
+    res[i] = a[i];
+}
+
+static void tl_copy_nc_transpose_ref(int8_t *a, int8_t *res,
+                                     int n, int c, int h, int w)
+{
+  for (int i = 0; i < n; i++) {
+    for (int j = 0; j < c; j++) {
+      for (int k = 0; k < h * w; k++) {
+        res[j * n * h * w + i * h * w + k] = a[i * c * h * w + j * h * w + k];
+      }
+    }
+  }
+}
+
+static void tl_copy_hw_transpose_ref(int8_t *a, int8_t *res,
+                                     int n, int c, int h, int w)
+{
+  for (int i = 0; i < n * c; i++) {
+    for (int j = 0; j < h; j++) {
+      for (int k = 0; k < w; k++) {
+        res[i * h * w + k * h + j] = a[i * h * w + j * w + k];
+      }
+    }
+  }
+}
+
+static int test_tensor_copy(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx,
+                            int n, int c, int h, int w,
+                            int eu_align, int nc_transpose, int hw_transpose)
+{
+  printf("  %s: (%d,%d,%d,%d), eu_align %d, nc_tp %d, hw_tp %d\n",
+         __func__, n, c, h, w, eu_align, nc_transpose, hw_transpose);
+
+  int ret = 0;
+  uint32_t size = n * c * h * w;
+  int8_t *a_data = (int8_t *)malloc(size);
+  assert(a_data && "Expect allocated a_data");
+
+  int8_t *res_data = (int8_t *)malloc(size);
+  assert(res_data && "Expect allocated res_data");
+
+  for (uint32_t i = 0; i < size; i++)
+    a_data[i] = (int8_t)(i % 256);
+
+  int8_t *ref_data = (int8_t *)malloc(size);
+  assert(ref_data && "Expect allocated ref_data");
+  // calc ref
+  if (nc_transpose) {
+    tl_copy_nc_transpose_ref(a_data, ref_data, n, c, h, w);
+  } else if (hw_transpose) {
+    tl_copy_hw_transpose_ref(a_data, ref_data, n, c, h, w);
+  } else {
+    tl_copy_ref(a_data, ref_data, size);
+  }
+
+  cvk_tl_shape_t tl_shape;
+  if (nc_transpose) {
+    tl_shape.n = c;
+    tl_shape.c = n;
+  } else {
+    tl_shape.n = n;
+    tl_shape.c = c;
+  }
+  tl_shape.h = h;
+  tl_shape.w = w;
+
+  cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape,
+                                                   CVK_FMT_I8, eu_align);
+  cvk_tl_t *tl_res = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape,
+                                                     CVK_FMT_I8, eu_align);
+
+  // copy input content from system memory to device memory
+  size_t input_mem_size = tl_shape.n * tl_shape.c * tl_shape.h * tl_shape.w * 1; // CVK_FMT_I8
+  CVI_RT_MEM input_mem = CVI_RT_MemAlloc(rt_handle, input_mem_size);
+  CVI_RT_MemCopyS2D(rt_handle, input_mem, (uint8_t *)a_data);
+
+  // tdma copy descriptor generation, dram to sram
+  test_tdma_g2l(rt_handle, cvk_ctx, tl_a, CVI_RT_MemGetPAddr(input_mem), nc_transpose);
+
+  // tiu copy descriptor generation, sram to sram
+  cvk_tiu_copy_param_t p10;
+  memset(&p10, 0, sizeof(p10));
+  if (hw_transpose) {
+    tl_res->stride.h = 1;
+    tl_res->stride.w = tl_shape.h;
+  }
+  p10.dst = tl_res;
+  p10.src = tl_a;
+  cvk_ctx->ops->tiu_copy(cvk_ctx, &p10);
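+  // with h/w strides swapped above, tiu_copy has written tl_res in
+  // transposed order; update shape and recompute default strides so the
+  // l2g store below reads the tile back out correctly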
+  cvk_tl_shape_t tl_shape_dst = tl_shape;
+  if (hw_transpose) {
+    tl_shape_dst.h = w;
+    tl_shape_dst.w = h;
+  }
+  tl_res->shape = tl_shape_dst;
+  tl_res->stride = cvk_ctx->ops->tl_default_stride(cvk_ctx, tl_shape_dst,
+                                                   CVK_FMT_I8, eu_align);
+
+  // tdma copy descriptor generation, sram to dram
+  size_t result_mem_size = tl_res->shape.n * tl_res->shape.c * tl_res->shape.h * tl_res->shape.w * 1; // CVK_FMT_I8
+  CVI_RT_MEM result_mem = CVI_RT_MemAlloc(rt_handle, result_mem_size);
+  test_tdma_l2g(rt_handle, cvk_ctx, tl_res, CVI_RT_MemGetPAddr(result_mem));
+
+  // driving tpu hardware by descriptor list
+  CVI_RT_Submit(cvk_ctx);
+
+  // copy result content from device memory to system memory
+  CVI_RT_MemCopyD2S(rt_handle, res_data, result_mem);
+
+  for (uint64_t i = 0; i < size; i++) {
+    if (res_data[i] != ref_data[i]) {
+      printf("comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n",
+             i, res_data[i], ref_data[i]);
+      ret = -1;
+    }
+  }
+
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res);
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a);
+
+  free(a_data);
+  free(res_data);
+  free(ref_data);
+  CVI_RT_MemFree(rt_handle, input_mem);
+  CVI_RT_MemFree(rt_handle, result_mem);
+
+  printf("  %s %s\n", __func__, ret ? "FAILED" : "PASSED");
+
+  return ret;
+}
+
+#define CMDBUF_SIZE (512*1024) // Adjust based on test case
+
+int main(int argc, char **argv)
+{
+  int ret = 0;
+
+  CVI_RT_HANDLE rt_handle;
+  cvk_context_t *cvk_ctx = NULL;
+
+  //init
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+
+  printf("Init done\n");
+
+  //use case
+  ret = test_tensor_copy(rt_handle, cvk_ctx, 3, 39, 7, 37, 0, 0, 0);
+  ret |= test_tensor_copy(rt_handle, cvk_ctx, 3, 39, 7, 37, 1, 0, 0);
+  ret |= test_tensor_copy(rt_handle, cvk_ctx, 3, 39, 7, 37, 0, 1, 0); // nc_transpose
+  ret |= test_tensor_copy(rt_handle, cvk_ctx, 3, 39, 7, 37, 0, 0, 1); // hw_transpose
+
+  //deinit
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  printf("%s %s\n", __FILE__, ret ? "FAILED" : "PASSED");
+
+  return ret;
+}
diff --git a/cviruntime/samples_kernel/transpose/CMakeLists.txt b/cviruntime/samples_kernel/transpose/CMakeLists.txt
new file mode 100644
index 000000000..40f59c970
--- /dev/null
+++ b/cviruntime/samples_kernel/transpose/CMakeLists.txt
@@ -0,0 +1,15 @@
+cmake_minimum_required(VERSION 2.8.0)
+project(transpose CXX)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS ON)
+
+include_directories(${TPU_BASE}/cvitek_mlir/include)
+link_directories(${TPU_BASE}/cvitek_mlir/lib)
+
+set(CVI_LIBS cvicmodel cviruntime cvikernel)
+add_executable(transpose transpose.cpp)
+target_link_libraries(transpose ${CVI_LIBS})
+
+
diff --git a/cviruntime/samples_kernel/transpose/README.md b/cviruntime/samples_kernel/transpose/README.md
new file mode 100644
index 000000000..1013e5a1e
--- /dev/null
+++ b/cviruntime/samples_kernel/transpose/README.md
@@ -0,0 +1,61 @@
+### Setup and Build
+1. tar -zxvf cvitek_mlir-ubuntu-18.04.tar.gz
+2. tar -zxvf transpose.tar.gz
+3. cd transpose
+4. mkdir build && cd build
+5. cmake -G Ninja -DTPU_BASE=/work ..
+6. cmake --build .
+
+### CviRuntime API
+1. CVI_RC CVI_RT_Init(CVI_RT_HANDLE \*rt_handle)
+   Initialize the runtime.
+   @param rt_handle runtime handle (context)
+
+2. CVI_RT_MEM CVI_RT_MemAlloc(CVI_RT_HANDLE handle, uint64_t size)
+   Allocate a contiguous block of runtime device memory.
+   @param handle runtime handle
+   @param size size of the memory to allocate
+   @ret descriptor of the allocated device memory
+
+3. CVI_RT_MEM CVI_RT_MemPreAlloc(CVI_RT_MEM mem, uint64_t offset, uint64_t size)
+   Carve a sub-range out of already allocated runtime memory.
+   @param mem device memory descriptor
+   @param offset offset from the physical base address of the device memory, used as the base of the returned block
+   @param size memory size
+   @ret device memory descriptor of the sub-range
+
+4. uint64_t CVI_RT_MemGetPAddr(CVI_RT_MEM mem)
+   Return the physical address of the device memory (for use with cvikernel).
+   @param mem device memory descriptor
+
+5. uint8_t\* CVI_RT_MemGetVAddr(CVI_RT_MEM mem)
+   Return the virtual address of the device memory (for CPU access).
+   @param mem device memory descriptor
+
+6. CVI_RC CVI_RT_LoadCmdbuf(CVI_RT_HANDLE handle, uint8_t \*cmdbuf,
+   uint64_t cmdbuf_sz, uint64_t gaddr_base0, uint64_t gaddr_base1,
+   bool enable_pmu, CVI_RT_MEM *cmdbuf_mem)
+   Load a cmdbuf, copying it from user space (vaddr) to kernel space (paddr).
+   @param handle runtime handle
+   @param cmdbuf cmdbuf generated by cvikernel
+   @param cmdbuf_sz size of the cmdbuf
+   @param gaddr_base0 base register index for neuron tensors (0)
+   @param gaddr_base1 base register index for weights (1)
+   @param enable_pmu whether to enable PMU profiling
+   @param cmdbuf_mem returns the device memory the cmdbuf was loaded into
+
+7. CVI_RC CVI_RT_RunCmdbuf(CVI_RT_HANDLE handle, CVI_RT_MEM cmdbuf_mem,
+   uint64_t gaddr_base2, uint64_t gaddr_base3)
+   Run the cmdbuf.
+   @param handle runtime handle
+   @param cmdbuf_mem device memory descriptor of the cmdbuf
+   @param gaddr_base2 physical address of the input tensor's device memory
+   @param gaddr_base3 physical address of the output tensor's device memory
+
+8. CVI_RC CVI_RT_MemFlush(CVI_RT_HANDLE handle, CVI_RT_MEM mem)
+   Flush the cache for the given device memory.
+   @param handle runtime handle
+   @param mem device memory descriptor
+
+9. CVI_RC CVI_RT_DeInit(CVI_RT_HANDLE handle)
+   Deinitialize the runtime.
+   @param handle runtime handle
+
+For details, see the comments in transpose.cpp.
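+
+### Usage sketch
+A minimal sketch of the call sequence, assuming a cmdbuf has already been
+generated with cvikernel; buffer names and sizes are placeholders and all
+error handling is omitted (see transpose.cpp for the complete flow):
+
+```cpp
+CVI_RT_HANDLE ctx;
+CVI_RT_Init(&ctx);
+
+// one contiguous device buffer, carved into input and output ranges
+CVI_RT_MEM shared = CVI_RT_MemAlloc(ctx, in_size + out_size);
+CVI_RT_MEM in  = CVI_RT_MemPreAlloc(shared, 0, in_size);
+CVI_RT_MEM out = CVI_RT_MemPreAlloc(shared, in_size, out_size);
+
+memcpy(CVI_RT_MemGetVAddr(in), src_data, in_size); // fill input via vaddr
+CVI_RT_MemFlush(ctx, in);                          // make it visible to the tpu
+
+CVI_RT_MEM cmdbuf_mem = nullptr;
+CVI_RT_LoadCmdbuf(ctx, cmdbuf, cmdbuf_sz,
+                  CVI_RT_MemGetPAddr(shared), 0, false, &cmdbuf_mem);
+CVI_RT_RunCmdbuf(ctx, cmdbuf_mem,
+                 CVI_RT_MemGetPAddr(in), CVI_RT_MemGetPAddr(out));
+
+memcpy(dst_data, CVI_RT_MemGetVAddr(out), out_size); // read result back
+
+CVI_RT_MemFree(ctx, cmdbuf_mem);
+CVI_RT_MemFree(ctx, shared);
+CVI_RT_DeInit(ctx);
+```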
diff --git a/cviruntime/samples_kernel/transpose/transpose.cpp b/cviruntime/samples_kernel/transpose/transpose.cpp
new file mode 100644
index 000000000..1bec8cc72
--- /dev/null
+++ b/cviruntime/samples_kernel/transpose/transpose.cpp
@@ -0,0 +1,235 @@
+// This demo demonstrates how to do transpose from [nhwc] to [nchw] with cpu and tpu
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <functional>
+#include <iostream>
+#include <numeric>
+#include <random>
+#include <vector>
+
+#include "cviruntime_context.h"
+#include "cvikernel/cvikernel.h"
+
+static constexpr int NPU_NUM = 32;
+static constexpr int EU_NUM = 16;
+static constexpr int LOCAL_MEM_SIZE = 1 << 15;
+
+static void jit_compile(uint8_t **cmdbuf, uint32_t &size, std::vector<int> &shape) {
+  cvk_reg_info_t req_info;
+
+  memset(&req_info, 0, sizeof(cvk_reg_info_t));
+  strncpy(req_info.chip_ver_str, "cv183x", sizeof(req_info.chip_ver_str) - 1);
+  req_info.cmdbuf_size = 300000;
+  req_info.cmdbuf = (uint8_t*)malloc(req_info.cmdbuf_size);
+  auto cvk_ctx = cvikernel_register(&req_info);
+
+  int i_base_ga_idx = 0;
+  int o_base_ga_idx = 3;
+  uint64_t ifmap_ga = 0;
+  uint64_t ofmap_ga = 0;
+
+  int n = shape[0];
+  int c = shape[1];
+  int h = shape[2];
+  int w = shape[3];
+
+  cvk_tg_shape_t src_shape = {(uint32_t)n, (uint32_t)(c*h), 1, (uint32_t)w};
+  cvk_tg_shape_t dst_shape = {(uint32_t)n, (uint32_t)w, 1, (uint32_t)(c*h)};
+
+  cvk_tg_stride_t src_stride = cvk_ctx->ops->tg_default_stride(cvk_ctx, src_shape, CVK_FMT_I8);
+  cvk_tg_stride_t dst_stride = cvk_ctx->ops->tg_default_stride(cvk_ctx, dst_shape, CVK_FMT_I8);
+
+  cvk_tg_stride_t out_stride = {dst_stride.n, dst_stride.w, dst_stride.h, dst_stride.c};
+
+  uint32_t n_step = 1;
+  uint32_t c_step = 0;
+  uint32_t h_step = 0;
+
+  h_step = h;
+
+  for (; h_step > 0; --h_step) {
+    uint32_t total_size = 0;
+    for (c_step = c; c_step >= (uint32_t)NPU_NUM; --c_step) {
+      cvk_tl_shape_t tl_ifmap_shape = {1, c_step * h_step, 1, (uint32_t)w};
+      uint32_t tl_ifmap_size = cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, tl_ifmap_shape, CVK_FMT_I8, 0);
+      total_size = tl_ifmap_size;
+      if (total_size <= (uint32_t)LOCAL_MEM_SIZE) {
+        break;
+      }
+    }
+    if (total_size <= (uint32_t)LOCAL_MEM_SIZE) {
+      break;
+    }
+  }
+
+  std::cout << "tiling: c_step " << c_step << ", h_step " << h_step << "\n";
+  assert(c_step && h_step);
+
+  for (uint32_t n_pos = 0; n_pos < (uint32_t)n; n_pos += n_step) {
+    for (uint32_t c_pos = 0; c_pos < (uint32_t)c; c_pos += c_step) {
+      uint32_t tiling_c = std::min((uint32_t)c - c_pos, c_step);
+      for (uint32_t h_pos = 0; h_pos < (uint32_t)h; h_pos += h_step) {
+        uint32_t tiling_h = std::min((uint32_t)h - h_pos, h_step);
+
+        cvk_tl_t tl_ifmap;
+        cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tl_ifmap, {1, tiling_c * tiling_h, 1, (uint32_t)w}, CVK_FMT_I8, 0);
+        tl_ifmap.start_address = 0;
+
+        cvk_tg_t tg_src = {0};
+        tg_src.base_reg_index = i_base_ga_idx;
+        tg_src.fmt = CVK_FMT_I8;
+        tg_src.start_address = ifmap_ga + src_stride.n * n_pos + src_stride.c * c_pos + src_stride.h * h_pos;
+        tg_src.shape = {tl_ifmap.shape.n, tl_ifmap.shape.c, tl_ifmap.shape.h, tl_ifmap.shape.w};
+        tg_src.stride = src_stride;
+
+        std::cout << "tg offset: " << tg_src.start_address << ", shape: ("
+                  << tg_src.shape.n << "," << tg_src.shape.c << ","
+                  << tg_src.shape.h << "," << tg_src.shape.w <<")\n";
+        std::cout << "tg stride: (" << src_stride.n << "," << src_stride.c << ","
+                  << src_stride.h << "," << src_stride.w << ")\n";
+
+        cvk_tdma_g2l_tensor_copy_param_t p1 = {0};
+        p1.src = &tg_src;
+        p1.dst = &tl_ifmap;
+        cvk_ctx->ops->tdma_g2l_tensor_copy(cvk_ctx, &p1);
+
+        cvk_tg_t tg_dst = {0};
+        tg_dst.start_address = ofmap_ga + n_pos * out_stride.n + c_pos * out_stride.c + h_pos * out_stride.h;
+        tg_dst.shape = {1, (uint32_t)w, 1, tiling_c * tiling_h};
+        tg_dst.stride = dst_stride;
+        tg_dst.base_reg_index = o_base_ga_idx;
+        tg_dst.fmt = CVK_FMT_I8;
+
+        cvk_tdma_l2g_tensor_copy_cw_transposed_param_t p2 = {0};
+        p2.src = &tl_ifmap;
+        p2.dst = &tg_dst;
+        cvk_ctx->ops->tdma_l2g_tensor_copy_cw_transposed(cvk_ctx, &p2);
+      }
+    }
+  }
+
+  *cmdbuf = cvk_ctx->ops->acquire_cmdbuf(cvk_ctx, &size);
+}
+
+void transpose_tpu(std::vector<int> &input, std::vector<int> &shape,
+                   std::vector<int> &output) {
+  int64_t tensor_size = std::accumulate(shape.begin(), shape.end(), (int64_t)1, std::multiplies<int64_t>());
+
+  // runtime init
+  CVI_RT_HANDLE ctx = nullptr;
+  CVI_RT_Init(&ctx);
+
+  uint8_t *cmdbuf = nullptr;
+  uint32_t cmdbuf_size = 0;
+
+  // generate cmdbuf
+  jit_compile(&cmdbuf, cmdbuf_size, shape);
+
+  // Alloc device memory for input + output + cmdbuf
+  CVI_RT_MEM shared_mem = CVI_RT_MemAlloc(ctx, tensor_size * 2);
+  CVI_RT_MEM input_mem = CVI_RT_MemPreAlloc(shared_mem, 0, tensor_size);
+  CVI_RT_MEM output_mem = CVI_RT_MemPreAlloc(shared_mem, tensor_size, tensor_size);
+  CVI_RT_MEM cmdbuf_mem = nullptr;
+  // Load cmdbuf
+  CVI_RT_LoadCmdbuf(ctx, cmdbuf, cmdbuf_size, CVI_RT_MemGetPAddr(shared_mem), 0, false, &cmdbuf_mem);
+
+  // Get input tensor virtual address
+  int8_t *input_ptr = (int8_t*)CVI_RT_MemGetVAddr(input_mem);
+  // Copy data to device
+  for (int64_t i = 0; i < tensor_size; ++i) {
+    // input data range (-128, 127)
+    input_ptr[i] = (int8_t)input[i];
+  }
+  // Flush cache
+  CVI_RT_MemFlush(ctx, input_mem);
+
+  // Run cmdbuf
+  CVI_RC ret = CVI_RT_RunCmdbuf(ctx, cmdbuf_mem, CVI_RT_MemGetPAddr(input_mem), CVI_RT_MemGetPAddr(output_mem));
+  assert(ret == 0);
+
+  // Invalidate cache before reading the output
+  CVI_RT_MemInvld(ctx, output_mem);
+
+  // Get output tensor virtual address
+  int8_t *output_ptr = (int8_t*)CVI_RT_MemGetVAddr(output_mem);
+  // Copy data from device
+  for (int64_t i = 0; i < tensor_size; ++i) {
+    output[i] = (int)output_ptr[i];
+  }
+
+  // Release device memory
+  CVI_RT_MemFree(ctx, cmdbuf_mem);
+  CVI_RT_MemFree(ctx, shared_mem);
+  CVI_RT_DeInit(ctx);
+}
+
+void transpose_ref(std::vector<int> &input, std::vector<int> &shape,
+                   std::vector<int> &output_ref) {
+  int num = shape[0];
+  int channel = shape[1];
+  int height = shape[2];
+  int width = shape[3];
+
+  // [0, 1, 2, 3] => [0, 3, 1, 2]
+  for (int n = 0; n < num; ++n) {
+    for (int c = 0; c < channel; ++c) {
+      for (int h = 0; h < height; ++h) {
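+        // (n,c,h,w) -> (n,w,c,h): in the output, w strides channel*height,
+        // c strides height, and h is contiguous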
+        for (int w = 0; w < width; ++w) {
+          int32_t in_idx = w + h * width + c * height * width + n * channel * height * width;
+          int32_t out_idx = h + c * height + w * channel * height + n * channel * height * width;
+          output_ref[out_idx] = input[in_idx];
+        }
+      }
+    }
+  }
+}
+
+int main(int argc, char* argv[]) {
+  std::mt19937::result_type seed = std::time(0);
+  auto randint = [&](int begin, int end) {
+    return std::bind(
+        std::uniform_int_distribution<int>(begin, end),
+        std::mt19937(seed));
+  };
+
+  // generate random shape
+  auto int_gen1 = randint(1, 100);
+  std::vector<int> shape(4);
+  for (int i = 0; i < 4; ++i) {
+    shape[i] = int_gen1();
+  }
+
+  std::cout << "tensor shape: (" << shape[0] << ", " << shape[1] << ", "
+            << shape[2] << ", " << shape[3] <<")\n";
+
+  int64_t size = std::accumulate(shape.begin(), shape.end(), (int64_t)1, std::multiplies<int64_t>());
+  std::cout << "tensor size: " << size << "\n";
+
+  // generate random input tensor
+  auto int_gen2 = randint(-128, 127);
+  std::vector<int> src(size);
+  for (int64_t i = 0; i < size; ++i) {
+    src[i] = (int8_t)int_gen2();
+  }
+
+  // cpu implementation
+  std::vector<int> dst_ref(size);
+  transpose_ref(src, shape, dst_ref);
+
+  // tpu implementation
+  std::vector<int> dst(size);
+  transpose_tpu(src, shape, dst);
+
+  // compare result between cpu and tpu
+  for (int64_t i = 0; i < size; ++i) {
+    if (dst[i] == dst_ref[i]) {
+      continue;
+    } else {
+      printf("compare fail, index: %d, expect: %d, but get %d\n", (int)i, dst_ref[i], dst[i]);
+      return -1;
+    }
+  }
+
+  std::cout << "compare pass!\n";
+
+  return 0;
+}
diff --git a/cviruntime/scripts/README.md b/cviruntime/scripts/README.md
new file mode 100644
index 000000000..96a3d95c1
--- /dev/null
+++ b/cviruntime/scripts/README.md
@@ -0,0 +1,35 @@
+# Test SDK on target board
+
+- unzip tpu_sdk, and setup environment variables
+
+  ```sh
+  $ tar zxf cvitek_tpu_sdk.tar.gz
+  $ export TPU_ROOT=$PWD/cvitek_tpu_sdk
+  ```
+
+- unzip cvimodel release
+
+  ```sh
+  $ tar zxf cvimodel_release.tar.gz
+  $ export MODEL_PATH=$PWD/cvimodel_release
+  ```
+
+- run samples
+
+  ```sh
+  $ tar zxf cvimodel_samples.tar.gz
+  $ cd cvimodel_samples
+  $ $TPU_ROOT/samples/bin/cvi_sample_model_info \
+      $MODEL_PATH/mobilenet_v2.cvimodel
+  $ $TPU_ROOT/samples/run_classifier.sh
+  $ $TPU_ROOT/samples/run_detector.sh
+  $ $TPU_ROOT/samples/run_alphapose.sh
+  $ $TPU_ROOT/samples/run_insightface.sh
+  ```
+
+- unzip cvimodel regression package, and run model regression
+
+  ```sh
+  $ tar zxf cvimodel_regression.tar.gz
+  $ MODEL_PATH=$PWD/cvimodel_regression $TPU_ROOT/regression_models.sh
+  ```
diff --git a/cviruntime/scripts/envs_tpu_sdk.sh b/cviruntime/scripts/envs_tpu_sdk.sh
new file mode 100755
index 000000000..c0ce8f6da
--- /dev/null
+++ b/cviruntime/scripts/envs_tpu_sdk.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+export LD_LIBRARY_PATH=$PWD/lib:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=$PWD/opencv/lib:$LD_LIBRARY_PATH
+export PATH=$PWD/bin:$PATH
diff --git a/cviruntime/scripts/evb/best_performance_cv180x.csv b/cviruntime/scripts/evb/best_performance_cv180x.csv
new file mode 100644
index 000000000..6732cdef6
--- /dev/null
+++ b/cviruntime/scripts/evb/best_performance_cv180x.csv
@@ -0,0 +1,13 @@
+CV180x,500Mhz,2022-10-18
+Net,Inference Cycle(ms)
+resnet18_bs1,136.03
+mobilenet_v1_bs1,37.60
+mobilenet_v2_bs1,31.68
+squeezenet_v1.1_bs1,27.45
+shufflenet_v2_bs1,4.94
+googlenet_bs1,119.58
+densenet_121_bs1,198.20
+efficientnet-lite_b0_bs1,29.19
+nasnet_mobile_bs1,53.29
+retinaface_mnet25_bs1,18.53
+retinaface_mnet25_600_bs1,64.11
\ No newline at end of file
diff --git 
a/cviruntime/scripts/evb/best_performance_cv181x.csv b/cviruntime/scripts/evb/best_performance_cv181x.csv new file mode 100644 index 000000000..ae2743025 --- /dev/null +++ b/cviruntime/scripts/evb/best_performance_cv181x.csv @@ -0,0 +1,20 @@ +CV181x,500Mhz,2022-6-17 +Net,Inference Cycle(ms) +resnet18_bs1,18.26 +mobilenet_v1_bs1,7.18 +mobilenet_v2_bs1,7.07 +shufflenet_v2_bs1,1.53 +googlenet_bs1,19.63 +inception_v3_bs1,55.95 +densenet_121_bs1,36.66 +densenet_201_bs1,66.26 +senet_res50_bs1,53.47 +resnext50_bs1,49.19 +efficientnet-lite_b0_bs1,6.91 +nasnet_mobile_bs1,13.91 +retinaface_mnet25_bs1,3.56 +retinaface_mnet25_600_bs1,14.25 +mobilenet_ssd_bs1,14.79 +yolo_v1_448_bs1,20.20 +yolo_v3_tiny_bs1,26.81 +arcface_res50_bs1,63.10 diff --git a/cviruntime/scripts/evb/best_performance_cv182x.csv b/cviruntime/scripts/evb/best_performance_cv182x.csv new file mode 100644 index 000000000..e3f206b7f --- /dev/null +++ b/cviruntime/scripts/evb/best_performance_cv182x.csv @@ -0,0 +1,29 @@ +CV182x,750Mhz,2021-12-12 +Net,Inference Cycle(ms) +resnet50_bs1,31.03 +resnet18_bs1,12.73 +mobilenet_v1_bs1,5.30 +mobilenet_v2_bs1,5.60 +squeezenet_v1.1_bs1,3.59 +shufflenet_v2_bs1,1.30 +googlenet_bs1,13.94 +inception_v3_bs1,39.34 +densenet_121_bs1,28.55 +densenet_201_bs1,58.10 +senet_res50_bs1,42.47 +resnext50_bs1,38.02 +efficientnet-lite_b0_bs1,5.62 +nasnet_mobile_bs1,10.99 +retinaface_mnet25_bs1,2.60 +retinaface_mnet25_600_bs1,10.61 +retinaface_res50_bs1,366.91 +mobilenet_ssd_bs1,11.63 +yolo_v1_448_bs1,15.89 +yolo_v2_416_bs1,195.46 +yolo_v3_320_bs1,125.51 +yolo_v3_tiny_bs1,19.01 +yolo_v4_s_bs1,80.77 +yolo_v5_s_bs1,73.15 +yolox_s_bs1,100.12 +arcface_res50_bs1,45.42 +alphapose_bs1,48.04 diff --git a/cviruntime/scripts/evb/best_performance_cv183x.csv b/cviruntime/scripts/evb/best_performance_cv183x.csv new file mode 100644 index 000000000..acbfd831e --- /dev/null +++ b/cviruntime/scripts/evb/best_performance_cv183x.csv @@ -0,0 +1,45 @@ +CV183x,650Mhz,2021-12-12 +Net,Inference Cycle(ms) +resnet50_bs1,10.19 +resnet18_bs1,4.21 +mobilenet_v1_bs1,1.80 +mobilenet_v2_bs1,1.59 +squeezenet_v1.1_bs1,1.31 +shufflenet_v2_bs1,0.75 +googlenet_bs1,5.04 +inception_v3_bs1,11.91 +inception_v4_bs1,27.32 +vgg16_bs1,39.54 +densenet_121_bs1,9.36 +densenet_201_bs1,15.99 +senet_res50_bs1,15.57 +resnext50_bs1,12.70 +res2net50_bs1,11.48 +ecanet50_bs1,15.36 +efficientnet_b0_bs1,2.70 +efficientnet-lite_b0_bs1,1.69 +nasnet_mobile_bs1,4.35 +retinaface_mnet25_bs1,1.14 +retinaface_mnet25_600_bs1,4.19 +retinaface_res50_bs1,113.99 +ssd300_bs1,63.24 +mobilenet_ssd_bs1,3.30 +yolo_v1_448_bs1,6.88 +yolo_v2_416_bs1,28.92 +yolo_v2_1080_bs1,368.77 +yolo_v3_320_bs1,37.27 +yolo_v3_416_bs1,62.65 +yolo_v3_tiny_bs1,6.15 +yolo_v3_608_bs1,132.86 +yolo_v3_spp_bs1,133.53 +yolo_v4_bs1,133.79 +yolo_v4_s_bs1,29.92 +yolo_v5_s_bs1,25.33 +yolox_s_bs1,33.60 +faster_rcnn_bs1,144.26 +arcface_res50_bs1,15.91 +alphapose_bs1,15.57 +espcn_3x_bs1,0.80 +unet_bs1,97.98 +erfnet_bs1,99.06 +enet_bs1,43.43 diff --git a/cviruntime/scripts/evb/build_x86_sdk.sh b/cviruntime/scripts/evb/build_x86_sdk.sh new file mode 100755 index 000000000..26703515f --- /dev/null +++ b/cviruntime/scripts/evb/build_x86_sdk.sh @@ -0,0 +1,154 @@ +#!/bin/bash +echo "$WORKSPACE" + +PROJECT_ROOT=~/workspace/sdk +BUILD_PATH=$PROJECT_ROOT/build_x86_64 +INSTALL_PATH=$PROJECT_ROOT/cvitek_tpu_sdk +pushd $PROJECT_ROOT +rm -rf $BUILD_PATH +rm -rf $INSTALL_PATH +mkdir -p $BUILD_PATH +mkdir -p $INSTALL_PATH + +BUILD_FLAG="-DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_C_FLAGS_RELEASE=-O3 -DCMAKE_CXX_FLAGS_RELEASE=-O3" + +#build 
flatbuffers +mkdir -p $BUILD_PATH/flatbuffers +pushd $BUILD_PATH/flatbuffers +cmake -G Ninja \ + $PROJECT_ROOT/flatbuffers \ + -DCMAKE_INSTALL_PREFIX=$INSTALL_PATH/flatbuffers +cmake --build . --target install +test $? -ne 0 && echo "build flatbuffers failed !!" && popd && exit 1 +popd + +#build cvikernel +mkdir -p $BUILD_PATH/cvikernel +pushd $BUILD_PATH/cvikernel +cmake -G Ninja -DCHIP=BM1880v2 $BUILD_FLAG \ + -DCMAKE_INSTALL_PREFIX=$INSTALL_PATH/tpuc \ + $PROJECT_ROOT/cvikernel +cmake --build . --target install +test $? -ne 0 && echo "build cvikernel failed !!" && popd && exit 1 +popd + +# cvibuilder +mkdir -p $BUILD_PATH/cvimodel +pushd $BUILD_PATH/cvimodel +cmake -G Ninja -DFLATBUFFERS_PATH=$INSTALL_PATH/flatbuffers \ + -DCMAKE_INSTALL_PREFIX=$INSTALL_PATH/tpuc \ + $PROJECT_ROOT/cvibuilder +cmake --build . --target install +test $? -ne 0 && echo "build cvibuilder failed !!" && popd && exit 1 +popd + +#build cnpy +mkdir -p $BUILD_PATH/cnpy +pushd $BUILD_PATH/cnpy +cmake -G Ninja \ + -DCMAKE_INSTALL_PREFIX=$INSTALL_PATH/cnpy \ + $PROJECT_ROOT/cnpy +cmake --build . --target install +test $? -ne 0 && echo "build cnpy failed !!" && popd && exit 1 +popd +cp $INSTALL_PATH/cnpy/lib/* $INSTALL_PATH/tpuc/lib/ + +#build opencv +mkdir -p $BUILD_PATH/opencv +pushd $BUILD_PATH/opencv +cmake -G Ninja \ + $PROJECT_ROOT/oss/opencv \ + -DWITH_CUDA=OFF -DWITH_IPP=OFF -DWITH_LAPACK=OFF \ + -DWITH_DC1394=OFF -DWITH_GPHOTO2=OFF \ + -DCMAKE_BUILD_TYPE=RELEASE \ + -DPYTHON_DEFAULT_EXECUTABLE=$(which python3) \ + -DBUILD_opencv_videoio=OFF \ + -DBUILD_opencv_superres=OFF \ + -DBUILD_opencv_videostab=OFF \ + -DBUILD_opencv_stitching=OFF \ + -DBUILD_opencv_objdetect=OFF \ + -DBUILD_opencv_calib3d=OFF \ + -DBUILD_opencv_ml=OFF \ + -DBUILD_opencv_video=OFF \ + -DBUILD_opencv_flann=OFF \ + -DBUILD_opencv_photo=OFF \ + -DCMAKE_INSTALL_PREFIX=$INSTALL_PATH/opencv +cmake --build . --target install +test $? -ne 0 && echo "build opencv failed !!" && popd && exit 1 +popd + + +#build cmodel +mkdir -p $BUILD_PATH/cmodel +pushd $BUILD_PATH/cmodel +cmake -G Ninja -DCHIP=BM1880v2 $BUILD_FLAG \ + -DCVIKERNEL_PATH=$INSTALL_PATH/tpuc \ + -DCMAKE_INSTALL_PREFIX=$INSTALL_PATH/tpuc \ + $PROJECT_ROOT/cmodel +cmake --build . --target install +test $? -ne 0 && echo "build cmodel failed !!" && popd && exit 1 +popd + + +#build cviruntime +mkdir -p $BUILD_PATH/cviruntime +pushd $BUILD_PATH/cviruntime +cmake -G Ninja -DCHIP=BM1880v2 -DRUNTIME=CMODEL $BUILD_FLAG \ + -DCVIKERNEL_PATH=$INSTALL_PATH/tpuc \ + -DCMODEL_PATH=$INSTALL_PATH/tpuc \ + -DENABLE_PYRUNTIME=ON \ + -DFLATBUFFERS_PATH=$INSTALL_PATH/flatbuffers \ + -DCNPY_PATH=$INSTALL_PATH/cnpy \ + -DCVIBUILDER_PATH=$INSTALL_PATH/tpuc \ + -DCMAKE_INSTALL_PREFIX=$INSTALL_PATH/tpuc \ + -DENABLE_TEST=OFF \ + $PROJECT_ROOT/cviruntime +cmake --build . --target install +test $? -ne 0 && echo "build cviruntime failed !!" && popd && exit 1 +#ctest --progress || true +rm -f $INSTALL_PATH/tpuc/README.md +# rm -f $INSTALL_PATH/tpuc/envs_tpu_sdk.sh +rm -f $INSTALL_PATH/tpuc/regression_models.sh +rm -f $INSTALL_PATH/tpuc/regression_models_e2e.sh +rm -f $INSTALL_PATH/tpuc/regression_models_fused_preprocess.sh +popd + +#adjust the directory +pushd $INSTALL_PATH +mkdir -p ./bin +mv ./cnpy/bin/* ./bin +rm -rf ./cnpy/bin +mv ./tpuc/bin/* ./bin +mv ./tpuc/include ./ +mv ./tpuc/lib ./ +mv ./tpuc/envs_tpu_sdk.sh . 
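+# the sdk root now holds bin/, include/ and lib/ alongside the cnpy/,
+# opencv/ and flatbuffers/ install dirs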
+rm -rf ./tpuc +popd + +#build samples +mkdir -p $BUILD_PATH/samples +pushd $BUILD_PATH/samples +cmake -G Ninja $BUILD_FLAG \ + -DTPU_SDK_PATH=$INSTALL_PATH \ + -DCNPY_PATH=$INSTALL_PATH/cnpy \ + -DOPENCV_PATH=$INSTALL_PATH/opencv \ + -DCMAKE_INSTALL_PREFIX=$INSTALL_PATH/samples \ + $PROJECT_ROOT/cviruntime/samples +cmake --build . --target install -- -v +test $? -ne 0 && echo "build samples failed !!" && popd && exit 1 +popd + +#get tpu_sdk_version +pushd $INSTALL_PATH +RELEASE_PATH="/data/dailyrelease/$(date '+%Y-%m-%d')-18.04" +version="$(./bin/model_runner | grep Runtime | cut -d ")" -f2 | cut -d "@" -f1)" +cat>$RELEASE_PATH/tpu_sdk_version.txt< /dev/null || true + cp /data/dailyrelease/regression_models/cvimodel_regression_int8_cv180x.tar.gz ./ + cp -rf $WORKSPACE/* ./ + + tar -zxf cvimodel_regression_int8_cv180x.tar.gz + tar -zxf ${TPU_SDK_RISCV}.tar.gz + chmod 777 -R * + popd + ''' + retry(count: 3) { + sh '''#!/bin/bash + set -e + pwd + ping -c 10 $EVB_IP + $SSH_TOOL -t 30 -b $EVB_IP --exec "/sbin/reboot" + sleep 30 + ping -c 10 $EVB_IP + $SSH_TOOL -t 30 -b $EVB_IP --exec "mkdir -p /mnt/data/nfs;mount -t nfs -o nolock ${NFS_IP}:/data/nfs/evb_test /mnt/data/nfs" + ''' + } + } + } + stage("Testing") { + steps { + sh'pwd;ls -l' + sh'$SSH_TOOL -t 300 -b $EVB_IP --exec "set -o pipefail;/mnt/data/nfs/performance_testing_cv180x.sh 2>&1|tee /mnt/data/nfs/performance.log"' + } + post { + always { + sh'$SSH_TOOL -b $EVB_IP --exec "rm -rf /mnt/data/nfs/cvitek_tpu_sdk"' + sh'$SSH_TOOL -b $EVB_IP --exec "rm -rf /mnt/data/nfs/sdk_regression_out"' + sh'$SSH_TOOL -b $EVB_IP --exec "umount /mnt/data/nfs; rm -rf /mnt/data/nfs"' + } + } + } + } + post { + success { + sh''' + RELEASE_PATH="/data/dailyrelease/$(date '+%Y-%m-%d')-18.04" + PERF_PATH=$RELEASE_PATH/perf_result_cv180x + mkdir -p $PERF_PATH + cp ${TPU_SDK_RISCV}.tar.gz $RELEASE_PATH/ + cp ${TPU_SDK_RISCV_LEGACY}.tar.gz $RELEASE_PATH/ + cp -f /data/nfs/evb_test/performance.log . 
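                    # parse_performance.py compares this run's log against the stored best_performance baseline csv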
+ cp -f performance.log $PERF_PATH + python3 parse_performance.py best_performance_cv180x.csv performance.log performance_improve.csv cv180x + cp -f performance_improve.csv $PERF_PATH + cp -f performance_history.csv $PERF_PATH + cp -f performance_history.jpg $PERF_PATH + ''' + + emailext ( + attachmentsPattern:"performance_history.jpg, performance_history.csv, performance_improve.csv", + body: '''${SCRIPT, template="groovy-html.template"}''', + subject: "${env.JOB_NAME} - Build # ${env.BUILD_NUMBER} - Successful", + mimeType: 'text/html', + to: "${EMAIL_RECIPIENTS}" + ) + } + failure { + emailext ( + body: '''${SCRIPT, template="groovy-html.template"}''', + subject: "${env.JOB_NAME} - Build # ${env.BUILD_NUMBER} - Failed", + mimeType: 'text/html', + to: "${EMAIL_RECIPIENTS}" + ) + } + } +} \ No newline at end of file diff --git a/cviruntime/scripts/evb/jenkins_cv181x b/cviruntime/scripts/evb/jenkins_cv181x new file mode 100644 index 000000000..eeea96608 --- /dev/null +++ b/cviruntime/scripts/evb/jenkins_cv181x @@ -0,0 +1,173 @@ +pipeline { + agent { + label 'docker-slave' + } + options { + timeout(time: 1, unit: 'HOURS') + parallelsAlwaysFailFast() + timestamps() + } + environment { + MODEL_PATH = "/data/mlir-models" + DATASET_PATH = "/data/dataset" + DAILYRELEASE_PATH = "/data/dailyrelease" + EVB_IP = "192.168.0.19" + NFS_IP = "192.168.0.16" + SSH_TOOL = "./ssh_tool.py" + TPU_SDK_RISCV= "cvitek_tpu_sdk_cv181x_musl_riscv64_rvv" + TPU_SDK_RISCV_LEGACY= "cvitek_tpu_sdk_cv181x_musl_riscv64" + TPU_SDK_GLIBC= "cvitek_tpu_sdk_cv181x_glibc_arm" + TPU_SDK_RISCV_GLIBC= "cvitek_tpu_sdk_cv181x_glibc_riscv64" + TPU_SDK_ARM= "cvitek_tpu_sdk_cv181x_glibc_arm" + } + stages { + stage('CleanWorkspace') { + steps { + cleanWs() + } + } + stage('Build') { + steps { + echo 'Begin to build tpu_sdk latest version' + sh '''#!/bin/bash + set -xe + echo "$WORKSPACE" + pushd ~/workspace/sdk + rm -rf install + ./update.sh + source build/envsetup_soc.sh + # riscv rvv + defconfig cv1811c_wevb_0006a_spinor + export TPU_REL=1 + clean_3rd_party + build_3rd_party + clean_tpu_sdk + rm -rf cviruntime/build_sdk + build_tpu_sdk + rm -rf cviruntime/build_sdk + pushd install/soc_cv1811c_wevb_0006a_spinor/tpu_musl_riscv64/ + tar -zcf ${TPU_SDK_RISCV}.tar.gz cvitek_tpu_sdk + cp ${TPU_SDK_RISCV}.tar.gz $WORKSPACE + popd + # riscv + export RISCV_LEGACY=1 + clean_tpu_sdk + rm -rf cviruntime/build_sdk + build_tpu_sdk + rm -rf cviruntime/build_sdk + pushd install/soc_cv1811c_wevb_0006a_spinor/tpu_musl_riscv64/ + tar -zcf ${TPU_SDK_RISCV_LEGACY}.tar.gz cvitek_tpu_sdk + cp ${TPU_SDK_RISCV_LEGACY}.tar.gz $WORKSPACE + popd + # riscv glibc + defconfig cv181x_fpga_c906 + clean_3rd_party + build_3rd_party + clean_tpu_sdk + rm -rf cviruntime/build_sdk + build_tpu_sdk + rm -rf cviruntime/build_sdk + pushd install/soc_cv181x_fpga_c906/tpu_glibc_riscv64/ + tar -zcf ${TPU_SDK_RISCV_GLIBC}.tar.gz cvitek_tpu_sdk + cp ${TPU_SDK_RISCV_GLIBC}.tar.gz $WORKSPACE + popd + # arm + defconfig cv1811c_wevb_0006a_spinor + setconfig TOOLCHAIN_GLIBC_ARM=y + export TPU_REL=1 + clean_3rd_party + build_3rd_party + clean_tpu_sdk + rm -rf cviruntime/build_sdk + build_tpu_sdk + rm -rf cviruntime/build_sdk + pushd install/soc_cv1811c_wevb_0006a_spinor/tpu_32bit/ + tar -zcf ${TPU_SDK_ARM}.tar.gz cvitek_tpu_sdk + cp ${TPU_SDK_ARM}.tar.gz $WORKSPACE + popd + + # cp data + cp -rf cviruntime/scripts/evb/* $WORKSPACE + popd + ''' + } + } + stage("MountNfs") { + steps { + sh '''#!/bin/bash + set -xe + pip3 install paramiko + + pushd /data/nfs/evb_test/ + rm -rf * 2> /dev/null 
|| true + cp /data/dailyrelease/regression_models/cvimodel_regression_int8_cv181x.tar.gz ./ + cp -rf $WORKSPACE/* ./ + + tar -zxf cvimodel_regression_int8_cv181x.tar.gz + tar -zxf ${TPU_SDK_RISCV}.tar.gz + chmod 777 -R * + popd + ''' + retry(count: 3) { + sh '''#!/bin/bash + set -e + pwd + ping -c 10 $EVB_IP + $SSH_TOOL -t 30 -b $EVB_IP --exec "/sbin/reboot" + sleep 30 + ping -c 10 $EVB_IP + $SSH_TOOL -t 30 -b $EVB_IP --exec "mkdir -p /mnt/data/nfs;mount -t nfs -o nolock ${NFS_IP}:/data/nfs/evb_test /mnt/data/nfs" + ''' + } + } + } + stage("Testing") { + steps { + sh'pwd;ls -l' + sh'$SSH_TOOL -t 300 -b $EVB_IP --exec "set -o pipefail;/mnt/data/nfs/performance_testing_cv181x.sh 2>&1|tee /mnt/data/nfs/performance.log"' + } + post { + always { + sh'$SSH_TOOL -b $EVB_IP --exec "rm -rf /mnt/data/nfs/cvitek_tpu_sdk"' + sh'$SSH_TOOL -b $EVB_IP --exec "rm -rf /mnt/data/nfs/sdk_regression_out"' + sh'$SSH_TOOL -b $EVB_IP --exec "umount /mnt/data/nfs; rm -rf /mnt/data/nfs"' + } + } + } + } + post { + success { + sh''' + RELEASE_PATH="/data/dailyrelease/$(date '+%Y-%m-%d')-18.04" + PERF_PATH=$RELEASE_PATH/perf_result_cv181x + mkdir -p $PERF_PATH + cp ${TPU_SDK_RISCV}.tar.gz $RELEASE_PATH/ + cp ${TPU_SDK_RISCV_LEGACY}.tar.gz $RELEASE_PATH/ + cp ${TPU_SDK_RISCV_GLIBC}.tar.gz $RELEASE_PATH/ + cp ${TPU_SDK_ARM}.tar.gz $RELEASE_PATH/ + cp -f /data/nfs/evb_test/performance.log . + cp -f performance.log $PERF_PATH + python3 parse_performance.py best_performance_cv181x.csv performance.log performance_improve.csv cv181x + cp -f performance_improve.csv $PERF_PATH + cp -f performance_history.csv $PERF_PATH + cp -f performance_history.jpg $PERF_PATH + ''' + + emailext ( + attachmentsPattern:"performance_history.jpg, performance_history.csv, performance_improve.csv", + body: '''${SCRIPT, template="groovy-html.template"}''', + subject: "${env.JOB_NAME} - Build # ${env.BUILD_NUMBER} - Successful", + mimeType: 'text/html', + to: "${EMAIL_RECIPIENTS}" + ) + } + failure { + emailext ( + body: '''${SCRIPT, template="groovy-html.template"}''', + subject: "${env.JOB_NAME} - Build # ${env.BUILD_NUMBER} - Failed", + mimeType: 'text/html', + to: "${EMAIL_RECIPIENTS}" + ) + } + } +} diff --git a/cviruntime/scripts/evb/jenkins_cv182x b/cviruntime/scripts/evb/jenkins_cv182x new file mode 100644 index 000000000..8d0579aec --- /dev/null +++ b/cviruntime/scripts/evb/jenkins_cv182x @@ -0,0 +1,138 @@ +pipeline { + agent { + label 'docker-slave' + } + options { + timeout(time: 1, unit: 'HOURS') + parallelsAlwaysFailFast() + timestamps() + } + environment { + MODEL_PATH = "/data/mlir-models" + DATASET_PATH = "/data/dataset" + DAILYRELEASE_PATH = "/data/dailyrelease" + EVB_IP = "192.168.0.29" + NFS_IP = "192.168.0.16" + SSH_TOOL = "./ssh_tool.py" + } + stages { + stage('CleanWorkspace') { + steps { + cleanWs() + } + } + stage('Build') { + steps { + echo 'Begin to build tpu_sdk latest version' + sh '''#!/bin/bash + set -xe + echo "$WORKSPACE" + pushd ~/workspace/sdk + rm -rf install + ./update.sh + source build/envsetup_soc.sh + defconfig cv1826_wevb_0005a_spinand + export TPU_REL=1 + clean_3rd_party + build_3rd_party + clean_tpu_sdk + rm -rf cviruntime/build_sdk + build_tpu_sdk + rm -rf cviruntime/build_sdk + pushd install/soc_cv1826_wevb_0005a_spinand/tpu_32bit/ + tar -zcf cvitek_tpu_sdk_cv182x.tar.gz cvitek_tpu_sdk + cp cvitek_tpu_sdk_cv182x.tar.gz $WORKSPACE + popd + setconfig TOOLCHAIN_UCLIBC_ARM=y + clean_3rd_party + build_3rd_party + clean_tpu_sdk + rm -rf cviruntime/build_sdk + build_tpu_sdk + rm -rf cviruntime/build_sdk + pushd 
install/soc_cv1826_wevb_0005a_spinand/tpu_uclibc/ + tar -zcf cvitek_tpu_sdk_cv182x_uclibc.tar.gz cvitek_tpu_sdk + cp cvitek_tpu_sdk_cv182x_uclibc.tar.gz $WORKSPACE + popd + cp -rf cviruntime/scripts/evb/* $WORKSPACE + popd + ''' + } + } + stage("MountNfs") { + steps { + sh '''#!/bin/bash + set -xe + pip3 install paramiko + + pushd /data/nfs/evb_test/ + rm -rf * 2> /dev/null || true + cp /data/dailyrelease/regression_models/cvimodel_regression_int8_cv182x.tar.gz ./ + cp -rf $WORKSPACE/* ./ + + tar -zxf cvimodel_regression_int8_cv182x.tar.gz + tar -zxf cvitek_tpu_sdk_cv182x.tar.gz + chmod 777 -R * + popd + ''' + retry(count: 3) { + sh '''#!/bin/bash + set -e + pwd + ping -c 10 $EVB_IP + $SSH_TOOL -t 30 -b $EVB_IP --exec "reboot" + sleep 30 + ping -c 10 $EVB_IP + $SSH_TOOL -t 30 -b $EVB_IP --exec "mkdir -p /mnt/data/nfs;mount -t nfs -o nolock ${NFS_IP}:/data/nfs/evb_test /mnt/data/nfs" + ''' + } + } + } + stage("Testing") { + steps { + sh'pwd;ls -l' + sh'$SSH_TOOL -t 300 -b $EVB_IP --exec "set -o pipefail;/mnt/data/nfs/performance_testing_cv182x.sh 2>&1|tee /mnt/data/nfs/performance.log"' + } + post { + always { + sh'$SSH_TOOL -b $EVB_IP --exec "rm -rf /mnt/data/nfs/cvitek_tpu_sdk"' + sh'$SSH_TOOL -b $EVB_IP --exec "rm -rf /mnt/data/nfs/sdk_regression_out"' + sh'$SSH_TOOL -b $EVB_IP --exec "umount /mnt/data/nfs; rm -rf /mnt/data/nfs"' + } + } + } + } + post { + success { + sh''' + RELEASE_PATH="/data/dailyrelease/$(date '+%Y-%m-%d')-18.04" + PERF_PATH=$RELEASE_PATH/perf_result_cv182x + mkdir -p $PERF_PATH + cp cvitek_tpu_sdk_cv182x.tar.gz $RELEASE_PATH/ + cp cvitek_tpu_sdk_cv182x_uclibc.tar.gz $RELEASE_PATH/ + cp -f /data/nfs/evb_test/performance.log . + cp -f performance.log $PERF_PATH + python3 parse_performance.py best_performance_cv182x.csv performance.log performance_improve.csv cv182x + cp -f performance_improve.csv $PERF_PATH + cp -f performance_history.csv $PERF_PATH + cp -f performance_history.jpg $PERF_PATH + ''' + + emailext ( + attachmentsPattern:"performance_history.jpg, performance_history.csv, performance_improve.csv", + body: '''${SCRIPT, template="groovy-html.template"}''', + subject: "${env.JOB_NAME} - Build # ${env.BUILD_NUMBER} - Successful", + mimeType: 'text/html', + to: "${EMAIL_RECIPIENTS}" + ) + } + failure { + emailext ( + body: '''${SCRIPT, template="groovy-html.template"}''', + subject: "${env.JOB_NAME} - Build # ${env.BUILD_NUMBER} - Failed", + mimeType: 'text/html', + to: "${EMAIL_RECIPIENTS}" + ) + } + } +} diff --git a/cviruntime/scripts/evb/jenkins_cv183x b/cviruntime/scripts/evb/jenkins_cv183x new file mode 100644 index 000000000..00e6e4fbc --- /dev/null +++ b/cviruntime/scripts/evb/jenkins_cv183x @@ -0,0 +1,140 @@ +pipeline { + agent { + label 'docker-slave' + } + options { + timeout(time: 1, unit: 'HOURS') + parallelsAlwaysFailFast() + timestamps() + } + environment { + MODEL_PATH = "/data/mlir-models" + DATASET_PATH = "/data/dataset" + DAILYRELEASE_PATH = "/data/dailyrelease" + EVB_IP = "192.168.0.39" + NFS_IP = "192.168.0.16" + SSH_TOOL = "./ssh_tool.py" + } + stages { + stage('CleanWorkspace') { + steps { + cleanWs() + } + } + stage('Build') { + steps { + echo 'Begin to build tpu_sdk latest version' + sh '''#!/bin/bash + set -xe + echo "$WORKSPACE" + pushd ~/workspace/sdk + rm -rf install + ./update.sh + source build/envsetup_soc.sh + defconfig cv1835_wevb_0002a + export TPU_REL=1 + clean_3rd_party + build_3rd_party + clean_tpu_sdk + rm -rf cviruntime/build_sdk + build_tpu_sdk + rm -rf cviruntime/build_sdk + pushd install/soc_cv1835_wevb_0002a/tpu_64bit/ 
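+                    # package the 64-bit SDK; the MountNfs stage later unpacks this tarball on the NFS share for the EVB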
+ tar -zcf cvitek_tpu_sdk_cv183x.tar.gz cvitek_tpu_sdk + cp cvitek_tpu_sdk_cv183x.tar.gz $WORKSPACE + popd + setconfig TOOLCHAIN_GLIBC_ARM=y + clean_3rd_party + build_3rd_party + clean_tpu_sdk + rm -rf cviruntime/build_sdk + build_tpu_sdk + rm -rf cviruntime/build_sdk + pushd install/soc_cv1835_wevb_0002a/tpu_32bit/ + tar -zcf cvitek_tpu_sdk_cv183x_32bit.tar.gz cvitek_tpu_sdk + cp cvitek_tpu_sdk_cv183x_32bit.tar.gz $WORKSPACE + popd + cp -rf cviruntime/scripts/evb/* $WORKSPACE + popd + ''' + } + } + stage("MountNfs") { + steps { + sh '''#!/bin/bash + set -xe + pip3 install paramiko + + pushd /data/nfs/evb_test/ + rm -rf * 2> /dev/null || true + + cp /data/dailyrelease/regression_models/cvimodel_regression_int8_cv183x.tar.gz ./ + cp -rf $WORKSPACE/* ./ + + tar -zxf cvimodel_regression_int8_cv183x.tar.gz + tar -zxf cvitek_tpu_sdk_cv183x.tar.gz + chmod 777 -R cvimodel_regression_int8_cv183x + chmod 777 -R cvitek_tpu_sdk/ + popd + ''' + retry(count: 3) { + sh '''#!/bin/bash + set -e + pwd + ping -c 10 $EVB_IP + $SSH_TOOL -t 30 -b $EVB_IP --exec "reboot" + sleep 30 + ping -c 10 $EVB_IP + $SSH_TOOL -t 30 -b $EVB_IP --exec "mkdir -p /mnt/data/nfs;mount -t nfs -o nolock ${NFS_IP}:/data/nfs/evb_test /mnt/data/nfs" + ''' + } + } + } + stage("Testing") { + steps { + sh'pwd;ls -l' + sh'$SSH_TOOL -t 300 -b $EVB_IP --exec "set -o pipefail;/mnt/data/nfs/performance_testing_cv183x.sh 2>&1|tee /mnt/data/nfs/performance.log"' + } + post { + always { + sh'$SSH_TOOL -b $EVB_IP --exec "rm -rf /mnt/data/nfs/cvitek_tpu_sdk"' + sh'$SSH_TOOL -b $EVB_IP --exec "rm -rf /mnt/data/nfs/sdk_regression_out"' + sh'$SSH_TOOL -b $EVB_IP --exec "umount /mnt/data/nfs; rm -rf /mnt/data/nfs"' + } + } + } + } + post { + success { + sh''' + RELEASE_PATH="/data/dailyrelease/$(date '+%Y-%m-%d')-18.04" + PERF_PATH=$RELEASE_PATH/perf_result_cv183x + mkdir -p $PERF_PATH + cp cvitek_tpu_sdk_cv183x.tar.gz $RELEASE_PATH/ + cp cvitek_tpu_sdk_cv183x_32bit.tar.gz $RELEASE_PATH/ + cp -f /data/nfs/evb_test/performance.log . 
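+            # parse_performance.py (added below) compares today's log against the
+            # best-known numbers and regenerates the 30-day history chart and CSVs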
+            cp -f performance.log $PERF_PATH
+            python3 parse_performance.py best_performance_cv183x.csv performance.log performance_improve.csv cv183x
+            cp -f performance_improve.csv $PERF_PATH
+            cp -f performance_history.csv $PERF_PATH
+            cp -f performance_history.jpg $PERF_PATH
+            '''
+
+            emailext (
+                attachmentsPattern:"performance_history.jpg, performance_history.csv, performance_improve.csv",
+                body: '''${SCRIPT, template="groovy-html.template"}''',
+                subject: "${env.JOB_NAME} - Build # ${env.BUILD_NUMBER} - Successful",
+                mimeType: 'text/html',
+                to: "${EMAIL_RECIPIENTS}"
+            )
+        }
+        failure {
+            emailext (
+                body: '''${SCRIPT, template="groovy-html.template"}''',
+                subject: "${env.JOB_NAME} - Build # ${env.BUILD_NUMBER} - Failed",
+                mimeType: 'text/html',
+                to: "${EMAIL_RECIPIENTS}"
+            )
+        }
+    }
+}
diff --git a/cviruntime/scripts/evb/jenkins_x86_64 b/cviruntime/scripts/evb/jenkins_x86_64
new file mode 100644
index 000000000..7b6f98558
--- /dev/null
+++ b/cviruntime/scripts/evb/jenkins_x86_64
@@ -0,0 +1,69 @@
+pipeline {
+    agent {
+        label 'docker-slave'
+    }
+    options {
+        timeout(time: 1, unit: 'HOURS')
+        parallelsAlwaysFailFast()
+        timestamps()
+    }
+    environment {
+        MODEL_PATH = "/data/mlir-models"
+        DATASET_PATH = "/data/dataset"
+        DAILYRELEASE_PATH = "/data/dailyrelease"
+    }
+    stages {
+        stage('CleanWorkspace') {
+            steps {
+                cleanWs()
+            }
+        }
+        stage('Build') {
+            steps {
+                echo 'Begin to build tpu_sdk latest version'
+                sh '''#!/bin/bash
+                set -xe
+                pushd ~/workspace/sdk
+                rm -rf install
+                ./update.sh
+                popd
+                pushd ~/workspace/sdk/cviruntime/scripts/evb
+                ./build_x86_sdk.sh
+                popd
+                '''
+            }
+        }
+    }
+    post {
+        success {
+            sh'''
+            RELEASE_PATH="/data/dailyrelease/$(date '+%Y-%m-%d')-18.04"
+            cp ~/workspace/sdk/cvitek_tpu_sdk_x86_64.tar.gz $RELEASE_PATH/
+            rm -f ~/workspace/sdk/cvitek_tpu_sdk_x86_64.tar.gz
+
+            #release tpu_samples
+            pushd ~/workspace/sdk/cviruntime
+            cp -rf ./samples ./cvitek_tpu_samples
+            tar zcvf cvitek_tpu_samples.tar.gz cvitek_tpu_samples
+            mv cvitek_tpu_samples.tar.gz $RELEASE_PATH
+            rm -rf ./cvitek_tpu_samples
+            popd
+            '''
+
+            emailext (
+                body: '''${SCRIPT, template="groovy-html.template"}''',
+                subject: "${env.JOB_NAME} - Build # ${env.BUILD_NUMBER} - Successful",
+                mimeType: 'text/html',
+                to: "${EMAIL_RECIPIENTS}"
+            )
+        }
+        failure {
+            emailext (
+                body: '''${SCRIPT, template="groovy-html.template"}''',
+                subject: "${env.JOB_NAME} - Build # ${env.BUILD_NUMBER} - Failed",
+                mimeType: 'text/html',
+                to: "${EMAIL_RECIPIENTS}"
+            )
+        }
+    }
+}
diff --git a/cviruntime/scripts/evb/parse_performance.py b/cviruntime/scripts/evb/parse_performance.py
new file mode 100755
index 000000000..3e78c4e02
--- /dev/null
+++ b/cviruntime/scripts/evb/parse_performance.py
@@ -0,0 +1,289 @@
+import re
+import sys
+import time
+import os
+import matplotlib
+matplotlib.use('Agg')  # the Jenkins agent is headless; render charts off-screen
+import matplotlib.pyplot as plt
+
+DAILYRELEASE_PATH="/data/dailyrelease"
+net_names = [
+    "resnet50",
+    "resnet18",
+    "mobilenet_v1",
+    "mobilenet_v2",
+    "squeezenet_v1.1",
+    "shufflenet_v2",
+    "googlenet",
+    "inception_v3",
+    "inception_v4",
+    "vgg16",
+    "densenet_121",
+    "densenet_201",
+    "senet_res50",
+    "resnext50",
+    "res2net50",
+    "ecanet50",
+    "efficientnet_b0",
+    "efficientnet-lite_b0",
+    "nasnet_mobile",
+    "retinaface_mnet25",
+    "retinaface_mnet25_600",
+    "retinaface_res50",
+    "ssd300",
+    "mobilenet_ssd",
+    "yolo_v1_448",
+    "yolo_v2_416",
+    "yolo_v2_1080",
+    "yolo_v3_320",
+    "yolo_v3_416",
+    "yolo_v3_tiny",
+    "yolo_v3_608",
+    "yolo_v3_spp",
+    "yolo_v4",
+    "yolo_v4_s",
+    "yolo_v5_s",
+    "yolox_s",
+    "faster_rcnn",
+    "arcface_res50",
+    "alphapose",
+    "espcn_3x",
+    "unet",
+    "erfnet"
+]
+
+def parse_performance_log(in_log_file):
+    records = {}
+    with open(in_log_file, 'r') as f:
+        lines = f.readlines()
+
+    cur_name = None
+    # each "test <name> [batch N]" header line starts a new record
+    for line in lines:
+        r = re.match(r'^test\s+?(.*)', line)
+        if r:
+            cur_name = r.groups()[0]
+            # the batch suffix is optional; default to batch 1
+            x = re.match(r'^(.+?)\s+?batch\s+(\d+)$', cur_name)
+            bs = 1
+            if x:
+                cur_name = x.groups()[0]
+                bs = int(x.groups()[1])
+            if bs == 1 or bs == 4:
+                cur_name = cur_name + "_bs" + str(bs)
+                records[cur_name] = [0, 0]  # [inference_ms, improvement_pct]
+            else:
+                assert 0, "unexpected batch size: {}".format(bs)
+        else:
+            r = re.match(r'tdma_exe_ms:\s+([\d\.]*)ms,\s+tiu_exe_ms:\s+([\d\.]*)ms,\s+inference_ms:\s+([\d\.]*)ms', line)
+            if r:
+                tdma, tiu, inference = r.groups()
+                records[cur_name] = [inference, 0]
+    return records
+
+
+def parse_best_csv(best_csv):
+    best_records = {}
+    # rows look like "<net>_bs<N>,<inference_ms>"
+    with open(best_csv, 'r') as f:
+        lines = f.readlines()
+    for line in lines:
+        r = re.match(r'^(.+?_bs\d+),([\d\.]*)', line)
+        if r:
+            cur_name, inference = r.groups()
+            best_records[cur_name] = inference
+    # print(best_records)
+    return best_records
+
+def compare_record(cur_record, best_record):
+    for k, best_perf in best_record.items():
+        if k in cur_record:
+            cur_perf = cur_record[k]
+            cur_perf[1] = (float(best_perf) - float(cur_perf[0]))*100.0/float(best_perf)
+            print("Net: {0} Best Cycle: {1}ms, Current Cycle: {2}ms Improve:{3:.2f}%".format(k,
+                best_perf, cur_perf[0], cur_perf[1]))
+        else:
+            # today's run produced no timing for this network
+            print("[Warning] missing today's performance data for network: ", k)
+
+def output_csv(cur_records, best_records, out_file):
+    with open(out_file, 'w') as f:
+        f.write('{}\n'.format(time.strftime("%Y-%m-%d",time.localtime(time.time()))))
+        f.write('Net,Best Cycle(ms),Current Cycle(ms),Improve(%)\n')
+        for name, best_record in best_records.items():
+            if name not in cur_records:
+                f.write("{0},{1},{2},{3:.2f}%, NNNNNN\n".format(name, best_record, -100.0, -100.0))
+                continue
+            cur_record = cur_records[name]
+            f.write("{0},{1},{2},{3:.2f}%".format(name, best_record, cur_record[0], cur_record[1]))
+            if cur_record[1] > 1.0:
+                print("[Congrats]Net: {0} Best Cycle: {1}ms, Current Cycle: {2}ms, Improve: {3:.2f}%".format(
+                    name, best_records[name], cur_record[0], cur_record[1]))
+                f.write(",YYYYYY\n")
+            elif cur_record[1] < -1.0:
+                print("[Warning]Net: {0} Best Cycle: {1}ms, Current Cycle: {2}ms, Improve: {3:.2f}%".format(
+                    name, best_records[name], cur_record[0], cur_record[1]))
+                f.write(",NNNNNN\n")
+            else:
+                f.write("\n")
+
+    # generate data result according to net_names
+    file_for_release = os.path.splitext(out_file)[0] + "_release.csv"
+    with open(file_for_release, 'w') as f:
+        f.write('{}\n'.format(time.strftime("%Y-%m-%d",time.localtime(time.time()))))
+        f.write('Net,Best Cycle(ms),Current Cycle(ms),Improve(%)\n')
+        for net in net_names:
+            best_data = "--"
+            cur_data = "--"
+            cur_improve = "--"
+            b_best = False
+            b_cur = False
+            net = net + "_bs1"
+            if net in best_records:
+                best_data = best_records[net]
+                b_best = True
+            if net in cur_records:
+                cur_data = cur_records[net][0]
+                cur_improve = str(cur_records[net][1])
+                b_cur = True
+
+            if b_cur:
+                f.write("{0},{1},{2},{3:.2f}%".format(net, best_data, cur_data, float(cur_improve)))
+            else:
+                f.write("{0},{1},{2},{3}".format(net, best_data, cur_data, cur_improve))
+
+            if b_best and b_cur:
+                if cur_records[net][1] > 1.0:
+                    f.write(",YYYYYY")
+                elif cur_records[net][1] < -1.0:
+                    f.write(",NNNNNN")
+            f.write("\n")
+
+def get_folder_list(chip_type):
+    file_path = DAILYRELEASE_PATH
+    dir_list = os.listdir(file_path)
+    valid_folder = []
+    if not dir_list:
+        print("get data release file failed.")
+        return valid_folder  # no releases found
+    else:
+        dir_list = sorted(dir_list, key=lambda x: os.path.getmtime(os.path.join(file_path, x)))
+        for d in dir_list:
+            perf_path = os.path.join(os.path.join(file_path, d), "perf_result_"+chip_type)
+            perf_file = os.path.join(perf_path, "performance.log")
+            print("perf_file: ", perf_file)
+            if os.path.exists(perf_file):
+                valid_folder.append(d)
+
+    if len(valid_folder) > 30:
+        valid_folder = valid_folder[-30:]
+    return valid_folder
+
+def get_perf_data(folder_list, chip_type, best_records):
+    all_perf_data = {}
+    all_date = []
+    for p_fold in folder_list:
+        perf_path = os.path.join(os.path.join(DAILYRELEASE_PATH, p_fold), "perf_result_"+chip_type)
+        p_file = os.path.join(perf_path, "performance.log")
+        cur_records = parse_performance_log(p_file)
+
+        for net, data in best_records.items():
+            if net in cur_records:
+                if net in all_perf_data:
+                    all_perf_data[net].append(float(cur_records[net][0]))
+                else:
+                    all_perf_data[net] = [float(cur_records[net][0])]
+            else:
+                if net in all_perf_data:
+                    all_perf_data[net].append(float(data))
+                else:
+                    all_perf_data[net] = [float(data)]
+
+        all_date.append(p_fold)
+    return all_date, all_perf_data
+
+def draw_perf_history_graph(daily_date, daily_perf_data):
+    plt.rcParams['savefig.dpi'] = 500
+    plt.rcParams['figure.dpi'] = 300
+    import itertools
+    marker = itertools.cycle(("8","s","P","*", "D", "|", 4,5,6,7,8,9,10))
+
+    # sub graph 0: 0~5ms
+    plt.subplot(2,2,1)
+    plt.xticks(rotation=45, fontsize=2)
+    plt.yticks(fontsize=2)
+    for net_name, net_daily_perf in daily_perf_data.items():
+        if net_daily_perf[0] < 5.0:
+            plt.plot(daily_date, net_daily_perf, marker=next(marker),
+                linewidth=0.5, linestyle='--', label=net_name, markersize=2)
+    plt.legend(fontsize=2, bbox_to_anchor=(0, 1.02), loc='lower left', ncol=4)
+    # sub graph 1: 5~20ms
+    plt.subplot(2,2,2)
+    plt.xticks(rotation=45, fontsize=2)
+    plt.yticks(fontsize=2)
+    for net_name, net_daily_perf in daily_perf_data.items():
+        if 5.0 <= net_daily_perf[0] < 20.0:
+            plt.plot(daily_date, net_daily_perf, marker=next(marker),
+                linewidth=0.5, linestyle='--', label=net_name, markersize=2)
+    plt.legend(fontsize=2, bbox_to_anchor=(0, 1.02), loc='lower left', ncol=4)
+    # sub graph 2: 20~100ms
+    plt.subplot(2,2,3)
+    plt.xticks(rotation=45, fontsize=2)
+    plt.yticks(fontsize=2)
+    for net_name, net_daily_perf in daily_perf_data.items():
+        if 20.0 <= net_daily_perf[0] < 100.0:
+            plt.plot(daily_date, net_daily_perf, marker=next(marker),
+                linewidth=0.5, linestyle='--', label=net_name, markersize=2)
+    plt.legend(fontsize=2, bbox_to_anchor=(0, 1.02), loc='lower left', ncol=4)
+    # sub graph 3: 100ms~
+    plt.subplot(2,2,4)
+    plt.xticks(rotation=45, fontsize=2)
+    plt.yticks(fontsize=2)
+    for net_name, net_daily_perf in daily_perf_data.items():
+        if net_daily_perf[0] >= 100.0:
+            plt.plot(daily_date, net_daily_perf, marker=next(marker),
+                linewidth=0.5, linestyle='--', label=net_name, markersize=2)
+    plt.legend(fontsize=2, bbox_to_anchor=(0, 1.02), loc='lower left', ncol=4)
+    plt.savefig('./performance_history.jpg')
+    plt.close()  # show() is a no-op on the Agg backend; save first, then drop the figure
+
+    with open('performance_history.csv', 'w') as f:
+        f.write('Inference Cycle(ms),{}\n'.format(time.strftime("%Y-%m-%d",time.localtime(time.time()))))
+        f.write('Net')
+        for d in daily_date:
+            f.write(",{}".format(d))
+        f.write("\n")
+        for net_name, net_daily_perf in daily_perf_data.items():
+            f.write("{}".format(net_name))
+            for net_perf in net_daily_perf:
+                f.write(",{}".format(net_perf))
+            f.write("\n")
+
+# generate the perf graph of the past 30 days
+def generate_perf_graph(chip_type, best_records):
+    # get file list of past 30 days
+    folder_list = get_folder_list(chip_type)
+    # get performance data of past 30 days
+    daily_date, daily_perf_data = get_perf_data(folder_list, chip_type, best_records)
+    # print(daily_date)
+    # print(daily_perf_data)
+    # dump performance graph
+    draw_perf_history_graph(daily_date, daily_perf_data)
+
+if __name__ == '__main__':
+    if len(sys.argv) != 5:
+        print("USAGE: {} best_performance.csv src_log_file out_result.csv cv181x/cv182x/cv183x".format(sys.argv[0]))
+        sys.exit(1)
+    best_records = parse_best_csv(sys.argv[1])
+    cur_records = parse_performance_log(sys.argv[2])
+    compare_record(cur_records, best_records)
+    output_csv(cur_records, best_records, sys.argv[3])
+    generate_perf_graph(sys.argv[4], best_records)
\ No newline at end of file
diff --git a/cviruntime/scripts/evb/performance_testing_cv180x.sh b/cviruntime/scripts/evb/performance_testing_cv180x.sh
new file mode 100755
index 000000000..a1e115553
--- /dev/null
+++ b/cviruntime/scripts/evb/performance_testing_cv180x.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+set -x
+
+cd /mnt/data/nfs/
+cd cvitek_tpu_sdk && source ./envs_tpu_sdk.sh && cd ..
+export TPU_ROOT=$PWD/cvitek_tpu_sdk
+
+# For batch_size = 1
+MODEL_PATH=$PWD/cvimodel_regression_int8_cv180x $TPU_ROOT/regression_models.sh
diff --git a/cviruntime/scripts/evb/performance_testing_cv181x.sh b/cviruntime/scripts/evb/performance_testing_cv181x.sh
new file mode 100755
index 000000000..e6ab3e4da
--- /dev/null
+++ b/cviruntime/scripts/evb/performance_testing_cv181x.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+set -x
+
+cd /mnt/data/nfs/
+cd cvitek_tpu_sdk && source ./envs_tpu_sdk.sh && cd ..
+export TPU_ROOT=$PWD/cvitek_tpu_sdk
+
+# For batch_size = 1
+MODEL_PATH=$PWD/cvimodel_regression_int8_cv181x $TPU_ROOT/regression_models.sh
diff --git a/cviruntime/scripts/evb/performance_testing_cv182x.sh b/cviruntime/scripts/evb/performance_testing_cv182x.sh
new file mode 100755
index 000000000..b368f9c1d
--- /dev/null
+++ b/cviruntime/scripts/evb/performance_testing_cv182x.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+set -x
+
+cd /mnt/data/nfs/
+cd cvitek_tpu_sdk && source ./envs_tpu_sdk.sh && cd ..
+export TPU_ROOT=$PWD/cvitek_tpu_sdk
+
+# For batch_size = 1
+MODEL_PATH=$PWD/cvimodel_regression_int8_cv182x $TPU_ROOT/regression_models.sh
diff --git a/cviruntime/scripts/evb/performance_testing_cv183x.sh b/cviruntime/scripts/evb/performance_testing_cv183x.sh
new file mode 100755
index 000000000..2d2880b07
--- /dev/null
+++ b/cviruntime/scripts/evb/performance_testing_cv183x.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+set -x
+
+cd /mnt/data/nfs/
+cd cvitek_tpu_sdk && source ./envs_tpu_sdk.sh && cd ..
+export TPU_ROOT=$PWD/cvitek_tpu_sdk
+
+# For batch_size = 1
+MODEL_PATH=$PWD/cvimodel_regression_int8_cv183x $TPU_ROOT/regression_models.sh
diff --git a/cviruntime/scripts/evb/samples_test.py b/cviruntime/scripts/evb/samples_test.py
new file mode 100755
index 000000000..f7b282b5a
--- /dev/null
+++ b/cviruntime/scripts/evb/samples_test.py
@@ -0,0 +1,111 @@
+#!/usr/bin/python3
+import argparse
+import sys
+import re
+from ssh_tool import SshTool
+
+class SampleTestCase:
+    def __init__(self, ssh, models_path, sdk_path):
+        self.ssh = ssh
+        self.command = '''
+        export MODEL_PATH={}
+        cd {}
+        . ./envs_tpu_sdk.sh
+        cd samples/
+        '''.format(models_path, sdk_path)
+
+    def __parse_result(self, outputs):
+        # collect the lines printed between two "------" delimiter rows
+        i = 0
+        results = []
+        while i < len(outputs):
+            if outputs[i].startswith('------'):
+                j = i + 1
+                while j < len(outputs) and \
+                    not outputs[j].startswith('------'):
+                    results.append(outputs[j])
+                    j += 1
+                i = j + 1
+            else:
+                i += 1
+        if len(results) == 0:
+            raise Exception("no results")
+        return results
+
+    def run(self, shell_script, validate_fn, **kargs):
+        print("\n#################", shell_script)
+        command = self.command + './{}'.format(shell_script)
+        err, outputs = self.ssh.exec(command)
+        if err == 0:
+            results = self.__parse_result(outputs)
+            print(results)
+            return validate_fn(results, **kargs)
+        print("ERROR, shell execute failed. ret: {}".format(err))
+        return 1
+
+
+def validate_alphapose_result(outputs, poses_wanted):
+    line = outputs[0]
+    poses_detected = int(re.match(r'^\s*(\d+)', line).groups()[0])
+    if poses_detected != poses_wanted:
+        print("FAILED, {} poses should be detected, but got {}".format(
+            poses_wanted, poses_detected))
+        return 1
+    else:
+        print('PASSED')
+        return 0
+
+def validate_classifier_result(outputs, id, score):
+    for line in outputs:
+        _score, _id = re.match(r'^\s*([0-9.]+), idx (\d+)', line).groups()
+        if int(_id) == id and float(_score) > score:
+            print('PASSED')
+            return 0
+    print('FAILED, id={}, score > {}'.format(id, score))
+    return 1
+
+def validate_detector_result(outputs, objects_wanted):
+    line = outputs[0]
+    objects_detected = int(re.match(r'^(\d+)', line).groups()[0])
+    if objects_detected != objects_wanted:
+        print("FAILED, {} objects should be detected, but got {}".format(
+            objects_wanted, objects_detected))
+        return 1
+    else:
+        print('PASSED')
+        return 0
+
+def validate_insightface_result(outputs, similarity):
+    for target, line in zip(similarity, outputs):
+        gotten = re.match(r'^Similarity: ([0-9.\-]+)', line).groups()[0]
+        if target > 0.1 and float(gotten) < target:
+            print('FAILED, similarity {} < target:{}'.format(gotten, target))
+            return 1
+        elif target <= 0.1 and float(gotten) > target:
+            print('FAILED, similarity {} > target:{}'.format(gotten, target))
+            return 1
+    print('PASSED')
+    return 0
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description="samples_test")
+    parser.add_argument("-t", "--target", type=str, required=True)
+    parser.add_argument("-m", "--models_path", type=str)
+    parser.add_argument("-s", "--sdk_path", type=str)
+    args = parser.parse_args()
+
+    ssh = SshTool(args.target)
+    testcase = SampleTestCase(ssh, args.models_path, args.sdk_path)
+    cnt = 0
+    cnt += testcase.run('run_alphapose.sh', validate_alphapose_result, poses_wanted=5)
+    cnt += testcase.run('run_alphapose_fused_preprocess.sh', validate_alphapose_result, poses_wanted=5)
+    cnt += testcase.run('run_classifier.sh', validate_classifier_result, id=285, score=0.36)
+    cnt += testcase.run('run_classifier_fused_preprocess.sh', validate_classifier_result, id=285, score=0.32)
+    cnt += testcase.run('run_classifier_yuv420.sh', validate_classifier_result, id=285, score=0.34)
+    cnt += testcase.run('run_detector.sh', validate_detector_result, objects_wanted=3)
+    cnt += testcase.run('run_detector_fused_preprocess.sh', validate_detector_result, objects_wanted=3)
+    cnt += testcase.run('run_insightface.sh', validate_insightface_result, similarity=[0.75, 0.8, 0.81, -0.01, -0.08, 0.03])
+    cnt += testcase.run('run_insightface_fused_preprocess.sh', validate_insightface_result, similarity=[0.75, 0.8, 0.76, 0, -0.08, 0.04])
+    if cnt > 0:
+        print('{} testcases failed'.format(cnt))
+    sys.exit(cnt)
\ No newline at end of file
diff --git a/cviruntime/scripts/evb/ssh_tool.py b/cviruntime/scripts/evb/ssh_tool.py
new file mode 100755
index 000000000..f72728447
--- /dev/null
+++ b/cviruntime/scripts/evb/ssh_tool.py
@@ -0,0 +1,55 @@
+#!/usr/bin/python3
+import argparse
+import sys
+import paramiko
+
+class SshTool:
+    def __init__(self, host, user='root', password='cvitek', timeout=20):
+        self.host = host
+        self.user = user
+        self.password = password
+        self.timeout = timeout
+        self.ssh = paramiko.SSHClient()
+        self.ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+        self.__connect()
+
+    def __connect(self, try_cnt=5):
+        for i in range(try_cnt):
+            try:
+                self.ssh.connect(self.host, 22, self.user, self.password, timeout=self.timeout)
+                return
+            except Exception as e:
+                print(e)
+                continue
+        raise Exception("connection failed")
+
+    def exec(self, command):
+        _, stdout, stderr = self.ssh.exec_command(command)
+        outputs = [line.strip() for line in stdout.readlines()]
+        for line in outputs:
+            print(line)
+        for line in stderr.readlines():
+            print(line.strip())
+        err = stdout.channel.recv_exit_status()
+        print("ret:{}".format(err))
+        # return stdout as well; samples_test.py parses the results out of it
+        return err, outputs
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description="SSH client")
+    parser.add_argument("-b", "--target", type=str, required=True,
+                        help="specify ip of the target board")
+    parser.add_argument("-u", "--user", type=str, default="root",
+                        help="specify the user name to login")
+    parser.add_argument("-p", "--password", type=str, default="cvitek",
+                        help="password to login")
+    parser.add_argument("-t", "--timeout", type=int, default=0,
+                        help="connect timeout in seconds (0 means default)")
+    parser.add_argument("-e", "--exec", type=str,
+                        help="shell command to be executed on target board")
+
+    args = parser.parse_args()
+    ssh = SshTool(host=args.target, user=args.user, password=args.password,
+                  timeout=args.timeout or 20)
+    err, _ = ssh.exec(args.exec)
+    sys.exit(err)
diff --git a/cviruntime/scripts/evb/tpu-mlir_daily_release b/cviruntime/scripts/evb/tpu-mlir_daily_release
new file mode 100644
index 000000000..6c045003d
--- /dev/null
+++ b/cviruntime/scripts/evb/tpu-mlir_daily_release
@@ -0,0 +1,88 @@
+pipeline {
+    agent {
+        docker {
+            image 'sophgo/tpuc_dev'
+            label 'master'
+            args '-u 0:0 --privileged -v /home/docker-slave:/home/jenkins -v /data:/data'
+        }
+    }
+    options {
+        timeout(time: 7, unit: 'HOURS')
+        parallelsAlwaysFailFast()
+        timestamps()
+    }
+    environment {
+        DATASET_PATH = "/data/dataset"
+        DAILYRELEASE_PATH = "/data/dailyrelease"
+    }
+    stages {
+        stage('CleanWorkspace') {
+            steps {
+                cleanWs()
+            }
+        }
+        stage('Build') {
+            steps {
+                echo 'Begin to build tpu-mlir latest version'
+                sh '''
+                set -e
+                cd
+                apt-get update
+                apt-get install -y ssh
+                pushd /home/jenkins/workspace/tpu-mlir
+                mkdir -p /root/.ssh
+                cp -r /home/jenkins/.ssh/* /root/.ssh
+                git pull
+                rm -rf ./build
+                rm -rf ./install
+                source envsetup.sh
+                ./build.sh
+                popd
+                '''
+            }
+        }
+        stage('Release tpu-mlir && models') {
+            steps {
+                sh'''#!/bin/bash
+                set -e
+                cd
+                pushd /home/jenkins/workspace/tpu-mlir
+                source envsetup.sh
+                ./release.sh
+                os_ver="18.04"
+                mlir_version="$(grep MLIR_VERSION ${BUILD_PATH}/CMakeCache.txt | cut -d "=" -f2)"
+                mkdir -p $DAILYRELEASE_PATH/$(date '+%Y-%m-%d')-${os_ver}
+                cp ./tpu-mlir_${mlir_version}.tar.gz $DAILYRELEASE_PATH/$(date '+%Y-%m-%d')-${os_ver}/
+                rm ./tpu-mlir_${mlir_version}.tar.gz
+                chmod 777 -R $DAILYRELEASE_PATH/$(date '+%Y-%m-%d')-${os_ver}
+                popd
+
+                #release cvimodels
+                export NNMODELS_PATH=/data/sophgo_models/nnmodels
+                export MODEL_ZOO_PATH=/data/sophgo_models/model-zoo
+                rm -rf /home/jenkins/workspace/tpu-mlir/regression/regression_out
+                pushd /home/jenkins/workspace/tpu-mlir/regression
+                ./dailyrelease_cvimodel.py $DAILYRELEASE_PATH/$(date '+%Y-%m-%d')-${os_ver} 10
+                chmod 777 -R $DAILYRELEASE_PATH/$(date '+%Y-%m-%d')-${os_ver}
+                if [ -e "$DAILYRELEASE_PATH/$(date '+%Y-%m-%d')-${os_ver}/cvimodel_regression_int8_cv183x.tar.gz" ]; then
+                    rm -f /data/dailyrelease/regression_models
+                    ln -s $DAILYRELEASE_PATH/$(date '+%Y-%m-%d')-${os_ver} /data/dailyrelease/regression_models
+                fi
+                popd
+                '''
+            }
+        }
+    }
+    post {
+        always {
+            emailext (
+                body: '''${SCRIPT, template="groovy-html.template"}''',
+                subject: "${env.JOB_NAME} - Build # ${env.BUILD_NUMBER} - ${currentBuild.currentResult}",
+                mimeType: 'text/html',
+                to: "${EMAIL_RECIPIENTS}"
+            )
+        }
+    }
+}
+
+
diff --git a/cviruntime/scripts/regression_models_cv180x.sh b/cviruntime/scripts/regression_models_cv180x.sh
new file mode 100755
index 000000000..969f7d6c5
--- /dev/null
+++ b/cviruntime/scripts/regression_models_cv180x.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+set -e
+
+echo "cvimodels regression for cv180x platform"
+
+if [[ -z "$MODEL_PATH" ]]; then
+  MODEL_PATH=$TPU_ROOT/../cvimodel_regression
+fi
+if [ ! -e $MODEL_PATH ]; then
+  echo "MODEL_PATH $MODEL_PATH does not exist"
+  echo "Please set MODEL_PATH to cvimodel_regression dir"
+  exit 1
+fi
+export MODEL_PATH=$MODEL_PATH
+
+if [ -f "/sys/kernel/debug/ion/cvi_carveout_heap_dump/total_mem" ]; then
+  total_ion_size=$(cat /sys/kernel/debug/ion/cvi_carveout_heap_dump/total_mem)
+else
+  # if ion size is unknown then execute basic tests.
+  total_ion_size=20000001
+fi
+
+# ION requirement >= 10 MB
+if [ "$total_ion_size" -gt "10000000" ]; then
+model_list="blazeface efficientnet-lite_b0 espcn_3x gaitset mobilenet_v1 mobilenet_v2 nasnet_mobile ppyolo_tiny retinaface_mnet25_600 retinaface_mnet25 shufflenet_v2 squeezenet_v1.0 squeezenet_v1.1"
+fi
+
+# ION requirement >= 20 MB
+if [ "$total_ion_size" -gt "20000000" ]; then
+model_list="$model_list resnet18 googlenet efficientnet_b0"
+fi
+
+# ION requirement >= 35 MB
+if [ "$total_ion_size" -gt "35000000" ]; then
+model_list="$model_list densenet_121 efficientdet_d0"
+fi
+
+# turn on PMU
+export TPU_ENABLE_PMU=1
+
+if [ ! -e sdk_regression_out ]; then
+  mkdir sdk_regression_out
+fi
+
+ERR=0
+cd sdk_regression_out
+
+if [ -z "$1" ]; then
+  for model in ${model_list}
+  do
+    echo "test $model"
+    # "if !" records a model failure instead of aborting the whole run under set -e
+    if ! model_runner \
+        --input $MODEL_PATH/${model}_bs1_in_fp32.npz \
+        --model $MODEL_PATH/${model}_bs1.cvimodel \
+        --reference $MODEL_PATH/${model}_bs1_out_all.npz 2>&1; then
+      echo "$model test FAILED" >> verdict.log
+      ERR=1
+    else
+      echo "$model test PASSED" >> verdict.log
+    fi
+  done
+fi
+
+# VERDICT
+if [ $ERR -eq 0 ]; then
+  echo $0 ALL TEST PASSED
+else
+  echo $0 FAILED
+fi
+
+exit $ERR
diff --git a/cviruntime/scripts/regression_models_cv181x.sh b/cviruntime/scripts/regression_models_cv181x.sh
new file mode 100755
index 000000000..d71f97907
--- /dev/null
+++ b/cviruntime/scripts/regression_models_cv181x.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+set -e
+
+echo "cvimodels regression for cv181x platform"
+
+if [[ -z "$MODEL_PATH" ]]; then
+  MODEL_PATH=$TPU_ROOT/../cvimodel_regression
+fi
+if [ ! -e $MODEL_PATH ]; then
+  echo "MODEL_PATH $MODEL_PATH does not exist"
+  echo "Please set MODEL_PATH to cvimodel_regression dir"
+  exit 1
+fi
+export MODEL_PATH=$MODEL_PATH
+
+if [ -f "/sys/kernel/debug/ion/cvi_carveout_heap_dump/total_mem" ]; then
+  total_ion_size=$(cat /sys/kernel/debug/ion/cvi_carveout_heap_dump/total_mem)
+else
+  # if ion size is unknown then execute basic tests.
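+  # 20000001 just clears the 20 MB gate below, so only the basic model list runs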
+ total_ion_size=20000001 +fi + +# ION requirement >= 20 MB +if [ "$total_ion_size" -gt "20000000" ]; then +model_list="resnet18 mobilenet_v1 mobilenet_v2 squeezenet_v1.1 shufflenet_v2 googlenet densenet_121 efficientnet-lite_b0 nasnet_mobile retinaface_mnet25 retinaface_mnet25_600 mobilenet_ssd yolo_v3_tiny" +fi + +if [ "$total_ion_size" -gt "35000000" ]; then +model_list="$model_list densenet_201 senet_res50 resnext50 yolo_v1_448 inception_v3" +fi + +if [ "$total_ion_size" -gt "45000000" ]; then +model_list="$model_list arcface_res50" +fi + +# turn on PMU +export TPU_ENABLE_PMU=1 + +if [ ! -e sdk_regression_out ]; then + mkdir sdk_regression_out +fi + +ERR=0 +cd sdk_regression_out + +if [ -z $1 ]; then + for model in ${model_list} + do + echo "test $model" + model_runner \ + --input $MODEL_PATH/${model}_bs1_in_fp32.npz \ + --model $MODEL_PATH/${model}_bs1.cvimodel \ + --reference $MODEL_PATH/${model}_bs1_out_all.npz 2>&1 + if [ "$?" -ne "0" ]; then + echo "$model test FAILED" >> verdict.log + ERR=1 + else + echo "$model test PASSED" >> verdict.log + fi + done +fi + +# VERDICT +if [ $ERR -eq 0 ]; then + echo $0 ALL TEST PASSED +else + echo $0 FAILED +fi + +exit $ERR diff --git a/cviruntime/scripts/regression_models_cv182x.sh b/cviruntime/scripts/regression_models_cv182x.sh new file mode 100755 index 000000000..00878fd51 --- /dev/null +++ b/cviruntime/scripts/regression_models_cv182x.sh @@ -0,0 +1,110 @@ +#!/bin/bash +set -e + +echo "cvimodels regression for cv182x platform" + +if [[ -z "$MODEL_PATH" ]]; then + MODEL_PATH=$TPU_ROOT/../cvimodel_regression +fi +if [ ! -e $MODEL_PATH ]; then + echo "MODEL_PATH $MODEL_PATH does not exist" + echo "Please set MODEL_PATH to cvimodel_regression dir" + exit 1 +fi +export MODEL_PATH=$MODEL_PATH + +model_list=( + #################### + # Classification # + #################### + "resnet50" + "resnet18" + "mobilenet_v1" + "mobilenet_v2" + "squeezenet_v1.1" + "shufflenet_v2" + "googlenet" + "inception_v3" + "densenet_121" + "densenet_201" + "senet_res50" + "resnext50" + # "res2net50" + "efficientnet-lite_b0" + "nasnet_mobile" + #################### + # Detection # + #################### + "retinaface_mnet25" + "retinaface_mnet25_600" + "retinaface_res50" + "mobilenet_ssd" + "yolo_v1_448" + "yolo_v2_416" + "yolo_v3_320" + "yolo_v3_tiny" + "yolo_v4_s" + "yolo_v5_s" + "yolox_s" + #################### + # Face Recog # + #################### + "arcface_res50" + #################### + # Pose # + #################### + "alphapose" +) + +# turn on PMU +export TPU_ENABLE_PMU=1 + +if [ ! 
-e sdk_regression_out ]; then + mkdir sdk_regression_out +fi + +ERR=0 +pushd sdk_regression_out + +if [ -z $1 ]; then + for model in ${model_list[@]} + do + echo "test $model" + model_runner \ + --input $MODEL_PATH/${model}_bs1_in_fp32.npz \ + --model $MODEL_PATH/${model}_bs1.cvimodel \ + --reference $MODEL_PATH/${model}_bs1_out_all.npz 2>&1 | tee $model.log + if [ "${PIPESTATUS[0]}" -ne "0" ]; then + echo "$model test FAILED" >> verdict.log + ERR=1 + else + echo "$model test PASSED" >> verdict.log + fi + done +else + model=$1 + count=$2 + echo "test $model" + model_runner \ + --input $MODEL_PATH/${model}_bs1_in_fp32.npz \ + --model $MODEL_PATH/${model}_bs1.cvimodel \ + --reference $MODEL_PATH/${model}_bs1_out_all.npz \ + --count ${count} 2>&1 | tee $model.log + if [ "${PIPESTATUS[0]}" -ne "0" ]; then + echo "$model test FAILED" >> verdict.log + ERR=1 + else + echo "$model test PASSED" >> verdict.log + fi +fi + +popd + +# VERDICT +if [ $ERR -eq 0 ]; then + echo $0 ALL TEST PASSED +else + echo $0 FAILED +fi + +exit $ERR diff --git a/cviruntime/scripts/regression_models_cv183x.sh b/cviruntime/scripts/regression_models_cv183x.sh new file mode 100755 index 000000000..41f3fce25 --- /dev/null +++ b/cviruntime/scripts/regression_models_cv183x.sh @@ -0,0 +1,131 @@ +#!/bin/bash +set -e + +if [[ -z "$MODEL_PATH" ]]; then + MODEL_PATH=$TPU_ROOT/../cvimodel_regression +fi +if [ ! -e $MODEL_PATH ]; then + echo "MODEL_PATH $MODEL_PATH does not exist" + echo "Please set MODEL_PATH to cvimodel_regression dir" + exit 1 +fi +export MODEL_PATH=$MODEL_PATH + +model_list=( + #################### + # Classification # + #################### + "resnet50" + "resnet18" + "mobilenet_v1" + "mobilenet_v2" + "squeezenet_v1.1" + "shufflenet_v2" + "googlenet" + "inception_v3" + "inception_v4" + "vgg16" + "densenet_121" + "densenet_201" + "senet_res50" + "resnext50" + "res2net50" + "ecanet50" + "efficientnet_b0" + "efficientnet-lite_b0" + "nasnet_mobile" + #################### + # Detection # + #################### + "retinaface_mnet25" + "retinaface_mnet25_600" + "retinaface_res50" + "ssd300" + "mobilenet_ssd" + "yolo_v1_448" + "yolo_v2_416" + "yolo_v2_1080" + "yolo_v3_320" + "yolo_v3_416" + "yolo_v3_tiny" + "yolo_v3_608" + "yolo_v3_spp" + "yolo_v4" + "yolo_v4_s" + "yolo_v5_s" + "yolox_s" + "faster_rcnn" + #################### + # Face Recog # + #################### + "arcface_res50" + #################### + # Pose # + #################### + "alphapose" + #################### + # SuperRes # + #################### + "espcn_3x" + #################### + # Segementation # + #################### + "unet" + "erfnet" + "enet" + # "icnet" ## ION size + # "fcn-8s" ## ION size +) + +# turn on PMU +export TPU_ENABLE_PMU=1 + +if [ ! 
-e sdk_regression_out ]; then + mkdir sdk_regression_out +fi + +ERR=0 +pushd sdk_regression_out + +if [ -z $1 ]; then + for model in ${model_list[@]} + do + echo "test $model" + model_runner \ + --input $MODEL_PATH/${model}_bs1_in_fp32.npz \ + --model $MODEL_PATH/${model}_bs1.cvimodel \ + --reference $MODEL_PATH/${model}_bs1_out_all.npz 2>&1 | tee $model.log + if [ "${PIPESTATUS[0]}" -ne "0" ]; then + echo "$model test FAILED" >> verdict.log + ERR=1 + else + echo "$model test PASSED" >> verdict.log + fi + done +else + model=$1 + count=$2 + echo "test $model" + model_runner \ + --input $MODEL_PATH/${model}_bs1_in_fp32.npz \ + --model $MODEL_PATH/${model}_bs1.cvimodel \ + --reference $MODEL_PATH/${model}_bs1_out_all.npz \ + --count ${count} 2>&1 | tee $model.log + if [ "${PIPESTATUS[0]}" -ne "0" ]; then + echo "$model test FAILED" >> verdict.log + ERR=1 + else + echo "$model test PASSED" >> verdict.log + fi +fi + +popd + +# VERDICT +if [ $ERR -eq 0 ]; then + echo $0 ALL TEST PASSED +else + echo $0 FAILED +fi + +exit $ERR diff --git a/cviruntime/scripts/regression_models_e2e_cv180x.sh b/cviruntime/scripts/regression_models_e2e_cv180x.sh new file mode 100755 index 000000000..6ff41e13c --- /dev/null +++ b/cviruntime/scripts/regression_models_e2e_cv180x.sh @@ -0,0 +1,73 @@ +set -e + +echo "cvimodels e2e testing for cv180x platform" + +if [[ -z "$MODEL_PATH" ]]; then + MODEL_PATH=$TPU_ROOT/../cvimodel_regression +fi +if [ ! -e $MODEL_PATH ]; then + echo "MODEL_PATH $MODEL_PATH does not exist" + echo "Please set MODEL_PATH to cvimodel_regression dir" + exit 1 +fi +export MODEL_PATH=$MODEL_PATH + +if [ -f "/sys/kernel/debug/ion/cvi_carveout_heap_dump/total_mem" ]; then + total_ion_size=$(cat /sys/kernel/debug/ion/cvi_carveout_heap_dump/total_mem) +else + # if ion size is unknown then execute basic tests. + total_ion_size=20000001 +fi + +# ION requirement >= 10 MB +if [ "$total_ion_size" -gt "10000000" ]; then +model_list="blazeface efficientnet-lite_b0 espcn_3x gaitset mobilenet_v1 mobilenet_v2 nasnet_mobile ppyolo_tiny retinaface_mnet25_600 retinaface_mnet25 shufflenet_v2 squeezenet_v1.0 squeezenet_v1.1" +fi + +# ION requirement >= 20 MB +if [ "$total_ion_size" -gt "20000000" ]; then +model_list="$model_list resnet18 googlenet efficientnet_b0" +fi + +if [ "$total_ion_size" -gt "35000000" ]; then +model_list="$model_list densenet_121 efficientdet_d0" +fi + +# turn off PMU +export TPU_ENABLE_PMU=0 + +if [ ! -e sdk_regression_out ]; then + mkdir sdk_regression_out +fi + +ERR=0 +cd sdk_regression_out + +if [ -z "$1" ]; then + for model in `echo $model_list` + do + echo "test $model" + model_runner \ + --input $MODEL_PATH/${model}_bs1_in_fp32.npz \ + --model $MODEL_PATH/${model}_bs1.cvimodel \ + --output ${model}_out.npz \ + --reference $MODEL_PATH/${model}_bs1_out_all.npz \ + --enable-timer \ + --count 100 2>&1 > $model.log + if [ "$?" -ne "0" ]; then + echo "$model test FAILED" >> verdict.log + ERR=1 + else + echo "$model test PASSED" >> verdict.log + fi + done +fi + +# VERDICT +if [ $ERR -eq 0 ]; then + echo $0 ALL TEST PASSED +else + echo $0 FAILED +fi + +exit $ERR diff --git a/cviruntime/scripts/regression_models_e2e_cv181x.sh b/cviruntime/scripts/regression_models_e2e_cv181x.sh new file mode 100755 index 000000000..919081c87 --- /dev/null +++ b/cviruntime/scripts/regression_models_e2e_cv181x.sh @@ -0,0 +1,77 @@ +set -e + +echo "cvimodels e2e testing for cv181x platform" + +if [[ -z "$MODEL_PATH" ]]; then + MODEL_PATH=$TPU_ROOT/../cvimodel_regression +fi +if [ ! 
-e $MODEL_PATH ]; then + echo "MODEL_PATH $MODEL_PATH does not exist" + echo "Please set MODEL_PATH to cvimodel_regression dir" + exit 1 +fi +export MODEL_PATH=$MODEL_PATH + +if [ -f "/sys/kernel/debug/ion/cvi_carveout_heap_dump/total_mem" ]; then + total_ion_size=$(cat /sys/kernel/debug/ion/cvi_carveout_heap_dump/total_mem) +else + # if ion size is unknown then execute basic tests. + total_ion_size=20000001 +fi + +# ION requirement >= 20 MB +if [ "$total_ion_size" -gt "6000000" ]; then +model_list="$model_list mobilenet_v1 mobilenet_v2 squeezenet_v1.1 shufflenet_v2 retinaface_mnet25 retinaface_mnet25_600" +fi + +if [ "$total_ion_size" -gt "10000000" ]; then +model_list="$model_list googlenet efficientnet-lite_b0 nasnet_mobile" +fi + +if [ "$total_ion_size" -gt "15000000" ]; then +model_list="$model_list densenet_121" +fi + +if [ "$total_ion_size" -gt "35000000" ]; then +model_list="$model_list densenet_201 senet_res50 resnext50 yolo_v1_448 inception_v3" +fi + + +# turn off PMU +export TPU_ENABLE_PMU=0 + +if [ ! -e sdk_regression_out ]; then + mkdir sdk_regression_out +fi + +ERR=0 +cd sdk_regression_out + +if [ -z "$1" ]; then + for model in `echo $model_list` + do + echo "test $model" + model_runner \ + --input $MODEL_PATH/${model}_bs1_in_fp32.npz \ + --model $MODEL_PATH/${model}_bs1.cvimodel \ + --output ${model}_out.npz \ + --reference $MODEL_PATH/${model}_bs1_out_all.npz \ + --enable-timer \ + --count 100 + if [ "$?" -ne "0" ]; then + echo "$model test FAILED" >> verdict.log + ERR=1 + else + echo "$model test PASSED" >> verdict.log + fi + done +fi + +# VERDICT +if [ $ERR -eq 0 ]; then + echo $0 ALL TEST PASSED +else + echo $0 FAILED +fi + +exit $ERR diff --git a/cviruntime/scripts/regression_models_e2e_cv182x.sh b/cviruntime/scripts/regression_models_e2e_cv182x.sh new file mode 100755 index 000000000..5b9d57d74 --- /dev/null +++ b/cviruntime/scripts/regression_models_e2e_cv182x.sh @@ -0,0 +1,140 @@ +#!/bin/bash +set -e + +echo "cvimodels e2e testing for cv182x platform" + +if [[ -z "$MODEL_PATH" ]]; then + MODEL_PATH=$TPU_ROOT/../cvimodel_regression +fi +if [ ! 
-e $MODEL_PATH ]; then + echo "MODEL_PATH $MODEL_PATH does not exist" + echo "Please set MODEL_PATH to cvimodel_regression dir" + exit 1 +fi +export MODEL_PATH=$MODEL_PATH + +model_list=( + #################### + # Classification # + #################### + "resnet50" + "resnet18" + "mobilenet_v1" + "mobilenet_v2" + "squeezenet_v1.1" + "shufflenet_v2" + "googlenet" + "inception_v3" + "densenet_121" + "densenet_201" + "senet_res50" + "resnext50" + # "res2net50" + "efficientnet-lite_b0" + "nasnet_mobile" + #################### + # Detection # + #################### + "retinaface_mnet25" + "retinaface_mnet25_600" + # "retinaface_res50" OOM: OUT OF MEMROY + "mobilenet_ssd" + "yolo_v1_448" + "yolo_v2_416" + "yolo_v3_320" + # "yolo_v3_416" ION NOT ENOUGH for 32bits system + "yolo_v3_tiny" + #################### + # Face Recog # + #################### + "arcface_res50" + #################### + # Pose # + #################### + # "alphapose" failed for DDR 2166Mhz + #################### +) + +model_list_batch=( + #################### + # Classification # + #################### + "resnet50" + "resnet18" + "mobilenet_v1" + "mobilenet_v2" + "squeezenet_v1.1" + "shufflenet_v2" + "googlenet" + "densenet_121" + "resnext50" + "efficientnet-lite_b0" + #################### + # Detection # + #################### + "retinaface_mnet25" + "mobilenet_ssd" + #################### + # Face Recog # + #################### + "arcface_res50" + #################### +) + +# turn off PMU +export TPU_ENABLE_PMU=0 + +if [ ! -e sdk_regression_out ]; then + mkdir sdk_regression_out +fi + +ERR=0 +pushd sdk_regression_out + +if [ -z $1 ]; then + for model in ${model_list[@]} + do + echo "test $model" + model_runner \ + --input $MODEL_PATH/${model}_bs1_in_fp32.npz \ + --model $MODEL_PATH/${model}_bs1.cvimodel \ + --output ${model}_out.npz \ + --reference $MODEL_PATH/${model}_bs1_out_all.npz \ + --enable-timer \ + --count 100 2>&1 | tee $model.log + if [ "${PIPESTATUS[0]}" -ne "0" ]; then + echo "$model test FAILED" >> verdict.log + ERR=1 + else + echo "$model test PASSED" >> verdict.log + fi + done +else + model=$1 + count=$2 + echo "test $model" + model_runner \ + --input $MODEL_PATH/${model}_bs1_in_fp32.npz \ + --model $MODEL_PATH/${model}_bs1.cvimodel \ + --output ${model}_out.npz \ + --reference $MODEL_PATH/${model}_bs1_out_all.npz \ + --enable-timer \ + --count ${count} 2>&1 | tee $model.log + if [ "${PIPESTATUS[0]}" -ne "0" ]; then + echo "$model test FAILED" >> verdict.log + ERR=1 + else + echo "$model test PASSED" >> verdict.log + fi +fi + +popd + +# VERDICT +if [ $ERR -eq 0 ]; then + echo $0 ALL TEST PASSED +else + echo $0 FAILED +fi + +exit $ERR diff --git a/cviruntime/scripts/regression_models_e2e_cv183x.sh b/cviruntime/scripts/regression_models_e2e_cv183x.sh new file mode 100755 index 000000000..8425cd02a --- /dev/null +++ b/cviruntime/scripts/regression_models_e2e_cv183x.sh @@ -0,0 +1,127 @@ +#!/bin/bash +set -e + +if [[ -z "$MODEL_PATH" ]]; then + MODEL_PATH=$TPU_ROOT/../cvimodel_regression +fi +if [ ! 
-e $MODEL_PATH ]; then + echo "MODEL_PATH $MODEL_PATH does not exist" + echo "Please set MODEL_PATH to cvimodel_regression dir" + exit 1 +fi +export MODEL_PATH=$MODEL_PATH + +model_list=( + #################### + # Classification # + #################### + "resnet50" + "resnet18" + "mobilenet_v1" + "mobilenet_v2" + "squeezenet_v1.1" + "shufflenet_v2" + "googlenet" + "inception_v3" + "inception_v4" + "vgg16" + "densenet_121" + "densenet_201" + "senet_res50" + # "resnext50" + "res2net50" + "ecanet50" + "efficientnet_b0" + "efficientnet-lite_b0" + "nasnet_mobile" + #################### + # Detection # + #################### + "retinaface_mnet25" + "retinaface_mnet25_600" + "retinaface_res50" + "ssd300" + "mobilenet_ssd" + "yolo_v1_448" + "yolo_v2_416" + "yolo_v2_1080" + "yolo_v3_320" + "yolo_v3_416" + "yolo_v3_tiny" + "yolo_v3_608" + "yolo_v3_spp" + #################### + # Face Recog # + #################### + "arcface_res50" + #################### + # Pose # + #################### + "alphapose" + #################### + # SuperRes # + #################### + "espcn_3x" + #################### + # Segementation # + #################### + "unet" +) + +# turn off PMU +export TPU_ENABLE_PMU=0 + +if [ ! -e sdk_regression_out ]; then + mkdir sdk_regression_out +fi + +ERR=0 +pushd sdk_regression_out + +if [ -z $1 ]; then + for model in ${model_list[@]} + do + echo "test $model" + model_runner \ + --input $MODEL_PATH/${model}_bs1_in_fp32.npz \ + --model $MODEL_PATH/${model}_bs1.cvimodel \ + --output ${model}_out.npz \ + --reference $MODEL_PATH/${model}_bs1_out_all.npz \ + --enable-timer \ + --count 100 2>&1 | tee $model.log + if [ "${PIPESTATUS[0]}" -ne "0" ]; then + echo "$model test FAILED" >> verdict.log + ERR=1 + else + echo "$model test PASSED" >> verdict.log + fi + done +else + model=$1 + count=$2 + echo "test $model" + model_runner \ + --input $MODEL_PATH/${model}_bs1_in_fp32.npz \ + --model $MODEL_PATH/${model}_bs1.cvimodel \ + --output ${model}_out.npz \ + --reference $MODEL_PATH/${model}_bs1_out_all.npz \ + --enable-timer \ + --count ${count} 2>&1 | tee $model.log + if [ "${PIPESTATUS[0]}" -ne "0" ]; then + echo "$model test FAILED" >> verdict.log + ERR=1 + else + echo "$model test PASSED" >> verdict.log + fi +fi + +popd + +# VERDICT +if [ $ERR -eq 0 ]; then + echo $0 ALL TEST PASSED +else + echo $0 FAILED +fi + +exit $ERR diff --git a/cviruntime/scripts/regression_new_models_cv180x.sh b/cviruntime/scripts/regression_new_models_cv180x.sh new file mode 100644 index 000000000..f6013d943 --- /dev/null +++ b/cviruntime/scripts/regression_new_models_cv180x.sh @@ -0,0 +1,70 @@ +set -e + +echo "new cvimodels regression for cv180x platform" + +if [[ -z "$MODEL_PATH" ]]; then + MODEL_PATH=$TPU_ROOT/../cvimodel_regression_int8_cv180x +fi +if [ ! -e $MODEL_PATH ]; then + echo "MODEL_PATH $MODEL_PATH does not exist" + echo "Please set MODEL_PATH to cvimodel_regression dir" + exit 1 +fi +export MODEL_PATH=$MODEL_PATH + +if [ -f "/sys/kernel/debug/ion/cvi_carveout_heap_dump/total_mem" ]; then + total_ion_size=$(cat /sys/kernel/debug/ion/cvi_carveout_heap_dump/total_mem) +else + # if ion size is unknown then execute basic tests. 
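+  # 20000001 clears both the 10 MB and 20 MB gates below, so the two smaller model lists run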
+ total_ion_size=20000001 +fi + +# ION requirement >= 10 MB +if [ "$total_ion_size" -gt "10000000" ]; then +model_list="blazeface espcn_3x mobilenet_v2_cf nasnet_mobile shufflenet_v2 squeezenet_v1.1_cf " +fi + +# ION requirement >= 20 MB +if [ "$total_ion_size" -gt "20000000" ]; then +model_list="$model_list resnet18_v1 googlenet_cf" +fi + +if [ "$total_ion_size" -gt "35000000" ]; then +model_list="$model_list densenet121-12 efficientdet-d0" +fi + +# turn on PMU +export TPU_ENABLE_PMU=1 + +if [ ! -e sdk_regression_out ]; then + mkdir sdk_regression_out +fi + +ERR=0 +cd sdk_regression_out + +if [ -z $1 ]; then + for model in ${model_list} + do + echo "test $model" + model_runner \ + --input $MODEL_PATH/${model}_in_f32.npz \ + --model $MODEL_PATH/${model}_bs1.cvimodel \ + --reference $MODEL_PATH/${model}_bs1_out_all.npz 2>&1 | tee $model.log + if [ "$?" -ne "0" ]; then + echo "$model test FAILED" >> verdict.log + ERR=1 + else + echo "$model test PASSED" >> verdict.log + fi + done +fi + +# VERDICT +if [ $ERR -eq 0 ]; then + echo $0 ALL TEST PASSED +else + echo $0 FAILED +fi + +exit $ERR diff --git a/cviruntime/scripts/regression_new_models_cv181x.sh b/cviruntime/scripts/regression_new_models_cv181x.sh new file mode 100644 index 000000000..6cb038025 --- /dev/null +++ b/cviruntime/scripts/regression_new_models_cv181x.sh @@ -0,0 +1,69 @@ +set -e + +echo "cvimodels regression for cv181x platform" + +if [[ -z "$MODEL_PATH" ]]; then + MODEL_PATH=$TPU_ROOT/../cvimodel_regression_int8_cv181x +fi +if [ ! -e $MODEL_PATH ]; then + echo "MODEL_PATH $MODEL_PATH does not exist" + echo "Please set MODEL_PATH to cvimodel_regression dir" + exit 1 +fi +export MODEL_PATH=$MODEL_PATH + +if [ -f "/sys/kernel/debug/ion/cvi_carveout_heap_dump/total_mem" ]; then + total_ion_size=$(cat /sys/kernel/debug/ion/cvi_carveout_heap_dump/total_mem) +else + # if ion size is unknown then execute basic tests. + total_ion_size=20000001 +fi + +# ION requirement >= 20 MB +if [ "$total_ion_size" -gt "20000000" ]; then +model_list="resnet18_v1 mobilenet_v2_cf squeezenet_v1.1_cf shufflenet_v2 googlenet_cf densenet121-12 nasnet_mobile blazeface retinaface_mnet_with_det mobilenetv2_ssd_cf yolov3_tiny yolov8n" +fi + +if [ "$total_ion_size" -gt "35000000" ]; then +model_list="$model_list densenet201 se-resnet50 resnext50_cf efficientdet-d0 pp_yolox yolov5s inception_v3 resnet50_v1" +fi + +if [ "$total_ion_size" -gt "45000000" ]; then +model_list="$model_list arcface_res50 alphapose_res50 retinaface pp_yoloe_m" +fi + +# turn on PMU +export TPU_ENABLE_PMU=1 + +if [ ! -e sdk_regression_out ]; then + mkdir sdk_regression_out +fi + +ERR=0 +cd sdk_regression_out + +if [ -z $1 ]; then + for model in ${model_list} + do + echo "test $model" + model_runner \ + --input $MODEL_PATH/${model}_in_f32.npz \ + --model $MODEL_PATH/${model}_bs1.cvimodel \ + --reference $MODEL_PATH/${model}_bs1_out_all.npz 2>&1 | tee $model.log + if [ "$?" 
-ne "0" ]; then + echo "$model test FAILED" >> verdict.log + ERR=1 + else + echo "$model test PASSED" >> verdict.log + fi + done +fi + +# VERDICT +if [ $ERR -eq 0 ]; then + echo $0 ALL TEST PASSED +else + echo $0 FAILED +fi + +exit $ERR diff --git a/cviruntime/scripts/regression_new_models_cv182x.sh b/cviruntime/scripts/regression_new_models_cv182x.sh new file mode 100644 index 000000000..bf15f4a45 --- /dev/null +++ b/cviruntime/scripts/regression_new_models_cv182x.sh @@ -0,0 +1,115 @@ +#!/bin/bash +set -e + +echo "new cvimodels regression for cv182x platform" + +if [[ -z "$MODEL_PATH" ]]; then + MODEL_PATH=$TPU_ROOT/../cvimodel_regression_int8_cv182x +fi +if [ ! -e $MODEL_PATH ]; then + echo "MODEL_PATH $MODEL_PATH does not exist" + echo "Please set MODEL_PATH to cvimodel_regression dir" + exit 1 +fi +export MODEL_PATH=$MODEL_PATH + +model_list=( + #################### + # Classification # + #################### + "resnet50_v1" + "resnet18_v1" + "mobilenet_v2_cf" + "squeezenet_v1.1_cf" + "shufflenet_v2" + "googlenet_cf" + "inception_v3" + "densenet121-12" + "densenet201" + "se-resnet50" + "resnext50_cf" + "efficientnet" + "nasnet_mobile" + #################### + # Detection # + #################### + "retinaface" + "retinaface_mnet_with_det" + "blazeface" + "mobilenetv2_ssd_cf" + "efficientdet-d0" + "yolov3_416_with_det" + "yolov3_tiny" + "yolov4s" + "yolov5s" + "yolox_s" + "yolov8n" + "pp_yoloe_m" + "pp_yolox" + "" + #################### + # Face Recog # + #################### + "arcface_res50" + #################### + # Super Res # + #################### + "espcn_3x" + #################### + # Pose # + #################### + "alphapose_res50" +) + +# turn on PMU +export TPU_ENABLE_PMU=1 + +if [ ! -e sdk_regression_out ]; then + mkdir sdk_regression_out +fi + +ERR=0 +pushd sdk_regression_out + +if [ -z $1 ]; then + for model in ${model_list[@]} + do + echo "test $model" + model_runner \ + --input $MODEL_PATH/${model}_in_f32.npz \ + --model $MODEL_PATH/${model}_bs1.cvimodel \ + --reference $MODEL_PATH/${model}_bs1_out_all.npz 2>&1 | tee $model.log + if [ "${PIPESTATUS[0]}" -ne "0" ]; then + echo "$model test FAILED" >> verdict.log + ERR=1 + else + echo "$model test PASSED" >> verdict.log + fi + done +else + model=$1 + count=$2 + echo "test $model" + model_runner \ + --input $MODEL_PATH/${model}_in_f32.npz \ + --model $MODEL_PATH/${model}_bs1.cvimodel \ + --reference $MODEL_PATH/${model}_bs1_out_all.npz \ + --count ${count} 2>&1 | tee $model.log + if [ "${PIPESTATUS[0]}" -ne "0" ]; then + echo "$model test FAILED" >> verdict.log + ERR=1 + else + echo "$model test PASSED" >> verdict.log + fi +fi + +popd + +# VERDICT +if [ $ERR -eq 0 ]; then + echo $0 ALL TEST PASSED +else + echo $0 FAILED +fi + +exit $ERR diff --git a/cviruntime/scripts/regression_new_models_cv183x.sh b/cviruntime/scripts/regression_new_models_cv183x.sh new file mode 100644 index 000000000..9f9fd9eb1 --- /dev/null +++ b/cviruntime/scripts/regression_new_models_cv183x.sh @@ -0,0 +1,125 @@ +#!/bin/bash +set -e + +if [[ -z "$MODEL_PATH" ]]; then + MODEL_PATH=$TPU_ROOT/../cvimodel_regression_int8_cv183x +fi +if [ ! 
diff --git a/cviruntime/scripts/regression_new_models_cv183x.sh b/cviruntime/scripts/regression_new_models_cv183x.sh
new file mode 100644
index 000000000..9f9fd9eb1
--- /dev/null
+++ b/cviruntime/scripts/regression_new_models_cv183x.sh
@@ -0,0 +1,125 @@
+#!/bin/bash
+set -e
+
+if [[ -z "$MODEL_PATH" ]]; then
+    MODEL_PATH=$TPU_ROOT/../cvimodel_regression_int8_cv183x
+fi
+if [ ! -e $MODEL_PATH ]; then
+    echo "MODEL_PATH $MODEL_PATH does not exist"
+    echo "Please set MODEL_PATH to cvimodel_regression dir"
+    exit 1
+fi
+export MODEL_PATH=$MODEL_PATH
+
+model_list=(
+    ####################
+    #  Classification  #
+    ####################
+    "resnet50_v1"
+    "resnet18_v1"
+    "mobilenet_v2_cf"
+    "squeezenet_v1.1_cf"
+    "shufflenet_v2"
+    "googlenet_cf"
+    "inception_v3"
+    "inception_v4_cf"
+    "densenet121-12"
+    "densenet201"
+    "se-resnet50"
+    "resnext50_cf"
+    "res2net50"
+    "ecanet50"
+    "efficientnet"
+    "nasnet_mobile"
+    ####################
+    #    Detection     #
+    ####################
+    "retinaface"
+    "retinaface_mnet_with_det"
+    "blazeface"
+    "ssd300_cf"
+    "mobilenetv2_ssd_cf"
+    "efficientdet-d0"
+    "yolov3_416_with_det"
+    "yolov3_tiny"
+    "yolov3_spp_cf"
+    "yolov4_cf"
+    "yolov4s"
+    "yolov5s"
+    "yolox_s"
+    "yolov7"
+    "yolov8n"
+    "pp_yoloe_m"
+    "pp_yolov3"
+    "pp_yolox"
+    ""
+    ####################
+    #    Face Recog    #
+    ####################
+    "arcface_res50"
+    ####################
+    #    Super Res     #
+    ####################
+    "espcn_3x"
+    ####################
+    #       Pose       #
+    ####################
+    "alphapose_res50"
+    ####################
+    #   Segmentation   #
+    ####################
+    "unet"
+)
+
+# turn on PMU
+export TPU_ENABLE_PMU=1
+
+if [ ! -e sdk_regression_out ]; then
+    mkdir sdk_regression_out
+fi
+
+ERR=0
+pushd sdk_regression_out
+
+if [ -z "$1" ]; then
+    for model in ${model_list[@]}
+    do
+        echo "test $model"
+        model_runner \
+            --input $MODEL_PATH/${model}_in_f32.npz \
+            --model $MODEL_PATH/${model}_bs1.cvimodel \
+            --reference $MODEL_PATH/${model}_bs1_out_all.npz 2>&1 | tee $model.log
+        if [ "${PIPESTATUS[0]}" -ne "0" ]; then
+            echo "$model test FAILED" >> verdict.log
+            ERR=1
+        else
+            echo "$model test PASSED" >> verdict.log
+        fi
+    done
+else
+    model=$1
+    count=$2
+    echo "test $model"
+    model_runner \
+        --input $MODEL_PATH/${model}_in_f32.npz \
+        --model $MODEL_PATH/${model}_bs1.cvimodel \
+        --reference $MODEL_PATH/${model}_bs1_out_all.npz \
+        --count ${count} 2>&1 | tee $model.log
+    if [ "${PIPESTATUS[0]}" -ne "0" ]; then
+        echo "$model test FAILED" >> verdict.log
+        ERR=1
+    else
+        echo "$model test PASSED" >> verdict.log
+    fi
+fi
+
+popd
+
+# VERDICT
+if [ $ERR -eq 0 ]; then
+    echo $0 ALL TEST PASSED
+else
+    echo $0 FAILED
+fi
+
+exit $ERR
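The ION-gated scripts above size their model lists from the carveout heap reported in debugfs, falling back to a just-over-20 MB default when the node is absent. The same node can be inspected by hand before a run; mounting debugfs first is an assumption about the target image, not something the scripts do themselves:

    # The node the regression scripts read; the value is in bytes.
    mount -t debugfs none /sys/kernel/debug 2>/dev/null || true
    cat /sys/kernel/debug/ion/cvi_carveout_heap_dump/total_mem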
-ne "0" ]; then + echo "$samples FAILED" + return 1 + fi + done + return 0 +} + +cd $TPU_ROOT/samples +# test samples +test_sample ${samples_list} +if [ $? -ne 0 ]; then + echo "test samples failed !!" + exit 1 +fi + +cd $TPU_ROOT/samples/samples_extra +# test samples_extra +test_sample ${samples_extra_list} +if [ $? -ne 0 ]; then + echo "test samples failed !!" + exit 1 +fi + +exit 0 diff --git a/cviruntime/scripts/regression_samples_cv181x.sh b/cviruntime/scripts/regression_samples_cv181x.sh new file mode 100755 index 000000000..e8eaa37e2 --- /dev/null +++ b/cviruntime/scripts/regression_samples_cv181x.sh @@ -0,0 +1,61 @@ +set -ex + +echo "tpu samples regression for cv181x platform" + +if [ -z "$MODEL_PATH" ] || [ ! -e $MODEL_PATH ]; then + echo "MODEL_PATH $MODEL_PATH does not exist" + echo "Please set MODEL_PATH to cvimodel_regression dir" + exit 1 +fi + +if [ -z "$TPU_ROOT" ] || [ ! -e $TPU_ROOT ]; then + echo "TPU_ROOT $TPU_ROOT does not exist" + echo "Please set TPU_ROOT to cvitek_tpu_sdk dir" + exit 1 +fi + +#export TPU_ENABLE_PMU=1 + +samples_list="run_classifier.sh run_classifier_fused_preprocess.sh" +#"run_classifier_multi_batch.sh" + +samples_extra_list="run_insightface_fused_preprocess.sh" + #"run_alphapose.sh" + #"run_alphapose_fused_preprocess.sh" + #"run_classifier_yuv420.sh" + #"run_detector_yolov3_fused_preprocess.sh" + #"run_detector_yolov3.sh" + #"run_detector_yolov5_fused_preprocess.sh" + #"run_detector_yolov5.sh" + #"run_detector_yolox_s.sh" + +function test_sample() { + for sample in $* + do + echo $sample + sh -ex ./$sample + if [ "$?" -ne "0" ]; then + echo "$samples FAILED" + return 1 + fi + done + return 0 +} + +cd $TPU_ROOT/samples +# test samples +test_sample ${samples_list} +if [ $? -ne 0 ]; then + echo "test samples failed !!" + exit 1 +fi + +cd $TPU_ROOT/samples/samples_extra +# test samples_extra +test_sample ${samples_extra_list} +if [ $? -ne 0 ]; then + echo "test samples failed !!" + exit 1 +fi + +exit 0 diff --git a/cviruntime/scripts/regression_samples_cv182x.sh b/cviruntime/scripts/regression_samples_cv182x.sh new file mode 100755 index 000000000..1ad9fb4b2 --- /dev/null +++ b/cviruntime/scripts/regression_samples_cv182x.sh @@ -0,0 +1,60 @@ +#!/bin/bash +set -ex + +echo "tpu samples regression for cv183x platform" + +if [ -z "$MODEL_PATH" ] || [ ! -e $MODEL_PATH ]; then + echo "MODEL_PATH $MODEL_PATH does not exist" + echo "Please set MODEL_PATH to cvimodel_regression dir" + exit 1 +fi + +if [ -z "$TPU_ROOT" ] || [ ! -e $TPU_ROOT ]; then + echo "TPU_ROOT $TPU_ROOT does not exist" + echo "Please set TPU_ROOT to cvitek_tpu_sdk dir" + exit 1 +fi + +#export TPU_ENABLE_PMU=1 + +samples_list=( + "run_classifier.sh" + "run_classifier_fused_preprocess.sh" + #"run_classifier_multi_batch.sh" +) + +samples_extra_list=( + #"run_alphapose.sh" + "run_alphapose_fused_preprocess.sh" + #"run_classifier_yuv420.sh" + "run_detector_yolov3_fused_preprocess.sh" + #"run_detector_yolov3.sh" + "run_detector_yolov5_fused_preprocess.sh" + #"run_detector_yolov5.sh" + #"run_detector_yolox_s.sh" + "run_insightface_fused_preprocess.sh" +) + +function test_sample() { + for sample in $* + do + echo $sample + sh -ex ./$sample + test $? -ne 0 && echo "$sample failed !!" && return 1 + done + return 0 +} + +pushd $TPU_ROOT/samples +# test samples +test_sample ${samples_list[@]} +test $? -ne 0 && echo "test samples failed !!" && popd && exit 1 +popd + +pushd $TPU_ROOT/samples/samples_extra +# test samples_extra +test_sample ${samples_extra_list[@]} +test $? 
-ne 0 && echo "test samples extra failed !!" && popd && exit 1 +popd + +exit 0 diff --git a/cviruntime/scripts/regression_samples_cv183x.sh b/cviruntime/scripts/regression_samples_cv183x.sh new file mode 100755 index 000000000..1ad9fb4b2 --- /dev/null +++ b/cviruntime/scripts/regression_samples_cv183x.sh @@ -0,0 +1,60 @@ +#!/bin/bash +set -ex + +echo "tpu samples regression for cv183x platform" + +if [ -z "$MODEL_PATH" ] || [ ! -e $MODEL_PATH ]; then + echo "MODEL_PATH $MODEL_PATH does not exist" + echo "Please set MODEL_PATH to cvimodel_regression dir" + exit 1 +fi + +if [ -z "$TPU_ROOT" ] || [ ! -e $TPU_ROOT ]; then + echo "TPU_ROOT $TPU_ROOT does not exist" + echo "Please set TPU_ROOT to cvitek_tpu_sdk dir" + exit 1 +fi + +#export TPU_ENABLE_PMU=1 + +samples_list=( + "run_classifier.sh" + "run_classifier_fused_preprocess.sh" + #"run_classifier_multi_batch.sh" +) + +samples_extra_list=( + #"run_alphapose.sh" + "run_alphapose_fused_preprocess.sh" + #"run_classifier_yuv420.sh" + "run_detector_yolov3_fused_preprocess.sh" + #"run_detector_yolov3.sh" + "run_detector_yolov5_fused_preprocess.sh" + #"run_detector_yolov5.sh" + #"run_detector_yolox_s.sh" + "run_insightface_fused_preprocess.sh" +) + +function test_sample() { + for sample in $* + do + echo $sample + sh -ex ./$sample + test $? -ne 0 && echo "$sample failed !!" && return 1 + done + return 0 +} + +pushd $TPU_ROOT/samples +# test samples +test_sample ${samples_list[@]} +test $? -ne 0 && echo "test samples failed !!" && popd && exit 1 +popd + +pushd $TPU_ROOT/samples/samples_extra +# test samples_extra +test_sample ${samples_extra_list[@]} +test $? -ne 0 && echo "test samples extra failed !!" && popd && exit 1 +popd + +exit 0 diff --git a/cviruntime/scripts/toolchain-aarch64-linux.cmake b/cviruntime/scripts/toolchain-aarch64-linux.cmake new file mode 100644 index 000000000..a7ed617fc --- /dev/null +++ b/cviruntime/scripts/toolchain-aarch64-linux.cmake @@ -0,0 +1,44 @@ +include(CMakeForceCompiler) + +# The Generic system name is used for embedded targets (targets without OS) in +# CMake +set( CMAKE_SYSTEM_NAME Linux ) +set( CMAKE_SYSTEM_PROCESSOR aarch64 ) +set( ARCH aarch64 ) +set( CROSS_COMPILE aarch64-linux-gnu-) + +set(CMAKE_C_COMPILER ${CROSS_COMPILE}gcc) +set(CMAKE_CXX_COMPILER ${CROSS_COMPILE}g++) + +message(STATUS "CMAKE_C_COMPILER: ${CMAKE_C_COMPILER}") +message(STATUS "CMAKE_CXX_COMPILER: ${CMAKE_CXX_COMPILER}") + +# To build the tests, we need to set where the target environment containing +# the required library is. On Debian-like systems, this is +# /usr/aarch64-linux-gnu. +SET(CMAKE_FIND_ROOT_PATH ${AARCH64_SYSROOT_PATH}) +# search for programs in the build host directories +SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +# for libraries and headers in the target directories +SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) + +# We must set the OBJCOPY setting into cache so that it's available to the +# whole project. Otherwise, this does not get set into the CACHE and therefore +# the build doesn't know what the OBJCOPY filepath is +set(CMAKE_OBJCOPY ${CROSS_COMPILE}objcopy + CACHE FILEPATH "The toolchain objcopy command " FORCE ) + +# Set the CMAKE C flags (which should also be used by the assembler! 
diff --git a/cviruntime/scripts/toolchain-aarch64-linux.cmake b/cviruntime/scripts/toolchain-aarch64-linux.cmake
new file mode 100644
index 000000000..a7ed617fc
--- /dev/null
+++ b/cviruntime/scripts/toolchain-aarch64-linux.cmake
@@ -0,0 +1,44 @@
+include(CMakeForceCompiler)
+
+# The Generic system name is used for embedded targets (targets without OS) in
+# CMake
+set( CMAKE_SYSTEM_NAME Linux )
+set( CMAKE_SYSTEM_PROCESSOR aarch64 )
+set( ARCH aarch64 )
+set( CROSS_COMPILE aarch64-linux-gnu-)
+
+set(CMAKE_C_COMPILER ${CROSS_COMPILE}gcc)
+set(CMAKE_CXX_COMPILER ${CROSS_COMPILE}g++)
+
+message(STATUS "CMAKE_C_COMPILER: ${CMAKE_C_COMPILER}")
+message(STATUS "CMAKE_CXX_COMPILER: ${CMAKE_CXX_COMPILER}")
+
+# To build the tests, we need to set where the target environment containing
+# the required library is. On Debian-like systems, this is
+# /usr/aarch64-linux-gnu.
+SET(CMAKE_FIND_ROOT_PATH ${AARCH64_SYSROOT_PATH})
+# search for programs in the build host directories
+SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+# for libraries and headers in the target directories
+SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
+# We must set the OBJCOPY setting into cache so that it's available to the
+# whole project. Otherwise, this does not get set into the CACHE and therefore
+# the build doesn't know what the OBJCOPY filepath is
+set(CMAKE_OBJCOPY ${CROSS_COMPILE}objcopy
+    CACHE FILEPATH "The toolchain objcopy command " FORCE )
+
+# Set the CMAKE C flags (which should also be used by the assembler!)
+set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Os -std=gnu11" )
+set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a" )
+set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffunction-sections" )
+set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fdata-sections" )
+set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-pointer-to-int-cast" )
+set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-missing-field-initializers" )
+
+set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "" )
+set( CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "" )
+
+set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-field-initializers" )
+set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses" )
diff --git a/cviruntime/scripts/toolchain-linux-gnueabihf.cmake b/cviruntime/scripts/toolchain-linux-gnueabihf.cmake
new file mode 100644
index 000000000..fd52eea92
--- /dev/null
+++ b/cviruntime/scripts/toolchain-linux-gnueabihf.cmake
@@ -0,0 +1,40 @@
+include(CMakeForceCompiler)
+
+# The Generic system name is used for embedded targets (targets without OS) in
+# CMake
+set( CMAKE_SYSTEM_NAME Linux )
+set( CMAKE_SYSTEM_PROCESSOR arm )
+set( ARCH arm )
+set( CROSS_COMPILE arm-linux-gnueabihf-)
+
+set(CMAKE_C_COMPILER ${CROSS_COMPILE}gcc)
+set(CMAKE_CXX_COMPILER ${CROSS_COMPILE}g++)
+
+message(STATUS "CMAKE_C_COMPILER: ${CMAKE_C_COMPILER}")
+message(STATUS "CMAKE_CXX_COMPILER: ${CMAKE_CXX_COMPILER}")
+
+# To build the tests, we need to set where the target environment containing
+# the required library is. On Debian-like systems, this is
+# /usr/arm-linux-gnueabihf.
+SET(CMAKE_FIND_ROOT_PATH ${ARM_SYSROOT_PATH})
+# search for programs in the build host directories
+SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+# for libraries and headers in the target directories
+SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
+# We must set the OBJCOPY setting into cache so that it's available to the
+# whole project. Otherwise, this does not get set into the CACHE and therefore
+# the build doesn't know what the OBJCOPY filepath is
+set(CMAKE_OBJCOPY ${CROSS_COMPILE}objcopy
+    CACHE FILEPATH "The toolchain objcopy command " FORCE )
+
+set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "" )
+set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" CACHE STRING "" )
+set( CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "" )
+
+set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv7-a -mfpu=neon" )
+set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfloat-abi=hard -marm" )
+
+set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv7-a -mfpu=neon" )
+set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=hard -marm" )
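Both ARM files assume their ${CROSS_COMPILE} prefix resolves on PATH; a quick sanity check before configuring (the expected triplet follows directly from the prefix):

    # Should print the target triplet if the hard-float cross toolchain is installed.
    arm-linux-gnueabihf-gcc -dumpmachine      # expect: arm-linux-gnueabihf
    arm-linux-gnueabihf-gcc --version | head -n 1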
diff --git a/cviruntime/scripts/toolchain-linux-uclibc.cmake b/cviruntime/scripts/toolchain-linux-uclibc.cmake
new file mode 100644
index 000000000..7bb6e3670
--- /dev/null
+++ b/cviruntime/scripts/toolchain-linux-uclibc.cmake
@@ -0,0 +1,39 @@
+include(CMakeForceCompiler)
+
+# The Generic system name is used for embedded targets (targets without OS) in
+# CMake
+set( CMAKE_SYSTEM_NAME Linux )
+set( CMAKE_SYSTEM_PROCESSOR arm )
+set( ARCH arm )
+set( CROSS_COMPILE arm-cvitek-linux-uclibcgnueabihf-)
+
+set(CMAKE_C_COMPILER ${CROSS_COMPILE}gcc)
+set(CMAKE_CXX_COMPILER ${CROSS_COMPILE}g++)
+
+message(STATUS "CMAKE_C_COMPILER: ${CMAKE_C_COMPILER}")
+message(STATUS "CMAKE_CXX_COMPILER: ${CMAKE_CXX_COMPILER}")
+
+# To build the tests, we need to set where the target environment containing
+# the required library is.
+SET(CMAKE_FIND_ROOT_PATH ${ARM_SYSROOT_PATH})
+# search for programs in the build host directories
+SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+# for libraries and headers in the target directories
+SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
+# We must set the OBJCOPY setting into cache so that it's available to the
+# whole project. Otherwise, this does not get set into the CACHE and therefore
+# the build doesn't know what the OBJCOPY filepath is
+set(CMAKE_OBJCOPY ${CROSS_COMPILE}objcopy
+    CACHE FILEPATH "The toolchain objcopy command " FORCE )
+
+set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "" )
+set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" CACHE STRING "" )
+set( CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "" )
+
+set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv7-a -mfpu=neon" )
+set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfloat-abi=hard -marm" )
+
+set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv7-a -mfpu=neon" )
+set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=hard -marm" )
diff --git a/cviruntime/scripts/toolchain-riscv64-linux-musl-x86_64.cmake b/cviruntime/scripts/toolchain-riscv64-linux-musl-x86_64.cmake
new file mode 100644
index 000000000..9277904ea
--- /dev/null
+++ b/cviruntime/scripts/toolchain-riscv64-linux-musl-x86_64.cmake
@@ -0,0 +1,39 @@
+include(CMakeForceCompiler)
+
+# The Generic system name is used for embedded targets (targets without OS) in
+# CMake
+set( CMAKE_SYSTEM_NAME Linux )
+set( CMAKE_SYSTEM_PROCESSOR riscv )
+set( ARCH riscv )
+set( CROSS_COMPILE riscv64-unknown-linux-musl- )
+
+set(CMAKE_C_COMPILER ${CROSS_COMPILE}gcc)
+set(CMAKE_CXX_COMPILER ${CROSS_COMPILE}g++)
+
+message(STATUS "CMAKE_C_COMPILER: ${CMAKE_C_COMPILER}")
+message(STATUS "CMAKE_CXX_COMPILER: ${CMAKE_CXX_COMPILER}")
+
+# To build the tests, we need to set where the target environment containing
+# the required library is. On Debian-like systems, this is
+# /usr/riscv64-unknown-linux-gnu-.
+SET(CMAKE_FIND_ROOT_PATH ${ARM_SYSROOT_PATH})
+# search for programs in the build host directories
+SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+# for libraries and headers in the target directories
+SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
+# We must set the OBJCOPY setting into cache so that it's available to the
+# whole project. 
Otherwise, this does not get set into the CACHE and therefore +# the build doesn't know what the OBJCOPY filepath is +set(CMAKE_OBJCOPY ${CROSS_COMPILE}objcopy + CACHE FILEPATH "The toolchain objcopy command " FORCE ) + +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "" ) +set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" CACHE STRING "" ) +set( CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "" ) + +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=c906fdv" ) +set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=c906fdv" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=rv64gcv0p7_zfh_xthead -mabi=lp64d" ) +set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gcv0p7_zfh_xthead -mabi=lp64d" ) diff --git a/cviruntime/scripts/toolchain-riscv64-linux-musl-x86_64_legacy.cmake b/cviruntime/scripts/toolchain-riscv64-linux-musl-x86_64_legacy.cmake new file mode 100644 index 000000000..506c42e77 --- /dev/null +++ b/cviruntime/scripts/toolchain-riscv64-linux-musl-x86_64_legacy.cmake @@ -0,0 +1,39 @@ +include(CMakeForceCompiler) + +# The Generic system name is used for embedded targets (targets without OS) in +# CMake +set( CMAKE_SYSTEM_NAME Linux ) +set( CMAKE_SYSTEM_PROCESSOR riscv ) +set( ARCH riscv ) +set( CROSS_COMPILE riscv64-unknown-linux-musl- ) + +set(CMAKE_C_COMPILER ${CROSS_COMPILE}gcc) +set(CMAKE_CXX_COMPILER ${CROSS_COMPILE}g++) + +message(STATUS "CMAKE_C_COMPILER: ${CMAKE_C_COMPILER}") +message(STATUS "CMAKE_CXX_COMPILER: ${CMAKE_CXX_COMPILER}") + +# To build the tests, we need to set where the target environment containing +# the required library is. On Debian-like systems, this is +# /usr/riscv64-unknown-linux-gnu-. +SET(CMAKE_FIND_ROOT_PATH ${ARM_SYSROOT_PATH}) +# search for programs in the build host directories +SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +# for libraries and headers in the target directories +SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) + +# We must set the OBJCOPY setting into cache so that it's available to the +# whole project. Otherwise, this does not get set into the CACHE and therefore +# the build doesn't know what the OBJCOPY filepath is +set(CMAKE_OBJCOPY ${CROSS_COMPILE}objcopy + CACHE FILEPATH "The toolchain objcopy command " FORCE ) + +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "" ) +set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" CACHE STRING "" ) +set( CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "" ) + +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=c906" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=rv64gcv_zfh_xthead" ) +set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=c906" ) +set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gcv_zfh_xthead" ) diff --git a/cviruntime/scripts/toolchain-riscv64-linux-x86_64.cmake b/cviruntime/scripts/toolchain-riscv64-linux-x86_64.cmake new file mode 100644 index 000000000..717461455 --- /dev/null +++ b/cviruntime/scripts/toolchain-riscv64-linux-x86_64.cmake @@ -0,0 +1,39 @@ +include(CMakeForceCompiler) + +# The Generic system name is used for embedded targets (targets without OS) in +# CMake +set( CMAKE_SYSTEM_NAME Linux ) +set( CMAKE_SYSTEM_PROCESSOR riscv ) +set( ARCH riscv ) +set( CROSS_COMPILE riscv64-unknown-linux-gnu- ) + +set(CMAKE_C_COMPILER ${CROSS_COMPILE}gcc) +set(CMAKE_CXX_COMPILER ${CROSS_COMPILE}g++) + +message(STATUS "CMAKE_C_COMPILER: ${CMAKE_C_COMPILER}") +message(STATUS "CMAKE_CXX_COMPILER: ${CMAKE_CXX_COMPILER}") + +# To build the tests, we need to set where the target environment containing +# the required library is. 
On Debian-like systems, this is
+# /usr/riscv64-unknown-linux-gnu-.
+SET(CMAKE_FIND_ROOT_PATH ${ARM_SYSROOT_PATH})
+# search for programs in the build host directories
+SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+# for libraries and headers in the target directories
+SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
+# We must set the OBJCOPY setting into cache so that it's available to the
+# whole project. Otherwise, this does not get set into the CACHE and therefore
+# the build doesn't know what the OBJCOPY filepath is
+set(CMAKE_OBJCOPY ${CROSS_COMPILE}objcopy
+    CACHE FILEPATH "The toolchain objcopy command " FORCE )
+
+set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "" )
+set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" CACHE STRING "" )
+set( CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "" )
+
+set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=c906fdv" )
+set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=c906fdv" )
+set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=rv64gcv0p7_zfh_xthead -mabi=lp64d" )
+set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gcv0p7_zfh_xthead -mabi=lp64d" )
diff --git a/cviruntime/scripts/toolchain.cmake b/cviruntime/scripts/toolchain.cmake
new file mode 100644
index 000000000..6148ce08a
--- /dev/null
+++ b/cviruntime/scripts/toolchain.cmake
@@ -0,0 +1,83 @@
+include(CMakeForceCompiler)
+include($ENV{TOP_DIR}/build/config.cmake)
+# usage
+# cmake -DCMAKE_TOOLCHAIN_FILE=../toolchain-arm-linux.cmake ../
+# The Generic system name is used for embedded targets (targets without OS) in
+# CMake
+set( CMAKE_SYSTEM_NAME Linux )
+set( CMAKE_SYSTEM_PROCESSOR ${CONFIG_ARCH} )
+
+# The toolchain prefix for all toolchain executables
+set( CROSS_COMPILE ${CONFIG_CROSS_COMPILE_SDK} )
+set( ARCH ${CONFIG_ARCH} )
+
+# specify the cross compiler. We force the compiler so that CMake doesn't
+# attempt to build a simple test program as this will fail without us using
+# the -nostartfiles option on the command line
+set(CMAKE_C_COMPILER ${CROSS_COMPILE}gcc)
+set(CMAKE_CXX_COMPILER ${CROSS_COMPILE}g++)
+
+# To build the tests, we need to set where the target environment containing
+# the required library is. On Debian-like systems, this is
+# /usr/aarch64-linux-gnu.
+# SET(CMAKE_FIND_ROOT_PATH $ENV{TOOLCHAIN_TOPDIR})
+# search for programs in the build host directories
+SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+# for libraries and headers in the target directories
+SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
+# We must set the OBJCOPY setting into cache so that it's available to the
+# whole project. Otherwise, this does not get set into the CACHE and therefore
+# the build doesn't know what the OBJCOPY filepath is
+set( CMAKE_OBJCOPY ${CROSS_COMPILE}objcopy
+    CACHE FILEPATH "The toolchain objcopy command " FORCE )
+
+# Set the CMAKE C flags (which should also be used by the assembler!)
+set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Os -std=gnu11" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffunction-sections" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fdata-sections" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-pointer-to-int-cast" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-missing-field-initializers" ) + +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "" ) +set( CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "" ) + +set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-field-initializers" ) +set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses" ) +if ("${CONFIG_CROSS_COMPILE_SDK}" STREQUAL "arm-linux-gnueabihf-") + set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv7-a -mfpu=neon" ) + set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfloat-abi=hard -marm" ) + set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv7-a -mfpu=neon" ) + set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=hard -marm" ) + set( CMAKE_SYSROOT $ENV{TOP_DIR}/ramdisk/sysroot/sysroot-glibc-linaro-2.23-2017.05-arm-linux-gnueabihf) +elseif("${CONFIG_CROSS_COMPILE_SDK}" STREQUAL "aarch64-linux-gnu-") + set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=cortex-a53" ) + set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=cortex-a53" ) + set( CMAKE_SYSROOT $ENV{TOP_DIR}/ramdisk/sysroot/sysroot-glibc-linaro-2.23-2017.05-aarch64-linux-gnu) +elseif("${CONFIG_CROSS_COMPILE_SDK}" STREQUAL "riscv64-unknown-linux-gnu-") + set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=c906fdv" ) + set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=rv64imafdcv0p7xthead -mcmodel=medany -mabi=lp64d" ) + set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=c906fdv" ) + set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64imafdcv0p7xthead -mcmodel=medany -mabi=lp64d" ) + set( CMAKE_SYSROOT $ENV{TOP_DIR}/host-tools/gcc/riscv64-linux-x86_64/sysroot) +elseif("${CONFIG_CROSS_COMPILE_SDK}" STREQUAL "riscv64-unknown-linux-musl-") + if (DEFINED ENV{RISCV_LEGACY}) + set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=c906" ) + set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=rv64gcv_zfh_xthead" ) + set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=c906" ) + set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64gcv_zfh_xthead" ) + else() + set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=c906fdv" ) + set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=rv64imafdcv0p7xthead -mcmodel=medany -mabi=lp64d" ) + set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=c906fdv" ) + set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=rv64imafdcv0p7xthead -mcmodel=medany -mabi=lp64d" ) + endif() + set( CMAKE_SYSROOT $ENV{TOP_DIR}/host-tools/gcc/riscv64-linux-musl-x86_64/sysroot) +else() + set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv7-a -mfpu=neon" ) + set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfloat-abi=hard -marm" ) + set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv7-a -mfpu=neon" ) + set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=hard -marm" ) + set( CMAKE_SYSROOT $ENV{TOP_DIR}/ramdisk/sysroot/sysroot-uclibc) +endif() diff --git a/cviruntime/src/CMakeLists.txt b/cviruntime/src/CMakeLists.txt new file mode 100644 index 000000000..e7eeb7be8 --- /dev/null +++ b/cviruntime/src/CMakeLists.txt @@ -0,0 +1,73 @@ +cmake_minimum_required(VERSION 2.8.0) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/common) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/common/kernel_function) + +set(RUNTIME_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/common/model.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/common/neuron.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/common/program.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/common/stream.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/common/section.cpp + 
${CMAKE_CURRENT_SOURCE_DIR}/common/runtime.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/common/debug.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/common/taskpool.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/common/shared_mem.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/common/alloc.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/common/kernel_function/kernelFunc.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/common/kernel_function/euclideanDist.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/common/kernel_function/matrixMul.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/common/kernel_function/grayImageLight.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/common/kernel_function/tdmaCopy.cpp)
+
+if (${ENABLE_CPU_FUNC})
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/common/cpu_function/deformableconv.cpp)
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/common/cpu_function/deform_im2col.cpp)
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/common/cpu_function/instancenorm.cpp)
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/common/cpu_function/interpolation.cpp)
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/common/cpu_function/quant.cpp)
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/common/cpu_function/proposal.cpp)
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/common/cpu_function/roi_pooling.cpp)
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/common/cpu_function/frcn_detection.cpp)
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/common/cpu_function/reducel2.cpp)
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/common/cpu_function/reducemean.cpp)
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/common/cpu_function/reducemax.cpp)
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/common/cpu_function/retinaface_detection.cpp)
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/common/cpu_function/softmax.cpp)
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/common/cpu_function/preprocess.cpp)
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/common/cpu_function/transpose.cpp)
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/common/cpu_function/ssd_detection.cpp)
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/common/cpu_function/argmax.cpp)
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/common/cpu_function/argmax_v2.cpp)
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/common/cpu_function/argmax_v3.cpp)
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/common/cpu_function/pixelshuffle.cpp)
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/common/cpu_function/yolo_detection.cpp)
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/common/cpu_function/embedding.cpp)
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/common/cpu_function/gathernd.cpp)
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/common/cpu_function/grid_sampler.cpp)
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/common/cpu_function/cumsum.cpp)
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/common/cpu_function/gatherelements_pt.cpp)
+endif()
+
+if (${ENABLE_COMPRESS_CMDBUF})
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/lz4/lz4_all.c)
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/lz4/xxhash.c)
+endif()
+
+if (RUNTIME STREQUAL "SOC")
+  if (CHIP STREQUAL cv183x)
+    add_subdirectory(soc/183x)
+  elseif (CHIP STREQUAL cv182x)
+    add_subdirectory(soc/182x)
+  elseif (CHIP STREQUAL cv181x)
+    add_subdirectory(soc/181x)
+  elseif (CHIP STREQUAL cv180x)
+    add_subdirectory(soc/180x)
+  endif()
+elseif (RUNTIME STREQUAL "CMODEL")
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/common/mmpool.cpp)
+  add_subdirectory(cmodel)
+else()
+  message(FATAL_ERROR "Unknown RUNTIME: " ${RUNTIME})
+endif()
+
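The optional source blocks and the RUNTIME dispatch above are steered entirely from the cmake command line. A hedged configure fragment for a cmodel build with the LZ4 command-buffer compression compiled out; the CMODEL_PATH location is illustrative, and the remaining mandatory path variables from the top-level CMakeLists are elided:

    # CMODEL build; CMODEL_PATH must point at a cvicmodel installation (assumed location).
    cmake -DCHIP=cv181x -DRUNTIME=CMODEL \
          -DCMODEL_PATH=$HOME/cvitek/install_cmodel \
          -DENABLE_COMPRESS_CMDBUF=OFF \
          ..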
diff --git a/cviruntime/src/cmodel/CMakeLists.txt b/cviruntime/src/cmodel/CMakeLists.txt
new file mode 100644
index 000000000..e9514ddc3
--- /dev/null
+++ b/cviruntime/src/cmodel/CMakeLists.txt
@@ -0,0 +1,20 @@
+cmake_minimum_required(VERSION 2.8.0)
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCMODEL_IMPL")
+
+file(GLOB_RECURSE _SOURCES "*.cpp")
+
+set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${_SOURCES})
+
+include_directories(${CMODEL_PATH}/include)
+link_directories(${CMODEL_PATH}/lib)
+set(CVI_LIBS ${CVI_LIBS} cvicmodel)
+set(EXTRA_LIBS ${EXTRA_LIBS} rt dl pthread)
+
+add_library(cviruntime SHARED ${RUNTIME_SOURCES})
+add_library(cviruntime-static STATIC ${RUNTIME_SOURCES})
+
+target_link_libraries(cviruntime ${CVI_LIBS} ${EXTRA_LIBS})
+
+install(TARGETS cviruntime DESTINATION lib)
+install(TARGETS cviruntime-static DESTINATION lib)
diff --git a/cviruntime/src/cmodel/cmodel_cmdbuf.cpp b/cviruntime/src/cmodel/cmodel_cmdbuf.cpp
new file mode 100644
index 000000000..beb914d63
--- /dev/null
+++ b/cviruntime/src/cmodel/cmodel_cmdbuf.cpp
@@ -0,0 +1,331 @@
+#include "cmodel_cmdbuf.h"
+#include
+
+CModelCmdbuf::~CModelCmdbuf() {}
+
+void CModelCmdbuf::reorder_tiu_cmdbuf_reg(uint8_t *cmdbuf) {
+  int total_bits = TIU_DESC_REG_BYTES * 8;
+
+  for (int i = 0; i < total_bits; i += 128)
+    cmdbuf[(i + 128 - 8) / 8] |= (i / 128) << 4;
+
+  uint8_t tmp[128 / 8];
+  uint8_t *last = &cmdbuf[(total_bits - 128) / 8];
+  memcpy(tmp, last, sizeof(tmp));
+  memcpy(last, cmdbuf, sizeof(tmp));
+  memcpy(cmdbuf, tmp, sizeof(tmp));
+}
+
+void CModelCmdbuf::reorder_tiu_cmdbuf(uint8_t *cmdbuf, size_t sz) {
+  cmd_hdr_t *hdr = NULL;
+  for (uint32_t i = 0; i < sz; i += sizeof(*hdr) + hdr->len) {
+    hdr = (typeof(hdr))(&cmdbuf[i]);
+    if (hdr->engine_id == CVI_TPU_TIU)
+      reorder_tiu_cmdbuf_reg(hdr->cmd);
+  }
+}
+
+void CModelCmdbuf::enable_tdma_cmdbuf_barrier(uint8_t *cmdbuf, size_t sz) {
+  cmd_hdr_t *hdr = NULL;
+  for (uint32_t i = 0; i < sz; i += sizeof(*hdr) + hdr->len) {
+    hdr = (typeof(hdr))(&cmdbuf[i]);
+    if (hdr->engine_id == CVI_TPU_TDMA) {
+      uint32_t *buf = (uint32_t *)hdr->cmd;
+      buf[0] |= (1 << 4);
+    }
+  }
+}
+
+void CModelCmdbuf::adjust_cmdbuf(uint8_t *cmdbuf, size_t sz) {
+
+  set_eod(cmdbuf, sz);
+  enable_interrupt(cmdbuf, sz);
+  enable_tdma_cmdbuf_barrier(cmdbuf, sz);
+
+  /*
+   * Must come after all tiu cmdbuf adjustments
+   */
+  reorder_tiu_cmdbuf(cmdbuf, sz);
+}
+
+int CModelCmdbuf::extract_cmdbuf(int engine_id, uint8_t *cmdbuf, uint8_t *found_cmdbuf,
+                                 size_t sz) {
+  int found_sz = 0;
+  cmd_hdr_t *hdr = NULL;
+  for (uint32_t i = 0; i < sz; i += sizeof(*hdr) + hdr->len) {
+    hdr = (typeof(hdr))(&cmdbuf[i]);
+
+    if (hdr->magic != cmdbuf_hdr_magic) {
+      TPU_LOG_WARNING("env cv182x/cv183x might be set incorrectly, cmdbuf 0x%x, 0x%x\n", hdr->magic, cmdbuf_hdr_magic);
+      return -1;
+    }
+
+    if (hdr->engine_id == engine_id) {
+      memcpy(&found_cmdbuf[found_sz], hdr->cmd, hdr->len);
+      found_sz += hdr->len;
+    }
+  }
+  return found_sz;
+}
+
+int CModelCmdbuf::extract_dmabuf(int engine_id, uint8_t *dmabuf, uint8_t *found_dmabuf,
+                                 size_t sz) {
+  (void)(sz);
+  int found_sz = 0;
+  dma_hdr_t *dma_hdr = reinterpret_cast<dma_hdr_t *>(dmabuf);
+  cvi_cpu_desc_t *dma_desc = reinterpret_cast<cvi_cpu_desc_t *>(dmabuf + sizeof(dma_hdr_t));
+
+  if (dma_hdr->dmabuf_magic_s != dmabuf_hdr_magic) {
+    TPU_LOG_WARNING("env cv182x/cv183x might be set incorrectly, dmabuf 0x%x, 0x%x\n", dma_hdr->dmabuf_magic_s, dmabuf_hdr_magic);
+    return -1;
+  }
+
+  for (uint32_t i = 0; i < dma_hdr->cpu_desc_count; ++i) {
+    uint32_t engine_num = 0;
+    uint64_t engine_offset = 0;
+    uint32_t engine_step = 0;
+
+    switch (engine_id) {
+    case CVI_TPU_TIU: { // TIU
+      engine_num = dma_desc->num_tiu & 0xFFFF;
+      engine_offset = dma_desc->offset_tiu;
+      engine_step = BD_REG_BYTES;
+      break;
+    }
+    case CVI_TPU_TDMA: {
+      engine_num = dma_desc->num_tdma & 0xFFFF;
+      engine_offset = dma_desc->offset_tdma;
+      engine_offset = ALIGN(engine_offset, GDMA_DESC_ALIGN_SIZE);
+      engine_step = GDMA_DESC_ALIGN_SIZE;
+      break;
+    }
+    default:
+      TPU_LOG_ERROR("engine id error!\n");
+      assert(0);
+      break;
+    }
+    int cur_sz = engine_num * engine_step;
+    memcpy(found_dmabuf + found_sz, dmabuf + engine_offset, cur_sz);
+    found_sz += cur_sz;
+    ++dma_desc;
+  }
+  return found_sz;
+}
+
+bmerr_t CModelCmdbuf::rt_send_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz,
+                                     uint16_t *seq_no) {
+  uint8_t *cmdbuf2 = new uint8_t[sz];
+  memcpy(cmdbuf2, cmdbuf, sz);
+  adjust_cmdbuf(cmdbuf2, sz);
+
+  uint8_t *tiu_cmdbuf = new uint8_t[sz];
+  int tiu_sz = extract_cmdbuf(CVI_TPU_TIU, cmdbuf2, tiu_cmdbuf, sz);
+  assert(tiu_sz <= (int)g_tiu_cmdbuf_reserved_size);
+
+  uint8_t *tdma_cmdbuf = new uint8_t[sz];
+  int tdma_sz = extract_cmdbuf(CVI_TPU_TDMA, cmdbuf2, tdma_cmdbuf, sz);
+  assert(tdma_sz <= (int)g_tdma_cmdbuf_reserved_size);
+
+  BMDEV_LOCK(ctx->dev);
+  bmmod_t model = ctx->dev->model;
+  bm_cmodel_write_gmem(model, g_tiu_cmdbuf_gaddr, tiu_cmdbuf, tiu_sz);
+  bm_cmodel_write_gmem(model, g_tdma_cmdbuf_gaddr, tdma_cmdbuf, tdma_sz);
+  bm_cmodel_run_gmem_cmdbuf(model, g_tiu_cmdbuf_gaddr, tiu_sz,
+                            g_tdma_cmdbuf_gaddr, tdma_sz);
+
+  *seq_no = ctx->seq_no++;
+  bm_wait_cmdbuf_done(ctx, *seq_no);
+
+  delete[] tiu_cmdbuf;
+  delete[] tdma_cmdbuf;
+  delete[] cmdbuf2;
+  return BM_SUCCESS;
+}
+
+bmerr_t CModelCmdbuf::rt_wait_cmdbuf_done(bmctx_t ctx, uint16_t seq_no) {
+  (void)seq_no;
+  BMDEV_UNLOCK(ctx->dev);
+  return BM_SUCCESS;
+}
+
+bmerr_t CModelCmdbuf::rt_load_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz,
+                                     unsigned long long neuron_gaddr, unsigned long long weight_gaddr,
+                                     bool enable_pmu, bmmem_device_t *cmdbuf_mem) {
+  TPU_LOG_DEBUG("Cmodel: bm_load_cmdbuf\n");
+  assert(enable_pmu == false);
+  uint8_t *cmdbuf2 = new uint8_t[sz];
+  memcpy(cmdbuf2, cmdbuf, sz);
+  adjust_cmdbuf(cmdbuf2, sz);
+
+  int cmdbuf3_sz = sizeof(CMDBUF_HEADER_T) + sz;
+  uint8_t *cmdbuf3 = new uint8_t[cmdbuf3_sz];
+  uint8_t *tiu_cmdbuf = cmdbuf3 + sizeof(CMDBUF_HEADER_T);
+  int tiu_sz = extract_cmdbuf(CVI_TPU_TIU, cmdbuf2, tiu_cmdbuf, sz);
+  assert(tiu_sz <= (int)sz && tiu_sz <= (int)g_tiu_cmdbuf_reserved_size);
+
+  uint8_t *tdma_cmdbuf = tiu_cmdbuf + tiu_sz;
+  int tdma_sz = extract_cmdbuf(CVI_TPU_TDMA, cmdbuf2, tdma_cmdbuf, sz);
+  assert(tdma_sz + tiu_sz <= (int)sz && tdma_sz <= (int)g_tdma_cmdbuf_reserved_size);
+
+  CMDBUF_HEADER_T *hdr = (typeof(hdr))cmdbuf3;
+  hdr->neuron_gaddr = neuron_gaddr;
+  hdr->weight_gaddr = weight_gaddr;
+  hdr->tiu_cmdbuf_sz = tiu_sz;
+  hdr->tdma_cmdbuf_sz = tdma_sz;
+
+  *cmdbuf_mem = bmmem_device_alloc_raw(ctx, cmdbuf3_sz);
+  bmerr_t ret = bm_memcpy_s2d(ctx, *cmdbuf_mem, cmdbuf3);
+  TPU_ASSERT(ret == BM_SUCCESS, 
nullptr); + delete[] cmdbuf2; + delete[] cmdbuf3; + return BM_SUCCESS; +} + +bmerr_t CModelCmdbuf::rt_load_dmabuf(bmctx_t ctx, bmmem_device_t dmabuf, size_t sz, + unsigned long long neuron_gaddr, unsigned long long weight_gaddr, + bool enable_pmu, bmmem_device_t *dmabuf_mem) { + TPU_LOG_DEBUG("Cmodel: bm_load_dmabuf\n"); + assert(enable_pmu == false); + + int dmabuf2_sz = sizeof(CMDBUF_HEADER_T) + sz; + uint8_t *dmabuf2 = new uint8_t[dmabuf2_sz]; + uint8_t *tiu_dmabuf = dmabuf2 + sizeof(CMDBUF_HEADER_T); + int tiu_sz = extract_dmabuf(CVI_TPU_TIU, dmabuf->v_addr, tiu_dmabuf, sz); + assert(tiu_sz <= (int)sz && tiu_sz <= (int)g_tiu_cmdbuf_reserved_size); + + uint8_t *tdma_dmabuf = tiu_dmabuf + tiu_sz; + int tdma_sz = extract_dmabuf(CVI_TPU_TDMA, dmabuf->v_addr, tdma_dmabuf, sz); + assert(tdma_sz + tiu_sz <= (int)sz && tdma_sz <= (int)g_tdma_cmdbuf_reserved_size); + + CMDBUF_HEADER_T *hdr = (typeof(hdr))dmabuf2; + hdr->neuron_gaddr = neuron_gaddr; + hdr->weight_gaddr = weight_gaddr; + hdr->tiu_cmdbuf_sz = tiu_sz; + hdr->tdma_cmdbuf_sz = tdma_sz; + + *dmabuf_mem = bmmem_device_alloc_raw(ctx, dmabuf2_sz); + bmerr_t ret = bm_memcpy_s2d(ctx, *dmabuf_mem, dmabuf2); + TPU_ASSERT(ret == BM_SUCCESS, nullptr); + delete[] dmabuf2; + return BM_SUCCESS; +} + +bmerr_t CModelCmdbuf::rt_run_cmdbuf(bmctx_t ctx, bmmem_device_t cmdbuf_mem, + uint16_t *seq_no) { + TPU_LOG_DEBUG("Cmodel: bm_run_cmdbuf\n"); + bm_memory_t *mem = (bm_memory_t *)cmdbuf_mem; + size_t sz = mem->size; + uint8_t *cmdbuf = new uint8_t[sz]; + bm_memcpy_d2s(ctx, cmdbuf, cmdbuf_mem); + CMDBUF_HEADER_T *hdr = (typeof(hdr))cmdbuf; + uint8_t *tiu_cmdbuf = cmdbuf + sizeof(CMDBUF_HEADER_T); + uint8_t *tdma_cmdbuf = tiu_cmdbuf + hdr->tiu_cmdbuf_sz; + + BMDEV_LOCK(ctx->dev); + bmmod_t model = ctx->dev->model; + bm_device_set_base_reg(ctx, 0, hdr->neuron_gaddr); + bm_device_set_base_reg(ctx, 1, hdr->weight_gaddr); + bm_cmodel_write_gmem(model, g_tiu_cmdbuf_gaddr, tiu_cmdbuf, hdr->tiu_cmdbuf_sz); + bm_cmodel_write_gmem(model, g_tdma_cmdbuf_gaddr, tdma_cmdbuf, hdr->tdma_cmdbuf_sz); + bm_cmodel_run_gmem_cmdbuf(model, g_tiu_cmdbuf_gaddr, hdr->tiu_cmdbuf_sz, + g_tdma_cmdbuf_gaddr, hdr->tdma_cmdbuf_sz); + + *seq_no = ctx->seq_no++; + + delete[] cmdbuf; + return BM_SUCCESS; +} + +bmerr_t CModelCmdbuf::rt_run_cmdbuf_ex(bmctx_t ctx, bmmem_device_t cmdbuf_mem, + uint16_t *seq_no, uint64_t input_base_addr, uint64_t output_base_addr) { + TPU_LOG_DEBUG("Cmodel: bm_run_cmdbuf\n"); + bm_memory_t *mem = (bm_memory_t *)cmdbuf_mem; + size_t sz = mem->size; + uint8_t *cmdbuf = new uint8_t[sz]; + bm_memcpy_d2s(ctx, cmdbuf, cmdbuf_mem); + CMDBUF_HEADER_T *hdr = (typeof(hdr))cmdbuf; + uint8_t *tiu_cmdbuf = cmdbuf + sizeof(CMDBUF_HEADER_T); + uint8_t *tdma_cmdbuf = tiu_cmdbuf + hdr->tiu_cmdbuf_sz; + + BMDEV_LOCK(ctx->dev); + bmmod_t model = ctx->dev->model; + bm_device_set_base_reg(ctx, 0, hdr->neuron_gaddr); + bm_device_set_base_reg(ctx, 1, hdr->weight_gaddr); + bm_device_set_base_reg(ctx, 2, input_base_addr); + bm_device_set_base_reg(ctx, 3, output_base_addr); + bm_cmodel_write_gmem(model, g_tiu_cmdbuf_gaddr, tiu_cmdbuf, hdr->tiu_cmdbuf_sz); + bm_cmodel_write_gmem(model, g_tdma_cmdbuf_gaddr, tdma_cmdbuf, hdr->tdma_cmdbuf_sz); + bm_cmodel_run_gmem_cmdbuf(model, g_tiu_cmdbuf_gaddr, hdr->tiu_cmdbuf_sz, + g_tdma_cmdbuf_gaddr, hdr->tdma_cmdbuf_sz); + + *seq_no = ctx->seq_no++; + + delete[] cmdbuf; + return BM_SUCCESS; +} + +bmerr_t CModelCmdbuf::rt_run_cmdbuf_ex2(bmctx_t ctx, bmmem_device_t cmdbuf_mem, + uint16_t *seq_no, cvi_array_base *p_array_base) { + 
TPU_LOG_DEBUG("Cmodel: bm_run_cmdbuf\n"); + bm_memory_t *mem = (bm_memory_t *)cmdbuf_mem; + size_t sz = mem->size; + uint8_t *cmdbuf = new uint8_t[sz]; + bm_memcpy_d2s(ctx, cmdbuf, cmdbuf_mem); + CMDBUF_HEADER_T *hdr = (typeof(hdr))cmdbuf; + uint8_t *tiu_cmdbuf = cmdbuf + sizeof(CMDBUF_HEADER_T); + uint8_t *tdma_cmdbuf = tiu_cmdbuf + hdr->tiu_cmdbuf_sz; + + BMDEV_LOCK(ctx->dev); + bmmod_t model = ctx->dev->model; + bm_device_set_base_reg(ctx, 0, p_array_base->gaddr_base0); + bm_device_set_base_reg(ctx, 1, p_array_base->gaddr_base1); + bm_device_set_base_reg(ctx, 2, p_array_base->gaddr_base2); + bm_device_set_base_reg(ctx, 3, p_array_base->gaddr_base3); + bm_device_set_base_reg(ctx, 4, p_array_base->gaddr_base4); + bm_device_set_base_reg(ctx, 5, p_array_base->gaddr_base5); + bm_device_set_base_reg(ctx, 6, p_array_base->gaddr_base6); + bm_device_set_base_reg(ctx, 7, p_array_base->gaddr_base7); + bm_cmodel_write_gmem(model, g_tiu_cmdbuf_gaddr, tiu_cmdbuf, hdr->tiu_cmdbuf_sz); + bm_cmodel_write_gmem(model, g_tdma_cmdbuf_gaddr, tdma_cmdbuf, hdr->tdma_cmdbuf_sz); + bm_cmodel_run_gmem_cmdbuf(model, g_tiu_cmdbuf_gaddr, hdr->tiu_cmdbuf_sz, + g_tdma_cmdbuf_gaddr, hdr->tdma_cmdbuf_sz); + + *seq_no = ctx->seq_no++; + + delete[] cmdbuf; + return BM_SUCCESS; +} + +bmmem_device_t CModelCmdbuf::rt_device_alloc_raw(bmctx_t ctx, size_t size) { + size_t axi_alignment = 16; + size_t pool_size = ALIGN(size, axi_alignment); + uint64_t addr = mem_pool_alloc(ctx->dev->device_mem_pool, pool_size); + addr += g_gmem_reserved_size; + + // TPU_LOG_DEBUG("mmpool alloc, size=%lu, addr=%lx\n", size, addr); + + bm_memory_t *device_mem = new bm_memory_t(); + device_mem->flags.u.is_prealloc = 0; + device_mem->flags.u.type = BMMEM_TYPE_DEVICE; + device_mem->p_addr = addr; + + //device_mem->v_addr = NULL; + //device_mem->v_addr = (void*)(device_mem->p_addr); + device_mem->v_addr = (uint8_t*)((device_mem->p_addr) + ((unsigned long long)(bm_cmodel_get_chipGmem(ctx->dev->model)))); + device_mem->size = size; + return (bmmem_device_t)device_mem; +} + +void CModelCmdbuf::rt_device_free(bmctx_t ctx, bmmem_device_t mem) { + bm_memory_t *device_mem = (bm_memory_t *)mem; + TPU_ASSERT(device_mem->flags.u.type == BMMEM_TYPE_DEVICE, nullptr); + if (!device_mem->flags.u.is_prealloc) { + unsigned long long addr = device_mem->p_addr; + // size_t size = device_mem->size; + // TPU_LOG_DEBUG("mmpool free, size=%lu, addr=%lx\n", size, addr); + + addr -= g_gmem_reserved_size; + mem_pool_free(ctx->dev->device_mem_pool, addr); + } + delete device_mem; +} diff --git a/cviruntime/src/cmodel/cmodel_cmdbuf.h b/cviruntime/src/cmodel/cmodel_cmdbuf.h new file mode 100644 index 000000000..73542b1ae --- /dev/null +++ b/cviruntime/src/cmodel/cmodel_cmdbuf.h @@ -0,0 +1,111 @@ +#pragma once + +#include "string.h" +#include +#include +#include + +#include +#include +#include +#include +#include "runtime_cmodel_internal.h" + +#define GDMA_DESC_ALIGN_SIZE (1 << TDMA_DESCRIPTOR_ALIGNED_BIT) + +typedef struct { + unsigned long long neuron_gaddr; + unsigned long long weight_gaddr; + uint32_t tiu_cmdbuf_sz; + uint32_t tdma_cmdbuf_sz; +} CMDBUF_HEADER_T; + +typedef struct __dma_hdr_t { + uint16_t dmabuf_magic_m; + uint16_t dmabuf_magic_s; + uint32_t dmabuf_size; + uint32_t cpu_desc_count; + uint32_t bd_desc_count; //16bytes + uint32_t tdma_desc_count; + uint32_t tpu_clk_rate; + uint32_t pmubuf_size; + uint32_t pmubuf_offset; //32bytes + uint32_t arraybase_0_L; + uint32_t arraybase_0_H; + uint32_t arraybase_1_L; + uint32_t arraybase_1_H; //48bytes + uint32_t 
arraybase_2_L; + uint32_t arraybase_2_H; + uint32_t arraybase_3_L; + uint32_t arraybase_3_H; //64bytes + + uint32_t arraybase_4_L; + uint32_t arraybase_4_H; + uint32_t arraybase_5_L; + uint32_t arraybase_5_H; + uint32_t arraybase_6_L; + uint32_t arraybase_6_H; + uint32_t arraybase_7_L; + uint32_t arraybase_7_H; + uint32_t reserve[8]; //128bytes, 128bytes align +} dma_hdr_t; + +// CPU_OP_SYNC structure +typedef struct { + uint32_t op_type; + uint32_t num_tiu; + uint32_t num_tdma; + uint32_t offset_tiu; + uint32_t offset_tdma; + uint32_t offset_tiu_ori_bk; + uint32_t offset_tdma_ori_bk; + char str[CPU_ENGINE_STR_LIMIT_BYTE]; +} __attribute__((packed)) cvi_cpu_desc_t; + +class CModelCmdbuf { +public: + virtual ~CModelCmdbuf() = 0; + virtual bmerr_t rt_device_open(int index, bmdev_t *dev) = 0; + virtual bmerr_t rt_send_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + uint16_t *seq_no); + virtual bmerr_t rt_wait_cmdbuf_done(bmctx_t ctx, uint16_t seq_no); + virtual bmerr_t rt_load_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + unsigned long long neuron_gaddr, unsigned long long weight_gaddr, + bool enable_pmu, bmmem_device_t *cmdbuf_mem); + virtual bmerr_t rt_load_dmabuf(bmctx_t ctx, bmmem_device_t dmabuf, size_t sz, + unsigned long long neuron_gaddr, unsigned long long weight_gaddr, + bool enable_pmu, bmmem_device_t *dmabuf_mem); + virtual bmerr_t rt_run_cmdbuf(bmctx_t ctx, bmmem_device_t cmdbuf_mem, + uint16_t *seq_no); + virtual bmerr_t rt_run_cmdbuf_ex(bmctx_t ctx, bmmem_device_t cmdbuf_mem, + uint16_t *seq_no, + uint64_t input_base_addr, + uint64_t output_base_addr); + virtual bmerr_t rt_run_cmdbuf_ex2(bmctx_t ctx, bmmem_device_t cmdbuf_mem, + uint16_t *seq_no, + cvi_array_base *p_array_base); + virtual void enable_interrupt(uint8_t *cmdbuf, uint64_t sz) = 0; + virtual void set_eod(uint8_t *cmdbuf, uint64_t sz) = 0; + virtual bmmem_device_t rt_device_alloc_raw(bmctx_t ctx, size_t size); + virtual void rt_device_free(bmctx_t ctx, bmmem_device_t mem); + +protected: + virtual void reorder_tiu_cmdbuf_reg(uint8_t *cmdbuf); + virtual void reorder_tiu_cmdbuf(uint8_t *cmdbuf, size_t sz); + virtual void enable_tdma_cmdbuf_barrier(uint8_t *cmdbuf, size_t sz); + virtual void adjust_cmdbuf(uint8_t *cmdbuf, size_t sz); + virtual int extract_cmdbuf(int engine_id, uint8_t *cmdbuf, + uint8_t *found_cmdbuf, size_t sz); + virtual int extract_dmabuf(int engine_id, uint8_t *dmabuf, + uint8_t *found_dmabuf, size_t sz); + +public: + u64 g_tiu_cmdbuf_gaddr; + u64 g_tdma_cmdbuf_gaddr; + u64 g_tiu_cmdbuf_reserved_size; + u64 g_tdma_cmdbuf_reserved_size; + u64 g_gmem_reserved_size; + u64 g_gmem_size; + uint32_t cmdbuf_hdr_magic; + uint32_t dmabuf_hdr_magic; +}; \ No newline at end of file diff --git a/cviruntime/src/cmodel/cmodel_cmdbuf_180x.cpp b/cviruntime/src/cmodel/cmodel_cmdbuf_180x.cpp new file mode 100644 index 000000000..c9528cde5 --- /dev/null +++ b/cviruntime/src/cmodel/cmodel_cmdbuf_180x.cpp @@ -0,0 +1,124 @@ +#include "cmodel_cmdbuf_180x.h" +#include +#include +#include + +CModelCmdbuf180x::CModelCmdbuf180x() { + g_tiu_cmdbuf_gaddr = 0; + g_tdma_cmdbuf_gaddr = 0; + g_tiu_cmdbuf_reserved_size = 0; + g_tdma_cmdbuf_reserved_size = 0; + g_gmem_reserved_size = 0; + g_gmem_size = 0; + cmdbuf_hdr_magic = CMDBUF_HDR_MAGIC_180X; + dmabuf_hdr_magic = 0x182203; +} + +CModelCmdbuf180x::~CModelCmdbuf180x() { + +} + +bmerr_t CModelCmdbuf180x::rt_device_open(int index, bmdev_t *dev) { + cvk_reg_info_t req_info; + cvk_context_t *cvk_ctx; + uint8_t tmp_buf[32]; + bm_device_t *pdev = new bm_device_t; + + 
memset(&req_info, 0, sizeof(cvk_reg_info_t)); + strncpy(req_info.chip_ver_str, CVI_TPU_VERSION_180X, sizeof(req_info.chip_ver_str) - 1); + req_info.cmdbuf = tmp_buf; + req_info.cmdbuf_size = sizeof(tmp_buf); + cvk_ctx = cvikernel_register(&req_info); + if (!cvk_ctx) { + delete pdev; + return BM_ERR_FAILURE; + } + + BMDEV_LOCK_INIT(pdev); + pdev->index = index; + pdev->cvk_chip_info = cvk_ctx->info; + g_tiu_cmdbuf_gaddr = CV180X_GLOBAL_TIU_CMDBUF_ADDR; + g_tdma_cmdbuf_gaddr = CV180X_GLOBAL_TDMA_CMDBUF_ADDR; + g_tiu_cmdbuf_reserved_size = CV180X_GLOBAL_TIU_CMDBUF_RESERVED_SIZE; + g_tdma_cmdbuf_reserved_size = CV180X_GLOBAL_TDMA_CMDBUF_RESERVED_SIZE; + g_gmem_reserved_size = + g_tiu_cmdbuf_reserved_size + g_tdma_cmdbuf_reserved_size; + + g_gmem_size = pdev->cvk_chip_info.gmem_size; + pdev->gmem_size = g_gmem_size; + bm_cmodel_init(&pdev->model, &pdev->cvk_chip_info); + + assert(g_gmem_size > g_gmem_reserved_size); + unsigned long long pool_size = g_gmem_size - g_gmem_reserved_size; + mem_pool_create(&pdev->device_mem_pool, pool_size); + + cvk_ctx->ops->cleanup(cvk_ctx); + if (cvk_ctx->priv_data) + free(cvk_ctx->priv_data); + free(cvk_ctx); + + TPU_LOG_DEBUG("device[%d] opened, %" PRIu64 "\n", index, g_gmem_size); + + *dev = pdev; + + return BM_SUCCESS; +} + +void CModelCmdbuf180x::enable_interrupt(uint8_t *cmdbuf, size_t sz) { + cmd_hdr_t *hdr = NULL, *last_tiu = NULL, *last_tdma = NULL; + for (uint32_t i = 0; i < sz; i += sizeof(*hdr) + hdr->len) { + hdr = (typeof(hdr))(&cmdbuf[i]); + if (hdr->engine_id == CVI_TPU_TDMA) + last_tdma = hdr; + else if (hdr->engine_id == CVI_TPU_TIU) + last_tiu = hdr; + } + + if (last_tiu) { + tiu_reg_t reg; + parse_tiu_reg(®, (u32 *)last_tiu->cmd); + reg.cmd_intr_en = 1; + emit_tiu_reg(®, (u32 *)last_tiu->cmd); + } + + if (last_tdma) { + u32 *p = (u32 *)last_tdma->cmd; + p[0] |= (1 << 3); + } +} + +void CModelCmdbuf180x::set_eod(uint8_t *cmdbuf, uint64_t sz) { + cmd_hdr_t *hdr = NULL; + cmd_hdr_t *last_tiu = NULL; + cmd_hdr_t *last_tdma = NULL; + + for (uint32_t i = 0; i < sz; i += sizeof(*hdr) + hdr->len) { + hdr = (typeof(hdr))(&cmdbuf[i]); + if (hdr->engine_id == CVI_TPU_TDMA) + last_tdma = hdr; + else if (hdr->engine_id == CVI_TPU_TIU) + last_tiu = hdr; + else if (hdr->engine_id == CVI_TPU_CPU) + continue; + else { + TPU_LOG_ERROR("unknown engine_id:%d\n", (int)(hdr->engine_id)); + while (1) + ; + assert(0); + } + } + + if (last_tiu) { + tiu_reg_t reg; + parse_tiu_reg(®, (u32 *)last_tiu->cmd); + reg.cmd_end = 1; + emit_tiu_reg(®, (u32 *)last_tiu->cmd); + } + + if (last_tdma) { + tdma_reg_t reg; + parse_tdma_reg(®, (u32 *)last_tdma->cmd); + reg.eod = 1; + emit_tdma_reg(®, (u32 *)last_tdma->cmd); + } +} diff --git a/cviruntime/src/cmodel/cmodel_cmdbuf_180x.h b/cviruntime/src/cmodel/cmodel_cmdbuf_180x.h new file mode 100644 index 000000000..c9270e1a7 --- /dev/null +++ b/cviruntime/src/cmodel/cmodel_cmdbuf_180x.h @@ -0,0 +1,13 @@ +#pragma once +#include "cmodel_cmdbuf.h" + +class CModelCmdbuf180x : public CModelCmdbuf { +public: + CModelCmdbuf180x(); + ~CModelCmdbuf180x() override; + + virtual bmerr_t rt_device_open(int index, bmdev_t *dev) override; +protected: + virtual void enable_interrupt(uint8_t *cmdbuf, size_t sz) override; + virtual void set_eod(uint8_t *cmdbuf, uint64_t sz) override; +}; diff --git a/cviruntime/src/cmodel/cmodel_cmdbuf_181x.cpp b/cviruntime/src/cmodel/cmodel_cmdbuf_181x.cpp new file mode 100644 index 000000000..3fccd8961 --- /dev/null +++ b/cviruntime/src/cmodel/cmodel_cmdbuf_181x.cpp @@ -0,0 +1,124 @@ +#include "cmodel_cmdbuf_181x.h" 
+#include +#include +#include + +CModelCmdbuf181x::CModelCmdbuf181x() { + g_tiu_cmdbuf_gaddr = 0; + g_tdma_cmdbuf_gaddr = 0; + g_tiu_cmdbuf_reserved_size = 0; + g_tdma_cmdbuf_reserved_size = 0; + g_gmem_reserved_size = 0; + g_gmem_size = 0; + cmdbuf_hdr_magic = CMDBUF_HDR_MAGIC_181X; + dmabuf_hdr_magic = 0x182202; +} + +CModelCmdbuf181x::~CModelCmdbuf181x() { + +} + +bmerr_t CModelCmdbuf181x::rt_device_open(int index, bmdev_t *dev) { + cvk_reg_info_t req_info; + cvk_context_t *cvk_ctx; + uint8_t tmp_buf[32]; + bm_device_t *pdev = new bm_device_t; + + memset(&req_info, 0, sizeof(cvk_reg_info_t)); + strncpy(req_info.chip_ver_str, CVI_TPU_VERSION_181X, sizeof(req_info.chip_ver_str) - 1); + req_info.cmdbuf = tmp_buf; + req_info.cmdbuf_size = sizeof(tmp_buf); + cvk_ctx = cvikernel_register(&req_info); + if (!cvk_ctx) { + delete pdev; + return BM_ERR_FAILURE; + } + + BMDEV_LOCK_INIT(pdev); + pdev->index = index; + pdev->cvk_chip_info = cvk_ctx->info; + g_tiu_cmdbuf_gaddr = CV181X_GLOBAL_TIU_CMDBUF_ADDR; + g_tdma_cmdbuf_gaddr = CV181X_GLOBAL_TDMA_CMDBUF_ADDR; + g_tiu_cmdbuf_reserved_size = CV181X_GLOBAL_TIU_CMDBUF_RESERVED_SIZE; + g_tdma_cmdbuf_reserved_size = CV181X_GLOBAL_TDMA_CMDBUF_RESERVED_SIZE; + g_gmem_reserved_size = + g_tiu_cmdbuf_reserved_size + g_tdma_cmdbuf_reserved_size; + + g_gmem_size = pdev->cvk_chip_info.gmem_size; + pdev->gmem_size = g_gmem_size; + bm_cmodel_init(&pdev->model, &pdev->cvk_chip_info); + + assert(g_gmem_size > g_gmem_reserved_size); + unsigned long long pool_size = g_gmem_size - g_gmem_reserved_size; + mem_pool_create(&pdev->device_mem_pool, pool_size); + + cvk_ctx->ops->cleanup(cvk_ctx); + if (cvk_ctx->priv_data) + free(cvk_ctx->priv_data); + free(cvk_ctx); + + TPU_LOG_DEBUG("device[%d] opened, %" PRIu64 "\n", index, g_gmem_size); + + *dev = pdev; + + return BM_SUCCESS; +} + +void CModelCmdbuf181x::enable_interrupt(uint8_t *cmdbuf, size_t sz) { + cmd_hdr_t *hdr = NULL, *last_tiu = NULL, *last_tdma = NULL; + for (uint32_t i = 0; i < sz; i += sizeof(*hdr) + hdr->len) { + hdr = (typeof(hdr))(&cmdbuf[i]); + if (hdr->engine_id == CVI_TPU_TDMA) + last_tdma = hdr; + else if (hdr->engine_id == CVI_TPU_TIU) + last_tiu = hdr; + } + + if (last_tiu) { + tiu_reg_t reg; + parse_tiu_reg(®, (u32 *)last_tiu->cmd); + reg.cmd_intr_en = 1; + emit_tiu_reg(®, (u32 *)last_tiu->cmd); + } + + if (last_tdma) { + u32 *p = (u32 *)last_tdma->cmd; + p[0] |= (1 << 3); + } +} + +void CModelCmdbuf181x::set_eod(uint8_t *cmdbuf, uint64_t sz) { + cmd_hdr_t *hdr = NULL; + cmd_hdr_t *last_tiu = NULL; + cmd_hdr_t *last_tdma = NULL; + + for (uint32_t i = 0; i < sz; i += sizeof(*hdr) + hdr->len) { + hdr = (typeof(hdr))(&cmdbuf[i]); + if (hdr->engine_id == CVI_TPU_TDMA) + last_tdma = hdr; + else if (hdr->engine_id == CVI_TPU_TIU) + last_tiu = hdr; + else if (hdr->engine_id == CVI_TPU_CPU) + continue; + else { + TPU_LOG_ERROR("unknown engine_id:%d\n", (int)(hdr->engine_id)); + while (1) + ; + assert(0); + } + } + + if (last_tiu) { + tiu_reg_t reg; + parse_tiu_reg(®, (u32 *)last_tiu->cmd); + reg.cmd_end = 1; + emit_tiu_reg(®, (u32 *)last_tiu->cmd); + } + + if (last_tdma) { + tdma_reg_t reg; + parse_tdma_reg(®, (u32 *)last_tdma->cmd); + reg.eod = 1; + emit_tdma_reg(®, (u32 *)last_tdma->cmd); + } +} diff --git a/cviruntime/src/cmodel/cmodel_cmdbuf_181x.h b/cviruntime/src/cmodel/cmodel_cmdbuf_181x.h new file mode 100644 index 000000000..0b88ac044 --- /dev/null +++ b/cviruntime/src/cmodel/cmodel_cmdbuf_181x.h @@ -0,0 +1,13 @@ +#pragma once +#include "cmodel_cmdbuf.h" + +class CModelCmdbuf181x : public 
CModelCmdbuf { +public: + CModelCmdbuf181x(); + ~CModelCmdbuf181x() override; + + virtual bmerr_t rt_device_open(int index, bmdev_t *dev) override; +protected: + virtual void enable_interrupt(uint8_t *cmdbuf, size_t sz) override; + virtual void set_eod(uint8_t *cmdbuf, uint64_t sz) override; +}; \ No newline at end of file diff --git a/cviruntime/src/cmodel/cmodel_cmdbuf_182x.cpp b/cviruntime/src/cmodel/cmodel_cmdbuf_182x.cpp new file mode 100644 index 000000000..1907147a7 --- /dev/null +++ b/cviruntime/src/cmodel/cmodel_cmdbuf_182x.cpp @@ -0,0 +1,107 @@ +#include "cmodel_cmdbuf_182x.h" +#include +#include +#include +#include + +CModelCmdbuf182x::CModelCmdbuf182x() { + g_tiu_cmdbuf_gaddr = 0; + g_tdma_cmdbuf_gaddr = 0; + g_tiu_cmdbuf_reserved_size = 0; + g_tdma_cmdbuf_reserved_size = 0; + g_gmem_reserved_size = 0; + g_gmem_size = 0; + cmdbuf_hdr_magic = CMDBUF_HDR_MAGIC_1822; + dmabuf_hdr_magic = 0x1822; +} + +CModelCmdbuf182x::~CModelCmdbuf182x() { + +} + +bmerr_t CModelCmdbuf182x::rt_device_open(int index, bmdev_t *dev) { + bm_device_t *pdev = new bm_device_t; + + BMDEV_LOCK_INIT(pdev); + pdev->index = index; + pdev->cvk_chip_info = bmk1822_chip_info(); + g_tiu_cmdbuf_gaddr = BM1822_GLOBAL_TIU_CMDBUF_ADDR; + g_tdma_cmdbuf_gaddr = BM1822_GLOBAL_TDMA_CMDBUF_ADDR; + g_tiu_cmdbuf_reserved_size = BM1822_GLOBAL_TIU_CMDBUF_RESERVED_SIZE; + g_tdma_cmdbuf_reserved_size = BM1822_GLOBAL_TDMA_CMDBUF_RESERVED_SIZE; + g_gmem_reserved_size = + g_tiu_cmdbuf_reserved_size + g_tdma_cmdbuf_reserved_size; + + g_gmem_size = pdev->cvk_chip_info.gmem_size; + pdev->gmem_size = g_gmem_size; + bm_cmodel_init(&pdev->model, &pdev->cvk_chip_info); + + assert(g_gmem_size > g_gmem_reserved_size); + unsigned long long pool_size = g_gmem_size - g_gmem_reserved_size; + mem_pool_create(&pdev->device_mem_pool, pool_size); + + TPU_LOG_DEBUG("device[%d] opened, %" PRIu64 "\n", index, g_gmem_size); + + *dev = pdev; + + return BM_SUCCESS; +} + +void CModelCmdbuf182x::enable_interrupt(uint8_t *cmdbuf, size_t sz) { + cmd_hdr_t *hdr = NULL, *last_tiu = NULL, *last_tdma = NULL; + for (uint32_t i = 0; i < sz; i += sizeof(*hdr) + hdr->len) { + hdr = (typeof(hdr))(&cmdbuf[i]); + if (hdr->engine_id == CVI_TPU_TDMA) + last_tdma = hdr; + else if (hdr->engine_id == CVI_TPU_TIU) + last_tiu = hdr; + } + + if (last_tiu) { + tiu_reg_t reg; + parse_tiu_reg(®, (u32 *)last_tiu->cmd); + reg.cmd_intr_en = 1; + emit_tiu_reg(®, (u32 *)last_tiu->cmd); + } + + if (last_tdma) { + u32 *p = (u32 *)last_tdma->cmd; + p[0] |= (1 << 3); + } +} + +void CModelCmdbuf182x::set_eod(uint8_t *cmdbuf, uint64_t sz) { + cmd_hdr_t *hdr = NULL; + cmd_hdr_t *last_tiu = NULL; + cmd_hdr_t *last_tdma = NULL; + + for (uint32_t i = 0; i < sz; i += sizeof(*hdr) + hdr->len) { + hdr = (typeof(hdr))(&cmdbuf[i]); + if (hdr->engine_id == CVI_TPU_TDMA) + last_tdma = hdr; + else if (hdr->engine_id == CVI_TPU_TIU) + last_tiu = hdr; + else if (hdr->engine_id == CVI_TPU_CPU) + continue; + else { + TPU_LOG_ERROR("unknown engine_id:%d\n", (int)(hdr->engine_id)); + while (1) + ; + assert(0); + } + } + + if (last_tiu) { + tiu_reg_t reg; + parse_tiu_reg(®, (u32 *)last_tiu->cmd); + reg.cmd_end = 1; + emit_tiu_reg(®, (u32 *)last_tiu->cmd); + } + + if (last_tdma) { + tdma_reg_t reg; + parse_tdma_reg(®, (u32 *)last_tdma->cmd); + reg.eod = 1; + emit_tdma_reg(®, (u32 *)last_tdma->cmd); + } +} diff --git a/cviruntime/src/cmodel/cmodel_cmdbuf_182x.h b/cviruntime/src/cmodel/cmodel_cmdbuf_182x.h new file mode 100644 index 000000000..c68c88387 --- /dev/null +++ 
b/cviruntime/src/cmodel/cmodel_cmdbuf_182x.h @@ -0,0 +1,13 @@ +#pragma once +#include "cmodel_cmdbuf.h" + +class CModelCmdbuf182x : public CModelCmdbuf { +public: + CModelCmdbuf182x(); + ~CModelCmdbuf182x() override; + + virtual bmerr_t rt_device_open(int index, bmdev_t *dev) override; +protected: + virtual void enable_interrupt(uint8_t *cmdbuf, size_t sz) override; + virtual void set_eod(uint8_t *cmdbuf, uint64_t sz) override; +}; \ No newline at end of file diff --git a/cviruntime/src/cmodel/cmodel_cmdbuf_183x.cpp b/cviruntime/src/cmodel/cmodel_cmdbuf_183x.cpp new file mode 100644 index 000000000..a80c666fb --- /dev/null +++ b/cviruntime/src/cmodel/cmodel_cmdbuf_183x.cpp @@ -0,0 +1,105 @@ +#include "cmodel_cmdbuf_183x.h" +#include +#include +#include +#include + +CModelCmdbuf183x::CModelCmdbuf183x() { + g_tiu_cmdbuf_gaddr = 0; + g_tdma_cmdbuf_gaddr = 0; + g_tiu_cmdbuf_reserved_size = 0; + g_tdma_cmdbuf_reserved_size = 0; + g_gmem_reserved_size = 0; + g_gmem_size = 0; + cmdbuf_hdr_magic = CMDBUF_HDR_MAGIC_1880v2; + dmabuf_hdr_magic = 0x1835; +} + +CModelCmdbuf183x::~CModelCmdbuf183x() {} + +bmerr_t CModelCmdbuf183x::rt_device_open(int index, bmdev_t *dev) { + bm_device_t *pdev = new bm_device_t; + + BMDEV_LOCK_INIT(pdev); + pdev->index = index; + pdev->cvk_chip_info = bmk1880v2_chip_info(); + g_tiu_cmdbuf_gaddr = BM1880V2_GLOBAL_TIU_CMDBUF_ADDR; + g_tdma_cmdbuf_gaddr = BM1880V2_GLOBAL_TDMA_CMDBUF_ADDR; + g_tiu_cmdbuf_reserved_size = BM1880V2_GLOBAL_TIU_CMDBUF_RESERVED_SIZE; + g_tdma_cmdbuf_reserved_size = BM1880V2_GLOBAL_TDMA_CMDBUF_RESERVED_SIZE; + g_gmem_reserved_size = + g_tiu_cmdbuf_reserved_size + g_tdma_cmdbuf_reserved_size; + + g_gmem_size = pdev->cvk_chip_info.gmem_size; + pdev->gmem_size = g_gmem_size; + bm_cmodel_init(&pdev->model, &pdev->cvk_chip_info); + + assert(g_gmem_size > g_gmem_reserved_size); + unsigned long long pool_size = g_gmem_size - g_gmem_reserved_size; + mem_pool_create(&pdev->device_mem_pool, pool_size); + + TPU_LOG_DEBUG("device[%d] opened, %" PRIu64 "\n", index, g_gmem_size); + + *dev = pdev; + + return BM_SUCCESS; +} + +void CModelCmdbuf183x::enable_interrupt(uint8_t *cmdbuf, size_t sz) { + cmd_hdr_t *hdr = NULL, *last_tiu = NULL, *last_tdma = NULL; + for (uint32_t i = 0; i < sz; i += sizeof(*hdr) + hdr->len) { + hdr = (typeof(hdr))(&cmdbuf[i]); + if (hdr->engine_id == CVI_TPU_TDMA) + last_tdma = hdr; + else if (hdr->engine_id == CVI_TPU_TIU) + last_tiu = hdr; + } + + if (last_tiu) { + tiu_reg_t reg; + parse_tiu_reg(®, (u32 *)last_tiu->cmd); + reg.cmd_intr_en = 1; + emit_tiu_reg(®, (u32 *)last_tiu->cmd); + } + + if (last_tdma) { + u32 *p = (u32 *)last_tdma->cmd; + p[0] |= (1 << 3); + } +} + +void CModelCmdbuf183x::set_eod(uint8_t *cmdbuf, uint64_t sz) { + cmd_hdr_t *hdr = NULL; + cmd_hdr_t *last_tiu = NULL; + cmd_hdr_t *last_tdma = NULL; + + for (uint32_t i = 0; i < sz; i += sizeof(*hdr) + hdr->len) { + hdr = (typeof(hdr))(&cmdbuf[i]); + if (hdr->engine_id == CVI_TPU_TDMA) + last_tdma = hdr; + else if (hdr->engine_id == CVI_TPU_TIU) + last_tiu = hdr; + else if (hdr->engine_id == CVI_TPU_CPU) + continue; + else { + TPU_LOG_ERROR("unknown engine_id:%d\n", (int)(hdr->engine_id)); + while (1) + ; + assert(0); + } + } + + if (last_tiu) { + tiu_reg_t reg; + parse_tiu_reg(®, (u32 *)last_tiu->cmd); + reg.cmd_end = 1; + emit_tiu_reg(®, (u32 *)last_tiu->cmd); + } + + if (last_tdma) { + tdma_reg_t reg; + parse_tdma_reg(®, (u32 *)last_tdma->cmd); + reg.eod = 1; + emit_tdma_reg(®, (u32 *)last_tdma->cmd); + } +} \ No newline at end of file diff --git 
a/cviruntime/src/cmodel/cmodel_cmdbuf_183x.h b/cviruntime/src/cmodel/cmodel_cmdbuf_183x.h new file mode 100644 index 000000000..89038b647 --- /dev/null +++ b/cviruntime/src/cmodel/cmodel_cmdbuf_183x.h @@ -0,0 +1,13 @@ +#pragma once +#include "cmodel_cmdbuf.h" + +class CModelCmdbuf183x : public CModelCmdbuf { +public: + CModelCmdbuf183x(); + ~CModelCmdbuf183x() override; + + virtual bmerr_t rt_device_open(int index, bmdev_t *dev) override; +protected: + virtual void enable_interrupt(uint8_t *cmdbuf, size_t sz) override; + virtual void set_eod(uint8_t *cmdbuf, uint64_t sz) override; +}; \ No newline at end of file diff --git a/cviruntime/src/cmodel/runtime_cmodel_internal.h b/cviruntime/src/cmodel/runtime_cmodel_internal.h new file mode 100644 index 000000000..ed23b22a4 --- /dev/null +++ b/cviruntime/src/cmodel/runtime_cmodel_internal.h @@ -0,0 +1,159 @@ +#ifndef _RUNTIME_CMODEL_INTERNAL_H_ +#define _RUNTIME_CMODEL_INTERNAL_H_ + +#include +#include +#include +#include +#include "mmpool.h" +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask)) +#define ALIGN(x,a) __ALIGN_MASK(x,(__typeof__(x))(a)-1) + +typedef struct bm_context { + bmdev_t dev; + uint16_t seq_no; + void *bk_ctx; + u8 *cmdbuf; + + struct bmk_context *cvik_context; + cvk_context_t *cvk_context; + + void *cvik_cmdbuf; + + unsigned long long neuron_paddr_nor; + unsigned long long weight_paddr_nor; + u32 weight_size_nor; + + unsigned long long dmabuf_addr_sec; + unsigned long long dmabuf_len_sec; +} bm_context_t; + +#define BMDEV_LOCK_INIT(dev) pthread_mutex_init(&dev->lock, NULL) +#define BMDEV_LOCK_DEINIT(dev) pthread_mutex_destroy(&dev->lock) +#define BMDEV_LOCK(dev) pthread_mutex_lock(&dev->lock) +#define BMDEV_UNLOCK(dev) pthread_mutex_unlock(&dev->lock) + +typedef struct bm_device { + int index; + bmmod_t model; + struct mem_pool *device_mem_pool; + cvk_chip_info_t cvk_chip_info; + unsigned long long gmem_size; + + pthread_mutex_t lock; +} bm_device_t; + +typedef enum { + BMMEM_TYPE_DEVICE = 0, + BMMEM_TYPE_DEVICE_NEURON = 1, // obsolete + BMMEM_TYPE_DEVICE_COEFF = 2, // obsolete + BMMEM_TYPE_HOST = 3, + BMMEM_TYPE_SYSTEM = 4, // obsolete + BMMEM_TYPE_INVALID = 5 +} bmmem_type_t; + +typedef union { + struct { + bmmem_type_t type : 3; + int is_prealloc : 1; + unsigned long long reserved : 60; + } u; + unsigned long long rawflags; +} bmmem_flags_t; + +typedef struct bm_memory { + uint8_t *v_addr; // for host, or mapped device in soc mode + unsigned long long p_addr; + size_t size; + int32_t user_ref_cnt; + bmmem_flags_t flags; +} bm_memory_t; + +void bm1880v2_enable_interrupt(uint8_t *cmdbuf, uint64_t sz); +void bm1880v2_set_eod(uint8_t *cmdbuf, uint64_t sz); +void bm1822_enable_interrupt(uint8_t *cmdbuf, uint64_t sz); +void bm1822_set_eod(uint8_t *cmdbuf, uint64_t sz); +void cv181x_enable_interrupt(uint8_t *cmdbuf, uint64_t sz); +void cv181x_set_eod(uint8_t *cmdbuf, uint64_t sz); +void cv180x_enable_interrupt(uint8_t *cmdbuf, uint64_t sz); +void cv180x_set_eod(uint8_t *cmdbuf, uint64_t sz); + +bmerr_t rt_1880v2_device_open(int index, bmdev_t *dev); +bmerr_t rt_1880v2_send_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, uint16_t *seq_no); +bmerr_t rt_1880v2_wait_cmdbuf_done(bmctx_t ctx, uint16_t seq_no); +bmerr_t rt_1880v2_load_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + unsigned long long neuron_gaddr, unsigned long long weight_gaddr, + bool enable_pmu, bmmem_device_t *cmdbuf_mem); +bmerr_t rt_1880v2_run_cmdbuf(bmctx_t ctx, bmmem_device_t 
cmdbuf_mem, + uint16_t *seq_no); +bmerr_t rt_1880v2_run_cmdbuf_ex(bmctx_t ctx, bmmem_device_t cmdbuf_mem, + uint16_t *seq_no, uint64_t input_base_addr, uint64_t output_base_addr); +bmerr_t rt_1880v2_run_cmdbuf_ex2(bmctx_t ctx, bmmem_device_t cmdbuf_mem, + uint16_t *seq_no, cvi_array_base *p_array_base); + +void rt_1880v2_device_free(bmctx_t ctx, bmmem_device_t mem); +bmmem_device_t rt_1880v2_device_alloc_raw(bmctx_t ctx, size_t size); + + +bmerr_t rt_1822_device_open(int index, bmdev_t *dev); +bmerr_t rt_1822_send_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, uint16_t *seq_no); +bmerr_t rt_1822_wait_cmdbuf_done(bmctx_t ctx, uint16_t seq_no); +bmerr_t rt_1822_load_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + unsigned long long neuron_gaddr, unsigned long long weight_gaddr, + bool enable_pmu, bmmem_device_t *cmdbuf_mem); +bmerr_t rt_1822_run_cmdbuf(bmctx_t ctx, bmmem_device_t cmdbuf_mem, + uint16_t *seq_no); +bmerr_t rt_1822_run_cmdbuf_ex(bmctx_t ctx, bmmem_device_t cmdbuf_mem, + uint16_t *seq_no, uint64_t input_base_addr, uint64_t output_base_addr); +bmerr_t rt_1822_run_cmdbuf_ex2(bmctx_t ctx, bmmem_device_t cmdbuf_mem, + uint16_t *seq_no, cvi_array_base *p_array_base); + +void rt_1822_device_free(bmctx_t ctx, bmmem_device_t mem); +bmmem_device_t rt_1822_device_alloc_raw(bmctx_t ctx, size_t size); + +bmerr_t rt_cv181x_device_open(int index, bmdev_t *dev); +bmerr_t rt_cv181x_send_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, uint16_t *seq_no); +bmerr_t rt_cv181x_wait_cmdbuf_done(bmctx_t ctx, uint16_t seq_no); +bmerr_t rt_cv181x_load_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + unsigned long long neuron_gaddr, unsigned long long weight_gaddr, + bool enable_pmu, bmmem_device_t *cmdbuf_mem); +bmerr_t rt_cv181x_run_cmdbuf(bmctx_t ctx, bmmem_device_t cmdbuf_mem, + uint16_t *seq_no); +bmerr_t rt_cv181x_run_cmdbuf_ex(bmctx_t ctx, bmmem_device_t cmdbuf_mem, + uint16_t *seq_no, uint64_t input_base_addr, uint64_t output_base_addr); +bmerr_t rt_cv181x_run_cmdbuf_ex2(bmctx_t ctx, bmmem_device_t cmdbuf_mem, + uint16_t *seq_no, cvi_array_base *p_array_base); + +void rt_cv181x_device_free(bmctx_t ctx, bmmem_device_t mem); +bmmem_device_t rt_cv181x_device_alloc_raw(bmctx_t ctx, size_t size); + +bmerr_t rt_cv180x_device_open(int index, bmdev_t *dev); +bmerr_t rt_cv180x_send_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, uint16_t *seq_no); +bmerr_t rt_cv180x_wait_cmdbuf_done(bmctx_t ctx, uint16_t seq_no); +bmerr_t rt_cv180x_load_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + unsigned long long neuron_gaddr, unsigned long long weight_gaddr, + bool enable_pmu, bmmem_device_t *cmdbuf_mem); +bmerr_t rt_cv180x_run_cmdbuf(bmctx_t ctx, bmmem_device_t cmdbuf_mem, + uint16_t *seq_no); +bmerr_t rt_cv180x_run_cmdbuf_ex(bmctx_t ctx, bmmem_device_t cmdbuf_mem, + uint16_t *seq_no, uint64_t input_base_addr, uint64_t output_base_addr); +bmerr_t rt_cv180x_run_cmdbuf_ex2(bmctx_t ctx, bmmem_device_t cmdbuf_mem, + uint16_t *seq_no, cvi_array_base *p_array_base); + +void rt_cv180x_device_free(bmctx_t ctx, bmmem_device_t mem); +bmmem_device_t rt_cv180x_device_alloc_raw(bmctx_t ctx, size_t size); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/cviruntime/src/cmodel/runtime_cmodel_main.cpp b/cviruntime/src/cmodel/runtime_cmodel_main.cpp new file mode 100644 index 000000000..f2315ed0f --- /dev/null +++ b/cviruntime/src/cmodel/runtime_cmodel_main.cpp @@ -0,0 +1,820 @@ +#include "string.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include 
"cviruntime_context.h" +#include "cvitpu_debug.h" +#include "runtime_cmodel_internal.h" +#include +#include "cmodel_cmdbuf_180x.h" +#include "cmodel_cmdbuf_181x.h" +#include "cmodel_cmdbuf_182x.h" +#include "cmodel_cmdbuf_183x.h" + +using std::hex; +using std::showbase; + +#define DEVICE_INDEX_NUM 0 +#define SUBMIT_MAGIC 0x12345678 +typedef struct _cvi_rt_submit { + cvk_context_t cvk_ctx; + bmctx_t rt_ctx; + uint8_t *cmdbuf; + uint32_t magic; +} cvi_rt_submit; + +static bmdev_t g_device = nullptr; +static int g_device_ref = 0; +static char *g_run_chip = nullptr; +static std::unique_ptr g_cmodel_cmdbuf_183x(new CModelCmdbuf183x()); +static std::unique_ptr g_cmodel_cmdbuf_182x(new CModelCmdbuf182x()); +static std::unique_ptr g_cmodel_cmdbuf_181x(new CModelCmdbuf181x()); +static std::unique_ptr g_cmodel_cmdbuf_180x(new CModelCmdbuf180x()); + +static inline CModelCmdbuf *getCmdbufPtr(const char *chip_ver) { + if (!strcmp(chip_ver, CVI_TPU_VERSION_183X)) { + return g_cmodel_cmdbuf_183x.get(); + } else if (!strcmp(chip_ver, CVI_TPU_VERSION_182X)) { + return g_cmodel_cmdbuf_182x.get(); + } else if (!strcmp(chip_ver, CVI_TPU_VERSION_181X)) { + return g_cmodel_cmdbuf_181x.get(); + } else if (!strcmp(chip_ver, CVI_TPU_VERSION_180X)) { + return g_cmodel_cmdbuf_180x.get(); + } else { + assert(0); + } +} + +static inline CModelCmdbuf *getCmdbufPtr(uint32_t chip_ver) { + if (chip_ver == BM1880V2_VER) { + return g_cmodel_cmdbuf_183x.get(); + } else if (chip_ver == BM1822_VER) { + return g_cmodel_cmdbuf_182x.get(); + } else if (chip_ver == CV181X_VER) { + return g_cmodel_cmdbuf_181x.get(); + } else if (chip_ver == CV180X_VER) { + return g_cmodel_cmdbuf_180x.get(); + } else { + assert(0); + } +} + +bmerr_t bm_device_open(int index, bmdev_t *dev) { + + if (!g_run_chip) { + g_run_chip = getenv("SET_CHIP_NAME"); + if (!g_run_chip) { + TPU_LOG_WARNING("Please export SET_CHIP_NAME=%s/%s/%s/%s\n", + CVI_TPU_VERSION_183X, CVI_TPU_VERSION_182X, CVI_TPU_VERSION_181X, CVI_TPU_VERSION_180X); + return BM_ERR_FAILURE; + } + TPU_LOG_INFO("Start TPU Simulator for %s\n", g_run_chip); + } + + if (g_device) { + g_device_ref++; + *dev = g_device; + return BM_SUCCESS; + } + + getCmdbufPtr(g_run_chip)->rt_device_open(index, &g_device); + *dev = g_device; + g_device_ref++; + + return BM_SUCCESS; +} + +void bm_device_set_base_reg(bmctx_t ctx, u32 inx, uint64_t addr) { + bm_cmodel_set_base_reg(ctx->dev->model, inx, addr); +} + +uint64_t bm_device_read_base_reg(bmctx_t ctx, u32 inx) { + return bm_cmodel_read_base_reg(ctx->dev->model, inx); +} + +void bm_device_close(bmdev_t dev) { + assert(dev == g_device); + if (--g_device_ref > 0) { + return; + } + mem_pool_destroy(dev->device_mem_pool); + bm_cmodel_exit(dev->model); + + TPU_LOG_DEBUG("device[%d] closed\n", dev->index); + g_device = nullptr; + BMDEV_LOCK_DEINIT(dev); + + delete dev; +} + +int bm_device_get_chip_ver(bmdev_t dev) { return dev->cvk_chip_info.version; } + +bmerr_t bm_context_create(bmctx_t *ctx) { + bm_context_t *pctx = new bm_context_t; + pctx->dev = NULL; + pctx->seq_no = 0; + *ctx = pctx; + return BM_SUCCESS; +} + +void bm_context_destroy(bmctx_t ctx) { + TPU_ASSERT(ctx != nullptr, nullptr); + delete ctx; +} + +bmerr_t bm_bind_device(bmctx_t ctx, bmdev_t dev) { + TPU_ASSERT(ctx != nullptr, nullptr); + ctx->dev = dev; + return BM_SUCCESS; +} + +void bm_unbind_device(bmctx_t ctx) { + TPU_ASSERT(ctx != nullptr, nullptr); + ctx->dev = NULL; +} + +bmdev_t bm_get_device(bmctx_t ctx) { + TPU_ASSERT(ctx->dev != nullptr, nullptr); + return ctx->dev; +} + +bmerr_t 
bm_init(int index, bmctx_t *ctx) { + TPU_ASSERT(index == 0, nullptr); + + bmerr_t ret; + bmdev_t dev = nullptr; + + ret = bm_device_open(index, &dev); + TPU_ASSERT(ret == BM_SUCCESS, nullptr); + + ret = bm_context_create(ctx); + TPU_ASSERT(ret == BM_SUCCESS, nullptr); + + ret = bm_bind_device(*ctx, dev); + TPU_ASSERT(ret == BM_SUCCESS, nullptr); + + return ret; +} + +void bm_exit(bmctx_t ctx) { + bmdev_t dev = ctx->dev; + bm_unbind_device(ctx); + bm_context_destroy(ctx); + bm_device_close(dev); +} + +bmmem_device_t bmmem_device_alloc_raw(bmctx_t ctx, size_t size) { + uint32_t chip_ver = bm_device_get_chip_ver(ctx->dev); + + return getCmdbufPtr(chip_ver)->rt_device_alloc_raw(ctx, size); +} + +bmmem_device_t bmmem_device_prealloc_raw(bmctx_t ctx, bmmem_device_t mem, + uint64_t offset, size_t size) { + (void)ctx; + bm_memory_t *device_mem = new bm_memory_t(); + device_mem->flags.u.is_prealloc = 1; + device_mem->flags.u.type = BMMEM_TYPE_DEVICE; + if (mem) { + TPU_ASSERT(mem->size >= size + offset, nullptr); + device_mem->p_addr = ((bm_memory_t *)mem)->p_addr + offset; + } else { + device_mem->p_addr = offset; + } + + // device_mem->v_addr = NULL; + // device_mem->v_addr = (void*)(device_mem->p_addr + + // (ctx->dev->model.chip.gmem)); + device_mem->v_addr = (uint8_t *)((device_mem->p_addr) + + ((unsigned long long)bm_cmodel_get_chipGmem( + ctx->dev->model))); + device_mem->size = size; + return (bmmem_device_t)device_mem; +} + +void bmmem_device_free(bmctx_t ctx, bmmem_device_t mem) { + uint32_t chip_ver = bm_device_get_chip_ver(ctx->dev); + getCmdbufPtr(chip_ver)->rt_device_free(ctx, mem); +} + +size_t bmmem_device_size(bmmem_device_t mem) { + bm_memory_t *device_mem = (bm_memory_t *)mem; + if (device_mem == NULL) + return 0; + TPU_ASSERT(device_mem->flags.u.type == BMMEM_TYPE_DEVICE, nullptr); + return device_mem->size; +} + +uint64_t bmmem_device_addr(bmmem_device_t mem) { + bm_memory_t *device_mem = (bm_memory_t *)mem; + if (device_mem == NULL) + return 0; + TPU_ASSERT(device_mem->flags.u.type == BMMEM_TYPE_DEVICE, nullptr); + return device_mem->p_addr; +} + +uint8_t *bmmem_device_v_addr(bmmem_device_t mem) { + bm_memory_t *device_mem = (bm_memory_t *)mem; + TPU_ASSERT(device_mem->flags.u.type == BMMEM_TYPE_DEVICE, nullptr); + return device_mem->v_addr; +} + +int32_t bmmem_device_inc_ref(bmmem_device_t mem) { + bm_memory_t *device_mem = (bm_memory_t *)mem; + TPU_ASSERT(device_mem->flags.u.type == BMMEM_TYPE_DEVICE, nullptr); + return (++device_mem->user_ref_cnt); +} + +int32_t bmmem_device_dec_ref(bmmem_device_t mem) { + bm_memory_t *device_mem = (bm_memory_t *)mem; + TPU_ASSERT(device_mem->flags.u.type == BMMEM_TYPE_DEVICE, nullptr); + return (--device_mem->user_ref_cnt); +} + +bmerr_t bm_memcpy_s2d(bmctx_t ctx, bmmem_device_t dst, uint8_t *src) { + bm_memory_t *mem = (bm_memory_t *)dst; + bm_cmodel_write_gmem(ctx->dev->model, mem->p_addr, src, mem->size); + return BM_SUCCESS; +} + +bmerr_t bm_memcpy_s2d_ex(bmctx_t ctx, bmmem_device_t dst, uint8_t *src, + unsigned long long offset, size_t size) { + bm_memory_t *mem = (bm_memory_t *)dst; + bm_cmodel_write_gmem(ctx->dev->model, mem->p_addr + offset, src, size); + return BM_SUCCESS; +} + +bmerr_t bm_memcpy_d2s(bmctx_t ctx, uint8_t *dst, bmmem_device_t src) { + bm_memory_t *mem = (bm_memory_t *)src; + bm_cmodel_read_gmem(ctx->dev->model, mem->p_addr, dst, mem->size); + return BM_SUCCESS; +} + +bmerr_t bmmem_device_flush(bmctx_t ctx, bmmem_device_t dev) { + (void)ctx; + (void)dev; + return BM_SUCCESS; +} + +bmerr_t 
bmmem_device_flush_len(bmctx_t ctx, bmmem_device_t mem, size_t len) { + (void)ctx; + (void)mem; + (void)len; + return BM_SUCCESS; +} + +bmerr_t bmmem_device_invld(bmctx_t ctx, bmmem_device_t dev) { + (void)ctx; + (void)dev; + return BM_SUCCESS; +} + +bmerr_t bmmem_device_invld_len(bmctx_t ctx, bmmem_device_t mem, size_t len) { + (void)ctx; + (void)mem; + (void)len; + return BM_SUCCESS; +} + +bmerr_t bm_send_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + uint16_t *seq_no) { + uint32_t chip_ver = bm_device_get_chip_ver(ctx->dev); + return getCmdbufPtr(chip_ver)->rt_send_cmdbuf(ctx, cmdbuf, sz, seq_no); +} + +bmerr_t bm_wait_cmdbuf_done(bmctx_t ctx, uint16_t seq_no) { + uint32_t chip_ver = bm_device_get_chip_ver(ctx->dev); + return getCmdbufPtr(chip_ver)->rt_wait_cmdbuf_done(ctx, seq_no); +} + +bmerr_t bm_load_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + unsigned long long neuron_gaddr, + unsigned long long weight_gaddr, bool enable_pmu, + bmmem_device_t *cmdbuf_mem) { + + uint32_t chip_ver = bm_device_get_chip_ver(ctx->dev); + + return getCmdbufPtr(chip_ver)->rt_load_cmdbuf( + ctx, cmdbuf, sz, neuron_gaddr, weight_gaddr, enable_pmu, cmdbuf_mem); +} + +bmerr_t bm_load_dmabuf(bmctx_t ctx, bmmem_device_t dmabuf, size_t sz, + unsigned long long neuron_gaddr, + unsigned long long weight_gaddr, bool enable_pmu, + bmmem_device_t *dmabuf_mem) { + + uint32_t chip_ver = bm_device_get_chip_ver(ctx->dev); + + return getCmdbufPtr(chip_ver)->rt_load_dmabuf( + ctx, dmabuf, sz, neuron_gaddr, weight_gaddr, enable_pmu, dmabuf_mem); +} + +bmerr_t bm_run_cmdbuf(bmctx_t ctx, bmmem_device_t cmdbuf_mem, + uint16_t *seq_no) { + uint32_t chip_ver = bm_device_get_chip_ver(ctx->dev); + + return getCmdbufPtr(chip_ver)->rt_run_cmdbuf(ctx, cmdbuf_mem, seq_no); +} + +bmerr_t bm_run_cmdbuf_ex(bmctx_t ctx, bmmem_device_t cmdbuf_mem, + uint16_t *seq_no, uint64_t input_base_addr, + uint64_t output_base_addr) { + uint32_t chip_ver = bm_device_get_chip_ver(ctx->dev); + + return getCmdbufPtr(chip_ver)->rt_run_cmdbuf_ex(ctx, cmdbuf_mem, seq_no, + input_base_addr, output_base_addr); +} + +bmerr_t bm_run_cmdbuf_ex2(bmctx_t ctx, bmmem_device_t cmdbuf_mem, + uint16_t *seq_no, cvi_array_base *p_array_base) { + + uint32_t chip_ver = bm_device_get_chip_ver(ctx->dev); + + return getCmdbufPtr(chip_ver)->rt_run_cmdbuf_ex2(ctx, cmdbuf_mem, seq_no, + p_array_base); +} + +bmerr_t bm_parse_pmubuf(bmmem_device_t cmdbuf_mem, uint8_t **buf_start, + uint32_t *buf_len) { + (void)cmdbuf_mem; + *buf_start = NULL; + *buf_len = 0; + return BM_SUCCESS; +} + +bmerr_t bm_run_cmdbuf_pio(bmctx_t ctx, uint8_t *cmdbuf, size_t sz) { + (void)ctx; + (void)cmdbuf; + (void)sz; + assert(0); // not support + return BM_SUCCESS; +} + +void cviruntime_cvikernel_create(bmctx_t ctx, void **p_bk_ctx) { + TPU_ASSERT(ctx != nullptr, nullptr); + TPU_ASSERT(ctx->dev != nullptr, nullptr); + + cvk_chip_info_t info; + if (bm_device_get_chip_ver(ctx->dev) == BM1880V2_VER) { + info = bmk1880v2_chip_info(); + } else if (bm_device_get_chip_ver(ctx->dev) == BM1822_VER) { + info = bmk1822_chip_info(); + } else + assert(0); + + cvk_chip_info_t *dev_info = &info; + + bmk_info_t bmk_info; + bmk_info.chip_version = dev_info->version; + bmk_info.cmdbuf_size = 0x10000000; + bmk_info.cmdbuf = (u8 *)malloc(bmk_info.cmdbuf_size); + assert(bmk_info.cmdbuf); + if (bm_device_get_chip_ver(ctx->dev) == BM1880V2_VER) { + ctx->cvik_context = bmk1880v2_register(&bmk_info); + } else if (bm_device_get_chip_ver(ctx->dev) == BM1822_VER) { + ctx->cvik_context = bmk1822_register(&bmk_info); + } + 
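// Editorial note: the 0x10000000-byte (256 MiB) buffer malloc'd above stays
// owned by the runtime context; it is stashed in cvik_cmdbuf on the next line
// so cviruntime_cvikernel_destroy() can free() it after cleanup, while
// submit()/reset() reuse it for every command batch.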
ctx->cvik_cmdbuf = (void *)bmk_info.cmdbuf; + + *p_bk_ctx = ctx->cvik_context; +} + +void cviruntime_cvikernel_submit(bmctx_t ctx) { + u32 len; + u8 *cmdbuf; + if (bm_device_get_chip_ver(ctx->dev) == BM1880V2_VER) { + cmdbuf = bmk1880v2_acquire_cmdbuf(ctx->cvik_context, &len); + } else if (bm_device_get_chip_ver(ctx->dev) == BM1822_VER) { + cmdbuf = bmk1822_acquire_cmdbuf(ctx->cvik_context, &len); + } else + assert(0); + uint16_t seq_no; + bm_send_cmdbuf(ctx, cmdbuf, (size_t)len, &seq_no); + if (bm_device_get_chip_ver(ctx->dev) == BM1880V2_VER) { + bmk1880v2_reset(ctx->cvik_context); + } else if (bm_device_get_chip_ver(ctx->dev) == BM1822_VER) { + bmk1822_reset(ctx->cvik_context); + } +} + +void cviruntime_cvikernel_destroy(bmctx_t ctx) { + assert(ctx->cvik_context); + assert(ctx->cvik_cmdbuf); + if (bm_device_get_chip_ver(ctx->dev) == BM1880V2_VER) { + bmk1880v2_cleanup(ctx->cvik_context); + } else if (bm_device_get_chip_ver(ctx->dev) == BM1822_VER) { + bmk1822_cleanup(ctx->cvik_context); + } + free(ctx->cvik_cmdbuf); +} + +CVI_RC CVI_RT_DeInitBK(CVI_RT_HANDLE rt_handle) { + bmctx_t ctx = (bmctx_t)rt_handle; + + // deinit kernel related + if (ctx->cvik_context) { + if (!strcmp(g_run_chip, CVI_TPU_VERSION_183X)) { + bmk1880v2_cleanup(ctx->cvik_context); + } else if (!strcmp(g_run_chip, CVI_TPU_VERSION_182X)) { + bmk1822_cleanup(ctx->cvik_context); + } else { + assert(0); + return CVI_FAILURE; + } + } + + if (ctx->cvk_context) + ctx->cvk_context->ops->cleanup(ctx->cvk_context); + + if (ctx->cvik_cmdbuf) { + free(ctx->cvik_cmdbuf); + } + + // deinit basic context + bm_exit(ctx); + return CVI_SUCCESS; +} + +CVI_RC CVI_RT_InitWithKernelBK(CVI_RT_HANDLE *rt_handle, uint32_t cmdbuf_size) { + bmctx_t *ctx = (bmctx_t *)rt_handle; + + if (!g_run_chip) + g_run_chip = getenv("SET_CHIP_NAME"); + + if (!strcmp(g_run_chip, CVI_TPU_VERSION_183X)) { + bm_init(DEVICE_INDEX_NUM, ctx); + + bmk1880v2_chip_info_t info = bmk1880v2_chip_info(); + bmk1880v2_chip_info_t *dev_info = &info; + + bmk_info_t bmk_info; + bmk_info.chip_version = dev_info->version; + bmk_info.cmdbuf_size = cmdbuf_size; + bmk_info.cmdbuf = (u8 *)malloc(bmk_info.cmdbuf_size); + assert(bmk_info.cmdbuf); + + (*ctx)->cvik_context = bmk1880v2_register(&bmk_info); + (*ctx)->cvik_cmdbuf = (void *)bmk_info.cmdbuf; + (*ctx)->cvk_context = nullptr; + return CVI_SUCCESS; + + } else if (!strcmp(g_run_chip, CVI_TPU_VERSION_182X)) { + bm_init(DEVICE_INDEX_NUM, ctx); + + bmk1822_chip_info_t info = bmk1822_chip_info(); + bmk1822_chip_info_t *dev_info = &info; + + bmk_info_t bmk_info; + bmk_info.chip_version = dev_info->version; + bmk_info.cmdbuf_size = cmdbuf_size; + bmk_info.cmdbuf = (u8 *)malloc(bmk_info.cmdbuf_size); + assert(bmk_info.cmdbuf); + + (*ctx)->cvik_context = bmk1822_register(&bmk_info); + (*ctx)->cvik_cmdbuf = (void *)bmk_info.cmdbuf; + (*ctx)->cvk_context = nullptr; + return CVI_SUCCESS; + } else { + assert(0); + } + + return CVI_FAILURE; +} + +CVI_RC CVI_RT_SubmitBK(CVI_RT_HANDLE rt_handle) { + cviruntime_cvikernel_submit((bmctx_t)rt_handle); + return CVI_SUCCESS; +} + +CVI_RT_KHANDLE CVI_RT_GetKHandleBK(CVI_RT_HANDLE rt_handle) { + bmctx_t ctx = (bmctx_t)rt_handle; + return (CVI_RT_KHANDLE)(ctx->cvik_context); +} + +CVI_RC CVI_RT_SubmitPio(CVI_RT_HANDLE rt_handle) { + (void)rt_handle; + assert(0); // not support + return CVI_SUCCESS; +} + +CVI_RC CVI_RT_Init(CVI_RT_HANDLE *rt_handle) { + bmctx_t *ctx = (bmctx_t *)rt_handle; + bm_init(DEVICE_INDEX_NUM, ctx); + return CVI_SUCCESS; +} + +CVI_RC CVI_RT_DeInit(CVI_RT_HANDLE rt_handle) { + 
bmctx_t ctx = (bmctx_t)rt_handle; + + // deinit basic context + bm_exit(ctx); + return CVI_SUCCESS; +} + +CVI_RT_KHANDLE CVI_RT_RegisterKernel(CVI_RT_HANDLE rt_handle, + uint32_t cmdbuf_size) { + bmctx_t ctx = (bmctx_t)rt_handle; + cvk_reg_info_t req_info; + cvk_context_t *tmp_cvk_context; + cvi_rt_submit *submit_handle; + + // reset req_info + memset(&req_info, 0, sizeof(cvk_reg_info_t)); + + if (!strcmp(g_run_chip, CVI_TPU_VERSION_183X)) { + strncpy(req_info.chip_ver_str, "cv183x", sizeof(req_info.chip_ver_str) - 1); + } else if (!strcmp(g_run_chip, CVI_TPU_VERSION_182X)) { + strncpy(req_info.chip_ver_str, "cv182x", sizeof(req_info.chip_ver_str) - 1); + } else if (!strcmp(g_run_chip, CVI_TPU_VERSION_181X)) { + strncpy(req_info.chip_ver_str, "cv181x", sizeof(req_info.chip_ver_str) - 1); + } else if (!strcmp(g_run_chip, CVI_TPU_VERSION_180X)) { + strncpy(req_info.chip_ver_str, "cv180x", sizeof(req_info.chip_ver_str) - 1); + } else { + assert(0); + return NULL; + } + + req_info.cmdbuf_size = cmdbuf_size; + req_info.cmdbuf = (uint8_t *)malloc(req_info.cmdbuf_size); + assert(req_info.cmdbuf && "Expect allocated cmdbuf"); + + // register cvikernel + tmp_cvk_context = cvikernel_register(&req_info); + submit_handle = (cvi_rt_submit *)malloc(sizeof(cvi_rt_submit)); + assert(submit_handle && "Expect allocated kernel context"); + memset(submit_handle, 0, sizeof(cvi_rt_submit)); + + // assign handle mapping related, and reassign cvikernel handle + memcpy(submit_handle, tmp_cvk_context, sizeof(cvk_context_t)); + submit_handle->rt_ctx = ctx; + submit_handle->cmdbuf = req_info.cmdbuf; + submit_handle->magic = SUBMIT_MAGIC; + free(tmp_cvk_context); + + return submit_handle; +} + +CVI_RC CVI_RT_UnRegisterKernel(CVI_RT_KHANDLE rt_khandle) { + cvk_context_t *cvk_context = (cvk_context_t *)rt_khandle; + cvi_rt_submit *submit_handle = (cvi_rt_submit *)rt_khandle; + + if (!cvk_context) { + assert(0 && "CVI_RT_UnRegisterKernel() NULL kernel handle"); + return CVI_FAILURE; + } + + if (cvk_context) + cvk_context->ops->cleanup(cvk_context); + + if (cvk_context->priv_data) { + // priv_data alloc by malloc of cvikernel_1880v2 + free(cvk_context->priv_data); + } + + if (submit_handle->cmdbuf) { + free(submit_handle->cmdbuf); + } + + free(rt_khandle); + return CVI_SUCCESS; +} + +CVI_RC CVI_RT_Submit(CVI_RT_HANDLE rt_khandle) { + cvi_rt_submit *submit_handle = (cvi_rt_submit *)rt_khandle; + uint32_t len; + uint16_t seq_no; + + if (submit_handle->magic != SUBMIT_MAGIC) { + TPU_LOG_WARNING("incorrect submit handle input\n"); + return CVI_FAILURE; + } + + cvk_context_t *cvk_context = &submit_handle->cvk_ctx; + uint8_t *cmdbuf = cvk_context->ops->acquire_cmdbuf(cvk_context, &len); + + bm_send_cmdbuf(submit_handle->rt_ctx, cmdbuf, (size_t)len, &seq_no); + cvk_context->ops->reset(cvk_context); + + return CVI_SUCCESS; +} + +CVI_RC CVI_RT_SubmitAsync(CVI_RT_KHANDLE rt_khandle, uint8_t submit_previous) { + (void)rt_khandle; + (void)submit_previous; + assert(0); // not support + return CVI_FAILURE; +} +CVI_RC CVI_RT_WaitForAsync(CVI_RT_KHANDLE rt_khandle) { + (void)rt_khandle; + assert(0); // not support + return CVI_FAILURE; +} + +CVI_RC CVI_RT_LoadCmdbuf(CVI_RT_HANDLE rt_handle, uint8_t *cmdbuf, + uint64_t cmdbuf_sz, uint64_t gaddr_base0, + uint64_t gaddr_base1, bool enable_pmu, + CVI_RT_MEM *cmdbuf_mem) { + return (CVI_RC)bm_load_cmdbuf((bmctx_t)rt_handle, cmdbuf, (size_t)cmdbuf_sz, + (unsigned long long)gaddr_base0, + (unsigned long long)gaddr_base1, enable_pmu, + (bmmem_device_t *)cmdbuf_mem); +} + +CVI_RC 
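// Editorial note: in CVI_RT_LoadCmdbuf above (and CVI_RT_LoadDmabuf below),
// gaddr_base0 and gaddr_base1 are forwarded as the neuron (activation) and
// weight global-memory base addresses; a cmdbuf is compiled against these
// relocatable bases and only bound to concrete allocations at load time.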
CVI_RT_LoadDmabuf( + CVI_RT_HANDLE rt_handle, CVI_RT_MEM dmabuf, + uint64_t dmabuf_sz, uint64_t gaddr_base0, + uint64_t gaddr_base1, bool enable_pmu, CVI_RT_MEM *dmabuf_mem) { + return (CVI_RC)bm_load_dmabuf((bmctx_t)rt_handle, (bmmem_device_t)dmabuf, + (size_t)dmabuf_sz, + (unsigned long long)gaddr_base0, + (unsigned long long)gaddr_base1, enable_pmu, + (bmmem_device_t *)dmabuf_mem); +} + +CVI_RC CVI_RT_RunCmdbuf(CVI_RT_HANDLE rt_handle, CVI_RT_MEM cmdbuf_mem, + uint64_t gaddr_base2, uint64_t gaddr_base3) { + + CVI_RC ret; + uint16_t seq_no; + ret = (CVI_RC)bm_run_cmdbuf_ex((bmctx_t)rt_handle, (bmmem_device_t)cmdbuf_mem, + &seq_no, gaddr_base2, gaddr_base3); + if (ret != 0) + return ret; + + return (CVI_RC)bm_wait_cmdbuf_done((bmctx_t)rt_handle, seq_no); +} + +CVI_RC CVI_RT_RunCmdbufEx(CVI_RT_HANDLE rt_handle, CVI_RT_MEM cmdbuf_mem, + CVI_RT_ARRAYBASE *p_array_base) { + CVI_RC ret; + uint16_t seq_no; + + ret = (CVI_RC)bm_run_cmdbuf_ex2((bmctx_t)rt_handle, (bmmem_device_t)cmdbuf_mem, + &seq_no, (cvi_array_base *)p_array_base); + if (ret != 0) + return ret; + + return (CVI_RC)bm_wait_cmdbuf_done((bmctx_t)rt_handle, seq_no); +} + +CVI_RT_MEM CVI_RT_MemAlloc(CVI_RT_HANDLE rt_handle, uint64_t size) { + bmctx_t ctx = (bmctx_t)rt_handle; + uint32_t chip_ver = bm_device_get_chip_ver(ctx->dev); + + return getCmdbufPtr(chip_ver)->rt_device_alloc_raw(ctx, size); +} + +CVI_RT_MEM CVI_RT_MemPreAlloc(CVI_RT_MEM mem, uint64_t offset, uint64_t size) { + bm_memory_t *dev_mem = (bm_memory_t *)mem; + TPU_ASSERT(dev_mem != nullptr, nullptr); + + bm_memory_t *preAlloc_mem = new bm_memory_t(); + preAlloc_mem->flags.u.is_prealloc = 1; + preAlloc_mem->flags.u.type = BMMEM_TYPE_DEVICE; + TPU_ASSERT(dev_mem->size >= size + offset, nullptr); + preAlloc_mem->p_addr = dev_mem->p_addr + offset; + preAlloc_mem->v_addr = (uint8_t *)(dev_mem->v_addr + offset); + preAlloc_mem->size = size; + return (CVI_RT_MEM)preAlloc_mem; +} + +void CVI_RT_MemFree(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem) { + bm_memory_t *dev_mem = (bm_memory_t *)mem; + bmctx_t ctx = (bmctx_t)rt_handle; + uint32_t chip_ver = bm_device_get_chip_ver(ctx->dev); + + getCmdbufPtr(chip_ver)->rt_device_free(ctx, dev_mem); +} + +uint64_t CVI_RT_MemGetSize(CVI_RT_MEM mem) { + if (!mem) + return 0; + bm_memory_t *dev_mem = (bm_memory_t *)mem; + TPU_ASSERT(dev_mem->flags.u.type == BMMEM_TYPE_DEVICE, nullptr); + return dev_mem->size; +} + +uint64_t CVI_RT_MemGetPAddr(CVI_RT_MEM mem) { + if (!mem) + return 0; + bm_memory_t *dev_mem = (bm_memory_t *)mem; + TPU_ASSERT(dev_mem->flags.u.type == BMMEM_TYPE_DEVICE, nullptr); + return dev_mem->p_addr; +} + +uint8_t *CVI_RT_MemGetVAddr(CVI_RT_MEM mem) { + if (!mem) + return 0; + bm_memory_t *dev_mem = (bm_memory_t *)mem; + TPU_ASSERT(dev_mem->flags.u.type == BMMEM_TYPE_DEVICE, nullptr); + return dev_mem->v_addr; +} + +int32_t CVI_RT_MemIncRef(CVI_RT_MEM mem) { + bm_memory_t *device_mem = (bm_memory_t *)mem; + TPU_ASSERT(device_mem->flags.u.type == BMMEM_TYPE_DEVICE, nullptr); + return (++device_mem->user_ref_cnt); +} + +int32_t CVI_RT_MemDecRef(CVI_RT_MEM mem) { + bm_memory_t *device_mem = (bm_memory_t *)mem; + TPU_ASSERT(device_mem->flags.u.type == BMMEM_TYPE_DEVICE, nullptr); + return (--device_mem->user_ref_cnt); +} + +CVI_RC CVI_RT_MemFlush(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem) { + (void)rt_handle; + (void)mem; + return CVI_SUCCESS; +} + +CVI_RC CVI_RT_MemInvld(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem) { + (void)rt_handle; + (void)mem; + return CVI_SUCCESS; +} + +CVI_RC CVI_RT_MemFlushEx(CVI_RT_HANDLE rt_handle, 
CVI_RT_MEM mem, + uint64_t len) { + (void)rt_handle; + (void)mem; + (void)len; + return CVI_SUCCESS; +} + +CVI_RC CVI_RT_MemInvldEx(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem, + uint64_t len) { + (void)rt_handle; + (void)mem; + (void)len; + return CVI_SUCCESS; +} + +CVI_RC CVI_RT_MemCopyS2D(CVI_RT_HANDLE rt_handle, CVI_RT_MEM dst, + uint8_t *src) { + bmctx_t ctx = (bmctx_t)rt_handle; + bm_memory_t *mem = (bm_memory_t *)dst; + bm_cmodel_write_gmem(ctx->dev->model, mem->p_addr, src, mem->size); + return CVI_SUCCESS; +} + +CVI_RC CVI_RT_MemCopyS2DEx(CVI_RT_HANDLE rt_handle, CVI_RT_MEM dst, + uint64_t offset, uint64_t len, uint8_t *src) { + bmctx_t ctx = (bmctx_t)rt_handle; + bm_memory_t *mem = (bm_memory_t *)dst; + TPU_ASSERT((size_t)(offset + len) <= mem->size, nullptr); + bm_cmodel_write_gmem(ctx->dev->model, mem->p_addr + offset, src, len); + return CVI_SUCCESS; +} + +CVI_RC CVI_RT_MemCopyD2S(CVI_RT_HANDLE rt_handle, uint8_t *dst, + CVI_RT_MEM src) { + bmctx_t ctx = (bmctx_t)rt_handle; + bm_memory_t *mem = (bm_memory_t *)src; + bm_cmodel_read_gmem(ctx->dev->model, mem->p_addr, dst, mem->size); + return CVI_SUCCESS; +} + +CVI_RC CVI_RT_ParsePmuBuf(CVI_RT_MEM cmdbuf_mem, uint8_t **buf_start, + uint32_t *buf_len) { + return (CVI_RC)bm_parse_pmubuf((bmmem_device_t)cmdbuf_mem, buf_start, + buf_len); +} + +CVI_RC CVI_RT_SetBaseReg(CVI_RT_HANDLE rt_handle, uint32_t inx, + uint64_t base_addr) { + bmctx_t ctx = (bmctx_t)rt_handle; + bm_cmodel_set_base_reg(ctx->dev->model, inx, base_addr); + return CVI_SUCCESS; +} + +CVI_RC CVI_RT_LoadCmdbufTee(CVI_RT_HANDLE rt_handle, uint8_t *cmdbuf, size_t sz, + uint64_t neuron_gaddr, uint64_t weight_gaddr, + uint32_t weight_len, CVI_RT_MEM *cmdbuf_mem) { + (void)rt_handle; + (void)cmdbuf; + (void)sz; + (void)neuron_gaddr; + (void)weight_gaddr; + (void)weight_len; + (void)cmdbuf_mem; + assert(0); // not support + return CVI_SUCCESS; +} + +CVI_RC CVI_RT_RunCmdbufTee(CVI_RT_HANDLE rt_handle, CVI_RT_MEM cmdbuf_mem, + CVI_RT_ARRAYBASE *p_array_base) { + (void)rt_handle; + (void)p_array_base; + (void)cmdbuf_mem; + assert(0); // not support + return CVI_SUCCESS; +} diff --git a/cviruntime/src/common/alloc.cpp b/cviruntime/src/common/alloc.cpp new file mode 100644 index 000000000..608a433be --- /dev/null +++ b/cviruntime/src/common/alloc.cpp @@ -0,0 +1,52 @@ +#include +#include +#include "alloc.h" +#include "runtime/debug.h" + +#define UNUSED(x) (void)(x) + +namespace cvi { +namespace runtime { + +static CVI_RT_MEM cvi_def_mem_alloc(CVI_RT_HANDLE rt_handle, uint64_t size, + CVI_ALLOC_TYPE type, const char *name) { + (void)type; + (void)name; + return CVI_RT_MemAlloc(rt_handle, size); +} + +static CVI_MEM_ALLOC_CB mem_alloc_cb = cvi_def_mem_alloc; +static CVI_MEM_FREE_CB mem_free_cb = CVI_RT_MemFree; +static std::mutex gMutex; + +CVI_RT_MEM cviMemAlloc(CVI_RT_HANDLE rt_handle, uint64_t size, CVI_ALLOC_TYPE type, const char *name) { + return mem_alloc_cb(rt_handle, size, type, name); +} + +void cviMemFree(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem) { + return mem_free_cb(rt_handle, mem); +} + +CVI_RC cviSetMemCallback(CVI_MEM_ALLOC_CB mem_alloc, CVI_MEM_FREE_CB mem_free) { + std::unique_lock lk(gMutex); + if (!mem_alloc) { + TPU_LOG_ERROR("CVI_MEM_ALLOC_CB is null\n"); + return -1; + } + if (!mem_free) { + TPU_LOG_ERROR("CVI_MEM_FREE_CB is null\n"); + return -1; + } + mem_alloc_cb = mem_alloc; + mem_free_cb = mem_free; + return 0; +} + +void cviResetMemCallback() { + std::unique_lock lk(gMutex); + mem_alloc_cb = cvi_def_mem_alloc; + mem_free_cb = CVI_RT_MemFree; +} + +} // 
namespace runtime
+} // namespace cvi
diff --git a/cviruntime/src/common/alloc.h b/cviruntime/src/common/alloc.h
new file mode 100644
index 000000000..86d30984e
--- /dev/null
+++ b/cviruntime/src/common/alloc.h
@@ -0,0 +1,13 @@
+#pragma once
+#include "cviruntime_context.h"
+
+namespace cvi {
+namespace runtime {
+
+CVI_RT_MEM cviMemAlloc(CVI_RT_HANDLE rt_handle, uint64_t size, CVI_ALLOC_TYPE type, const char * name);
+void cviMemFree(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem);
+CVI_RC cviSetMemCallback(CVI_MEM_ALLOC_CB mem_alloc, CVI_MEM_FREE_CB mem_free);
+void cviResetMemCallback();
+
+} // namespace runtime
+} // namespace cvi
\ No newline at end of file
diff --git a/cviruntime/src/common/cpu_function/argmax.cpp b/cviruntime/src/common/cpu_function/argmax.cpp
new file mode 100644
index 000000000..4ea8fe1b3
--- /dev/null
+++ b/cviruntime/src/common/cpu_function/argmax.cpp
@@ -0,0 +1,69 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace cvi {
+namespace runtime {
+
+void ArgMaxFunc::setup(std::vector<std::shared_ptr<Neuron>> &inputs,
+                       std::vector<std::shared_ptr<Neuron>> &outputs,
+                       OpParam &param) {
+  (void)param;
+  _bottom = inputs[0];
+  _max_map = inputs[1];
+  _top = outputs[0];
+  auto axis = param.get<int32_t>("axis");
+  for (int i = 0; i < axis; i++) {
+    _outer_dim *= _bottom->shape[i];
+  }
+  _inner_dim = _bottom->shape[axis];
+  _tile_num = (_inner_dim + 256 - 1) / 256;
+}
+
+void ArgMaxFunc::run() {
+  if (_bottom->fmt == CVI_FMT_INT8) {
+    argmax<int8_t>();
+  } else {
+    argmax<uint16_t>();
+  }
+}
+
+template <typename T>
+void ArgMaxFunc::argmax() {
+  auto data = _bottom->cpu_data<T>();
+  auto map = _max_map->cpu_data<T>();
+  auto top = _top->cpu_data<float>();
+
+  for (int i = 0; i < _outer_dim; ++i) {
+    T max_val = 0;
+    int idx = 0;
+    auto map_ptr = map + i * _tile_num;
+    // find max_val
+    for (int j = 0; j < _tile_num; j++) {
+      if (map_ptr[j] < 0) {
+        continue;
+      }
+      if (map_ptr[j] > max_val) {
+        max_val = map_ptr[j];
+        idx = j;
+      }
+    }
+    int offset = idx * 256;
+    int len = std::min(_inner_dim - offset, 256);
+    auto ptr = data + i * _inner_dim + offset;
+    idx = 0;
+    for (int j = 0; j < len; ++j) {
+      if (ptr[j] == max_val) {
+        idx = j;
+        break;
+      }
+    }
+    top[i] = (float)(idx + offset);
+  }
+}
+
+}
+}
diff --git a/cviruntime/src/common/cpu_function/argmax.hpp b/cviruntime/src/common/cpu_function/argmax.hpp
new file mode 100644
index 000000000..5fa7459e4
--- /dev/null
+++ b/cviruntime/src/common/cpu_function/argmax.hpp
@@ -0,0 +1,36 @@
+#include
+#include
+#include
+#include
+
+
+namespace cvi {
+namespace runtime {
+
+class ArgMaxFunc : public ICpuFunction {
+
+public:
+  void setup(std::vector<std::shared_ptr<Neuron>> &inputs,
+             std::vector<std::shared_ptr<Neuron>> &outputs,
+             OpParam &param);
+  void run();
+
+  static ICpuFunction *open() { return new ArgMaxFunc(); }
+  static void close(ICpuFunction *func) { delete func; }
+
+private:
+  std::shared_ptr<Neuron> _bottom;
+  std::shared_ptr<Neuron> _top;
+  std::shared_ptr<Neuron> _max_map;
+
+  template <typename T>
+  void argmax();
+
+  int _outer_dim = 1;
+  int _inner_dim = 1;
+  int _tile_num = 1;
+
+};
+
+}
+}
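Editorial note, not part of the diff: ArgMaxFunc is the second half of a two-stage reduction. _max_map arrives pre-filled with per-256-element tile maxima (evidently produced by the TPU portion of the graph), so the host only picks the winning tile and rescans at most 256 elements instead of the whole axis. A standalone sketch of that scheme, assuming non-empty input:

#include <algorithm>
#include <cstdint>
#include <vector>

// Two-stage argmax over tiles of 256, mirroring ArgMaxFunc::argmax():
// stage 1 (done on the TPU in the real flow) reduces each tile to its max;
// stage 2 picks the winning tile, then scans only that tile for the index.
static int tiled_argmax(const std::vector<int8_t> &data) {
  const int tile = 256;
  const int n = static_cast<int>(data.size());
  const int ntiles = (n + tile - 1) / tile;

  // Stage 1: per-tile maxima (this is what _max_map contains).
  std::vector<int8_t> max_map(ntiles);
  for (int t = 0; t < ntiles; ++t) {
    int begin = t * tile;
    int end = std::min(begin + tile, n);
    max_map[t] = *std::max_element(data.begin() + begin, data.begin() + end);
  }

  // Stage 2: winning tile, then a short scan inside it.
  int best_tile = std::max_element(max_map.begin(), max_map.end()) - max_map.begin();
  int begin = best_tile * tile;
  int end = std::min(begin + tile, n);
  for (int j = begin; j < end; ++j)
    if (data[j] == max_map[best_tile])
      return j;  // first occurrence, as in the code above
  return begin;  // not reached
}

diff --git a/cviruntime/src/common/cpu_function/argmax_v2.cpp b/cviruntime/src/common/cpu_function/argmax_v2.cpp
new file mode 100644
index 000000000..c70c13a2a
--- /dev/null
+++ b/cviruntime/src/common/cpu_function/argmax_v2.cpp
@@ -0,0 +1,86 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace cvi {
+namespace runtime {
+
+static inline float BF16(const uint16_t & data) {
+  float data_f32 = 0.0f;
+  uint16_t *p_data_bf16 = (uint16_t*)(&data_f32);
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+  p_data_bf16[0] = data;
+#else
+  p_data_bf16[1] = data;
+#endif
+  return data_f32;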
+} + +void ArgMaxV2Func::setup(std::vector> &inputs, + std::vector> &outputs, + OpParam ¶m) { + (void)param; + _bottom = inputs[0]; + _max_map = inputs[1]; + _top = outputs[0]; + auto axis = param.get("axis"); + if (_bottom->fmt == CVI_FMT_INT8) { + scale = param.get("scale"); + } + for (int i = 0; i < axis; i++) { + _outer_dim *= _bottom->shape[i]; + } + _inner_dim = _bottom->shape[axis]; + _tile_num = (_inner_dim + 256 - 1) / 256; +} + +void ArgMaxV2Func::run() { + if (_bottom->fmt == CVI_FMT_INT8) { + argmax(); + } else { + argmax(); + } +} + +template +void ArgMaxV2Func::argmax() { + auto data = _bottom->cpu_data(); + auto map = _max_map->cpu_data(); + auto top = _top->cpu_data(); + float max_val_fp32 = 0; + for (int i = 0; i < _outer_dim; ++i) { + auto map_ptr = map + i * _tile_num; + T max_val = map_ptr[0]; + int idx = 0; + // find max_val + for (int j = 1; j < _tile_num; j++) { + if (map_ptr[j] > max_val) { + max_val = map_ptr[j]; + idx = j; + } + } + int offset = idx * 256; + int len = std::min(_inner_dim - offset, 256); + auto ptr = data + i * _inner_dim + offset; + idx = 0; + for (int j = 0; j < len; ++j) { + if (ptr[j] == max_val) { + idx = j; + break; + } + } + if (std::is_same::value) { + max_val_fp32 = BF16(max_val); + } else { + max_val_fp32 = (int)max_val; + } + top[2 * i] = max_val_fp32 * scale; + top[2 * i + 1] = (float)(idx + offset); + } +} + +} +} diff --git a/cviruntime/src/common/cpu_function/argmax_v2.hpp b/cviruntime/src/common/cpu_function/argmax_v2.hpp new file mode 100644 index 000000000..7e08e6b20 --- /dev/null +++ b/cviruntime/src/common/cpu_function/argmax_v2.hpp @@ -0,0 +1,36 @@ +#include +#include +#include +#include + + +namespace cvi { +namespace runtime { + +class ArgMaxV2Func : public ICpuFunction { + +public: + void setup(std::vector> &inputs, + std::vector> &outputs, + OpParam ¶m); + void run(); + + static ICpuFunction *open() { return new ArgMaxV2Func(); } + static void close(ICpuFunction *func) { delete func; } + +private: + std::shared_ptr _bottom; + std::shared_ptr _top; + std::shared_ptr _max_map; + + template + void argmax(); + + int _outer_dim = 1; + int _inner_dim = 1; + int _tile_num = 1; + float scale = 1.; +}; + +} +} diff --git a/cviruntime/src/common/cpu_function/argmax_v3.cpp b/cviruntime/src/common/cpu_function/argmax_v3.cpp new file mode 100644 index 000000000..20b90caac --- /dev/null +++ b/cviruntime/src/common/cpu_function/argmax_v3.cpp @@ -0,0 +1,93 @@ +#include +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +static inline float BF16(const uint16_t &data) { + float data_f32 = 0.0f; + uint16_t *p_data_bf16 = (uint16_t *)(&data_f32); +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + p_data_bf16[0] = data; +#else + p_data_bf16[1] = data; +#endif + return data_f32; +} + +void ArgMaxV3Func::setup(std::vector> &inputs, + std::vector> &outputs, + OpParam ¶m) { + (void)param; + _bottom = inputs[0]; + _max_map = inputs[1]; + _indices = outputs[0]; + _values = nullptr; + if (outputs.size() > 1) { + _values = outputs[1]; + } + auto axis = param.get("axis"); + if (param.has("scale")) { + scale = param.get("scale"); + } + for (int i = 0; i < axis; i++) { + _outer_dim *= _bottom->shape[i]; + } + _inner_dim = _bottom->shape[axis]; + _tile_num = (_inner_dim + 256 - 1) / 256; +} + +void ArgMaxV3Func::run() { + if (_bottom->fmt == CVI_FMT_INT8) { + argmax(); + } else { + argmax(); + } +} + +template +void ArgMaxV3Func::argmax() { + auto data = _bottom->cpu_data(); + auto map = _max_map->cpu_data(); + auto 
indices = _indices->cpu_data(); + auto values = _values ? _values->cpu_data() : nullptr; + float max_val_fp32 = 0; + for (int i = 0; i < _outer_dim; ++i) { + auto map_ptr = map + i * _tile_num; + T max_val = map_ptr[0]; + int idx = 0; + // find max_val + for (int j = 1; j < _tile_num; j++) { + if (map_ptr[j] > max_val) { + max_val = map_ptr[j]; + idx = j; + } + } + int offset = idx * 256; + int len = std::min(_inner_dim - offset, 256); + auto ptr = data + i * _inner_dim + offset; + idx = 0; + for (int j = 0; j < len; ++j) { + if (ptr[j] == max_val) { + idx = j; + break; + } + } + indices[i] = (float)(idx + offset); + if (values) { + if (std::is_same::value) { + max_val_fp32 = BF16(max_val); + } else { + max_val_fp32 = (int)max_val * scale; + } + values[i] = max_val_fp32; + } + } +} + +} // namespace runtime +} // namespace cvi diff --git a/cviruntime/src/common/cpu_function/argmax_v3.hpp b/cviruntime/src/common/cpu_function/argmax_v3.hpp new file mode 100644 index 000000000..b5cb32994 --- /dev/null +++ b/cviruntime/src/common/cpu_function/argmax_v3.hpp @@ -0,0 +1,37 @@ +#include +#include +#include +#include + + +namespace cvi { +namespace runtime { + +class ArgMaxV3Func : public ICpuFunction { + +public: + void setup(std::vector> &inputs, + std::vector> &outputs, + OpParam ¶m); + void run(); + + static ICpuFunction *open() { return new ArgMaxV3Func(); } + static void close(ICpuFunction *func) { delete func; } + +private: + std::shared_ptr _bottom; + std::shared_ptr _indices; + std::shared_ptr _values; + std::shared_ptr _max_map; + + template + void argmax(); + + int _outer_dim = 1; + int _inner_dim = 1; + int _tile_num = 1; + float scale = 1.; +}; + +} +} diff --git a/cviruntime/src/common/cpu_function/cumsum.cpp b/cviruntime/src/common/cpu_function/cumsum.cpp new file mode 100644 index 000000000..bf5c9a0dd --- /dev/null +++ b/cviruntime/src/common/cpu_function/cumsum.cpp @@ -0,0 +1,59 @@ +#include +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +CumSumFunc::~CumSumFunc() {} + +void CumSumFunc::setup(std::vector> &inputs, + std::vector> &outputs, + OpParam ¶m) { + _bottoms = inputs; + _tops = outputs; + _axis = param.get("axis"); + +} + +void CumSumFunc::run() { + auto bottom_data = _bottoms[0]->cpu_data(); + auto top_data = _tops[0]->cpu_data(); + std::vector shape = _bottoms[0]->shape; + + int length = shape[_axis]; + int stride = 1; + + for (size_t i = _axis + 1; i < shape.size(); i++) { + stride *= shape[i]; + } + int numelement = 1; + for (size_t i = 0; i < shape.size(); i++) { + numelement *= shape[i]; + } + + // int num_elements = _tops[0]->size(); + int cur_index = 0; + while (cur_index < numelement) { + for (int l = 0; l < length; l++) { + int start = cur_index + l * stride; + for (int s = 0; s < stride; s++) { + if (l == 0) { + top_data[start + s] = bottom_data[start + s]; + } else { + top_data[start + s] = bottom_data[start + s] + + top_data[start + s - stride]; + } + } + + } + cur_index += length * stride; + } + + +} +} +} \ No newline at end of file diff --git a/cviruntime/src/common/cpu_function/cumsum.hpp b/cviruntime/src/common/cpu_function/cumsum.hpp new file mode 100644 index 000000000..98046e68b --- /dev/null +++ b/cviruntime/src/common/cpu_function/cumsum.hpp @@ -0,0 +1,30 @@ +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +class CumSumFunc : public ICpuFunction { + +public: + CumSumFunc() {} + + ~CumSumFunc(); + + void setup(std::vector > &inputs, + std::vector > &outputs, + OpParam ¶m); + 
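// Editorial note: run() below computes an inclusive prefix sum along _axis.
// With stride = product of the dimensions after _axis, element l of every
// slice accumulates as top[i] = bottom[i] + top[i - stride] for l > 0.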
void run(); + static ICpuFunction *open() { return new CumSumFunc(); } + static void close(ICpuFunction *func) { delete func; } + +private: + std::vector > _bottoms; + std::vector > _tops; + + int _axis; +}; +} +} \ No newline at end of file diff --git a/cviruntime/src/common/cpu_function/deform_im2col.cpp b/cviruntime/src/common/cpu_function/deform_im2col.cpp new file mode 100644 index 000000000..c7160d143 --- /dev/null +++ b/cviruntime/src/common/cpu_function/deform_im2col.cpp @@ -0,0 +1,90 @@ +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cvi { +namespace runtime { + +// deconstructor +DeformableIm2ColFunc::~DeformableIm2ColFunc() {} + +void DeformableIm2ColFunc::setup(tensor_list_t &inputs, + tensor_list_t &outputs, + OpParam ¶m) { + + // init in/out + top_ = outputs[0]; + bottom_ = inputs[0]; + offset_ = inputs[1]; + mask_ = inputs[2]; + + // init parameter + kernel_h_ = param.get("kernel_h"); + kernel_w_ = param.get("kernel_w"); + stride_h_ = param.get("stride_h"); + stride_w_ = param.get("stride_w"); + pad_h_ = param.get("padding_t"); + pad_w_ = param.get("padding_l"); + dilation_h_ = param.get("dilation_h"); + dilation_w_ = param.get("dilation_w"); + deformable_group_ = param.get("deform_group"); + + // sanity check + assert((CVI_FMT_FP32 == bottom_->fmt && bottom_->fmt == top_->fmt) && + "ONLY support fp32 now"); +} + +void DeformableIm2ColFunc::run () { + + auto shape = bottom_->shape; + const int batch = shape[0]; + const int channels = shape[1]; + const int height = shape[2]; + const int width = shape[3]; + + const int height_out = (height + 2 * pad_h_ - (dilation_h_ * (kernel_h_ - 1) + 1)) / stride_h_ + 1; + const int width_out = (width + 2 * pad_w_ - (dilation_w_ * (kernel_w_ - 1) + 1)) / stride_w_ + 1; + + printf("input shape: (%d, %d, %d, %d)\n", batch, channels, height, width); + printf("output shape: (%d, %d)\n", channels*kernel_h_*kernel_w_, height_out * width_out); + + std::vector ones(height_out*width_out, 1); + std::vector columns(channels * kernel_h_ * kernel_w_ * 1 * height_out * width_out); + auto output_ = top_; + int input_chw = channels * height * width * 1; + int offset_chw = offset_->shape[1] * offset_->shape[2] * offset_->shape[3] * 1; + int mask_chw = mask_->shape[1] * mask_->shape[2] * mask_->shape[3] * 1; + int output_chw = output_->shape[1] * output_->shape[2] * output_->shape[3] * 1; + + for (int b = 0; b < batch; b++) + { + auto input_n = &bottom_->cpu_data()[b * input_chw]; + auto offset_n = &offset_->cpu_data()[b * offset_chw]; + auto mask_n = &mask_->cpu_data()[b * mask_chw]; + auto output_n = &output_->cpu_data()[b * output_chw]; + + auto t1 = std::chrono::system_clock::now(); + + modulated_deformable_im2col_cpu(input_n, + offset_n, + mask_n, + 1, channels, height, width, + height_out, width_out, kernel_h_, kernel_w_, + pad_h_, pad_w_, stride_h_, stride_w_, dilation_h_, dilation_w_, + deformable_group_, + output_n); + + auto t2 = std::chrono::system_clock::now(); + std::chrono::duration duration = t2 - t1; + std::cout << "duration: " << duration.count() << "(ms)\n"; + } +} + +} // namespace runtime +} // namespace cvi diff --git a/cviruntime/src/common/cpu_function/deform_im2col.hpp b/cviruntime/src/common/cpu_function/deform_im2col.hpp new file mode 100644 index 000000000..0ecb36cab --- /dev/null +++ b/cviruntime/src/common/cpu_function/deform_im2col.hpp @@ -0,0 +1,166 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { 
+ +class DeformableIm2ColFunc : public ICpuFunction { + +public: + DeformableIm2ColFunc() {} + + ~DeformableIm2ColFunc(); + void setup(tensor_list_t &inputs, + tensor_list_t &outputs, + OpParam ¶m); + void run(); + + static ICpuFunction *open() { return new DeformableIm2ColFunc(); } + static void close(ICpuFunction *func) { delete func; } + +private: + // in/out + std::shared_ptr bottom_; + std::shared_ptr top_; + std::shared_ptr offset_; + std::shared_ptr mask_; + + // parameters + int kernel_h_; + int kernel_w_; + int stride_h_; + int stride_w_; + int pad_h_; + int pad_w_; + int dilation_h_; + int dilation_w_; + int deformable_group_; + + float dmcn_im2col_bilinear_cpu(const float *bottom_data, const int data_width, + const int height, const int width, float h, float w) + { + int h_low = std::floor(h); + int w_low = std::floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h - h_low; + float lw = w - w_low; + float hh = 1 - lh, hw = 1 - lw; + + float v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + float v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + float v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + float v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; + } + + void modulated_deformable_im2col_cpu_kernel(const int n, const float *data_im, const float *data_offset, const float *data_mask, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int num_channels, const int deformable_group, + const int height_col, const int width_col, + float *data_col) + { + // launch channels * batch_size * height_col * width_col cores + for(int index=0; index(0); + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + { + //const float map_h = i * dilation_h + offset_h; + //const float map_w = j * dilation_w + offset_w; + //const int cur_height = height - h_in; + //const int cur_width = width - w_in; + //val = dmcn_im2col_bilinear_cpu(data_im_ptr, width, cur_height, cur_width, map_h, map_w); + val = dmcn_im2col_bilinear_cpu(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + // data_col_ptr += batch_size * height_col * width_col; + data_col_ptr += height_col * width_col; + } + } + } + } + + void modulated_deformable_im2col_cpu(const float* data_im, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float* data_col) { + // num_axes should be smaller than block size + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * batch_size * height_col * 
width_col; + modulated_deformable_im2col_cpu_kernel( + num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, + batch_size, channels, deformable_group, height_col, width_col, data_col); + } +}; + +} +} diff --git a/cviruntime/src/common/cpu_function/deformableconv.cpp b/cviruntime/src/common/cpu_function/deformableconv.cpp new file mode 100644 index 000000000..59a7f0ef4 --- /dev/null +++ b/cviruntime/src/common/cpu_function/deformableconv.cpp @@ -0,0 +1,190 @@ +#include +#include +#include +#include // std::accumulate +#include +#include +#include + +#include + +//#include +//#include + +//#include +//#include +//#include + +//extern THCState *state; + +// author: Charles Shang +// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu +// modified from the CUDA version for CPU use by Daniel K. Suhendro +// +// come from https://github.com/CharlesShang/DCNv2 + +namespace cvi { +namespace runtime { + +#define X(i) x[ (i)*incx ] +static void AddDot( int k, float *x, int incx, float *y, float *gamma, + float alpha, float beta) +{ + /* compute gamma := x' * y + gamma with vectors x and y of length n. + Here x starts at location x with increment (stride) incx and y starts at location y and has (implicit) stride of 1. + */ + + int p; + + float _gamma = 0; + for ( p=0; pshape[2]; + kernel_w_ = weight_->shape[3]; + stride_h_ = param.get("stride_h"); + stride_w_ = param.get("stride_w"); + pad_h_ = param.get("padding_t"); + pad_w_ = param.get("padding_l"); + dilation_h_ = param.get("dilation_h"); + dilation_w_ = param.get("dilation_w"); + deformable_group_ = param.get("deform_group"); + + // sanity check + assert((CVI_FMT_FP32 == bottom_->fmt && bottom_->fmt == top_->fmt) && + "ONLY support fp32 now"); +} + +void DeformableConvFunc::run () { + + auto shape = bottom_->shape; + const int batch = shape[0]; + const int channels = shape[1]; + const int height = shape[2]; + const int width = shape[3]; + + shape = weight_->shape; + const int channels_out = shape[0]; + + const int height_out = (height + 2 * pad_h_ - (dilation_h_ * (kernel_h_ - 1) + 1)) / stride_h_ + 1; + const int width_out = (width + 2 * pad_w_ - (dilation_w_ * (kernel_w_ - 1) + 1)) / stride_w_ + 1; + + printf("input shape: (%d, %d, %d, %d)\n", batch, channels, height, width); + printf("output shape: (%d, %d, %d, %d)\n", batch, channels_out, height_out, width_out); + + std::vector ones(height_out*width_out, 1); + std::vector columns(channels * kernel_h_ * kernel_w_ * 1 * height_out * width_out); + auto output_ = top_; + int input_chw = channels * height * width * 1; + int offset_chw = offset_->shape[1] * offset_->shape[2] * offset_->shape[3] * 1; + int mask_chw = mask_->shape[1] * mask_->shape[2] * mask_->shape[3] * 1; + int output_chw = output_->shape[1] * output_->shape[2] * output_->shape[3] * 1; + + for (int b = 0; b < batch; b++) + { + auto input_n = &bottom_->cpu_data()[b * input_chw]; + auto offset_n = &offset_->cpu_data()[b * offset_chw]; + auto mask_n = &mask_->cpu_data()[b * mask_chw]; + auto output_n = &output_->cpu_data()[b * output_chw]; + + // Do Bias first: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + // (N x 1) (1 x M) + long m_ = channels_out; + long n_ = height_out * width_out; + long k_ = 1; + + auto t1 = std::chrono::system_clock::now(); + // C = alpha * A * B + beta * C + THFloatBlas_gemm('t', 'n', n_, m_, 
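/* Editorial note: with k_ == 1, this first GEMM multiplies the
   (channels_out x 1) bias column by a (1 x height_out*width_out) row of
   ones, broadcasting the bias to every output position (beta = 0); the
   second GEMM below then accumulates W x columns on top of it (beta = 1). */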
k_, 1.0f, + ones.data(), k_, + bias_->cpu_data(), k_, 0.0f, + output_n, n_); + + auto t2 = std::chrono::system_clock::now(); + + std::chrono::duration duration = t2 - t1; + std::cout << "step1: " << duration.count() << "(ms)\n"; + t1 = t2; + + modulated_deformable_im2col_cpu(input_n, + offset_n, + mask_n, + 1, channels, height, width, + height_out, width_out, kernel_h_, kernel_w_, + pad_h_, pad_w_, stride_h_, stride_w_, dilation_h_, dilation_w_, + deformable_group_, + columns.data()); + + t2 = std::chrono::system_clock::now(); + duration = t2 - t1; + std::cout << "step2: " << duration.count() << "(ms)\n"; + + t1 = t2; + //(k * m) x (m * n) + // Y = WC + long m = channels_out; + long n = height_out * width_out; + long k = channels * kernel_h_ * kernel_w_; + printf("m: %ld, n: %ld, k: %ld\n", m, n, k); + THFloatBlas_gemm('n', 'n', n, m, k, 1.0f, + columns.data(), n, + weight_->cpu_data(), k, 1.0f, + output_n, n); + + t2 = std::chrono::system_clock::now(); + duration = t2 - t1; + std::cout << "step3: " << duration.count() << "(ms)\n"; + } +} + +} // namespace runtime +} // namespace cvi diff --git a/cviruntime/src/common/cpu_function/deformableconv.hpp b/cviruntime/src/common/cpu_function/deformableconv.hpp new file mode 100644 index 000000000..d54c19705 --- /dev/null +++ b/cviruntime/src/common/cpu_function/deformableconv.hpp @@ -0,0 +1,190 @@ +//#include "dcn_v2_im2col_cpu.h" +#include +#include +#include +#include + + +//#include +//#include + +//#include +//#include +//#include + +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +// modified from the CUDA version for CPU use by Daniel K. Suhendro + +/*#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N) +{ + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +}*/ + +class DeformableConvFunc : public ICpuFunction { + +public: + DeformableConvFunc() {} + + ~DeformableConvFunc(); + void setup(tensor_list_t &inputs, + tensor_list_t &outputs, + OpParam ¶m); + void run(); + + static ICpuFunction *open() { return new DeformableConvFunc(); } + static void close(ICpuFunction *func) { delete func; } + +private: + // in/out + std::shared_ptr bottom_; + std::shared_ptr top_; + std::shared_ptr weight_; + std::shared_ptr bias_; + std::shared_ptr offset_; + std::shared_ptr mask_; + + // parameters + int kernel_h_; + int kernel_w_; + int stride_h_; + int stride_w_; + int pad_h_; + int pad_w_; + int dilation_h_; + int dilation_w_; + int deformable_group_; + + float dmcn_im2col_bilinear_cpu(const float *bottom_data, const int data_width, + const int height, const int width, float h, float w) + { + int h_low = std::floor(h); + int w_low = std::floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + float lh = h - h_low; + float lw = w - w_low; + float hh = 1 - lh, hw = 1 - lw; + + float v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + float v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + float v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + float v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + float val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; + } + + void 
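// Editorial note: the kernel below flattens (channel, batch, h_col, w_col)
// into one loop index; for every kernel tap (i, j) it reads a learned
// (offset_h, offset_w) pair plus a modulation mask from the offset/mask
// tensors, samples the input at the fractional position with
// dmcn_im2col_bilinear_cpu() (out-of-range taps contribute zero), and writes
// mask-scaled samples into the im2col matrix consumed by the GEMMs above.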
modulated_deformable_im2col_cpu_kernel(const int n, const float *data_im, const float *data_offset, const float *data_mask, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, const int num_channels, const int deformable_group, + const int height_col, const int width_col, + float *data_col) + { + // launch channels * batch_size * height_col * width_col cores + for(int index=0; index(0); + const float h_im = h_in + i * dilation_h + offset_h; + const float w_im = w_in + j * dilation_w + offset_w; + //if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) + { + //const float map_h = i * dilation_h + offset_h; + //const float map_w = j * dilation_w + offset_w; + //const int cur_height = height - h_in; + //const int cur_width = width - w_in; + //val = dmcn_im2col_bilinear_cpu(data_im_ptr, width, cur_height, cur_width, map_h, map_w); + val = dmcn_im2col_bilinear_cpu(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + // data_col_ptr += batch_size * height_col * width_col; + data_col_ptr += height_col * width_col; + } + } + } + } + + void modulated_deformable_im2col_cpu(const float* data_im, const float* data_offset, const float* data_mask, + const int batch_size, const int channels, const int height_im, const int width_im, + const int height_col, const int width_col, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int deformable_group, float* data_col) { + // num_axes should be smaller than block size + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * batch_size * height_col * width_col; + modulated_deformable_im2col_cpu_kernel( + num_kernels, data_im, data_offset, data_mask, height_im, width_im, kernel_h, kernel_w, + pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group, + batch_size, channels, deformable_group, height_col, width_col, data_col); + } +}; + +} +} diff --git a/cviruntime/src/common/cpu_function/embedding.cpp b/cviruntime/src/common/cpu_function/embedding.cpp new file mode 100644 index 000000000..2b6e99694 --- /dev/null +++ b/cviruntime/src/common/cpu_function/embedding.cpp @@ -0,0 +1,68 @@ +#include +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +EmbeddingFunc::~EmbeddingFunc() {} + +void EmbeddingFunc::setup(std::vector> &inputs, + std::vector> &outputs, + OpParam ¶m) { + (void)param; + _top = outputs[0]; + _bottoms = inputs; + assert(_bottoms[1]->fmt == _top->fmt && "in/out dtype should be equal"); + _feature_len = _bottoms[1]->shape[1]; + _table_len = _bottoms[1]->count() / _feature_len; + _search_num = _bottoms[0]->count(); +} + +void EmbeddingFunc::run() { + switch (_bottoms[0]->fmt) { + case CVI_FMT_INT16: + lookup(); + break; + case CVI_FMT_UINT16: + lookup(); + break; + case CVI_FMT_INT32: + lookup(); + break; + case CVI_FMT_UINT32: + lookup(); + break; + default: + assert(0 && "input fmt error"); + break; + } +} + +template +void EmbeddingFunc::lookup() { + auto indices = _bottoms[0]->cpu_data(); + auto table = _bottoms[1]->cpu_data(); + auto out = _top->cpu_data(); + + for (int i = 0; i < _search_num; i++) 
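// Each iteration copies one row of the table: with indices = {4, 0, 7} and
// _feature_len = 3, the output is rows 4, 0 and 7 of the embedding table,
// laid out back to back. A minimal sketch of the same loop with explicit
// types (illustrative, not the templated original):
//   for (int i = 0; i < search_num; i++)
//     memcpy(out + i * feature_len,
//            table + indices[i] * feature_len,
//            feature_len * sizeof(float));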
{ + size_t in_offset = indices[i] * _feature_len; + memcpy(out, table + in_offset, _feature_len * sizeof(T2)); + out += _feature_len; + } +} + +template +void EmbeddingFunc::lookup() { + if (_top->fmt == CVI_FMT_INT8) { + lookup(); + } else { + lookup(); + } +} + +} // namespace runtime +} // namespace cvi diff --git a/cviruntime/src/common/cpu_function/embedding.hpp b/cviruntime/src/common/cpu_function/embedding.hpp new file mode 100644 index 000000000..f462145f8 --- /dev/null +++ b/cviruntime/src/common/cpu_function/embedding.hpp @@ -0,0 +1,39 @@ +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +class EmbeddingFunc : public ICpuFunction { + +public: + EmbeddingFunc() {} + + ~EmbeddingFunc(); + void setup(std::vector> &Inputs, + std::vector> &outputs, OpParam ¶m); + void run(); + + static ICpuFunction *open() { return new EmbeddingFunc(); } + static void close(ICpuFunction *func) { delete func; } + +private: + std::vector> _bottoms; + std::shared_ptr _top; + + template + void lookup(); + + template + void lookup(); + + int _search_num; + int _feature_len; + int _table_len; +}; + +} // namespace runtime +} // namespace cvi diff --git a/cviruntime/src/common/cpu_function/frcn_detection.cpp b/cviruntime/src/common/cpu_function/frcn_detection.cpp new file mode 100644 index 000000000..e78d093fa --- /dev/null +++ b/cviruntime/src/common/cpu_function/frcn_detection.cpp @@ -0,0 +1,188 @@ +#include +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +typedef struct { + float x1, y1, x2, y2; +} coord; + +typedef struct { + coord bbox; + int cls; + float score; +} detections; + +static void bbox_transform_inv(const float* boxes, const float* deltas, float* pred, int num, int class_num) +{ + for (int i = 0; i < num; ++i) { + float height = boxes[i*4+3] - boxes[i*4+1] + 1; + float width = boxes[i*4+2] - boxes[i*4+0] + 1; + float ctr_x = boxes[i*4+0] + width * 0.5; + float ctr_y = boxes[i*4+1] + height * 0.5; + + for (int j = 0; j < class_num; ++j) { + float dx = deltas[i*class_num*4 + j*4 + 0]; + float dy = deltas[i*class_num*4 + j*4 + 1]; + float dw = deltas[i*class_num*4 + j*4 + 2]; + float dh = deltas[i*class_num*4 + j*4 + 3]; + + float pred_ctr_x = dx * width + ctr_x; + float pred_ctr_y = dy * height + ctr_y; + float pred_w = std::exp(dw) * width; + float pred_h = std::exp(dh) * height; + + pred[i*class_num*4 + j*4 + 0] = pred_ctr_x - pred_w / 2; + pred[i*class_num*4 + j*4 + 1] = pred_ctr_y - pred_h / 2; + pred[i*class_num*4 + j*4 + 2] = pred_ctr_x + pred_w / 2; + pred[i*class_num*4 + j*4 + 3] = pred_ctr_y + pred_h / 2; + } + } +} + +static void nms(detections *dets, int num, float nms_threshold) +{ + for (int i = 0; i < num; i++) { + if (dets[i].score == 0) { + // erased already + continue; + } + + float s1 = (dets[i].bbox.x2 - dets[i].bbox.x1 + 1) * (dets[i].bbox.y2 - dets[i].bbox.y1 + 1); + for (int j = i + 1; j < num; j++) { + if (dets[j].score == 0) { + // erased already + continue; + } + if (dets[i].cls != dets[j].cls) { + // not the same class + continue; + } + + float s2 = (dets[j].bbox.x2 - dets[j].bbox.x1 + 1) * (dets[j].bbox.y2 - dets[j].bbox.y1 + 1); + + float x1 = std::max(dets[i].bbox.x1, dets[j].bbox.x1); + float y1 = std::max(dets[i].bbox.y1, dets[j].bbox.y1); + float x2 = std::min(dets[i].bbox.x2, dets[j].bbox.x2); + float y2 = std::min(dets[i].bbox.y2, dets[j].bbox.y2); + + float width = x2 - x1; + float height = y2 - y1; + if (width > 0 && height > 0) { + float iou = width * height / (s1 + s2 - 
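// intersection / union, with union = s1 + s2 - intersection. Worked
// example: two 10x10 boxes overlapping in a 5x10 region give
// IOU = 50 / (100 + 100 - 50) = 1/3, so they are suppressed only when
// nms_threshold is below roughly 0.33.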
width * height); + assert(iou <= 1.0f); + if (iou > nms_threshold) { + // overlapped, select one to erase + if (dets[i].score < dets[j].score) { + dets[i].score = 0; + } else { + dets[j].score = 0; + } + } + } + } + } +} + +FrcnDetectionFunc::~FrcnDetectionFunc() {} + +void FrcnDetectionFunc::setup(tensor_list_t &inputs, + tensor_list_t &outputs, + OpParam ¶m) { + nms_threshold = param.get("nms_threshold"); + obj_threshold = param.get("obj_threshold"); + keep_topk = param.get("keep_topk"); + class_num = param.get("class_num"); + + std::sort(inputs.begin(), inputs.end(), + [](const std::shared_ptr &a, const std::shared_ptr &b) { + return a->shape[1] > b->shape[1]; + }); + + _bottoms = inputs; + _tops = outputs; +} + +void FrcnDetectionFunc::run() { + auto top_data = _tops[0]->cpu_data(); + memset(top_data, 0, _tops[0]->size()); + + size_t bottom_count = _bottoms.size(); + assert(bottom_count == 3); + + float *bbox_deltas = (float *)_bottoms[0]->cpu_data(); + float *scores = (float *)_bottoms[1]->cpu_data(); + float *rois = (float *)_bottoms[2]->cpu_data(); + + int batch = _bottoms[2]->shape[0]; + int num = _bottoms[2]->shape[2]; + auto deltas_size = _bottoms[0]->size() / batch; + auto scores_size = _bottoms[1]->size() / batch; + + for (int b = 0; b < batch; ++b) { + auto batch_bbox_deltas = bbox_deltas + b * deltas_size; + auto batch_scores = scores + b * scores_size; + auto batch_rois = rois + _bottoms[2]->offset(b); + std::vector boxes(num * 4, 0); + for (int i = 0; i < num; ++i) { + for (int j = 0; j < 4; ++j) { + boxes[i*4 + j] = batch_rois[i*5 + j + 1]; + } + } + + std::vector pred(num * class_num * 4, 0); + float *pred_data = pred.data(); + std::vector deltas(batch_bbox_deltas, batch_bbox_deltas + deltas_size); + bbox_transform_inv(boxes.data(), deltas.data(), pred_data, num, class_num); + + int det_num = 0; + detections dets[num]; + + for (int i = 0; i < num; ++i) { + for (int j = 1; j < class_num; ++j) { + if (batch_scores[i*class_num + j] > obj_threshold) { + dets[det_num].bbox.x1 = pred[i*class_num*4 + j*4 + 0]; + dets[det_num].bbox.y1 = pred[i*class_num*4 + j*4 + 1]; + dets[det_num].bbox.x2 = pred[i*class_num*4 + j*4 + 2]; + dets[det_num].bbox.y2 = pred[i*class_num*4 + j*4 + 3]; + dets[det_num].cls = j; + dets[det_num].score = batch_scores[i*class_num + j]; + det_num++; + } + } + } + + nms(dets, det_num, nms_threshold); + detections dets_nms[det_num]; + int det_idx = 0; + for (int i = 0; i < det_num; i++) { + if (dets[i].score > 0) { + dets_nms[det_idx] = dets[i]; + det_idx ++; + } + } + + auto tmp_topk = keep_topk; + if (tmp_topk > det_idx) + tmp_topk = det_idx; + + long long count = 0; + auto batch_top_data = top_data + _tops[0]->offset(b); + for(int i = 0; i < tmp_topk; ++i) { + batch_top_data[count++] = dets_nms[i].bbox.x1; + batch_top_data[count++] = dets_nms[i].bbox.y1; + batch_top_data[count++] = dets_nms[i].bbox.x2; + batch_top_data[count++] = dets_nms[i].bbox.y2; + batch_top_data[count++] = dets_nms[i].cls; + batch_top_data[count++] = dets_nms[i].score; + } + } +} + +} +} \ No newline at end of file diff --git a/cviruntime/src/common/cpu_function/frcn_detection.hpp b/cviruntime/src/common/cpu_function/frcn_detection.hpp new file mode 100644 index 000000000..8ce0b1c47 --- /dev/null +++ b/cviruntime/src/common/cpu_function/frcn_detection.hpp @@ -0,0 +1,36 @@ +#include +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +class FrcnDetectionFunc : public ICpuFunction { + +public: + FrcnDetectionFunc() {} + + ~FrcnDetectionFunc(); + 
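// Like the other CPU ops in this directory, the class is exposed through the
// open()/close() factory pair below so the runtime can create and destroy it
// without knowing the concrete type. A hypothetical registration sketch
// (the actual registry API lives elsewhere in the runtime):
//   registerCpuFunction("frcn_detection",
//                       FrcnDetectionFunc::open,
//                       FrcnDetectionFunc::close);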
void setup(tensor_list_t &inputs, + tensor_list_t &outputs, + OpParam ¶m); + void run(); + + static ICpuFunction *open() { return new FrcnDetectionFunc(); } + static void close(ICpuFunction *func) { delete func; } + +private: + tensor_list_t _bottoms; + tensor_list_t _tops; + + float nms_threshold; + float obj_threshold; + int keep_topk; + int class_num; +}; + +} +} diff --git a/cviruntime/src/common/cpu_function/gatherelements_pt.cpp b/cviruntime/src/common/cpu_function/gatherelements_pt.cpp new file mode 100644 index 000000000..b48f89e45 --- /dev/null +++ b/cviruntime/src/common/cpu_function/gatherelements_pt.cpp @@ -0,0 +1,221 @@ +#include +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +GatherElementsPtFunc::~GatherElementsPtFunc() {} + +void GatherElementsPtFunc::setup(std::vector> &inputs, + std::vector> &outputs, + OpParam ¶m) { + _bottoms = inputs; + _tops = outputs; + _axis = param.get("axis"); + +} + +void gather_dim1_0( + float *dst, const float *src, const int *idx, int *shape) { + for (int i = 0; i < shape[0]; ++i) { + *dst = src[*idx]; + ++dst; + ++idx; + } +} + +void gather_dim2_0( + float *dst, const float *src, const int *idx, int *shape, int *org_shape) { + for (int i = 0; i < shape[0]; ++i) { + for (int j = 0; j < shape[1]; ++j) { + *dst = src[*idx * org_shape[1] + j]; + ++dst; + ++idx; + } + } +} + +void gather_dim2_1( + float *dst, const float *src, const int *idx, int *shape, int *org_shape) { + for (int i = 0; i < shape[0]; ++i) { + int idx_i = i * org_shape[1]; + for (int j = 0; j < shape[1]; ++j) { + *dst = src[idx_i + *idx]; + ++dst; + ++idx; + } + } +} + +void gather_dim3_0( + float *dst, const float *src, const int *idx, int *shape, int *org_shape) { + int shape_1_2 = org_shape[1] * org_shape[2]; + for (int i = 0; i < shape[0]; ++i) { + for (int j = 0; j < shape[1]; ++j) { + int idx_j = j * org_shape[2]; + for (int k = 0; k < shape[2]; ++k) { + *dst = src[*idx * shape_1_2 + idx_j + k]; + ++dst; + ++idx; + } + } + } +} + +void gather_dim3_1( + float *dst, const float *src, const int *idx, int *shape, int *org_shape) { + int shape_1_2 = org_shape[1] * org_shape[2]; + for (int i = 0; i < shape[0]; ++i) { + int idx_i = i * shape_1_2; + for (int j = 0; j < shape[1]; ++j) { + for (int k = 0; k < shape[2]; ++k) { + *dst = src[idx_i + *idx * org_shape[2] + k]; + ++dst; + ++idx; + } + } + } +} + +void gather_dim3_2( + float *dst, const float *src, const int *idx, int *shape, int *org_shape) { + int shape_1_2 = org_shape[1] * org_shape[2]; + for (int i = 0; i < shape[0]; ++i) { + int idx_i = i * shape_1_2; + for (int j = 0; j < shape[1]; ++j) { + int idx_j = idx_i + j * org_shape[2]; + for (int k = 0; k < shape[2]; ++k) { + *dst = src[idx_j + *idx]; + ++dst; + ++idx; + } + } + } +} + +void gather_dim4_0( + float *dst, const float *src, const int *idx, int *shape, int *org_shape) { + int shape_1_2_3 = org_shape[1] * org_shape[2] * org_shape[3]; + int shape_2_3 = org_shape[2] * org_shape[3]; + for (int i = 0; i < shape[0]; ++i) { + for (int j = 0; j < shape[1]; ++j) { + int idx_j = j * shape_2_3; + for (int k = 0; k < shape[2]; ++k) { + int idx_k = idx_j + k * org_shape[3]; + for (int g = 0; g < shape[3]; ++g) { + *dst = src[*idx * shape_1_2_3 + idx_k + g]; + ++dst; + ++idx; + } + } + } + } +} + +void gather_dim4_1( + float *dst, const float *src, const int *idx, int *shape, int *org_shape) { + int shape_1_2_3 = org_shape[1] * org_shape[2] * org_shape[3]; + int shape_2_3 = org_shape[2] * org_shape[3]; + for (int i = 0; i < 
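// shape[] here is the 4-D index tensor shape and org_shape the source
// shape; axis 1 means dim 1 of the source is replaced by *idx while dims
// 0, 2 and 3 follow the output position, i.e.
//   dst[i][j][k][g] = src[i][ idx[i][j][k][g] ][k][g]
// matching PyTorch's torch.gather(input, dim=1, index=idx).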
shape[0]; ++i) { + int idx_i = i * shape_1_2_3; + for (int j = 0; j < shape[1]; ++j) { + for (int k = 0; k < shape[2]; ++k) { + int idx_k = k * org_shape[3]; + for (int g = 0; g < shape[3]; ++g) { + *dst = src[idx_i + *idx * shape_2_3 + idx_k + g]; + ++dst; + ++idx; + } + } + } + } +} +void gather_dim4_2( + float *dst, const float *src, const int *idx, int *shape, int *org_shape) { + int shape_1_2_3 = org_shape[1] * org_shape[2] * org_shape[3]; + int shape_2_3 = org_shape[2] * org_shape[3]; + for (int i = 0; i < shape[0]; ++i) { + int idx_i = i * shape_1_2_3; + for (int j = 0; j < shape[1]; ++j) { + int idx_j = idx_i + j * shape_2_3; + for (int k = 0; k < shape[2]; ++k) { + for (int g = 0; g < shape[3]; ++g) { + *dst = src[idx_j + *idx * org_shape[3] + g]; + ++dst; + ++idx; + } + } + } + } +} +void gather_dim4_3( + float *dst, const float *src, const int *idx, int *shape, int *org_shape) { + int shape_1_2_3 = org_shape[1] * org_shape[2] * org_shape[3]; + int shape_2_3 = org_shape[2] * org_shape[3]; + for (int i = 0; i < shape[0]; ++i) { + int idx_i = i * shape_1_2_3; + for (int j = 0; j < shape[1]; ++j) { + int idx_j = idx_i + j * shape_2_3; + for (int k = 0; k < shape[2]; ++k) { + int idx_k = idx_j + k * org_shape[3]; + for (int g = 0; g < shape[3]; ++g) { + *dst = src[idx_k + *idx]; + ++dst; + ++idx; + } + } + } + } +} + +void GatherElementsPtFunc::run() { + auto src_data = _bottoms[0]->cpu_data(); + auto indices_data = _bottoms[1]->cpu_data(); + auto dst_data = _tops[0]->cpu_data(); + + int src_dim = _bottoms[0]->shape.size(); + std::vector input_shape = _bottoms[0]->shape; + std::vector indices_shape = _bottoms[1]->shape; + std::vector output_shape = _tops[0]->shape; + + switch (src_dim) + { + case 1: + gather_dim1_0(dst_data, src_data, indices_data, indices_shape.data()); + break; + case 2: + if (_axis == 0) + gather_dim2_0(dst_data, src_data, indices_data, indices_shape.data(), input_shape.data()); + else if (_axis == 1) + gather_dim2_1(dst_data, src_data, indices_data, indices_shape.data(), input_shape.data()); + break; + case 3: + if (_axis == 0) + gather_dim3_0(dst_data, src_data, indices_data, indices_shape.data(), input_shape.data()); + else if (_axis == 1) + gather_dim3_1(dst_data, src_data, indices_data, indices_shape.data(), input_shape.data()); + else if (_axis == 2) + gather_dim3_2(dst_data, src_data, indices_data, indices_shape.data(), input_shape.data()); + break; + case 4: + if (_axis == 0) + gather_dim4_0(dst_data, src_data, indices_data, indices_shape.data(), input_shape.data()); + else if (_axis == 1) + gather_dim4_1(dst_data, src_data, indices_data, indices_shape.data(), input_shape.data()); + else if (_axis == 2) + gather_dim4_2(dst_data, src_data, indices_data, indices_shape.data(), input_shape.data()); + else if (_axis == 3) + gather_dim4_3(dst_data, src_data, indices_data, indices_shape.data(), input_shape.data()); + break; + default: + printf("error: %s: %d: invalid input dimension: %d. 
\n", + __FILE__, __LINE__, static_cast(input_shape.size())); + exit(-1); + } +} +} +} \ No newline at end of file diff --git a/cviruntime/src/common/cpu_function/gatherelements_pt.hpp b/cviruntime/src/common/cpu_function/gatherelements_pt.hpp new file mode 100644 index 000000000..0c2a9e4e5 --- /dev/null +++ b/cviruntime/src/common/cpu_function/gatherelements_pt.hpp @@ -0,0 +1,30 @@ +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +class GatherElementsPtFunc : public ICpuFunction { + +public: + GatherElementsPtFunc() {} + + ~GatherElementsPtFunc(); + + void setup(std::vector > &inputs, + std::vector > &outputs, + OpParam ¶m); + void run(); + static ICpuFunction *open() { return new GatherElementsPtFunc(); } + static void close(ICpuFunction *func) { delete func; } + +private: + std::vector> _bottoms; + std::vector> _tops; + + int _axis; +}; +} +} \ No newline at end of file diff --git a/cviruntime/src/common/cpu_function/gathernd.cpp b/cviruntime/src/common/cpu_function/gathernd.cpp new file mode 100644 index 000000000..a669b2786 --- /dev/null +++ b/cviruntime/src/common/cpu_function/gathernd.cpp @@ -0,0 +1,83 @@ +#include +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + GatherNDFunc::~GatherNDFunc() {} + + void GatherNDFunc::setup(tensor_list_t &inputs, tensor_list_t &outputs, OpParam ¶m) { + _bottoms = inputs; + _tops = outputs; + batch_dims = param.get("batch_dims"); + indice_dims = param.get("indice_dims"); + } + + uint64_t GatherNDFunc::gather_offset( + std::vector input_shape, std::vector gather_index) { + uint64_t offset = 0; + int dim_size = gather_index.size(); + int gap = 1; + for (int i = dim_size - 1; i >= 0; i--) { + offset += gather_index[i] * gap; + gap *= input_shape[i]; + } + return offset; +} + + void GatherNDFunc::run() { + int batch_dims_size = 1; + auto input_info = _bottoms[0]; + auto indices_info = _bottoms[1]; + auto indices_shape = indices_info->shape; + auto input_shape = input_info->shape; + const float *input = input_info->cpu_data(); + const int *indices = indices_info->cpu_data(); + std::vector indices_v(indices_info->count()); + for (size_t i = 0; i < indices_info->count(); ++i) { + indices_v[i] = indices[i]; + } + float *out = _tops[0]->cpu_data(); + + for (int i = 0; i < batch_dims; ++i) { + batch_dims_size *= indices_shape[i]; + } + + int channel = (indices_info->count() / batch_dims_size) / + indices_shape[indice_dims - 1]; + assert(channel * indices_shape[indice_dims - 1] * batch_dims_size == + (int)indices_info->count()); + std::vector indices_new_shape = { + batch_dims_size, channel, indices_shape[indice_dims - 1]}; + std::vector input_new_shape = {batch_dims_size}; + for (size_t i = batch_dims; i < input_shape.size(); ++i) { + input_new_shape.push_back(input_shape[i]); + } + + uint64_t gather_eltment = + _tops[0]->count() / (indices_new_shape[0] * indices_new_shape[1]); + assert(gather_eltment * indices_new_shape[0] * indices_new_shape[1] == + _tops[0]->count()); + for (int b = 0; b < indices_new_shape[0]; ++b) { + int index1 = b * indices_new_shape[1] * indices_new_shape[2]; + int indices_new_shape2_size = indices_new_shape[2] * sizeof(int); + int index2 = b * indices_new_shape[1]; + int gather_eltment_size = gather_eltment * sizeof(float); + for (int c = 0; c < indices_new_shape[1]; ++c) { + std::vector gather_index(indices_new_shape[2]); + memcpy(gather_index.data(), + (int *)indices_v.data() + + index1 + c * indices_new_shape[2], + indices_new_shape2_size); + 
gather_index.insert(gather_index.begin(), b); + uint64_t offset = gather_offset(input_new_shape, gather_index); + memcpy(out + (index2 + c) * gather_eltment, + input + offset * gather_eltment, gather_eltment_size); + } + } + } +} +} \ No newline at end of file diff --git a/cviruntime/src/common/cpu_function/gathernd.hpp b/cviruntime/src/common/cpu_function/gathernd.hpp new file mode 100644 index 000000000..f9ae8e9f6 --- /dev/null +++ b/cviruntime/src/common/cpu_function/gathernd.hpp @@ -0,0 +1,28 @@ +#include +#include +#include +namespace cvi { +namespace runtime { +class GatherNDFunc : public ICpuFunction { + +public: + GatherNDFunc() {} + ~GatherNDFunc(); + void setup(tensor_list_t &inputs, + tensor_list_t &outputs, + OpParam ¶m); + void run(); + static ICpuFunction *open() { return new GatherNDFunc(); } + static void close(ICpuFunction *func) { delete func; } + +private: + tensor_list_t _bottoms; + tensor_list_t _tops; + int batch_dims; + int indice_dims; + uint64_t gather_offset(std::vector input_shape, + std::vector gather_index); +}; + +} +} \ No newline at end of file diff --git a/cviruntime/src/common/cpu_function/grid_sampler.cpp b/cviruntime/src/common/cpu_function/grid_sampler.cpp new file mode 100644 index 000000000..74e851c72 --- /dev/null +++ b/cviruntime/src/common/cpu_function/grid_sampler.cpp @@ -0,0 +1,181 @@ +#include +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + void GridSamplerFunc::setup(tensor_list_t &inputs, tensor_list_t &outputs, OpParam ¶m) { + _bottoms = inputs; + _tops = outputs; + mode = param.get("mode"); + padding_mode = param.get("padding_mode"); + align_corners = param.get("align_corners"); + } + +template +scalar_t GridSamplerFunc::clip_coordinates(scalar_t in, int64_t clip_limit) { + return std::min(static_cast(clip_limit - 1), + std::max(in, static_cast(0))); +} + +// Reflects coordinates until they fall between low and high (inclusive). +// The bounds are passed as twice their value so that half-integer values +// can be represented as ints. +template +scalar_t GridSamplerFunc::reflect_coordinates(scalar_t in, int64_t twice_low, + int64_t twice_high) { + if (twice_low == twice_high) { + return static_cast(0); + } + scalar_t min = static_cast(twice_low) / 2; + scalar_t span = static_cast(twice_high - twice_low) / 2; + in = std::fabs(in - min); + // `fmod` returns same sign as `in`, which is positive after the `fabs` above. 
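// Worked example, assuming the align_corners case with size = 5
// (twice_low = 0, twice_high = 8): min = 0, span = 4, so in = 9.5 folds to
// extra = fmod(9.5, 4) = 1.5 with flips = 2 (even), giving 1.5; the
// coordinate bounces between 0 and 4 until it lands inside the range.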
+ scalar_t extra = std::fmod(in, span); + int flips = static_cast(std::floor(in / span)); + if (flips % 2 == 0) { + return extra + min; + } else { + return span - extra + min; + } +} + +float GridSamplerFunc::computeIndex(float coord, int size, int paddingMode, + bool alignCorners) { + float res = 0.f; + + // Unnormalize coordinate + // From [-1, 1] to pixel index + if (alignCorners) + res = ((coord + 1.f) * .5f) * (size - 1); + else + res = ((coord + 1.f) * size - 1.f) * .5f; + + switch (paddingMode) { + case GridSamplerZeros: + break; + case GridSamplerBorder: + res = clip_coordinates(res, size); + break; + case GridSamplerReflection: + if (alignCorners) { + res = reflect_coordinates(res, 0, 2 * (size - 1)); + } else { + res = reflect_coordinates(res, -1, 2 * size - 1); + } + res = clip_coordinates(res, size); + break; + default: + assert(0); + } + return res; +} + +void GridSamplerFunc::run() { + auto input_tensor = _bottoms[0]; + auto grid_tensor = _bottoms[1]; + auto output_tensor = _tops[0]; + std::vector input_shapes = input_tensor->shape; + std::vector grid_shape = grid_tensor->shape; + const float *input_ptr = input_tensor->cpu_data(); + const float *grid_ptr = grid_tensor->cpu_data(); + float *output_ptr = output_tensor->cpu_data(); + assert((grid_shape.size() == 4 && grid_shape[3] == 2) || + (grid_shape.size() == 5 && grid_shape[4] == 3)); + const int N = input_shapes[0]; + const int C = input_shapes[1]; + const int IH = input_shapes[2]; + const int IW = input_shapes[3]; + const int OH = grid_shape[1]; + const int OW = grid_shape[2]; + + int IHW = IH * IW; + int OHW = OH * OW; + int OHW2 = 2 * OHW; + int ICHW = C * IHW; + int OCHW = C * OHW; + if (mode == GridSamplerBilinear) { + for (int n = 0; n < N; ++n) { + const float *input = input_ptr + n * ICHW; + const float *grid = grid_ptr + n * OHW2; + float *output = output_ptr + n * OCHW; + for (int h = 0; h < OH; ++h) { + for (int w = 0; w < OW; ++w) { + auto fx = + computeIndex(*grid, IW, padding_mode, align_corners); + ++grid; + auto fy = + computeIndex(*grid, IH, padding_mode, align_corners); + ++grid; + int x = INT(std::floor(fx)); + int y = INT(std::floor(fy)); + float dx = fx - x; + float dy = fy - y; + float tx = 1.f - dx; + float ty = 1.f - dy; + float txty = tx * ty, dxty = dx * ty, txdy = tx * dy, dxdy = dx * dy; + bool yBound_0 = y >= 0 && y < IH; + bool yBound_1 = y + 1 >= 0 && y + 1 < IH; + bool xBound_0 = x >= 0 && x < IW; + bool xBound_1 = x + 1 >= 0 && x + 1 < IW; + const float *iiter = input + y * IW + x; + float *oiter = output; + for (int c = 0; c < C; ++c) { + *oiter = 0.f; + if (yBound_0) { + if (xBound_0) + *oiter += iiter[0] * txty; + if (xBound_1) + *oiter += iiter[1] * dxty; + } + if (yBound_1) { + if (xBound_0) + *oiter += iiter[IW] * txdy; + if (xBound_1) + *oiter += iiter[IW + 1] * dxdy; + } + iiter += IHW; + oiter += OHW; + } + ++output; + } + } + } + } else if (mode == GridSamplerNearest) { + for (int n = 0; n < N; ++n) { + const float *input = input_ptr + n * ICHW; + const float *grid = grid_ptr + n * OHW2; + float *output = output_ptr + n * OCHW; + for (int h = 0; h < OH; ++h) { + for (int w = 0; w < OW; ++w) { + auto fx = + computeIndex(*grid, IW, padding_mode, align_corners); + ++grid; + auto fy = + computeIndex(*grid, IH, padding_mode, align_corners); + ++grid; + int x = INT(std::round(fx)); + int y = INT(std::round(fy)); + const float *iiter = input + y * IW + x; + float *oiter = output; + for (int c = 0; c < C; ++c) { + *oiter = y >= 0 && y < IH && x >= 0 && x < IW ? 
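// GridSamplerZeros padding in the nearest path: anything that rounds
// outside the input (x < 0, x >= IW, y < 0 or y >= IH) reads as 0.f, so
// the source pointer below is only dereferenced when in bounds: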
*iiter : 0.f; + iiter += IHW; + oiter += OHW; + } + ++output; + } + } + } + } else { + assert(0); + } + output_tensor->shape = {N, C, OH, OW}; + return; +} + +} +} \ No newline at end of file diff --git a/cviruntime/src/common/cpu_function/grid_sampler.hpp b/cviruntime/src/common/cpu_function/grid_sampler.hpp new file mode 100644 index 000000000..6776b30bb --- /dev/null +++ b/cviruntime/src/common/cpu_function/grid_sampler.hpp @@ -0,0 +1,45 @@ +#include +#include +#include +namespace cvi { +namespace runtime { +#define INT(val) (static_cast(val)) +enum GridSamplerInterpolation { + GridSamplerBilinear = 0, + GridSamplerNearest = 1 +}; +enum GridSamplerPaddingMode { + GridSamplerZeros = 0, + GridSamplerBorder = 1, + GridSamplerReflection = 2 +}; +class GridSamplerFunc : public ICpuFunction { +public: + GridSamplerFunc() {}; + ~GridSamplerFunc() {}; + void setup(tensor_list_t &inputs, + tensor_list_t &outputs, + OpParam ¶m); + void run(); + static ICpuFunction *open() { return new GridSamplerFunc(); } + static void close(ICpuFunction *func) { delete func; } + +private: + tensor_list_t _bottoms; + tensor_list_t _tops; + int mode; + int padding_mode; + bool align_corners; + + float computeIndex(float coord, int size, int paddingMode, bool alignCorners); + + template + scalar_t reflect_coordinates(scalar_t in, int64_t twice_low, int64_t twice_high); + + template + scalar_t clip_coordinates(scalar_t in, int64_t clip_limit); + +}; + +} +} \ No newline at end of file diff --git a/cviruntime/src/common/cpu_function/instancenorm.cpp b/cviruntime/src/common/cpu_function/instancenorm.cpp new file mode 100644 index 000000000..1de61dcdf --- /dev/null +++ b/cviruntime/src/common/cpu_function/instancenorm.cpp @@ -0,0 +1,119 @@ +#include +#include +#include +#include // std::accumulate +#include +#include +#include + +namespace cvi { +namespace runtime { + +// Y = (X-mean(X))/(sqrt(var(X)+variance_epsilon)) +static int my_bn(float *input, float *mean, float *variance, float *scale, float variance_epsilon, + float *output, int n, int c, int h, int w, float *bias) { + float scale_factor = 1 / scale[0]; + for (int i = 0; i < c; ++i) { + mean[i] = mean[i] * scale_factor; + variance[i] = variance[i] * scale_factor; + } + for (int ni = 0; ni < n; ++ni) { + for (int ci = 0; ci < c; ++ci) { + float b = 0; + if (bias) { + b = bias[ci]; + } + + for (int i = 0; i < h * w; ++i) { + auto x = input[ni * c * h * w + ci * h * w + i] - mean[ci]; + auto d = sqrt(variance[ci] + variance_epsilon); + output[ni * c * h * w + ci * h * w + i] = x / d + b; + if (fabs(variance[ci]) <= variance_epsilon && fabs(mean[ci]) <= 1e-8 + && fabs(input[ni * c * h * w + ci * h * w + i]) >= 1.0e-4 + && fabs(output[ni * c * h * w + ci * h * w + i]) >= 1.0e-2) { + //assert(0); + } + } + } + } + for (int i = 0; i < c; ++i) { + mean[i] = mean[i] * scale[0]; + variance[i] = variance[i] * scale[0]; + } + return 0; +} + +static int my_in(float *input, float* gamma_value, + float* beta_value, + float *output, float variance_epsilon, int n, int c, int h, int w) { + + std::vector mean(c); + std::vector variance(c); + int hw = h * w; + + for (int ni = 0; ni < n; ++ni) { + for (int ci = 0; ci < c; ++ci) { + //int channel_shift = ni * c * h * w + ci * h * w; + //auto start = input + channel_shift * sizeof(float); + //mean[ci] = std::accumulate(start, start + hw, static_cast(0.0)) / hw; + { + // TODO: leverage std::accumulate + float m = 0; + for (int i = 0; i < hw; i++) { + m += input[ni * c * h * w + ci * h * w + i]; + } + + mean[ci] = m / hw; + } + + 
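// Biased (population) variance over each (n, c) plane, i.e.
// Var[X] = E[(X - E[X])^2] with divisor hw rather than hw - 1; my_bn()
// then applies Y = (X - mean) / sqrt(var + eps) + bias. Tiny example:
// the plane {1, 3} has mean 2 and var 1, so with eps ~ 0 and bias = 0 it
// normalizes to {-1, 1}.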
float var = 0; + + for (int i = 0; i < hw; ++i) { + var += pow((input)[ni * c * h * w + ci * h * w + i] - mean[ci], 2); + } + var = (var) / hw; + variance[ci] = var; + } + } + + return my_bn(input, mean.data(), variance.data(), + gamma_value, variance_epsilon, output, n, c, h, w, + beta_value); +} + +InstanceNormFunc::~InstanceNormFunc() {} + +void InstanceNormFunc::setup(tensor_list_t &inputs, + tensor_list_t &outputs, + OpParam ¶m) { + + top_ = outputs[0]; + bottom_ = inputs[0]; + scale_ = inputs[1]; + bias_ = inputs[2]; + + auto shape = bottom_->shape; + assert((CVI_FMT_FP32 == bottom_->fmt && bottom_->fmt == top_->fmt) && + "ONLY support fp32 now"); + + variance_epsilon_ = param.get("variance_epsilon"); + num_ = shape[0]; + channels_ = shape[1]; + h_ = shape[2]; + w_ = shape[3]; +} + +void InstanceNormFunc::run() { + auto input = bottom_->cpu_data(); + auto output = top_->cpu_data(); + auto scale = scale_->cpu_data(); + auto bias = bias_->cpu_data(); + + int in = num_; + int ic = channels_; + + my_in(input, scale, bias, output, variance_epsilon_, in, ic, h_, w_); +} + +} // namespace runtime +} // namespace cvi diff --git a/cviruntime/src/common/cpu_function/instancenorm.hpp b/cviruntime/src/common/cpu_function/instancenorm.hpp new file mode 100644 index 000000000..aeb8744ad --- /dev/null +++ b/cviruntime/src/common/cpu_function/instancenorm.hpp @@ -0,0 +1,38 @@ +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +class InstanceNormFunc : public ICpuFunction { + +public: + InstanceNormFunc() {} + + ~InstanceNormFunc(); + void setup(tensor_list_t &inputs, + tensor_list_t &outputs, + OpParam ¶m); + void run(); + + static ICpuFunction *open() { return new InstanceNormFunc(); } + static void close(ICpuFunction *func) { delete func; } + +private: + std::shared_ptr bottom_; + std::shared_ptr top_; + std::shared_ptr scale_; + std::shared_ptr bias_; + + float variance_epsilon_; + int num_; + int channels_; + int h_; + int w_; +}; + +} +} diff --git a/cviruntime/src/common/cpu_function/interpolation.cpp b/cviruntime/src/common/cpu_function/interpolation.cpp new file mode 100644 index 000000000..dff31dbd1 --- /dev/null +++ b/cviruntime/src/common/cpu_function/interpolation.cpp @@ -0,0 +1,355 @@ +#include +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +static float coordinate_transform( + float x_resized, float x_scale, float length_resized, bool pytorch) { + // please refer NativeCpuImplementation.cpp for more details + if (pytorch) { + return length_resized > 1 ? (x_resized + 0.5f) / x_scale - 0.5f : 0.0f; + } else { + return (x_resized + 0.5f) / x_scale - 0.5f; + } +} + +template +void upsampleBilinear(int64_t batch_size, int64_t num_channels, + int64_t input_height, int64_t input_width, + float height_scale, float width_scale, const T *Xdata, + T *Ydata, bool pytorch) { + int64_t output_width = static_cast(input_width * width_scale); + int64_t output_height = static_cast(input_height * height_scale); + + for (int64_t n = 0; n < batch_size; ++n) { + for (int64_t c = 0; c < num_channels; ++c) { + for (int64_t y = 0; y < output_height; ++y) { + float in_y = + std::min(y / height_scale, static_cast(input_height - 1)); + in_y = height_scale == 1 ? 
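// coordinate_transform() maps an output index back into input space:
// half_pixel uses (y + 0.5) / scale - 0.5, and the pytorch variant
// additionally returns 0 when the resized length is 1. Example with
// scale 2: output row 3 samples input coordinate (3 + 0.5) / 2 - 0.5 = 1.25.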
static_cast(y) + : coordinate_transform(static_cast(y), height_scale, + static_cast(output_height), pytorch); + in_y = std::max(0.0f, std::min(in_y, static_cast(input_height - 1))); + + const int64_t in_y1 = + std::min(static_cast(in_y), input_height - 1); + const int64_t in_y2 = std::min(in_y1 + 1, input_height - 1); + float dy1 = fabs(in_y - in_y1); + float dy2 = fabs(in_y - in_y2); + if (in_y1 == in_y2) { + dy1 = 0.5f; + dy2 = 0.5f; + } + + const int64_t input_width_mul_y1 = input_width * in_y1; + const int64_t input_width_mul_y2 = input_width * in_y2; + + for (int64_t x = 0; x < output_width; ++x) { + float in_x = + std::min(x / width_scale, static_cast(input_width - 1)); + in_x = width_scale == 1 ? static_cast(x) + : coordinate_transform(static_cast(x), + width_scale, static_cast(output_width), pytorch); + in_x = std::max(0.0f, std::min(in_x, static_cast(input_width - 1))); + + const int64_t in_x1 = + std::min(static_cast(in_x), input_width - 1); + const int64_t in_x2 = std::min(in_x1 + 1, input_width - 1); + + float dx1 = std::abs(in_x - in_x1); + float dx2 = std::abs(in_x - in_x2); + if (in_x1 == in_x2) { + dx1 = 0.5f; + dx2 = 0.5f; + } + + T X11 = Xdata[input_width_mul_y1 + in_x1]; + T X21 = Xdata[input_width_mul_y1 + in_x2]; + T X12 = Xdata[input_width_mul_y2 + in_x1]; + T X22 = Xdata[input_width_mul_y2 + in_x2]; + + Ydata[output_width * y + x] = + static_cast(dx2 * dy2 * X11 + dx1 * dy2 * X21 + + dx2 * dy1 * X12 + dx1 * dy1 * X22); + } + } + Xdata += input_height * input_width; + Ydata += output_width * output_height; + } + } +} + +template +void InterpolationFunc::interp_nearest_inner(int n, int c, int ih, int iw, + int oh, int ow, bool half_pixel) { + auto input = _bottom->cpu_data(); + auto output = _top->cpu_data(); + int nc = n * c; + float scale_h = ((float)ih) / oh; + float scale_w = ((float)iw) / ow; + if (half_pixel) { + for (int i = 0; i < nc; i++) { + for (int h = 0; h < oh; h++) { + for (int w = 0; w < ow; w++) { + int o_index = i * oh * ow + h * ow + w; + int h_resized = (int)std::ceil((h + 0.5) * scale_h - 1.0); + int w_resized = (int)std::ceil((w + 0.5) * scale_w - 1.0); + int i_index = i * ih * iw + h_resized * iw + w_resized; + output[o_index] = input[i_index]; + } + } + } + } else { + for (int i = 0; i < nc; i++) { + for (int h = 0; h < oh; h++) { + for (int w = 0; w < ow; w++) { + int o_index = i * oh * ow + h * ow + w; + int h_resized = (int)(h * scale_h); + int w_resized = (int)(w * scale_w); + int i_index = i * ih * iw + h_resized * iw + w_resized; + output[o_index] = input[i_index]; + } + } + } + } +} + +void InterpolationFunc::interp_nearest() { + bool half_pixel = false; + if (coordinate_transformation_mode == "nearest_half_pixel") { + half_pixel = true; + } + int n = num_; + int c = channels_; + switch (_bottom->fmt) { + case CVI_FMT_BF16: + interp_nearest_inner(n, c, height_in_, width_in_, height_out_, + width_out_, half_pixel); + break; + case CVI_FMT_INT8: + interp_nearest_inner(n, c, height_in_, width_in_, height_out_, + width_out_, half_pixel); + break; + default: + interp_nearest_inner(n, c, height_in_, width_in_, height_out_, + width_out_, half_pixel); + break; + } +} + +static inline float value(float *input, int w, int ih, int iw) { + return input[ih * w + iw]; +} + +static float value(float *input, int w, float fh, float fw) { + int h0 = std::floor(fh); + int h1 = std::ceil(fh); + int w0 = std::floor(fw); + int w1 = std::ceil(fw); + if (h0 == fh && w0 == fw) { + return value(input, w, h0, w0); + } + if (h0 == fh) { + return value(input, w, h0, 
w0) * (w1 - fw) + + value(input, w, h0, w1) * (fw - w0); + } + if (w0 == fw) { + return value(input, w, h0, w0) * (h1 - fh) + + value(input, w, h1, w0) * (fh - h0); + } + float scale0 = (w1 - fw) * (h1 - fh); + float scale1 = (fw - w0) * (h1 - fh); + float scale2 = (w1 - fw) * (fh - h0); + float scale3 = (fw - w0) * (fh - h0); + return value(input, w, h0, w0) * scale0 + value(input, w, h0, w1) * scale1 + + value(input, w, h1, w0) * scale2 + value(input, w, h1, w1) * scale3; +} + +void interp_asymmetric(float *input, float *output, int n, int c, int ih, + int iw, int oh, int ow) { + int nc = n * c; + float scale_h = (float)ih / oh; + float scale_w = (float)iw / ow; + for (int i = 0; i < nc; i++) { + for (int h = 0; h < oh; h++) { + for (int w = 0; w < ow; w++) { + int o_index = i * oh * ow + h * ow + w; + float fh = std::min(h * scale_h, (float)(ih - 1)); + float fw = std::min(w * scale_w, (float)(iw - 1)); + output[o_index] = value(input + i * ih * iw, iw, fh, fw); + } + } + } +} + +// copy from caffe_cpu_interp2 +void my_interp(const int channels, + const float *data1, const int x1, const int y1, const int height1, const int width1, const int Height1, const int Width1, + float *data2, const int x2, const int y2, const int height2, const int width2, const int Height2, const int Width2) { + bool packed = false; + + assert(x1 >= 0 && y1 >= 0 && height1 > 0 && width1 > 0 && x2 >= 0 && y2 >= 0 && height2 > 0 && width2 > 0); + assert(Width1 >= width1 + x1 && Height1 >= height1 + y1 && Width2 >= width2 + x2 && Height2 >= height2 + y2); + + // special case: just copy + if (height1 == height2 && width1 == width2) { + for (int h2 = 0; h2 < height2; ++h2) { + const int h1 = h2; + for (int w2 = 0; w2 < width2; ++w2) { + const int w1 = w2; + if (packed) { + const float* pos1 = &data1[channels * ((y1 + h1) * Width1 + (x1 + w1))]; + float* pos2 = &data2[channels * ((y2 + h2) * Width2 + (x2 + w2))]; + for (int c = 0; c < channels; ++c) { + pos2[0] = pos1[0]; + pos1++; + pos2++; + } + } + else { + const float* pos1 = &data1[(y1 + h1) * Width1 + (x1 + w1)]; + float* pos2 = &data2[(y2 + h2) * Width2 + (x2 + w2)]; + for (int c = 0; c < channels; ++c) { + pos2[0] = pos1[0]; + pos1 += Width1 * Height1; + pos2 += Width2 * Height2; + } + } + } + } + return; + } + const float rheight = (height2 > 1) ? static_cast(height1 - 1) / (height2 - 1) : 0.f; + const float rwidth = (width2 > 1) ? static_cast(width1 - 1) / (width2 - 1) : 0.f; + for (int h2 = 0; h2 < height2; ++h2) { + const float h1r = rheight * h2; + const int h1 = h1r; + const int h1p = (h1 < height1 - 1) ? 1 : 0; + const float h1lambda = h1r - h1; + const float h0lambda = float(1.) - h1lambda; + for (int w2 = 0; w2 < width2; ++w2) { + const float w1r = rwidth * w2; + const int w1 = w1r; + const int w1p = (w1 < width1 - 1) ? 1 : 0; + const float w1lambda = w1r - w1; + const float w0lambda = float(1.) 
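// align_corners scaling: rheight = (height1 - 1) / (height2 - 1), so both
// ends of the axis map exactly onto input corners. For height1 = 4 and
// height2 = 7: rheight = 0.5, and output row 5 reads h1r = 2.5, blending
// input rows 2 and 3 with weights h0lambda = 0.5 and h1lambda = 0.5.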
- w1lambda; + if (packed) { + const float* pos1 = &data1[channels * ((y1 + h1) * Width1 + (x1 + w1))]; + float* pos2 = &data2[channels * ((y2 + h2) * Width2 + (x2 + w2))]; + for (int c = 0; c < channels; ++c) { + pos2[0] = + h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[channels * w1p]) + + h1lambda * (w0lambda * pos1[channels * h1p * Width1] + w1lambda * pos1[channels * (h1p * Width1 + w1p)]); + pos1++; + pos2++; + } + } + else { + const float* pos1 = &data1[(y1 + h1) * Width1 + (x1 + w1)]; + float* pos2 = &data2[(y2 + h2) * Width2 + (x2 + w2)]; + for (int c = 0; c < channels; ++c) { + pos2[0] = + h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p]) + + h1lambda * (w0lambda * pos1[h1p * Width1] + w1lambda * pos1[h1p * Width1 + w1p]); + pos1 += Width1 * Height1; + pos2 += Width2 * Height2; + } + } + } + } +} +InterpolationFunc::~InterpolationFunc() {} + +void InterpolationFunc::setup(tensor_list_t &inputs, + tensor_list_t &outputs, + OpParam ¶m) { + + _top = outputs[0]; + _bottom = inputs[0]; + auto shape = _bottom->shape; + + shrink_factor = param.get("shrink_factor"); + zoom_factor = param.get("zoom_factor"); + pad_beg_ = param.get("pad_beg"); + pad_end_ = param.get("pad_end"); + height = param.get("height"); + width = param.get("width"); + coordinate_transformation_mode = + param.get("coordinate_transformation_mode"); + num_ = shape[0]; + channels_ = shape[1]; + height_in_ = shape[2]; + width_in_ = shape[3]; + + if (coordinate_transformation_mode != "align_corners") { + if (height && width) { + height_out_ = height; + width_out_ = width; + } else { + std::vector output_shape = _top->shape; + height_out_ = output_shape[2]; + width_out_ = output_shape[3]; + } + } else { + height_in_eff_ = height_in_ + pad_beg_ + pad_end_; + width_in_eff_ = width_in_ + pad_beg_ + pad_end_; + + // duplicate from TpuInterpreter + if (shrink_factor && !zoom_factor) { + assert(shrink_factor >= 1 && "Shrink factor must be positive"); + height_out_ = (height_in_eff_ - 1) / shrink_factor + 1; + width_out_ = (width_in_eff_ - 1) / shrink_factor + 1; + } else if (zoom_factor && !shrink_factor) { + assert(zoom_factor >= 1 && "Zoom factor must be positive"); + height_out_ = height_in_eff_ + (height_in_eff_ - 1) * (zoom_factor - 1); + width_out_ = width_in_eff_ + (width_in_eff_ - 1) * (zoom_factor - 1); + } else if (height && width) { + height_out_ = height; + width_out_ = width; + } else if (zoom_factor && shrink_factor) { + assert(shrink_factor >= 1 && "Shrink factor must be positive"); + assert(zoom_factor >= 1 && "Zoom factor must be positive"); + + height_out_ = (height_in_eff_ - 1) / shrink_factor + 1; + width_out_ = (width_in_eff_ - 1) / shrink_factor + 1; + height_out_ = height_out_ + (height_out_ - 1) * (zoom_factor - 1); + width_out_ = width_out_ + (width_out_ - 1) * (zoom_factor - 1); + } + } +} + +void InterpolationFunc::run() { + auto input = _bottom->cpu_data(); + auto output = _top->cpu_data(); + + int in = num_; + int ic = channels_; + if (coordinate_transformation_mode == "half_pixel") { + float height_scale = (float)height_out_ / (float)height_in_; + float width_scale = (float)width_out_ / (float)width_in_; + upsampleBilinear(num_, channels_, height_in_, width_in_, + height_scale, width_scale, input, output, false); + } else if (coordinate_transformation_mode == "pytorch_half_pixel") { + float height_scale = (float)height_out_ / (float)height_in_; + float width_scale = (float)width_out_ / (float)width_in_; + upsampleBilinear(num_, channels_, height_in_, width_in_, + height_scale, width_scale, 
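// same bilinear path as the "half_pixel" branch above, but with
// pytorch = true so that a 1-pixel output axis samples coordinate 0
// instead of a negative position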
input, output, true); + } else if (coordinate_transformation_mode.compare(0, 7, "nearest") == 0) { + interp_nearest(); + } else if (coordinate_transformation_mode == "asymmetric") { + interp_asymmetric(input, output, in, ic, height_in_, width_in_, height_out_, + width_out_); + } else { + my_interp(in * ic, input, -pad_beg_, -pad_beg_, height_in_eff_, + width_in_eff_, height_in_, width_in_, output, 0, 0, height_out_, + width_out_, height_out_, width_out_); + } +} + +} // namespace runtime +} // namespace cvi diff --git a/cviruntime/src/common/cpu_function/interpolation.hpp b/cviruntime/src/common/cpu_function/interpolation.hpp new file mode 100644 index 000000000..1ac290c1f --- /dev/null +++ b/cviruntime/src/common/cpu_function/interpolation.hpp @@ -0,0 +1,50 @@ +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +class InterpolationFunc : public ICpuFunction { + +public: + InterpolationFunc() {} + + ~InterpolationFunc(); + void setup(tensor_list_t &inputs, tensor_list_t &outputs, OpParam ¶m); + void run(); + + static ICpuFunction *open() { return new InterpolationFunc(); } + static void close(ICpuFunction *func) { delete func; } + +protected: + template + void interp_nearest_inner(int n, int c, int ih, int iw, int oh, int ow, bool half_pixel); + void interp_nearest(); + +private: + std::shared_ptr _bottom; + std::shared_ptr _top; + + int shrink_factor; + int zoom_factor; + int pad_end_; + int pad_beg_; + int height; + int width; + std::string coordinate_transformation_mode; + + int height_in_eff_; + int width_in_eff_; + int height_out_ = -1; + int width_out_ = -1; + int height_in_; + int width_in_; + int num_; + int channels_; +}; + +} // namespace runtime +} // namespace cvi diff --git a/cviruntime/src/common/cpu_function/pixelshuffle.cpp b/cviruntime/src/common/cpu_function/pixelshuffle.cpp new file mode 100644 index 000000000..caa74bebb --- /dev/null +++ b/cviruntime/src/common/cpu_function/pixelshuffle.cpp @@ -0,0 +1,58 @@ +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +void PixelShuffleFunc::setup(tensor_list_t &inputs, + tensor_list_t &outputs, + OpParam ¶m) { + (void)param; + _bottom = inputs[0]; + _top = outputs[0]; + upscale_factor = param.get("upscale_factor"); + mode = param.get("mode"); +} + +void PixelShuffleFunc::run() { + int batch_size = _bottom->shape[0]; + int in_channel = _bottom->shape[1]; + int in_height = _bottom->shape[2]; + int in_width = _bottom->shape[3]; + int out_channel = _top->shape[1]; + int out_height = _top->shape[2]; + int out_width = _top->shape[3]; + int i_index = 0, o_index = 0, new_c = 0, new_h = 0, new_w = 0, + r = upscale_factor; + + auto bottom_data = _bottom->cpu_data(); + auto top_data = _top->cpu_data(); + if (mode == "DCR"){ + for (int n = 0; n < batch_size; n++) { + for (int c = 0; c < in_channel; c++) { + for (int h = 0; h < in_height; h++) { + for (int w = 0; w < in_width; w++) { + new_c = c % out_channel; + new_h = h * r + static_cast(floor((c / out_channel) / r)); + new_w = w * r + ((c / out_channel) % r); + o_index = n * (out_channel * out_height * out_width) + + new_c * (out_height * out_width) + + new_h * out_width + + new_w; + top_data[o_index] = bottom_data[i_index]; + i_index++; + } + } + } + } + }else{ + assert(0 && "only DCR mode use cpu"); + } + +} + +} // namespace runtime +} // namespace cvi \ No newline at end of file diff --git a/cviruntime/src/common/cpu_function/pixelshuffle.hpp b/cviruntime/src/common/cpu_function/pixelshuffle.hpp new file 
mode 100644 index 000000000..d427d9ede --- /dev/null +++ b/cviruntime/src/common/cpu_function/pixelshuffle.hpp @@ -0,0 +1,26 @@ +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +class PixelShuffleFunc : public ICpuFunction { +public: + void setup(std::vector > &inputs, + std::vector > &outputs, + OpParam ¶m); + void run(); + static ICpuFunction *open() { return new PixelShuffleFunc(); } + static void close(ICpuFunction *func) { delete func; } + +private: + std::shared_ptr _bottom; + std::shared_ptr _top; + int upscale_factor; + std::string mode; +}; + +} +} \ No newline at end of file diff --git a/cviruntime/src/common/cpu_function/preprocess.cpp b/cviruntime/src/common/cpu_function/preprocess.cpp new file mode 100644 index 000000000..0ff923b91 --- /dev/null +++ b/cviruntime/src/common/cpu_function/preprocess.cpp @@ -0,0 +1,59 @@ +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +void PreprocessFunc::setup(tensor_list_t &inputs, + tensor_list_t &outputs, + OpParam ¶m) { + _bottom = inputs[0]; + _top = outputs[0]; + _scale = param.get("scale"); + _raw_scale = param.get("raw_scale"); + _mean = param.get>("mean"); + _color_order = param.get>("color_order"); +} + +void PreprocessFunc::run() { + int n = _bottom->shape[0]; + int c = _bottom->shape[1]; + int csz = _bottom->shape[2] * _bottom->shape[3]; + int isz = c * csz; + int count = n * isz; + auto bottom_data = _bottom->cpu_data(); + auto top_data = _top->cpu_data(); + float *p = bottom_data; + float *q = top_data; + if (_color_order.size()) { + for (int i = 0; i < n; i++) { + for (int j = 0; j < c; j++) { + memcpy(q + _color_order[j] * csz, + p + j * csz, csz * sizeof(float)); + } + p += isz; + q += isz; + } + p = q = top_data; + } + + for (int i = 0; i < count; i++) { + float val = *p++; + if (_raw_scale != 0) { + val *= _raw_scale; + } + if (_mean.size()) { + val -= _mean[(i / csz) % c]; + } + if (_scale != 1.0f) { + val *= _scale; + } + *q++ = val; + } +} + +} // namespace runtime +} // namespace cvi \ No newline at end of file diff --git a/cviruntime/src/common/cpu_function/preprocess.hpp b/cviruntime/src/common/cpu_function/preprocess.hpp new file mode 100644 index 000000000..239b13d19 --- /dev/null +++ b/cviruntime/src/common/cpu_function/preprocess.hpp @@ -0,0 +1,28 @@ +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +class PreprocessFunc : public ICpuFunction { +public: + void setup(std::vector > &inputs, + std::vector > &outputs, + OpParam ¶m); + void run(); + static ICpuFunction *open() { return new PreprocessFunc(); } + static void close(ICpuFunction *func) { delete func; } + +private: + std::shared_ptr _bottom; + std::shared_ptr _top; + std::vector _color_order; + std::vector _mean; + float _scale = 1.0f; + float _raw_scale = 1.0f; +}; + +} +} \ No newline at end of file diff --git a/cviruntime/src/common/cpu_function/proposal.cpp b/cviruntime/src/common/cpu_function/proposal.cpp new file mode 100644 index 000000000..b9711efaa --- /dev/null +++ b/cviruntime/src/common/cpu_function/proposal.cpp @@ -0,0 +1,245 @@ +#include +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +static void _mkanchors(std::vector ctrs, std::vector &anchors) { + anchors.push_back(ctrs[2] - 0.5*(ctrs[0] - 1)); + anchors.push_back(ctrs[3] - 0.5*(ctrs[1] - 1)); + anchors.push_back(ctrs[2] + 0.5*(ctrs[0] - 1)); + anchors.push_back(ctrs[3] + 0.5*(ctrs[1] - 1)); +} + +static void _whctrs(std::vector anchor, 
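// Anchor bookkeeping: _whctrs converts a box (x1, y1, x2, y2) into
// (w, h, x_ctr, y_ctr) and _mkanchors converts back. With
// anchor_base_size = 16, ratios {0.5, 1, 2} and scales {8, 16, 32} (set in
// run() below), generate_anchors() emits 3 * 3 = 9 boxes centered on
// (7.5, 7.5); e.g. ratio 1 at scale 8 yields roughly a 128 x 128 anchor.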
std::vector &ctrs) { + float w = anchor[2] - anchor[0] + 1; + float h = anchor[3] - anchor[1] + 1; + float x_ctr = anchor[0] + 0.5 * (w - 1); + float y_ctr = anchor[1] + 0.5 * (h - 1); + ctrs.push_back(w); + ctrs.push_back(h); + ctrs.push_back(x_ctr); + ctrs.push_back(y_ctr); +} + +static void _ratio_enum(std::vector anchor, std::vector anchor_ratio, + std::vector &ratio_anchors) { + std::vector ctrs; + _whctrs(anchor, ctrs); + float size = ctrs[0] * ctrs[1]; + int ratio_num = anchor_ratio.size(); + for (int i = 0; i < ratio_num; i++) + { + float ratio = size / anchor_ratio[i]; + int ws = int(std::round(std::sqrt(ratio))); + int hs = int(std::round(ws * anchor_ratio[i])); + std::vector ctrs_in; + ctrs_in.push_back(ws); + ctrs_in.push_back(hs); + ctrs_in.push_back(ctrs[2]); + ctrs_in.push_back(ctrs[3]); + _mkanchors(ctrs_in, ratio_anchors); + } +} + +static void _scale_enum(std::vector ratio_anchors, std::vector anchor_scale, + std::vector &anchor_boxes) { + int anchors_ratio_num = ratio_anchors.size() / 4; + for (int i = 0; i < anchors_ratio_num; i++) + { + std::vector anchor; + anchor.push_back(ratio_anchors[i * 4]); + anchor.push_back(ratio_anchors[i * 4 + 1]); + anchor.push_back(ratio_anchors[i * 4 + 2]); + anchor.push_back(ratio_anchors[i * 4 + 3]); + std::vector ctrs; + _whctrs(anchor, ctrs); + int scale_num = anchor_scale.size(); + for (int j = 0; j < scale_num; j++) + { + float ws = ctrs[0] * anchor_scale[j]; + float hs = ctrs[1] * anchor_scale[j]; + std::vector ctrs_in; + ctrs_in.push_back(ws); + ctrs_in.push_back(hs); + ctrs_in.push_back(ctrs[2]); + ctrs_in.push_back(ctrs[3]); + _mkanchors(ctrs_in, anchor_boxes); + } + } +} + +static void generate_anchors(int anchor_base_size, std::vector anchor_scale, + std::vector anchor_ratio, std::vector &anchor_boxes) { + std::vector base_anchor = {0, 0, (float)(anchor_base_size - 1), (float)(anchor_base_size - 1)}; + std::vector ratio_anchors; + _ratio_enum(base_anchor, anchor_ratio, ratio_anchors); + _scale_enum(ratio_anchors, anchor_scale, anchor_boxes); +} + +static void anchor_box_transform_inv(float img_width, float img_height, std::vector> bbox, + std::vector> select_anchor, std::vector> &pred) +{ + int num = bbox.size(); + for (int i = 0; i< num; i++) + { + float dx = bbox[i][0]; + float dy = bbox[i][1]; + float dw = bbox[i][2]; + float dh = bbox[i][3]; + float pred_ctr_x = select_anchor[i][0] + select_anchor[i][2] * dx; + float pred_ctr_y = select_anchor[i][1] + select_anchor[i][3] * dy; + float pred_w = select_anchor[i][2] * std::exp(dw); + float pred_h = select_anchor[i][3] * std::exp(dh); + std::vector tmp_pred; + tmp_pred.push_back(std::max(std::min((float)(pred_ctr_x - 0.5* pred_w), img_width - 1), (float)0.0)); + tmp_pred.push_back(std::max(std::min((float)(pred_ctr_y - 0.5* pred_h), img_height - 1), (float)0.0)); + tmp_pred.push_back(std::max(std::min((float)(pred_ctr_x + 0.5* pred_w), img_width - 1), (float)0.0)); + tmp_pred.push_back(std::max(std::min((float)(pred_ctr_y + 0.5* pred_h), img_height - 1), (float)0.0)); + pred.push_back(tmp_pred); + } +} + +static void anchor_box_nms(std::vector> &pred_boxes, std::vector &confidence, float nms_threshold) +{ + for (size_t i = 0; i < pred_boxes.size() - 1; i++) + { + float s1 = (pred_boxes[i][2] - pred_boxes[i][0] + 1) *(pred_boxes[i][3] - pred_boxes[i][1] + 1); + for (size_t j = i + 1; j < pred_boxes.size(); j++) + { + float s2 = (pred_boxes[j][2] - pred_boxes[j][0] + 1) *(pred_boxes[j][3] - pred_boxes[j][1] + 1); + + float x1 = std::max(pred_boxes[i][0], pred_boxes[j][0]); + 
float y1 = std::max(pred_boxes[i][1], pred_boxes[j][1]); + float x2 = std::min(pred_boxes[i][2], pred_boxes[j][2]); + float y2 = std::min(pred_boxes[i][3], pred_boxes[j][3]); + + float width = x2 - x1; + float height = y2 - y1; + if (width > 0 && height > 0) + { + float IOU = width * height / (s1 + s2 - width * height); + if (IOU > nms_threshold) + { + if (confidence[i] >= confidence[j]) + { + pred_boxes.erase(pred_boxes.begin() + j); + confidence.erase(confidence.begin() + j); + j--; + } + else + { + pred_boxes.erase(pred_boxes.begin() + i); + confidence.erase(confidence.begin() + i); + i--; + break; + } + } + } + } + } +} + +ProposalFunc::~ProposalFunc() {} + +void ProposalFunc::setup(tensor_list_t &inputs, + tensor_list_t &outputs, + OpParam ¶m) { + feat_stride = param.get("feat_stride"); + anchor_base_size = param.get("anchor_base_size"); + net_input_h = param.get("net_input_h"); + net_input_w = param.get("net_input_w"); + rpn_obj_threshold = param.get("rpn_obj_threshold"); + rpn_nms_threshold = param.get("rpn_nms_threshold"); + rpn_nms_post_top_n = param.get("rpn_nms_post_top_n"); + + std::sort(inputs.begin(), inputs.end(), + [](const std::shared_ptr &a, const std::shared_ptr &b) { + return a->shape[1] < b->shape[1]; + }); + + _bottoms = inputs; + _tops = outputs; +} + +void ProposalFunc::run() { + auto top_data = _tops[0]->cpu_data(); + memset(top_data, 0, _tops[0]->size()); + + size_t bottom_count = _bottoms.size(); + assert(bottom_count == 2); + + float *score = (float *)_bottoms[0]->cpu_data(); + float *bbox_deltas = (float *)_bottoms[1]->cpu_data(); + + int batch = _bottoms[0]->shape[0]; + + int height = _bottoms[0]->shape[2]; + int width = _bottoms[0]->shape[3]; + + std::vector anchor_scale = {8, 16, 32}; + std::vector anchor_ratio = {0.5, 1, 2}; + + std::vector anchor_boxes; + generate_anchors(anchor_base_size, anchor_scale, anchor_ratio, anchor_boxes); + + float thresh = rpn_obj_threshold; + + for (int b = 0; b < batch; ++b) { + auto batch_score = score + _bottoms[0]->offset(b); + auto batch_bbox_deltas = bbox_deltas + _bottoms[1]->offset(b); + + std::vector> select_anchor; + std::vector confidence; + std::vector> bbox; + int anchor_num = anchor_scale.size() * anchor_ratio.size(); + + for (int k = 0; k < anchor_num; k++) { + float w = anchor_boxes[4 * k + 2] - anchor_boxes[4 * k] + 1; + float h = anchor_boxes[4 * k + 3] - anchor_boxes[4 * k + 1] + 1; + float x_ctr = anchor_boxes[4 * k] + 0.5 * (w - 1); + float y_ctr = anchor_boxes[4 * k + 1] + 0.5 * (h - 1); + + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + if (batch_score[anchor_num * height * width + (k * height + i) * width + j] >= thresh) { + std::vector tmp_anchor; + std::vector tmp_bbox; + + tmp_anchor.push_back(j * feat_stride + x_ctr); + tmp_anchor.push_back(i * feat_stride + y_ctr); + tmp_anchor.push_back(w); + tmp_anchor.push_back(h); + select_anchor.push_back(tmp_anchor); + confidence.push_back(batch_score[anchor_num * height * width + (k * height + i) * width + j]); + tmp_bbox.push_back(batch_bbox_deltas[(4 * k * height + i) * width + j]); + tmp_bbox.push_back(batch_bbox_deltas[((4 * k +1) * height + i) * width + j]); + tmp_bbox.push_back(batch_bbox_deltas[((4 * k + 2) * height + i) * width + j]); + tmp_bbox.push_back(batch_bbox_deltas[((4 * k + 3) * height + i) * width + j]); + bbox.push_back(tmp_bbox); + } + } + } + } + std::vector> pred_boxes; + anchor_box_transform_inv(net_input_w, net_input_h, bbox, select_anchor, pred_boxes); + anchor_box_nms(pred_boxes, confidence, 
rpn_nms_threshold); + int num = pred_boxes.size() > (size_t)(rpn_nms_post_top_n) ? rpn_nms_post_top_n : pred_boxes.size(); + + auto batch_top_data = top_data + _tops[0]->offset(b); + for (int i = 0; i < num; i++) { + batch_top_data[5 * i] = b; + batch_top_data[5 * i + 1] = pred_boxes[i][0]; + batch_top_data[5 * i + 2] = pred_boxes[i][1]; + batch_top_data[5 * i + 3] = pred_boxes[i][2]; + batch_top_data[5 * i + 4] = pred_boxes[i][3]; + } + } +} + +} +} diff --git a/cviruntime/src/common/cpu_function/proposal.hpp b/cviruntime/src/common/cpu_function/proposal.hpp new file mode 100644 index 000000000..f474c2c84 --- /dev/null +++ b/cviruntime/src/common/cpu_function/proposal.hpp @@ -0,0 +1,40 @@ +#include +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +class ProposalFunc : public ICpuFunction { + +public: + ProposalFunc() {} + + ~ProposalFunc(); + void setup(tensor_list_t &inputs, + tensor_list_t &outputs, + OpParam ¶m); + void run(); + + static ICpuFunction *open() { return new ProposalFunc(); } + static void close(ICpuFunction *func) { delete func; } + +private: + tensor_list_t _bottoms; + tensor_list_t _tops; + + int feat_stride; + int anchor_base_size; + + float rpn_obj_threshold; + float rpn_nms_threshold; + int rpn_nms_post_top_n; + int net_input_w; + int net_input_h; +}; + +} +} diff --git a/cviruntime/src/common/cpu_function/quant.cpp b/cviruntime/src/common/cpu_function/quant.cpp new file mode 100644 index 000000000..89408460d --- /dev/null +++ b/cviruntime/src/common/cpu_function/quant.cpp @@ -0,0 +1,280 @@ +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +static inline signed char float2int8(float v) +{ + int int32 = std::round(v); + if (int32 > 127) return 127; + if (int32 < -128) return -128; + return (signed char)int32; +} + +void QuantFunc::setup(tensor_list_t &inputs, + tensor_list_t &outputs, + OpParam ¶m) { + _bottom = inputs[0]; + _top = outputs[0]; + if (param.has("scale")) { + _scale = param.get("scale"); + } + if (param.get("to") == "NONE") { + _dequant = true; + if (param.has("threshold")) { + _scale = param.get("threshold") / 128.0f; + } + #if __arm__ + work_buf = (int*)aligned_alloc(32, 1024 * sizeof(int)); + assert(work_buf && "failed to allocate buffer for dequant"); + #endif + } else { + if (param.has("threshold")) { + _scale = 128.0f / param.get("threshold"); + } + } +} + +void QuantFunc::run() { + if (_dequant) { + dequantToFp32(); + } else { + quantFromFp32(); + } +} + +void QuantFunc::dequantToFp32() { + auto top_data = _top->cpu_data(); + if (_bottom->fmt == CVI_FMT_INT8) { + auto bottom_data = _bottom->cpu_data(); + float scale = _scale; +#if (__arm__ || __aarch64__) + int total = (int)_bottom->count(); +#if __aarch64__ + int8_t *ptr = bottom_data; + float* outptr = top_data; + // all neuron memory size is aligned to 16 or 32 bytes, + // it's safe to compute more than needed. 
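The AArch64 block below is a widen → convert → multiply pipeline. As a readability aid, here is a rough intrinsics rendering of one 8-element iteration; `dequant8` and the driver are hypothetical names for illustration only (not part of the runtime), and the scale follows the `threshold / 128` convention established in `setup` above. It should compile on any ARM target with NEON:

```cpp
// Rough NEON-intrinsics equivalent of one iteration of the int8 -> fp32
// dequant loop: load 8 x int8, widen to int32, convert to float, scale.
#include <arm_neon.h>
#include <cstdint>
#include <cstdio>

static void dequant8(const int8_t *in, float *out, float scale) {
  int8x8_t v8 = vld1_s8(in);                     // 8 x int8
  int16x8_t v16 = vmovl_s8(v8);                  // widen to int16
  int32x4_t lo = vmovl_s16(vget_low_s16(v16));   // low 4 -> int32
  int32x4_t hi = vmovl_s16(vget_high_s16(v16));  // high 4 -> int32
  vst1q_f32(out,     vmulq_n_f32(vcvtq_f32_s32(lo), scale));
  vst1q_f32(out + 4, vmulq_n_f32(vcvtq_f32_s32(hi), scale));
}

int main() {
  int8_t q[8] = {-128, -64, -1, 0, 1, 64, 100, 127};
  float f[8];
  dequant8(q, f, 4.0f / 128.0f);  // e.g. calibration threshold = 4.0
  for (float v : f) std::printf("%g ", v);
  std::printf("\n");
  return 0;
}
```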
+ int nn = (total + 7) / 8; + if (nn > 0) { + asm volatile( + "dup v2.4s, %w6 \n" // scale + "dup v7.4s, %w7 \n" + "0: \n" + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v8.8b}, [%1], #8 \n" + "saddl v9.8h, v8.8b, v7.8b \n" + "saddl v0.4s, v9.4h, v7.4h \n" + "saddl2 v1.4s, v9.8h, v7.8h \n" + // top_s32 -> top_f32 + "scvtf v5.4s, v0.4s \n" + "scvtf v6.4s, v1.4s \n" + // top_f32 = top_f32 * scale_out + "fmul v5.4s, v5.4s, v2.4s \n" + "fmul v6.4s, v6.4s, v2.4s \n" + // save top_f32 + "st1 {v5.4s, v6.4s}, [%2], #32 \n" + "subs %w0, %w0, #1 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr), // %1 + "=r"(outptr) // %2 + : "0"(nn), + "1"(ptr), + "2"(outptr), + "r"(scale), // %6 + "r"(0) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9" + ); + } +#else + int size = 0; + for (int offset = 0; offset < total; offset += size) { + size = (total - offset) < 1024 ? + (total - offset) : 1024; + for (int i = 0; i < size; ++i) { + work_buf[i] = (int)bottom_data[offset + i]; + } + int *ptr = work_buf; + float *outptr = top_data + offset; + + // all neuron memory size is aligned to 16 or 32 bytes, + // it's safe to compute more than needed. + int nn = (size + 7) / 8; + if (nn > 0) { + asm volatile( + "pld [%1, #256] \n" + "vld1.s32 {d0-d3}, [%1]! \n" //q0-q1 data + "vdup.f32 q10, %6 \n" //q10 scale + + "0: \n" + "vcvt.f32.s32 q0, q0 \n" + "vcvt.f32.s32 q1, q1 \n" + + "vmul.f32 q2,q0,q10 \n" + "vmul.f32 q3,q1,q10 \n" + + "pld [%1, #256] \n" + "vld1.s32 {d0-d3}, [%1]! \n" + "vst1.f32 {d4-d7}, [%2]! \n" + + "subs %0, #1 \n" + "bne 0b \n" + + "sub %1, #32 \n" + : "=r"(nn), // %0 + "=r"(ptr), // %1 + "=r"(outptr) // %2 + : "0"(nn), + "1"(ptr), + "2"(outptr), + "r"(scale) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q12" + ); + } + } +#endif +#else + for (int i = 0; i < (int)_bottom->count(); ++i) { + top_data[i] = bottom_data[i] * scale; + } +#endif // (__arm__ || __aarch64__) + } else if (_bottom->fmt == CVI_FMT_BF16) { + uint16_t *p = _bottom->cpu_data(); + uint16_t *q = reinterpret_cast(top_data); + int size = (int)_bottom->count(); + for (; size != 0; p++, q += 2, size--) { +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + q[0] = *p; + q[1] = 0; +#else + q[0] = 0; + q[1] = *p; +#endif + } + } else { + assert(0); + } +} + +void QuantFunc::quantFromFp32() { + auto bottom_data = _bottom->cpu_data(); + if (_top->fmt == CVI_FMT_INT8) { + auto top_data = _top->cpu_data(); + float scale = _scale; +#if (__arm__ || __aarch64__) + int size = (int)_bottom->count(); + const float* ptr = bottom_data; + signed char* outptr = top_data; + + int nn = (size + 7) / 8; + if (nn > 0) { +#if __aarch64__ + asm volatile( + "dup v2.4s, %w6 \n" //scale + "0: \n" + "prfm pldl1keep, [%1, #128] \n" + "ld1 {v0.4s, v1.4s}, [%1], #32 \n" //data + // bottom_f32 = bottom_f32 * scale + "fmul v3.4s, v0.4s, v2.4s \n" + "fmul v4.4s, v1.4s, v2.4s \n" + // top_f32 -> top_s32 + "fcvtas v5.4s, v3.4s \n" + "fcvtas v6.4s, v4.4s \n" + // top_s32 -> top_s16 + "sqxtn v7.4h, v5.4s \n" + "sqxtn2 v7.8h, v6.4s \n" + // top_s16 -> top_s8 + "sqxtn v8.8b, v7.8h \n" + // save top_s8 + "st1 {v8.8b}, [%2], #8 \n" + "subs %w0, %w0, #1 \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr), // %1 + "=r"(outptr) // %2 + : "0"(nn), + "1"(ptr), + "2"(outptr), + "r"(scale) // %6 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8" + ); +#else + asm volatile( + "pld [%1, #256] \n" + "vld1.f32 {d0-d3}, [%1]! 
\n" + "vdup.32 q10, %6 \n" + + "0: \n" + "vmul.f32 q0,q0,q10 \n" + "vmul.f32 q1,q1,q10 \n" + + "vcvtr.s32.f32 s0,s0 \n" + "vcvtr.s32.f32 s1,s1 \n" + "vcvtr.s32.f32 s2,s2 \n" + "vcvtr.s32.f32 s3,s3 \n" + "vcvtr.s32.f32 s4,s4 \n" + "vcvtr.s32.f32 s5,s5 \n" + "vcvtr.s32.f32 s6,s6 \n" + "vcvtr.s32.f32 s7,s7 \n" + + "vqmovn.s32 d4,q0 \n" + "vqmovn.s32 d5,q1 \n" + + "pld [%1, #256] \n" + "vld1.f32 {d0-d3}, [%1]! \n" + + "vqmovn.s16 d4, q2 \n" + "vst1.8 {d4}, [%2]! \n" + + "subs %0, #1 \n" + "bne 0b \n" + + "sub %1, #32 \n" + : "=r"(nn), // %0 + "=r"(ptr), // %1 + "=r"(outptr) // %2 + : "0"(nn), + "1"(ptr), + "2"(outptr), + "r"(scale) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q10", "q11" + ); +#endif + } +#else + for (int i = 0; i < (int)_bottom->count(); ++i) { + + float fval = bottom_data[i] * scale; + //int ival = (int)(std::floor(fval + 0.5)); + int ival = float2int8(fval); + if (ival > 127) { + top_data[i] = 127; + } else if (ival < -128) { + top_data[i] = -128; + } else { + top_data[i] = (int8_t)ival; + } + } +#endif // (__arm__ || __aarch64__) + } else if (_top->fmt == CVI_FMT_BF16) { + auto top_data = _top->cpu_data(); + for (int i = 0; i < (int)_bottom->count(); ++i) { + float val = bottom_data[i] * 1.001957f; +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + top_data[i] = ((uint16_t *)(&val))[0]; +#else + top_data[i] = ((uint16_t *)(&val))[1]; +#endif + } + } else { + assert(0); + } +} + +} // namespace runtime +} // namespace cvi diff --git a/cviruntime/src/common/cpu_function/quant.hpp b/cviruntime/src/common/cpu_function/quant.hpp new file mode 100644 index 000000000..703e57553 --- /dev/null +++ b/cviruntime/src/common/cpu_function/quant.hpp @@ -0,0 +1,38 @@ +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +class QuantFunc : public ICpuFunction { +public: + ~QuantFunc() { + #if __arm__ + if (work_buf) + free(work_buf); + #endif + } + void setup(std::vector > &inputs, + std::vector > &outputs, + OpParam ¶m); + void run(); + static ICpuFunction *open() { return new QuantFunc(); } + static void close(ICpuFunction *func) { delete func; } + +private: + void quantFromFp32(); + void dequantToFp32(); + + std::shared_ptr _bottom; + std::shared_ptr _top; + float _scale = 1.0f; + bool _dequant = false; + #if __arm__ + int32_t *work_buf = nullptr; + #endif +}; + +} +} \ No newline at end of file diff --git a/cviruntime/src/common/cpu_function/reducel2.cpp b/cviruntime/src/common/cpu_function/reducel2.cpp new file mode 100644 index 000000000..09ce1c111 --- /dev/null +++ b/cviruntime/src/common/cpu_function/reducel2.cpp @@ -0,0 +1,94 @@ +#include +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +inline int count(std::vector &shape, int start_axis, int end_axis) { + int64_t count = 1; + for (int i = start_axis; i < end_axis; ++i) { + count *= shape[i]; + } + return count; +} + + +template +int my_reduce_l2(T *input, T *output, + std::vector &org_input_shape, + std::vector &axes) { + assert(axes.size() > 0); + auto input_shape = org_input_shape; + int size = count(input_shape, 0, input_shape.size()); + std::vector tmp (size, 0); + T* _output = tmp.data(); + + for (int i = 0; i < (int)axes.size(); i++) { + int dim = input_shape.size(); + int axis = axes[i]; + assert(dim > axis); + + int inner = count(input_shape, axis + 1, input_shape.size()); + int next_inner = inner * input_shape[axis]; + int outer = count(input_shape, 0, axis); + + for (int i = 0; i < outer; i++) { + std::vector inner_sum (inner, 0); + for (int 
s = 0; s < input_shape[axis]; s++) { + for (int j = 0; j < inner; j++) { + inner_sum[j] += std::pow(input[i * next_inner + s * inner + j], 2); + } + } + + // l2 + for (int j = 0; j < inner; j++) { + _output[i * inner + j] = std::sqrt(inner_sum[j]); + } + } + + input_shape[axis] = 1; + input = _output; + } + + // export + size = count(input_shape, 0, input_shape.size()); + std::copy(_output, _output + size, output); + + return 0; +} + +ReduceL2Func::~ReduceL2Func() {} + +void ReduceL2Func::setup(std::vector> &inputs, + std::vector> &outputs, + OpParam ¶m) { + + _top = outputs[0]; + _bottom = inputs[0]; + _axes = param.get>("axes"); + assert(_bottom->fmt == _top->fmt && "in/out dtype should be equal"); +} + +void ReduceL2Func::run() { + auto input_shape = _bottom->shape; + if (CVI_FMT_INT8 == _bottom->fmt || CVI_FMT_UINT8 == _bottom->fmt) { + auto input = _bottom->cpu_data(); + auto output = _top->cpu_data(); + my_reduce_l2(input, output, input_shape, _axes); + } + else if (CVI_FMT_FP32 == _bottom->fmt) { + auto input = _bottom->cpu_data(); + auto output = _top->cpu_data(); + my_reduce_l2(input, output, input_shape, _axes); + } + else { + assert(0 && "not support dtype"); + } +} + +} // namespace runtime +} // namespace cvi diff --git a/cviruntime/src/common/cpu_function/reducel2.hpp b/cviruntime/src/common/cpu_function/reducel2.hpp new file mode 100644 index 000000000..1fbf1fa54 --- /dev/null +++ b/cviruntime/src/common/cpu_function/reducel2.hpp @@ -0,0 +1,32 @@ +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +class ReduceL2Func : public ICpuFunction { + +public: + ReduceL2Func() {} + + ~ReduceL2Func(); + void setup(std::vector> &inputs, + std::vector> &outputs, + OpParam ¶m); + void run(); + + static ICpuFunction *open() { return new ReduceL2Func(); } + static void close(ICpuFunction *func) { delete func; } + +private: + std::shared_ptr _bottom; + std::shared_ptr _top; + + std::vector _axes; +}; + +} +} diff --git a/cviruntime/src/common/cpu_function/reducemax.cpp b/cviruntime/src/common/cpu_function/reducemax.cpp new file mode 100644 index 000000000..4198fc9ab --- /dev/null +++ b/cviruntime/src/common/cpu_function/reducemax.cpp @@ -0,0 +1,75 @@ +#include +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +inline int count(std::vector &shape, int start_axis, int end_axis) { + int64_t count = 1; + for (int i = start_axis; i < end_axis; ++i) { + count *= shape[i]; + } + return count; +} + +template +int my_reduce_max(T *input, T *output, + std::vector &input_shape, + std::vector &axes) { + assert(axes.size() > 0); + int axis = axes[0]; + // only support one axis, if has two axis, should be continous + int total = count(input_shape, 0, input_shape.size()); + int n = count(input_shape, 0, axis); + int c = input_shape[axis]; + int hw = total / (n*c); + + for (int nidx = 0; nidx < n; nidx++) { + for (int inner_idx = 0; inner_idx < hw; inner_idx++) { + for (int cidx = 0; cidx < c; cidx++) { + T tmp = input[nidx * c * hw + cidx * hw + inner_idx]; + if (cidx == 0) + output[nidx * hw + inner_idx] = tmp; + output[nidx * hw + inner_idx] = std::max(tmp, output[nidx * hw + inner_idx]); + } + } + } + return 0; +} + + +ReduceMaxFunc::~ReduceMaxFunc() {} + +void ReduceMaxFunc::setup(tensor_list_t &inputs, + tensor_list_t &outputs, + OpParam ¶m) { + + _top = outputs[0]; + _bottom = inputs[0]; + _axes = param.get>("axes"); + assert(_bottom->fmt == _top->fmt && "in/out dtype should be equal"); +} + +void ReduceMaxFunc::run() 
{ + auto input_shape = _bottom->shape; + if (CVI_FMT_INT8 == _bottom->fmt || CVI_FMT_UINT8 == _bottom->fmt) { + auto input = _bottom->cpu_data(); + auto output = _top->cpu_data(); + my_reduce_max(input, output, input_shape, _axes); + } + else if (CVI_FMT_FP32 == _bottom->fmt) { + auto input = _bottom->cpu_data(); + auto output = _top->cpu_data(); + my_reduce_max(input, output, input_shape, _axes); + } + else { + assert(0 && "not support dtype"); + } +} + +} // namespace runtime +} // namespace cvi diff --git a/cviruntime/src/common/cpu_function/reducemax.hpp b/cviruntime/src/common/cpu_function/reducemax.hpp new file mode 100644 index 000000000..14f63f026 --- /dev/null +++ b/cviruntime/src/common/cpu_function/reducemax.hpp @@ -0,0 +1,32 @@ +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +class ReduceMaxFunc : public ICpuFunction { + +public: + ReduceMaxFunc() {} + + ~ReduceMaxFunc(); + void setup(tensor_list_t &inputs, + tensor_list_t &outputs, + OpParam ¶m); + void run(); + + static ICpuFunction *open() { return new ReduceMaxFunc(); } + static void close(ICpuFunction *func) { delete func; } + +private: + std::shared_ptr _bottom; + std::shared_ptr _top; + + std::vector _axes; +}; + +} +} diff --git a/cviruntime/src/common/cpu_function/reducemean.cpp b/cviruntime/src/common/cpu_function/reducemean.cpp new file mode 100644 index 000000000..1326a2a0c --- /dev/null +++ b/cviruntime/src/common/cpu_function/reducemean.cpp @@ -0,0 +1,94 @@ +#include +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +inline int count(std::vector &shape, int start_axis, int end_axis) { + int64_t count = 1; + for (int i = start_axis; i < end_axis; ++i) { + count *= shape[i]; + } + return count; +} + + +template +int my_reduce_mean(T *input, T *output, + std::vector &org_input_shape, + std::vector &axes) { + assert(axes.size() > 0); + auto input_shape = org_input_shape; + int size = count(input_shape, 0, input_shape.size()); + std::vector tmp (size, 0); + T* _output = tmp.data(); + + for (int i = 0; i < (int)axes.size(); i++) { + int dim = input_shape.size(); + int axis = axes[i]; + assert(dim > axis); + + int inner = count(input_shape, axis + 1, input_shape.size()); + int next_inner = inner * input_shape[axis]; + int outer = count(input_shape, 0, axis); + + for (int i = 0; i < outer; i++) { + std::vector inner_sum (inner, 0); + for (int s = 0; s < input_shape[axis]; s++) { + for (int j = 0; j < inner; j++) { + inner_sum[j] += input[i * next_inner + s * inner + j]; + } + } + + // mean + for (int j = 0; j < inner; j++) { + _output[i * inner + j] = inner_sum[j] / input_shape[axis]; + } + } + + input_shape[axis] = 1; + input = _output; + } + + // export + size = count(input_shape, 0, input_shape.size()); + std::copy(_output, _output + size, output); + + return 0; +} + +ReduceMeanFunc::~ReduceMeanFunc() {} + +void ReduceMeanFunc::setup(tensor_list_t &inputs, + tensor_list_t &outputs, + OpParam ¶m) { + + _top = outputs[0]; + _bottom = inputs[0]; + _axes = param.get>("axes"); + assert(_bottom->fmt == _top->fmt && "in/out dtype should be equal"); +} + +void ReduceMeanFunc::run() { + auto input_shape = _bottom->shape; + if (CVI_FMT_INT8 == _bottom->fmt || CVI_FMT_UINT8 == _bottom->fmt) { + auto input = _bottom->cpu_data(); + auto output = _top->cpu_data(); + my_reduce_mean(input, output, input_shape, _axes); + } + else if (CVI_FMT_FP32 == _bottom->fmt) { + auto input = _bottom->cpu_data(); + auto output = _top->cpu_data(); + 
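The reduce kernels here (`my_reduce_l2`, `my_reduce_max`, `my_reduce_mean`) all rely on the same outer/axis/inner index decomposition: for shape `[d0, ..., dn-1]` and axis `k`, `outer = d0*...*d(k-1)` and `inner = d(k+1)*...*d(n-1)`, so element `(o, s, j)` lives at offset `o * dk * inner + s * inner + j`. A minimal standalone sketch of a single-axis mean reduction under that decomposition (all names are illustrative):

```cpp
// Mean-reduce axis 1 of a [2, 3, 4] tensor via the outer/axis/inner layout.
#include <cstdio>
#include <vector>

int main() {
  int outer = 2, n = 3, inner = 4;  // shape [2, 3, 4], reduce over axis 1

  std::vector<float> in(outer * n * inner);
  for (int i = 0; i < (int)in.size(); ++i) in[i] = (float)i;

  std::vector<float> out(outer * inner, 0.0f);
  for (int o = 0; o < outer; ++o)
    for (int s = 0; s < n; ++s)
      for (int j = 0; j < inner; ++j)
        out[o * inner + j] += in[(o * n + s) * inner + j];
  for (auto &v : out) v /= n;  // mean over the reduced axis

  std::printf("out[0] = %g (mean of in[0], in[4], in[8])\n", out[0]);
  return 0;
}
```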
my_reduce_mean(input, output, input_shape, _axes); + } + else { + assert(0 && "not support dtype"); + } +} + +} // namespace runtime +} // namespace cvi diff --git a/cviruntime/src/common/cpu_function/reducemean.hpp b/cviruntime/src/common/cpu_function/reducemean.hpp new file mode 100644 index 000000000..8991a4ceb --- /dev/null +++ b/cviruntime/src/common/cpu_function/reducemean.hpp @@ -0,0 +1,32 @@ +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +class ReduceMeanFunc : public ICpuFunction { + +public: + ReduceMeanFunc() {} + + ~ReduceMeanFunc(); + void setup(tensor_list_t &inputs, + tensor_list_t &outputs, + OpParam ¶m); + void run(); + + static ICpuFunction *open() { return new ReduceMeanFunc(); } + static void close(ICpuFunction *func) { delete func; } + +private: + std::shared_ptr _bottom; + std::shared_ptr _top; + + std::vector _axes; +}; + +} +} diff --git a/cviruntime/src/common/cpu_function/retinaface_detection.cpp b/cviruntime/src/common/cpu_function/retinaface_detection.cpp new file mode 100644 index 000000000..5251e9abf --- /dev/null +++ b/cviruntime/src/common/cpu_function/retinaface_detection.cpp @@ -0,0 +1,141 @@ +#include +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +void RetinaFaceDetectionFunc::setup(tensor_list_t &inputs, + tensor_list_t &outputs, + OpParam ¶m) { + // sort inputs by neuron shape size + std::sort(inputs.begin(), inputs.end(), + [](const std::shared_ptr &a, const std::shared_ptr &b) { + if (a->shape[3] < b->shape[3]) { + return true; + } else if (a->shape[3] == b->shape[3]) { + return a->shape[1] < b->shape[1]; + } else { + return false; + } + }); + _bottoms = inputs; + _tops = outputs; + + _nms_threshold = param.get("nms_threshold"); + _confidence_threshold = param.get("confidence_threshold"); + _keep_topk = param.get("keep_topk"); +} + +void RetinaFaceDetectionFunc::run() { + auto top_data = _tops[0]->cpu_data(); + memset(top_data, 0, _tops[0]->size()); + + size_t bottom_count = _bottoms.size(); + assert(bottom_count == 9); + + auto batch = _tops[0]->shape[0]; + + for (int b = 0; b < batch; ++b) { + std::vector infos; + for (size_t i = 0; i < _feature_stride_fpn.size(); ++i) { + int stride = _feature_stride_fpn[i]; + + auto score_data = _bottoms[3*i]->cpu_data() + _bottoms[3*i]->offset(b); + size_t score_count = _bottoms[3*i]->count() / batch; + + auto bbox_data = _bottoms[3*i+1]->cpu_data() + _bottoms[3*i+1]->offset(b); + size_t bbox_count = _bottoms[3*i+1]->count() / batch; + + auto landmark_data = _bottoms[3*i+2]->cpu_data() + _bottoms[3*i+2]->offset(b); + size_t landmark_count = _bottoms[3*i+2]->count() / batch; + + auto shape = _bottoms[3*i]->shape; + size_t height = shape[2]; + size_t width = shape[3]; + + std::vector score(score_data + score_count / 2, score_data + score_count); + std::vector bbox(bbox_data, bbox_data + bbox_count); + std::vector landmark(landmark_data, landmark_data + landmark_count); + + int count = height * width; + std::string key = "stride" + std::to_string(stride); + auto anchors_fpn = _anchors_fpn[key]; + auto num_anchors = _num_anchors[key]; + + std::vector anchors = anchors_plane(height, width, stride, anchors_fpn); + + for (int num = 0; num < num_anchors; ++num) { + for (int j = 0; j < count; ++j) { + float confidence = score[j + count * num]; + if (confidence <= _confidence_threshold) + continue; + + float dx = bbox[j + count * (0 + num * 4)]; + float dy = bbox[j + count * (1 + num * 4)]; + float dw = bbox[j + count * (2 + num * 
4)]; + float dh = bbox[j + count * (3 + num * 4)]; + std::vector bbox_deltas{dx, dy, dw, dh}; + auto bbox = bbox_pred(anchors[j + count * num], bbox_deltas); + + std::vector landmark_deltas(10, 0); + for (size_t k = 0; k < 5; ++k) { + landmark_deltas[k] = landmark[j + count * (num * 10 + k * 2)]; + landmark_deltas[k + 5] = landmark[j + count * (num * 10 + k * 2 + 1)]; + } + + auto pts = landmark_pred(anchors[j + count * num], landmark_deltas); + + FaceInfo info; + info.x1 = bbox[0]; + info.y1 = bbox[1]; + info.x2 = bbox[2]; + info.y2 = bbox[3]; + info.score = confidence; + for (int idx = 0; idx < 5; ++idx) { + info.x[idx] = pts[idx]; + info.y[idx] = pts[idx + 5]; + } + + infos.push_back(info); + } + } + } + + auto preds = nms(infos, _nms_threshold); + auto keep_topk = _keep_topk; + if (keep_topk > (int)preds.size()) + keep_topk = (int)preds.size(); + + long long count = 0; + auto batch_top_data = top_data + _tops[0]->offset(b); + for (int i = 0; i < keep_topk; ++i) { + batch_top_data[count++] = preds[i].x1; + batch_top_data[count++] = preds[i].y1; + batch_top_data[count++] = preds[i].x2; + batch_top_data[count++] = preds[i].y2; + batch_top_data[count++] = preds[i].score; + for (int j = 0; j < 5; ++j) { + batch_top_data[count++] = preds[i].x[j]; + batch_top_data[count++] = preds[i].y[j]; + } + +#if 0 + TPU_LOG_DEBUG( + "x1 = %f, y1 = %f, x2 = %f, y2 = %f, score = %f," + "pts1 = %f, pts2 = %f, pts3 = %f, pts4 = %f, pts5 = %f" + "pts6 = %f, pts7 = %f, pts8 = %f, pts9 = %f, pts10 = %f\n", + preds[i].x1, preds[i].y1, preds[i].x2, preds[i].y2, preds[i].score, + preds[i].x[0], preds[i].y[0], preds[i].x[1], preds[i].y[1], + preds[i].x[2], preds[i].y[2], preds[i].x[3], preds[i].y[3], + preds[i].x[4], preds[i].y[4]); +#endif + } + } +} + +} // namespace runtime +} // namespace cvi diff --git a/cviruntime/src/common/cpu_function/retinaface_detection.hpp b/cviruntime/src/common/cpu_function/retinaface_detection.hpp new file mode 100644 index 000000000..747dabf45 --- /dev/null +++ b/cviruntime/src/common/cpu_function/retinaface_detection.hpp @@ -0,0 +1,281 @@ +#include +#include +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +struct AnchorCfg { +public: + AnchorCfg(int stride, std::vector scales, int base_size, std::vector ratios, + int allowed_border) + : stride(stride), scales(scales), base_size(base_size), ratios(ratios), + allowed_border(allowed_border) {} + + int stride; + std::vector scales; + int base_size; + std::vector ratios; + int allowed_border; +}; + +struct AnchorBox { + float x1, y1, x2, y2; +}; + +struct AnchorCenter { + float ctr_x, ctr_y, w, h; +}; + +struct FaceInfo { + float x1, y1, x2, y2; + float score; + float x[5]; + float y[5]; +}; + +class RetinaFaceDetectionFunc : public ICpuFunction { + +public: + RetinaFaceDetectionFunc() { + _cfg.clear(); + AnchorCfg cfg1(32, {32, 16}, 16, {1.0}, 9999); + AnchorCfg cfg2(16, {8, 4}, 16, {1.0}, 9999); + AnchorCfg cfg3(8, {2, 1}, 16, {1.0}, 9999); + _cfg.push_back(cfg1); + _cfg.push_back(cfg2); + _cfg.push_back(cfg3); + + _anchors_fpn.clear(); + auto anchors = generate_anchors_fpn(false, _cfg); + for (size_t i = 0; i < _feature_stride_fpn.size(); ++i) { + std::string key = "stride" + std::to_string(_feature_stride_fpn[i]); + _anchors_fpn[key] = anchors[i]; + _num_anchors[key] = anchors[i].size(); + } + } + + ~RetinaFaceDetectionFunc() = default; + void setup(tensor_list_t &inputs, + tensor_list_t &outputs, OpParam ¶m); + void run(); + + static ICpuFunction *open() { return new 
RetinaFaceDetectionFunc(); } + static void close(ICpuFunction *func) { delete func; } + +private: + AnchorCenter mkcenter(AnchorBox &base_anchor) { + AnchorCenter ctr; + ctr.w = base_anchor.x2 - base_anchor.x1 + 1; + ctr.h = base_anchor.y2 - base_anchor.y1 + 1; + ctr.ctr_x = base_anchor.x1 + 0.5 * (ctr.w - 1); + ctr.ctr_y = base_anchor.y1 + 0.5 * (ctr.h - 1); + return ctr; + } + + AnchorBox mkanchor(AnchorCenter &ctr) { + AnchorBox anchor; + anchor.x1 = ctr.ctr_x - 0.5 * (ctr.w - 1); + anchor.y1 = ctr.ctr_y - 0.5 * (ctr.h - 1); + anchor.x2 = ctr.ctr_x + 0.5 * (ctr.w + 1); + anchor.y2 = ctr.ctr_y + 0.5 * (ctr.h + 1); + return anchor; + } + + std::vector ratio_enum(AnchorBox &base_anchor, std::vector &ratios) { + std::vector anchors; + for (size_t i = 0; i < ratios.size(); ++i) { + AnchorCenter ctr = mkcenter(base_anchor); + + float scale = (ctr.w * ctr.h) / ratios[i]; + ctr.w = std::round(std::sqrt(scale)); + ctr.h = std::round(ctr.w * ratios[i]); + + AnchorBox anchor = mkanchor(ctr); + anchors.push_back(anchor); + } + return anchors; + } + + std::vector scale_enum(AnchorBox anchor, std::vector &scales) { + std::vector anchors; + for (size_t i = 0; i < scales.size(); ++i) { + auto ctr = mkcenter(anchor); + ctr.w = ctr.w * scales[i]; + ctr.h = ctr.h * scales[i]; + + auto scale_anchor = mkanchor(ctr); + // LOGI << "x1 = " << scale_anchor.x1 << ",y1 = " << scale_anchor.y1 + // << ",x2 = " << scale_anchor.x2 << ",y2 = " << scale_anchor.y2; + anchors.push_back(scale_anchor); + } + + return anchors; + } + + std::vector generate_anchors(bool dense, AnchorCfg &cfg) { + AnchorBox base_anchor; + base_anchor.x1 = 0; + base_anchor.y1 = 0; + base_anchor.x2 = cfg.base_size - 1; + base_anchor.y2 = cfg.base_size - 1; + + auto ratio_anchors = ratio_enum(base_anchor, cfg.ratios); + + std::vector anchors; + for (size_t i = 0; i < ratio_anchors.size(); ++i) { + auto scale_anchors = scale_enum(ratio_anchors[i], cfg.scales); + anchors.insert(anchors.end(), scale_anchors.begin(), scale_anchors.end()); + } + + if (dense) { + // TODO: anchors x and y need to add stride / 2 + } + return anchors; + } + + std::vector> + generate_anchors_fpn(bool dense, std::vector &cfg) { + std::vector> anchors_fpn; + for (size_t i = 0; i < cfg.size(); ++i) { + auto anchors = generate_anchors(dense, cfg[i]); + anchors_fpn.push_back(anchors); + } + return anchors_fpn; + } + + std::vector anchors_plane(int height, int width, int stride, + std::vector anchors_fpn) { + std::vector anchors; + for (size_t k = 0; k < anchors_fpn.size(); ++k) { + for (int ih = 0; ih < height; ++ih) { + int sh = ih * stride; + for (int iw = 0; iw < width; ++iw) { + int sw = iw * stride; + AnchorBox anchor; + anchor.x1 = anchors_fpn[k].x1 + sw; + anchor.y1 = anchors_fpn[k].y1 + sh; + anchor.x2 = anchors_fpn[k].x2 + sw; + anchor.y2 = anchors_fpn[k].y2 + sh; + anchors.push_back(anchor); + // LOGI << "x1 = " << anchor.x1 << ",y1 = " << anchor.y1 + // << ",x2 = " << anchor.x2 << ",y2 = " << anchor.y2; + } + } + } + + return anchors; + } + + std::vector bbox_pred(AnchorBox anchor, std::vector bbox_deltas) { + std::vector bbox(4, 0); + + float width = anchor.x2 - anchor.x1 + 1; + float height = anchor.y2 - anchor.y1 + 1; + float center_x = anchor.x1 + 0.5 * (width - 1); + float center_y = anchor.y1 + 0.5 * (height - 1); + + float pred_center_x = bbox_deltas[0] * width + center_x; + float pred_center_y = bbox_deltas[1] * height + center_y; + float pred_w = exp(bbox_deltas[2]) * width; + float pred_h = exp(bbox_deltas[3]) * height; + + bbox[0] = pred_center_x - 0.5 * 
(pred_w - 1); + bbox[1] = pred_center_y - 0.5 * (pred_h - 1); + bbox[2] = pred_center_x + 0.5 * (pred_w - 1); + bbox[3] = pred_center_y + 0.5 * (pred_h - 1); + + return bbox; + } + + std::vector landmark_pred(AnchorBox anchor, std::vector landmark_deltas) { + std::vector pts(10, 0); + + float width = anchor.x2 - anchor.x1 + 1; + float height = anchor.y2 - anchor.y1 + 1; + float center_x = anchor.x1 + 0.5 * (width - 1); + float center_y = anchor.y1 + 0.5 * (height - 1); + + for (int i = 0; i < 5; ++i) { + pts[i] = center_x + landmark_deltas[i] * width; + pts[i + 5] = center_y + landmark_deltas[i + 5] * height; + } + + return pts; + } + + std::vector nms(std::vector infos, float nms_threshold) { + std::vector infos_nms; + std::sort(infos.begin(), infos.end(), + [](FaceInfo &a, FaceInfo &b) { return a.score > b.score; }); + + int selected = 0; + int count = infos.size(); + std::vector mask(count, 0); + bool exit = false; + while (!exit) { + while (selected < count && mask[selected] == 1) + selected++; + + if (selected == count) { + exit = true; + continue; + } + + infos_nms.push_back(infos[selected]); + mask[selected] = 1; + + float w1 = infos[selected].x2 - infos[selected].x1 + 1; + float h1 = infos[selected].y2 - infos[selected].y1 + 1; + float area1 = w1 * h1; + + selected++; + for (int i = selected; i < count; ++i) { + if (mask[i] == 1) + continue; + + float w2 = infos[i].x2 - infos[i].x1 + 1; + float h2 = infos[i].y2 - infos[i].y1 + 1; + float area2 = w2 * h2; + + float inter_x1 = std::max(infos[selected].x1, infos[i].x1); + float inter_y1 = std::max(infos[selected].y1, infos[i].y1); + float inter_x2 = std::min(infos[selected].x2, infos[i].x2); + float inter_y2 = std::min(infos[selected].y2, infos[i].y2); + + float w = inter_x2 - inter_x1 + 1; + float h = inter_y2 - inter_y1 + 1; + + if (w <= 0 || h <= 0) + continue; + + float iou = w * h / (area1 + area2 - w * h); + if (iou > nms_threshold) { + mask[i] = 1; + } + } + } + + return infos_nms; + } + +private: + tensor_list_t _bottoms; + tensor_list_t _tops; + + float _nms_threshold; + float _confidence_threshold; + int _keep_topk; + + std::unordered_map> _anchors_fpn; + std::unordered_map _num_anchors; + std::vector _cfg; + std::vector _feature_stride_fpn{32, 16, 8}; +}; + +} // namespace runtime +} // namespace cvi diff --git a/cviruntime/src/common/cpu_function/roi_pooling.cpp b/cviruntime/src/common/cpu_function/roi_pooling.cpp new file mode 100644 index 000000000..e0d51319d --- /dev/null +++ b/cviruntime/src/common/cpu_function/roi_pooling.cpp @@ -0,0 +1,114 @@ +#include +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +ROIPoolingFunc::~ROIPoolingFunc() {} + +void ROIPoolingFunc::setup(tensor_list_t &inputs, + tensor_list_t &outputs, + OpParam ¶m) { + pooled_h = param.get("pooled_h"); + pooled_w = param.get("pooled_w"); + spatial_scale = param.get("spatial_scale"); + + auto on = outputs[0]->shape[0]; + auto oc = outputs[0]->shape[1]; + if (inputs[0]->shape[1] == oc && inputs[1]->shape[2] == on) { + _bottoms = inputs; + } else { + std::swap(inputs[0], inputs[1]); + _bottoms = inputs; + } + + _tops = outputs; +} + +void ROIPoolingFunc::run() { + auto top_data = _tops[0]->cpu_data(); + memset(top_data, 0, _tops[0]->size()); + + size_t bottom_count = _bottoms.size(); + assert(bottom_count == 2); + + float *data = (float *)_bottoms[0]->cpu_data(); + float *rois = (float *)_bottoms[1]->cpu_data(); + + int num_rois = _bottoms[1]->shape[2]; + int batch = _bottoms[0]->shape[0]; + int channel = 
_bottoms[0]->shape[1]; + int height = _bottoms[0]->shape[2]; + int width = _bottoms[0]->shape[3]; + + for (int b = 0; b < batch; ++b) { + auto batch_rois = rois + _bottoms[1]->offset(b); + auto batch_top_data = top_data + _tops[0]->offset(b * num_rois); + for (int n = 0; n < num_rois; ++n) { + int roi_batch_ind = batch_rois[0]; + int roi_start_w = std::round(batch_rois[1] * spatial_scale); + int roi_start_h = std::round(batch_rois[2] * spatial_scale); + int roi_end_w = std::round(batch_rois[3] * spatial_scale); + int roi_end_h = std::round(batch_rois[4] * spatial_scale); + assert(roi_batch_ind < batch); + + int roi_height = std::max(roi_end_h - roi_start_h + 1, 1); + int roi_width = std::max(roi_end_w - roi_start_w + 1, 1); + const float bin_size_h = static_cast(roi_height) + / static_cast(pooled_h); + const float bin_size_w = static_cast(roi_width) + / static_cast(pooled_w); + + float* batch_data = data + roi_batch_ind * channel * height * width; + + for (int c = 0; c < channel; ++c) { + for (int ph = 0; ph < pooled_h; ++ph) { + for (int pw = 0; pw < pooled_w; ++pw) { + // Compute pooling region for this output unit: + // start (included) = floor(ph * roi_height / pooled_height_) + // end (excluded) = ceil((ph + 1) * roi_height / pooled_height_) + int hstart = static_cast(std::floor(static_cast(ph) + * bin_size_h)); + int wstart = static_cast(std::floor(static_cast(pw) + * bin_size_w)); + int hend = static_cast(std::ceil(static_cast(ph + 1) + * bin_size_h)); + int wend = static_cast(std::ceil(static_cast(pw + 1) + * bin_size_w)); + + hstart = std::min(std::max(hstart + roi_start_h, 0), height); + hend = std::min(std::max(hend + roi_start_h, 0), height); + wstart = std::min(std::max(wstart + roi_start_w, 0), width); + wend = std::min(std::max(wend + roi_start_w, 0), width); + + bool is_empty = (hend <= hstart) || (wend <= wstart); + + const int pool_index = ph * pooled_w + pw; + if (is_empty) { + batch_top_data[pool_index] = 0; + } + + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + const int index = h * width + w; + if (batch_data[index] > batch_top_data[pool_index]) { + batch_top_data[pool_index] = batch_data[index]; + } + } + } + } + } + batch_data += height * width; + batch_top_data += pooled_h * pooled_w; + } + batch_rois += 5; + } + } +} + +} +} \ No newline at end of file diff --git a/cviruntime/src/common/cpu_function/roi_pooling.hpp b/cviruntime/src/common/cpu_function/roi_pooling.hpp new file mode 100644 index 000000000..e31e019d7 --- /dev/null +++ b/cviruntime/src/common/cpu_function/roi_pooling.hpp @@ -0,0 +1,35 @@ +#include +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +class ROIPoolingFunc : public ICpuFunction { + +public: + ROIPoolingFunc() {} + + ~ROIPoolingFunc(); + void setup(tensor_list_t &inputs, + tensor_list_t &outputs, + OpParam ¶m); + void run(); + + static ICpuFunction *open() { return new ROIPoolingFunc(); } + static void close(ICpuFunction *func) { delete func; } + +private: + tensor_list_t _bottoms; + tensor_list_t _tops; + + int pooled_h; + int pooled_w; + float spatial_scale; +}; + +} +} diff --git a/cviruntime/src/common/cpu_function/softmax.cpp b/cviruntime/src/common/cpu_function/softmax.cpp new file mode 100644 index 000000000..a7f15db52 --- /dev/null +++ b/cviruntime/src/common/cpu_function/softmax.cpp @@ -0,0 +1,81 @@ +#include +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +SoftmaxFunc::~SoftmaxFunc() { + if (_max) + delete[] _max; 
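Stepping back to `ROIPoolingFunc::run` above: the `floor`/`ceil` bin edges intentionally let adjacent pooling bins overlap whenever the RoI height is not a multiple of `pooled_h`. A small standalone check makes the bin layout visible (the sizes are illustrative):

```cpp
// Print the [hstart, hend) row range of each pooling bin for an RoI of
// height 7 pooled down to 6 rows; note the overlapping middle bins.
#include <cmath>
#include <cstdio>

int main() {
  int roi_height = 7, pooled_h = 6;
  float bin_size_h = (float)roi_height / pooled_h;
  for (int ph = 0; ph < pooled_h; ++ph) {
    int hstart = (int)std::floor(ph * bin_size_h);
    int hend = (int)std::ceil((ph + 1) * bin_size_h);
    std::printf("bin %d: rows [%d, %d)\n", ph, hstart, hend);
  }
  return 0;
}
```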
+ if (_sum) + delete[] _sum; +} + +void SoftmaxFunc::setup(tensor_list_t &inputs, + tensor_list_t &outputs, + OpParam ¶m) { + _bottom = inputs[0]; + _top = outputs[0]; + _axis = param.get("axis"); + //assert(_axis == 1); + assert(_axis >= 0); + auto shape = _bottom->shape; + _axis = _axis % shape.size(); + + _n = 1; + for(int i = 0; i < _axis; ++i) { + _n *= shape[i]; + } + + _inner_dim = 1; + for(size_t i = _axis+1; i < shape.size(); ++i) { + _inner_dim *= shape[i]; + } + + _c = shape[_axis]; + _dim = _c * _inner_dim; + + _max = new float[_inner_dim]; + _sum = new float[_inner_dim]; +} + +void SoftmaxFunc::run() { + auto bottom_data = _bottom->cpu_data(); + auto top_data = _top->cpu_data(); + + for (int i = 0; i < _n; ++i) { + memcpy(_max, bottom_data, _inner_dim * sizeof(float)); + memset(_sum, 0, _inner_dim * sizeof(float)); + // find max value accross channel + int c_offset = i * _dim; + for (int j = 0; j < _c; ++j, c_offset += _inner_dim) { + for (int k = 0; k < _inner_dim; k++) { + if (_max[k] < bottom_data[c_offset + k]) + _max[k] = bottom_data[c_offset + k]; + } + } + + // calculate exp(x) + c_offset = i * _dim; + for (int j = 0; j < _c; ++j, c_offset += _inner_dim) { + for (int k = 0; k < _inner_dim; k++) { + top_data[c_offset + k] = std::exp(bottom_data[c_offset + k] - _max[k]); + _sum[k] += top_data[c_offset + k]; + } + } + + c_offset = i * _dim; + for (int j = 0; j < _c; ++j, c_offset += _inner_dim) { + for (int k = 0; k < _inner_dim; k++) { + top_data[c_offset + k] /= _sum[k]; + } + } + } +} + +} +} diff --git a/cviruntime/src/common/cpu_function/softmax.hpp b/cviruntime/src/common/cpu_function/softmax.hpp new file mode 100644 index 000000000..3a7338291 --- /dev/null +++ b/cviruntime/src/common/cpu_function/softmax.hpp @@ -0,0 +1,37 @@ +#include +#include +#include +#include + + +namespace cvi { +namespace runtime { + +class SoftmaxFunc : public ICpuFunction { + +public: + SoftmaxFunc() {}; + ~SoftmaxFunc(); + void setup(tensor_list_t &inputs, + tensor_list_t &outputs, + OpParam ¶m); + void run(); + + static ICpuFunction *open() { return new SoftmaxFunc(); } + static void close(ICpuFunction *func) { delete func; } + +private: + std::shared_ptr _bottom; + std::shared_ptr _top; + int _axis; + int _inner_dim; + int _dim; + int _c; + int _n; + float *_max = nullptr; + float *_sum = nullptr; + +}; + +} +} diff --git a/cviruntime/src/common/cpu_function/ssd_detection.cpp b/cviruntime/src/common/cpu_function/ssd_detection.cpp new file mode 100644 index 000000000..20f880b81 --- /dev/null +++ b/cviruntime/src/common/cpu_function/ssd_detection.cpp @@ -0,0 +1,958 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef __ARM_NEON +#include +#include +#include "arm_neon.h" +#endif /* ifdef __ARM_NEON */ + + +namespace cvi { +namespace runtime { + +bool SortScoreCmp0(const std::pair &pair1, + const std::pair &pair2) { + return pair1.first > pair2.first; +} + +bool SortScoreCmp1(const std::pair> &pair1, + const std::pair> &pair2) { + return pair1.first > pair2.first; +} +//#undef __ARM_NEON +#ifdef __ARM_NEON +typedef union { + int ival; + unsigned int uval; + float fval; +} unitype; + +const float __expf_rng[2] = { + 1.442695041f, + 0.693147180f +}; + +const float __expf_lut[8] = { + 0.9999999916728642, //p0 + 0.04165989275009526, //p4 + 0.5000006143673624, //p2 + 0.0014122663401803872, //p6 + 1.000000059694879, //p1 + 0.008336936973260111, //p5 + 0.16666570253074878, //p3 + 0.00019578093328483123 //p7 +}; + +float expf_c(float x) +{ + 
float a, b, c, d, xx; + int m; + + union { + float f; + int i; + } r; + + //Range Reduction: + m = (int) (x * __expf_rng[0]); + x = x - ((float) m) * __expf_rng[1]; + + //Taylor Polynomial (Estrins) + a = (__expf_lut[4] * x) + (__expf_lut[0]); + b = (__expf_lut[6] * x) + (__expf_lut[2]); + c = (__expf_lut[5] * x) + (__expf_lut[1]); + d = (__expf_lut[7] * x) + (__expf_lut[3]); + xx = x * x; + a = a + b * xx; + c = c + d * xx; + xx = xx* xx; + r.f = a + c * xx; + + //multiply by 2 ^ m + m = m << 23; + r.i = r.i + m; + + return r.f; +} + +// fast exp +float expf_neon_sfp(float x) +{ + return expf_c(x); +} + +static inline bool is_background_cls(int cls, int background_label_id) { + return cls == background_label_id; +} + +static inline bool is_share_loc_background_cls(int cls, int background_label_id) { + return false; +} + +// get loc and decode bbox info in one for loop +void GetLocBBox_opt( + std::vector>, std::vector* >>> *all_conf_scores, + const float *loc_data, const float *prior_data, + const int num, const int num_priors, const int num_loc_classes, + const bool share_location, const int num_classes, + const int background_label_id, const CodeType code_type, + const bool variance_encoded_in_target, int top_k, + std::vector *all_decode_bboxes + ) { + + assert(code_type == PriorBoxParameter_CodeType_CENTER_SIZE); + bool (*check_background)(int, int) = is_background_cls; + + if (share_location) { + assert(num_loc_classes == 1); + check_background = is_share_loc_background_cls; + // return const, it should be remove check background's branch + } + + for (int i = 0; i < num; ++i) { + // save valid decode + std::map>> indices; + LabelBBox_l &decode_bboxes = (*all_decode_bboxes)[i]; + std::vector decode_keep_index((*all_conf_scores)[i].size() * top_k); + int decode_keep_index_cnt = 0; + + // collect all decode index + for (auto it = (*all_conf_scores)[i].begin(); it != (*all_conf_scores)[i].end(); it++) { + std::vector> &scores = it->second.first; + auto c = it->first; + + if (check_background(c, background_label_id)) { + // Ignore background class. + continue; + } + + // init for share position + it->second.second = &(decode_bboxes[-1]); + + // sort by top_k, align cmodel + if (top_k < (int)scores.size()) { + std::partial_sort(scores.begin(), scores.begin() + top_k, scores.end(), + SortScoreCmp0); + } else { + std::sort(scores.begin(), scores.end(), SortScoreCmp0); + } + + int length = std::min(top_k, (int)scores.size()); + + for (int k = 0; k < length; ++k) { + // later get index + scores[k].second /= num_classes; + decode_keep_index[decode_keep_index_cnt] = (scores[k].second); + decode_keep_index_cnt++; + } + } + + for (int c = 0; c < num_loc_classes; ++c) { + int label = c; + std::vector *p = &(decode_bboxes[label]); + if (share_location) { + label = -1; + p = &(decode_bboxes[label]); + assert (label != background_label_id); + } + else { + if (label == background_label_id) { + // Ignore background class. 
+ continue; + } + + auto all_conf_score = (*all_conf_scores)[i].find(c); + if (all_conf_score == (*all_conf_scores)[i].end()) { + continue; + } + + std::vector *p = &(decode_bboxes[label]); + all_conf_score->second.second = p; + } + + p->resize(num_priors); // assing max + int sz = decode_keep_index_cnt; + + auto init_bbox = [=] (int k, int idx, BBox_l* decode_bbox) mutable -> void { + // prior_bboxes + int start_idx = k * 4; + const float *p0 = prior_data + start_idx; + const float *p1 = prior_data + start_idx + 4 * num_priors; + + // get prior_width / prior_height / prior_center_x / prior_center_y + float32x4_t _v1 = {p0[2], p0[3], p0[0], p0[1]}; + float32x4_t _v2 = {-p0[0], -p0[1], p0[2], p0[3]}; + float32x4_t _v3 = {1, 1, 0.5, 0.5}; + float32x4_t sum = vaddq_f32(_v1, _v2); + float32x4_t prod = vmulq_f32(sum, _v3); + + assert(prod[0] > 0); // prior_width + assert(prod[1] > 0); // prior_height + + float _p1[4]; + memcpy(_p1, p1, sizeof(float) * 4); + + // opt CENTER_SIZE + if (variance_encoded_in_target) { + // variance is encoded in target, we simply need to retore the offset + // predictions. + _p1[0] = _p1[1] = _p1[2] = _p1[3] = 1; + } + //else { + // // variance is encoded in bbox, we need to scale the offset accordingly. + //} + + // get decode_bbox_center_x/decode_bbox_center_y/decode_bbox_width/decode_bbox_height + int shift = k * num_loc_classes * 4 + c * 4; + auto xmin = _p1[0] * loc_data[shift]; + auto ymin = _p1[1] * loc_data[shift + 1]; + auto xmax = loc_data[shift + 2]; + auto ymax = loc_data[shift + 3]; + + + float _decode_bbox_width, _decode_bbox_height; + _decode_bbox_width = _p1[2] * xmax; + _decode_bbox_height = _p1[3] * ymax; + _decode_bbox_width = expf_neon_sfp(_decode_bbox_width); + _decode_bbox_height = expf_neon_sfp(_decode_bbox_height); + + // decode bbox, please refer \DecodeBBoxesAll_opt + float32x4_t v1 = {xmin, ymin, _decode_bbox_width, _decode_bbox_height}; + float32x4_t v2 = {prod[0], prod[1], prod[0], prod[1]}; + float32x4_t v3 = {prod[2], prod[3], 0, 0}; + float32x4_t acc = vmlaq_f32(v3, v1, v2); // acc = v3 + v1 * v2 + + float32x4_t v4 = {acc[0], acc[1], acc[0], acc[1]}; + float32x4_t v5 = {-acc[2], -acc[3], acc[2], acc[3]}; + float32_t s = 0.5; + + // directly store back to xmin/... 
info + vst1q_f32(decode_bbox->xy.b, vmlaq_n_f32(v4, v5, s)); + + decode_bbox->CalcSize(); + }; + + // TODO: try to leverage openmp + { +#define ADD_DECODE_BBOX(idx) \ + init_bbox (decode_keep_index[idx], idx, &((*p)[decode_keep_index[idx]])); + + for (int _k = 0; _k < sz; _k++) { + ADD_DECODE_BBOX(_k); + } + } + } + loc_data += num_priors * num_loc_classes * 4; + } +} + +void inline GetConfidenceScores( + const float *conf_data, const int num, const int num_preds_per_class, + const int num_classes, const float score_threshold, + const int background_label_id, const bool share_location, + std::vector>, std::vector* >>> *conf_preds) { + + // we compare float as integer + assert(score_threshold > 0); + + // exclude background 0: exclude, 1: include + int skip_backgroud = !share_location; // accord to normal implement background always skipped + // assert (background_label_id == 0); + + auto all_classes = num_preds_per_class * num_classes; + unitype t; + t.fval = score_threshold; + + for (int i = 0; i < num; i++) { + std::map>, std::vector*>> &label_scores = (*conf_preds)[i]; + + // later handle bbox idx + unitype* _conf_data = (unitype*)conf_data; + // measure neon/unroll performance + //struct timeval net_fwd_time_t0; + //gettimeofday(&net_fwd_time_t0, NULL); +#if 0 + unitype v; + t.fval = score_threshold; + float32x4_t thes = vmovq_n_f32(score_threshold); + for (int i = 0; i < all_classes; i+=4) { + // vector compare less than + float32x4_t v1 = vld1q_f32(conf_data + i); + + if (vmaxvq_f32(v1) < t.fval) { + continue; + } + uint32x4_t vcon = vcgtq_f32(v1, thes); +#define SELECT_SCORE(idx) \ + do {\ + if (vcon[idx] == 0xffffffff) label_scores[(i+idx) % num_classes].first.emplace_back(std::make_pair(conf_data[i+idx], (i+idx))); \ + } while (0); + + SELECT_SCORE(0); + SELECT_SCORE(1); + SELECT_SCORE(2); + SELECT_SCORE(3); + } +#else + // unroll for not stall once cache miss + // 8 is magic number that we try the best unroll loops + int unroll_cnt = 8; +#define SELECT_SCORE(idx) \ + do {\ + unitype v = _conf_data[(j+idx)];\ + if (v.ival > t.ival) { \ + int c = (j+idx) % num_classes;\ + if ((c - background_label_id) || skip_backgroud) label_scores[c].first.emplace_back(std::make_pair(v.fval, (j+idx) )); \ + } \ + } while (0); + + int j = 0; + for (; j < all_classes - unroll_cnt; j+=unroll_cnt) { + // uint32 with less compare instruction + //if (score > score_threshold) + SELECT_SCORE(0); + SELECT_SCORE(1); + SELECT_SCORE(2); + SELECT_SCORE(3); + SELECT_SCORE(4); + SELECT_SCORE(5); + SELECT_SCORE(6); + SELECT_SCORE(7); + } + + // deal with residule + for (; j < all_classes ; j++) { + SELECT_SCORE(0); + } +#endif + // next batch + conf_data += num_preds_per_class * num_classes; + } +} + +static void ApplyNMSFast(std::vector *bboxes, + const std::vector> &conf_score, + const float nms_threshold, const float eta, int top_k, + int label, + std::vector*>>> &score_index_pairs, + int* det_num) { + // Do nms. + float adaptive_threshold = nms_threshold; + int i = 0; + int indices_sz = 0; + int offset = score_index_pairs.size(); + int length = (top_k < (int)conf_score.size()) ? 
top_k : conf_score.size(); + while (length != i) { + bool keep = true; + for (int k = 0; k < indices_sz; ++k) { + if (keep) { + int kept_idx = std::get<1>(score_index_pairs[k + offset].second); + const BBox_l &b1 = (*bboxes)[conf_score[i].second]; + const BBox_l &b2 = (*bboxes)[kept_idx]; + + if (b2.xy.s.xmin > b1.xy.s.xmax || b2.xy.s.xmax < b1.xy.s.xmin || b2.xy.s.ymin > b1.xy.s.ymax || + b2.xy.s.ymax < b1.xy.s.ymin) { + keep = true; + } else { + const float inter_xmin = std::max(b1.xy.s.xmin, b2.xy.s.xmin); + const float inter_ymin = std::max(b1.xy.s.ymin, b2.xy.s.ymin); + const float inter_xmax = std::min(b1.xy.s.xmax, b2.xy.s.xmax); + const float inter_ymax = std::min(b1.xy.s.ymax, b2.xy.s.ymax); + const float inter_width = inter_xmax - inter_xmin; + const float inter_height = inter_ymax - inter_ymin; + const float inter_size = inter_width * inter_height; + const float total_size = b1.size + b2.size; + keep = + (inter_size * (adaptive_threshold + 1) <= total_size * adaptive_threshold) + ? true + : false; + } + } else { + break; + } + } + + if (keep) { + // preserve + score_index_pairs.emplace_back(std::make_pair( + conf_score[i].first, std::make_tuple(label, conf_score[i].second, bboxes))); + indices_sz++; + } + + if (keep && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + i++; + } + (*det_num) = indices_sz; +} + + + +void SSDDetectionFunc::neon_run(float* top_data, bool variance_encoded_in_target_, + int num_loc_classes, float eta_, Decode_CodeType code_type_) { + int num = _bottoms[0]->shape[0]; + int num_priors_ = _bottoms[2]->shape[2] / 4; + + float *loc_data = _bottoms[0]->cpu_data(); + float *conf_data = _bottoms[1]->cpu_data(); + float *prior_data = reinterpret_cast(_bottoms[2]->cpu_data()); + + std::vector>, std::vector* >>> all_conf_scores; + all_conf_scores.resize(num); + // select score + GetConfidenceScores(conf_data, num, num_priors_, _num_classes, _obj_threshold, + _background_label_id, _share_location, &all_conf_scores); + + std::vector all_decode_bboxes(num); + + // get box location + GetLocBBox_opt(&all_conf_scores, loc_data, prior_data, num, num_priors_, + num_loc_classes, _share_location, _num_classes, + _background_label_id, code_type_, variance_encoded_in_target_, + _top_k, &all_decode_bboxes); + + int num_shift = 0; + for (int i = 0; i < num; ++i) { + std::map>, std::vector* >> &conf_scores = + all_conf_scores[i]; + int num_det = 0; + + // we keep bbox point for reduce search + std::vector*>>> score_index_pairs; + // init and collect count in each label + // pair store + std::map> new_indices_cnt; + + bool (*check_background)(int, int) = is_background_cls; + if (_share_location) { + check_background = is_share_loc_background_cls; + // return const, it should be remove check background's branch + } + + for (auto it = conf_scores.begin(); it != conf_scores.end(); it++) { + int c = it->first; + + if (check_background(c, _background_label_id)) { + // Ignore background class. + continue; + } + + std::vector *bboxes = it->second.second; + const std::vector> &aa = it->second.first; + + // get property nms + ApplyNMSFast(bboxes, aa, _nms_threshold, eta_, _top_k, c, score_index_pairs, + &new_indices_cnt[c].first); + } + + num_det = score_index_pairs.size(); + int sz = num_det; + + if (_keep_topk > -1 && num_det > _keep_topk) { + // Keep top k results per image. 
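`GetConfidenceScores` above filters scores by comparing raw `int32` bit patterns (`unitype::ival`) rather than floats. This is valid because non-negative IEEE-754 floats have the same ordering as their bit patterns reinterpreted as signed integers, which is exactly why the function asserts `score_threshold > 0` first. A quick standalone check of the property:

```cpp
// For non-negative IEEE-754 floats, integer ordering of the bit patterns
// matches float ordering, so a > thr can be tested as bits(a) > bits(thr).
#include <cstdint>
#include <cstdio>
#include <cstring>

static int32_t bits(float f) {
  int32_t i;
  std::memcpy(&i, &f, sizeof(i));  // well-defined type pun
  return i;
}

int main() {
  float a = 0.25f, b = 0.5f, thr = 0.3f;
  std::printf("%d %d\n", bits(a) > bits(thr), a > thr);  // 0 0
  std::printf("%d %d\n", bits(b) > bits(thr), b > thr);  // 1 1
  return 0;
}
```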
+ std::sort(score_index_pairs.begin(), score_index_pairs.end(), + [](const std::pair*>> &pair1, + const std::pair*>> &pair2) { + return pair1.first > pair2.first; + }); + + sz = _keep_topk; + + // reset it + for (auto it = new_indices_cnt.begin(); it != new_indices_cnt.end(); it++) { + it->second.first = 0; + } + + for (int j = 0; j < sz; ++j) { + int label = std::get<0>(score_index_pairs[j].second); + new_indices_cnt[label].first++; + } + } + + auto it = new_indices_cnt.begin(); + if (it != new_indices_cnt.end()) { + int first_cnt = it->second.first; + it->second.first = 0; // start with + + // calculate each label's distance + for (++it; it != new_indices_cnt.end(); ++it) { + int curr = it->second.first; + it->second.first = first_cnt; + first_cnt += curr; + } + } + + // try to continuous write + for (int j = 0; j < sz; ++j) { + + int label = std::get<0>(score_index_pairs[j].second); + int idx = std::get<1>(score_index_pairs[j].second); + float s = score_index_pairs[j].first; + std::vector* bboxes = std::get<2>(score_index_pairs[j].second); + + int _cnt = (new_indices_cnt[label].first + new_indices_cnt[label].second + num_shift) * 7; + (*bboxes)[idx].num = i; + (*bboxes)[idx].label = label; + (*bboxes)[idx].score = s; + // try to sequential write cus num/lable/score/boxs are continuously + memcpy(&top_data[_cnt], &((*bboxes)[idx]), sizeof(float) * 7); + + // add shift in its label + new_indices_cnt[label].second++; + } + + num_shift += sz; + } + + int output_size = num * _keep_topk * 1 * 1 * 7; + + // fill dummy to end for align cmodel + for (int i = num_shift * 7; i < output_size; ++i) { + top_data[i] = -1; + } +} +#else +#endif +void GetConfidenceScores_opt( + const float *conf_data, const int num, const int num_preds_per_class, + const int num_classes, const float score_threshold, + std::vector>>> *conf_preds) { + conf_preds->clear(); + conf_preds->resize(num); + + for (int i = 0; i < num; i++) { + std::map>> &label_scores = (*conf_preds)[i]; + for (int p = 0; p < num_preds_per_class; ++p) { + int start_idx = p * num_classes; + for (int c = 0; c < num_classes; ++c) { + if (conf_data[start_idx + c] > score_threshold) { + label_scores[c].push_back(std::make_pair(conf_data[start_idx + c], p)); + } + } + } + conf_data += num_preds_per_class * num_classes; + } +} + +void GetLocPredictions_opt(const float *loc_data, const int num, + const int num_preds_per_class, const int num_loc_classes, + const bool share_location, float *decode_index, + std::vector *loc_preds) { + loc_preds->clear(); + if (share_location) { + assert(num_loc_classes == 1); + } + loc_preds->resize(num); + float *decode_pos = decode_index; + for (int i = 0; i < num; ++i) { + if (share_location) { + decode_pos = decode_index + i * num_preds_per_class; + } + LabelBBox_l &label_bbox = (*loc_preds)[i]; + for (int p = 0; p < num_preds_per_class; ++p) { + int start_idx = p * num_loc_classes * 4; + for (int c = 0; c < num_loc_classes; ++c) { + if (!share_location) { + decode_pos = decode_index + num_preds_per_class * num_loc_classes * i + + num_preds_per_class * c; + } + int label = share_location ? 
-1 : c; + if (label_bbox.find(label) == label_bbox.end()) { + label_bbox[label].resize(num_preds_per_class); + } + if (decode_pos[p] != 1) { + continue; + } + label_bbox[label][p].xy.s.xmin = loc_data[start_idx + c * 4]; + label_bbox[label][p].xy.s.ymin = loc_data[start_idx + c * 4 + 1]; + label_bbox[label][p].xy.s.xmax = loc_data[start_idx + c * 4 + 2]; + label_bbox[label][p].xy.s.ymax = loc_data[start_idx + c * 4 + 3]; + } + } + loc_data += num_preds_per_class * num_loc_classes * 4; + } +} + +void DecodeBBoxesAll_opt(const std::vector &all_loc_preds, int num_priors, + const float *prior_data, const int num, + const bool share_location, const int num_loc_classes, + const int background_label_id, const CodeType code_type, + const bool variance_encoded_in_target, float *decode_index, + std::vector *all_decode_bboxes) { + assert(all_loc_preds.size() == (size_t)num); + all_decode_bboxes->clear(); + all_decode_bboxes->resize(num); + float *decode_pos = decode_index; + for (int i = 0; i < num; ++i) { + if (share_location) { + decode_pos = decode_index + i * num_priors; + } + // Decode predictions into bboxes. + for (int c = 0; c < num_loc_classes; ++c) { + int label = share_location ? -1 : c; + if (label == background_label_id) { + // Ignore background class. + continue; + } + if (all_loc_preds[i].find(label) == all_loc_preds[i].end()) { + //TPU_LOG_DEBUG("Could not find location predictions for label %d\n", label); + } + const std::vector &bboxes = all_loc_preds[i].find(label)->second; + LabelBBox_l &decode_bboxes = (*all_decode_bboxes)[i]; + std::vector *p = &(decode_bboxes[label]); + p->clear(); + + if (!share_location) { + decode_pos = decode_index + num_priors * num_loc_classes * i + num_priors * c; + } + for (int k = 0; k < num_priors; ++k) { + // NormalizedBBox decode_bbox; + BBox_l decode_bbox; + if (decode_pos[k] != 1) { + p->push_back(decode_bbox); + continue; + } + // opt CENTER_SIZE + assert(code_type == PriorBoxParameter_CodeType_CENTER_SIZE); + // prior_bboxes + int start_idx = k * 4; + const float *p0 = prior_data + start_idx; + const float *p1 = prior_data + start_idx + 4 * num_priors; + float prior_width = p0[2] - p0[0]; + assert(prior_width > 0); + float prior_height = p0[3] - p0[1]; + assert(prior_height > 0); + float prior_center_x = (p0[0] + p0[2]) * 0.5; + float prior_center_y = (p0[1] + p0[3]) * 0.5; + + float decode_bbox_center_x, decode_bbox_center_y; + float decode_bbox_width, decode_bbox_height; + if (variance_encoded_in_target) { + // variance is encoded in target, we simply need to retore the offset + // predictions. + decode_bbox_center_x = bboxes[k].xy.s.xmin * prior_width + prior_center_x; + decode_bbox_center_y = bboxes[k].xy.s.ymin * prior_height + prior_center_y; + decode_bbox_width = std::exp(bboxes[k].xy.s.xmax) * prior_width; + decode_bbox_height = std::exp(bboxes[k].xy.s.ymax) * prior_height; + } else { + // variance is encoded in bbox, we need to scale the offset accordingly. 
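For reference, the CENTER_SIZE decode performed in this branch written out as a self-contained calculation: the prior box supplies `(cx, cy, w, h)`, the network emits deltas `(tx, ty, tw, th)`, and each delta is scaled by its per-coordinate variance unless `variance_encoded_in_target` is set (all numeric values below are illustrative):

```cpp
// Decode one SSD box: center/size deltas scaled by prior-box variances.
#include <cmath>
#include <cstdio>

int main() {
  float px1 = 10, py1 = 10, px2 = 50, py2 = 30;  // prior corners
  float v[4] = {0.1f, 0.1f, 0.2f, 0.2f};         // per-coordinate variances
  float t[4] = {0.5f, -0.25f, 0.2f, -0.1f};      // network deltas

  float pw = px2 - px1, ph = py2 - py1;
  float pcx = 0.5f * (px1 + px2), pcy = 0.5f * (py1 + py2);

  float cx = v[0] * t[0] * pw + pcx;
  float cy = v[1] * t[1] * ph + pcy;
  float w  = std::exp(v[2] * t[2]) * pw;
  float h  = std::exp(v[3] * t[3]) * ph;

  std::printf("decoded box: [%g, %g, %g, %g]\n",
              cx - 0.5f * w, cy - 0.5f * h, cx + 0.5f * w, cy + 0.5f * h);
  return 0;
}
```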
+ decode_bbox_center_x = p1[0] * bboxes[k].xy.s.xmin * prior_width + prior_center_x; + decode_bbox_center_y = p1[1] * bboxes[k].xy.s.ymin * prior_height + prior_center_y; + decode_bbox_width = std::exp(p1[2] * bboxes[k].xy.s.xmax) * prior_width; + decode_bbox_height = std::exp(p1[3] * bboxes[k].xy.s.ymax) * prior_height; + } + decode_bbox.xy.s.xmin = decode_bbox_center_x - decode_bbox_width * 0.5; + decode_bbox.xy.s.ymin = decode_bbox_center_y - decode_bbox_height * 0.5; + decode_bbox.xy.s.xmax = decode_bbox_center_x + decode_bbox_width * 0.5; + decode_bbox.xy.s.ymax = decode_bbox_center_y + decode_bbox_height * 0.5; + decode_bbox.CalcSize(); + p->push_back(decode_bbox); + } + } + } +} + +void SSDDetectionFunc::ApplyNMSFast_opt(const std::vector &bboxes, + const std::vector> &conf_score, + const float nms_threshold, const float eta, int top_k, + std::vector> *indices) { + // Do nms. + float adaptive_threshold = nms_threshold; + int i = 0; + int length = (top_k < (int)conf_score.size()) ? top_k : conf_score.size(); + while (length != i) { + bool keep = true; + for (int k = 0; k < (int)indices->size(); ++k) { + if (keep) { + const int kept_idx = (*indices)[k].second; + const BBox_l &b1 = bboxes[conf_score[i].second]; + const BBox_l &b2 = bboxes[kept_idx]; + + if (b2.xy.s.xmin > b1.xy.s.xmax || b2.xy.s.xmax < b1.xy.s.xmin || b2.xy.s.ymin > b1.xy.s.ymax || + b2.xy.s.ymax < b1.xy.s.ymin) { + keep = true; + } else { + const float inter_xmin = std::max(b1.xy.s.xmin, b2.xy.s.xmin); + const float inter_ymin = std::max(b1.xy.s.ymin, b2.xy.s.ymin); + const float inter_xmax = std::min(b1.xy.s.xmax, b2.xy.s.xmax); + const float inter_ymax = std::min(b1.xy.s.ymax, b2.xy.s.ymax); + const float inter_width = inter_xmax - inter_xmin; + const float inter_height = inter_ymax - inter_ymin; + const float inter_size = inter_width * inter_height; + const float total_size = b1.size + b2.size; + keep = + (inter_size * (adaptive_threshold + 1) <= total_size * adaptive_threshold) + ? 
true
+                   : false;
+        }
+      } else {
+        break;
+      }
+    }
+    if (keep) {
+      indices->push_back(conf_score[i]);
+    }
+    if (keep && eta < 1 && adaptive_threshold > 0.5) {
+      adaptive_threshold *= eta;
+    }
+    i++;
+  }
+}
+
+SSDDetectionFunc::~SSDDetectionFunc() {}
+
+void SSDDetectionFunc::setup(tensor_list_t &inputs,
+                             tensor_list_t &outputs,
+                             OpParam &param) {
+  _num_classes = param.get<int32_t>("num_classes");
+  _share_location = param.get<bool>("share_location");
+  _background_label_id = param.get<int32_t>("background_label_id");
+  _code_type = param.get<std::string>("code_type");
+  _top_k = param.get<int32_t>("top_k");
+  _nms_threshold = param.get<float>("nms_threshold");
+  _obj_threshold = param.get<float>("confidence_threshold");
+  _keep_topk = param.get<int32_t>("keep_top_k");
+
+  // location  : mbox_loc      [1, prior_num * 4]
+  // priorbox  : mbox_priorbox [1, 2, prior_num * 4]
+  // confidence: mbox_conf     [1, prior_num * class_num]
+  // We can't decide the input order by shape alone,
+  // so still use the names to arrange the inputs.
+  std::vector<std::string> names = {"mbox_loc", "mbox_conf", "mbox_priorbox"};
+
+  for (auto name : names) {
+    for (auto input : inputs) {
+      if (input->name.find(name) != std::string::npos) {
+        _bottoms.emplace_back(input);
+        break;
+      }
+    }
+  }
+  if (_bottoms.size() != 3) {
+    _bottoms.clear();
+    for (auto input : inputs) {
+      _bottoms.emplace_back(input);
+    }
+  }
+  _tops = outputs;
+}
+
+void SSDDetectionFunc::run() {
+  //struct timeval net_fwd_time_t0;
+  //gettimeofday(&net_fwd_time_t0, NULL);
+  auto top_data = _tops[0]->cpu_data();
+
+  size_t bottom_count = _bottoms.size();
+  assert(bottom_count == 3);
+
+  int num_loc_classes = _share_location ? 1 : _num_classes;
+  float eta_ = 1.0;
+  Decode_CodeType code_type_;
+  if (_code_type == "CORNER") {
+    code_type_ = PriorBoxParameter_CodeType_CORNER;
+  } else if (_code_type == "CENTER_SIZE") {
+    code_type_ = PriorBoxParameter_CodeType_CENTER_SIZE;
+  } else if (_code_type == "CORNER_SIZE") {
+    code_type_ = PriorBoxParameter_CodeType_CORNER_SIZE;
+  } else {
+    assert(0);
+  }
+
+  //TPU_LOG_DEBUG("priorbox_size: %zu\n", _bottoms[2]->shape.size());
+  //TPU_LOG_DEBUG("n = %d, c = %d, h = %d, w = %d\n",
+  //              _bottoms[2]->shape[0], _bottoms[2]->shape[1],
+  //              _bottoms[2]->shape[2], _bottoms[2]->shape[3]);
+
+  bool variance_encoded_in_target_ = false;
+
+#ifdef __ARM_NEON
+  neon_run(top_data, variance_encoded_in_target_, num_loc_classes, eta_, code_type_);
+
+  return;
+#else
+#endif
+  memset(top_data, 0, _tops[0]->size());
+
+  int num = _bottoms[0]->shape[0]; // batch_size
+  int num_priors_ = _bottoms[2]->shape[2] / 4;
+
+  float *loc_data = _bottoms[0]->cpu_data();
+  float *conf_data = _bottoms[1]->cpu_data();
+  float *prior_data = reinterpret_cast<float *>(_bottoms[2]->cpu_data());
+
+  std::vector<std::map<int, std::vector<std::pair<float, int>>>> all_conf_scores;
+  // filter results by score; return a batched vector
+  // [{label0: [(score, detection_idx), ...], ...}, ...]
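+  // The remaining steps then decode only the kept priors against the prior
+  // boxes, run per-class NMS, clip to keep_top_k across classes, and emit
+  // rows of [image_id, label, score, xmin, ymin, xmax, ymax], essentially
+  // Caffe SSD's DetectionOutput layer on the CPU.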
+  GetConfidenceScores_opt(conf_data, num, num_priors_, _num_classes, _obj_threshold,
+                          &all_conf_scores);
+  for (int i = 0; i < num; ++i) {
+    for (int c = 0; c < _num_classes; ++c) {
+      if (all_conf_scores[i].find(c) == all_conf_scores[i].end()) {
+        continue;
+      }
+      std::vector<std::pair<float, int>> &scores = all_conf_scores[i].find(c)->second;
+      // get the top-k in each class
+      if (_top_k < (int)scores.size()) {
+        std::partial_sort(scores.begin(), scores.begin() + _top_k, scores.end(),
+                          SortScoreCmp0);
+      } else {
+        std::sort(scores.begin(), scores.end(), SortScoreCmp0);
+      }
+    } // num of classes
+  }   // batch
+
+  // build keep flags for decoding, recording the valid indices
+  float *decode_keep_index;
+  int buf_length = 0;
+  if (_share_location) {
+    buf_length = num * num_priors_;
+  } else {
+    buf_length = num * num_priors_ * _num_classes;
+  }
+  decode_keep_index = new float[buf_length];
+  memset(decode_keep_index, 0, buf_length * sizeof(float));
+  float *p = decode_keep_index;
+  for (int i = 0; i < num; ++i) {
+    if (_share_location) {
+      p = decode_keep_index + num_priors_ * i;
+    }
+    for (int c = 0; c < _num_classes; ++c) {
+      if (!_share_location) {
+        p = decode_keep_index + num_priors_ * _num_classes * i + num_priors_ * c;
+      }
+      if (c == _background_label_id) {
+        // Ignore background class.
+        continue;
+      }
+
+      if (all_conf_scores[i].find(c) == all_conf_scores[i].end())
+        continue;
+      std::vector<std::pair<float, int>> &scores = all_conf_scores[i].find(c)->second;
+      int length = _top_k < (int)scores.size() ? _top_k : scores.size();
+      for (int k = 0; k < length; ++k) {
+        p[scores[k].second] = 1;
+      }
+    }
+  }
+
+  // Retrieve all location predictions.
+  std::vector<LabelBBox_l> all_loc_preds;
+  GetLocPredictions_opt(loc_data, num, num_priors_, num_loc_classes, _share_location,
+                        decode_keep_index, &all_loc_preds);
+
+  // Decode all loc predictions to bboxes.
+  std::vector<LabelBBox_l> all_decode_bboxes;
+  DecodeBBoxesAll_opt(all_loc_preds, num_priors_, prior_data, num, _share_location,
+                      num_loc_classes, _background_label_id, code_type_,
+                      variance_encoded_in_target_, decode_keep_index, &all_decode_bboxes);
+  delete[] decode_keep_index;
+
+  int num_kept = 0;
+  std::vector<std::map<int, std::vector<std::pair<float, int>>>> all_indices;
+  for (int i = 0; i < num; ++i) {
+    const LabelBBox_l &decode_bboxes = all_decode_bboxes[i];
+    const std::map<int, std::vector<std::pair<float, int>>> &conf_scores =
+        all_conf_scores[i];
+    std::map<int, std::vector<std::pair<float, int>>> indices;
+    int num_det = 0;
+    for (int c = 0; c < _num_classes; ++c) {
+      if (c == _background_label_id) {
+        // Ignore background class.
+        continue;
+      }
+      if (conf_scores.find(c) == conf_scores.end())
+        continue;
+      int label = _share_location ? -1 : c;
+      if (decode_bboxes.find(label) == decode_bboxes.end()) {
+        // Something bad happened if there are no predictions for the current label.
+        continue;
+      }
+      const std::vector<BBox_l> &bboxes = decode_bboxes.find(label)->second;
+      const std::vector<std::pair<float, int>> &aa = conf_scores.find(c)->second;
+      ApplyNMSFast_opt(bboxes, aa, _nms_threshold, eta_, _top_k, &(indices[c]));
+
+      num_det += indices[c].size();
+    }
+
+    if (_keep_topk > -1 && num_det > _keep_topk) {
+      std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
+      for (auto it = indices.begin(); it != indices.end(); ++it) {
+        int label = it->first;
+        const std::vector<std::pair<float, int>> &label_indices = it->second;
+        for (int j = 0; j < (int)label_indices.size(); ++j) {
+          score_index_pairs.emplace_back(std::make_pair(
+              label_indices[j].first, std::make_pair(label, label_indices[j].second)));
+        }
+      }
+      // Keep top k results per image.
+      std::sort(score_index_pairs.begin(), score_index_pairs.end(), SortScoreCmp1);
+      score_index_pairs.resize(_keep_topk);
+      // Store the new indices.
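+      // (score_index_pairs was globally sorted and truncated to _keep_topk,
+      // so detections of any class may be dropped here; num_kept advances by
+      // exactly _keep_topk in this branch.)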
+ std::map>> new_indices; + for (int j = 0; j < (int)score_index_pairs.size(); ++j) { + + int label = score_index_pairs[j].second.first; + int idx = score_index_pairs[j].second.second; + float s = score_index_pairs[j].first; + + new_indices[label].emplace_back(std::make_pair(s, idx)); + } + all_indices.emplace_back(new_indices); + num_kept += _keep_topk; + } else { + all_indices.emplace_back(indices); + num_kept += num_det; + } + } + + int output_size = num * _keep_topk * 1 * 1 * 7; + for (int i = 0; i < output_size; ++i) { + top_data[i] = -1; + } + + if (num_kept == 0) { + // Generate fake results per image. + for (int i = 0; i < num; ++i) { + top_data[0] = i; + top_data += 7; + } + } else { + int count = 0; + for (int i = 0; i < num; ++i) { + const LabelBBox_l &decode_bboxes = all_decode_bboxes[i]; + for (auto it = all_indices[i].begin(); it != all_indices[i].end(); ++it) { + int label = it->first; + int loc_label = _share_location ? -1 : label; + if (decode_bboxes.find(loc_label) == decode_bboxes.end()) { + // Something bad happened if there are no predictions for current label. + continue; + } + const std::vector &bboxes = decode_bboxes.find(loc_label)->second; + std::vector> &indices = it->second; + for (int j = 0; j < (int)indices.size(); ++j) { + int idx = indices[j].second; + top_data[count * 7] = i; + top_data[count * 7 + 1] = label; + top_data[count * 7 + 2] = indices[j].first; + const BBox_l &bbox = bboxes[idx]; + top_data[count * 7 + 3] = bbox.xy.s.xmin; + top_data[count * 7 + 4] = bbox.xy.s.ymin; + top_data[count * 7 + 5] = bbox.xy.s.xmax; + top_data[count * 7 + 6] = bbox.xy.s.ymax; + ++count; + } + } + } + } +} + +} // namespace runtime +} // namespace cvi diff --git a/cviruntime/src/common/cpu_function/ssd_detection.hpp b/cviruntime/src/common/cpu_function/ssd_detection.hpp new file mode 100644 index 000000000..c1314c3bd --- /dev/null +++ b/cviruntime/src/common/cpu_function/ssd_detection.hpp @@ -0,0 +1,85 @@ +#include +#include +#include +#include +#include +#include +#include + + +namespace cvi { +namespace runtime { + +enum Decode_CodeType { + PriorBoxParameter_CodeType_CORNER = 1, + PriorBoxParameter_CodeType_CENTER_SIZE = 2, + PriorBoxParameter_CodeType_CORNER_SIZE = 3 +}; + +class BBox_l { + public: + float num; + float label; + float score; + union { + struct { + float xmin; + float ymin; + float xmax; + float ymax; + } s; + float b[4]; //for neon used + } xy; + float size; + + void CalcSize() { + if (xy.s.xmax < xy.s.xmin || xy.s.ymax < xy.s.ymin) { + size = 0; + } else { + float width = xy.s.xmax - xy.s.xmin; + float height = xy.s.ymax - xy.s.ymin; + size = width * height; + } + } +}; + +typedef Decode_CodeType CodeType; +typedef std::map > LabelBBox_l; + +class SSDDetectionFunc : public ICpuFunction { + +public: + SSDDetectionFunc() {} + + ~SSDDetectionFunc(); + void setup(tensor_list_t &inputs, + tensor_list_t &outputs, + OpParam ¶m); + void run(); + void neon_run(float* top_data, bool variance_encoded_in_target_, + int num_loc_classes, float eta_, Decode_CodeType code_type_); + + static ICpuFunction *open() { return new SSDDetectionFunc(); } + static void close(ICpuFunction *func) { delete func; } + +private: + tensor_list_t _bottoms; + tensor_list_t _tops; + + void ApplyNMSFast_opt(const std::vector &bboxes, + const std::vector> &conf_score, + const float nms_threshold, const float eta, int top_k, + std::vector> *indices); + + int _num_classes; + bool _share_location{true}; + int _background_label_id; + int _top_k; + std::string _code_type; + float 
_nms_threshold; + float _obj_threshold; + int _keep_topk; +}; + +} +} diff --git a/cviruntime/src/common/cpu_function/transpose.cpp b/cviruntime/src/common/cpu_function/transpose.cpp new file mode 100644 index 000000000..e9b4f134f --- /dev/null +++ b/cviruntime/src/common/cpu_function/transpose.cpp @@ -0,0 +1,34 @@ +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +void TransposeFunc::setup(tensor_list_t &inputs, + tensor_list_t &outputs, + OpParam ¶m) { + (void)param; + _bottom = inputs[0]; + _top = outputs[0]; +} + +void TransposeFunc::run() { + int channel = _top->shape[1]; + int channel_size = _top->shape[2] * _top->shape[3]; + int image_size = channel * channel_size; + auto bottom_data = _bottom->cpu_data(); + auto top_data = _top->cpu_data(); + + for (int i = 0; i < _top->shape[0]; i++) { + for (int j = 0; j < image_size; j++) { + int x_idx = i * image_size + j; + int y_idx = i * image_size + (j % channel) * channel_size + j / channel; + top_data[y_idx] = bottom_data[x_idx]; + } + } +} + +} // namespace runtime +} // namespace cvi \ No newline at end of file diff --git a/cviruntime/src/common/cpu_function/transpose.hpp b/cviruntime/src/common/cpu_function/transpose.hpp new file mode 100644 index 000000000..a7ad86182 --- /dev/null +++ b/cviruntime/src/common/cpu_function/transpose.hpp @@ -0,0 +1,24 @@ +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +class TransposeFunc : public ICpuFunction { +public: + void setup(std::vector > &inputs, + std::vector > &outputs, + OpParam ¶m); + void run(); + static ICpuFunction *open() { return new TransposeFunc(); } + static void close(ICpuFunction *func) { delete func; } + +private: + std::shared_ptr _bottom; + std::shared_ptr _top; +}; + +} +} \ No newline at end of file diff --git a/cviruntime/src/common/cpu_function/yolo_detection.cpp b/cviruntime/src/common/cpu_function/yolo_detection.cpp new file mode 100644 index 000000000..529ae797b --- /dev/null +++ b/cviruntime/src/common/cpu_function/yolo_detection.cpp @@ -0,0 +1,358 @@ +#include +#include +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +#define MAX_DET 200 +#define MAX_DET_RAW 500 + +typedef struct box_ { + float x, y, w, h; +} box; + +typedef struct detection_ { + box bbox; + int cls; + float score; +} detection; + +static inline float exp_fast(float x) { + union { + unsigned int i; + float f; + } v; + v.i = (1 << 23) * (1.4426950409 * x + 126.93490512f); + + return v.f; +} + +static inline float _sigmoid(float x, bool fast) { + if (fast) + return 1.0f / (1.0f + exp_fast(-x)); + else + return 1.0f / (1.0f + std::exp(-x)); +} + +static inline float _softmax(float *probs, float *data, int input_stride, + int num_of_class, int *max_cls, bool fast) { + //assert(num_of_class == 80); + float x[num_of_class]; + float max_x = -INFINITY; + float min_x = INFINITY; + for (int i = 0; i < num_of_class; i++) { + x[i] = data[i * input_stride]; + if (x[i] > max_x) { + max_x = x[i]; + } + if (x[i] < min_x) { + min_x = x[i]; + } + } +#define t (-100.0f) + float exp_x[num_of_class]; + float sum = 0; + for (int i = 0; i < num_of_class; i++) { + x[i] = x[i] - max_x; + if (min_x < t) + x[i] = x[i] / min_x * t; + if (fast) + exp_x[i] = exp_fast(x[i]); + else + exp_x[i] = std::exp(x[i]); + sum += exp_x[i]; + } + float max_prob = 0; + for (int i = 0; i < num_of_class; i++) { + probs[i] = exp_x[i] / sum; + if (probs[i] > max_prob) { + max_prob = probs[i]; + *max_cls = i; + } + } + return max_prob; +} + 
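+// exp_fast() above is the Schraudolph bit trick: for an IEEE-754 float,
+// writing (1 << 23) * (x * log2(e) + 127 - c) into the integer view sets the
+// exponent/mantissa so the float reads back as roughly exp(x); here
+// 126.93490512f = 127 - c, with c tuned to spread the rounding error.
+// Worked check: exp_fast(1.0f) ~= 2.755 vs e ~= 2.718 (about 1.4% high).
+// Note _softmax() also subtracts the per-box max before exponentiating,
+// the usual overflow guard for softmax.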
+// feature in shape [3][5+80][grid_size][grid_size]
+#define GET_INDEX(cell_idx, box_idx_in_cell, data_idx, num_cell, class_num) \
+  (box_idx_in_cell * (class_num + 5) * num_cell + data_idx * num_cell + cell_idx)
+
+static void process_feature(detection *det, int *det_idx, float *feature,
+                            std::vector<int> grid_size, float *anchor,
+                            std::vector<int> yolo_size, int num_of_class,
+                            float obj_threshold) {
+  int yolo_w = yolo_size[1];
+  int yolo_h = yolo_size[0];
+  //TPU_LOG_DEBUG("grid_size_h: %d\n", grid_size[0]);
+  //TPU_LOG_DEBUG("grid_size_w: %d\n", grid_size[1]);
+  //TPU_LOG_DEBUG("obj_threshold: %f\n", obj_threshold);
+  int num_boxes_per_cell = 3;
+  //assert(num_of_class == 80);
+
+// 255 = 3 * (5 + 80)
+// feature in shape [3][5+80][grid_size][grid_size]
+#define COORD_X_INDEX (0)
+#define COORD_Y_INDEX (1)
+#define COORD_W_INDEX (2)
+#define COORD_H_INDEX (3)
+#define CONF_INDEX (4)
+#define CLS_INDEX (5)
+  int num_cell = grid_size[0] * grid_size[1];
+  // int box_dim = 5 + num_of_class;
+
+  int idx = *det_idx;
+  int hit = 0, hit2 = 0;
+  for (int i = 0; i < num_cell; i++) {
+    for (int j = 0; j < num_boxes_per_cell; j++) {
+      float box_confidence =
+          _sigmoid(feature[GET_INDEX(i, j, CONF_INDEX, num_cell, num_of_class)], false);
+      if (box_confidence < obj_threshold) {
+        continue;
+      }
+      hit++;
+      float box_class_probs[num_of_class];
+      int box_max_cls = -1;
+      float box_max_prob =
+          _softmax(box_class_probs, &feature[GET_INDEX(i, j, CLS_INDEX, num_cell, num_of_class)],
+                   num_cell, num_of_class, &box_max_cls, false);
+      float box_max_score = box_confidence * box_max_prob;
+      if (box_max_score < obj_threshold) {
+        continue;
+      }
+      // get coord now
+      int grid_x = i % grid_size[1];
+      int grid_y = i / grid_size[1];
+      float box_x = _sigmoid(feature[GET_INDEX(i, j, COORD_X_INDEX, num_cell, num_of_class)], false);
+      box_x += grid_x;
+      box_x /= grid_size[1];
+      float box_y = _sigmoid(feature[GET_INDEX(i, j, COORD_Y_INDEX, num_cell, num_of_class)], false);
+      box_y += grid_y;
+      box_y /= grid_size[0];
+      // anchor is in shape [3][2]
+      float box_w = std::exp(feature[GET_INDEX(i, j, COORD_W_INDEX, num_cell, num_of_class)]);
+      box_w *= anchor[j * 2];
+      box_w /= yolo_w;
+      float box_h = std::exp(feature[GET_INDEX(i, j, COORD_H_INDEX, num_cell, num_of_class)]);
+      box_h *= anchor[j * 2 + 1];
+      box_h /= yolo_h;
+      hit2++;
+      // DBG(" hit2 %d, conf = %f, cls = %d, coord = [%f, %f, %f, %f]\n",
+      //     hit2, box_max_score, box_max_cls, box_x, box_y, box_w, box_h);
+      det[idx].bbox = box{box_x, box_y, box_w, box_h};
+      det[idx].score = box_max_score;
+      det[idx].cls = box_max_cls;
+      idx++;
+      assert(idx <= MAX_DET);
+    }
+  }
+  *det_idx = idx;
+}
+
+// https://github.com/ChenYingpeng/caffe-yolov3/blob/master/box.cpp
+static float overlap(float x1, float w1, float x2, float w2) {
+  float l1 = x1 - w1 / 2;
+  float l2 = x2 - w2 / 2;
+  float left = l1 > l2 ? l1 : l2;
+  float r1 = x1 + w1 / 2;
+  float r2 = x2 + w2 / 2;
+  float right = r1 < r2 ? r1 : r2;
+  return right - left;
+}
+
+static float box_intersection(box a, box b) {
+  float w = overlap(a.x, a.w, b.x, b.w);
+  float h = overlap(a.y, a.h, b.y, b.h);
+  if (w < 0 || h < 0)
+    return 0;
+  float area = w * h;
+  return area;
+}
+
+static float box_union(box a, box b) {
+  float i = box_intersection(a, b);
+  float u = a.w * a.h + b.w * b.h - i;
+  return u;
+}
+
+//
+// more about IoU:
+// https://github.com/ultralytics/yolov3/blob/master/utils/utils.py
+// IoU = inter / (a + b - inter), can't handle the enclosure issue
+// GIoU, DIoU, CIoU?
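+// Worked example: two axis-aligned 0.2 x 0.2 boxes whose centers differ by
+// 0.1 in x overlap in a 0.1 x 0.2 strip, so inter = 0.02 and
+// union = 0.04 + 0.04 - 0.02 = 0.06, giving IoU = 1/3.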
+// +static float box_iou(box a, box b) { + return box_intersection(a, b) / box_union(a, b); +} + +static void nms(detection *det, int num, float nms_threshold) { + for (int i = 0; i < num; i++) { + if (det[i].score == 0) { + // erased already + continue; + } + for (int j = i + 1; j < num; j++) { + if (det[j].score == 0) { + // erased already + continue; + } + if (det[i].cls != det[j].cls) { + // not the same class + continue; + } + float iou = box_iou(det[i].bbox, det[j].bbox); + assert(iou <= 1.0f); + if (iou > nms_threshold) { + // overlapped, select one to erase + if (det[i].score < det[j].score) { + det[i].score = 0; + } else { + det[j].score = 0; + } + } + } + } +} + +YoloDetectionFunc::~YoloDetectionFunc() {} + +void YoloDetectionFunc::setup(tensor_list_t &inputs, + tensor_list_t &outputs, + OpParam ¶m) { + _net_input_h = param.get("net_input_h"); + _net_input_w = param.get("net_input_w"); + _nms_threshold = param.get("nms_threshold"); + _obj_threshold = param.get("obj_threshold"); + _keep_topk = param.get("keep_topk"); + + if (param.has("tiny")) { + _tiny = param.get("tiny"); + } + + if (param.has("yolo_v4")) { + _yolo_v4 = param.get("yolo_v4"); + } + + if (param.has("spp_net")) { + _spp_net = param.get("spp_net"); + } + + if (param.has("class_num")) { + _class_num = param.get("class_num"); + } + + _anchors.clear(); + if (param.has("anchors")) { + auto anchors = param.get("anchors"); + + std::istringstream iss(anchors); + std::string s; + while (std::getline(iss, s, ',')) { + _anchors.push_back(atof(s.c_str())); + } + } + + std::sort(inputs.begin(), inputs.end(), + [](const std::shared_ptr &a, const std::shared_ptr &b) { + return a->shape[3] > b->shape[3]; + }); + + _bottoms = inputs; + _tops = outputs; + + if (_tiny) { + assert(_bottoms.size() == 2); + if (_anchors.size() == 0) { + // Yolov3-tiny default anchors + _anchors = { + 10,14, 23,27, 37,58, // layer23-conv (26*26) + 81,82, 135,169, 344,319 // layer16-conv (13*13) + }; + } + } else { + assert(_bottoms.size() == 3); + if (_anchors.size() == 0) { + if (_yolo_v4) { + _anchors = { + 142, 110, 192, 243, 459, 401, // layer161-conv + 36, 75, 76, 55, 72, 146,// layer150-conv + 12, 16, 19, 36, 40, 28, // layer139-conv + }; + } + else { + // Yolov3 default anchors + _anchors = { + 10,13, 16,30, 33,23, // layer106-conv (52*52) + 30,61, 62,45, 59,119, // layer94-conv (26*26) + 116,90, 156,198, 373,326 // layer82-conv (13*13) + }; + } + } + } +} + +void YoloDetectionFunc::run() { + auto top_data = _tops[0]->cpu_data(); + memset(top_data, 0, _tops[0]->size()); + int batch = _tops[0]->shape[0]; + + size_t bottom_count = _bottoms.size(); + assert(_anchors.size() == bottom_count * 6); + float (*anchors)[6] = (float (*)[6])_anchors.data(); + + for (int b = 0; b < batch; ++b) { + std::vector> grid_size; + std::vector features; + + for (size_t i = 0; i < bottom_count; ++i) { + int offset = b * _bottoms[i]->shape[1] * _bottoms[i]->shape[2] * _bottoms[i]->shape[3]; + grid_size.push_back({_bottoms[i]->shape[2], _bottoms[i]->shape[3]}); + auto data = _bottoms[i]->cpu_data() + offset; + //auto size = _bottoms[i]->count() / batch; + //std::vector bottom_data(data, data + size); + features.push_back(data); + } + + detection det_raw[MAX_DET_RAW]; + detection dets[MAX_DET]; + int det_raw_idx = 0; + for (size_t i = 0; i < features.size(); i++) { + process_feature(det_raw, &det_raw_idx, features[i], grid_size[i], + &anchors[i][0], {_net_input_h, _net_input_w}, _class_num, _obj_threshold); + } + nms(det_raw, det_raw_idx, _nms_threshold); + int det_idx = 
0; + for (int i = 0; i < det_raw_idx; i++) { + if (det_raw[i].score > 0) { + dets[det_idx] = det_raw[i]; + det_idx++; + } + } + + auto keep_topk = _keep_topk; + if (keep_topk > det_idx) + keep_topk = det_idx; + + long long count = 0; + auto batch_output_data = top_data + b * _tops[0]->shape[1] * _tops[0]->shape[2] * _tops[0]->shape[3]; + for (int i = 0; i < keep_topk; ++i) { + batch_output_data[count++] = dets[i].bbox.x; + batch_output_data[count++] = dets[i].bbox.y; + batch_output_data[count++] = dets[i].bbox.w; + batch_output_data[count++] = dets[i].bbox.h; + batch_output_data[count++] = dets[i].cls; + batch_output_data[count++] = dets[i].score; + + //TPU_LOG_DEBUG("x = %f, y = %f, w = %f, h = %f, class = %d, score = %f\n", + // dets[i].bbox.x, dets[i].bbox.y, dets[i].bbox.w, dets[i].bbox.h, dets[i].cls, dets[i].score); + } + } +} + +} // namespace runtime +} // namespace cvi diff --git a/cviruntime/src/common/cpu_function/yolo_detection.hpp b/cviruntime/src/common/cpu_function/yolo_detection.hpp new file mode 100644 index 000000000..77a8cdd7d --- /dev/null +++ b/cviruntime/src/common/cpu_function/yolo_detection.hpp @@ -0,0 +1,44 @@ +#include +#include +#include +#include +#include +#include + + +namespace cvi { +namespace runtime { + + +class YoloDetectionFunc : public ICpuFunction { + +public: + YoloDetectionFunc() {} + + ~YoloDetectionFunc(); + void setup(tensor_list_t &inputs, + tensor_list_t &outputs, + OpParam ¶m); + void run(); + + static ICpuFunction *open() { return new YoloDetectionFunc(); } + static void close(ICpuFunction *func) { delete func; } + +private: + tensor_list_t _bottoms; + tensor_list_t _tops; + + int _net_input_h; + int _net_input_w; + float _nms_threshold; + float _obj_threshold; + int _keep_topk; + bool _tiny = false; + bool _yolo_v4 = false; + bool _spp_net = false; + int _class_num = 80; + std::vector _anchors; +}; + +} +} diff --git a/cviruntime/src/common/debug.cpp b/cviruntime/src/common/debug.cpp new file mode 100644 index 000000000..e01ab50a9 --- /dev/null +++ b/cviruntime/src/common/debug.cpp @@ -0,0 +1,40 @@ +#include +#include +#include +#include +#include +#include +#include + +void showRuntimeVersion() { + printf("Cvitek Runtime (%d.%d.%d)%s\n", + cvi::model::MajorVersion_value, + cvi::model::MinorVersion_value, + cvi::model::SubMinorVersion_value, + RUNTIME_VERSION); +} + +void dumpSysfsDebugFile(const char *path) { + std::string line; + std::ifstream file(path); + std::cout << "dump " << path << "\n"; + while (std::getline(file, line )) { + std::cout << line << "\n"; + } + file.close(); + std::cout << "=======\n"; +} + +void mem_protect(uint8_t *vaddr, size_t size) { + int ret = mprotect(vaddr, size, PROT_READ); + if (ret != 0) { + perror("cmdbuf memory protect failed"); + } +} + +void mem_unprotect(uint8_t *vaddr, size_t size) { + int ret = mprotect(vaddr, size, PROT_READ | PROT_WRITE); + if (ret != 0) { + perror("cmdbuf memory unprotect failed"); + } +} diff --git a/cviruntime/src/common/kernel_function/IKernelFunc.hpp b/cviruntime/src/common/kernel_function/IKernelFunc.hpp new file mode 100644 index 000000000..048d968e7 --- /dev/null +++ b/cviruntime/src/common/kernel_function/IKernelFunc.hpp @@ -0,0 +1,37 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cviruntime_extra.h" + +namespace cvi { +namespace runtime { + +class IKernelFunc { +public: + IKernelFunc(CVI_RT_HANDLE ctx) : ctx(ctx) {} + virtual 
~IKernelFunc() { + if (cmdbuf_mem) + CVI_RT_MemFree(ctx, cmdbuf_mem); + } + + CVI_RT_HANDLE ctx; + CVI_RT_MEM cmdbuf_mem = nullptr; +}; + +} +} \ No newline at end of file diff --git a/cviruntime/src/common/kernel_function/euclideanDist.cpp b/cviruntime/src/common/kernel_function/euclideanDist.cpp new file mode 100644 index 000000000..77c73753c --- /dev/null +++ b/cviruntime/src/common/kernel_function/euclideanDist.cpp @@ -0,0 +1,224 @@ +#include +#include +#include +#include +#include +#include + +#include + +namespace cvi { +namespace runtime { + +static void load_and_convert_to_bf16(cvk_context_t *cvk_ctx, cvk_tl_t *tl_mem, + cvk_tl_shape_t &shape, + cvk_tg_stride_t &stride, int x_base_ga_idx, + uint64_t x_ga) { + assert(tl_mem); + cvk_tdma_g2l_tensor_copy_param_t p1 = {0}; + cvk_tg_t tg_x; + tg_x.start_address = x_ga; + tg_x.base_reg_index = x_base_ga_idx; + tg_x.int8_rnd_mode = 0; + tg_x.fmt = CVK_FMT_U8; + tg_x.shape = {shape.n, shape.c, shape.h, shape.w}; + tg_x.stride = stride; + p1.src = &tg_x; + p1.dst = tl_mem; + cvk_ctx->ops->tdma_g2l_bf16_tensor_copy(cvk_ctx, &p1); + + return; +} + +static void convert_ps32_to_fp32(cvk_context_t *cvk_ctx, cvk_tl_t *output) { + assert(output->shape.n == 2); // Exclude lower part + assert((output->shape.h == 1) && (output->shape.w == 1) && "Only support h=1, w=1"); + + uint32_t la_high = output->start_address; + cvk_tl_t tl_src; + tl_src.start_address = la_high; + tl_src.fmt = CVK_FMT_BF16; + tl_src.shape = output->shape; + tl_src.shape.n = 1; + tl_src.stride = cvk_ctx->ops->tl_default_stride(cvk_ctx, tl_src.shape, tl_src.fmt, 1); + tl_src.stride.n = output->stride.n; + + uint32_t la_low = output->start_address + tl_src.stride.n; + cvk_tl_t tl_dst; + tl_dst.start_address = la_low + sizeof(uint16_t); // concat higher part + tl_dst.fmt = CVK_FMT_BF16; + tl_dst.shape = output->shape; + tl_dst.shape.n = 1; + tl_dst.stride = cvk_ctx->ops->tl_default_stride(cvk_ctx, tl_dst.shape, tl_dst.fmt, 1); + tl_dst.stride.n = output->stride.n; + + cvk_tiu_copy_param_t param = {0}; + param.src = &tl_src; + param.dst = &tl_dst; + param.layer_id = 0; + cvk_ctx->ops->tiu_copy(cvk_ctx, ¶m); +} + +static void store_fp32(cvk_context_t *cvk_ctx, int base_ga_idx, uint64_t ga_dst, cvk_tl_t *output) { + + assert(output->shape.n == 2); // Exclude lower part + assert(output->shape.h == 1 && output->shape.w == 1); + + cvk_tl_t src; + src.fmt = CVK_FMT_BF16; + src.shape = output->shape; + src.shape.n = 1; + src.shape.w = 2; + src.stride = cvk_ctx->ops->tl_default_stride(cvk_ctx, src.shape, src.fmt, 1); + src.stride.n = output->stride.n; + src.start_address = output->start_address + src.stride.n; + src.eu_align = 1; + + cvk_tg_t dst; + dst.fmt = CVK_FMT_BF16; + dst.shape.n = 1; + dst.shape.c = output->shape.c; + dst.shape.h = output->shape.h; + dst.shape.w = 2; + dst.stride = cvk_ctx->ops->tg_default_stride(cvk_ctx, dst.shape, dst.fmt); + dst.base_reg_index = base_ga_idx; + dst.start_address = ga_dst; + + cvk_tdma_l2g_tensor_copy_param_t param = {0}; + param.src = &src; + param.dst = &dst; + param.layer_id = 0; + param.intra_cmd_paral = 0; + cvk_ctx->ops->tdma_l2g_bf16_tensor_copy(cvk_ctx, ¶m); +} + +CVI_RT_MEM runtimeJitEuclideanDistance(CVI_RT_HANDLE ctx, void *cvk_ctx, uint32_t records, + uint32_t feature_size) { + // tile + auto cvk = (cvk_context_t *)cvk_ctx; + cvk->ops->set_layer_id(cvk, 0); + uint32_t lane_num = cvk->info.npu_num; + uint32_t c_step = 0; + cvk_tl_shape_t input_x_shape = {1, lane_num, 1, feature_size}; + uint32_t in_x_size = + 
cvk->ops->lmem_tensor_to_size(cvk, input_x_shape, CVK_FMT_BF16, 1); + for (c_step = records; c_step > 0; --c_step) { + uint32_t total_size = 0; + cvk_tl_shape_t in_y_shape = {1, c_step, 1, feature_size}; + uint32_t in_y_size = cvk->ops->lmem_tensor_to_size(cvk, in_y_shape, CVK_FMT_BF16, 1); + cvk_tl_shape_t output_shape = {2, c_step, 1, 2}; + uint32_t output_size = + cvk->ops->lmem_tensor_to_size(cvk, output_shape, CVK_FMT_BF16, 1); + total_size += in_x_size; + total_size += in_y_size; + total_size += output_size; + if (total_size < cvk->info.lmem_size) { + break; + } + } + assert(c_step); + + int x_ga_base_reg_idx = 2; + int y_ga_base_reg_idx = 3; + int o_ga_base_reg_idx = 4; + + uint64_t x_ga = 0; + uint64_t y_ga = 0; + uint64_t o_ga = 0; + + // alloc lmem + cvk_tl_t *input_x = cvk->ops->lmem_alloc_tensor(cvk, input_x_shape, CVK_FMT_BF16, 1); + cvk_tl_shape_t input_y_shape = {1, c_step, 1, feature_size}; + cvk_tl_t *input_y = cvk->ops->lmem_alloc_tensor(cvk, input_y_shape, CVK_FMT_BF16, 1); + cvk_tl_shape_t output_shape = {2, c_step, 1, 1}; + cvk_tl_t *output = + cvk->ops->lmem_alloc_tensor(cvk, output_shape, CVK_FMT_BF16, 1); + assert(input_x); + assert(input_y); + assert(output); + + // load input_x + cvk_tg_shape_t tg_input_x_shape = {1, lane_num, 1, feature_size}; + cvk_tg_stride_t tg_input_x_stride = cvk->ops->tg_default_stride(cvk, tg_input_x_shape, CVK_FMT_U8); + tg_input_x_stride.c = 0; + tg_input_x_stride.n = 0; + load_and_convert_to_bf16(cvk, input_x, input_x_shape, tg_input_x_stride, x_ga_base_reg_idx, x_ga); + + for (uint32_t c_pos = 0; c_pos < records;) { + uint32_t tile_c = std::min(c_step, records - c_pos); + + //load input_y + cvk_tl_shape_t input_y_shape = {1, tile_c, 1, feature_size}; + cvk_tg_shape_t tg_input_y_shape = {input_y_shape.n, input_y_shape.c, input_y_shape.h, input_y_shape.w}; + cvk_tg_stride_t tg_input_y_stride = cvk->ops->tg_default_stride(cvk, tg_input_y_shape, CVK_FMT_U8); + input_y->shape.c = tile_c; + load_and_convert_to_bf16(cvk, input_y, input_y_shape, tg_input_y_stride, y_ga_base_reg_idx, y_ga + c_pos * feature_size); + + cvk_tl_t b; + b.start_address = input_x->start_address; + b.shape = input_y->shape; + b.stride = input_y->stride; + b.stride.c = 0; + b.stride.n = 0; + b.fmt = input_x->fmt; + + cvk_tiu_sub_param_t p1 = {0}; + p1.res_high = 0; + p1.res_low = input_y; + p1.a_high = 0; + p1.a_low = input_y; + p1.b_high = 0; + p1.b_low = &b; + p1.rshift_bits = 0; + p1.layer_id = 0; + cvk->ops->tiu_sub(cvk, &p1); + + output->shape.n = 1; + output->shape.c = tile_c; + + cvk_tiu_depthwise_pt_convolution_param_t p2 = {0}; + p2.ofmap = output; + p2.ifmap = input_y; + p2.weight = input_y; + p2.bias = nullptr; + p2.ins_h = 0; + p2.ins_w = 0; + p2.ins_last_h = 0; + p2.ins_last_w = 0; + p2.pad_top = 0; + p2.pad_bottom = 0; + p2.pad_left = 0; + p2.pad_right = 0; + p2.stride_h = 1; + p2.stride_w = 1; + p2.dilation_h = 1; + p2.dilation_w = 1; + p2.relu_enable = false; + p2.rshift_bits = 0; + p2.ps32_mode = 2; + p2.layer_id = 0; + cvk->ops->tiu_pt_depthwise_convolution(cvk, &p2); + + output->shape.n = 2; + convert_ps32_to_fp32(cvk, output); + + store_fp32(cvk, o_ga_base_reg_idx, o_ga + c_pos * sizeof(float), output); + + c_pos += tile_c; + } + + cvk->ops->lmem_free_tensor(cvk, output); + cvk->ops->lmem_free_tensor(cvk, input_y); + cvk->ops->lmem_free_tensor(cvk, input_x); + + CVI_RT_MEM cmdbuf_mem; + uint32_t size; + auto cmdbuf = cvk->ops->acquire_cmdbuf(cvk, &size); + int ret = CVI_RT_LoadCmdbuf(ctx, cmdbuf, size, 0, 0, false, &cmdbuf_mem); + assert(ret == 0); + 
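+  // Everything above only recorded TDMA/TIU commands into the kernel context;
+  // acquire_cmdbuf() hands back that command stream and CVI_RT_LoadCmdbuf()
+  // loads it so the JIT'ed routine can be replayed later. The tensors were
+  // addressed at offset 0 against base registers 2/3/4, so the caller binds
+  // the real x/y/output buffers at run time (see CVI_NN_RunKernelFunc's base
+  // address array). reset() then clears the context for the next recording.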
cvk->ops->reset(cvk); + return cmdbuf_mem; +} + +} +} diff --git a/cviruntime/src/common/kernel_function/grayImageLight.cpp b/cviruntime/src/common/kernel_function/grayImageLight.cpp new file mode 100644 index 000000000..c3563609e --- /dev/null +++ b/cviruntime/src/common/kernel_function/grayImageLight.cpp @@ -0,0 +1,288 @@ +#include +#include +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +static inline int ceiling_func(int numerator, int denominator) { + return (numerator + denominator - 1) / denominator; +} + +static inline uint64_t align_up(uint64_t x, uint64_t n) { + return (x + n - 1) / n * n; +} + +static void tdma_load_stride(cvk_context_t* cvk, cvk_tl_t* tensor, + uint64_t ga_src, cvk_tg_stride_t stride, int32_t reg_idx) { + cvk_tg_t src; + src.base_reg_index = reg_idx; + src.fmt = CVK_FMT_U8; + src.shape = {tensor->shape.n, tensor->shape.c, + tensor->shape.h, tensor->shape.w}; + src.start_address = ga_src; + src.stride = stride; + + cvk_tdma_g2l_tensor_copy_param_t p = {0}; + p.src = &src; + p.dst = tensor; + p.layer_id = 0; + + cvk->ops->tdma_g2l_tensor_copy(cvk, &p); +} + +static void tdma_store_stride(cvk_context_t *cvk, cvk_tl_t *tensor, + uint64_t ga_dst, cvk_tg_stride_t stride) { + cvk_tg_t dst; + dst.base_reg_index = 3; + dst.fmt = CVK_FMT_U8; + dst.start_address = ga_dst; + dst.shape = {tensor->shape.n, tensor->shape.c, + tensor->shape.h, tensor->shape.w}; + dst.stride = stride; + + cvk_tdma_l2g_tensor_copy_param_t p; + p.src = tensor; + p.dst = &dst; + p.layer_id = 0; + cvk->ops->tdma_l2g_tensor_copy(cvk, &p); +} + +static void calc_background(cvk_context_t *cvk, + int32_t ih, int32_t iw, int32_t oh, int32_t ow, + int32_t k, int32_t s, int32_t pad) { + uint64_t x_ga = 0; + uint64_t y_ga = 0; + int32_t kh = k; + int32_t kw = k; + int32_t sh = s; + int32_t sw = s; + + int32_t step_oh = 0, step_ow = 0; + int32_t x_tl_sz = 0, y_tl_sz = 0; + + // determin the shape of tile. + for (step_ow = ow; step_ow > 0; step_ow--) { + for (step_oh = oh; step_oh > 0; step_oh--) { + auto step_ih = (step_oh - 1) * sh + kh; + auto step_iw = (step_ow - 1) * sw + kw; + if (step_ih > ih) { + step_ih = ih; + } + if (step_iw > iw) { + step_iw = iw; + } + cvk_tl_shape_t input_shape = {1, 1, (uint32_t)step_ih, (uint32_t)step_iw}; + cvk_tl_shape_t output_shape = {1, 1, (uint32_t)step_oh, (uint32_t)step_ow}; + x_tl_sz = cvk->ops->lmem_tensor_to_size(cvk, input_shape, CVK_FMT_U8, 1); + y_tl_sz = cvk->ops->lmem_tensor_to_size(cvk, output_shape, CVK_FMT_U8, 1); + auto total_lmem = x_tl_sz + y_tl_sz; + if (total_lmem <= (int32_t)cvk->info.lmem_size) { + goto do_tile; + } + } + } +do_tile: + cvk_tg_stride_t ga_stride = cvk->ops->tg_default_stride( + cvk, {1, 1, (uint32_t)ih, (uint32_t)iw}, CVK_FMT_U8); + for (int oh_pos = 0; oh_pos < oh; oh_pos += step_oh) { + int32_t cur_oh = std::min(oh - oh_pos, step_oh); + int32_t oh_top = oh_pos; + int32_t oh_bot = oh_pos + cur_oh; + int32_t ih_top = std::max(oh_top * sh - pad, 0); + int32_t ih_bot = std::min((oh_bot - 1) * sh + kh - pad, ih); + int32_t cur_ih = ih_bot - ih_top; + int32_t cur_pad_t = (ih_top == 0) ? (pad - oh_top * sh) : 0; + int32_t cur_pad_b = (ih_bot == ih) ? 
((oh_bot - 1) * sh + kh - pad - ih) : 0; + + for (int ow_pos = 0; ow_pos < ow; ow_pos += step_ow) { + int32_t cur_ow = std::min(ow - ow_pos, step_ow); + int32_t ow_left = ow_pos; + int32_t ow_right = ow_pos + cur_ow; + int32_t iw_left = std::max(ow_left * sw - pad, 0); + int32_t iw_right = std::min((ow_right - 1) * sw + kw - pad, iw); + int32_t cur_iw = iw_right - iw_left; + int32_t cur_pad_l = (iw_left == 0) ? (pad - ow_left * sw) : 0; + int32_t cur_pad_r = (iw_right == iw) ? ((ow_right - 1) * sw + kw - pad - iw) : 0; + cvk_tl_t x, bkg; + x.fmt = CVK_FMT_U8; + x.start_address = 0; + x.shape = {1, 1, (uint32_t)cur_ih, (uint32_t)cur_iw}; + x.stride = cvk->ops->tl_default_stride(cvk, x.shape, x.fmt, 1); + tdma_load_stride(cvk, &x, x_ga + ih_top * iw + iw_left, ga_stride, 2); + + bkg = x; + bkg.fmt = CVK_FMT_U8; + bkg.start_address = x_tl_sz; + bkg.shape = {1, 1, (uint32_t)cur_oh, (uint32_t)cur_ow}; + bkg.stride = cvk->ops->tl_default_stride(cvk, bkg.shape, bkg.fmt, 1); + + cvk_tiu_max_pooling_param_t p0 = {0}; + p0.ofmap = &bkg; + p0.ifmap = &x; + p0.kh = kh; + p0.kw = kw; + p0.pad_top = cur_pad_t; + p0.pad_bottom = cur_pad_b; + p0.pad_left = cur_pad_l; + p0.pad_right = cur_pad_r; + p0.stride_h = 1; + p0.stride_w = 1; + cvk->ops->tiu_max_pooling(cvk, &p0); + tdma_store_stride(cvk, &bkg, y_ga + oh_pos * ow + ow_pos, ga_stride); + } + } +} + +static void compute(cvk_context_t *cvk, int32_t offset, int32_t ic, int32_t ih, int32_t iw) { + uint64_t x_ga = 0; + uint64_t y_ga = 0; + + cvk_fmt_t fmt = CVK_FMT_U8; + cvk_tl_t x, bkg, mask, diff, mask_high, diff_high; + cvk_tl_t y, tmp_high; + + uint32_t lmem_address = 0; + x.start_address = lmem_address; + x.fmt = fmt; + x.shape = {1, (uint32_t)ic, (uint32_t)ih, (uint32_t)iw}; + x.stride = cvk->ops->tl_default_stride(cvk, x.shape, fmt, 1); + cvk_tg_stride_t ga_stride = cvk->ops->tg_default_stride( + cvk, {1, (uint32_t)ic, (uint32_t)ih, (uint32_t)iw}, fmt); + uint32_t tl_size = cvk->ops->lmem_tensor_to_size(cvk, x.shape, fmt, 1); + lmem_address += tl_size; + + bkg = x; + bkg.start_address = lmem_address; + lmem_address += tl_size; + + mask = x; + mask.start_address = lmem_address; + lmem_address += tl_size; + + diff = x; + diff.fmt = CVK_FMT_I8; + diff.start_address = lmem_address; + lmem_address += tl_size; + + mask_high = x; + mask_high.start_address = lmem_address; + lmem_address += tl_size; + + diff_high = diff; + diff_high.start_address = lmem_address; + lmem_address += tl_size; + assert(lmem_address < cvk->info.lmem_size); + y = mask; + tmp_high = mask_high; + + tdma_load_stride(cvk, &x, x_ga + offset, ga_stride, 2); + tdma_load_stride(cvk, &bkg, y_ga + offset, ga_stride, 3); + + cvk_tiu_ge_param_t p1 = {0}; + p1.ge = &mask; + p1.a = &x; + p1.b = &bkg; + cvk->ops->tiu_ge(cvk, &p1); + + cvk_tiu_mul_param_t p2 = {0}; + mask.fmt = CVK_FMT_I8; + mask_high.fmt = CVK_FMT_I8; + p2.res_high = &mask_high; + p2.res_low = &mask; + p2.a = &mask; + p2.b_is_const = 1; + p2.b_const.val = 255; + p2.b_const.is_signed = 1; + cvk->ops->tiu_mul(cvk, &p2); + + cvk_tiu_xor_int8_param_t p3 = {0}; + p3.res = &tmp_high; + p3.a = &tmp_high; + p3.b = &tmp_high; + cvk->ops->tiu_xor_int8(cvk, &p3); + + cvk_tiu_sub_param_t p4 = {0}; + p4.res_high = &diff_high; + p4.res_low = &diff; + p4.a_high = &tmp_high; + p4.a_low = &x; + p4.b_high = &tmp_high; + p4.b_low = &bkg; + cvk->ops->tiu_sub(cvk, &p4); + + cvk_tiu_add_param_t p5 = {0}; + p5.res_high = &diff_high; + p5.res_low = &diff; + p5.a_high = &diff_high; + p5.a_low = &diff; + p5.b_is_const = 1; + p5.b_const.val = 255; + 
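+  // p5 adds the constant 255 to the signed 16-bit pair (diff_high, diff),
+  // lifting the x - bkg difference back into unsigned range; p6 below then
+  // adds the (mask_high, mask) pair and stores the low byte as the output y.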
p5.b_const.is_signed = 1; + cvk->ops->tiu_add(cvk, &p5); + + cvk_tiu_add_param_t p6 = {0}; + p6.res_low = &y; + p6.a_high = &diff_high; + p6.a_low = &diff; + p6.b.high = &mask_high; + p6.b.low = &mask; + cvk->ops->tiu_add(cvk, &p6); + + tdma_store_stride(cvk, &y, y_ga + offset, ga_stride); +} + +CVI_RT_MEM runtimeJitGrayImageLight( + CVI_RT_HANDLE ctx, void* cvk_ctx, + int32_t ih, int32_t iw, int32_t kernel_sz) { + + auto cvk = (cvk_context_t*)cvk_ctx; + int32_t pad = (kernel_sz - 1) / 2; + calc_background(cvk, ih, iw, ih, iw, kernel_sz, 1, pad); + + int32_t block_num = 6; + int32_t eu_num = cvk->info.eu_num; + int32_t npu_num = cvk->info.npu_num; + int32_t total_sz = align_up(ih * iw, eu_num); + uint32_t remain = total_sz; + uint32_t offset = 0; + int32_t max_h = cvk->info.lmem_size / (eu_num * block_num); + max_h = std::min(max_h, 4096 - 32); + assert(max_h); + int32_t cur_h = max_h; + + int32_t loop = remain / (npu_num * eu_num * max_h); + remain %= npu_num * eu_num * max_h; + for (int32_t i = 0; i < loop; i++) { + compute(cvk, offset, npu_num, cur_h, eu_num); + offset += 1 * npu_num * cur_h * eu_num; + } + if (remain) { + int32_t cur_c = remain / (eu_num * cur_h); + if (cur_c) { + compute(cvk, offset, cur_c, cur_h, eu_num); + offset += 1 * cur_c * cur_h * eu_num; + } + + remain %= eu_num * cur_h; + if (remain) { + compute(cvk, offset, 1, ceiling_func(remain, eu_num), eu_num); + } + } + + CVI_RT_MEM cmdbuf_mem; + uint32_t size; + auto cmdbuf = cvk->ops->acquire_cmdbuf(cvk, &size); + int ret = CVI_RT_LoadCmdbuf(ctx, cmdbuf, size, 0, 0, false, &cmdbuf_mem); + assert(ret == 0); + cvk->ops->reset(cvk); + return cmdbuf_mem; +} + + +} +} \ No newline at end of file diff --git a/cviruntime/src/common/kernel_function/kernelFunc.cpp b/cviruntime/src/common/kernel_function/kernelFunc.cpp new file mode 100644 index 000000000..7af3efc6b --- /dev/null +++ b/cviruntime/src/common/kernel_function/kernelFunc.cpp @@ -0,0 +1,62 @@ +#include "cviruntime.h" +#include "IKernelFunc.hpp" +#include + +CVI_KFUNC_HANDLE CVI_NN_PrepareEuclideanDistanceKernelFunc( + CVI_RT_HANDLE ctx, CVI_FMT fmt, uint32_t k, uint32_t n) { + (void)fmt; + auto cvk = CVI_RT_RegisterKernel(ctx, 200000); + assert(cvk); + + auto kfun = new cvi::runtime::IKernelFunc(ctx); + kfun->cmdbuf_mem = cvi::runtime::runtimeJitEuclideanDistance(ctx, cvk, n, k); + CVI_RT_UnRegisterKernel(cvk); + return (void *)kfun; +} + +CVI_KFUNC_HANDLE CVI_NN_PrepareMatrixMulKernelFunc( + CVI_RT_HANDLE ctx, CVI_FMT fmt, uint32_t m, uint32_t k, uint32_t n) { + + auto cvk = CVI_RT_RegisterKernel(ctx, 200000); + assert(cvk); + + auto kfun = new cvi::runtime::IKernelFunc(ctx); + kfun->cmdbuf_mem = cvi::runtime::runtimeJitMatrixMul(ctx, cvk, fmt, m, k, n); + CVI_RT_UnRegisterKernel(cvk); + return (void *)kfun; +} + +CVI_KFUNC_HANDLE CVI_NN_PrepareGrayImageLightKernelFunc( + CVI_RT_HANDLE ctx, uint32_t ih, uint32_t iw, uint32_t kernel_sz) { + + auto cvk = CVI_RT_RegisterKernel(ctx, 100000); + assert(cvk); + + auto kfun = new cvi::runtime::IKernelFunc(ctx); + kfun->cmdbuf_mem = cvi::runtime::runtimeJitGrayImageLight(ctx, cvk, ih, iw, kernel_sz); + CVI_RT_UnRegisterKernel(cvk); + return (void *)kfun; +} + +CVI_RC CVI_NN_RunKernelFunc(CVI_KFUNC_HANDLE kfun, int32_t mem_num, ...) 
{ + assert(mem_num <= 6); + uint64_t baseArray[mem_num + 2]; + + baseArray[0] = 0; + baseArray[1] = 0; + + va_list valist; + va_start(valist, mem_num); + for (int i = 2; i < mem_num + 2; i++) { + baseArray[i] = va_arg(valist, uint64_t); + } + va_end(valist); + auto kfn = static_cast(kfun); + return CVI_RT_RunCmdbufEx(kfn->ctx, kfn->cmdbuf_mem, (CVI_RT_ARRAYBASE *)baseArray); +} + +CVI_RC CVI_NN_DestroyKernelFunc(CVI_KFUNC_HANDLE kfun) { + auto kfn = static_cast(kfun); + delete kfn; + return CVI_RC_SUCCESS; +} diff --git a/cviruntime/src/common/kernel_function/matrixMul.cpp b/cviruntime/src/common/kernel_function/matrixMul.cpp new file mode 100644 index 000000000..d3d9f84a5 --- /dev/null +++ b/cviruntime/src/common/kernel_function/matrixMul.cpp @@ -0,0 +1,233 @@ +#include +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +static inline int ceiling_func(int numerator, int denominator) { + return (numerator + denominator - 1) / denominator; +} + +static inline uint64_t align_up(uint64_t x, uint64_t n) { + return (x + n - 1) / n * n; +} + +static void tdma_load_stride(cvk_context_t* cvk, cvk_ml_t* ml, + uint64_t ga_src, cvk_mg_stride_t mg_stride, uint32_t reg_idx) { + cvk_mg_t src; + src.base_reg_index = reg_idx; + src.fmt = CVK_FMT_U8; + src.shape = {ml->shape.n, ml->shape.col}; + src.start_address = ga_src; + src.stride = mg_stride; + + cvk_tdma_g2l_matrix_copy_param_t p; + p.src = &src; + p.dst = ml; + p.layer_id = 0; + + cvk->ops->tdma_g2l_matrix_copy(cvk, &p); +} + +static void tdma_store_stride(cvk_context_t *cvk, cvk_ml_t *ml, + uint64_t ga_dst, cvk_mg_stride_t mg_stride) { + cvk_mg_t dst; + dst.base_reg_index = 4; + dst.fmt = CVK_FMT_U8; + dst.start_address = ga_dst; + dst.shape = {ml->shape.n, ml->shape.col}; + dst.stride = mg_stride; + + cvk_tdma_l2g_matrix_copy_param_t p; + p.src = ml; + p.dst = &dst; + p.layer_id = 0; + + cvk->ops->tdma_l2g_matrix_copy(cvk, &p); +} + +static void convert_result_and_store(cvk_context_t *cvk, cvk_ml_t *ps32, uint64_t gaddr, uint32_t res_col) { + auto row = ps32->shape.n; + auto c = ps32->shape.c; + auto w = ps32->shape.w; + auto col = ps32->shape.col; + + cvk_tl_shape_t shape = {row, c, 1, w}; + cvk_tl_stride_t stride = cvk->ops->tl_default_stride(cvk, shape, CVK_FMT_U8, 1); + int size = cvk->ops->lmem_tensor_to_size(cvk, shape, CVK_FMT_U8, 1); + auto laddr_src = ps32->start_address; + auto laddr_dst = laddr_src + 4 * size; + + cvk_tl_t src = {}; + src.shape = shape; + src.stride = stride; + src.fmt = CVK_FMT_U8; + src.start_address = laddr_src; + + cvk_tl_t dst = {}; + dst.shape = shape; + uint32_t lane_num = cvk->info.npu_num; + uint32_t c_per_lane = ceiling_func(c, lane_num); + dst.stride = {c_per_lane * w * 4, w * 4, w * 4, 4}; + dst.fmt = CVK_FMT_U8; + dst.start_address = laddr_dst; + + for (int i = 0; i < 4; ++i) { + src.start_address = laddr_src + i * size; + dst.start_address = laddr_dst + i; + + cvk_tiu_copy_param_t param; + param.src = &src; + param.dst = &dst; + cvk->ops->tiu_copy(cvk, ¶m); + } + + cvk_ml_t res = {}; + res.shape = {row, c, 4 * w, 4 * col}; + res.fmt = CVK_FMT_U8; + res.stride = cvk->ops->ml_default_stride(cvk, res.shape, CVK_FMT_U8, 0); + res.start_address = laddr_dst; + + cvk_mg_stride_t mg_stride; + mg_stride.row = 4 * res_col; + tdma_store_stride(cvk, &res, gaddr, mg_stride); +} + +static cvk_fmt_t formatTranslate(CVI_FMT fmt) { + switch(fmt) { + case CVI_FMT_INT8: return CVK_FMT_I8; + case CVI_FMT_UINT8: return CVK_FMT_U8; + default: + TPU_LOG_ERROR("unsupported fmt:%d\n", 
fmt); + assert(0); + } + return CVK_FMT_U8; +} + +CVI_RT_MEM runtimeJitMatrixMul( + CVI_RT_HANDLE ctx, void* cvk_ctx, CVI_FMT format, + uint32_t m, uint32_t k, uint32_t n) { + auto cvk = (cvk_context_t*)cvk_ctx; + + uint64_t l_ga = 0; + uint64_t r_ga = 0; + uint64_t y_ga = 0; + + cvk_fmt_t fmt = formatTranslate(format); + uint32_t max_tiu = (1 << 12) - 1; // 1880v2: 12 bit + uint32_t m_step = std::min(m, max_tiu); + uint32_t k_step = std::min(k, max_tiu); + uint32_t n_step = std::min(n, max_tiu); + uint32_t lane_num = cvk->info.npu_num; + uint32_t eu_num = cvk->info.eu_num; + uint32_t min_n_step = eu_num * lane_num; + + uint32_t total_size = 0; + for (; k_step > 0; --k_step) { + for (n_step = n; n_step > 0; n_step = align_up(n_step - min_n_step, min_n_step)) { + for (m_step = m; m_step > 0; --m_step) { + total_size = 0; + + cvk_ml_shape_t tiled_L_shape = cvk->ops->ml_default_shape(cvk, m_step, k_step, fmt); + cvk_ml_shape_t tiled_R_shape = cvk->ops->ml_default_shape(cvk, k_step, n_step, fmt); + cvk_ml_shape_t tiled_Y_shape = cvk->ops->ml_default_shape(cvk, m_step, n_step, fmt); + cvk_tl_shape_t result_shape = {tiled_Y_shape.n, tiled_Y_shape.c, 1, 4 * tiled_Y_shape.w}; + + total_size += cvk->ops->lmem_matrix_to_size(cvk, tiled_L_shape, fmt, 1); + total_size += cvk->ops->lmem_matrix_to_size(cvk, tiled_R_shape, fmt, 1); + total_size += cvk->ops->lmem_ps32_matrix_to_size(cvk, tiled_Y_shape, fmt, 1); + total_size += cvk->ops->lmem_tensor_to_size(cvk, result_shape, fmt, 1); + + if (total_size < cvk->info.lmem_size) { + goto start; + } + } + } + } + +start: + assert(m_step > 0 && k_step > 0 && n_step > 0); + // printf("split: m:%d, k:%d, n:%d\n", (int)m_step, (int)k_step, (int)n_step); + assert(k_step >= k && "m or n is too bigger"); + + cvk_ml_t tiled_L; + cvk_ml_t tiled_R; + cvk_ml_t tiled_Y; + + cvk_mg_stride_t mg_stride; + + for (uint32_t k_pos = 0; k_pos < k; k_pos += k_step) { + uint32_t cur_k = std::min(k - k_pos, k_step); + assert(cur_k == k_step); + uint32_t ps32_mode = 2; + + for (uint32_t m_pos = 0; m_pos < m; m_pos += m_step) { + uint32_t cur_m = std::min(m - m_pos, m_step); + + uint32_t lmem_address = 0; + tiled_L.start_address = lmem_address; + tiled_L.fmt = fmt; + tiled_L.shape = cvk->ops->ml_default_shape(cvk, cur_m, cur_k, fmt); + tiled_L.stride = cvk->ops->ml_default_stride(cvk, tiled_L.shape, fmt, 1); + lmem_address += cvk->ops->lmem_matrix_to_size(cvk, tiled_L.shape, fmt, 1); + + mg_stride.row = k; + tdma_load_stride(cvk, &tiled_L, l_ga + m_pos * k + k_pos, mg_stride, 2); + + for (uint32_t n_pos = 0; n_pos < n; n_pos += n_step) { + uint32_t cur_n = std::min(n - n_pos, n_step); + + // std::cout << "tiled L :(" << cur_m << "x" << cur_k << "), " + // << "tiled R :(" << cur_k << "x" << cur_n << ")\n"; + + uint32_t lmem_address_1 = lmem_address; + tiled_R.start_address = lmem_address_1; + tiled_R.fmt = fmt; + tiled_R.shape = cvk->ops->ml_default_shape(cvk, cur_k, cur_n, fmt); + tiled_R.stride = cvk->ops->ml_default_stride(cvk, tiled_R.shape, fmt, 1); + lmem_address_1 += cvk->ops->lmem_matrix_to_size(cvk, tiled_R.shape, fmt, 1); + + mg_stride.row = n; + tdma_load_stride(cvk, &tiled_R, r_ga + k_pos * n + n_pos, mg_stride, 3); + + tiled_Y.start_address = lmem_address_1; + tiled_Y.fmt = fmt; + tiled_Y.shape = cvk->ops->ml_default_shape(cvk, cur_m, cur_n, fmt); + tiled_Y.stride = cvk->ops->ml_default_stride(cvk, tiled_Y.shape, fmt, 1); + + cvk_tiu_matrix_multiplication_param_t p; + p.res = &tiled_Y; + p.left = &tiled_L; + p.right = &tiled_R; + p.bias = nullptr; + p.lshift_bits = 0; + 
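+        // ps32_mode is set to 2 below: the TIU keeps full 32-bit partial sums
+        // in local memory (res_is_int8 = 0); convert_result_and_store() then
+        // interleaves the four byte planes of each sum before writing the
+        // int32 results out to DDR.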
p.rshift_bits = 0; + p.res_is_int8 = 0; + p.add_result = 0; + p.relu_enable = 0; + p.ps32_mode = ps32_mode; + p.layer_id = 0; + + cvk->ops->tiu_matrix_multiplication(cvk, &p); + + convert_result_and_store(cvk, &tiled_Y, y_ga + (m_pos * n + n_pos) * sizeof(int), n); + } + } + } + + CVI_RT_MEM cmdbuf_mem; + uint32_t size; + auto cmdbuf = cvk->ops->acquire_cmdbuf(cvk, &size); + int ret = CVI_RT_LoadCmdbuf(ctx, cmdbuf, size, 0, 0, false, &cmdbuf_mem); + assert(ret == 0); + cvk->ops->reset(cvk); + return cmdbuf_mem; +} + + +} +} \ No newline at end of file diff --git a/cviruntime/src/common/kernel_function/tdmaCopy.cpp b/cviruntime/src/common/kernel_function/tdmaCopy.cpp new file mode 100644 index 000000000..d1d2fdddb --- /dev/null +++ b/cviruntime/src/common/kernel_function/tdmaCopy.cpp @@ -0,0 +1,85 @@ +#include +#include +#include +#include +#include +#include +#include + +namespace cvi { +namespace runtime { + +static CVI_RT_MEM runtimeJitCompile(CVI_RT_HANDLE ctx, void *cvk) { + uint32_t sz; + CVI_RT_MEM cmdbuf_mem; + auto cvkernel = (cvk_context_t *)cvk; + auto cmdbuf = cvkernel->ops->acquire_cmdbuf(cvkernel, &sz); + int ret = CVI_RT_LoadCmdbuf(ctx, cmdbuf, sz, 0, 0, false, &cmdbuf_mem); + cvkernel->ops->reset(cvkernel); + if (ret != 0) { + TPU_LOG_WARNING("runtimeJitCompile failed ret[%d]\n", ret); + return nullptr; + } + return cmdbuf_mem; +} + +CVI_RC runtimeExecuteKernelFunction(CVI_RT_HANDLE ctx, CVI_RT_MEM codeBuf, + uint64_t gaddrSrc, uint64_t gaddrDst) { + //TPU_LOG_DEBUG("runtimeExecuteKernelFunction, src:%" PRIu64 " dst:%" PRIu64 "\n", gaddrSrc, gaddrDst); + CVI_RC ret = CVI_RT_RunCmdbuf(ctx, codeBuf, gaddrSrc, gaddrDst); + if (ret != 0) { + TPU_LOG_WARNING("runtimeExecuteKernelFunction failed ret[%d]\n", ret); + } + return ret; +} + +CVI_RT_MEM runtimeJitTdmaStrideCopy(CVI_RT_HANDLE ctx, void *cvk, CVI_FMT fmt, + cvk_tg_shape_t *shapeDst, + cvk_tg_stride_t *strideDst, + cvk_tg_shape_t *shapeSrc, + cvk_tg_stride_t *strideSrc) { + auto cvkernel = (cvk_context_t *)cvk; + // programing with cvikernel intrinsic functions + cvk_fmt_t cvk_fmt; + if (fmt == CVI_FMT_INT8) { + cvk_fmt = CVK_FMT_I8; + } else if (fmt == CVI_FMT_UINT8) { + cvk_fmt = CVK_FMT_U8; + } else { + cvk_fmt = CVK_FMT_BF16; + } + + cvk_tg_t src; + src.base_reg_index = 2; + src.start_address = 0; + src.shape = *shapeSrc; + if (strideSrc) { + src.stride = *strideSrc; + } else { + src.stride = cvkernel->ops->tg_default_stride(cvkernel, src.shape, cvk_fmt); + } + src.fmt = CVK_FMT_I8; + + cvk_tg_t dst; + dst.base_reg_index = 3; + dst.start_address = 0; + dst.shape = *shapeDst; + if (strideDst) { + dst.stride = *strideDst; + } else { + dst.stride = cvkernel->ops->tg_default_stride(cvkernel, dst.shape, cvk_fmt); + } + dst.fmt = CVK_FMT_I8; + + cvk_tdma_g2g_tensor_copy_param_t p; + p.src = &src; + p.dst = &dst; + + cvkernel->ops->tdma_g2g_tensor_copy(cvkernel, &p); + + // do jit complilation + return runtimeJitCompile(ctx, cvk); +} + +} // namespace runtime +} // namespace cvi \ No newline at end of file diff --git a/cviruntime/src/common/mmpool.cpp b/cviruntime/src/common/mmpool.cpp new file mode 100644 index 000000000..8c994cba8 --- /dev/null +++ b/cviruntime/src/common/mmpool.cpp @@ -0,0 +1,917 @@ +#include "mmpool.h" + +//#define MEM_POOL_DEBUG + +#ifdef MEM_POOL_NAIVE +void mem_pool_cleanup(mem_pool_t *pool) +{ + pool->head_addr = 0; + pool->head_slot = 0; + pool->slot_used = 0; + for ( int i = 0; i < MEM_POOL_SLOT_NUM; i++ ) { + pool->slot[i] = MEM_POOL_ADDR_INVALID; + } +} + +u64 mem_pool_alloc(mem_pool_t *pool, u64 
size) +{ + //assert(size % GLOBAL_MEM_ALIGN_SIZE == 0); // FIXME(wwcai): support 1byte align + POOL_LOCK(pool); + /* single direction, no looking up for empty slot */ + int cur_slot = pool->head_slot; + if (cur_slot >= MEM_POOL_SLOT_NUM) { + assert(0); // no error handling yet + return MEM_POOL_ADDR_INVALID; + } + + u64 cur_addr = pool->head_addr; + if (cur_addr + size > pool->total_size) { + assert(0); // no error handling yet + return MEM_POOL_ADDR_INVALID; + } + + pool->slot[cur_slot] = cur_addr; + pool->head_addr += size; + pool->head_slot++; + pool->slot_used++; + + POOL_UNLOCK(pool); +#ifdef MEM_POOL_DEBUG + printf("mem_pool: alloc addr 0x%16x on slot %d\n", cur_addr, cur_slot); +#endif + return cur_addr; +} + +void mem_pool_free(mem_pool_t *pool, u64 addr) +{ + POOL_LOCK(pool); + /* lookup for addr for sanity checking */ + int cur_slot = MEM_POOL_SLOT_NUM; + for ( int i = 0; i < pool->head_slot; i++ ) { + if (pool->slot[i] == addr) { + cur_slot = i; + break; + } + } + if (cur_slot == MEM_POOL_SLOT_NUM) { + printf("mem_pool: No matching slot for free, addr = 0x%16llx\n", addr); + assert(0); + } + pool->slot[cur_slot] = MEM_POOL_ADDR_INVALID; + +#ifdef MEM_POOL_DEBUG + printf("mem_pool: free addr 0x%16x on slot %d\n", addr, cur_slot); +#endif + + assert(pool->slot_used); + pool->slot_used--; + if ( pool->slot_used == 0 ) { + mem_pool_cleanup(pool); +#ifdef MEM_POOL_DEBUG + printf("mem_pool: cleanup\n"); +#endif + } + POOL_UNLOCK(pool); +} + +void mem_pool_create(mem_pool_t **pool, u64 total_size) +{ + mem_pool_t *tpool = new mem_pool_t; + POOL_LOCK_INIT(tpool); + + tpool->total_size = total_size; + mem_pool_cleanup(tpool); + + *pool = tpool; +} + +void mem_pool_destroy(mem_pool_t *pool) +{ + /* sanity checking */ + assert(pool->slot_used == 0); + POOL_LOCK_DEINIT(pool); + + delete pool; +} +#endif /* MEM_POOL_NAIVE */ + +#ifdef MEM_POOL_ZEPHRE + +/* Auto-Defrag settings */ + +#define AD_NONE 0 +#define AD_BEFORE_SEARCH4BIGGERBLOCK 1 +#define AD_AFTER_SEARCH4BIGGERBLOCK 2 + +#define AUTODEFRAG AD_AFTER_SEARCH4BIGGERBLOCK + +#define OCTET_TO_SIZEOFUNIT(X) (X) + +#define MAX_BLOCK_SIZE (1024 * 1024 * 1024) +#define MIN_BLOCK_SIZE (4 * 1024) + +static void _k_mem_pool_init_pre(struct pool_struct *P, u64 total_size) +{ + assert(total_size % MAX_BLOCK_SIZE == 0); + + P->maxblock_size = MAX_BLOCK_SIZE; + P->minblock_size = MIN_BLOCK_SIZE; + P->nr_of_maxblocks = total_size/P->maxblock_size; + P->nr_of_block_sets = 0; + P->bufblock = (pool_addr_t)0; + + /* deternmine nr_of_block_sets */ + int nr_of_block_sets = 1; + int num = MAX_BLOCK_SIZE/MIN_BLOCK_SIZE; + while(num >>= 2) { + nr_of_block_sets++; + } + P->block_set = malloc(nr_of_block_sets * sizeof(struct pool_block_set)); + assert(P->block_set); + +#ifdef MEM_POOL_DEBUG + printf("mem_pool: total_size %lld, max %d, min %d, nr_max %d\n", + total_size, P->maxblock_size, P->minblock_size, P->nr_of_maxblocks); +#endif + + int block_size = MAX_BLOCK_SIZE; + int nr_of_entries = (P->nr_of_maxblocks + 3) / 4; + int i = 0; + + while (block_size >= P->minblock_size) { +#ifdef MEM_POOL_DEBUG + printf("mem_pool: alloc block_set for size %d, nr %d\n", + block_size, nr_of_entries); +#endif + P->block_set[i].block_size = block_size; + P->block_set[i].nr_of_entries = nr_of_entries; + P->block_set[i].quad_block = malloc(nr_of_entries * sizeof(struct pool_quad_block)); + assert(P->block_set[i].quad_block); + for (int t = 0; t < nr_of_entries; t++) { + P->block_set[i].quad_block[t].mem_blocks = MEM_POOL_ADDR_INVALID; + P->block_set[i].quad_block[t].mem_status 
= 0; + } +#ifdef CONFIG_OBJECT_MONITOR + P->block_set[i].count = 0; +#endif + i++; + P->nr_of_block_sets++; + block_size = block_size >> 2; + nr_of_entries = nr_of_entries << 2; + } + +#ifdef MEM_POOL_DEBUG + printf("mem_pool: init %d block_set\n", nr_of_block_sets); +#endif + assert(nr_of_block_sets == P->nr_of_block_sets); +} + +static void _k_mem_pool_exit_post(struct pool_struct *P) +{ + for (int i = 0; i < P->nr_of_block_sets; i++) { + free(P->block_set[i].quad_block); + } + free(P->block_set); +} + +static void _k_mem_pool_init(mem_pool_t *pool) +{ + struct pool_struct *P; + int i; + + /* perform initialization for each memory pool */ + + for (i = 0, P = pool->_k_mem_pool_list; i < pool->_k_mem_pool_count; i++, P++) { + + /* + * mark block set for largest block size + * as owning all of the memory pool buffer space + */ + + int remaining = P->nr_of_maxblocks; + int t = 0; + pool_addr_t memptr = P->bufblock; + +#ifdef MEM_POOL_DEBUG + printf("mem_pool: remaining %d\n", remaining); +#endif + + while (remaining >= 4) { + P->block_set[0].quad_block[t].mem_blocks = memptr; + P->block_set[0].quad_block[t].mem_status = 0xF; + t++; + remaining = remaining - 4; + memptr += + OCTET_TO_SIZEOFUNIT(P->block_set[0].block_size) + * 4; + } + + if (remaining != 0) { + P->block_set[0].quad_block[t].mem_blocks = memptr; + P->block_set[0].quad_block[t].mem_status = + 0xF >> (4 - remaining); + /* non-existent blocks are marked as unavailable */ + } + + /* + * note: all other block sets own no blocks, since their + * first quad-block has a NULL memory pointer + */ +#ifdef MEM_POOL_DEBUG + printf("mem_pool: remaining %d, t = %d\n", remaining, t); +#endif + } +} + +static u64 compute_block_set_index(struct pool_struct *P, u32 data_size) +{ + u32 block_size = P->minblock_size; + u32 offset = P->nr_of_block_sets - 1; + + while (data_size > block_size) { + block_size = block_size << 2; + offset--; + } + + return offset; +} + +static void free_existing_block(pool_addr_t ptr, struct pool_struct *P, int index) +{ + struct pool_quad_block *quad_block = P->block_set[index].quad_block; + pool_addr_t block_ptr; + int i, j; + + /* + * search block set's quad-blocks until the block is located, + * then mark it as unused + * + * note: block *must* exist, so no need to do array bounds checking + */ + + for (i = 0; ; i++) { + assert((i < P->block_set[index].nr_of_entries) && + (quad_block[i].mem_blocks != MEM_POOL_ADDR_INVALID)); + + block_ptr = quad_block[i].mem_blocks; + for (j = 0; j < 4; j++) { + if (ptr == block_ptr) { + quad_block[i].mem_status |= (1 << j); + return; + } + block_ptr += OCTET_TO_SIZEOFUNIT( + P->block_set[index].block_size); + } + } +} + +static void defrag(struct pool_struct *P, + int ifraglevel_start, int ifraglevel_stop) +{ + struct pool_quad_block *quad_block; + + /* process block sets from smallest to largest permitted sizes */ + + for (int j = ifraglevel_start; j > ifraglevel_stop; j--) { + + quad_block = P->block_set[j].quad_block; + int i = 0; + + do { + /* block set is done if no more quad-blocks exist */ + + if (quad_block[i].mem_blocks == MEM_POOL_ADDR_INVALID) { + break; + } + + /* reassemble current quad-block, if possible */ + + if (quad_block[i].mem_status == 0xF) { + + /* + * mark the corresponding block in next larger + * block set as free + */ + + free_existing_block( + quad_block[i].mem_blocks, P, j - 1); + + /* + * delete the quad-block from this block set + * by replacing it with the last quad-block + * + * (algorithm works even when the deleted + * quad-block is the last 
quad_block) + */ + + int k = i; + while (((k+1) != P->block_set[j].nr_of_entries) + && + (quad_block[k+1].mem_blocks != MEM_POOL_ADDR_INVALID)) { + k++; + } + + quad_block[i].mem_blocks = + quad_block[k].mem_blocks; + quad_block[i].mem_status = + quad_block[k].mem_status; + + quad_block[k].mem_blocks = MEM_POOL_ADDR_INVALID; + + /* loop & process replacement quad_block[i] */ + } else { + i++; + } + + /* block set is done if at end of quad-block array */ + + } while (i < P->block_set[j].nr_of_entries); + } +} + +#ifdef __arm__ +static inline unsigned int find_lsb_set(u32 op) +{ + unsigned int bit; + + __asm__ volatile( + "rsb %0, %1, #0;\n\t" + "ands %0, %0, %1;\n\t" /* r0 = x & (-x): only LSB set */ + "itt ne;\n\t" + " clzne %0, %0;\n\t" /* count leading zeroes */ + " rsbne %0, %0, #32;\n\t" + : "=&r"(bit) + : "r"(op)); + + return bit; +} +#endif /* __arm__ */ +#ifdef __x86_64__ +static inline unsigned int find_lsb_set(u32 op) +{ + unsigned int bitpos; + + __asm__ volatile ( + +#if defined(CONFIG_CMOV) + + "bsfl %1, %0;\n\t" + "cmovzl %2, %0;\n\t" + : "=r" (bitpos) + : "rm" (op), "r" (-1) + : "cc" + +#else + + "bsfl %1, %0;\n\t" + "jnz 1f;\n\t" + "movl $-1, %0;\n\t" + "1:\n\t" + : "=r" (bitpos) + : "rm" (op) + : "cc" + +#endif /* CONFIG_CMOV */ + ); + + return (bitpos + 1); +} +#endif /* __x86_64__ */ + +static pool_addr_t get_existing_block(struct pool_block_set *pfraglevelinfo, + int *piblockindex) +{ + pool_addr_t found = MEM_POOL_ADDR_INVALID; + int i = 0; + int free_bit; + + do { + /* give up if no more quad-blocks exist */ + + if (pfraglevelinfo->quad_block[i].mem_blocks == MEM_POOL_ADDR_INVALID) { + break; + } + + /* allocate a block from current quad-block, if possible */ + + int status = pfraglevelinfo->quad_block[i].mem_status; + if (status != 0x0) { + /* identify first free block */ + free_bit = find_lsb_set(status) - 1; + + /* compute address of free block */ + found = (u64)pfraglevelinfo->quad_block[i].mem_blocks + + (u64)(OCTET_TO_SIZEOFUNIT(free_bit * (u64)pfraglevelinfo->block_size)); + + /* mark block as unavailable (using XOR to invert) */ + pfraglevelinfo->quad_block[i].mem_status ^= + 1 << free_bit; +#ifdef CONFIG_OBJECT_MONITOR + pfraglevelinfo->count++; +#endif + break; + } + + /* move on to next quad-block; give up if at end of array */ + + } while (++i < pfraglevelinfo->nr_of_entries); + + *piblockindex = i; + return found; +} + +static pool_addr_t get_block_recursive(struct pool_struct *P, + int index, int startindex) +{ + int i; + pool_addr_t found, larger_block; + struct pool_block_set *fr_table; + + /* give up if we've exhausted the set of maximum size blocks */ + + if (index < 0) { + return MEM_POOL_ADDR_INVALID; + } + + /* try allocating a block from the current block set */ + + fr_table = P->block_set; + i = 0; + + found = get_existing_block(&(fr_table[index]), &i); + if (found != MEM_POOL_ADDR_INVALID) { + return found; + } + +#if AUTODEFRAG == AD_BEFORE_SEARCH4BIGGERBLOCK + /* + * do a partial defragmentation of memory pool & try allocating again + * - do this on initial invocation only, not recursive ones + * (since there is no benefit in repeating the defrag) + * - defrag only the blocks smaller than the desired size, + * and only until the size needed is reached + * + * note: defragging at this time tries to preserve the memory pool's + * larger blocks by fragmenting them only when necessary + * (i.e. 
at the cost of doing more frequent auto-defragmentations) + */ + + if (index == startindex) { + defrag(P, P->nr_of_block_sets - 1, startindex); + found = get_existing_block(&(fr_table[index]), &i); + if (found != MEM_POOL_ADDR_INVALID) { + return found; + } + } +#endif + + /* try allocating a block from the next largest block set */ + + larger_block = get_block_recursive(P, index - 1, startindex); + if (larger_block != MEM_POOL_ADDR_INVALID) { + /* + * add a new quad-block to the current block set, + * then mark one of its 4 blocks as used and return it + * + * note: "i" was earlier set to indicate the first unused + * quad-block entry in the current block set + */ + + fr_table[index].quad_block[i].mem_blocks = larger_block; + fr_table[index].quad_block[i].mem_status = 0xE; +#ifdef CONFIG_OBJECT_MONITOR + fr_table[index].count++; +#endif + return larger_block; + } + +#if AUTODEFRAG == AD_AFTER_SEARCH4BIGGERBLOCK + /* + * do a partial defragmentation of memory pool & try allocating again + * - do this on initial invocation only, not recursive ones + * (since there is no benefit in repeating the defrag) + * - defrag only the blocks smaller than the desired size, + * and only until the size needed is reached + * + * note: defragging at this time tries to limit the cost of doing + * auto-defragmentations by doing them only when necessary + * (i.e. at the cost of fragmenting the memory pool's larger blocks) + */ + + if (index == startindex) { + defrag(P, P->nr_of_block_sets - 1, startindex); + found = get_existing_block(&(fr_table[index]), &i); + if (found != MEM_POOL_ADDR_INVALID) { + return found; + } + } +#endif + + return MEM_POOL_ADDR_INVALID; /* can't find (or create) desired block */ +} + +/* FIXME: need do sort for performance */ +static int find_slot(mem_pool_t *pool, u64 addr) +{ + for (int i = 0; i < MEM_POOL_SLOT_NUM; i++) { + if (pool->slot_addr[i] == addr) + return i; + } + /* failed to find */ + assert(0); +} + +static int find_empty_slot(mem_pool_t *pool) +{ + for (int i = 0; i < MEM_POOL_SLOT_NUM; i++) { + if (pool->slot_size[i] == 0) { + assert(pool->slot_addr[i] == MEM_POOL_ADDR_INVALID); + return i; + } + } + /* failed to find empty slot */ + assert(0); +} + +static void take_slot(mem_pool_t *pool, int i, u64 addr, u32 size) +{ + assert(i >= 0 && i < MEM_POOL_SLOT_NUM); + pool->slot_used++; + pool->slot_addr[i] = addr; + pool->slot_size[i] = size; + assert(pool->slot_used <= MEM_POOL_SLOT_NUM); +} + +static void give_slot(mem_pool_t *pool, int i) +{ + assert(i >= 0 && i < MEM_POOL_SLOT_NUM); + assert(pool->slot_used > 0); + pool->slot_addr[i] = MEM_POOL_ADDR_INVALID; + pool->slot_size[i] = 0; + pool->slot_used--; +} + +u64 mem_pool_alloc(mem_pool_t *pool, u64 size) +{ + //assert(size % GLOBAL_MEM_ALIGN_SIZE == 0); // FIXME(wwcai): support 1byte align + POOL_LOCK(pool); + + struct pool_struct *P = &pool->_k_mem_pool_list[0]; + pool_addr_t found_block; + int offset; + +#ifdef MEM_POOL_DEBUG + printf("mem_pool: alloc req_size %lld\n", size); +#endif + assert(size <= P->maxblock_size); + /* locate block set to try allocating from */ + + offset = compute_block_set_index(P, size); + + /* allocate block (fragmenting a larger block, if needed) */ + + found_block = get_block_recursive(P, offset, offset); + + if (found_block == MEM_POOL_ADDR_INVALID) { + return MEM_POOL_ADDR_INVALID; + } + +#ifdef MEM_POOL_DEBUG + printf("mem_pool: alloc addr 0x%llx size %lld\n", found_block, size); +#endif + + int slot = find_empty_slot(pool); + take_slot(pool, slot, (u64)found_block, size); + + 
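+  /*
+   * Worked example (annotation added for clarity; the numbers follow the
+   * MAX_BLOCK_SIZE/MIN_BLOCK_SIZE defaults above): with 1 GiB max blocks
+   * and 4 KiB min blocks there are 10 block sets, sized 1G, 256M, 64M,
+   * 16M, 4M, 1M, 256K, 64K, 16K and 4K, each a quarter of the previous.
+   * For a 100 KiB request, compute_block_set_index() walks up from 4 KiB
+   * (4K -> 16K -> 64K -> 256K), stopping at the first size >= 100 KiB, so
+   * the request is served from the 256 KiB set (index 6), splitting larger
+   * blocks on demand via get_block_recursive().
+   */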
POOL_UNLOCK(pool);
+  return (u64)found_block;
+}
+
+void mem_pool_free(mem_pool_t *pool, u64 addr)
+{
+  POOL_LOCK(pool);
+
+  struct pool_struct *P = &pool->_k_mem_pool_list[0];
+  u64 offset;
+
+  int slot = find_slot(pool, addr);
+  int size = pool->slot_size[slot];
+
+#ifdef MEM_POOL_DEBUG
+  printf("mem_pool: free addr 0x%llx, found size %d\n", addr, size);
+#endif
+  /* determine block set that block belongs to */
+
+  offset = compute_block_set_index(P, size);
+
+  /* mark the block as unused */
+
+  free_existing_block(addr, P, offset);
+
+  give_slot(pool, slot);
+
+  POOL_UNLOCK(pool);
+}
+
+void mem_pool_create(mem_pool_t **pool, u64 total_size)
+{
+  mem_pool_t *tpool = new mem_pool_t;
+  POOL_LOCK_INIT(tpool);
+  tpool->total_size = total_size;
+
+  struct pool_struct *P = &tpool->_k_mem_pool_list[0];
+  _k_mem_pool_init_pre(P, total_size);
+  tpool->_k_mem_pool_count = 1;
+  _k_mem_pool_init(tpool);
+
+  tpool->slot_used = 0;
+  for (int i = 0; i < MEM_POOL_SLOT_NUM; i++) {
+    tpool->slot_addr[i] = MEM_POOL_ADDR_INVALID;
+    tpool->slot_size[i] = 0;
+  }
+
+  *pool = tpool;
+
+#ifdef MEM_POOL_DEBUG
+  printf("mem_pool: create\n");
+#endif
+}
+
+void mem_pool_destroy(mem_pool_t *pool)
+{
+#ifdef MEM_POOL_DEBUG
+  printf("mem_pool: destroy\n");
+#endif
+  POOL_LOCK(pool);
+
+  /* sanity checking */
+  if (pool->slot_used) {
+    printf("mem_pool: destroy pool with %d left\n", pool->slot_used);
+    for (int i = 0; i < MEM_POOL_SLOT_NUM; i++) {
+      if (pool->slot_size[i] != 0) {
+        printf("mem_pool: slot %d in use, size %d\n", i, pool->slot_size[i]);
+      }
+    }
+  }
+  assert(pool->slot_used == 0);
+
+  struct pool_struct *P = &pool->_k_mem_pool_list[0];
+  _k_mem_pool_exit_post(P);
+
+  POOL_UNLOCK(pool);
+  POOL_LOCK_DEINIT(pool);
+
+  delete pool;
+}
+#endif /* MEM_POOL_ZEPHRE */
+
+#ifdef MEM_POOL_NAIVE_PLUS
+void mem_pool_init(mem_pool_t *pool)
+{
+  struct pool_struct *P = &pool->_mem_pool_list[0];
+  P->slot_avail.clear();
+  P->slot_in_use.clear();
+  P->slot_avail.push_back(make_pair(0, pool->total_size));
+}
+
+/* look for the smallest yet sufficient memory slot for the demanded size */
+static pool_addr_t find_slot(struct pool_struct *P, pool_size_t size){
+  assert(size % MIN_SLOT_SIZE == 0);
+
+  vector<pool_pair_t>::iterator it, it_min = P->slot_avail.end();
+  pool_size_t min_size_among_sufficient = 0;
+  for(it = P->slot_avail.begin(); it != P->slot_avail.end(); ++it){
+    if((*it).second >= size){
+      if(min_size_among_sufficient == 0 || (*it).second < min_size_among_sufficient){
+        it_min = it;
+        min_size_among_sufficient = (*it).second;
+      }
+    }
+  }
+
+  if(it_min == P->slot_avail.end() || (*it_min).second == 0){
+    printf("Memory exhausted: cannot find a slot.\n");
+    return MEM_POOL_ADDR_INVALID;
+  }
+  pool_addr_t addr = (*it_min).first;
+
+  if((*it_min).second == size){
+    P->slot_avail.erase(it_min);
+  } else {
+    (*it_min).first = addr + size;
+    (*it_min).second -= size;
+  }
+
+  P->slot_in_use.insert(make_pair(addr, size));
+
+  return addr;
+}
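The best-fit policy that find_slot() implements is easiest to see on a toy free list. Below is a minimal, self-contained sketch of the same idea (names and the UINT64_MAX sentinel are illustrative, not part of the patch; the real code returns MEM_POOL_ADDR_INVALID instead):

#include <cstdint>
#include <utility>
#include <vector>

// Toy free list of (offset, size) pairs, mirroring slot_avail above.
using Slot = std::pair<uint64_t, uint64_t>;

// Best-fit: pick the smallest free slot that still satisfies the request,
// then either consume it whole or carve the allocation from its front.
static uint64_t best_fit(std::vector<Slot> &avail, uint64_t size) {
  size_t best = avail.size();
  for (size_t i = 0; i < avail.size(); ++i) {
    if (avail[i].second >= size &&
        (best == avail.size() || avail[i].second < avail[best].second))
      best = i;
  }
  if (best == avail.size())
    return UINT64_MAX;              // exhausted, like MEM_POOL_ADDR_INVALID
  uint64_t addr = avail[best].first;
  if (avail[best].second == size) {
    avail.erase(avail.begin() + best);
  } else {
    avail[best].first += size;      // shrink the slot from the front
    avail[best].second -= size;
  }
  return addr;
}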
+pool_addr_t mem_pool_alloc(mem_pool_t *pool, pool_size_t size)
+{
+  //assert(size % GLOBAL_MEM_ALIGN_SIZE == 0); // FIXME(wwcai): support 1byte align
+  POOL_LOCK(pool);
+
+  struct pool_struct *P = &pool->_mem_pool_list[0];
+  assert(P->num_slots_in_use < MEM_POOL_SLOT_NUM);
+  pool_size_t size_to_alloc = (size + MIN_SLOT_SIZE -1) / MIN_SLOT_SIZE * MIN_SLOT_SIZE;
+  pool_addr_t addr_to_alloc = find_slot(P, size_to_alloc);
+
+  if (addr_to_alloc == MEM_POOL_ADDR_INVALID) {
+#ifdef MEM_POOL_DEBUG
+    printf("mem_pool: mem alloc failed in searching stage\n");
+#endif
+    assert(0); // no error handling yet
+    return MEM_POOL_ADDR_INVALID;
+  }
+  else if (addr_to_alloc + size_to_alloc > pool->total_size) {
+#ifdef MEM_POOL_DEBUG
+    printf("mem_pool: mem alloc insufficient size\n");
+#endif
+    assert(0); // no error handling yet
+    return MEM_POOL_ADDR_INVALID;
+  }
+
+  P->num_slots_in_use++;
+#ifdef MEM_POOL_DEBUG
+  printf("mem_pool: alloc addr 0x%lx with size of %ld; actual size required = %ld\n",
+      addr_to_alloc, size_to_alloc, size);
+#endif
+
+  POOL_UNLOCK(pool);
+  return addr_to_alloc;
+}
+
+void mem_pool_free(mem_pool_t *pool, pool_addr_t addr_to_free)
+{
+  POOL_LOCK(pool);
+
+  struct pool_struct *P = &pool->_mem_pool_list[0];
+  pool_map_t::iterator it = P->slot_in_use.find(addr_to_free);
+  assert(it != P->slot_in_use.end());
+  pool_size_t size_to_free = P->slot_in_use[addr_to_free];
+  assert(size_to_free % MIN_SLOT_SIZE == 0);
+  P->slot_in_use.erase(it);
+  P->num_slots_in_use--;
+
+  pool_addr_t addr_next = addr_to_free + size_to_free;
+  vector<pool_pair_t>::iterator it_prev_slot = find_if(P->slot_avail.begin(),
+      P->slot_avail.end(), offset_prev_finder(addr_to_free));
+  vector<pool_pair_t>::iterator it_next_slot = find_if(P->slot_avail.begin(),
+      P->slot_avail.end(), offset_next_finder(addr_next));
+
+  if(it_prev_slot == P->slot_avail.end() && it_next_slot == P->slot_avail.end()){
+    P->slot_avail.push_back(make_pair(addr_to_free, size_to_free));
+  }
+  else if(it_prev_slot == P->slot_avail.end()){
+    (*it_next_slot).first = addr_to_free;
+    (*it_next_slot).second += size_to_free;
+#ifdef DEBUG
+    CHECK_FREED_MEM((*it_next_slot).first, (*it_next_slot).second, P);
+#endif
+  }
+  else if(it_next_slot == P->slot_avail.end()){
+    (*it_prev_slot).second += size_to_free;
+#ifdef DEBUG
+    CHECK_FREED_MEM((*it_prev_slot).first, (*it_prev_slot).second, P);
+#endif
+  } else {
+    (*it_prev_slot).second += (size_to_free + (*it_next_slot).second);
+    P->slot_avail.erase(it_next_slot);
+  }
+
+#ifdef MEM_POOL_DEBUG
+  printf("mem_pool_free: addr_to_free = 0x%lx; size_to_free = %ld\n",
+      addr_to_free, size_to_free);
+#endif
+
+  POOL_UNLOCK(pool);
+}
+
+void mem_pool_create(mem_pool_t **pool, u64 total_size)
+{
+  mem_pool_t *tpool = new mem_pool_t;
+  POOL_LOCK_INIT(tpool);
+
+  tpool->total_size = total_size;
+  mem_pool_init(tpool);
+  tpool->_mem_pool_count = 1;
+  struct pool_struct *P = &tpool->_mem_pool_list[0];
+  P->num_slots_in_use = 0;
+
+  *pool = tpool;
+
+#ifdef MEM_POOL_DEBUG
+  printf("mem_pool: create\n");
+#endif
+}
+
+void mem_pool_destroy(mem_pool_t *pool)
+{
+  POOL_LOCK(pool);
+
+  /* sanity checking */
+  struct pool_struct *P = &pool->_mem_pool_list[0];
+  assert(P->slot_avail[0].first == 0);
+  assert(P->slot_avail[0].second == pool->total_size);
+  assert(P->slot_in_use.empty());
+  assert(P->num_slots_in_use == 0);
+
+  POOL_UNLOCK(pool);
+  POOL_LOCK_DEINIT(pool);
+
+  delete pool;
+}
+
+static bool slot_in_bank(pool_addr_t addr, pool_size_t size)
+{
+  pool_addr_t aligned_addr = addr % BANK_SIZE;
+  return ((aligned_addr + size) <= BANK_SIZE);
+}
+
+static pool_addr_t find_slot_in_bank(struct pool_struct *P, pool_size_t size_to_alloc)
+{
+  assert(size_to_alloc % MIN_SLOT_SIZE == 0);
+
+  vector<pool_pair_t>::iterator it, it_find = P->slot_avail.end();
+  int find_case = 0;
+  for (it = P->slot_avail.begin(); it != P->slot_avail.end(); ++it) {
+    if ((*it).second >= size_to_alloc) {
+      pool_addr_t slot_addr = (*it).first;
+      pool_size_t slot_size = (*it).second;
+      if (slot_in_bank(slot_addr, size_to_alloc)) {
+        find_case = 1;
+      } else if(slot_in_bank(slot_addr + slot_size - size_to_alloc, size_to_alloc)) {
+        find_case = 2;
+      }
+      if (find_case > 0) {
it_find = it; + break; + } + } + } + + if (find_case == 0) { + printf("Memory exhausted: cannot find a slot.\n"); + return MEM_POOL_ADDR_INVALID; + } + + pool_addr_t addr; + if (find_case == 1) { + addr = (*it_find).first; + } else { + addr = (*it_find).first + (*it_find).second - size_to_alloc; + } + + if ((*it_find).second == size_to_alloc) { + P->slot_avail.erase(it_find); + } else { + if (find_case == 1) { + (*it_find).first += size_to_alloc; + (*it_find).second -= size_to_alloc; + } else { + (*it_find).second -= size_to_alloc; + } + } + + P->slot_in_use.insert(make_pair(addr, size_to_alloc)); + + return addr; +} + +pool_addr_t mem_pool_alloc_in_bank(mem_pool_t *pool, pool_size_t size) +{ + //assert(size % GLOBAL_MEM_ALIGN_SIZE == 0); // FIXME(wwcai): support 1byte align + POOL_LOCK(pool); + + struct pool_struct *P = &pool->_mem_pool_list[0]; + assert(P->num_slots_in_use < MEM_POOL_SLOT_NUM); + pool_size_t size_to_alloc = (size + MIN_SLOT_SIZE -1) / MIN_SLOT_SIZE * MIN_SLOT_SIZE; + pool_addr_t addr_to_alloc = find_slot_in_bank(P, size_to_alloc); + + if (addr_to_alloc == MEM_POOL_ADDR_INVALID) { +#ifdef MEM_POOL_DEBUG + printf("mem_pool: mem alloc failed in searching stage\n"); +#endif + assert(0); // no error handling yet + return MEM_POOL_ADDR_INVALID; + } + else if (addr_to_alloc + size_to_alloc > pool->total_size) { +#ifdef MEM_POOL_DEBUG + printf("mem_pool: mem alloc insufficient size\n"); +#endif + assert(0); // no error handling yet + return MEM_POOL_ADDR_INVALID; + } + + P->num_slots_in_use++; +#ifdef MEM_POOL_DEBUG + printf("mem_pool: alloc addr 0x%lx with size of %ld; actual size required = %ld\n", + addr_to_alloc, size_to_alloc, size); +#endif + + POOL_UNLOCK(pool); + return addr_to_alloc; +} + +#endif /* MEM_POOL_NAIVE_PLUS */ diff --git a/cviruntime/src/common/mmpool.h b/cviruntime/src/common/mmpool.h new file mode 100644 index 000000000..a8600dc28 --- /dev/null +++ b/cviruntime/src/common/mmpool.h @@ -0,0 +1,165 @@ +#ifndef BMDNN_MMPOOL_H_ +#define BMDNN_MMPOOL_H_ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#ifdef __linux__ +#define POOL_USE_PTHREAD +#include +#endif + +#include + +#define GLOBAL_MEM_ADDR_NULL (0x000000ffffffffffULL) + +using namespace std; + +//#define MEM_POOL_NAIVE +//#define MEM_POOL_ZEPHRE +#define MEM_POOL_NAIVE_PLUS + +#define MEM_POOL_ADDR_INVALID (GLOBAL_MEM_ADDR_NULL) +#define MEM_POOL_SLOT_NUM (2048 * 8) +#define BANK_SIZE (0x80000000) + +#ifdef MEM_POOL_NAIVE +#endif + +#ifdef MEM_POOL_ZEPHRE +typedef u64 pool_addr_t; +struct pool_quad_block { + pool_addr_t mem_blocks; + u32 mem_status; +}; + +struct pool_block_set { + int block_size; + int nr_of_entries; + struct pool_quad_block *quad_block; +#ifdef CONFIG_OBJECT_MONITOR + int count; +#endif +}; + +struct pool_struct { + int maxblock_size; + int minblock_size; + int nr_of_maxblocks; + int nr_of_block_sets; + struct pool_block_set *block_set; + pool_addr_t bufblock; +}; +#define MAX_POOL_COUNT (2) +#endif /* MEM_POOL_ZEPHRE */ + +#ifdef MEM_POOL_NAIVE_PLUS +#define MIN_SLOT_SIZE (4 * 1024) + +#define CHECK_FREED_MEM(offset, size, P) \ + do { \ + pool_map_t::iterator it; \ + for (it = P->slot_in_use.begin(); it != P->slot_in_use.end(); it++) { \ + if( ((*it).first <= offset && (*it).first + (*it).second > offset) || \ + ((*it).first < offset + size && (*it).first + (*it).second >= offset + size) ){ \ + printf("CHECK_FREED_MEM ERROR: attempting to free memory in use\n"); \ + assert(0); \ + } \ + } \ + } while(0); + +typedef u64 pool_addr_t; +typedef u64 
pool_size_t;
+typedef pair<pool_addr_t, pool_size_t> pool_pair_t;
+typedef map<pool_addr_t, pool_size_t> pool_map_t;
+
+class offset_prev_finder {
+public:
+  offset_prev_finder(pool_addr_t offset_prev) : offset_prev_(offset_prev) {}
+  bool operator () (const pool_pair_t &pair_){
+    return pair_.first + pair_.second == offset_prev_;
+  }
+private:
+  pool_addr_t offset_prev_;
+};
+
+class offset_next_finder {
+public:
+  offset_next_finder(pool_addr_t offset_next) : offset_next_(offset_next) {}
+  bool operator () (const pool_pair_t &pair_){
+    return pair_.first == offset_next_;
+  }
+private:
+  pool_addr_t offset_next_;
+};
+
+struct pool_struct {
+  int num_slots_in_use;
+  vector<pool_pair_t> slot_avail; // (offset, size)
+  pool_map_t slot_in_use; // offset -> size
+};
+#define MAX_POOL_COUNT (2)
+
+#endif /* MEM_POOL_NAIVE_PLUS */
+
+typedef struct mem_pool {
+  u64 total_size;
+#ifdef MEM_POOL_NAIVE
+  u64 head_addr;
+  u64 slot[MEM_POOL_SLOT_NUM];
+  int head_slot;
+  int slot_used;
+#endif /* MEM_POOL_NAIVE */
+#ifdef MEM_POOL_ZEPHRE
+  struct pool_struct _k_mem_pool_list[MAX_POOL_COUNT];
+  int _k_mem_pool_count;
+  /* managing allocated chunk */
+  u64 slot_addr[MEM_POOL_SLOT_NUM];
+  int slot_size[MEM_POOL_SLOT_NUM];
+  int slot_used;
+#endif /* MEM_POOL_ZEPHRE */
+#ifdef MEM_POOL_NAIVE_PLUS
+  struct pool_struct _mem_pool_list[MAX_POOL_COUNT];
+  int _mem_pool_count;
+  /* managing allocated chunk */
+  u64 slot_addr[MEM_POOL_SLOT_NUM];
+  int slot_size[MEM_POOL_SLOT_NUM];
+  int slot_used;
+#endif /* MEM_POOL_NAIVE_PLUS */
+#ifdef POOL_USE_PTHREAD
+  pthread_mutex_t lock;
+#define POOL_LOCK_INIT(pool) pthread_mutex_init(&pool->lock, NULL)
+#define POOL_LOCK_DEINIT(pool) pthread_mutex_destroy(&pool->lock)
+#define POOL_LOCK(pool) pthread_mutex_lock(&pool->lock)
+#define POOL_UNLOCK(pool) pthread_mutex_unlock(&pool->lock)
+#else
+#define POOL_LOCK_INIT(pool)
+#define POOL_LOCK_DEINIT(pool)
+#define POOL_LOCK(pool)
+#define POOL_UNLOCK(pool)
+#endif
+} mem_pool_t;
+
+#ifdef MEM_POOL_NAIVE_PLUS
+void mem_pool_cleanup(mem_pool_t *pool);
+pool_addr_t mem_pool_alloc(mem_pool_t *pool, pool_size_t size);
+void mem_pool_free(mem_pool_t *pool, pool_addr_t addr);
+void mem_pool_create(mem_pool_t **pool, u64 total_size);
+void mem_pool_destroy(mem_pool_t *pool);
+pool_addr_t mem_pool_alloc_in_bank(mem_pool_t *pool, pool_size_t size);
+#else
+void mem_pool_cleanup(mem_pool_t *pool);
+u64 mem_pool_alloc(mem_pool_t *pool, u64 size);
+void mem_pool_free(mem_pool_t *pool, u64 addr);
+void mem_pool_create(mem_pool_t **pool, u64 total_size);
+void mem_pool_destroy(mem_pool_t *pool);
+#endif
+
+#endif /* BMDNN_MMPOOL_H_ */
diff --git a/cviruntime/src/common/model.cpp b/cviruntime/src/common/model.cpp
new file mode 100644
index 000000000..b16b4da81
--- /dev/null
+++ b/cviruntime/src/common/model.cpp
@@ -0,0 +1,534 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef ENABLE_CPU_FUNC
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#endif
+
+#ifdef ENABLE_COMPRESS_CMDBUF
+#include "lz4.h"
+#endif
+
+namespace cvi {
+namespace runtime {
+
+std::string CviModel::targetChipType = "";
+
+CviModel::CviModel(CVI_RT_HANDLE ctx, int count)
+    : _ctx(ctx), ref(1), _count(count), _max_shared_mem_size(0) {
+  _pool = new TaskPool(1);
+  if (std::getenv("TPU_ENABLE_PROTECT")) {
+    isprotect = true;
+  }
+
+#ifdef ENABLE_CPU_FUNC
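+  // Annotation (added for clarity): this table maps cpu-op names serialized
+  // in a cvimodel to their factory open() functions; cpu routines are
+  // resolved by name against this list when a program is loaded.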
_cpu_functions.push_back(new CpuRuntimeFunction("deform_conv2d", DeformableConvFunc::open)); + _cpu_functions.push_back(new CpuRuntimeFunction("deform_im2col", DeformableIm2ColFunc::open)); + _cpu_functions.push_back(new CpuRuntimeFunction("instance_norm", InstanceNormFunc::open)); + _cpu_functions.push_back(new CpuRuntimeFunction("interp", InterpolationFunc::open)); + _cpu_functions.push_back(new CpuRuntimeFunction("softmax", SoftmaxFunc::open)); + _cpu_functions.push_back(new CpuRuntimeFunction("softmax_cpu", SoftmaxFunc::open)); + _cpu_functions.push_back(new CpuRuntimeFunction("quant", QuantFunc::open)); + _cpu_functions.push_back( + new CpuRuntimeFunction("retinaface_detection", RetinaFaceDetectionFunc::open)); + _cpu_functions.push_back(new CpuRuntimeFunction("preprocess", PreprocessFunc::open)); + _cpu_functions.push_back(new CpuRuntimeFunction("transpose", TransposeFunc::open)); + _cpu_functions.push_back( + new CpuRuntimeFunction("detectionoutput", SSDDetectionFunc::open)); + _cpu_functions.push_back( + new CpuRuntimeFunction("yolo_detection", YoloDetectionFunc::open)); + _cpu_functions.push_back( + new CpuRuntimeFunction("frcn_detection", FrcnDetectionFunc::open)); + _cpu_functions.push_back( + new CpuRuntimeFunction("pixelshuffle", PixelShuffleFunc::open)); + _cpu_functions.push_back(new CpuRuntimeFunction("proposal", ProposalFunc::open)); + _cpu_functions.push_back( + new CpuRuntimeFunction("cpu_reduce_mean", ReduceMeanFunc::open)); + _cpu_functions.push_back(new CpuRuntimeFunction("cpu_reduce_max", ReduceMaxFunc::open)); + _cpu_functions.push_back(new CpuRuntimeFunction("reduce_l2", ReduceL2Func::open)); + _cpu_functions.push_back(new CpuRuntimeFunction("roi_pooling", ROIPoolingFunc::open)); + _cpu_functions.push_back(new CpuRuntimeFunction("argmax", ArgMaxFunc::open)); + _cpu_functions.push_back(new CpuRuntimeFunction("argmax_with_conf", ArgMaxV2Func::open)); + _cpu_functions.push_back(new CpuRuntimeFunction("argmax_v3", ArgMaxV3Func::open)); + _cpu_functions.push_back(new CpuRuntimeFunction("embedding", EmbeddingFunc::open)); + _cpu_functions.push_back(new CpuRuntimeFunction("gathernd_tf", GatherNDFunc::open)); + _cpu_functions.push_back(new CpuRuntimeFunction("grid_sampler", GridSamplerFunc::open)); + _cpu_functions.push_back(new CpuRuntimeFunction("cumsum", CumSumFunc::open)); + _cpu_functions.push_back(new CpuRuntimeFunction("gatherelements_pt", GatherElementsPtFunc::open)); +#endif +} + +CviModel::~CviModel() { + if (_model_body) + delete[] _model_body; + if (_pool) + delete _pool; + if (_weight_mem) { + if (isprotect) { + mem_unprotect(CVI_RT_MemGetVAddr(_weight_mem), CVI_RT_MemGetSize(_weight_mem)); + } + cviMemFree(_ctx, _weight_mem); + } + for (auto func : _cpu_functions) { + delete func; + } + for (auto buf : dmabuf_map) { + if (isprotect) { + mem_unprotect(CVI_RT_MemGetVAddr(buf.second), CVI_RT_MemGetSize(buf.second)); + } + cviMemFree(_ctx, buf.second); + } +} + +bool CviModel::checkIfMatchTargetChipType(std::string &target) { +#if defined(__x86_64__) || defined(_M_X64) + char *deviceChipType = std::getenv("SET_CHIP_NAME"); +#else +#if CHIPID == 0x1 + char *deviceChipType = (char *)"cv183x"; +#elif CHIPID == 0x2 + char *deviceChipType = (char *)"cv182x"; +#elif CHIPID == 0x3 + char *deviceChipType = (char *)"cv181x"; +#elif CHIPID == 0x4 + char *deviceChipType = (char *)"cv180x"; +#else +#error "CHIPID is not defined" +#endif +#endif + if (target != deviceChipType) { + TPU_LOG_ERROR("cvimodel built for %s CANNOT run on platform %s\n", + target.c_str(), 
deviceChipType);
+    return false;
+  }
+  return true;
+}
+
+CVI_RC CviModel::parseModelHeader(BaseStream *stream, size_t &payload_sz, size_t &header_sz) {
+  if (stream->length() <= sizeof(MODEL_HEADER)) {
+    TPU_LOG_ERROR("Error, invalid cvimodel file\n");
+    return CVI_RC_INVALID_ARG;
+  }
+
+  MODEL_HEADER header;
+  stream->read((uint8_t *)&header, 0, sizeof(header));
+  payload_sz = header.body_size;
+  /* before version 1.1, the header size is 32 bytes */
+  if (header.major == 1 && header.minor == 0) {
+    header_sz = 0x20;
+  } else {
+    header_sz = sizeof(MODEL_HEADER);
+  }
+
+  /* no chip field in the header before version 1.1 */
+  if (header.major == 1 && header.minor == 0) {
+    targetChipType = "cv183x";
+  } else {
+    targetChipType = header.chip;
+  }
+  if (!checkIfMatchTargetChipType(targetChipType)) {
+    return CVI_RC_INVALID_ARG;
+  }
+  // TODO, verify md5 here
+  return CVI_RC_SUCCESS;
+}
+
+CVI_RC CviModel::showAndCheckVersion() {
+  auto version = _fb_model->version();
+  major_ver = (int)version->major_();
+  minor_ver = (int)version->minor_();
+  TPU_LOG_INFO("version: %d.%d.%d\n",
+               major_ver, minor_ver, (int)version->sub_minor());
+  if (_fb_model->target()) {
+    TPU_LOG_INFO("%s Build at %s For platform %s\n", _fb_model->name()->str().c_str(),
+                 _fb_model->build_time()->str().c_str(), _fb_model->target()->str().c_str());
+  } else {
+    TPU_LOG_INFO("%s Build at %s For platform cv183x\n", _fb_model->name()->str().c_str(),
+                 _fb_model->build_time()->str().c_str());
+  }
+
+  /* the runtime must stay compatible with older cvimodels, so the
+     cvimodel version must not be newer than the runtime version */
+  if (cvi::model::MajorVersion_value > major_ver) {
+    return CVI_RC_SUCCESS;
+  } else if (cvi::model::MinorVersion_value < minor_ver) {
+    TPU_LOG_ERROR("cvimodel(%d.%d) is not supported in runtime(%d.%d)\n",
+                  major_ver, minor_ver,
+                  (int)cvi::model::MajorVersion_value,
+                  (int)cvi::model::MinorVersion_value);
+    TPU_LOG_ERROR("Please update runtime lib.\n");
+    return CVI_RC_INVALID_ARG;
+  }
+  return CVI_RC_SUCCESS;
+}
+
+void CviModel::parseProgramNum() {
+  auto &programs = *_fb_model->programs();
+  program_num = programs.size();
+
+  _max_shared_mem_size = 0;
+  for (int i = 0; i < program_num; ++i) {
+    if (programs[i]->shared_gmem() > _max_shared_mem_size) {
+      _max_shared_mem_size = programs[i]->shared_gmem();
+    }
+  }
+  TPU_LOG_INFO("Max SharedMem size:%zu\n", _max_shared_mem_size);
+}
+
+CVI_RC CviModel::extractSections(BaseStream *stream, size_t bin_offset) {
+  auto &sections = *_fb_model->sections();
+  std::vector<const cvi::model::Section *> cmdbuf_sections;
+  CVI_RC ret;
+  for (auto s : sections) {
+#if __aarch64__
+    if (s->type() == cvi::model::SectionType_FUNC_AARCH64) {
+#else
+    if (s->type() == cvi::model::SectionType_FUNC_X86) {
+#endif
+      if (s->size() == 0)
+        continue;
+      if (!_custom_section.load(stream, s->offset() + bin_offset, s->size(),
+                                _cpu_functions)) {
+        return CVI_RC_FAILURE;
+      }
+    } else if (s->type() == cvi::model::SectionType_WEIGHT) {
+      ret = loadWeight(stream, s->offset() + bin_offset, s->size());
+      if (ret != CVI_RC_SUCCESS) {
+        return ret;
+      }
+    } else if (s->type() == cvi::model::SectionType_CMDBUF) {
+      cmdbuf_sections.emplace_back(s);
+    } else if (s->type() == cvi::model::SectionType_DMABUF) {
+      ret = loadDmabuf(stream, s->offset() + bin_offset, s->size(), s);
+      if (ret != CVI_RC_SUCCESS) {
+        return ret;
+      }
+    }
+  }
+  for (auto s : cmdbuf_sections) {
+    ret = loadCmdbuf(stream, s->offset() + bin_offset, s->size(), s);
+    if (ret != CVI_RC_SUCCESS) {
+      return ret;
+    }
+  }
+  return CVI_RC_SUCCESS;
+}
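Both load paths below (loadDmabuf and loadCmdbuf) optionally inflate LZ4-compressed sections. A minimal sketch of just that step, under the assumption that the raw bytes were already read from the stream (helper name and buffers are hypothetical; LZ4_decompress_safe returns the decompressed byte count, or a negative value on malformed input):

#include <cstdint>
#include <vector>
#include "lz4.h"

// Inflate one section: 'raw' holds section->size() compressed bytes,
// 'expected' is section->decompressed_size() from the flatbuffer.
static bool inflate_section(const std::vector<uint8_t> &raw, size_t expected,
                            std::vector<uint8_t> &out) {
  out.resize(expected);
  int rc = LZ4_decompress_safe(reinterpret_cast<const char *>(raw.data()),
                               reinterpret_cast<char *>(out.data()),
                               static_cast<int>(raw.size()),
                               static_cast<int>(expected));
  // Success only if the whole expected payload was produced.
  return rc >= 0 && static_cast<size_t>(rc) == expected;
}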
+CVI_RC CviModel::loadDmabuf(BaseStream *stream, size_t offset, size_t size, const cvi::model::Section *section) {
+  if (section->encrypt()) {
+    assert(0 && "TODO encrypt");
+  }
+  CVI_RT_MEM buf = cviMemAlloc(_ctx, size, CVI_ALLOC_DMABUF, _model_name.c_str());
+  if (!buf) {
+    TPU_LOG_ERROR("alloc memory for dmabuf failed, size:%zu\n", size);
+    return CVI_RC_NOMEM;
+  }
+  stream->read(CVI_RT_MemGetVAddr(buf), offset, size);
+  size_t length = size;
+  if (section->compress() && section->decompressed_size() > 0) {
+#ifdef ENABLE_COMPRESS_CMDBUF
+    auto tmp_buf = cviMemAlloc(_ctx, section->decompressed_size(), CVI_ALLOC_DMABUF, _model_name.c_str());
+    if (!tmp_buf) {
+      TPU_LOG_ERROR("alloc memory for decompressed dmabuf failed, size:%zu\n", size);
+      cviMemFree(_ctx, buf);
+      return CVI_RC_NOMEM;
+    }
+    size_t rc = LZ4_decompress_safe(reinterpret_cast<const char *>(CVI_RT_MemGetVAddr(buf)),
+                                    reinterpret_cast<char *>(CVI_RT_MemGetVAddr(tmp_buf)),
+                                    size, section->decompressed_size());
+    TPU_ASSERT(rc == section->decompressed_size(), "decompress error rc != decompressed size");
+    cviMemFree(_ctx, buf);
+    buf = tmp_buf;
+    length = section->decompressed_size();
+#else
+    TPU_LOG_ERROR("Compressed dmabuf is not supported! please recompile with ENABLE_COMPRESS_CMDBUF\n");
+    return CVI_RC_UNSUPPORT;
+#endif
+  }
+
+  bool enable_pmu = false;
+#ifdef ENABLE_PMU
+  const char *pmu_enable_env = std::getenv("TPU_ENABLE_PMU");
+  if (pmu_enable_env) {
+    if (atoi(pmu_enable_env) > 0) {
+      enable_pmu = true;
+    }
+  }
+#endif
+
+  CVI_RT_MEM cmdbuf_mem = nullptr;
+  int ret = CVI_RT_LoadDmabuf(_ctx, buf, length, 0, 0, enable_pmu, &cmdbuf_mem);
+  if (CVI_RC_SUCCESS != ret) {
+    cviMemFree(_ctx, buf);
+    return ret;
+  }
+  if (cmdbuf_mem != buf) {
+    cviMemFree(_ctx, buf);
+  }
+
+  dmabuf_map.emplace(section->name()->str(), cmdbuf_mem);
+  return CVI_RC_SUCCESS;
+}
+
+CVI_RC CviModel::loadCmdbuf(BaseStream *stream, size_t offset, size_t size, const cvi::model::Section *section) {
+  //assert(size && _weight_mem); // loading a cmdbuf must come after loading the weight
+  if (0 == size) {
+    return CVI_RC_SUCCESS;
+  }
+  std::vector<uint8_t> cmdbuf(size);
+  bool enable_pmu = false;
+
+  stream->read(cmdbuf.data(), offset, size);
+
+#ifdef ENABLE_PMU
+  const char *pmu_enable_env = std::getenv("TPU_ENABLE_PMU");
+  if (pmu_enable_env) {
+    if (atoi(pmu_enable_env) > 0) {
+      enable_pmu = true;
+    }
+  }
+#endif
+
+  CVI_RT_MEM cmdbuf_mem;
+  CVI_RC ret;
+  if (section->encrypt()) {
+    uint32_t weight_size = CVI_RT_MemGetSize(_weight_mem);
+    ret = CVI_RT_LoadCmdbufTee(_ctx, cmdbuf.data(), size, 0,
+                               0, weight_size, &cmdbuf_mem);
+  } else {
+    if (section->compress() && section->decompressed_size()) {
+#ifdef ENABLE_COMPRESS_CMDBUF
+      uint8_t *buf = new(std::nothrow) uint8_t[section->decompressed_size()];
+      TPU_ASSERT(buf != nullptr, "Allocate decompress buff failed");
+
+      size_t rc = LZ4_decompress_safe(reinterpret_cast<const char *>(cmdbuf.data()),
+                                      reinterpret_cast<char *>(buf),
+                                      size, section->decompressed_size());
+      TPU_ASSERT(rc == section->decompressed_size(), "decompress error rc != decompressed size");
+
+      ret = CVI_RT_LoadCmdbuf(
+          _ctx, buf, section->decompressed_size(), 0,
+          0, enable_pmu, &cmdbuf_mem);
+      delete[] buf;
+#else
+      TPU_LOG_ERROR("Compressed cmdbuf is not supported! 
please recompile with ENABLE_COMPRESS_CMDBUF\n"); + return CVI_RC_UNSUPPORT; +#endif + } else { + // setup base address of neuron & weight gmem + // then load cmdbuf to gmem + ret = CVI_RT_LoadCmdbuf( + _ctx, cmdbuf.data(), size, 0, + 0, enable_pmu, &cmdbuf_mem); + } + } + if (ret == CVI_RC_SUCCESS) { + dmabuf_map.emplace(section->name()->str(), cmdbuf_mem); + } else { + TPU_LOG_WARNING("loadCmdbuf failed\n"); + } + if (isprotect) { + mem_protect(CVI_RT_MemGetVAddr(cmdbuf_mem), CVI_RT_MemGetSize(cmdbuf_mem)); + } + return ret; +} + +CVI_RC CviModel::loadWeight(BaseStream *stream, size_t offset, size_t size) { + /// debug + if (size == 0) { + return CVI_RC_SUCCESS; + } + size_t alloc_size = size; + if (isprotect) { + int pageSize = getpagesize(); + alloc_size = ((size + pageSize -1) / pageSize) * pageSize; + } + _weight_mem = cviMemAlloc(_ctx, alloc_size, CVI_ALLOC_WEIGHT, _model_name.c_str()); + if (!_weight_mem) { + TPU_LOG_ERROR("alloc memory for weight failed, size:%zu\n", size); + return CVI_RC_NOMEM; + } + stream->read(CVI_RT_MemGetVAddr(_weight_mem), offset, size); + CVI_RT_MemFlush(_ctx, _weight_mem); + if (isprotect) { + mem_protect(CVI_RT_MemGetVAddr(_weight_mem), alloc_size); + } + return CVI_RC_SUCCESS; +} + +void CviModel::createCpuWeightMap() { + if (!_fb_model->weight_map()) { + return; + } + + auto &weights = *_fb_model->weight_map(); + for (auto w : weights) { + if (w->shape()) { + auto weight = std::make_shared(_ctx, w, _weight_mem, _model_name.c_str()); + weight_map[w->name()->str()] = weight; + } + } +} + +CVI_RC CviModel::parse(BaseStream *stream) { + CVI_RC ret; + size_t payload_size; + size_t header_size; + ret = parseModelHeader(stream, payload_size, header_size); + if (ret != CVI_RC_SUCCESS) { + return ret; + } + size_t bin_offset = header_size + payload_size; + _model_body = new uint8_t[payload_size]; + if (!_model_body) { + TPU_LOG_ERROR("Failed to allocate memory\n"); + return CVI_RC_NOMEM; + } + stream->read(_model_body, header_size, payload_size); + + _fb_model = (cvi::model::Model *)cvi::model::GetModel(_model_body); + ret = showAndCheckVersion(); + if (ret != CVI_RC_SUCCESS) { + return ret; + } + + std::stringstream model_name; + model_name << _fb_model->name()->str() << ":" << _count; + _model_name = model_name.str(); + + ret = extractSections(stream, bin_offset); + if (ret != CVI_RC_SUCCESS) { + return ret; + } + + parseProgramNum(); + createCpuWeightMap(); + + return CVI_RC_SUCCESS; +} + +CVI_RC CviModel::loadProgram(Program **program, + int program_id, + bool export_all_tensors, + bool skip_preprocess) { + CVI_RC ret; + auto &programs = *_fb_model->programs(); + assert(program_id < program_num); + auto fb_program = programs[program_id]; + auto ptr = new Program(_ctx, _pool, dmabuf_map, + _cpu_functions, weight_map, + _weight_mem, _model_name.c_str(), _max_shared_mem_size); + if (!ptr) { + TPU_LOG_ERROR("Failed to create a Program instance\n"); + return CVI_RC_FAILURE; + } + ptr->setOptions(export_all_tensors, skip_preprocess); + ret = ptr->load(fb_program); + if (ret != CVI_RC_SUCCESS) { + TPU_LOG_ERROR("program load failed:%d\n", ret); + delete ptr; + *program = nullptr; + return ret; + } + *program = ptr; + return CVI_RC_SUCCESS; +} + +std::string CviModel::getChipType( + const std::string &modelFile, + const int8_t *buf, size_t size) { + BaseStream *stream; + if (!modelFile.empty()) { + stream = new FileStream(modelFile); + } else if (buf) { + stream = new BufferStream(buf, size); + } else { + assert(0); + } + if (stream->length() <= 
sizeof(MODEL_HEADER)) { + TPU_LOG_ERROR("Error, invalid cvimodel file\n"); + assert(0); + } + MODEL_HEADER header; + stream->read((uint8_t *)&header, 0, sizeof(header)); + delete stream; + return std::string(header.chip); +} + +CVI_RC CviModel::acquire(const int8_t *buf, size_t size) { + BaseStream *stream = new BufferStream(buf, size); + CVI_RC ret = this->parse(stream); + if (ret != CVI_RC_SUCCESS) { + TPU_LOG_ERROR("failed to parse cvimodel\n"); + } + delete stream; + return ret; +} + +CVI_RC CviModel::acquire(const std::string &modelFile) { + BaseStream *stream = new FileStream(modelFile); + CVI_RC ret = this->parse(stream); + if (ret != CVI_RC_SUCCESS) { + TPU_LOG_ERROR("failed to parse cvimodel\n"); + } + delete stream; + return ret; +} + +/* +fd:The file descriptor +ud_offset:The file header offset defined by the user. +*/ +CVI_RC CviModel::acquire(const int fd, const size_t ud_offset) { + BaseStream *stream = new FdStream(fd, ud_offset); + CVI_RC ret = this->parse(stream); + if (ret != CVI_RC_SUCCESS) { + TPU_LOG_ERROR("failed to parse cvimodel\n"); + } + delete stream; + return ret; +} + +void CviModel::release() { + --ref; + if (ref == 0) { + delete this; + } +} + +} // namespace runtime +} // namespace cvi diff --git a/cviruntime/src/common/neuron.cpp b/cviruntime/src/common/neuron.cpp new file mode 100644 index 000000000..fb92587a0 --- /dev/null +++ b/cviruntime/src/common/neuron.cpp @@ -0,0 +1,557 @@ +#include +#include +#include +#include +#include "cviruntime.h" +#include +#include +#include +#include +#include "alloc.h" + +#include + +namespace cvi { +namespace runtime { + +// helper functions +static void fbDtypeToCVIFMTandSize(const cvi::model::DType dtype, CVI_FMT &fmt, + int &dsize) { + switch (dtype) { + case cvi::model::DType_BF16: + fmt = CVI_FMT_BF16; + dsize = 2; + break; + case cvi::model::DType_INT8: + fmt = CVI_FMT_INT8; + dsize = 1; + break; + case cvi::model::DType_UINT8: + fmt = CVI_FMT_UINT8; + dsize = 1; + break; + case cvi::model::DType_FP32: + fmt = CVI_FMT_FP32; + dsize = 4; + break; + case cvi::model::DType_INT16: + fmt = CVI_FMT_INT16; + dsize = 2; + break; + case cvi::model::DType_UINT16: + fmt = CVI_FMT_UINT16; + dsize = 2; + break; + case cvi::model::DType_INT32: + fmt = CVI_FMT_INT32; + dsize = 4; + break; + case cvi::model::DType_UINT32: + fmt = CVI_FMT_UINT32; + dsize = 4; + break; + default: + TPU_LOG_FATAL("unsupported dtype:%d\n", (int)dtype); + } +} + +static void fbShapeToVector(const cvi::model::Shape *shape, + std::vector &shape_vec) { + shape_vec.resize(4); + shape_vec[0] = (int)shape->dim()->Get(0); + shape_vec[1] = (int)shape->dim()->Get(1); + shape_vec[2] = (int)shape->dim()->Get(2); + shape_vec[3] = (int)shape->dim()->Get(3); +} + +static inline int align_up(int x, int n) { + return ((x + n - 1) / n) * n; +} + +void Neuron::setPixelAlign(CVI_NN_PIXEL_FORMAT_E format) { + if (CviModel::targetChipType == "cv183x") { + vpss_y_align = 32; + vpss_w_align = 32; + vpss_channel_align = 0x1000; + if (CVI_NN_PIXEL_YUV_420_PLANAR == format) { + vpss_y_align = vpss_w_align * 2; + } + } else { + vpss_y_align = 64; + vpss_w_align = 64; + vpss_channel_align = 64; + if (CVI_NN_PIXEL_YUV_420_PLANAR == format) { + vpss_y_align = vpss_w_align * 2; + } + } +} + +uint32_t Neuron::yuv_size(int n, int c, int h, int w, CVI_NN_PIXEL_FORMAT_E format) { + switch (format) { + case CVI_NN_PIXEL_YUV_420_PLANAR: { + assert(c == 3); + int y_w_aligned = align_up(w, vpss_y_align); + int uv_w_aligned = align_up(w / 2, vpss_w_align); + int u = align_up(h * y_w_aligned, 
vpss_channel_align); + int v = align_up(u + h / 2 * uv_w_aligned, vpss_channel_align); + int n_stride = align_up(v + h / 2 * uv_w_aligned, vpss_channel_align); + return n * n_stride; + } + case CVI_NN_PIXEL_YUV_NV21: + case CVI_NN_PIXEL_YUV_NV12: { + assert(c == 3); + int y_w_aligned = align_up(w, vpss_y_align); + int uv_w_aligned = align_up(w, vpss_w_align); + int uv = align_up(h * y_w_aligned, vpss_channel_align); + int n_stride = align_up(uv + h / 2 * uv_w_aligned, vpss_channel_align); + return n * n_stride; + } + default: + TPU_LOG_FATAL("unsupported yuv pixel format:%d\n", format); + } + return 0; +} + +Neuron::Neuron(CVI_RT_HANDLE ctx, const void *model_tensor, + CVI_RT_MEM weight_mem, const char *model_name) + : type(Neuron::WEIGHT), _ctx(ctx), _state(Neuron::TPU_MEM) { + + CVI_FMT fmt = CVI_FMT_FP32; + int32_t dsize = 0; + std::vector shape; + + auto weight = (const cvi::model::Weight *)model_tensor; + fbDtypeToCVIFMTandSize(weight->type(), fmt, dsize); + fbShapeToVector(weight->shape(), shape); + this->_id = 0; + this->_count = shape[0] * shape[1] * shape[2] * shape[3]; + this->_size = _count * dsize; + this->shape = shape; + this->fmt = fmt; + this->name = weight->name()->str(); + this->pixel_format = CVI_NN_PIXEL_TENSOR; + this->type = Neuron::WEIGHT; + if (model_name) { + _model_name = model_name; + } + _module_name = _model_name + ":"; + _module_name += this->name; + _gmem = CVI_RT_MemPreAlloc(weight_mem, weight->offset() & 0x0FFFFFFFFFF, _size); + _vaddr = CVI_RT_MemGetVAddr(_gmem); + _paddr = CVI_RT_MemGetPAddr(_gmem); + _base_mem = weight_mem; +} + +Neuron::Neuron( + CVI_RT_HANDLE ctx, CVI_RT_KHANDLE cvk, const void *model_tensor, + uint64_t *baseAddrArray, CVI_RT_MEM *baseMemArray, const char *model_name) + : type(Neuron::ACTIVATION), + _ctx(ctx), _cvk(cvk), + _state(Neuron::TPU_MEM), + _baseAddrArray(baseAddrArray), + _baseMemArray(baseMemArray) { + + if (model_name) { + _model_name = model_name; + } + CVI_FMT fmt = CVI_FMT_FP32; + int32_t dsize = 0; + std::vector shape; + + auto tensor = (const cvi::model::Tensor *)model_tensor; + fbDtypeToCVIFMTandSize(tensor->dtype(), fmt, dsize); + fbShapeToVector(tensor->shape(), shape); + this->_id = tensor->tensor_id(); + this->_count = shape[0] * shape[1] * shape[2] * shape[3]; + this->_overwrote = (bool)tensor->overwrote(); + this->shape = shape; + this->fmt = fmt; + this->name = tensor->name()->str(); + this->type = Neuron::ACTIVATION; + + this->aligned = tensor->aligned(); + this->_tensor_size = tensor->size(); + auto pixel_format = tensor->pixel_format() ? 
+        tensor->pixel_format()->str() : std::string("");
+  setPixelFormatAndSize(pixel_format, dsize);
+
+  _module_name = _model_name + ":";
+  _module_name += this->name;
+
+  if (tensor->scale()) {
+    for (int i = 0; i < (int)tensor->scale()->size(); i++) {
+      this->scale.push_back(tensor->scale()->Get(i));
+    }
+  }
+
+  if (tensor->mean()) {
+    for (int i = 0; i < (int)tensor->mean()->size(); i++) {
+      this->mean.push_back(tensor->mean()->Get(i));
+    }
+  }
+
+  if (tensor->quant()) {
+    setQScale(tensor->quant()->qscale());
+    setZeroPoint(tensor->quant()->zero_point());
+  }
+
+}
+
+Neuron::~Neuron() {
+  if (_gmem)
+    cviMemFree(_ctx, _gmem);
+  if (_channelPreloadCmdbuf)
+    CVI_RT_MemFree(_ctx, _channelPreloadCmdbuf);
+  if (_framePreloadCmdbuf)
+    CVI_RT_MemFree(_ctx, _framePreloadCmdbuf);
+  if (_streamCopyCmdbuf)
+    CVI_RT_MemFree(_ctx, _streamCopyCmdbuf);
+  if (_cpu_mem)
+    free(_cpu_mem);
+}
+
+void Neuron::setPixelFormatAndSize(const std::string &pixel_format,
+                                   int32_t dsize) {
+  if (pixel_format.empty()) {
+    assert(!this->aligned);
+    this->pixel_format = CVI_NN_PIXEL_TENSOR;
+  } else if (pixel_format == "BGR_PLANAR") {
+    this->pixel_format = CVI_NN_PIXEL_BGR_PLANAR;
+  } else if (pixel_format == "BGR_PACKED") {
+    this->pixel_format = CVI_NN_PIXEL_BGR_PACKED;
+  } else if (pixel_format == "RGB_PLANAR") {
+    this->pixel_format = CVI_NN_PIXEL_RGB_PLANAR;
+  } else if (pixel_format == "RGB_PACKED") {
+    this->pixel_format = CVI_NN_PIXEL_RGB_PACKED;
+  } else if (pixel_format == "GRAYSCALE") {
+    this->pixel_format = CVI_NN_PIXEL_GRAYSCALE;
+  } else if (pixel_format == "YUV_NV12") {
+    this->pixel_format = CVI_NN_PIXEL_YUV_NV12;
+  } else if (pixel_format == "YUV_NV21") {
+    this->pixel_format = CVI_NN_PIXEL_YUV_NV21;
+  } else if (pixel_format == "YUV420_PLANAR") {
+    this->pixel_format = CVI_NN_PIXEL_YUV_420_PLANAR;
+  } else if (pixel_format == "RGBA_PLANAR") {
+    this->pixel_format = CVI_NN_PIXEL_RGBA_PLANAR;
+  } else {
+    TPU_LOG_FATAL("unknown pixel_format:%s\n", pixel_format.c_str());
+  }
+  setPixelAlign(this->pixel_format);
+
+  if (!this->aligned) {
+    _size = _count * dsize;
+    return;
+  }
+  if (this->aligned && this->_tensor_size) {
+    _size = this->_tensor_size;
+    return;
+  }
+
+  switch(this->pixel_format) {
+  case CVI_NN_PIXEL_GRAYSCALE:
+    _size = shape[0] * shape[1] * shape[2] * align_up(shape[3], vpss_w_align);
+    break;
+  case CVI_NN_PIXEL_BGR_PLANAR:
+  case CVI_NN_PIXEL_RGB_PLANAR:
+  case CVI_NN_PIXEL_RGBA_PLANAR: {
+    int align_w = align_up(shape[3], vpss_w_align);
+    int align_c = align_up(align_w * shape[2], vpss_channel_align);
+    _size = shape[0] * shape[1] * align_c;
+    break;
+  }
+  case CVI_NN_PIXEL_BGR_PACKED:
+  case CVI_NN_PIXEL_RGB_PACKED:
+    _size = shape[0] * shape[2] * align_up(shape[3] * shape[1], vpss_w_align);
+    break;
+  case CVI_NN_PIXEL_YUV_NV12:
+  case CVI_NN_PIXEL_YUV_NV21:
+  case CVI_NN_PIXEL_YUV_420_PLANAR:
+    _size = yuv_size(shape[0], shape[1], shape[2], shape[3], this->pixel_format);
+    break;
+  default:
+    assert(0);
+  }
+}
+
+bool Neuron::isPacked() {
+  if (pixel_format == CVI_NN_PIXEL_TENSOR) {
+    pixel_format = (shape[3] == 3) ? CVI_NN_PIXEL_PACKED :
+                                     CVI_NN_PIXEL_PLANAR;
+  }
+  if (pixel_format == CVI_NN_PIXEL_BGR_PACKED ||
+      pixel_format == CVI_NN_PIXEL_RGB_PACKED ||
+      pixel_format == CVI_NN_PIXEL_PACKED) {
+    return true;
+  }
+  return false;
+}
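To make the vpss alignment rules above concrete, here is the size computation for one planar case as a small sketch. The values assume the non-cv183x branch of setPixelAlign (64-byte y/w/channel alignment) and an int8 tensor; the shape numbers are illustrative:

#include <cassert>

static int align_up_demo(int x, int n) { return ((x + n - 1) / n) * n; }

int main() {
  // RGB_PLANAR, shape {n=1, c=3, h=224, w=224}, dsize=1, 64-byte alignment:
  int align_w = align_up_demo(224, 64);        // 256 bytes per row
  int align_c = align_up_demo(256 * 224, 64);  // 57344 bytes per plane
  int size = 1 * 3 * align_c;                  // 172032 bytes in total
  assert(size == 172032);
  return 0;
}

+// preload a channel's data from the vpss buffer, whose w dimension is
+// aligned to vpss_w_align. we need to copy and un-align the data into a
+// compact tensor using TDMA.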
+CVI_RC Neuron::preloadChannelAndCompact(int32_t channel_idx, uint64_t src_paddr) { + uint32_t h, w; + if (isPacked()) { + h = shape[1]; + w = shape[2] * shape[3]; + } else { + h = shape[2]; + w = shape[3]; + } + CVI_RC ret = CVI_RC_SUCCESS; + for (int i = 0; i < 3; ++i) { + if (!_channelPreloadCmdbuf) { + uint32_t hstride = align_up(w, vpss_w_align); + cvk_tg_shape_t tg_shape = {1, 1, h, w}; + cvk_tg_stride_t src_stride = {1, 1, hstride, 1}; + cvk_tg_stride_t dst_stride = {1, 1, w, 1}; + _channelPreloadCmdbuf = runtimeJitTdmaStrideCopy( + _ctx, _cvk, fmt, &tg_shape, + &dst_stride, &tg_shape, &src_stride); + if (!_channelPreloadCmdbuf) { + continue; + } + } + ret = runtimeExecuteKernelFunction( + _ctx, _channelPreloadCmdbuf, src_paddr, + _paddr + channel_idx * h * w); + if (ret != CVI_RC_SUCCESS) { + TPU_LOG_ERROR("preloadChannelAndCompact fail!ret:%d", ret); + CVI_RT_MemFree(_ctx, _channelPreloadCmdbuf); + _channelPreloadCmdbuf = nullptr; + } else { + return CVI_RC_SUCCESS; + } + } + return ret; +} + +// preload frame's data from vpss buffer +// we need copy and unalign the data to compactly tensor using TDMA. +CVI_RC Neuron::preloadFrameAndCompact(int32_t frame_idx, uint64_t src_paddr) { + uint32_t c, h, w; + if (isPacked()) { + c = 1; + h = shape[1]; + w = shape[2] * shape[3]; + } else { + c = shape[1]; + h = shape[2]; + w = shape[3]; + } + CVI_RC ret = CVI_RC_SUCCESS; + for (int i = 0; i < 3; ++i) { + if (!_framePreloadCmdbuf) { + uint32_t hstride = align_up(w, vpss_w_align); + cvk_tg_shape_t tg_shape = {1, c, h, w}; + cvk_tg_stride_t src_stride = {1, h * hstride, hstride, 1}; + cvk_tg_stride_t dst_stride = {1, h * w, w, 1}; + _framePreloadCmdbuf = runtimeJitTdmaStrideCopy( + _ctx, _cvk, fmt, &tg_shape, + &dst_stride, &tg_shape, &src_stride); + if (!_framePreloadCmdbuf) { + continue; + } + } + ret = runtimeExecuteKernelFunction( + _ctx, _framePreloadCmdbuf, src_paddr, + _paddr + frame_idx * c * h * w); + if (ret != CVI_RC_SUCCESS) { + TPU_LOG_ERROR("preloadFrameAndCompact fail!ret:%d", ret); + CVI_RT_MemFree(_ctx, _framePreloadCmdbuf); + _framePreloadCmdbuf = nullptr; + } else { + return CVI_RC_SUCCESS; + } + } + return ret; +} + +// just copy vpss data to neuron and +// keep w and frame dimensions's alignment. +CVI_RC Neuron::preload(int32_t frame_idx, uint64_t src_paddr) { + uint32_t frame_size = align_up(_size / shape[0], vpss_w_align); + CVI_RC ret = CVI_RC_SUCCESS; + for (int i = 0; i < 3; ++i) { + if (!_framePreloadCmdbuf) { + cvk_tg_shape_t tg_shape = {1, 1, frame_size / vpss_w_align, (uint32_t)vpss_w_align}; + _framePreloadCmdbuf = runtimeJitTdmaStrideCopy( + _ctx, _cvk, CVI_FMT_INT8, &tg_shape, + nullptr, &tg_shape, nullptr); + if (!_framePreloadCmdbuf) { + continue; + } + } + ret = runtimeExecuteKernelFunction( + _ctx, _framePreloadCmdbuf, src_paddr, + _paddr + frame_idx * frame_size); + if (ret != CVI_RC_SUCCESS) { + TPU_LOG_ERROR("preload fail!ret:%d", ret); + CVI_RT_MemFree(_ctx, _framePreloadCmdbuf); + _framePreloadCmdbuf = nullptr; + } else { + return CVI_RC_SUCCESS; + } + } + return ret; +} + +void Neuron::load(CVI_TENSOR &tensor) { + // load data from system mem. + if (tensor.mem_type == CVI_MEM_SYSTEM) { + if (_vaddr) { + if (_vaddr != tensor.sys_mem) { + memcpy(_vaddr, tensor.sys_mem, _size); + } + TPU_ASSERT((int)CVI_RT_MemFlush(_ctx, _gmem) == 0, nullptr); + _state = Neuron::TPU_MEM; + } else { + if (_cpu_mem != tensor.sys_mem) { + memcpy(_cpu_mem, tensor.sys_mem, _size); + } + _state = Neuron::CPU_MEM; + } + } else { // load data from device mem. 
+ if (!_gmem && !_paddr) { + assert(0 && "has no device mem allocated"); + } + // needed to copy data using tdma. + if (tensor.paddr != _paddr) { + if (!_streamCopyCmdbuf) { + cvk_tg_shape_t tg_shape; + tg_shape.n = shape[0]; + tg_shape.c = shape[1]; + tg_shape.h = shape[2]; + tg_shape.w = shape[3]; + _streamCopyCmdbuf = + runtimeJitTdmaStrideCopy(_ctx, _cvk, fmt, &tg_shape, nullptr, &tg_shape, nullptr); + } + runtimeExecuteKernelFunction(_ctx, _streamCopyCmdbuf, tensor.paddr, _paddr); + } + _state = Neuron::TPU_MEM; + } +} + +void Neuron::store(CVI_TENSOR &tensor) { + if (tensor.mem_type == CVI_MEM_SYSTEM) { + if (_state == Neuron::CPU_MEM) { + if (tensor.sys_mem != sys_mem()) + memcpy(tensor.sys_mem, sys_mem(), _size); + } else { + TPU_ASSERT((int)CVI_RT_MemInvld(_ctx, _gmem) == 0,nullptr); + if (tensor.sys_mem != _vaddr) + memcpy(tensor.sys_mem, _vaddr, _size); + } + } else { + if (tensor.paddr != _paddr) { + if (!_streamCopyCmdbuf) { + cvk_tg_shape_t tg_shape; + tg_shape.n = shape[0]; + tg_shape.c = shape[1]; + tg_shape.h = shape[2]; + tg_shape.w = shape[3]; + _streamCopyCmdbuf = + runtimeJitTdmaStrideCopy(_ctx, _cvk, fmt, &tg_shape, nullptr, &tg_shape, nullptr); + } + runtimeExecuteKernelFunction(_ctx, _streamCopyCmdbuf, _paddr, tensor.paddr); + } + } +} + +void Neuron::toCpu() { + if (_state != Neuron::CPU_MEM) { + if (_cpu_mem) { + CVI_RT_MemCopyD2S(_ctx, _cpu_mem, _gmem); + } else { + TPU_ASSERT((int)CVI_RT_MemInvld(_ctx, _gmem) == 0, nullptr); + } + _state = Neuron::CPU_MEM; + } +} + +void Neuron::toTpu() { + if (_state != Neuron::TPU_MEM) { + if (_cpu_mem) { + CVI_RT_MemCopyS2D(_ctx, _gmem, _cpu_mem); + //TPU_LOG_DEBUG("load data from cpu_mem (%p) to device_mem (%p)\n", + // (void *)_cpu_mem, (void *)_gmem); + } else { + CVI_RT_MemFlush(_ctx, _gmem); + CVI_RT_MemInvld(_ctx, _base_mem); + //TPU_LOG_DEBUG("flush device_mem (%p)\n", (void *)_vaddr); + } + _state = Neuron::TPU_MEM; + } +} + +CVI_RC Neuron::reserveIonMem(int64_t offset) { + if (offset == -1) { + return CVI_RC_SUCCESS; + } + + _baseAddrIndex = (offset >> 40 & 0x07); + assert(_baseAddrIndex < 8 && _baseAddrIndex != 1); + uint64_t shift = offset & 0x0FFFFFFFFFF; + if (_baseAddrIndex < 3) { // shared mem + _gmem = CVI_RT_MemPreAlloc(_baseMemArray[_baseAddrIndex], shift, _size); + } else { + if (!_baseMemArray[_baseAddrIndex]) { + assert(shift == 0); + _gmem = cviMemAlloc(_ctx, _size, CVI_ALLOC_NEURON, _module_name.c_str()); + if (!_gmem) { + TPU_LOG_ERROR("failed to alloc io mem\n"); + return CVI_RC_NOMEM; + } + _baseMemArray[_baseAddrIndex] = _gmem; + updateBaseAddr(_gmem); + } else { + _gmem = CVI_RT_MemPreAlloc(_baseMemArray[_baseAddrIndex], shift, _size); + } + } + _base_mem = _baseMemArray[_baseAddrIndex]; + _vaddr = CVI_RT_MemGetVAddr(_gmem); + _paddr = CVI_RT_MemGetPAddr(_gmem); + return CVI_RC_SUCCESS; +} + +CVI_RC Neuron::reserveSysMem() { + /* + * if the tensor has device_mem, we can use vaddr + * of device_mem as cpu mem. So no need to assign + * memory in such case. Otherwise, we need allocate + * memory from heap. + */ + if (_gmem) + return CVI_RC_SUCCESS; + + if (!_cpu_mem) { + // _cpu_mem needed be aligned to 32bytes + // with size aligned to 64 bytes. 
+ _cpu_mem = (uint8_t *)aligned_alloc(32, align_up(_size, 64)); + if (!_cpu_mem) { + TPU_LOG_ERROR("alloc system memory for tensor failed, %s , size:%d\n", name.c_str(), _size); + return CVI_RC_NOMEM; + } + } + return CVI_RC_SUCCESS ; +} + +void Neuron::updateBaseAddr(uint64_t paddr) { + if (_baseAddrIndex < 3) + return; + _baseAddrArray[_baseAddrIndex] = paddr; + _paddr = paddr; + if (_gmem) { + cviMemFree(_ctx, _gmem); + _gmem = nullptr; + } +} + +void Neuron::updateBaseAddr(CVI_RT_MEM mem) { + if (_baseAddrIndex < 3) + return; + _baseMemArray[_baseAddrIndex] = mem; + _baseAddrArray[_baseAddrIndex] = CVI_RT_MemGetPAddr(mem); +} + +} // namespace runtime +} // namespace cvi diff --git a/cviruntime/src/common/program.cpp b/cviruntime/src/common/program.cpp new file mode 100644 index 000000000..2a3b626fb --- /dev/null +++ b/cviruntime/src/common/program.cpp @@ -0,0 +1,603 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cviruntime.h" +#include "alloc.h" + +//#define MEASURE_TIME +#ifdef MEASURE_TIME +#include +#endif + +namespace cvi { +namespace runtime { + +Program::Program(CVI_RT_HANDLE ctx, TaskPool *pool, + dmabuf_map_t &dmabuf_map, + std::vector &functions, + tensor_map_t &weight_map, CVI_RT_MEM weight_mem, + const char *model_name, + size_t max_shared_mem_size) + : weight_map(weight_map), + dmabuf_map(dmabuf_map), + cpu_functions(functions), + _ctx(ctx), _pool(pool), + _max_shared_mem_size(max_shared_mem_size) { + + _cvk = CVI_RT_RegisterKernel(ctx, 1024); + for (int i = 0; i < 8; ++i) { + baseAddrArray[i] = 0; + baseMemArray[i] = nullptr; + } + baseMemArray[1] = weight_mem; + baseAddrArray[1] = CVI_RT_MemGetPAddr(weight_mem); + if (model_name) { + _model_name = model_name; + } +} + +Program::~Program() { + if (shared_mem) { + deallocateSharedMemory(_ctx, shared_mem); + } + if (private_mem) { + cviMemFree(_ctx, private_mem); + } + if (_cvk) { + CVI_RT_UnRegisterKernel(_cvk); + } +} + +void Program::setOptions(bool export_all_tensors, bool skip_preprocess) { + this->_export_all_tensors = export_all_tensors; + this->_skip_preprocess = skip_preprocess; +} + +CVI_RC Program::createNeuronSpace(const cvi::model::Program *fb_program) { + // As for old version cvimodel that only has one big + // neuron memory, the private gmem is same as shared gmem. 
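+  // Annotation (added for clarity): base-register layout used throughout
+  // this file, as set up by the constructor and Neuron::reserveIonMem():
+  // index 0 = shared gmem, index 1 = weight gmem, index 2 = private gmem,
+  // indices 3..7 = lazily allocated I/O buffers. A tensor offset encodes
+  // the register index in bits [42:40] and the byte shift in bits [39:0].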
+ if (fb_program->neuron_size()) { + auto size = fb_program->neuron_size(); + private_mem = cviMemAlloc(_ctx, size, CVI_ALLOC_PROGRAM, _model_name.c_str()); + if (!private_mem) { + TPU_LOG_ERROR("failed to alloc private gmem: %u\n", size); + return CVI_RC_NOMEM; + } + baseMemArray[0] = private_mem; + baseMemArray[2] = private_mem; + baseAddrArray[0] = CVI_RT_MemGetPAddr(private_mem); + baseAddrArray[2] = CVI_RT_MemGetPAddr(private_mem); + return CVI_RC_SUCCESS; + } + + auto size = fb_program->shared_gmem(); + if (size) { + shared_mem = allocateSharedMemory(_ctx, _max_shared_mem_size); + if (!shared_mem) { + TPU_LOG_ERROR("failed to alloc shared gmem: %zu\n", _max_shared_mem_size); + return CVI_RC_NOMEM; + } + baseMemArray[0] = shared_mem; + baseAddrArray[0] = CVI_RT_MemGetPAddr(shared_mem); + } + + size = fb_program->private_gmem(); + if (size) { + private_mem = cviMemAlloc(_ctx, size, CVI_ALLOC_PROGRAM, _model_name.c_str()); + if (!private_mem) { + TPU_LOG_ERROR("failed to alloc private gmem: %u\n", size); + return CVI_RC_NOMEM; + } + baseMemArray[2] = private_mem; + baseAddrArray[2] = CVI_RT_MemGetPAddr(private_mem); + } + return CVI_RC_SUCCESS; +} + +CVI_RC Program::createNeuronMap(const cvi::model::Program *fb_program) { + auto &tensor_vector = *fb_program->tensor_map(); + std::string in_name = fb_program->input_tensors()->begin()->str(); + for (auto t : tensor_vector) { + auto tensor = std::make_shared(_ctx, _cvk, t, + baseAddrArray, baseMemArray, _model_name.c_str()); + if (tensor->reserveIonMem(t->offset()) != CVI_RC_SUCCESS) { + return CVI_RC_NOMEM; + } + neuron_map[t->name()->str()] = tensor; + } + + auto &ins = *fb_program->input_tensors(); + for (auto i : ins) { + in_tensors.push_back(neuron_map[i->str()]); + } + auto &outs = *fb_program->output_tensors(); + for (auto o : outs) { + out_tensors.push_back(neuron_map[o->str()]); + } + return CVI_RC_SUCCESS; +} + +CVI_RC Program::createRoutines(const cvi::model::Program *fb_program) { + auto &routines = *fb_program->routines(); + for (auto r : routines) { + std::shared_ptr rt; + if (r->type() == cvi::model::RoutineType_TPU) { + rt = std::make_shared(_ctx, this); + } else { +#ifdef ENABLE_CPU_FUNC + rt = std::make_shared(_ctx, this); +#else + TPU_LOG_ERROR("Cpu function is not supported! 
please recompile with ENABLE_CPU_FUNC\n"); + return CVI_RC_UNSUPPORT; +#endif + } + if (!rt->initialize(r)) { + return CVI_RC_DATA_ERR; + } + _routines.push_back(rt); + } + return CVI_RC_SUCCESS; +} + +CVI_RC Program::load(const cvi::model::Program *fb_program) { + CVI_RC ret; + + ret = this->createNeuronSpace(fb_program); + if (ret != CVI_RC_SUCCESS) { + return ret; + } + + ret = this->createNeuronMap(fb_program); + if (ret != CVI_RC_SUCCESS) { + return ret; + } + + ret = this->createRoutines(fb_program); + if (ret != CVI_RC_SUCCESS) { + return ret; + } + + for (auto &rt : _routines) { + ret = rt->prepare(); + if (ret != CVI_RC_SUCCESS) { + return ret; + } + } + return CVI_RC_SUCCESS; +} + +static void exportTensorInfo(void *program, const std::shared_ptr &neuron, + CVI_TENSOR *tensor) { + tensor->name = const_cast(neuron->name.c_str()); + tensor->shape.dim[0] = neuron->shape[0]; + tensor->shape.dim[1] = neuron->shape[1]; + tensor->shape.dim[2] = neuron->shape[2]; + tensor->shape.dim[3] = neuron->shape[3]; + tensor->shape.dim_size = 4; + tensor->fmt = (CVI_FMT)neuron->fmt; + tensor->count = neuron->count(); + tensor->mem_type = CVI_MEM_SYSTEM; + tensor->mem_size = neuron->size(); + tensor->sys_mem = neuron->sys_mem(); + tensor->paddr = neuron->paddr(); + tensor->qscale = neuron->qscale(); + tensor->zero_point = neuron->zero_point(); + tensor->pixel_format = neuron->pixel_format; + tensor->aligned = neuron->aligned; + for (int i = 0; i < (int)neuron->scale.size(); i++) { + tensor->scale[i] = neuron->scale[i]; + } + for (int i = 0; i < (int)neuron->mean.size(); i++) { + tensor->mean[i] = neuron->mean[i]; + } + tensor->owner = program; +} + +CVI_TENSOR *Program::exportInputs(int32_t &size) { + size = this->in_tensors.size(); + auto *tensors = new CVI_TENSOR[size]; + if (!tensors) { + return nullptr; + } + for (int i = 0; i < size; i++) { + exportTensorInfo(this, this->in_tensors[i], tensors + i); + } + return tensors; +} + +CVI_TENSOR *Program::exportOutputs(int32_t &size) { + int i = 0; + CVI_TENSOR *tensors = nullptr; + + if (!_export_all_tensors) { + size = (int)out_tensors.size(); + tensors = new CVI_TENSOR[size]; + if (!tensors) { + goto Error; + } + for (; i < size; i++) { + exportTensorInfo(this, out_tensors[i], tensors + i); + } + } else { + for (auto &kv : neuron_map) { + auto &tensor = kv.second; + if (!tensor->overwrote()) + ++i; + } + size = i; + tensors = new CVI_TENSOR[size]; + if (!tensors) { + goto Error; + } + i = 0; + for (auto &kv : neuron_map) { + auto &tensor = kv.second; + if (tensor->overwrote()) + continue; + + exportTensorInfo(this, tensor, tensors + i); + ++i; + } + size = i; + } + return tensors; + +Error: + if (tensors) { + delete[] tensors; + } + size = 0; + return nullptr; +} + +bool Program::forward(CVI_TENSOR *inputs, int input_num, + CVI_TENSOR *outputs, int output_num) { +#ifdef MEASURE_TIME + struct timeval t0, t1; + long elapsed; + gettimeofday(&t0, NULL); +#endif + + TPU_ASSERT(input_num == (int)in_tensors.size(), nullptr); + for (int i = 0; i < (int)in_tensors.size(); i++) { + auto &tensor = this->in_tensors[i]; + tensor->load(inputs[i]); + } + +#ifdef MEASURE_TIME + gettimeofday(&t1, NULL); + elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; + printf(" PERF: [load ] %ld us\n", elapsed); + t0 = t1; +#endif + + if (!this->run()) { + return false; + } + +#ifdef MEASURE_TIME + gettimeofday(&t1, NULL); + elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; + printf(" PERF: [run ] %ld us\n", elapsed); + t0 = t1; +#endif 
+  if (!_export_all_tensors) {
+    TPU_ASSERT(output_num == (int)out_tensors.size(), nullptr);
+    for (int i = 0; i < (int)out_tensors.size(); i++) {
+      out_tensors[i]->store(outputs[i]);
+    }
+  } else {
+    int i = 0;
+    for (auto &kv : neuron_map) {
+      auto &tensor = kv.second;
+      if (tensor->overwrote())
+        continue;
+
+      tensor->store(outputs[i]);
+      ++i;
+    }
+    TPU_ASSERT(output_num == i, nullptr);
+  }
+
+#ifdef MEASURE_TIME
+  gettimeofday(&t1, NULL);
+  elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
+  printf("  PERF: [store  ] %ld us\n", elapsed);
+#endif
+
+  return true;
+}
+
+void *Program::forwardAsync(CVI_TENSOR *inputs, int input_num, CVI_TENSOR *outputs,
+                            int output_num) {
+  _pool->startPool();
+  return new Task(_pool, (void *)this, inputs, input_num, outputs, output_num);
+}
+
+CVI_RC Program::forwardWait(void *task) {
+  auto *myTask = (Task *)task;
+  _pool->waitTask(myTask);
+  return myTask->retCode;
+}
+
+bool Program::run() {
+  // reset all routines for new inference.
+  for (auto &r : _routines) {
+    r->reset();
+  }
+  for (auto &r : _routines) {
+    r->run();
+  }
+  return true;
+}
+
+void TpuRoutine::reset() {
+  for (auto &neuron : outputs) {
+    neuron->setState(Neuron::TPU_MEM);
+  }
+}
+
+int TpuRoutine::init_dmabuf(Program *program, const std::string &name) {
+  auto iter = program->dmabuf_map.find(name);
+  if (program->dmabuf_map.end() == iter) {
+    assert(0);
+  }
+  buf_mem = iter->second;
+
+#ifdef ENABLE_PMU
+  const char *pmu_enable_env = std::getenv("TPU_ENABLE_PMU");
+  if (pmu_enable_env) {
+    if (atoi(pmu_enable_env) > 0) {
+      this->enable_pmu = true;
+    }
+  }
+#else
+  TPU_LOG_WARNING("Tpu pmu is not supported! please recompile with ENABLE_PMU\n");
+#endif
+  return CVI_RC_SUCCESS;
+}
+
+bool TpuRoutine::initialize(const cvi::model::Routine *routine) {
+  // setup input & output tensors
+  auto &in_tensors = *routine->in_tensors();
+  for (auto i : in_tensors) {
+    inputs.push_back(_program->neuron_map[i->str()]);
+  }
+  auto &out_tensors = *routine->out_tensors();
+  for (auto o : out_tensors) {
+    outputs.push_back(_program->neuron_map[o->str()]);
+  }
+
+  int ret = 0;
+  // find cmdbuf section
+  if (routine->tpu_routine()->cmdbuf_section()) {
+    ret = init_dmabuf(_program, routine->tpu_routine()->cmdbuf_section()->str());
+  } else if (routine->tpu_routine()->dmabuf_section()) {
+    ret = init_dmabuf(_program, routine->tpu_routine()->dmabuf_section()->str());
+  } else {
+    TPU_LOG_ERROR("model contains neither a cmdbuf section nor a dmabuf section!\n");
+    return false;
+  }
+
+  TPU_ASSERT(ret == 0, "CVI_RT_LoadCmdbuf failed");
+  return true;
+}
+
+CVI_RC TpuRoutine::run() {
+#ifdef MEASURE_TIME
+  struct timeval t0, t1;
+  long elapsed;
+  gettimeofday(&t0, NULL);
+#endif
+
+  for (auto &neuron : inputs) {
+    neuron->toTpu();
+  }
+
+#ifdef MEASURE_TIME
+  gettimeofday(&t1, NULL);
+  elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
+  printf("  PERF: [to_tpu ] %ld us\n", elapsed);
+#endif
+
+  CVI_RC ret = CVI_SUCCESS;
+  CVI_RT_ARRAYBASE *baseArray =
+      reinterpret_cast<CVI_RT_ARRAYBASE *>(&(_program->baseAddrArray[0]));
+  if (this->encrypted) {
+    ret = CVI_RT_RunCmdbufTee(_ctx, buf_mem, baseArray);
+  } else {
+    ret = CVI_RT_RunCmdbufEx(_ctx, buf_mem, baseArray);
+  }
+
+  if (ret != 0) {
+    TPU_LOG_ERROR("run cmdbuf failed:%d\n", ret);
+    return CVI_FAILURE;
+  }
+
+#ifdef MEASURE_TIME
+  gettimeofday(&t1, NULL);
+  elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
+  printf("  PERF: [tpu_run] %ld us\n", elapsed);
+#endif
+
+  // if we need to dump pmubuf
+  if (this->enable_pmu) {
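+    // locate the PMU records inside the loaded cmdbuf memory before dumping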
+    uint8_t *pmubuf = nullptr;
+    uint32_t buf_len = 0;
+    CVI_RT_ParsePmuBuf(buf_mem, &pmubuf, &buf_len);
+#ifdef DUMP_PMU_RAW
+    if (pmubuf && buf_len) {
+      const char *pmubuf_output_file_env = std::getenv("TPU_PMUBUF_OUTPUT_FILE");
+      if (pmubuf_output_file_env) {
+        std::fstream f_output(pmubuf_output_file_env,
+                              std::ios::out | std::ios::trunc | std::ios::binary);
+        f_output.write((char *)pmubuf, buf_len);
+      }
+    }
+#endif
+  }
+  return CVI_SUCCESS;
+}
+
+void CpuRoutine::reset() {
+  for (auto &neuron : outputs) {
+    neuron->setState(Neuron::CPU_MEM);
+  }
+}
+
+void CpuRoutine::handleFuncArgs(const uint8_t *args, OpParam &param) {
+  auto packed_param = cvi::cpu_op::GetParameter(args);
+  auto &attributes = *packed_param->attributes();
+  for (auto attr : attributes) {
+    if (attr->int_attr()) {
+      auto _int = attr->int_attr();
+      param.put(_int->key()->str(), _int->value());
+    } else if (attr->float_attr()) {
+      auto _float = attr->float_attr();
+      param.put(_float->key()->str(), _float->value());
+    } else if (attr->bool_attr()) {
+      auto _bool = attr->bool_attr();
+      param.put(_bool->key()->str(), _bool->value());
+    } else if (attr->str_attr()) {
+      auto _str = attr->str_attr();
+      param.put(_str->key()->str(), _str->value()->str());
+    } else if (attr->int_array_attr()) {
+      auto _int_array = attr->int_array_attr();
+      std::vector<int32_t> vec;
+      auto &value = *_int_array->value();
+      for (auto v : value) {
+        vec.push_back(v);
+      }
+      param.put<std::vector<int32_t>>(_int_array->key()->str(), vec);
+    } else if (attr->float_array_attr()) {
+      auto _float_array = attr->float_array_attr();
+      std::vector<float> vec;
+      auto &value = *_float_array->value();
+      for (auto v : value) {
+        vec.push_back(v);
+      }
+      param.put<std::vector<float>>(_float_array->key()->str(), vec);
+    } else {
+      assert(0);
+    }
+  }
+}
+
+void CpuRoutine::fetchQscaleFromDequant(OpParam &param) {
+  if (param.get<std::string>("from") == "NONE" &&
+      param.get<std::string>("to") == "INT8") {
+    float scale = param.has("threshold") ?
+        (128.0 / param.get<float>("threshold"))
+        : param.get<float>("scale");
+    outputs[0]->setQScale(scale);
+  }
+}
+
+bool CpuRoutine::initialize(const cvi::model::Routine *routine) {
+  // setup input & output tensors
+  auto &in_tensors = *routine->in_tensors();
+  for (auto i : in_tensors) {
+    auto name = i->str();
+    auto it = _program->neuron_map.find(name);
+    if (it != _program->neuron_map.end()) {
+      auto &neuron = it->second;
+      inputs.push_back(neuron);
+    } else {
+      //TPU_LOG_DEBUG("CpuRoutine need load weight, tensor name: %s\n", name.c_str());
+      auto it = _program->weight_map.find(name);
+      if (it == _program->weight_map.end()) {
+        TPU_LOG_ERROR("Cannot find weight in map, %s\n", name.c_str());
+        return false;
+      }
+      auto &neuron = it->second;
+      neuron->toCpu();
+      inputs.push_back(neuron);
+    }
+  }
+  auto &out_tensors = *routine->out_tensors();
+  for (auto o : out_tensors) {
+    auto &neuron = _program->neuron_map[o->str()];
+    outputs.push_back(neuron);
+  }
+
+  auto func_name = routine->cpu_routine()->function_section()->str();
+  auto func_args = routine->cpu_routine()->function_args();
+  for (auto f : _program->cpu_functions) {
+    if (f->name == func_name) {
+      _func_open = f->func_open;
+      break;
+    }
+  }
+  if (!_func_open) {
+    TPU_LOG_ERROR("Cannot find runtime function of %s\n", func_name.c_str());
+    return false;
+  }
+  _func = _func_open();
+  OpParam param;
+  if (func_args) {
+    handleFuncArgs(func_args->data(), param);
+    if (func_name == "quant") {
+      fetchQscaleFromDequant(param);
+    }
+  }
+  _func->setup(inputs, outputs, param);
+  return true;
+}
+
+CVI_RC CpuRoutine::prepare() {
+  CVI_RC ret;
+  for (auto &input : inputs) {
+    ret = input->reserveSysMem();
+    if (ret != CVI_RC_SUCCESS) {
+      return ret;
+    }
+  }
+  for (auto &output : outputs) {
+    ret = output->reserveSysMem();
+    if (ret != CVI_RC_SUCCESS) {
+      return ret;
+    }
+  }
+  return CVI_RC_SUCCESS;
+}
+
+CVI_RC CpuRoutine::run() {
+#ifdef MEASURE_TIME
+  struct timeval t0, t1;
+  long elapsed;
+  gettimeofday(&t0, NULL);
+#endif
+
+  for (auto &neuron : inputs) {
+    neuron->toCpu();
+  }
+
+#ifdef MEASURE_TIME
+  gettimeofday(&t1, NULL);
+  elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
+  printf("  PERF: [to_cpu ] %ld us\n", elapsed);
+  t0 = t1;
+#endif
+
+  _func->run();
+
+#ifdef MEASURE_TIME
+  gettimeofday(&t1, NULL);
+  elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
+  printf("  PERF: [cpu_run] %ld us, %s\n", elapsed, outputs[0]->name.c_str());
+#endif
+
+  return CVI_SUCCESS;
+}
+
+} // namespace runtime
+} // namespace cvi
diff --git a/cviruntime/src/common/runtime.cpp b/cviruntime/src/common/runtime.cpp
new file mode 100644
index 000000000..2864b1b98
--- /dev/null
+++ b/cviruntime/src/common/runtime.cpp
@@ -0,0 +1,584 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cviruntime.h"
+#include "cviruntime_context.h"
+#include "alloc.h"
+
+using namespace cvi::runtime;
+
+static CVI_RT_HANDLE g_ctx = nullptr;
+static int g_ctx_ref_count = 0;
+static std::mutex g_ctx_mutex;
+static int g_model_count = 0;
+
+struct ModelInstance {
+  ModelInstance(CviModel *model) :
+      model(model), program(nullptr) {
+    program_num = model->program_num;
+  }
+
+  ~ModelInstance() {
+    if (inputs) {
+      delete[] inputs;
+    }
+    if (outputs) {
+      delete[] outputs;
+    }
+    if (program) {
+      delete program;
+    }
+    model->release();
+  }
+
+  cvi::runtime::CviModel *model;
+  cvi::runtime::Program *program = nullptr;
+  CVI_TENSOR *inputs = nullptr;
+  CVI_TENSOR *outputs = nullptr;
+  int32_t input_num = 0;
+  int32_t output_num = 0;
+  int32_t program_id = 0;
+  int32_t program_num = 1;
+  bool output_all_tensors_for_debug = false;
+  bool skip_preprocess = false;
+};
+
+static void setChipTypeForCmodel(const char *modelFile, const int8_t *buf, size_t size) {
+#if defined(__x86_64__) || defined(_M_X64)
+  std::string filename = modelFile != nullptr ? modelFile : "";
+  std::string chip_target = CviModel::getChipType(filename, buf, size);
+  setenv("SET_CHIP_NAME", chip_target.c_str(), 1);
+  TPU_LOG_ERROR("setenv:%s\n", chip_target.c_str());
+#endif
+}
+
+static void setChipTypeForCmodelFd(const int fd, const size_t ud_offset) {
+#if defined(__x86_64__) || defined(_M_X64)
+  BaseStream *stream = new FdStream(fd, ud_offset);
+  if (stream->length() <= sizeof(MODEL_HEADER)) {
+    TPU_LOG_ERROR("Error, invalid cvimodel file\n");
+    assert(0);
+  }
+  MODEL_HEADER header;
+  stream->read((uint8_t *)&header, 0, sizeof(header));
+  delete stream;
+  std::string chip_target = std::string(header.chip);
+  setenv("SET_CHIP_NAME", chip_target.c_str(), 1);
+  TPU_LOG_ERROR("setenv:%s\n", chip_target.c_str());
+#endif
+}
+
+// Construct the model from a file descriptor and a user-defined offset.
+CVI_RC CVI_NN_RegisterModelFromFd(const int fd, const size_t ud_offset, CVI_MODEL_HANDLE *model) {
+  *model = NULL;
+  const std::lock_guard<std::mutex> lock(g_ctx_mutex);
+
+  if (!g_ctx) {
+    setChipTypeForCmodelFd(fd, ud_offset);
+    CVI_RT_Init(&g_ctx);
+  }
+
+  auto _model = new CviModel(g_ctx, g_model_count++);
+  if (!_model) {
+    TPU_LOG_ERROR("failed to create a CviModel Instance\n");
+    return CVI_RC_FAILURE;
+  }
+  CVI_RC ret = _model->acquire(fd, ud_offset);
+  if (ret != CVI_RC_SUCCESS) {
+    _model->release();
+    return ret;
+  }
+  auto instance = new ModelInstance(_model);
+  if (!instance) {
+    _model->release();
+    return CVI_RC_FAILURE;
+  }
+
+  g_ctx_ref_count++;
+  *model = (void *)instance;
+  return CVI_RC_SUCCESS;
+}
+
+CVI_RC CVI_NN_RegisterModel(const char *modelFile, CVI_MODEL_HANDLE *model) {
+  *model = NULL;
+  const std::lock_guard<std::mutex> lock(g_ctx_mutex);
+
+  if (!g_ctx) {
+    setChipTypeForCmodel(modelFile, nullptr, 0);
+    CVI_RT_Init(&g_ctx);
+  }
+
+  auto _model = new CviModel(g_ctx, g_model_count++);
+  if (!_model) {
+    TPU_LOG_ERROR("failed to create a CviModel Instance\n");
+    return CVI_RC_FAILURE;
+  }
+  CVI_RC ret = _model->acquire(modelFile);
+  if (ret != CVI_RC_SUCCESS) {
+    _model->release();
+    return ret;
+  }
+  auto instance = new ModelInstance(_model);
+  if (!instance) {
+    _model->release();
+    return CVI_RC_FAILURE;
+  }
+
+  g_ctx_ref_count++;
+  *model = (void *)instance;
+  return CVI_RC_SUCCESS;
+}
+
+CVI_RC CVI_NN_RegisterModelFromBuffer(const int8_t *buf, uint32_t size,
+                                      CVI_MODEL_HANDLE *model) {
+  *model = NULL;
+  const std::lock_guard<std::mutex> lock(g_ctx_mutex);
+
+  if (!g_ctx) {
+    setChipTypeForCmodel(nullptr, buf, size);
+    CVI_RT_Init(&g_ctx);
+  }
+
+  auto _model = new CviModel(g_ctx, g_model_count++);
+  if (!_model) {
+    TPU_LOG_ERROR("failed to create a CviModel Instance\n");
+    return CVI_RC_FAILURE;
+  }
+  CVI_RC ret = _model->acquire(buf, size);
+  if (ret != CVI_RC_SUCCESS) {
+    _model->release();
+    return ret;
+  }
+  auto instance = new ModelInstance(_model);
+  if (!instance) {
+    _model->release();
+    return CVI_RC_FAILURE;
+  }
+
+  g_ctx_ref_count++;
+  *model = (void *)instance;
+  return CVI_RC_SUCCESS;
+}
+
+CVI_RC CVI_NN_CloneModel(CVI_MODEL_HANDLE model, CVI_MODEL_HANDLE *clonedModel) {
+  const std::lock_guard<std::mutex> lock(g_ctx_mutex);
+  ++g_ctx_ref_count;
+  auto instance = new ModelInstance(((struct ModelInstance *)model)->model);
+  if (!instance) {
+    --g_ctx_ref_count;
+    return CVI_RC_FAILURE;
+  }
+  instance->model->refer();
+  *clonedModel = (void *)instance;
+  return CVI_RC_SUCCESS;
+}
+
+CVI_RC CVI_NN_GetModelVersion(CVI_MODEL_HANDLE model, int32_t *major, int32_t *minor) {
+  auto instance = (struct ModelInstance *)model;
+  *major = instance->model->major_ver;
+  *minor = instance->model->minor_ver;
+  return CVI_RC_SUCCESS;
+}
+
+const char *CVI_NN_GetModelTarget(CVI_MODEL_HANDLE model) {
+  auto instance = (struct ModelInstance *)model;
+  return instance->model->targetChipType.c_str();
+}
+
+CVI_RC CVI_NN_SetConfig(CVI_MODEL_HANDLE model, CVI_CONFIG_OPTION option, ...) {
+  va_list valist;
+  auto instance = (struct ModelInstance *)model;
+
+  va_start(valist, option);
+  switch (option) {
+  case OPTION_BATCH_SIZE:
+    instance->program_id = 0;
+    break;
+  case OPTION_OUTPUT_ALL_TENSORS:
+    instance->output_all_tensors_for_debug = va_arg(valist, int32_t);
+    break;
+  case OPTION_PROGRAM_INDEX:
+    instance->program_id = va_arg(valist, int32_t);
+    assert(instance->program_id < instance->program_num);
+    break;
+  case OPTION_SKIP_PREPROCESS:
+  case OPTION_SKIP_POSTPROCESS:
+  case OPTION_INPUT_MEM_TYPE:
+  case OPTION_OUTPUT_MEM_TYPE:
+  case OPTION_PREPARE_BUF_FOR_INPUTS:
+  case OPTION_PREPARE_BUF_FOR_OUTPUTS:
+    TPU_LOG_WARNING("deprecated option:%d\n", (int)option);
+    break;
+  default:
+    TPU_LOG_ERROR("unsupported option:%d\n", (int)option);
+    assert(0);
+  }
+  va_end(valist);
+  return CVI_RC_SUCCESS;
+}
+
+CVI_RC CVI_NN_GetInputOutputTensors(CVI_MODEL_HANDLE model, CVI_TENSOR **inputs,
+                                    int32_t *input_num, CVI_TENSOR **outputs,
+                                    int32_t *output_num) {
+  CVI_RC ret;
+  auto instance = (struct ModelInstance *)model;
+  if (!instance->program) {
+    ret = instance->model->loadProgram(
+        &(instance->program), instance->program_id,
+        instance->output_all_tensors_for_debug,
+        instance->skip_preprocess);
+    if (ret != CVI_RC_SUCCESS) {
+      TPU_LOG_ERROR("ret:%d\n", ret);
+      return ret;
+    }
+  }
+
+  if (!instance->inputs) {
+    instance->inputs = instance->program->exportInputs(instance->input_num);
+    if (!instance->inputs) {
+      return CVI_RC_FAILURE;
+    }
+  }
+  if (!instance->outputs) {
+    instance->outputs = instance->program->exportOutputs(instance->output_num);
+    if (!instance->outputs) {
+      return CVI_RC_FAILURE;
+    }
+  }
+
+  if (inputs)
+    *inputs = instance->inputs;
+  if (input_num)
+    *input_num = instance->input_num;
+  if (outputs)
+    *outputs = instance->outputs;
+  if (output_num)
+    *output_num = instance->output_num;
+  return CVI_RC_SUCCESS;
+}
+
+CVI_RC CVI_NN_Forward(CVI_MODEL_HANDLE model, CVI_TENSOR inputs[], int32_t input_num,
+                      CVI_TENSOR outputs[], int output_num) {
+  auto instance = (struct ModelInstance *)model;
+  if (instance->program->forward(inputs, input_num, outputs, output_num))
+    return CVI_RC_SUCCESS;
+  return CVI_RC_FAILURE;
+}
+
+CVI_RC CVI_NN_ForwardAsync(CVI_MODEL_HANDLE model, CVI_TENSOR inputs[], int input_num,
+                           CVI_TENSOR outputs[], int output_num, void **taskNo) {
+  auto instance = (struct ModelInstance *)model;
+  *taskNo = instance->program->forwardAsync(inputs, input_num, outputs, output_num);
+  return CVI_RC_SUCCESS;
+}
+
+CVI_RC CVI_NN_ForwardWait(CVI_MODEL_HANDLE model, void *taskNo) {
+  auto instance = (struct ModelInstance *)model;
+  return instance->program->forwardWait(taskNo);
+}
+
+CVI_RC CVI_NN_CleanupModel(CVI_MODEL_HANDLE model) {
+  if (model) {
+    delete (struct ModelInstance *)model;
+  }
+
+  const std::lock_guard<std::mutex> lock(g_ctx_mutex);
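+  // release the shared runtime context once the last registered model is gone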
+  g_ctx_ref_count--;
+  if (g_ctx_ref_count == 0) {
+    CVI_RT_DeInit(g_ctx);
+    g_ctx = nullptr;
+  }
+  return CVI_RC_SUCCESS;
+}
+
+///
+/// Helper functions
+///
+CVI_RC CVI_NN_GetInputTensors(CVI_MODEL_HANDLE model, CVI_TENSOR **inputs, int32_t *input_num) {
+  return CVI_NN_GetInputOutputTensors(model, inputs, input_num, nullptr, nullptr);
+}
+
+CVI_RC CVI_NN_GetOutputTensors(CVI_MODEL_HANDLE model, CVI_TENSOR **outputs, int32_t *output_num) {
+  return CVI_NN_GetInputOutputTensors(model, nullptr, nullptr, outputs, output_num);
+}
+
+CVI_TENSOR *CVI_NN_GetTensorByName(const char *name, CVI_TENSOR *tensors, int32_t num) {
+  if (name == CVI_NN_DEFAULT_TENSOR) {
+    if (num == 1) {
+      return &tensors[0];
+    } else {
+      return NULL;
+    }
+  }
+  // if last char of name is '*', use strncmp instead.
+  int sz = strlen(name);
+  bool has_wildcard = (name[sz - 1] == '*');
+  for (int32_t i = 0; i < num; i++) {
+    if (!has_wildcard) {
+      if (strcmp(tensors[i].name, name) == 0) {
+        return &tensors[i];
+      }
+    } else if (strncmp(tensors[i].name, name, sz - 1) == 0) {
+      return &tensors[i];
+    }
+  }
+  return NULL;
+}
+
+char *CVI_NN_TensorName(CVI_TENSOR *tensor) { return tensor->name; }
+
+void *CVI_NN_TensorPtr(CVI_TENSOR *tensor) {
+  if (tensor->mem_type == CVI_MEM_SYSTEM) {
+    return (void *)tensor->sys_mem;
+  } else if (tensor->mem_type == CVI_MEM_DEVICE) {
+    TPU_LOG_ERROR("Try to get mem ptr with device memory\n");
+    return nullptr;
+  } else {
+    TPU_LOG_ERROR("Try to get mem ptr with unknown type\n");
+    return nullptr;
+  }
+}
+
+size_t CVI_NN_TensorSize(CVI_TENSOR *tensor) {
+  return tensor->mem_size;
+}
+
+size_t CVI_NN_TensorCount(CVI_TENSOR *tensor) {
+  return tensor->count;
+}
+
+CVI_SHAPE CVI_NN_TensorShape(CVI_TENSOR *tensor) {
+  return tensor->shape;
+}
+
+float CVI_NN_TensorQuantScale(CVI_TENSOR *tensor) {
+  return tensor->qscale;
+}
+
+int CVI_NN_TensorQuantZeroPoint(CVI_TENSOR *tensor) {
+  return tensor->zero_point;
+}
+
+CVI_RC CVI_NN_SetTensorPtr(CVI_TENSOR *tensor, void *mem) {
+  assert(mem);
+  tensor->sys_mem = (uint8_t *)mem;
+  tensor->mem_type = CVI_MEM_SYSTEM;
+  return CVI_RC_SUCCESS;
+}
+
+CVI_RC CVI_NN_SetTensorPhysicalAddr(CVI_TENSOR *tensor, uint64_t paddr) {
+  tensor->paddr = paddr;
+  tensor->mem_type = CVI_MEM_DEVICE;
+
+  assert(tensor->owner);
+  auto program = static_cast<Program *>(tensor->owner);
+  for (auto &input_tensor : program->input_tensors()) {
+    if (input_tensor->name == tensor->name) {
+      input_tensor->updateBaseAddr(paddr);
+      return CVI_RC_SUCCESS;
+    }
+  }
+  for (auto &output_tensor : program->output_tensors()) {
+    if (output_tensor->name == tensor->name) {
+      output_tensor->updateBaseAddr(paddr);
+      return CVI_RC_SUCCESS;
+    }
+  }
+  assert(0 && "invalid tensor");
+  return CVI_RC_FAILURE;
+}
+
+static std::shared_ptr<Neuron> findTargetInput(CVI_TENSOR *tensor) {
+  auto program = static_cast<Program *>(tensor->owner);
+  for (auto &input : program->input_tensors()) {
+    if (input->name == tensor->name) {
+      return input;
+    }
+  }
+  assert(0);
+  return nullptr;
+}
+
+CVI_RC CVI_NN_SetTensorWithVideoFrame(
+    CVI_MODEL_HANDLE model, CVI_TENSOR* tensor,
+    CVI_VIDEO_FRAME_INFO* video_frame_info) {
+  (void)model;
+
+  // check param
+  // check frame type
+#if 0
+  // video_frame_info->type is CVI_FRAME_PLANAR on early samples,
+  // so don't check
+  if (tensor->pixel_format != video_frame_info->type) {
+    TPU_LOG_ERROR("Frame format error!
[need|%d] vs [input|%d]\n", + tensor->pixel_format, video_frame_info->type); + return CVI_RC_DATA_ERR; + } +#endif + + // check shape + if (tensor->shape.dim[1] != video_frame_info->shape.dim[1] || + tensor->shape.dim[2] != video_frame_info->shape.dim[2] || + tensor->shape.dim[3] != video_frame_info->shape.dim[3]) { + TPU_LOG_ERROR("Frame size error! [need|%d, %d, %d, %d] vs [input|%d, %d, %d, %d]\n", + tensor->shape.dim[0], tensor->shape.dim[1], tensor->shape.dim[2], tensor->shape.dim[3], + video_frame_info->shape.dim[0], video_frame_info->shape.dim[1], + video_frame_info->shape.dim[2], video_frame_info->shape.dim[3]); + return CVI_RC_DATA_ERR; + } + + int n = tensor->shape.dim[0]; + assert(n == 1); + tensor->mem_type = CVI_MEM_DEVICE; + + auto input = findTargetInput(tensor); + CVI_RC ret = CVI_RC_SUCCESS; + if (!tensor->aligned) { + int c = input->isPacked() ? 1 : tensor->shape.dim[1]; + assert(c <= 3); + for (int i = 0; i < c; i++) { + ret = input->preloadChannelAndCompact(i, video_frame_info->pyaddr[i]); + if (ret != CVI_RC_SUCCESS) { + TPU_LOG_ERROR("CVI_NN_SetTensorWithVideoFrame fail!"); + return ret; + } + } + } else { + /* check y_align w_align channel_align + 1.on cv183x channel_align is 0x1000 + 2.on cv183x yuv_420_planar's y_align is 64, w_align is 32 + 3.on cv182x yuv_420_planar's y_align is 128, w_align is 64 + */ + CVI_NN_SetTensorWithAlignedFrames( + tensor, &(video_frame_info->pyaddr[0]), 1, + video_frame_info->type); + } + return CVI_RC_SUCCESS; +} + +CVI_RC CVI_NN_SetTensorWithAlignedFrames( + CVI_TENSOR *tensor, uint64_t frame_paddrs[], + int32_t frame_num, CVI_NN_PIXEL_FORMAT_E pixel_format) { + + assert(tensor->owner); + assert(frame_num <= tensor->shape.dim[0]); + tensor->mem_type = CVI_MEM_DEVICE; + + auto input = findTargetInput(tensor); + CVI_RC ret = CVI_RC_SUCCESS; + + if (!tensor->aligned) { + for (int i = 0; i < frame_num; i++) { + ret = input->preloadFrameAndCompact(i, frame_paddrs[i]); + if (ret != CVI_RC_SUCCESS) { + TPU_LOG_ERROR("CVI_NN_SetTensorWithAlignedFrames unaligned fail!"); + return ret; + } + } + } else { + // check pixel format + if (pixel_format != tensor->pixel_format) { + TPU_LOG_ERROR("pixel_format is not correct, %d vs %d\n", tensor->pixel_format, pixel_format); + assert(0); + } + if (frame_num == 1 && tensor->shape.dim[0] == 1) { + CVI_NN_SetTensorPhysicalAddr(tensor, frame_paddrs[0]); + } else { + for (int i = 0; i < frame_num; i++) { + ret = input->preload(i, frame_paddrs[i]); + if (ret != CVI_RC_SUCCESS) { + TPU_LOG_ERROR("CVI_NN_SetTensorWithAlignedFrames aligned fail!"); + return ret; + } + } + } + } + return CVI_RC_SUCCESS; +} + +CVI_RC CVI_NN_FeedTensorWithFrames( + CVI_MODEL_HANDLE model, CVI_TENSOR *tensor, + CVI_FRAME_TYPE type, CVI_FMT format, + int32_t channel_num, uint64_t *channel_paddrs, + int32_t height, int32_t width, uint32_t height_stride) { + + (void)model; + (void)height_stride; + (void)format; + + // check param + // check frame type +#if 0 + // video_frame_info->type is CVI_FRAME_PLANAR on early sampes, + // so don't check for now + if (tensor->pixel_format != type) { + TPU_LOG_ERROR("Frame format error! [need|%d] vs [input|%d]\n", + tensor->pixel_format, format); + return CVI_RC_DATA_ERR; + } +#endif + + auto input = findTargetInput(tensor); + // check width height + if (input->isPacked()) { + if (tensor->shape.dim[1] != height || tensor->shape.dim[2] != width) { + TPU_LOG_ERROR("Frame size error! 
[(w, h) need|%d, %d] vs [input|%d, %d]\n", + tensor->shape.dim[2], tensor->shape.dim[1], width, height); + return CVI_RC_DATA_ERR; + } + } else { + if (tensor->shape.dim[2] != height || tensor->shape.dim[3] != width) { + TPU_LOG_ERROR("Frame size error! [(w, h) need|%d, %d] vs [input|%d, %d]\n", + tensor->shape.dim[3], tensor->shape.dim[2], width, height); + return CVI_RC_DATA_ERR; + } + } + + int n = tensor->shape.dim[0]; + assert(n == 1); + tensor->mem_type = CVI_MEM_DEVICE; + CVI_RC ret = CVI_RC_SUCCESS; + + if (!tensor->aligned) { + int c = input->isPacked() ? 1 : tensor->shape.dim[1]; + assert(channel_num <= c); + for (int i = 0; i < channel_num; i++) { + ret = input->preloadChannelAndCompact(i, channel_paddrs[i]); + if (ret != CVI_RC_SUCCESS) { + TPU_LOG_WARNING("FeedTensor failed\n"); + return CVI_RC_FAILURE; + } + } + } else { + /* check y_align w_align channel_align + 1.on cv183x channel_align is 0x1000 + 2.on cv183x yuv_420_planar's y_align is 64, w_align is 32 + 3.on cv182x yuv_420_planar's y_align is 128, w_align is 64 + */ + CVI_NN_SetTensorWithAlignedFrames(tensor, &(channel_paddrs[0]), 1, type); + } + return CVI_RC_SUCCESS; +} + +CVI_RC CVI_RT_Global_SetMemAllocCallback(CVI_MEM_ALLOC_CB alloc_cb, CVI_MEM_FREE_CB free_cb) { + return cviSetMemCallback(alloc_cb, free_cb); +} + +void CVI_RT_Global_ResetMemAllocCallback() { + return cviResetMemCallback(); +} + +void CVI_NN_Global_SetSharedMemorySize(size_t size) { + setSharedMemSize(size); +} diff --git a/cviruntime/src/common/section.cpp b/cviruntime/src/common/section.cpp new file mode 100644 index 000000000..b2e91636d --- /dev/null +++ b/cviruntime/src/common/section.cpp @@ -0,0 +1,86 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(__aarch64__) || defined(__arm__) || (__GNUC__ < 6) +#include +#include + +/* + * Before glibc version 2.27 there was no wrapper for memfd_create(2), + * so we have to provide our own. + * + * Also define memfd fcntl sealing macros. While they are already + * defined in the kernel header file , that file as + * a whole conflicts with the original glibc header . + */ + +static inline int memfd_create(const char *name, unsigned int flags) { + return syscall(SYS_memfd_create, name, flags); +} +#endif + +namespace cvi { +namespace runtime { + +bool CustomFunctionSection::load(BaseStream *stream, size_t offset, size_t size, + std::vector &cpu_functions) { + char path[64]; + uint8_t *buf = new uint8_t[size]; + if (!buf) { + TPU_LOG_ERROR("Error, failed to allocate memory\n"); + return false; + } + stream->read(buf, offset, size); + // load function by dlopen. 
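+  // The custom-op plugin lives inside the cvimodel, so there is no .so on disk
+  // to dlopen() directly: write the section bytes into an anonymous in-memory
+  // file (memfd_create) and dlopen() it through its /proc/<pid>/fd/<n> path,
+  // avoiding a temporary file on the filesystem.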
+  shm_fd = memfd_create("cvitek", MFD_CLOEXEC);
+  if (-1 == write(shm_fd, buf, size)) {
+    TPU_LOG_ERROR("Error, write data to shared mem failed:%d\n", errno);
+    delete[] buf;
+    return false;
+  }
+  delete[] buf;
+
+  snprintf(path, sizeof(path), "/proc/%d/fd/%d", getpid(), shm_fd);
+  dso_handle = dlopen(path, RTLD_NOW | RTLD_LOCAL);
+  if (!dso_handle) {
+    TPU_LOG_ERROR("Error, dlopen %s, %s\n", path, dlerror());
+    return false;
+  }
+
+  auto num = (int *)dlsym(dso_handle, "customOpRuntimeFuncsNum");
+  if (!num) {
+    TPU_LOG_ERROR("Error, dlsym find 'customOpRuntimeFuncsNum' failed\n");
+    return false;
+  }
+
+  auto custom_funcs = (CustomOpRuntimeFunc *)dlsym(dso_handle, "customOpRuntimeFuncs");
+  if (!custom_funcs) {
+    TPU_LOG_ERROR("Error, dlsym find 'customOpRuntimeFuncs' failed\n");
+    return false;
+  }
+
+  for (int i = 0; i < (*num); i++) {
+    cpu_functions.push_back(
+        new CpuRuntimeFunction(custom_funcs[i].name, custom_funcs[i].func));
+  }
+  return true;
+}
+
+CustomFunctionSection::~CustomFunctionSection() {
+  if (dso_handle)
+    dlclose(dso_handle);
+  if (shm_fd)
+    close(shm_fd);
+}
+
+} // namespace runtime
+} // namespace cvi
diff --git a/cviruntime/src/common/shared_mem.cpp b/cviruntime/src/common/shared_mem.cpp
new file mode 100644
index 000000000..333589759
--- /dev/null
+++ b/cviruntime/src/common/shared_mem.cpp
@@ -0,0 +1,80 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "cviruntime_context.h"
+#include "alloc.h"
+
+namespace cvi {
+namespace runtime {
+
+static std::mutex gMutexLock;
+static std::list<CVI_RT_MEM> gSharedMemList;
+static size_t gMaxSharedMemSize = 0;
+
+void setSharedMemSize(size_t size) {
+  #define PAGESIZE 4096
+  const std::lock_guard<std::mutex> lock(gMutexLock);
+  uint32_t mask = PAGESIZE - 1;
+  size = (size + mask) & (~mask);
+  gMaxSharedMemSize = std::max(gMaxSharedMemSize, size);
+}
+
+CVI_RT_MEM allocateSharedMemory(CVI_RT_HANDLE ctx, size_t size) {
+  const std::lock_guard<std::mutex> lock(gMutexLock);
+  size = std::max(gMaxSharedMemSize, size);
+  // check if a mem in shared list is big enough.
+  for (auto &mem : gSharedMemList) {
+    if (CVI_RT_MemGetSize(mem) >= size) {
+      TPU_LOG_DEBUG("find shared memory(%" PRIu64 "), saved:%zu \n",
+                    CVI_RT_MemGetSize(mem), size);
+      CVI_RT_MemIncRef(mem);
+      return mem;
+    }
+  }
+
+  // if no available stored mem, create a new one
+  // and insert it in sharedMemList.
+  CVI_RT_MEM mem = cviMemAlloc(ctx, size, CVI_ALLOC_SHARED, "SharedMemory");
+  if (!mem) {
+    return nullptr;
+  }
+  CVI_RT_MemIncRef(mem);
+  if (gSharedMemList.empty()) {
+    gSharedMemList.push_back(mem);
+  } else {
+    for (auto it = gSharedMemList.begin();
+         it != gSharedMemList.end();
+         ++it) {
+      if (CVI_RT_MemGetSize(*it) < size) {
+        gSharedMemList.insert(it, mem);
+        break;
+      }
+    }
+  }
+  return mem;
+}
+
+void deallocateSharedMemory(CVI_RT_HANDLE ctx, CVI_RT_MEM mem) {
+  const std::lock_guard<std::mutex> lock(gMutexLock);
+  for (auto candidate : gSharedMemList) {
+    if (candidate == mem) {
+      // if ref drops to 0, free it.
+      if (CVI_RT_MemDecRef(mem) == 0) {
+        gSharedMemList.remove(mem);
+        cviMemFree(ctx, mem);
+      }
+      // otherwise, do nothing.
+      return;
+    }
+  }
+  // not a shared mem, free it directly.
+  cviMemFree(ctx, mem);
+}
+
+} // namespace runtime
+} // namespace cvi
diff --git a/cviruntime/src/common/stream.cpp b/cviruntime/src/common/stream.cpp
new file mode 100644
index 000000000..7f513d144
--- /dev/null
+++ b/cviruntime/src/common/stream.cpp
@@ -0,0 +1,63 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace cvi {
+namespace runtime {
+
+FileStream::FileStream(const std::string& file_name) {
+  _fstream = new std::ifstream(file_name, std::ifstream::binary);
+  if (!_fstream->good()) {
+    TPU_LOG_ERROR("Error, Failed to open %s\n", file_name.c_str());
+    return;
+  }
+  _fstream->seekg(0, _fstream->end);
+  _length = _fstream->tellg();
+  _fstream->seekg(0, _fstream->beg);
+}
+
+FileStream::~FileStream() {
+  if (_fstream)
+    delete _fstream;
+}
+
+size_t FileStream::read(uint8_t *buf, size_t offset, size_t size) {
+  TPU_ASSERT(offset + size <= _length, "model is incomplete or incorrect!");
+  _fstream->seekg(offset);
+  _fstream->read((char *)buf, size);
+  return size;
+}
+
+BufferStream::BufferStream(const int8_t *buf, size_t size)
+    : buffer(buf) {
+  _length = size;
+}
+
+size_t BufferStream::read(uint8_t *buf, size_t offset, size_t size) {
+  TPU_ASSERT(offset + size <= _length, "model is incomplete or incorrect!");
+  memcpy(buf, buffer + offset, size);
+  return size;
+}
+
+FdStream::FdStream(const int fd, const size_t ud_offset) {
+  file_descriptor = fd;
+  user_define_offset = ud_offset;
+  _length = (size_t)lseek(fd, 0, SEEK_END) - ud_offset;
+}
+
+size_t FdStream::read(uint8_t *buf, size_t offset, size_t size) {
+  TPU_ASSERT(offset + size <= _length, "model is incomplete or incorrect!");
+  // when reading, add user_define_offset to the offset
+  lseek(file_descriptor, offset + user_define_offset, SEEK_SET);
+  size_t sz = ::read(file_descriptor, buf, size);
+  return sz;
+}
+
+} // namespace runtime
+} // namespace cvi
diff --git a/cviruntime/src/common/taskpool.cpp b/cviruntime/src/common/taskpool.cpp
new file mode 100644
index 000000000..2f4ecdaa3
--- /dev/null
+++ b/cviruntime/src/common/taskpool.cpp
@@ -0,0 +1,66 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace cvi {
+namespace runtime {
+
+Task::Task(TaskPool *pool, void *program, CVI_TENSOR *inputs,
+           int input_num, CVI_TENSOR *outputs, int output_num)
+    : program(program), input_num(input_num), output_num(output_num),
+      inputs(inputs), outputs(outputs) {
+  pool->addTask(this);
+}
+
+TaskPool::~TaskPool() {
+  if (_started) {
+    _done = true;
+    for (int i = 0; i < _pool_size; ++i) {
+      addTerminateTask();
+    }
+    for (auto &thread : _threads) {
+      if (thread.joinable()) {
+        thread.join();
+      }
+    }
+  }
+}
+
+void TaskPool::startPool() {
+  if (_started) {
+    return;
+  }
+  std::unique_lock<std::mutex> lock(_mutex);
+  for (int i = 0; i < _pool_size; ++i) {
+    _threads.push_back(std::thread(run, this));
+  }
+  while (!_started) {
+    usleep(10);
+  }
+}
+
+void TaskPool::workFunc() {
+  _started = true;
+  while (!_done) {
+    auto task = _queue.get();
+    if (task == nullptr)
+      continue;
+    auto program = (Program *)task->program;
+    task->retCode = (CVI_RC)program->forward(task->inputs, task->input_num, task->outputs,
+                                             task->output_num);
+    std::unique_lock<std::mutex> lock(_mutex);
+    _cond_feedback.notify_all();
+  }
+}
+
+void TaskPool::waitTask(Task *task) {
+  std::unique_lock<std::mutex> lock(_mutex);
+  while (task->retCode == CVI_RC_UNINIT)
+    _cond_feedback.wait(lock);
+}
+
+}
+}
diff --git a/cviruntime/src/lz4/lz4_all.c b/cviruntime/src/lz4/lz4_all.c
new file mode 100644
index 000000000..771d8041d
--- /dev/null
+++
b/cviruntime/src/lz4/lz4_all.c @@ -0,0 +1,2496 @@ +/* + LZ4 - Fast LZ compression algorithm + Copyright (C) 2011-present, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 homepage : http://www.lz4.org + - LZ4 source repository : https://github.com/lz4/lz4 +*/ + +/*-************************************ +* Tuning parameters +**************************************/ +/* + * LZ4_HEAPMODE : + * Select how default compression functions will allocate memory for their hash table, + * in memory stack (0:default, fastest), or in memory heap (1:requires malloc()). + */ +#ifndef LZ4_HEAPMODE +# define LZ4_HEAPMODE 0 +#endif + +/* + * LZ4_ACCELERATION_DEFAULT : + * Select "acceleration" for LZ4_compress_fast() when parameter value <= 0 + */ +#define LZ4_ACCELERATION_DEFAULT 1 +/* + * LZ4_ACCELERATION_MAX : + * Any "acceleration" value higher than this threshold + * get treated as LZ4_ACCELERATION_MAX instead (fix #876) + */ +#define LZ4_ACCELERATION_MAX 65537 + + +/*-************************************ +* CPU Feature Detection +**************************************/ +/* LZ4_FORCE_MEMORY_ACCESS + * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. + * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. + * The below switch allow to select different access method for improved performance. + * Method 0 (default) : use `memcpy()`. Safe and portable. + * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). + * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. + * Method 2 : direct access. This method is portable but violate C standard. + * It can generate buggy code on targets which assembly generation depends on alignment. + * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) + * See https://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details. 
+ * Prefer these methods in priority order (0 > 1 > 2) + */ +#ifndef LZ4_FORCE_MEMORY_ACCESS /* can be defined externally */ +# if defined(__GNUC__) && \ + ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) \ + || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) +# define LZ4_FORCE_MEMORY_ACCESS 2 +# elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || defined(__GNUC__) +# define LZ4_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +/* + * LZ4_FORCE_SW_BITCOUNT + * Define this parameter if your target system or compiler does not support hardware bit count + */ +#if defined(_MSC_VER) && defined(_WIN32_WCE) /* Visual Studio for WinCE doesn't support Hardware bit count */ +# undef LZ4_FORCE_SW_BITCOUNT /* avoid double def */ +# define LZ4_FORCE_SW_BITCOUNT +#endif + + + +/*-************************************ +* Dependency +**************************************/ +/* + * LZ4_SRC_INCLUDED: + * Amalgamation flag, whether lz4.c is included + */ +#ifndef LZ4_SRC_INCLUDED +# define LZ4_SRC_INCLUDED 1 +#endif + +#ifndef LZ4_STATIC_LINKING_ONLY +#define LZ4_STATIC_LINKING_ONLY +#endif + +#ifndef LZ4_DISABLE_DEPRECATE_WARNINGS +#define LZ4_DISABLE_DEPRECATE_WARNINGS /* due to LZ4_decompress_safe_withPrefix64k */ +#endif + +#define LZ4_STATIC_LINKING_ONLY /* LZ4_DISTANCE_MAX */ +#include "lz4.h" +/* see also "memory routines" below */ + + +/*-************************************ +* Compiler Options +**************************************/ +#if defined(_MSC_VER) && (_MSC_VER >= 1400) /* Visual Studio 2005+ */ +# include /* only present in VS2005+ */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +#endif /* _MSC_VER */ + +#ifndef LZ4_FORCE_INLINE +# ifdef _MSC_VER /* Visual Studio */ +# define LZ4_FORCE_INLINE static __forceinline +# else +# if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +# ifdef __GNUC__ +# define LZ4_FORCE_INLINE static inline __attribute__((always_inline)) +# else +# define LZ4_FORCE_INLINE static inline +# endif +# else +# define LZ4_FORCE_INLINE static +# endif /* __STDC_VERSION__ */ +# endif /* _MSC_VER */ +#endif /* LZ4_FORCE_INLINE */ + +/* LZ4_FORCE_O2 and LZ4_FORCE_INLINE + * gcc on ppc64le generates an unrolled SIMDized loop for LZ4_wildCopy8, + * together with a simple 8-byte copy loop as a fall-back path. + * However, this optimization hurts the decompression speed by >30%, + * because the execution does not go to the optimized loop + * for typical compressible data, and all of the preamble checks + * before going to the fall-back path become useless overhead. + * This optimization happens only with the -O3 flag, and -O2 generates + * a simple 8-byte copy loop. + * With gcc on ppc64le, all of the LZ4_decompress_* and LZ4_wildCopy8 + * functions are annotated with __attribute__((optimize("O2"))), + * and also LZ4_wildCopy8 is forcibly inlined, so that the O2 attribute + * of LZ4_wildCopy8 does not affect the compression speed. 
+ */
+#if defined(__PPC64__) && defined(__LITTLE_ENDIAN__) && defined(__GNUC__) && !defined(__clang__)
+# define LZ4_FORCE_O2 __attribute__((optimize("O2")))
+# undef LZ4_FORCE_INLINE
+# define LZ4_FORCE_INLINE static __inline __attribute__((optimize("O2"),always_inline))
+#else
+# define LZ4_FORCE_O2
+#endif
+
+#if (defined(__GNUC__) && (__GNUC__ >= 3)) || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || defined(__clang__)
+# define expect(expr,value)    (__builtin_expect ((expr),(value)) )
+#else
+# define expect(expr,value)    (expr)
+#endif
+
+#ifndef likely
+#define likely(expr)     expect((expr) != 0, 1)
+#endif
+#ifndef unlikely
+#define unlikely(expr)   expect((expr) != 0, 0)
+#endif
+
+/* Should the alignment test prove unreliable, for some reason,
+ * it can be disabled by setting LZ4_ALIGN_TEST to 0 */
+#ifndef LZ4_ALIGN_TEST  /* can be externally provided */
+# define LZ4_ALIGN_TEST 1
+#endif
+
+
+/*-************************************
+*  Memory routines
+**************************************/
+#ifdef LZ4_USER_MEMORY_FUNCTIONS
+/* memory management functions can be customized by user project.
+ * Below functions must exist somewhere in the Project
+ * and be available at link time */
+void* LZ4_malloc(size_t s);
+void* LZ4_calloc(size_t n, size_t s);
+void  LZ4_free(void* p);
+# define ALLOC(s)          LZ4_malloc(s)
+# define ALLOC_AND_ZERO(s) LZ4_calloc(1,s)
+# define FREEMEM(p)        LZ4_free(p)
+#else
+# include <stdlib.h>   /* malloc, calloc, free */
+# define ALLOC(s)          malloc(s)
+# define ALLOC_AND_ZERO(s) calloc(1,s)
+# define FREEMEM(p)        free(p)
+#endif
+
+#include <string.h>   /* memset, memcpy */
+#define MEM_INIT(p,v,s)   memset((p),(v),(s))
+
+
+/*-************************************
+*  Common Constants
+**************************************/
+#define MINMATCH 4
+
+#define WILDCOPYLENGTH 8
+#define LASTLITERALS   5   /* see ../doc/lz4_Block_format.md#parsing-restrictions */
+#define MFLIMIT       12   /* see ../doc/lz4_Block_format.md#parsing-restrictions */
+#define MATCH_SAFEGUARD_DISTANCE  ((2*WILDCOPYLENGTH) - MINMATCH)   /* ensure it's possible to write 2 x wildcopyLength without overflowing output buffer */
+#define FASTLOOP_SAFE_DISTANCE 64
+static const int LZ4_minLength = (MFLIMIT+1);
+
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define LZ4_DISTANCE_ABSOLUTE_MAX 65535
+#if (LZ4_DISTANCE_MAX > LZ4_DISTANCE_ABSOLUTE_MAX)   /* max supported by LZ4 format */
+# error "LZ4_DISTANCE_MAX is too big : must be <= 65535"
+#endif
+
+#define ML_BITS  4
+#define ML_MASK  ((1U<<ML_BITS)-1)
+#define RUN_BITS (8-ML_BITS)
+#define RUN_MASK ((1U<<RUN_BITS)-1)
+
+
+/*-************************************
+*  Error detection
+**************************************/
+#if defined(LZ4_DEBUG) && (LZ4_DEBUG>=1)
+# include <assert.h>
+#else
+# ifndef assert
+#  define assert(condition) ((void)0)
+# endif
+#endif
+
+#define LZ4_STATIC_ASSERT(c)   { enum { LZ4_static_assert = 1/(int)(!!(c)) }; }   /* use after variable declarations */
+
+#if defined(LZ4_DEBUG) && (LZ4_DEBUG>=2)
+# include <stdio.h>
+  static int g_debuglog_enable = 1;
+# define DEBUGLOG(l, ...) { \
+    if ((g_debuglog_enable) && (l<=LZ4_DEBUG)) { \
+      fprintf(stderr, __FILE__ ": "); \
+      fprintf(stderr, __VA_ARGS__); \
+      fprintf(stderr, " \n"); \
+  } }
+#else
+# define DEBUGLOG(l, ...)
{} /* disabled */ +#endif + +static int LZ4_isAligned(const void* ptr, size_t alignment) +{ + return ((size_t)ptr & (alignment -1)) == 0; +} + + +/*-************************************ +* Types +**************************************/ +#include +#if defined(__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# include + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; + typedef uintptr_t uptrval; +#else +# if UINT_MAX != 4294967295UL +# error "LZ4 code (when not C++ or C99) assumes that sizeof(int) == 4" +# endif + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; + typedef size_t uptrval; /* generally true, except OpenVMS-64 */ +#endif + +#if defined(__x86_64__) + typedef U64 reg_t; /* 64-bits in x32 mode */ +#else + typedef size_t reg_t; /* 32-bits in x32 mode */ +#endif + +typedef enum { + notLimited = 0, + limitedOutput = 1, + fillOutput = 2 +} limitedOutput_directive; + + +/*-************************************ +* Reading and writing into memory +**************************************/ + +/** + * LZ4 relies on memcpy with a constant size being inlined. In freestanding + * environments, the compiler can't assume the implementation of memcpy() is + * standard compliant, so it can't apply its specialized memcpy() inlining + * logic. When possible, use __builtin_memcpy() to tell the compiler to analyze + * memcpy() as if it were standard compliant, so it can inline it in freestanding + * environments. This is needed when decompressing the Linux Kernel, for example. + */ +#if defined(__GNUC__) && (__GNUC__ >= 4) +#define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size) +#else +#define LZ4_memcpy(dst, src, size) memcpy(dst, src, size) +#endif + +static unsigned LZ4_isLittleEndian(void) +{ + const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ + return one.c[0]; +} + + +#if defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==2) +/* lie to the compiler about data alignment; use with caution */ + +static U16 LZ4_read16(const void* memPtr) { return *(const U16*) memPtr; } +static U32 LZ4_read32(const void* memPtr) { return *(const U32*) memPtr; } +static reg_t LZ4_read_ARCH(const void* memPtr) { return *(const reg_t*) memPtr; } + +static void LZ4_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; } +static void LZ4_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; } + +#elif defined(LZ4_FORCE_MEMORY_ACCESS) && (LZ4_FORCE_MEMORY_ACCESS==1) + +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +typedef union { U16 u16; U32 u32; reg_t uArch; } __attribute__((packed)) unalign; + +static U16 LZ4_read16(const void* ptr) { return ((const unalign*)ptr)->u16; } +static U32 LZ4_read32(const void* ptr) { return ((const unalign*)ptr)->u32; } +static reg_t LZ4_read_ARCH(const void* ptr) { return ((const unalign*)ptr)->uArch; } + +static void LZ4_write16(void* memPtr, U16 value) { ((unalign*)memPtr)->u16 = value; } +static void LZ4_write32(void* memPtr, U32 value) { ((unalign*)memPtr)->u32 = value; } + +#else /* safe and portable access using memcpy() */ + +static U16 LZ4_read16(const void* memPtr) +{ + U16 val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val; +} + +static U32 LZ4_read32(const void* memPtr) +{ + U32 val; 
+  LZ4_memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+static reg_t LZ4_read_ARCH(const void* memPtr)
+{
+  reg_t val; LZ4_memcpy(&val, memPtr, sizeof(val)); return val;
+}
+
+static void LZ4_write16(void* memPtr, U16 value)
+{
+  LZ4_memcpy(memPtr, &value, sizeof(value));
+}
+
+static void LZ4_write32(void* memPtr, U32 value)
+{
+  LZ4_memcpy(memPtr, &value, sizeof(value));
+}
+
+#endif /* LZ4_FORCE_MEMORY_ACCESS */
+
+
+static U16 LZ4_readLE16(const void* memPtr)
+{
+  if (LZ4_isLittleEndian()) {
+    return LZ4_read16(memPtr);
+  } else {
+    const BYTE* p = (const BYTE*)memPtr;
+    return (U16)((U16)p[0] + (p[1]<<8));
+  }
+}
+
+static void LZ4_writeLE16(void* memPtr, U16 value)
+{
+  if (LZ4_isLittleEndian()) {
+    LZ4_write16(memPtr, value);
+  } else {
+    BYTE* p = (BYTE*)memPtr;
+    p[0] = (BYTE) value;
+    p[1] = (BYTE)(value>>8);
+  }
+}
+
+/* customized variant of memcpy, which can overwrite up to 8 bytes beyond dstEnd */
+LZ4_FORCE_INLINE
+void LZ4_wildCopy8(void* dstPtr, const void* srcPtr, void* dstEnd)
+{
+  BYTE* d = (BYTE*)dstPtr;
+  const BYTE* s = (const BYTE*)srcPtr;
+  BYTE* const e = (BYTE*)dstEnd;
+
+  do { LZ4_memcpy(d,s,8); d+=8; s+=8; } while (d<e);
+}
+
+static const unsigned inc32table[8] = {0, 1, 2,  1,  0,  4, 4, 4};
+static const int      dec64table[8] = {0, 0, 0, -1, -4,  1, 2, 3};
+
+
+#ifndef LZ4_FAST_DEC_LOOP
+# if defined __i386__ || defined _M_IX86 || defined __x86_64__ || defined _M_X64
+#  define LZ4_FAST_DEC_LOOP 1
+# elif defined(__aarch64__) && !defined(__clang__)
+   /* On aarch64, we disable this optimization for clang because on certain
+    * mobile chipsets, performance is reduced with clang. For more information
+    * refer to https://github.com/lz4/lz4/pull/707 */
+#  define LZ4_FAST_DEC_LOOP 1
+# else
+#  define LZ4_FAST_DEC_LOOP 0
+# endif
+#endif
+
+#if LZ4_FAST_DEC_LOOP
+
+LZ4_FORCE_INLINE void
+LZ4_memcpy_using_offset_base(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, const size_t offset)
+{
+  assert(srcPtr + offset == dstPtr);
+  if (offset < 8) {
+    LZ4_write32(dstPtr, 0);   /* silence an msan warning when offset==0 */
+    dstPtr[0] = srcPtr[0];
+    dstPtr[1] = srcPtr[1];
+    dstPtr[2] = srcPtr[2];
+    dstPtr[3] = srcPtr[3];
+    srcPtr += inc32table[offset];
+    LZ4_memcpy(dstPtr+4, srcPtr, 4);
+    srcPtr -= dec64table[offset];
+    dstPtr += 8;
+  } else {
+    LZ4_memcpy(dstPtr, srcPtr, 8);
+    dstPtr += 8;
+    srcPtr += 8;
+  }
+
+  while (dstPtr < dstEnd) { LZ4_memcpy(dstPtr, srcPtr, 8); dstPtr += 8; srcPtr += 8; }
+}
+
+/* customized variant of memcpy, which can overwrite up to 32 bytes beyond dstEnd
+ * this version copies two times 16 bytes (instead of one time 32 bytes)
+ * because it must be compatible with offsets >= 16. */
+LZ4_FORCE_INLINE void
+LZ4_wildCopy32(void* dstPtr, const void* srcPtr, void* dstEnd)
+{
+  BYTE* d = (BYTE*)dstPtr;
+  const BYTE* s = (const BYTE*)srcPtr;
+  BYTE* const e = (BYTE*)dstEnd;
+
+  do { LZ4_memcpy(d,s,16); LZ4_memcpy(d+16,s+16,16); d+=32; s+=32; } while (d<e);
+}
+
+/* LZ4_memcpy_using_offset()  presumes :
+ * - dstEnd >= dstPtr + MINMATCH
+ * - there is at least 8 bytes available to write after dstEnd */
+LZ4_FORCE_INLINE void
+LZ4_memcpy_using_offset(BYTE* dstPtr, const BYTE* srcPtr, BYTE* dstEnd, const size_t offset)
+{
+  BYTE v[8];
+
+  assert(dstEnd >= dstPtr + MINMATCH);
+
+  switch(offset) {
+  case 1:
+    MEM_INIT(v, *srcPtr, 8);
+    break;
+  case 2:
+    LZ4_memcpy(v, srcPtr, 2);
+    LZ4_memcpy(&v[2], srcPtr, 2);
+    LZ4_memcpy(&v[4], v, 4);
+    break;
+  case 4:
+    LZ4_memcpy(v, srcPtr, 4);
+    LZ4_memcpy(&v[4], srcPtr, 4);
+    break;
+  default:
+    LZ4_memcpy_using_offset_base(dstPtr, srcPtr, dstEnd, offset);
+    return;
+  }
+
+  LZ4_memcpy(dstPtr, v, 8);
+  dstPtr += 8;
+  while (dstPtr < dstEnd) {
+    LZ4_memcpy(dstPtr, v, 8);
+    dstPtr += 8;
+  }
+}
+#endif
+
+
+/*-************************************
+*  Common functions
+**************************************/
+static unsigned LZ4_NbCommonBytes (reg_t val)
+{
+  assert(val != 0);
+  if (LZ4_isLittleEndian()) {
+    if (sizeof(val) == 8) {
+# if defined(_MSC_VER) && (_MSC_VER >= 1800) && defined(_M_AMD64) && !defined(LZ4_FORCE_SW_BITCOUNT)
+      /* x64 CPUS without BMI support interpret `TZCNT` as `REP BSF` */
+      return (unsigned)_tzcnt_u64(val) >> 3;
+# elif defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT)
+      unsigned long r = 0;
+      _BitScanForward64(&r, (U64)val);
+      return (unsigned)r >> 3;
+# elif (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \
+          ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \
+          !defined(LZ4_FORCE_SW_BITCOUNT)
+      return (unsigned)__builtin_ctzll((U64)val) >> 3;
+# else
+      const U64 m = 0x0101010101010101ULL;
+      val ^= val - 1;
+      return (unsigned)(((U64)((val & (m - 1)) * m)) >> 56);
+# endif
+    } else /* 32 bits */ {
+# if defined(_MSC_VER) && (_MSC_VER >= 1400) && !defined(LZ4_FORCE_SW_BITCOUNT)
+      unsigned long r;
+      _BitScanForward(&r, (U32)val);
+      return (unsigned)r >> 3;
+# elif (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \
+          ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \
+          !defined(__TINYC__) && !defined(LZ4_FORCE_SW_BITCOUNT)
+      return (unsigned)__builtin_ctz((U32)val) >> 3;
+# else
+      const U32 m = 0x01010101;
+      return (unsigned)((((val - 1) ^ val) & (m - 1)) * m) >> 24;
+# endif
+    }
+  } else   /* Big Endian CPU */ {
+    if (sizeof(val)==8) {
+# if (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \
+          ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \
+          !defined(__TINYC__) && !defined(LZ4_FORCE_SW_BITCOUNT)
+      return (unsigned)__builtin_clzll((U64)val) >> 3;
+# else
+#if 1
+      /* this method is probably faster,
+       * but adds a 128 bytes lookup table */
+      static const unsigned char ctz7_tab[128] = {
+        7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+        4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+        4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+        6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+        4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+        5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+        4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
+      };
+      U64 const mask = 0x0101010101010101ULL;
+      U64 const t = (((val >> 8) - mask) | val) & mask;
+      return ctz7_tab[(t * 0x0080402010080402ULL) >> 57];
+#else
+      /* this method doesn't consume memory space like the previous one,
+       * but it contains several branches,
+       * that may end up slowing execution */
+      static const U32 by32 = sizeof(val)*4;  /* 32 on 64 bits (goal), 16 on 32 bits.
+        Just to avoid some static analyzer complaining about shift by 32 on 32-bits target.
+        Note that this code path is never triggered in 32-bits mode. */
+      unsigned r;
+      if (!(val>>by32)) { r=4; } else { r=0; val>>=by32; }
+      if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
+      r += (!val);
+      return r;
+#endif
+# endif
+    } else /* 32 bits */ {
+# if (defined(__clang__) || (defined(__GNUC__) && ((__GNUC__ > 3) || \
+          ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))))) && \
+          !defined(LZ4_FORCE_SW_BITCOUNT)
+      return (unsigned)__builtin_clz((U32)val) >> 3;
+# else
+      val >>= 8;
+      val = ((((val + 0x00FFFF00) | 0x00FFFFFF) + val) |
+             (val + 0x00FF0000)) >> 24;
+      return (unsigned)val ^ 3;
+# endif
+    }
+  }
+}
+
+
+#define STEPSIZE sizeof(reg_t)
+LZ4_FORCE_INLINE
+unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit)
+{
+  const BYTE* const pStart = pIn;
+
+  if (likely(pIn < pInLimit-(STEPSIZE-1))) {
+    reg_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn);
+    if (!diff) {
+      pIn+=STEPSIZE; pMatch+=STEPSIZE;
+    } else {
+      return LZ4_NbCommonBytes(diff);
+  } }
+
+  while (likely(pIn < pInLimit-(STEPSIZE-1))) {
+    reg_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn);
+    if (!diff) { pIn+=STEPSIZE; pMatch+=STEPSIZE; continue; }
+    pIn += LZ4_NbCommonBytes(diff);
+    return (unsigned)(pIn - pStart);
+  }
+
+  if ((STEPSIZE==8) && (pIn<(pInLimit-3)) && (LZ4_read32(pMatch) == LZ4_read32(pIn))) { pIn+=4; pMatch+=4; }
+  if ((pIn<(pInLimit-1)) && (LZ4_read16(pMatch) == LZ4_read16(pIn))) { pIn+=2; pMatch+=2; }
+  if ((pIn<pInLimit) && (*pMatch == *pIn)) pIn++;
+  return (unsigned)(pIn - pStart);
+}
+
+
+#ifndef LZ4_COMMONDEFS_ONLY
+/*-************************************
+*  Local Constants
+**************************************/
+static const int LZ4_64Klimit = ((64 KB) + (MFLIMIT-1));
+static const U32 LZ4_skipTrigger = 6;  /* Increase this value ==> compression run slower on incompressible data */
+
+
+/*-************************************
+*  Local Structures and types
+**************************************/
+typedef enum { clearedTable = 0, byPtr, byU32, byU16 } tableType_t;
+
+/**
+ * This enum distinguishes several different modes of accessing previous
+ * content in the stream.
+ *
+ * - noDict        : There is no preceding content.
+ * - withPrefix64k : Table entries up to ctx->dictSize before the current blob
+ *                   blob being compressed are valid and refer to the preceding
+ *                   content (of length ctx->dictSize), which is available
+ *                   contiguously preceding in memory the content currently
+ *                   being compressed.
+ * - usingExtDict : Like withPrefix64k, but the preceding content is somewhere + * else in memory, starting at ctx->dictionary with length + * ctx->dictSize. + * - usingDictCtx : Like usingExtDict, but everything concerning the preceding + * content is in a separate context, pointed to by + * ctx->dictCtx. ctx->dictionary, ctx->dictSize, and table + * entries in the current context that refer to positions + * preceding the beginning of the current compression are + * ignored. Instead, ctx->dictCtx->dictionary and ctx->dictCtx + * ->dictSize describe the location and size of the preceding + * content, and matches are found by looking in the ctx + * ->dictCtx->hashTable. + */ +typedef enum { noDict = 0, withPrefix64k, usingExtDict, usingDictCtx } dict_directive; +typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive; + + +/*-************************************ +* Local Utils +**************************************/ +int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; } +const char* LZ4_versionString(void) { return LZ4_VERSION_STRING; } +int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } +int LZ4_sizeofState(void) { return LZ4_STREAMSIZE; } + + +/*-************************************ +* Internal Definitions used in Tests +**************************************/ +#if defined (__cplusplus) +extern "C" { +#endif + +int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int srcSize); + +int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, + int compressedSize, int maxOutputSize, + const void* dictStart, size_t dictSize); + +#if defined (__cplusplus) +} +#endif + +/*-****************************** +* Compression functions +********************************/ +LZ4_FORCE_INLINE U32 LZ4_hash4(U32 sequence, tableType_t const tableType) +{ + if (tableType == byU16) + return ((sequence * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1))); + else + return ((sequence * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG)); +} + +LZ4_FORCE_INLINE U32 LZ4_hash5(U64 sequence, tableType_t const tableType) +{ + const U32 hashLog = (tableType == byU16) ? LZ4_HASHLOG+1 : LZ4_HASHLOG; + if (LZ4_isLittleEndian()) { + const U64 prime5bytes = 889523592379ULL; + return (U32)(((sequence << 24) * prime5bytes) >> (64 - hashLog)); + } else { + const U64 prime8bytes = 11400714785074694791ULL; + return (U32)(((sequence >> 24) * prime8bytes) >> (64 - hashLog)); + } +} + +LZ4_FORCE_INLINE U32 LZ4_hashPosition(const void* const p, tableType_t const tableType) +{ + if ((sizeof(reg_t)==8) && (tableType != byU16)) return LZ4_hash5(LZ4_read_ARCH(p), tableType); + return LZ4_hash4(LZ4_read32(p), tableType); +} + +LZ4_FORCE_INLINE void LZ4_clearHash(U32 h, void* tableBase, tableType_t const tableType) +{ + switch (tableType) + { + default: /* fallthrough */ + case clearedTable: { /* illegal! */ assert(0); return; } + case byPtr: { const BYTE** hashTable = (const BYTE**)tableBase; hashTable[h] = NULL; return; } + case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = 0; return; } + case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = 0; return; } + } +} + +LZ4_FORCE_INLINE void LZ4_putIndexOnHash(U32 idx, U32 h, void* tableBase, tableType_t const tableType) +{ + switch (tableType) + { + default: /* fallthrough */ + case clearedTable: /* fallthrough */ + case byPtr: { /* illegal! 
*/ assert(0); return; } + case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = idx; return; } + case byU16: { U16* hashTable = (U16*) tableBase; assert(idx < 65536); hashTable[h] = (U16)idx; return; } + } +} + +LZ4_FORCE_INLINE void LZ4_putPositionOnHash(const BYTE* p, U32 h, + void* tableBase, tableType_t const tableType, + const BYTE* srcBase) +{ + switch (tableType) + { + case clearedTable: { /* illegal! */ assert(0); return; } + case byPtr: { const BYTE** hashTable = (const BYTE**)tableBase; hashTable[h] = p; return; } + case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); return; } + case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); return; } + } +} + +LZ4_FORCE_INLINE void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + U32 const h = LZ4_hashPosition(p, tableType); + LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase); +} + +/* LZ4_getIndexOnHash() : + * Index of match position registered in hash table. + * hash position must be calculated by using base+index, or dictBase+index. + * Assumption 1 : only valid if tableType == byU32 or byU16. + * Assumption 2 : h is presumed valid (within limits of hash table) + */ +LZ4_FORCE_INLINE U32 LZ4_getIndexOnHash(U32 h, const void* tableBase, tableType_t tableType) +{ + LZ4_STATIC_ASSERT(LZ4_MEMORY_USAGE > 2); + if (tableType == byU32) { + const U32* const hashTable = (const U32*) tableBase; + assert(h < (1U << (LZ4_MEMORY_USAGE-2))); + return hashTable[h]; + } + if (tableType == byU16) { + const U16* const hashTable = (const U16*) tableBase; + assert(h < (1U << (LZ4_MEMORY_USAGE-1))); + return hashTable[h]; + } + assert(0); return 0; /* forbidden case */ +} + +static const BYTE* LZ4_getPositionOnHash(U32 h, const void* tableBase, tableType_t tableType, const BYTE* srcBase) +{ + if (tableType == byPtr) { const BYTE* const* hashTable = (const BYTE* const*) tableBase; return hashTable[h]; } + if (tableType == byU32) { const U32* const hashTable = (const U32*) tableBase; return hashTable[h] + srcBase; } + { const U16* const hashTable = (const U16*) tableBase; return hashTable[h] + srcBase; } /* default, to ensure a return */ +} + +LZ4_FORCE_INLINE const BYTE* +LZ4_getPosition(const BYTE* p, + const void* tableBase, tableType_t tableType, + const BYTE* srcBase) +{ + U32 const h = LZ4_hashPosition(p, tableType); + return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase); +} + +LZ4_FORCE_INLINE void +LZ4_prepareTable(LZ4_stream_t_internal* const cctx, + const int inputSize, + const tableType_t tableType) { + /* If the table hasn't been used, it's guaranteed to be zeroed out, and is + * therefore safe to use no matter what mode we're in. Otherwise, we figure + * out if it's safe to leave as is or whether it needs to be reset. 
+ */ + if ((tableType_t)cctx->tableType != clearedTable) { + assert(inputSize >= 0); + if ((tableType_t)cctx->tableType != tableType + || ((tableType == byU16) && cctx->currentOffset + (unsigned)inputSize >= 0xFFFFU) + || ((tableType == byU32) && cctx->currentOffset > 1 GB) + || tableType == byPtr + || inputSize >= 4 KB) + { + DEBUGLOG(4, "LZ4_prepareTable: Resetting table in %p", cctx); + MEM_INIT(cctx->hashTable, 0, LZ4_HASHTABLESIZE); + cctx->currentOffset = 0; + cctx->tableType = (U32)clearedTable; + } else { + DEBUGLOG(4, "LZ4_prepareTable: Re-use hash table (no reset)"); + } + } + + /* Adding a gap, so all previous entries are > LZ4_DISTANCE_MAX back, is faster + * than compressing without a gap. However, compressing with + * currentOffset == 0 is faster still, so we preserve that case. + */ + if (cctx->currentOffset != 0 && tableType == byU32) { + DEBUGLOG(5, "LZ4_prepareTable: adding 64KB to currentOffset"); + cctx->currentOffset += 64 KB; + } + + /* Finally, clear history */ + cctx->dictCtx = NULL; + cctx->dictionary = NULL; + cctx->dictSize = 0; +} + +/** LZ4_compress_generic() : + * inlined, to ensure branches are decided at compilation time. + * Presumed already validated at this stage: + * - source != NULL + * - inputSize > 0 + */ +LZ4_FORCE_INLINE int LZ4_compress_generic_validated( + LZ4_stream_t_internal* const cctx, + const char* const source, + char* const dest, + const int inputSize, + int *inputConsumed, /* only written when outputDirective == fillOutput */ + const int maxOutputSize, + const limitedOutput_directive outputDirective, + const tableType_t tableType, + const dict_directive dictDirective, + const dictIssue_directive dictIssue, + const int acceleration) +{ + int result; + const BYTE* ip = (const BYTE*) source; + + U32 const startIndex = cctx->currentOffset; + const BYTE* base = (const BYTE*) source - startIndex; + const BYTE* lowLimit; + + const LZ4_stream_t_internal* dictCtx = (const LZ4_stream_t_internal*) cctx->dictCtx; + const BYTE* const dictionary = + dictDirective == usingDictCtx ? dictCtx->dictionary : cctx->dictionary; + const U32 dictSize = + dictDirective == usingDictCtx ? dictCtx->dictSize : cctx->dictSize; + const U32 dictDelta = (dictDirective == usingDictCtx) ? startIndex - dictCtx->currentOffset : 0; /* make indexes in dictCtx comparable with index in current context */ + + int const maybe_extMem = (dictDirective == usingExtDict) || (dictDirective == usingDictCtx); + U32 const prefixIdxLimit = startIndex - dictSize; /* used when dictDirective == dictSmall */ + const BYTE* const dictEnd = dictionary ? dictionary + dictSize : dictionary; + const BYTE* anchor = (const BYTE*) source; + const BYTE* const iend = ip + inputSize; + const BYTE* const mflimitPlusOne = iend - MFLIMIT + 1; + const BYTE* const matchlimit = iend - LASTLITERALS; + + /* the dictCtx currentOffset is indexed on the start of the dictionary, + * while a dictionary in the current context precedes the currentOffset */ + const BYTE* dictBase = !dictionary ? NULL : (dictDirective == usingDictCtx) ? 
+ dictionary + dictSize - dictCtx->currentOffset :
+ dictionary + dictSize - startIndex;
+
+ BYTE* op = (BYTE*) dest;
+ BYTE* const olimit = op + maxOutputSize;
+
+ U32 offset = 0;
+ U32 forwardH;
+
+ DEBUGLOG(5, "LZ4_compress_generic_validated: srcSize=%i, tableType=%u", inputSize, tableType);
+ assert(ip != NULL);
+ /* If init conditions are not met, we don't have to mark stream
+ * as having dirty context, since no action was taken yet */
+ if (outputDirective == fillOutput && maxOutputSize < 1) { return 0; } /* Impossible to store anything */
+ if ((tableType == byU16) && (inputSize>=LZ4_64Klimit)) { return 0; } /* Size too large (not within 64K limit) */
+ if (tableType==byPtr) assert(dictDirective==noDict); /* only supported use case with byPtr */
+ assert(acceleration >= 1);
+
+ lowLimit = (const BYTE*)source - (dictDirective == withPrefix64k ? dictSize : 0);
+
+ /* Update context state */
+ if (dictDirective == usingDictCtx) {
+ /* Subsequent linked blocks can't use the dictionary. */
+ /* Instead, they use the block we just compressed. */
+ cctx->dictCtx = NULL;
+ cctx->dictSize = (U32)inputSize;
+ } else {
+ cctx->dictSize += (U32)inputSize;
+ }
+ cctx->currentOffset += (U32)inputSize;
+ cctx->tableType = (U32)tableType;
+
+ if (inputSize<LZ4_minLength) goto _last_literals; /* Input too small, no compression (all literals) */
+
+ /* First Byte */
+ LZ4_putPosition(ip, cctx->hashTable, tableType, base);
+ ip++; forwardH = LZ4_hashPosition(ip, tableType);
+
+ /* Main Loop */
+ for ( ; ; ) {
+ const BYTE* match;
+ BYTE* token;
+ const BYTE* filledIp;
+
+ /* Find a match */
+ if (tableType == byPtr) {
+ const BYTE* forwardIp = ip;
+ int step = 1;
+ int searchMatchNb = acceleration << LZ4_skipTrigger;
+ do {
+ U32 const h = forwardH;
+ ip = forwardIp;
+ forwardIp += step;
+ step = (searchMatchNb++ >> LZ4_skipTrigger);
+
+ if (unlikely(forwardIp > mflimitPlusOne)) goto _last_literals;
+ assert(ip < mflimitPlusOne);
+
+ match = LZ4_getPositionOnHash(h, cctx->hashTable, tableType, base);
+ forwardH = LZ4_hashPosition(forwardIp, tableType);
+ LZ4_putPositionOnHash(ip, h, cctx->hashTable, tableType, base);
+
+ } while ( (match+LZ4_DISTANCE_MAX < ip)
+ || (LZ4_read32(match) != LZ4_read32(ip)) );
+
+ } else { /* byU32, byU16 */
+
+ const BYTE* forwardIp = ip;
+ int step = 1;
+ int searchMatchNb = acceleration << LZ4_skipTrigger;
+ do {
+ U32 const h = forwardH;
+ U32 const current = (U32)(forwardIp - base);
+ U32 matchIndex = LZ4_getIndexOnHash(h, cctx->hashTable, tableType);
+ assert(matchIndex <= current);
+ assert(forwardIp - base < (ptrdiff_t)(2 GB - 1));
+ ip = forwardIp;
+ forwardIp += step;
+ step = (searchMatchNb++ >> LZ4_skipTrigger);
+
+ if (unlikely(forwardIp > mflimitPlusOne)) goto _last_literals;
+ assert(ip < mflimitPlusOne);
+
+ if (dictDirective == usingDictCtx) {
+ if (matchIndex < startIndex) {
+ /* there was no match, try the dictionary */
+ assert(tableType == byU32);
+ matchIndex = LZ4_getIndexOnHash(h, dictCtx->hashTable, byU32);
+ match = dictBase + matchIndex;
+ matchIndex += dictDelta; /* make dictCtx index comparable with current context */
+ lowLimit = dictionary;
+ } else {
+ match = base + matchIndex;
+ lowLimit = (const BYTE*)source;
+ }
+ } else if (dictDirective==usingExtDict) {
+ if (matchIndex < startIndex) {
+ DEBUGLOG(7, "extDict candidate: matchIndex=%5u < startIndex=%5u", matchIndex, startIndex);
+ assert(startIndex - matchIndex >= MINMATCH);
+ match = dictBase + matchIndex;
+ lowLimit = dictionary;
+ } else {
+ match = base + matchIndex;
+ lowLimit = (const BYTE*)source;
+ }
+ } else { /* single continuous memory segment */
+ match = base + matchIndex;
+ }
+ forwardH = LZ4_hashPosition(forwardIp, tableType);
+ LZ4_putIndexOnHash(current, h, cctx->hashTable, tableType);
+
+ DEBUGLOG(7, "candidate at pos=%u (offset=%u \n", matchIndex, current - matchIndex);
+ if ((dictIssue == dictSmall) && (matchIndex < prefixIdxLimit)) { continue; } /* match outside of valid area */
+ assert(matchIndex < current);
+ if ( ((tableType != byU16) || (LZ4_DISTANCE_MAX < LZ4_DISTANCE_ABSOLUTE_MAX))
+ && (matchIndex+LZ4_DISTANCE_MAX < current)) {
+ continue;
+ } /* too far */
+ assert((current - matchIndex) <= LZ4_DISTANCE_MAX); /* match now expected within distance */
+
+ if (LZ4_read32(match) == LZ4_read32(ip)) {
+ if (maybe_extMem) offset = current - matchIndex;
+ break; /* match found */
+ }
+
+ } while(1);
+ }
+
+ /* Catch up */
+ filledIp = ip;
+ while (((ip>anchor) & (match > lowLimit)) && (unlikely(ip[-1]==match[-1]))) { ip--; match--; }
+
+ /* Encode Literals */
+ { unsigned const litLength = (unsigned)(ip - anchor);
+ token = op++;
+ if ((outputDirective == limitedOutput) && /* Check output buffer overflow */
+ (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + (litLength/255) > olimit)) ) {
+ return 0; /* cannot compress within `dst` budget. Stored indexes in hash table are nonetheless fine */
+ }
+ if ((outputDirective == fillOutput) &&
+ (unlikely(op + (litLength+240)/255 /* litlen */ + litLength /* literals */ + 2 /* offset */ + 1 /* token */ + MFLIMIT - MINMATCH /* min last literals so last match is <= end - MFLIMIT */ > olimit))) {
+ op--;
+ goto _last_literals;
+ }
+ if (litLength >= RUN_MASK) {
+ int len = (int)(litLength - RUN_MASK);
+ *token = (RUN_MASK<<ML_BITS);
+ for(; len >= 255 ; len-=255) *op++ = 255;
+ *op++ = (BYTE)len;
+ }
+ else *token = (BYTE)(litLength<<ML_BITS);
+
+ /* Copy Literals */
+ LZ4_wildCopy8(op, anchor, op+litLength);
+ op+=litLength;
+ DEBUGLOG(6, "seq.start:%i, literals=%u, match.start:%i",
+ (int)(anchor-(const BYTE*)source), litLength, (int)(ip-(const BYTE*)source));
+ }
+
+_next_match:
+ /* at this stage, the following variables must be correctly set :
+ * - ip : at start of LZ match
+ * - match : at start of previous pattern occurrence; can be within current prefix, or within extDict
+ * - offset : if maybe_extMem==1 (constant)
+ * - lowLimit : must be == dictionary to mean "match is within extDict"; must be == source otherwise
+ * - token and *token : position to write 4-bits for match length; higher 4-bits for literal length supposed already written
+ */
+
+ if ((outputDirective == fillOutput) &&
+ (op + 2 /* offset */ + 1 /* token */ + MFLIMIT - MINMATCH /* min last literals so last match is <= end - MFLIMIT */ > olimit)) {
+ /* the match was too close to the end, rewind and go to last literals */
+ op = token;
+ goto _last_literals;
+ }
+
+ /* Encode Offset */
+ if (maybe_extMem) { /* static test */
+ DEBUGLOG(6, " with offset=%u (ext if > %i)", offset, (int)(ip - (const BYTE*)source));
+ assert(offset <= LZ4_DISTANCE_MAX && offset > 0);
+ LZ4_writeLE16(op, (U16)offset); op+=2;
+ } else {
+ DEBUGLOG(6, " with offset=%u (same segment)", (U32)(ip - match));
+ assert(ip-match <= LZ4_DISTANCE_MAX);
+ LZ4_writeLE16(op, (U16)(ip - match)); op+=2;
+ }
+
+ /* Encode MatchLength */
+ { unsigned matchCode;
+
+ if ( (dictDirective==usingExtDict || dictDirective==usingDictCtx)
+ && (lowLimit==dictionary) /* match within extDict */ ) {
+ const BYTE* limit = ip + (dictEnd-match);
+ assert(dictEnd > match);
+ if (limit > matchlimit) limit = matchlimit;
+ matchCode = LZ4_count(ip+MINMATCH, match+MINMATCH, limit);
+ ip += (size_t)matchCode + MINMATCH;
+ if (ip==limit) {
+ unsigned const more = LZ4_count(limit, (const BYTE*)source, matchlimit);
+ matchCode += more;
+ ip += more;
+ }
+ DEBUGLOG(6, " with matchLength=%u starting in extDict", matchCode+MINMATCH);
+ } else {
+ matchCode = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit);
+ ip += (size_t)matchCode + MINMATCH;
+ DEBUGLOG(6, " with matchLength=%u", matchCode+MINMATCH);
+ }
+
+ if ((outputDirective) && /* Check output buffer overflow */
+ (unlikely(op + (1 + LASTLITERALS) + (matchCode+240)/255 > olimit)) ) {
+ if (outputDirective == fillOutput) {
+ /* Match description too long : reduce it */
+ U32 newMatchCode = 15 /* in token */ - 1 /* to avoid needing a zero byte */ + ((U32)(olimit - op) - 1 - LASTLITERALS) * 255;
+ ip -= matchCode - newMatchCode;
+ assert(newMatchCode < matchCode);
+ matchCode = newMatchCode;
+ if (unlikely(ip <= filledIp)) {
+ /* We have already filled up to
filledIp so if ip ends up less than filledIp + * we have positions in the hash table beyond the current position. This is + * a problem if we reuse the hash table. So we have to remove these positions + * from the hash table. + */ + const BYTE* ptr; + DEBUGLOG(5, "Clearing %u positions", (U32)(filledIp - ip)); + for (ptr = ip; ptr <= filledIp; ++ptr) { + U32 const h = LZ4_hashPosition(ptr, tableType); + LZ4_clearHash(h, cctx->hashTable, tableType); + } + } + } else { + assert(outputDirective == limitedOutput); + return 0; /* cannot compress within `dst` budget. Stored indexes in hash table are nonetheless fine */ + } + } + if (matchCode >= ML_MASK) { + *token += ML_MASK; + matchCode -= ML_MASK; + LZ4_write32(op, 0xFFFFFFFF); + while (matchCode >= 4*255) { + op+=4; + LZ4_write32(op, 0xFFFFFFFF); + matchCode -= 4*255; + } + op += matchCode / 255; + *op++ = (BYTE)(matchCode % 255); + } else + *token += (BYTE)(matchCode); + } + /* Ensure we have enough space for the last literals. */ + assert(!(outputDirective == fillOutput && op + 1 + LASTLITERALS > olimit)); + + anchor = ip; + + /* Test end of chunk */ + if (ip >= mflimitPlusOne) break; + + /* Fill table */ + LZ4_putPosition(ip-2, cctx->hashTable, tableType, base); + + /* Test next position */ + if (tableType == byPtr) { + + match = LZ4_getPosition(ip, cctx->hashTable, tableType, base); + LZ4_putPosition(ip, cctx->hashTable, tableType, base); + if ( (match+LZ4_DISTANCE_MAX >= ip) + && (LZ4_read32(match) == LZ4_read32(ip)) ) + { token=op++; *token=0; goto _next_match; } + + } else { /* byU32, byU16 */ + + U32 const h = LZ4_hashPosition(ip, tableType); + U32 const current = (U32)(ip-base); + U32 matchIndex = LZ4_getIndexOnHash(h, cctx->hashTable, tableType); + assert(matchIndex < current); + if (dictDirective == usingDictCtx) { + if (matchIndex < startIndex) { + /* there was no match, try the dictionary */ + matchIndex = LZ4_getIndexOnHash(h, dictCtx->hashTable, byU32); + match = dictBase + matchIndex; + lowLimit = dictionary; /* required for match length counter */ + matchIndex += dictDelta; + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; /* required for match length counter */ + } + } else if (dictDirective==usingExtDict) { + if (matchIndex < startIndex) { + match = dictBase + matchIndex; + lowLimit = dictionary; /* required for match length counter */ + } else { + match = base + matchIndex; + lowLimit = (const BYTE*)source; /* required for match length counter */ + } + } else { /* single memory segment */ + match = base + matchIndex; + } + LZ4_putIndexOnHash(current, h, cctx->hashTable, tableType); + assert(matchIndex < current); + if ( ((dictIssue==dictSmall) ? (matchIndex >= prefixIdxLimit) : 1) + && (((tableType==byU16) && (LZ4_DISTANCE_MAX == LZ4_DISTANCE_ABSOLUTE_MAX)) ? 
1 : (matchIndex+LZ4_DISTANCE_MAX >= current))
+ && (LZ4_read32(match) == LZ4_read32(ip)) ) {
+ token=op++;
+ *token=0;
+ if (maybe_extMem) offset = current - matchIndex;
+ DEBUGLOG(6, "seq.start:%i, literals=%u, match.start:%i",
+ (int)(anchor-(const BYTE*)source), 0, (int)(ip-(const BYTE*)source));
+ goto _next_match;
+ }
+ }
+
+ /* Prepare next loop */
+ forwardH = LZ4_hashPosition(++ip, tableType);
+
+ }
+
+_last_literals:
+ /* Encode Last Literals */
+ { size_t lastRun = (size_t)(iend - anchor);
+ if ( (outputDirective) && /* Check output buffer overflow */
+ (op + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > olimit)) {
+ if (outputDirective == fillOutput) {
+ /* adapt lastRun to fill 'dst' */
+ assert(olimit >= op);
+ lastRun = (size_t)(olimit-op) - 1/*token*/;
+ lastRun -= (lastRun + 256 - RUN_MASK) / 256; /*additional length tokens*/
+ } else {
+ assert(outputDirective == limitedOutput);
+ return 0; /* cannot compress within `dst` budget. Stored indexes in hash table are nonetheless fine */
+ }
+ }
+ DEBUGLOG(6, "Final literal run : %i literals", (int)lastRun);
+ if (lastRun >= RUN_MASK) {
+ size_t accumulator = lastRun - RUN_MASK;
+ *op++ = RUN_MASK << ML_BITS;
+ for(; accumulator >= 255 ; accumulator-=255) *op++ = 255;
+ *op++ = (BYTE) accumulator;
+ } else {
+ *op++ = (BYTE)(lastRun<<ML_BITS);
+ }
+ LZ4_memcpy(op, anchor, lastRun);
+ ip = anchor + lastRun;
+ op += lastRun;
+ }
+
+ if (outputDirective == fillOutput) {
+ *inputConsumed = (int) (((const char*)ip)-source);
+ }
+ result = (int)(((char*)op) - dest);
+ assert(result > 0);
+ DEBUGLOG(5, "LZ4_compress_generic: compressed %i bytes into %i bytes", inputSize, result);
+ return result;
+}
+
+/** LZ4_compress_generic() :
+ * inlined, to ensure branches are decided at compilation time;
+ * takes care of src == (NULL, 0)
+ * and forwards the rest to LZ4_compress_generic_validated */
+LZ4_FORCE_INLINE int LZ4_compress_generic(
+ LZ4_stream_t_internal* const cctx,
+ const char* const src,
+ char* const dst,
+ const int srcSize,
+ int *inputConsumed, /* only written when outputDirective == fillOutput */
+ const int dstCapacity,
+ const limitedOutput_directive outputDirective,
+ const tableType_t tableType,
+ const dict_directive dictDirective,
+ const dictIssue_directive dictIssue,
+ const int acceleration)
+{
+ DEBUGLOG(5, "LZ4_compress_generic: srcSize=%i, dstCapacity=%i",
+ srcSize, dstCapacity);
+
+ if ((U32)srcSize > (U32)LZ4_MAX_INPUT_SIZE) { return 0; } /* Unsupported srcSize, too large (or negative) */
+ if (srcSize == 0) { /* src == NULL supported if srcSize == 0 */
+ if (outputDirective != notLimited && dstCapacity <= 0) return 0; /* no output, can't write anything */
+ DEBUGLOG(5, "Generating an empty block");
+ assert(outputDirective == notLimited || dstCapacity >= 1);
+ assert(dst != NULL);
+ dst[0] = 0;
+ if (outputDirective == fillOutput) {
+ assert (inputConsumed != NULL);
+ *inputConsumed = 0;
+ }
+ return 1;
+ }
+ assert(src != NULL);
+
+ return LZ4_compress_generic_validated(cctx, src, dst, srcSize,
+ inputConsumed, /* only written into if outputDirective == fillOutput */
+ dstCapacity, outputDirective,
+ tableType, dictDirective, dictIssue, acceleration);
+}
+
+
+int LZ4_compress_fast_extState(void* state, const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration)
+{
+ LZ4_stream_t_internal* const ctx = & LZ4_initStream(state, sizeof(LZ4_stream_t)) -> internal_donotuse;
+ assert(ctx != NULL);
+ if (acceleration < 1) acceleration = LZ4_ACCELERATION_DEFAULT;
+ if (acceleration > LZ4_ACCELERATION_MAX) acceleration = LZ4_ACCELERATION_MAX;
+ if (maxOutputSize >= LZ4_compressBound(inputSize)) {
+ if (inputSize < LZ4_64Klimit) {
+ return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, 0, notLimited, byU16, noDict, noDictIssue,
acceleration); + } else { + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)source > LZ4_DISTANCE_MAX)) ? byPtr : byU32; + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration); + } + } else { + if (inputSize < LZ4_64Klimit) { + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration); + } else { + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)source > LZ4_DISTANCE_MAX)) ? byPtr : byU32; + return LZ4_compress_generic(ctx, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, noDict, noDictIssue, acceleration); + } + } +} + +/** + * LZ4_compress_fast_extState_fastReset() : + * A variant of LZ4_compress_fast_extState(). + * + * Using this variant avoids an expensive initialization step. It is only safe + * to call if the state buffer is known to be correctly initialized already + * (see comment in lz4.h on LZ4_resetStream_fast() for a definition of + * "correctly initialized"). + */ +int LZ4_compress_fast_extState_fastReset(void* state, const char* src, char* dst, int srcSize, int dstCapacity, int acceleration) +{ + LZ4_stream_t_internal* ctx = &((LZ4_stream_t*)state)->internal_donotuse; + if (acceleration < 1) acceleration = LZ4_ACCELERATION_DEFAULT; + if (acceleration > LZ4_ACCELERATION_MAX) acceleration = LZ4_ACCELERATION_MAX; + + if (dstCapacity >= LZ4_compressBound(srcSize)) { + if (srcSize < LZ4_64Klimit) { + const tableType_t tableType = byU16; + LZ4_prepareTable(ctx, srcSize, tableType); + if (ctx->currentOffset) { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, notLimited, tableType, noDict, dictSmall, acceleration); + } else { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration); + } + } else { + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? byPtr : byU32; + LZ4_prepareTable(ctx, srcSize, tableType); + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, 0, notLimited, tableType, noDict, noDictIssue, acceleration); + } + } else { + if (srcSize < LZ4_64Klimit) { + const tableType_t tableType = byU16; + LZ4_prepareTable(ctx, srcSize, tableType); + if (ctx->currentOffset) { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, limitedOutput, tableType, noDict, dictSmall, acceleration); + } else { + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, limitedOutput, tableType, noDict, noDictIssue, acceleration); + } + } else { + const tableType_t tableType = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? 
byPtr : byU32; + LZ4_prepareTable(ctx, srcSize, tableType); + return LZ4_compress_generic(ctx, src, dst, srcSize, NULL, dstCapacity, limitedOutput, tableType, noDict, noDictIssue, acceleration); + } + } +} + + +int LZ4_compress_fast(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration) +{ + int result; +#if (LZ4_HEAPMODE) + LZ4_stream_t* ctxPtr = ALLOC(sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ + if (ctxPtr == NULL) return 0; +#else + LZ4_stream_t ctx; + LZ4_stream_t* const ctxPtr = &ctx; +#endif + result = LZ4_compress_fast_extState(ctxPtr, source, dest, inputSize, maxOutputSize, acceleration); + +#if (LZ4_HEAPMODE) + FREEMEM(ctxPtr); +#endif + return result; +} + + +int LZ4_compress_default(const char* src, char* dst, int srcSize, int maxOutputSize) +{ + return LZ4_compress_fast(src, dst, srcSize, maxOutputSize, 1); +} + + +/* Note!: This function leaves the stream in an unclean/broken state! + * It is not safe to subsequently use the same state with a _fastReset() or + * _continue() call without resetting it. */ +static int LZ4_compress_destSize_extState (LZ4_stream_t* state, const char* src, char* dst, int* srcSizePtr, int targetDstSize) +{ + void* const s = LZ4_initStream(state, sizeof (*state)); + assert(s != NULL); (void)s; + + if (targetDstSize >= LZ4_compressBound(*srcSizePtr)) { /* compression success is guaranteed */ + return LZ4_compress_fast_extState(state, src, dst, *srcSizePtr, targetDstSize, 1); + } else { + if (*srcSizePtr < LZ4_64Klimit) { + return LZ4_compress_generic(&state->internal_donotuse, src, dst, *srcSizePtr, srcSizePtr, targetDstSize, fillOutput, byU16, noDict, noDictIssue, 1); + } else { + tableType_t const addrMode = ((sizeof(void*)==4) && ((uptrval)src > LZ4_DISTANCE_MAX)) ? 
byPtr : byU32; + return LZ4_compress_generic(&state->internal_donotuse, src, dst, *srcSizePtr, srcSizePtr, targetDstSize, fillOutput, addrMode, noDict, noDictIssue, 1); + } } +} + + +int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr, int targetDstSize) +{ +#if (LZ4_HEAPMODE) + LZ4_stream_t* ctx = (LZ4_stream_t*)ALLOC(sizeof(LZ4_stream_t)); /* malloc-calloc always properly aligned */ + if (ctx == NULL) return 0; +#else + LZ4_stream_t ctxBody; + LZ4_stream_t* ctx = &ctxBody; +#endif + + int result = LZ4_compress_destSize_extState(ctx, src, dst, srcSizePtr, targetDstSize); + +#if (LZ4_HEAPMODE) + FREEMEM(ctx); +#endif + return result; +} + + + +/*-****************************** +* Streaming functions +********************************/ + +LZ4_stream_t* LZ4_createStream(void) +{ + LZ4_stream_t* const lz4s = (LZ4_stream_t*)ALLOC(sizeof(LZ4_stream_t)); + LZ4_STATIC_ASSERT(LZ4_STREAMSIZE >= sizeof(LZ4_stream_t_internal)); /* A compilation error here means LZ4_STREAMSIZE is not large enough */ + DEBUGLOG(4, "LZ4_createStream %p", lz4s); + if (lz4s == NULL) return NULL; + LZ4_initStream(lz4s, sizeof(*lz4s)); + return lz4s; +} + +static size_t LZ4_stream_t_alignment(void) +{ +#if LZ4_ALIGN_TEST + typedef struct { char c; LZ4_stream_t t; } t_a; + return sizeof(t_a) - sizeof(LZ4_stream_t); +#else + return 1; /* effectively disabled */ +#endif +} + +LZ4_stream_t* LZ4_initStream (void* buffer, size_t size) +{ + DEBUGLOG(5, "LZ4_initStream"); + if (buffer == NULL) { return NULL; } + if (size < sizeof(LZ4_stream_t)) { return NULL; } + if (!LZ4_isAligned(buffer, LZ4_stream_t_alignment())) return NULL; + MEM_INIT(buffer, 0, sizeof(LZ4_stream_t_internal)); + return (LZ4_stream_t*)buffer; +} + +/* resetStream is now deprecated, + * prefer initStream() which is more general */ +void LZ4_resetStream (LZ4_stream_t* LZ4_stream) +{ + DEBUGLOG(5, "LZ4_resetStream (ctx:%p)", LZ4_stream); + MEM_INIT(LZ4_stream, 0, sizeof(LZ4_stream_t_internal)); +} + +void LZ4_resetStream_fast(LZ4_stream_t* ctx) { + LZ4_prepareTable(&(ctx->internal_donotuse), 0, byU32); +} + +int LZ4_freeStream (LZ4_stream_t* LZ4_stream) +{ + if (!LZ4_stream) return 0; /* support free on NULL */ + DEBUGLOG(5, "LZ4_freeStream %p", LZ4_stream); + FREEMEM(LZ4_stream); + return (0); +} + + +#define HASH_UNIT sizeof(reg_t) +int LZ4_loadDict (LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize) +{ + LZ4_stream_t_internal* dict = &LZ4_dict->internal_donotuse; + const tableType_t tableType = byU32; + const BYTE* p = (const BYTE*)dictionary; + const BYTE* const dictEnd = p + dictSize; + const BYTE* base; + + DEBUGLOG(4, "LZ4_loadDict (%i bytes from %p into %p)", dictSize, dictionary, LZ4_dict); + + /* It's necessary to reset the context, + * and not just continue it with prepareTable() + * to avoid any risk of generating overflowing matchIndex + * when compressing using this dictionary */ + LZ4_resetStream(LZ4_dict); + + /* We always increment the offset by 64 KB, since, if the dict is longer, + * we truncate it to the last 64k, and if it's shorter, we still want to + * advance by a whole window length so we can provide the guarantee that + * there are only valid offsets in the window, which allows an optimization + * in LZ4_compress_fast_continue() where it uses noDictIssue even when the + * dictionary isn't a full 64k. 
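+ *
+ * Illustrative call sequence (hypothetical buffers, not part of this file):
+ *   LZ4_stream_t s;
+ *   LZ4_initStream(&s, sizeof(s));
+ *   LZ4_loadDict(&s, dictBuf, dictLen);   /* primes the table, keeps last 64 KB */
+ *   LZ4_compress_fast_continue(&s, src, dst, srcLen, dstCap, 1);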
*/ + dict->currentOffset += 64 KB; + + if (dictSize < (int)HASH_UNIT) { + return 0; + } + + if ((dictEnd - p) > 64 KB) p = dictEnd - 64 KB; + base = dictEnd - dict->currentOffset; + dict->dictionary = p; + dict->dictSize = (U32)(dictEnd - p); + dict->tableType = (U32)tableType; + + while (p <= dictEnd-HASH_UNIT) { + LZ4_putPosition(p, dict->hashTable, tableType, base); + p+=3; + } + + return (int)dict->dictSize; +} + +void LZ4_attach_dictionary(LZ4_stream_t* workingStream, const LZ4_stream_t* dictionaryStream) { + const LZ4_stream_t_internal* dictCtx = dictionaryStream == NULL ? NULL : + &(dictionaryStream->internal_donotuse); + + DEBUGLOG(4, "LZ4_attach_dictionary (%p, %p, size %u)", + workingStream, dictionaryStream, + dictCtx != NULL ? dictCtx->dictSize : 0); + + if (dictCtx != NULL) { + /* If the current offset is zero, we will never look in the + * external dictionary context, since there is no value a table + * entry can take that indicate a miss. In that case, we need + * to bump the offset to something non-zero. + */ + if (workingStream->internal_donotuse.currentOffset == 0) { + workingStream->internal_donotuse.currentOffset = 64 KB; + } + + /* Don't actually attach an empty dictionary. + */ + if (dictCtx->dictSize == 0) { + dictCtx = NULL; + } + } + workingStream->internal_donotuse.dictCtx = dictCtx; +} + + +static void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, int nextSize) +{ + assert(nextSize >= 0); + if (LZ4_dict->currentOffset + (unsigned)nextSize > 0x80000000) { /* potential ptrdiff_t overflow (32-bits mode) */ + /* rescale hash table */ + U32 const delta = LZ4_dict->currentOffset - 64 KB; + const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize; + int i; + DEBUGLOG(4, "LZ4_renormDictT"); + for (i=0; ihashTable[i] < delta) LZ4_dict->hashTable[i]=0; + else LZ4_dict->hashTable[i] -= delta; + } + LZ4_dict->currentOffset = 64 KB; + if (LZ4_dict->dictSize > 64 KB) LZ4_dict->dictSize = 64 KB; + LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize; + } +} + + +int LZ4_compress_fast_continue (LZ4_stream_t* LZ4_stream, + const char* source, char* dest, + int inputSize, int maxOutputSize, + int acceleration) +{ + const tableType_t tableType = byU32; + LZ4_stream_t_internal* streamPtr = &LZ4_stream->internal_donotuse; + const BYTE* dictEnd = streamPtr->dictionary + streamPtr->dictSize; + + DEBUGLOG(5, "LZ4_compress_fast_continue (inputSize=%i)", inputSize); + + LZ4_renormDictT(streamPtr, inputSize); /* avoid index overflow */ + if (acceleration < 1) acceleration = LZ4_ACCELERATION_DEFAULT; + if (acceleration > LZ4_ACCELERATION_MAX) acceleration = LZ4_ACCELERATION_MAX; + + /* invalidate tiny dictionaries */ + if ( (streamPtr->dictSize-1 < 4-1) /* intentional underflow */ + && (dictEnd != (const BYTE*)source) ) { + DEBUGLOG(5, "LZ4_compress_fast_continue: dictSize(%u) at addr:%p is too small", streamPtr->dictSize, streamPtr->dictionary); + streamPtr->dictSize = 0; + streamPtr->dictionary = (const BYTE*)source; + dictEnd = (const BYTE*)source; + } + + /* Check overlapping input/dictionary space */ + { const BYTE* sourceEnd = (const BYTE*) source + inputSize; + if ((sourceEnd > streamPtr->dictionary) && (sourceEnd < dictEnd)) { + streamPtr->dictSize = (U32)(dictEnd - sourceEnd); + if (streamPtr->dictSize > 64 KB) streamPtr->dictSize = 64 KB; + if (streamPtr->dictSize < 4) streamPtr->dictSize = 0; + streamPtr->dictionary = dictEnd - streamPtr->dictSize; + } + } + + /* prefix mode : source data follows dictionary */ + if (dictEnd == (const BYTE*)source) { + if 
((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) + return LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, withPrefix64k, dictSmall, acceleration); + else + return LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, withPrefix64k, noDictIssue, acceleration); + } + + /* external dictionary mode */ + { int result; + if (streamPtr->dictCtx) { + /* We depend here on the fact that dictCtx'es (produced by + * LZ4_loadDict) guarantee that their tables contain no references + * to offsets between dictCtx->currentOffset - 64 KB and + * dictCtx->currentOffset - dictCtx->dictSize. This makes it safe + * to use noDictIssue even when the dict isn't a full 64 KB. + */ + if (inputSize > 4 KB) { + /* For compressing large blobs, it is faster to pay the setup + * cost to copy the dictionary's tables into the active context, + * so that the compression loop is only looking into one table. + */ + LZ4_memcpy(streamPtr, streamPtr->dictCtx, sizeof(*streamPtr)); + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, noDictIssue, acceleration); + } else { + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingDictCtx, noDictIssue, acceleration); + } + } else { + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) { + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, dictSmall, acceleration); + } else { + result = LZ4_compress_generic(streamPtr, source, dest, inputSize, NULL, maxOutputSize, limitedOutput, tableType, usingExtDict, noDictIssue, acceleration); + } + } + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)inputSize; + return result; + } +} + + +/* Hidden debug function, to force-test external dictionary mode */ +int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int srcSize) +{ + LZ4_stream_t_internal* streamPtr = &LZ4_dict->internal_donotuse; + int result; + + LZ4_renormDictT(streamPtr, srcSize); + + if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) { + result = LZ4_compress_generic(streamPtr, source, dest, srcSize, NULL, 0, notLimited, byU32, usingExtDict, dictSmall, 1); + } else { + result = LZ4_compress_generic(streamPtr, source, dest, srcSize, NULL, 0, notLimited, byU32, usingExtDict, noDictIssue, 1); + } + + streamPtr->dictionary = (const BYTE*)source; + streamPtr->dictSize = (U32)srcSize; + + return result; +} + + +/*! LZ4_saveDict() : + * If previously compressed data block is not guaranteed to remain available at its memory location, + * save it into a safer place (char* safeBuffer). + * Note : you don't need to call LZ4_loadDict() afterwards, + * dictionary is immediately usable, you can therefore call LZ4_compress_fast_continue(). + * Return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error. 
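+ * Illustrative use (hypothetical names) : after compressing a block from a
+ * buffer that is about to be overwritten,
+ *   char history[64 * 1024];
+ *   int const saved = LZ4_saveDict(&stream, history, (int)sizeof(history));
+ * keeps the last `saved` bytes of context in `history`, ready for the next
+ * LZ4_compress_fast_continue() call.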
+ */ +int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize) +{ + LZ4_stream_t_internal* const dict = &LZ4_dict->internal_donotuse; + const BYTE* const previousDictEnd = dict->dictionary + dict->dictSize; + + if ((U32)dictSize > 64 KB) { dictSize = 64 KB; } /* useless to define a dictionary > 64 KB */ + if ((U32)dictSize > dict->dictSize) { dictSize = (int)dict->dictSize; } + + if (safeBuffer == NULL) assert(dictSize == 0); + if (safeBuffer && dictSize > 0) + memmove(safeBuffer, previousDictEnd - dictSize, dictSize); + + dict->dictionary = (const BYTE*)safeBuffer; + dict->dictSize = (U32)dictSize; + + return dictSize; +} + + + +/*-******************************* + * Decompression functions + ********************************/ + +typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive; +typedef enum { decode_full_block = 0, partial_decode = 1 } earlyEnd_directive; + +#undef MIN +#define MIN(a,b) ( (a) < (b) ? (a) : (b) ) + +/* Read the variable-length literal or match length. + * + * ip - pointer to use as input. + * lencheck - end ip. Return an error if ip advances >= lencheck. + * loop_check - check ip >= lencheck in body of loop. Returns loop_error if so. + * initial_check - check ip >= lencheck before start of loop. Returns initial_error if so. + * error (output) - error code. Should be set to 0 before call. + */ +typedef enum { loop_error = -2, initial_error = -1, ok = 0 } variable_length_error; +LZ4_FORCE_INLINE unsigned +read_variable_length(const BYTE**ip, const BYTE* lencheck, + int loop_check, int initial_check, + variable_length_error* error) +{ + U32 length = 0; + U32 s; + if (initial_check && unlikely((*ip) >= lencheck)) { /* overflow detection */ + *error = initial_error; + return length; + } + do { + s = **ip; + (*ip)++; + length += s; + if (loop_check && unlikely((*ip) >= lencheck)) { /* overflow detection */ + *error = loop_error; + return length; + } + } while (s==255); + + return length; +} + +/*! LZ4_decompress_generic() : + * This generic decompression function covers all use cases. + * It shall be instantiated several times, using different sets of directives. + * Note that it is important for performance that this function really get inlined, + * in order to remove useless branches during compilation optimization. + */ +LZ4_FORCE_INLINE int +LZ4_decompress_generic( + const char* const src, + char* const dst, + int srcSize, + int outputSize, /* If endOnInput==endOnInputSize, this value is `dstCapacity` */ + + endCondition_directive endOnInput, /* endOnOutputSize, endOnInputSize */ + earlyEnd_directive partialDecoding, /* full, partial */ + dict_directive dict, /* noDict, withPrefix64k, usingExtDict */ + const BYTE* const lowPrefix, /* always <= dst, == dst when no prefix */ + const BYTE* const dictStart, /* only if dict==usingExtDict */ + const size_t dictSize /* note : = 0 if noDict */ + ) +{ + if (src == NULL) { return -1; } + + { const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + + BYTE* op = (BYTE*) dst; + BYTE* const oend = op + outputSize; + BYTE* cpy; + + const BYTE* const dictEnd = (dictStart == NULL) ? NULL : dictStart + dictSize; + + const int safeDecode = (endOnInput==endOnInputSize); + const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB))); + + + /* Set up the "end" pointers for the shortcut. */ + const BYTE* const shortiend = iend - (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/; + const BYTE* const shortoend = oend - (endOnInput ? 
14 : 8) /*maxLL*/ - 18 /*maxML*/; + + const BYTE* match; + size_t offset; + unsigned token; + size_t length; + + + DEBUGLOG(5, "LZ4_decompress_generic (srcSize:%i, dstSize:%i)", srcSize, outputSize); + + /* Special cases */ + assert(lowPrefix <= op); + if ((endOnInput) && (unlikely(outputSize==0))) { + /* Empty output buffer */ + if (partialDecoding) return 0; + return ((srcSize==1) && (*ip==0)) ? 0 : -1; + } + if ((!endOnInput) && (unlikely(outputSize==0))) { return (*ip==0 ? 1 : -1); } + if ((endOnInput) && unlikely(srcSize==0)) { return -1; } + + /* Currently the fast loop shows a regression on qualcomm arm chips. */ +#if LZ4_FAST_DEC_LOOP + if ((oend - op) < FASTLOOP_SAFE_DISTANCE) { + DEBUGLOG(6, "skip fast decode loop"); + goto safe_decode; + } + + /* Fast loop : decode sequences as long as output < iend-FASTLOOP_SAFE_DISTANCE */ + while (1) { + /* Main fastloop assertion: We can always wildcopy FASTLOOP_SAFE_DISTANCE */ + assert(oend - op >= FASTLOOP_SAFE_DISTANCE); + if (endOnInput) { assert(ip < iend); } + token = *ip++; + length = token >> ML_BITS; /* literal length */ + + assert(!endOnInput || ip <= iend); /* ip < iend before the increment */ + + /* decode literal length */ + if (length == RUN_MASK) { + variable_length_error error = ok; + length += read_variable_length(&ip, iend-RUN_MASK, (int)endOnInput, (int)endOnInput, &error); + if (error == initial_error) { goto _output_error; } + if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */ + if ((safeDecode) && unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow detection */ + + /* copy literals */ + cpy = op+length; + LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); + if (endOnInput) { /* LZ4_decompress_safe() */ + if ((cpy>oend-32) || (ip+length>iend-32)) { goto safe_literal_copy; } + LZ4_wildCopy32(op, ip, cpy); + } else { /* LZ4_decompress_fast() */ + if (cpy>oend-8) { goto safe_literal_copy; } + LZ4_wildCopy8(op, ip, cpy); /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time : + * it doesn't know input length, and only relies on end-of-block properties */ + } + ip += length; op = cpy; + } else { + cpy = op+length; + if (endOnInput) { /* LZ4_decompress_safe() */ + DEBUGLOG(7, "copy %u bytes in a 16-bytes stripe", (unsigned)length); + /* We don't need to check oend, since we check it once for each loop below */ + if (ip > iend-(16 + 1/*max lit + offset + nextToken*/)) { goto safe_literal_copy; } + /* Literals can only be 14, but hope compilers optimize if we copy by a register size */ + LZ4_memcpy(op, ip, 16); + } else { /* LZ4_decompress_fast() */ + /* LZ4_decompress_fast() cannot copy more than 8 bytes at a time : + * it doesn't know input length, and relies on end-of-block properties */ + LZ4_memcpy(op, ip, 8); + if (length > 8) { LZ4_memcpy(op+8, ip+8, 8); } + } + ip += length; op = cpy; + } + + /* get offset */ + offset = LZ4_readLE16(ip); ip+=2; + match = op - offset; + assert(match <= op); + + /* get matchlength */ + length = token & ML_MASK; + + if (length == ML_MASK) { + variable_length_error error = ok; + if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) { goto _output_error; } /* Error : offset outside buffers */ + length += read_variable_length(&ip, iend - LASTLITERALS + 1, (int)endOnInput, 0, &error); + if (error != ok) { goto _output_error; } + if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)op)) { goto _output_error; } /* overflow detection */ + length += MINMATCH; + if (op + length >= oend 
- FASTLOOP_SAFE_DISTANCE) { + goto safe_match_copy; + } + } else { + length += MINMATCH; + if (op + length >= oend - FASTLOOP_SAFE_DISTANCE) { + goto safe_match_copy; + } + + /* Fastpath check: Avoids a branch in LZ4_wildCopy32 if true */ + if ((dict == withPrefix64k) || (match >= lowPrefix)) { + if (offset >= 8) { + assert(match >= lowPrefix); + assert(match <= op); + assert(op + 18 <= oend); + + LZ4_memcpy(op, match, 8); + LZ4_memcpy(op+8, match+8, 8); + LZ4_memcpy(op+16, match+16, 2); + op += length; + continue; + } } } + + if (checkOffset && (unlikely(match + dictSize < lowPrefix))) { goto _output_error; } /* Error : offset outside buffers */ + /* match starting within external dictionary */ + if ((dict==usingExtDict) && (match < lowPrefix)) { + if (unlikely(op+length > oend-LASTLITERALS)) { + if (partialDecoding) { + DEBUGLOG(7, "partialDecoding: dictionary match, close to dstEnd"); + length = MIN(length, (size_t)(oend-op)); + } else { + goto _output_error; /* end-of-block condition violated */ + } } + + if (length <= (size_t)(lowPrefix-match)) { + /* match fits entirely within external dictionary : just copy */ + memmove(op, dictEnd - (lowPrefix-match), length); + op += length; + } else { + /* match stretches into both external dictionary and current block */ + size_t const copySize = (size_t)(lowPrefix - match); + size_t const restSize = length - copySize; + LZ4_memcpy(op, dictEnd - copySize, copySize); + op += copySize; + if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ + BYTE* const endOfMatch = op + restSize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) { *op++ = *copyFrom++; } + } else { + LZ4_memcpy(op, lowPrefix, restSize); + op += restSize; + } } + continue; + } + + /* copy match within block */ + cpy = op + length; + + assert((op <= oend) && (oend-op >= 32)); + if (unlikely(offset<16)) { + LZ4_memcpy_using_offset(op, match, cpy, offset); + } else { + LZ4_wildCopy32(op, match, cpy); + } + + op = cpy; /* wildcopy correction */ + } + safe_decode: +#endif + + /* Main Loop : decode remaining sequences where output < FASTLOOP_SAFE_DISTANCE */ + while (1) { + token = *ip++; + length = token >> ML_BITS; /* literal length */ + + assert(!endOnInput || ip <= iend); /* ip < iend before the increment */ + + /* A two-stage shortcut for the most common case: + * 1) If the literal length is 0..14, and there is enough space, + * enter the shortcut and copy 16 bytes on behalf of the literals + * (in the fast mode, only 8 bytes can be safely copied this way). + * 2) Further if the match length is 4..18, copy 18 bytes in a similar + * manner; but we ensure that there's enough space in the output for + * those 18 bytes earlier, upon entering the shortcut (in other words, + * there is a combined check for both stages). + */ + if ( (endOnInput ? length != RUN_MASK : length <= 8) + /* strictly "less than" on input, to re-enter the loop with at least one byte */ + && likely((endOnInput ? ip < shortiend : 1) & (op <= shortoend)) ) { + /* Copy the literals */ + LZ4_memcpy(op, ip, endOnInput ? 16 : 8); + op += length; ip += length; + + /* The second stage: prepare for match copying, decode full info. + * If it doesn't work out, the info won't be wasted. */ + length = token & ML_MASK; /* match length */ + offset = LZ4_readLE16(ip); ip += 2; + match = op - offset; + assert(match <= op); /* check overflow */ + + /* Do not deal with overlapping matches. */ + if ( (length != ML_MASK) + && (offset >= 8) + && (dict==withPrefix64k || match >= lowPrefix) ) { + /* Copy the match. 
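+ Fixed 18-byte copy (8+8+2) : covers the longest match this shortcut
+ accepts (4..18 bytes) without a length-dependent branch.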
*/ + LZ4_memcpy(op + 0, match + 0, 8); + LZ4_memcpy(op + 8, match + 8, 8); + LZ4_memcpy(op +16, match +16, 2); + op += length + MINMATCH; + /* Both stages worked, load the next token. */ + continue; + } + + /* The second stage didn't work out, but the info is ready. + * Propel it right to the point of match copying. */ + goto _copy_match; + } + + /* decode literal length */ + if (length == RUN_MASK) { + variable_length_error error = ok; + length += read_variable_length(&ip, iend-RUN_MASK, (int)endOnInput, (int)endOnInput, &error); + if (error == initial_error) { goto _output_error; } + if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)(op))) { goto _output_error; } /* overflow detection */ + if ((safeDecode) && unlikely((uptrval)(ip)+length<(uptrval)(ip))) { goto _output_error; } /* overflow detection */ + } + + /* copy literals */ + cpy = op+length; +#if LZ4_FAST_DEC_LOOP + safe_literal_copy: +#endif + LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); + if ( ((endOnInput) && ((cpy>oend-MFLIMIT) || (ip+length>iend-(2+1+LASTLITERALS))) ) + || ((!endOnInput) && (cpy>oend-WILDCOPYLENGTH)) ) + { + /* We've either hit the input parsing restriction or the output parsing restriction. + * In the normal scenario, decoding a full block, it must be the last sequence, + * otherwise it's an error (invalid input or dimensions). + * In partialDecoding scenario, it's necessary to ensure there is no buffer overflow. + */ + if (partialDecoding) { + /* Since we are partial decoding we may be in this block because of the output parsing + * restriction, which is not valid since the output buffer is allowed to be undersized. + */ + assert(endOnInput); + DEBUGLOG(7, "partialDecoding: copying literals, close to input or output end") + DEBUGLOG(7, "partialDecoding: literal length = %u", (unsigned)length); + DEBUGLOG(7, "partialDecoding: remaining space in dstBuffer : %i", (int)(oend - op)); + DEBUGLOG(7, "partialDecoding: remaining space in srcBuffer : %i", (int)(iend - ip)); + /* Finishing in the middle of a literals segment, + * due to lack of input. + */ + if (ip+length > iend) { + length = (size_t)(iend-ip); + cpy = op + length; + } + /* Finishing in the middle of a literals segment, + * due to lack of output space. + */ + if (cpy > oend) { + cpy = oend; + assert(op<=oend); + length = (size_t)(oend-op); + } + } else { + /* We must be on the last sequence because of the parsing limitations so check + * that we exactly regenerate the original size (must be exact when !endOnInput). + */ + if ((!endOnInput) && (cpy != oend)) { goto _output_error; } + /* We must be on the last sequence (or invalid) because of the parsing limitations + * so check that we exactly consume the input and don't overrun the output buffer. + */ + if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) { + DEBUGLOG(6, "should have been last run of literals") + DEBUGLOG(6, "ip(%p) + length(%i) = %p != iend (%p)", ip, (int)length, ip+length, iend); + DEBUGLOG(6, "or cpy(%p) > oend(%p)", cpy, oend); + goto _output_error; + } + } + memmove(op, ip, length); /* supports overlapping memory regions; only matters for in-place decompression scenarios */ + ip += length; + op += length; + /* Necessarily EOF when !partialDecoding. + * When partialDecoding, it is EOF if we've either + * filled the output buffer or + * can't proceed with reading an offset for following match. 
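+ * (ip >= iend-2 : at most two input bytes remain, not enough to read
+ * a token plus the 16-bit offset of another match).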
+ */ + if (!partialDecoding || (cpy == oend) || (ip >= (iend-2))) { + break; + } + } else { + LZ4_wildCopy8(op, ip, cpy); /* may overwrite up to WILDCOPYLENGTH beyond cpy */ + ip += length; op = cpy; + } + + /* get offset */ + offset = LZ4_readLE16(ip); ip+=2; + match = op - offset; + + /* get matchlength */ + length = token & ML_MASK; + + _copy_match: + if (length == ML_MASK) { + variable_length_error error = ok; + length += read_variable_length(&ip, iend - LASTLITERALS + 1, (int)endOnInput, 0, &error); + if (error != ok) goto _output_error; + if ((safeDecode) && unlikely((uptrval)(op)+length<(uptrval)op)) goto _output_error; /* overflow detection */ + } + length += MINMATCH; + +#if LZ4_FAST_DEC_LOOP + safe_match_copy: +#endif + if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) goto _output_error; /* Error : offset outside buffers */ + /* match starting within external dictionary */ + if ((dict==usingExtDict) && (match < lowPrefix)) { + if (unlikely(op+length > oend-LASTLITERALS)) { + if (partialDecoding) length = MIN(length, (size_t)(oend-op)); + else goto _output_error; /* doesn't respect parsing restriction */ + } + + if (length <= (size_t)(lowPrefix-match)) { + /* match fits entirely within external dictionary : just copy */ + memmove(op, dictEnd - (lowPrefix-match), length); + op += length; + } else { + /* match stretches into both external dictionary and current block */ + size_t const copySize = (size_t)(lowPrefix - match); + size_t const restSize = length - copySize; + LZ4_memcpy(op, dictEnd - copySize, copySize); + op += copySize; + if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ + BYTE* const endOfMatch = op + restSize; + const BYTE* copyFrom = lowPrefix; + while (op < endOfMatch) *op++ = *copyFrom++; + } else { + LZ4_memcpy(op, lowPrefix, restSize); + op += restSize; + } } + continue; + } + assert(match >= lowPrefix); + + /* copy match within block */ + cpy = op + length; + + /* partialDecoding : may end anywhere within the block */ + assert(op<=oend); + if (partialDecoding && (cpy > oend-MATCH_SAFEGUARD_DISTANCE)) { + size_t const mlen = MIN(length, (size_t)(oend-op)); + const BYTE* const matchEnd = match + mlen; + BYTE* const copyEnd = op + mlen; + if (matchEnd > op) { /* overlap copy */ + while (op < copyEnd) { *op++ = *match++; } + } else { + LZ4_memcpy(op, match, mlen); + } + op = copyEnd; + if (op == oend) { break; } + continue; + } + + if (unlikely(offset<8)) { + LZ4_write32(op, 0); /* silence msan warning when offset==0 */ + op[0] = match[0]; + op[1] = match[1]; + op[2] = match[2]; + op[3] = match[3]; + match += inc32table[offset]; + LZ4_memcpy(op+4, match, 4); + match -= dec64table[offset]; + } else { + LZ4_memcpy(op, match, 8); + match += 8; + } + op += 8; + + if (unlikely(cpy > oend-MATCH_SAFEGUARD_DISTANCE)) { + BYTE* const oCopyLimit = oend - (WILDCOPYLENGTH-1); + if (cpy > oend-LASTLITERALS) { goto _output_error; } /* Error : last LASTLITERALS bytes must be literals (uncompressed) */ + if (op < oCopyLimit) { + LZ4_wildCopy8(op, match, oCopyLimit); + match += oCopyLimit - op; + op = oCopyLimit; + } + while (op < cpy) { *op++ = *match++; } + } else { + LZ4_memcpy(op, match, 8); + if (length > 16) { LZ4_wildCopy8(op+8, match+8, cpy); } + } + op = cpy; /* wildcopy correction */ + } + + /* end of decoding */ + if (endOnInput) { + DEBUGLOG(5, "decoded %i bytes", (int) (((char*)op)-dst)); + return (int) (((char*)op)-dst); /* Nb of output bytes decoded */ + } else { + return (int) (((const char*)ip)-src); /* Nb of input bytes read */ + } + + /* 
Overflow error detected */ + _output_error: + return (int) (-(((const char*)ip)-src))-1; + } +} + + +/*===== Instantiate the API decoding functions. =====*/ + +LZ4_FORCE_O2 +int LZ4_decompress_safe(const char* source, char* dest, int compressedSize, int maxDecompressedSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, + endOnInputSize, decode_full_block, noDict, + (BYTE*)dest, NULL, 0); +} + +LZ4_FORCE_O2 +int LZ4_decompress_safe_partial(const char* src, char* dst, int compressedSize, int targetOutputSize, int dstCapacity) +{ + dstCapacity = MIN(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(src, dst, compressedSize, dstCapacity, + endOnInputSize, partial_decode, + noDict, (BYTE*)dst, NULL, 0); +} + +LZ4_FORCE_O2 +int LZ4_decompress_fast(const char* source, char* dest, int originalSize) +{ + return LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, decode_full_block, withPrefix64k, + (BYTE*)dest - 64 KB, NULL, 0); +} + +/*===== Instantiate a few more decoding cases, used more than once. =====*/ + +LZ4_FORCE_O2 /* Exported, an obsolete API function. */ +int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int compressedSize, int maxOutputSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + endOnInputSize, decode_full_block, withPrefix64k, + (BYTE*)dest - 64 KB, NULL, 0); +} + +/* Another obsolete API function, paired with the previous one. */ +int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int originalSize) +{ + /* LZ4_decompress_fast doesn't validate match offsets, + * and thus serves well with any prefixed dictionary. */ + return LZ4_decompress_fast(source, dest, originalSize); +} + +LZ4_FORCE_O2 +static int LZ4_decompress_safe_withSmallPrefix(const char* source, char* dest, int compressedSize, int maxOutputSize, + size_t prefixSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + endOnInputSize, decode_full_block, noDict, + (BYTE*)dest-prefixSize, NULL, 0); +} + +LZ4_FORCE_O2 +int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, + int compressedSize, int maxOutputSize, + const void* dictStart, size_t dictSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + endOnInputSize, decode_full_block, usingExtDict, + (BYTE*)dest, (const BYTE*)dictStart, dictSize); +} + +LZ4_FORCE_O2 +static int LZ4_decompress_fast_extDict(const char* source, char* dest, int originalSize, + const void* dictStart, size_t dictSize) +{ + return LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, decode_full_block, usingExtDict, + (BYTE*)dest, (const BYTE*)dictStart, dictSize); +} + +/* The "double dictionary" mode, for use with e.g. ring buffers: the first part + * of the dictionary is passed as prefix, and the second via dictStart + dictSize. + * These routines are used only once, in LZ4_decompress_*_continue(). 
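+ * Assumed memory layout (illustrative) :
+ *   [ dictStart .. dictStart+dictSize ]   older history, anywhere in memory
+ *   [ dest-prefixSize .. dest ]           newer history, contiguous with dest
+ *   [ dest .. ]                           block being decoded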
+ */ +LZ4_FORCE_INLINE +int LZ4_decompress_safe_doubleDict(const char* source, char* dest, int compressedSize, int maxOutputSize, + size_t prefixSize, const void* dictStart, size_t dictSize) +{ + return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, + endOnInputSize, decode_full_block, usingExtDict, + (BYTE*)dest-prefixSize, (const BYTE*)dictStart, dictSize); +} + +LZ4_FORCE_INLINE +int LZ4_decompress_fast_doubleDict(const char* source, char* dest, int originalSize, + size_t prefixSize, const void* dictStart, size_t dictSize) +{ + return LZ4_decompress_generic(source, dest, 0, originalSize, + endOnOutputSize, decode_full_block, usingExtDict, + (BYTE*)dest-prefixSize, (const BYTE*)dictStart, dictSize); +} + +/*===== streaming decompression functions =====*/ + +LZ4_streamDecode_t* LZ4_createStreamDecode(void) +{ + LZ4_streamDecode_t* lz4s = (LZ4_streamDecode_t*) ALLOC_AND_ZERO(sizeof(LZ4_streamDecode_t)); + LZ4_STATIC_ASSERT(LZ4_STREAMDECODESIZE >= sizeof(LZ4_streamDecode_t_internal)); /* A compilation error here means LZ4_STREAMDECODESIZE is not large enough */ + return lz4s; +} + +int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream) +{ + if (LZ4_stream == NULL) { return 0; } /* support free on NULL */ + FREEMEM(LZ4_stream); + return 0; +} + +/*! LZ4_setStreamDecode() : + * Use this function to instruct where to find the dictionary. + * This function is not necessary if previous data is still available where it was decoded. + * Loading a size of 0 is allowed (same effect as no dictionary). + * @return : 1 if OK, 0 if error + */ +int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + lz4sd->prefixSize = (size_t) dictSize; + lz4sd->prefixEnd = (const BYTE*) dictionary + dictSize; + lz4sd->externalDict = NULL; + lz4sd->extDictSize = 0; + return 1; +} + +/*! LZ4_decoderRingBufferSize() : + * when setting a ring buffer for streaming decompression (optional scenario), + * provides the minimum size of this ring buffer + * to be compatible with any source respecting maxBlockSize condition. + * Note : in a ring buffer scenario, + * blocks are presumed decompressed next to each other. + * When not enough space remains for next block (remainingSize < maxBlockSize), + * decoding resumes from beginning of ring buffer. + * @return : minimum ring buffer size, + * or 0 if there is an error (invalid maxBlockSize). + */ +int LZ4_decoderRingBufferSize(int maxBlockSize) +{ + if (maxBlockSize < 0) return 0; + if (maxBlockSize > LZ4_MAX_INPUT_SIZE) return 0; + if (maxBlockSize < 16) maxBlockSize = 16; + return LZ4_DECODER_RING_BUFFER_SIZE(maxBlockSize); +} + +/* +*_continue() : + These decoding functions allow decompression of multiple blocks in "streaming" mode. + Previously decoded blocks must still be available at the memory position where they were decoded. + If it's not possible, save the relevant part of decoded data into a safe buffer, + and indicate where it stands using LZ4_setStreamDecode() +*/ +LZ4_FORCE_O2 +int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + int result; + + if (lz4sd->prefixSize == 0) { + /* The first call, no dictionary yet. 
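+ The block is decoded standalone, then becomes the prefix
+ (and thus the dictionary) for the blocks that follow.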
*/ + assert(lz4sd->extDictSize == 0); + result = LZ4_decompress_safe(source, dest, compressedSize, maxOutputSize); + if (result <= 0) return result; + lz4sd->prefixSize = (size_t)result; + lz4sd->prefixEnd = (BYTE*)dest + result; + } else if (lz4sd->prefixEnd == (BYTE*)dest) { + /* They're rolling the current segment. */ + if (lz4sd->prefixSize >= 64 KB - 1) + result = LZ4_decompress_safe_withPrefix64k(source, dest, compressedSize, maxOutputSize); + else if (lz4sd->extDictSize == 0) + result = LZ4_decompress_safe_withSmallPrefix(source, dest, compressedSize, maxOutputSize, + lz4sd->prefixSize); + else + result = LZ4_decompress_safe_doubleDict(source, dest, compressedSize, maxOutputSize, + lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize += (size_t)result; + lz4sd->prefixEnd += result; + } else { + /* The buffer wraps around, or they're switching to another buffer. */ + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_safe_forceExtDict(source, dest, compressedSize, maxOutputSize, + lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize = (size_t)result; + lz4sd->prefixEnd = (BYTE*)dest + result; + } + + return result; +} + +LZ4_FORCE_O2 +int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize) +{ + LZ4_streamDecode_t_internal* lz4sd = &LZ4_streamDecode->internal_donotuse; + int result; + assert(originalSize >= 0); + + if (lz4sd->prefixSize == 0) { + assert(lz4sd->extDictSize == 0); + result = LZ4_decompress_fast(source, dest, originalSize); + if (result <= 0) return result; + lz4sd->prefixSize = (size_t)originalSize; + lz4sd->prefixEnd = (BYTE*)dest + originalSize; + } else if (lz4sd->prefixEnd == (BYTE*)dest) { + if (lz4sd->prefixSize >= 64 KB - 1 || lz4sd->extDictSize == 0) + result = LZ4_decompress_fast(source, dest, originalSize); + else + result = LZ4_decompress_fast_doubleDict(source, dest, originalSize, + lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize += (size_t)originalSize; + lz4sd->prefixEnd += originalSize; + } else { + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_fast_extDict(source, dest, originalSize, + lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) return result; + lz4sd->prefixSize = (size_t)originalSize; + lz4sd->prefixEnd = (BYTE*)dest + originalSize; + } + + return result; +} + + +/* +Advanced decoding functions : +*_usingDict() : + These decoding functions work the same as "_continue" ones, + the dictionary must be explicitly provided within parameters +*/ + +int LZ4_decompress_safe_usingDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize) +{ + if (dictSize==0) + return LZ4_decompress_safe(source, dest, compressedSize, maxOutputSize); + if (dictStart+dictSize == dest) { + if (dictSize >= 64 KB - 1) { + return LZ4_decompress_safe_withPrefix64k(source, dest, compressedSize, maxOutputSize); + } + assert(dictSize >= 0); + return LZ4_decompress_safe_withSmallPrefix(source, dest, compressedSize, maxOutputSize, (size_t)dictSize); + } + assert(dictSize >= 0); + return LZ4_decompress_safe_forceExtDict(source, dest, compressedSize, maxOutputSize, dictStart, (size_t)dictSize); +} + +int 
LZ4_decompress_fast_usingDict(const char* source, char* dest, int originalSize, const char* dictStart, int dictSize) +{ + if (dictSize==0 || dictStart+dictSize == dest) + return LZ4_decompress_fast(source, dest, originalSize); + assert(dictSize >= 0); + return LZ4_decompress_fast_extDict(source, dest, originalSize, dictStart, (size_t)dictSize); +} + + +/*=************************************************* +* Obsolete Functions +***************************************************/ +/* obsolete compression functions */ +int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) +{ + return LZ4_compress_default(source, dest, inputSize, maxOutputSize); +} +int LZ4_compress(const char* src, char* dest, int srcSize) +{ + return LZ4_compress_default(src, dest, srcSize, LZ4_compressBound(srcSize)); +} +int LZ4_compress_limitedOutput_withState (void* state, const char* src, char* dst, int srcSize, int dstSize) +{ + return LZ4_compress_fast_extState(state, src, dst, srcSize, dstSize, 1); +} +int LZ4_compress_withState (void* state, const char* src, char* dst, int srcSize) +{ + return LZ4_compress_fast_extState(state, src, dst, srcSize, LZ4_compressBound(srcSize), 1); +} +int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_stream, const char* src, char* dst, int srcSize, int dstCapacity) +{ + return LZ4_compress_fast_continue(LZ4_stream, src, dst, srcSize, dstCapacity, 1); +} +int LZ4_compress_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize) +{ + return LZ4_compress_fast_continue(LZ4_stream, source, dest, inputSize, LZ4_compressBound(inputSize), 1); +} + +/* +These decompression functions are deprecated and should no longer be used. +They are only provided here for compatibility with older user programs. +- LZ4_uncompress is totally equivalent to LZ4_decompress_fast +- LZ4_uncompress_unknownOutputSize is totally equivalent to LZ4_decompress_safe +*/ +int LZ4_uncompress (const char* source, char* dest, int outputSize) +{ + return LZ4_decompress_fast(source, dest, outputSize); +} +int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) +{ + return LZ4_decompress_safe(source, dest, isize, maxOutputSize); +} + +/* Obsolete Streaming functions */ + +int LZ4_sizeofStreamState(void) { return LZ4_STREAMSIZE; } + +int LZ4_resetStreamState(void* state, char* inputBuffer) +{ + (void)inputBuffer; + LZ4_resetStream((LZ4_stream_t*)state); + return 0; +} + +void* LZ4_create (char* inputBuffer) +{ + (void)inputBuffer; + return LZ4_createStream(); +} + +char* LZ4_slideInputBuffer (void* state) +{ + /* avoid const char * -> char * conversion warning */ + return (char *)(uptrval)((LZ4_stream_t*)state)->internal_donotuse.dictionary; +} + +#endif /* LZ4_COMMONDEFS_ONLY */ + diff --git a/cviruntime/src/lz4/xxhash.c b/cviruntime/src/lz4/xxhash.c new file mode 100644 index 000000000..ff28749e3 --- /dev/null +++ b/cviruntime/src/lz4/xxhash.c @@ -0,0 +1,1030 @@ +/* +* xxHash - Fast Hash algorithm +* Copyright (C) 2012-2016, Yann Collet +* +* BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions are +* met: +* +* * Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. 
+* * Redistributions in binary form must reproduce the above
+* copyright notice, this list of conditions and the following disclaimer
+* in the documentation and/or other materials provided with the
+* distribution.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+* You can contact the author at :
+* - xxHash homepage: http://www.xxhash.com
+* - xxHash source repository : https://github.com/Cyan4973/xxHash
+*/
+
+
+/* *************************************
+* Tuning parameters
+***************************************/
+/*!XXH_FORCE_MEMORY_ACCESS :
+ * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
+ * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
+ * The switch below allows selecting a different access method for improved performance.
+ * Method 0 (default) : use `memcpy()`. Safe and portable.
+ * Method 1 : `__packed` statement. It depends on a compiler extension (ie, not portable).
+ * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
+ * Method 2 : direct access. This method doesn't depend on the compiler, but violates the C standard.
+ * It can generate buggy code on targets which do not support unaligned memory accesses.
+ * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
+ * See http://stackoverflow.com/a/32095106/646947 for details.
+ * Prefer these methods in priority order (0 > 1 > 2)
+ */
+#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
+# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \
+   || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) \
+   || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+#   define XXH_FORCE_MEMORY_ACCESS 2
+# elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
+   (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \
+   || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \
+   || defined(__ARM_ARCH_7S__) ))
+#   define XXH_FORCE_MEMORY_ACCESS 1
+# endif
+#endif
+
+/*!XXH_ACCEPT_NULL_INPUT_POINTER :
+ * If the input pointer is NULL, xxHash's default behavior is to dereference it, triggering a segfault.
+ * When this macro is enabled, xxHash actively checks the input for a null pointer.
+ * If it is NULL, the result for a null input pointer is the same as for a zero-length input.
+ */
+#ifndef XXH_ACCEPT_NULL_INPUT_POINTER /* can be defined externally */
+#  define XXH_ACCEPT_NULL_INPUT_POINTER 0
+#endif
+
+/*!XXH_FORCE_NATIVE_FORMAT :
+ * By default, the xxHash library provides endian-independent hash values, based on little-endian convention.
+ * Results are therefore identical for little-endian and big-endian CPU.
+ * This comes at a performance cost for big-endian CPU, since some swapping is required to emulate the little-endian format.
+ * Should endian-independence be of no importance for your application, you may set the #define below to 1,
+ * to improve speed for big-endian CPU.
+ * This option has no impact on little-endian CPU.
+ */
+#ifndef XXH_FORCE_NATIVE_FORMAT /* can be defined externally */
+#  define XXH_FORCE_NATIVE_FORMAT 0
+#endif
+
+/*!XXH_FORCE_ALIGN_CHECK :
+ * This is a minor performance trick, only useful with lots of very small keys.
+ * It means : check for aligned/unaligned input.
+ * The check costs one initial branch per hash;
+ * set it to 0 when the input is guaranteed to be aligned,
+ * or when alignment doesn't matter for performance.
+ */
+#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */
+#  if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+#    define XXH_FORCE_ALIGN_CHECK 0
+#  else
+#    define XXH_FORCE_ALIGN_CHECK 1
+#  endif
+#endif
+
+
+/* *************************************
+* Includes & Memory related functions
+***************************************/
+/*! Modify the local functions below should you wish to use some other memory routines
+* for malloc(), free() */
+#include <stdlib.h>
+static void* XXH_malloc(size_t s) { return malloc(s); }
+static void  XXH_free (void* p)  { free(p); }
+/*! and for memcpy() */
+#include <string.h>
+static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); }
+
+#include <assert.h> /* assert */
+
+#define XXH_STATIC_LINKING_ONLY
+#include "xxhash.h"
+
+
+/* *************************************
+* Compiler Specific Options
+***************************************/
+#ifdef _MSC_VER /* Visual Studio */
+#  pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
+#  define FORCE_INLINE static __forceinline
+#else
+#  if defined (__cplusplus) || defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */
+#    ifdef __GNUC__
+#      define FORCE_INLINE static inline __attribute__((always_inline))
+#    else
+#      define FORCE_INLINE static inline
+#    endif
+#  else
+#    define FORCE_INLINE static
+#  endif /* __STDC_VERSION__ */
+#endif
+
+
+/* *************************************
+* Basic Types
+***************************************/
+#ifndef MEM_MODULE
+# if !defined (__VMS) \
+   && (defined (__cplusplus) \
+   || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   include <stdint.h>
+    typedef uint8_t  BYTE;
+    typedef uint16_t U16;
+    typedef uint32_t U32;
+# else
+    typedef unsigned char  BYTE;
+    typedef unsigned short U16;
+    typedef unsigned int   U32;
+# endif
+#endif
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
+static U32 XXH_read32(const void* memPtr) { return *(const U32*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { U32 u32; } __attribute__((packed)) unalign;
+static U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+
+#else
+
+/* portable and safe solution. Generally efficient.
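+ * (Editor's note: with mainstream compilers at -O1 and above, a fixed-size
+ * memcpy() like the one below is typically lowered to a single load
+ * instruction, so the portable method usually costs nothing at runtime.)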
+ * see : http://stackoverflow.com/a/32095106/646947 + */ +static U32 XXH_read32(const void* memPtr) +{ + U32 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + + +/* **************************************** +* Compiler-specific Functions and Macros +******************************************/ +#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +/* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */ +#if defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +# define XXH_rotl64(x,r) _rotl64(x,r) +#else +# define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r))) +# define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r))) +#endif + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap32 _byteswap_ulong +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +#else +static U32 XXH_swap32 (U32 x) +{ + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); +} +#endif + + +/* ************************************* +* Architecture Macros +***************************************/ +typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess; + +/* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */ +#ifndef XXH_CPU_LITTLE_ENDIAN +static int XXH_isLittleEndian(void) +{ + const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ + return one.c[0]; +} +# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() +#endif + + +/* *************************** +* Memory reads +*****************************/ +typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; + +FORCE_INLINE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align) +{ + if (align==XXH_unaligned) + return endian==XXH_littleEndian ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr)); + else + return endian==XXH_littleEndian ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr); +} + +FORCE_INLINE U32 XXH_readLE32(const void* ptr, XXH_endianess endian) +{ + return XXH_readLE32_align(ptr, endian, XXH_unaligned); +} + +static U32 XXH_readBE32(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? 
XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr); +} + + +/* ************************************* +* Macros +***************************************/ +#define XXH_STATIC_ASSERT(c) { enum { XXH_sa = 1/(int)(!!(c)) }; } /* use after variable declarations */ +XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; } + + +/* ******************************************************************* +* 32-bit hash functions +*********************************************************************/ +static const U32 PRIME32_1 = 2654435761U; +static const U32 PRIME32_2 = 2246822519U; +static const U32 PRIME32_3 = 3266489917U; +static const U32 PRIME32_4 = 668265263U; +static const U32 PRIME32_5 = 374761393U; + +static U32 XXH32_round(U32 seed, U32 input) +{ + seed += input * PRIME32_2; + seed = XXH_rotl32(seed, 13); + seed *= PRIME32_1; + return seed; +} + +/* mix all bits */ +static U32 XXH32_avalanche(U32 h32) +{ + h32 ^= h32 >> 15; + h32 *= PRIME32_2; + h32 ^= h32 >> 13; + h32 *= PRIME32_3; + h32 ^= h32 >> 16; + return(h32); +} + +#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align) + +static U32 +XXH32_finalize(U32 h32, const void* ptr, size_t len, + XXH_endianess endian, XXH_alignment align) + +{ + const BYTE* p = (const BYTE*)ptr; + +#define PROCESS1 \ + h32 += (*p++) * PRIME32_5; \ + h32 = XXH_rotl32(h32, 11) * PRIME32_1 ; + +#define PROCESS4 \ + h32 += XXH_get32bits(p) * PRIME32_3; \ + p+=4; \ + h32 = XXH_rotl32(h32, 17) * PRIME32_4 ; + + switch(len&15) /* or switch(bEnd - p) */ + { + case 12: PROCESS4; + /* fallthrough */ + case 8: PROCESS4; + /* fallthrough */ + case 4: PROCESS4; + return XXH32_avalanche(h32); + + case 13: PROCESS4; + /* fallthrough */ + case 9: PROCESS4; + /* fallthrough */ + case 5: PROCESS4; + PROCESS1; + return XXH32_avalanche(h32); + + case 14: PROCESS4; + /* fallthrough */ + case 10: PROCESS4; + /* fallthrough */ + case 6: PROCESS4; + PROCESS1; + PROCESS1; + return XXH32_avalanche(h32); + + case 15: PROCESS4; + /* fallthrough */ + case 11: PROCESS4; + /* fallthrough */ + case 7: PROCESS4; + /* fallthrough */ + case 3: PROCESS1; + /* fallthrough */ + case 2: PROCESS1; + /* fallthrough */ + case 1: PROCESS1; + /* fallthrough */ + case 0: return XXH32_avalanche(h32); + } + assert(0); + return h32; /* reaching this point is deemed impossible */ +} + + +FORCE_INLINE U32 +XXH32_endian_align(const void* input, size_t len, U32 seed, + XXH_endianess endian, XXH_alignment align) +{ + const BYTE* p = (const BYTE*)input; + const BYTE* bEnd = p + len; + U32 h32; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (p==NULL) { + len=0; + bEnd=p=(const BYTE*)(size_t)16; + } +#endif + + if (len>=16) { + const BYTE* const limit = bEnd - 15; + U32 v1 = seed + PRIME32_1 + PRIME32_2; + U32 v2 = seed + PRIME32_2; + U32 v3 = seed + 0; + U32 v4 = seed - PRIME32_1; + + do { + v1 = XXH32_round(v1, XXH_get32bits(p)); p+=4; + v2 = XXH32_round(v2, XXH_get32bits(p)); p+=4; + v3 = XXH32_round(v3, XXH_get32bits(p)); p+=4; + v4 = XXH32_round(v4, XXH_get32bits(p)); p+=4; + } while (p < limit); + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + } else { + h32 = seed + PRIME32_5; + } + + h32 += (U32)len; + + return XXH32_finalize(h32, p, len&15, endian, align); +} + + +XXH_PUBLIC_API unsigned int XXH32 (const void* input, size_t len, unsigned int seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH32_state_t state; + XXH32_reset(&state, 
seed); + XXH32_update(&state, input, len); + return XXH32_digest(&state); +#else + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned); + else + return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); + } } + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned); + else + return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); +#endif +} + + + +/*====== Hash streaming ======*/ + +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) +{ + return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); +} +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, unsigned int seed) +{ + XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME32_1 + PRIME32_2; + state.v2 = seed + PRIME32_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME32_1; + /* do not write into reserved, planned to be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); + return XXH_OK; +} + + +FORCE_INLINE XXH_errorcode +XXH32_update_endian(XXH32_state_t* state, const void* input, size_t len, XXH_endianess endian) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const BYTE* p = (const BYTE*)input; + const BYTE* const bEnd = p + len; + + state->total_len_32 += (unsigned)len; + state->large_len |= (len>=16) | (state->total_len_32>=16); + + if (state->memsize + len < 16) { /* fill in tmp buffer */ + XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len); + state->memsize += (unsigned)len; + return XXH_OK; + } + + if (state->memsize) { /* some data left from previous update */ + XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize); + { const U32* p32 = state->mem32; + state->v1 = XXH32_round(state->v1, XXH_readLE32(p32, endian)); p32++; + state->v2 = XXH32_round(state->v2, XXH_readLE32(p32, endian)); p32++; + state->v3 = XXH32_round(state->v3, XXH_readLE32(p32, endian)); p32++; + state->v4 = XXH32_round(state->v4, XXH_readLE32(p32, endian)); + } + p += 16-state->memsize; + state->memsize = 0; + } + + if (p <= bEnd-16) { + const BYTE* const limit = bEnd - 16; + U32 v1 = state->v1; + U32 v2 = state->v2; + U32 v3 = state->v3; + U32 v4 = state->v4; + + do { + v1 = XXH32_round(v1, XXH_readLE32(p, endian)); p+=4; + v2 = XXH32_round(v2, XXH_readLE32(p, endian)); p+=4; + v3 = XXH32_round(v3, XXH_readLE32(p, endian)); p+=4; + v4 = XXH32_round(v4, XXH_readLE32(p, endian)); p+=4; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +XXH_PUBLIC_API XXH_errorcode XXH32_update 
(XXH32_state_t* state_in, const void* input, size_t len)
+{
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_update_endian(state_in, input, len, XXH_littleEndian);
+    else
+        return XXH32_update_endian(state_in, input, len, XXH_bigEndian);
+}
+
+
+FORCE_INLINE U32
+XXH32_digest_endian (const XXH32_state_t* state, XXH_endianess endian)
+{
+    U32 h32;
+
+    if (state->large_len) {
+        h32 = XXH_rotl32(state->v1, 1)
+            + XXH_rotl32(state->v2, 7)
+            + XXH_rotl32(state->v3, 12)
+            + XXH_rotl32(state->v4, 18);
+    } else {
+        h32 = state->v3 /* == seed */ + PRIME32_5;
+    }
+
+    h32 += state->total_len_32;
+
+    return XXH32_finalize(h32, state->mem32, state->memsize, endian, XXH_aligned);
+}
+
+
+XXH_PUBLIC_API unsigned int XXH32_digest (const XXH32_state_t* state_in)
+{
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_digest_endian(state_in, XXH_littleEndian);
+    else
+        return XXH32_digest_endian(state_in, XXH_bigEndian);
+}
+
+
+/*====== Canonical representation ======*/
+
+/*! Default XXH result types are basic unsigned 32 and 64 bits.
+* The canonical representation follows human-readable write convention, aka big-endian (large digits first).
+* These functions allow transformation of hash result into and from its canonical format.
+* This way, hash values can be written into a file or buffer, remaining comparable across different systems.
+*/
+
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);
+    memcpy(dst, &hash, sizeof(*dst));
+}
+
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
+{
+    return XXH_readBE32(src);
+}
+
+
+#ifndef XXH_NO_LONG_LONG
+
+/* *******************************************************************
+* 64-bit hash functions
+*********************************************************************/
+
+/*====== Memory access ======*/
+
+#ifndef MEM_MODULE
+# define MEM_MODULE
+# if !defined (__VMS) \
+   && (defined (__cplusplus) \
+   || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   include <stdint.h>
+    typedef uint64_t U64;
+# else
+    /* if compiler doesn't support unsigned long long, replace by another 64-bit type */
+    typedef unsigned long long U64;
+# endif
+#endif
+
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
+static U64 XXH_read64(const void* memPtr) { return *(const U64*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign64;
+static U64 XXH_read64(const void* ptr) { return ((const unalign64*)ptr)->u64; }
+
+#else
+
+/* portable and safe solution. Generally efficient.
+ * see : http://stackoverflow.com/a/32095106/646947 + */ + +static U64 XXH_read64(const void* memPtr) +{ + U64 val; + memcpy(&val, memPtr, sizeof(val)); + return val; +} + +#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */ + +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap64 _byteswap_uint64 +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap64 __builtin_bswap64 +#else +static U64 XXH_swap64 (U64 x) +{ + return ((x << 56) & 0xff00000000000000ULL) | + ((x << 40) & 0x00ff000000000000ULL) | + ((x << 24) & 0x0000ff0000000000ULL) | + ((x << 8) & 0x000000ff00000000ULL) | + ((x >> 8) & 0x00000000ff000000ULL) | + ((x >> 24) & 0x0000000000ff0000ULL) | + ((x >> 40) & 0x000000000000ff00ULL) | + ((x >> 56) & 0x00000000000000ffULL); +} +#endif + +FORCE_INLINE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align) +{ + if (align==XXH_unaligned) + return endian==XXH_littleEndian ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr)); + else + return endian==XXH_littleEndian ? *(const U64*)ptr : XXH_swap64(*(const U64*)ptr); +} + +FORCE_INLINE U64 XXH_readLE64(const void* ptr, XXH_endianess endian) +{ + return XXH_readLE64_align(ptr, endian, XXH_unaligned); +} + +static U64 XXH_readBE64(const void* ptr) +{ + return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr); +} + + +/*====== xxh64 ======*/ + +static const U64 PRIME64_1 = 11400714785074694791ULL; +static const U64 PRIME64_2 = 14029467366897019727ULL; +static const U64 PRIME64_3 = 1609587929392839161ULL; +static const U64 PRIME64_4 = 9650029242287828579ULL; +static const U64 PRIME64_5 = 2870177450012600261ULL; + +static U64 XXH64_round(U64 acc, U64 input) +{ + acc += input * PRIME64_2; + acc = XXH_rotl64(acc, 31); + acc *= PRIME64_1; + return acc; +} + +static U64 XXH64_mergeRound(U64 acc, U64 val) +{ + val = XXH64_round(0, val); + acc ^= val; + acc = acc * PRIME64_1 + PRIME64_4; + return acc; +} + +static U64 XXH64_avalanche(U64 h64) +{ + h64 ^= h64 >> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + return h64; +} + + +#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align) + +static U64 +XXH64_finalize(U64 h64, const void* ptr, size_t len, + XXH_endianess endian, XXH_alignment align) +{ + const BYTE* p = (const BYTE*)ptr; + +#define PROCESS1_64 \ + h64 ^= (*p++) * PRIME64_5; \ + h64 = XXH_rotl64(h64, 11) * PRIME64_1; + +#define PROCESS4_64 \ + h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1; \ + p+=4; \ + h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; + +#define PROCESS8_64 { \ + U64 const k1 = XXH64_round(0, XXH_get64bits(p)); \ + p+=8; \ + h64 ^= k1; \ + h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; \ +} + + switch(len&31) { + case 24: PROCESS8_64; + /* fallthrough */ + case 16: PROCESS8_64; + /* fallthrough */ + case 8: PROCESS8_64; + return XXH64_avalanche(h64); + + case 28: PROCESS8_64; + /* fallthrough */ + case 20: PROCESS8_64; + /* fallthrough */ + case 12: PROCESS8_64; + /* fallthrough */ + case 4: PROCESS4_64; + return XXH64_avalanche(h64); + + case 25: PROCESS8_64; + /* fallthrough */ + case 17: PROCESS8_64; + /* fallthrough */ + case 9: PROCESS8_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 29: PROCESS8_64; + /* fallthrough */ + case 21: PROCESS8_64; + /* fallthrough */ + case 13: PROCESS8_64; + /* fallthrough */ + case 5: PROCESS4_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 26: PROCESS8_64; + /* fallthrough */ + case 18: PROCESS8_64; + /* fallthrough */ + case 10: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + return 
XXH64_avalanche(h64); + + case 30: PROCESS8_64; + /* fallthrough */ + case 22: PROCESS8_64; + /* fallthrough */ + case 14: PROCESS8_64; + /* fallthrough */ + case 6: PROCESS4_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 27: PROCESS8_64; + /* fallthrough */ + case 19: PROCESS8_64; + /* fallthrough */ + case 11: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 31: PROCESS8_64; + /* fallthrough */ + case 23: PROCESS8_64; + /* fallthrough */ + case 15: PROCESS8_64; + /* fallthrough */ + case 7: PROCESS4_64; + /* fallthrough */ + case 3: PROCESS1_64; + /* fallthrough */ + case 2: PROCESS1_64; + /* fallthrough */ + case 1: PROCESS1_64; + /* fallthrough */ + case 0: return XXH64_avalanche(h64); + } + + /* impossible to reach */ + assert(0); + return 0; /* unreachable, but some compilers complain without it */ +} + +FORCE_INLINE U64 +XXH64_endian_align(const void* input, size_t len, U64 seed, + XXH_endianess endian, XXH_alignment align) +{ + const BYTE* p = (const BYTE*)input; + const BYTE* bEnd = p + len; + U64 h64; + +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + if (p==NULL) { + len=0; + bEnd=p=(const BYTE*)(size_t)32; + } +#endif + + if (len>=32) { + const BYTE* const limit = bEnd - 32; + U64 v1 = seed + PRIME64_1 + PRIME64_2; + U64 v2 = seed + PRIME64_2; + U64 v3 = seed + 0; + U64 v4 = seed - PRIME64_1; + + do { + v1 = XXH64_round(v1, XXH_get64bits(p)); p+=8; + v2 = XXH64_round(v2, XXH_get64bits(p)); p+=8; + v3 = XXH64_round(v3, XXH_get64bits(p)); p+=8; + v4 = XXH64_round(v4, XXH_get64bits(p)); p+=8; + } while (p<=limit); + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + + } else { + h64 = seed + PRIME64_5; + } + + h64 += (U64) len; + + return XXH64_finalize(h64, p, len, endian, align); +} + + +XXH_PUBLIC_API unsigned long long XXH64 (const void* input, size_t len, unsigned long long seed) +{ +#if 0 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_state_t state; + XXH64_reset(&state, seed); + XXH64_update(&state, input, len); + return XXH64_digest(&state); +#else + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */ + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned); + else + return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); + } } + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned); + else + return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); +#endif +} + +/*====== Hash Streaming ======*/ + +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) +{ + return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t* srcState) +{ + memcpy(dstState, srcState, sizeof(*dstState)); +} + +XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long long seed) +{ + 
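    /* Editor's note: a minimal streaming-use sketch of the
+     * reset/update/digest cycle (the `fd` and `buf` names are
+     * assumptions; error checks elided):
+     *
+     *   XXH64_state_t* s = XXH64_createState();
+     *   XXH64_reset(s, 0);
+     *   ssize_t n;
+     *   while ((n = read(fd, buf, sizeof(buf))) > 0)
+     *       XXH64_update(s, buf, (size_t)n);
+     *   unsigned long long h = XXH64_digest(s);
+     *   XXH64_freeState(s);
+     */
+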
XXH64_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)); + state.v1 = seed + PRIME64_1 + PRIME64_2; + state.v2 = seed + PRIME64_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME64_1; + /* do not write into reserved, planned to be removed in a future version */ + memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved)); + return XXH_OK; +} + +FORCE_INLINE XXH_errorcode +XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_endianess endian) +{ + if (input==NULL) +#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1) + return XXH_OK; +#else + return XXH_ERROR; +#endif + + { const BYTE* p = (const BYTE*)input; + const BYTE* const bEnd = p + len; + + state->total_len += len; + + if (state->memsize + len < 32) { /* fill in tmp buffer */ + XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len); + state->memsize += (U32)len; + return XXH_OK; + } + + if (state->memsize) { /* tmp buffer is full */ + XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize); + state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0, endian)); + state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1, endian)); + state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2, endian)); + state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3, endian)); + p += 32-state->memsize; + state->memsize = 0; + } + + if (p+32 <= bEnd) { + const BYTE* const limit = bEnd - 32; + U64 v1 = state->v1; + U64 v2 = state->v2; + U64 v3 = state->v3; + U64 v4 = state->v4; + + do { + v1 = XXH64_round(v1, XXH_readLE64(p, endian)); p+=8; + v2 = XXH64_round(v2, XXH_readLE64(p, endian)); p+=8; + v3 = XXH64_round(v3, XXH_readLE64(p, endian)); p+=8; + v4 = XXH64_round(v4, XXH_readLE64(p, endian)); p+=8; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* input, size_t len) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_update_endian(state_in, input, len, XXH_littleEndian); + else + return XXH64_update_endian(state_in, input, len, XXH_bigEndian); +} + +FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state, XXH_endianess endian) +{ + U64 h64; + + if (state->total_len >= 32) { + U64 const v1 = state->v1; + U64 const v2 = state->v2; + U64 const v3 = state->v3; + U64 const v4 = state->v4; + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + } else { + h64 = state->v3 /*seed*/ + PRIME64_5; + } + + h64 += (U64) state->total_len; + + return XXH64_finalize(h64, state->mem64, (size_t)state->total_len, endian, XXH_aligned); +} + +XXH_PUBLIC_API unsigned long long XXH64_digest (const XXH64_state_t* state_in) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH64_digest_endian(state_in, XXH_littleEndian); + else + return XXH64_digest_endian(state_in, XXH_bigEndian); +} + + +/*====== Canonical 
representation ======*/ + +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); + memcpy(dst, &hash, sizeof(*dst)); +} + +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src) +{ + return XXH_readBE64(src); +} + +#endif /* XXH_NO_LONG_LONG */ diff --git a/cviruntime/src/soc/180x/CMakeLists.txt b/cviruntime/src/soc/180x/CMakeLists.txt new file mode 100644 index 000000000..011a2590a --- /dev/null +++ b/cviruntime/src/soc/180x/CMakeLists.txt @@ -0,0 +1,29 @@ +cmake_minimum_required(VERSION 2.8.0) + +include_directories(./) +include_directories(../common) +include_directories(${CMAKE_SYSROOT}/include) +add_definitions(-DION_CACHE_OPEN) +add_definitions(-DMULTI_PROCESS) + +set(RUNTIME_SOURCES ${RUNTIME_SOURCES} + ${CMAKE_CURRENT_SOURCE_DIR}/../runtime_bmkernel.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/bmruntime_soc.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/bm_dmabuf.c + ${CMAKE_CURRENT_SOURCE_DIR}/tpu_pmu.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cvi_rt_180x.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cvi180x_device_mem.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../common/cvi_device_mem.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../common/cvi_rt_base.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../common/cviruntime_context.cpp) + +set(EXTRA_LIBS ${EXTRA_LIBS} rt dl pthread) + +add_library(cviruntime-static STATIC ${RUNTIME_SOURCES}) +set_property(TARGET cviruntime-static PROPERTY POSITION_INDEPENDENT_CODE ON) + +add_library(cviruntime SHARED ${RUNTIME_SOURCES}) +target_link_libraries(cviruntime cvikernel ${EXTRA_LIBS}) + +install(TARGETS cviruntime DESTINATION lib) +install(TARGETS cviruntime-static DESTINATION lib) diff --git a/cviruntime/src/soc/180x/bm_dmabuf.c b/cviruntime/src/soc/180x/bm_dmabuf.c new file mode 100644 index 000000000..a87dd8557 --- /dev/null +++ b/cviruntime/src/soc/180x/bm_dmabuf.c @@ -0,0 +1,532 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "bmruntime_internal.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask)) +#define ALIGN(x,a) __ALIGN_MASK(x,(__typeof__(x))(a)-1) + +#define BD_DESC_ALIGN_SIZE (1 << BDC_ENGINE_CMD_ALIGNED_BIT) +#define GDMA_DESC_ALIGN_SIZE (1 << TDMA_DESCRIPTOR_ALIGNED_BIT) +#define BD_EOD_PADDING_BYTES (128) + +typedef struct { + cmd_hdr_t hdr; + uint32_t body[0]; +} DESC; + +static bmerr_t traverse_start(uint8_t *cmdbuf, DESC **desc) +{ + if (!cmdbuf) { + TPU_LOG_WARNING("cmdbuf is null!\n"); + return BM_ERR_DATA; + } + DESC *tmp_desc = (DESC *)cmdbuf; + if (tmp_desc->hdr.magic != CMDBUF_HDR_MAGIC_180X) { + TPU_LOG_WARNING("traverse_start magic num error!\n"); + return BM_ERR_DATA; + } + *desc = tmp_desc; + return BM_SUCCESS; +} + +static bmerr_t traverse_next(DESC *desc, uint8_t *cmdbuf, size_t size, DESC **next_desc) { + DESC *tmp_next_desc = (DESC *)((uint8_t *)desc + cmd_hdr_len(&desc->hdr) + sizeof(cmd_hdr_t)); + if ((uint8_t *)tmp_next_desc >= cmdbuf + size) { + *next_desc = NULL; + return BM_SUCCESS; + } + if (tmp_next_desc->hdr.magic != CMDBUF_HDR_MAGIC_180X) { + TPU_LOG_WARNING("traverse_next magic num error!\n"); + return BM_ERR_DATA; + } + *next_desc = tmp_next_desc; + return BM_SUCCESS; +} + +static bmerr_t is_last_desc(DESC *desc, uint8_t *cmdbuf, size_t size, bool *is_last) { + DESC *next_desc; + bmerr_t ret = traverse_next(desc, cmdbuf, size, &next_desc); + if (ret != BM_SUCCESS) { + 
TPU_LOG_WARNING("is_last_desc traverse_next failed\n"); + return ret; + } + *is_last = next_desc ? false : true; + return BM_SUCCESS; +} + +static void reorder_bd_cmdbuf_reg(uint8_t *cmdbuf) +{ + int total_bits = BD_REG_BYTES * 8; + + for (int i = 0; i < total_bits; i += 128) + cmdbuf[(i + 128 - 8) / 8] |= (i / 128) << 4; + + uint8_t tmp[128 / 8]; + uint8_t *last = &cmdbuf[(total_bits - 128) / 8]; + memcpy(tmp, last, sizeof(tmp)); + memcpy(last, cmdbuf, sizeof(tmp)); + memcpy(cmdbuf, tmp, sizeof(tmp)); +} + +static void adjust_desc_tdma(uint32_t *body, bool eod) +{ + if (eod) { + body[0] |= (1 << TDMA_ACCPI0_EOD_BIT); + body[0] |= (1 << TDMA_ACCPI0_INTERRUPT_BIT); // interrupt + } + body[0] |= (1 << TDMA_ACCPI0_BARRIER_ENABLE_BIT); +} + +static void adjust_desc_bd(uint32_t *body, bool eod) +{ + if (eod) { + tiu_reg_t reg; + parse_tiu_reg(®, body); + reg.cmd_end = 1; + reg.cmd_intr_en = 1; + emit_tiu_reg(®, body); + } + reorder_bd_cmdbuf_reg((uint8_t *)body); +} + +bmerr_t cvi180x_dmabuf_relocate(uint8_t *dmabuf, uint64_t dmabuf_devaddr, uint32_t original_size, uint32_t pmubuf_size) +{ + dma_hdr_t *header = (dma_hdr_t *)dmabuf; + uint64_t tmpAddress = 0; + + if (header->dmabuf_magic_m != TPU_DMABUF_HEADER_M) { + TPU_LOG_WARNING("dmabuf relocate magic num error!\n"); + return BM_ERR_DATA; + } + cvi_cpu_desc_t *desc = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t)); + + for (uint32_t i = 0; i < header->cpu_desc_count; i++, desc++) { + uint32_t tiu_num = desc->num_tiu & 0xFFFF; + uint32_t tdma_num = desc->num_tdma & 0xFFFF; + + if (tiu_num) { + tmpAddress = dmabuf_devaddr + desc->offset_tiu; + //TPU_LOG_DEBUG("bd tmpAddress = 0x%lu\n", tmpAddress); + desc->offset_tiu_ori_bk = desc->offset_tiu; + desc->offset_tiu = tmpAddress >> BDC_ENGINE_CMD_ALIGNED_BIT; + } + + if (tdma_num) { + tmpAddress = dmabuf_devaddr + desc->offset_tdma; + //TPU_LOG_DEBUG("tdma tmpAddress = 0x%lu\n", tmpAddress); + desc->offset_tdma_ori_bk = desc->offset_tdma; + desc->offset_tdma = tmpAddress >> TDMA_DESCRIPTOR_ALIGNED_BIT; + } + + //set pmubuf_addr_p to enable pmu kick + header->pmubuf_size = pmubuf_size; + header->pmubuf_offset = original_size; + } + return BM_SUCCESS; +} + +static bmerr_t desc_sync_id(DESC *desc, uint32_t *sync_id) +{ + switch (desc->hdr.engine_id) { + case BMK1822_TIU: { + tiu_reg_t reg; + parse_tiu_reg(®, desc->body); + *sync_id = reg.cmd_id_tpu; + return BM_SUCCESS; + } + case BMK1822_TDMA: { + tdma_reg_t reg; + parse_tdma_reg(®, desc->body); + *sync_id = reg.cmd_id; + return BM_SUCCESS; + } + default: + return BM_ERR_DATA; + } +} + +static bmerr_t fill_header_and_arm(uint8_t *cmdbuf, size_t sz, uint8_t *dmabuf, uint64_t *tiu_offset, uint64_t *tdma_offset) +{ + dma_hdr_t header = {0}; + header.dmabuf_magic_m = TPU_DMABUF_HEADER_M; + header.dmabuf_magic_s = 0x1822; + bmerr_t ret = BM_SUCCESS; + + cvi_cpu_desc_t *segments = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t)); + DESC *desc = NULL; + size_t desc_nums[BMK1822_ENGINE_NUM] = {0}; + size_t counters[BMK1822_ENGINE_NUM] = {0}; + size_t desc_size[BMK1822_ENGINE_NUM] = {0}; + + if (!segments) { + TPU_LOG_WARNING("fill_header_and_arm segments is null\n"); + return BM_ERR_DATA; + } + // fill arm descs + ret = traverse_start(cmdbuf, &desc); + if (ret != BM_SUCCESS) { + TPU_LOG_WARNING("fill_header_and_arm traverse start failed\n"); + return ret; + } + + while (desc != NULL) { + uint32_t engine_id = (uint32_t)desc->hdr.engine_id; + counters[engine_id]++; + desc_nums[engine_id]++; + if (engine_id != BMK1822_CPU) { + // a new arm desc inserted to do 
sync operation + uint32_t sync_id = 0; + ret = desc_sync_id(desc, &sync_id); + if (ret != BM_SUCCESS) { + TPU_LOG_WARNING("fill_header_and_arm desc_sync_id failed\n"); + return ret; + } + bool is_last = false; + ret = is_last_desc(desc, cmdbuf, sz, &is_last); + if (ret != BM_SUCCESS) { + TPU_LOG_WARNING("fill_header_and_arm is_last_desc failed\n"); + return ret; + } + if (sync_id == 0xFFFF || is_last) { + desc_nums[BMK1822_CPU]++; + cvi_cpu_desc_t *arm = segments + desc_nums[BMK1822_CPU] - 1; + memset(arm, 0, sizeof(cvi_cpu_desc_t)); + arm->op_type = CPU_OP_SYNC; + arm->num_tiu = counters[BMK1822_TIU]; + arm->num_tdma = counters[BMK1822_TDMA]; + strncpy(arm->str, "layer_end", sizeof(arm->str) - 1); + if (counters[BMK1822_TIU] != 0) { + desc_size[BMK1822_TIU] = + ALIGN(desc_size[BMK1822_TIU] + counters[BMK1822_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES, + BD_DESC_ALIGN_SIZE); + } + counters[BMK1822_TIU] = 0; + counters[BMK1822_TDMA] = 0; + } + } else { + cvi_cpu_desc_t *arm = segments + desc_nums[BMK1822_CPU] - 1; + memcpy(arm, &(desc->body), sizeof(cvi_cpu_desc_t)); + arm->num_tiu = counters[BMK1822_TIU]; + arm->num_tdma = counters[BMK1822_TDMA]; + if (counters[BMK1822_TIU] != 0) { + desc_size[BMK1822_TIU] = + ALIGN(desc_size[BMK1822_TIU] + counters[BMK1822_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES, + BD_DESC_ALIGN_SIZE); + } + counters[BMK1822_TIU] = 0; + counters[BMK1822_TDMA] = 0; + } + ret = traverse_next(desc, cmdbuf, sz, &desc); + if (ret != BM_SUCCESS) { + TPU_LOG_WARNING("fill_header_and_arm traverse next failed\n"); + return ret; + } + } + desc_size[BMK1822_CPU] = desc_nums[BMK1822_CPU] * CPU_ENGINE_BYTES; + desc_size[BMK1822_TDMA] = desc_nums[BMK1822_TDMA] * GDMA_DESC_ALIGN_SIZE; + + (*tiu_offset) = ALIGN(sizeof(header) + desc_size[BMK1822_CPU], BD_DESC_ALIGN_SIZE); + (*tdma_offset) = ALIGN((*tiu_offset) + desc_size[BMK1822_TIU], GDMA_DESC_ALIGN_SIZE); + + // dma hdr + arm descs + bd descs + tdma descs + header.dmabuf_size = (*tdma_offset) + desc_size[BMK1822_TDMA]; + header.cpu_desc_count = desc_nums[BMK1822_CPU]; + header.bd_desc_count = desc_nums[BMK1822_TIU]; + header.tdma_desc_count = desc_nums[BMK1822_TDMA]; + + //TPU_LOG_DEBUG("header.dmabuf_size = %d\n", header.dmabuf_size); + // TPU_LOG_DEBUG("header.cpu_desc_count = %d\n", header.cpu_desc_count); + // TPU_LOG_DEBUG("header.bd_desc_count = %d\n", header.bd_desc_count); + // TPU_LOG_DEBUG("header.tdma_desc_count = %d\n", header.tdma_desc_count); + + memcpy(dmabuf, &header, sizeof(header)); + return BM_SUCCESS; +} + +static bmerr_t fill_bd_and_tdma(uint8_t *cmdbuf, size_t sz, uint8_t *dmabuf, uint64_t tiu_offset, uint64_t tdma_offset) +{ + dma_hdr_t *p_header = (dma_hdr_t *)dmabuf; + cvi_cpu_desc_t *segments = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t)); + DESC *desc = NULL; + bmerr_t ret = traverse_start(cmdbuf, &desc); + if (ret != BM_SUCCESS) { + TPU_LOG_WARNING("fill_bd_and_tdma traverse start failed\n"); + return ret; + } + //uint64_t address_max = 0x0; + + for (uint32_t i = 0; i < p_header->cpu_desc_count; i++) { + + cvi_cpu_desc_t *arm = segments + i; + + uint32_t tiu_num = arm->num_tiu & 0xFFFF; + uint32_t tdma_num = arm->num_tdma & 0xFFFF; + + if (tiu_num) { + tiu_offset = ALIGN(tiu_offset, 1 << BDC_ENGINE_CMD_ALIGNED_BIT); + arm->offset_tiu = tiu_offset; + //TPU_LOG_DEBUG("arm->offset_tiu = 0x%x \n", arm->offset_tiu); + } + + if (tdma_num) { + tdma_offset = ALIGN(tdma_offset, 1 << TDMA_DESCRIPTOR_ALIGNED_BIT); + arm->offset_tdma = tdma_offset; + //TPU_LOG_DEBUG("arm->offset_tdma = 0x%x \n", arm->offset_tdma); 
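+      /* Editor's note: the offsets assigned above follow the overall
+       * dmabuf layout computed by fill_header_and_arm():
+       *   [dma_hdr_t][CPU descs][TIU (BD) descs][TDMA descs]
+       * with each segment aligned to its engine's requirement. */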
+        }
+
+        while (tiu_num || tdma_num) {
+            uint32_t engine_id = (uint32_t)desc->hdr.engine_id;
+            void *p_body = NULL;
+
+            switch (engine_id) {
+            case BMK1822_TIU:
+                tiu_num--;
+                p_body = (void *)(dmabuf + tiu_offset);
+                tiu_offset += BD_REG_BYTES;
+                memcpy(p_body, desc->body, desc->hdr.len);
+                adjust_desc_bd((uint32_t *)p_body, tiu_num == 0);
+                break;
+            case BMK1822_TDMA:
+                tdma_num--;
+                tdma_offset = ALIGN(tdma_offset, GDMA_DESC_ALIGN_SIZE);
+                p_body = (void *)(dmabuf + tdma_offset);
+                tdma_offset += GDMA_DESC_ALIGN_SIZE;
+                memcpy(p_body, desc->body, desc->hdr.len);
+
+#if 0 //debug feature, for checking if neuron addresses overshoot
+{
+                tdma_reg_t reg_tdma = {0};
+                uint64_t tdma_address = 0, tdma_address2 = 0;
+
+                parse_tdma_reg(&reg_tdma, p_body);
+
+                if (reg_tdma.src_base_reg_sel == 0) {
+                    // reg.trans_dir = 2; // 0:tg2l, 1:l2tg, 2:g2g, 3:l2l
+                    if (reg_tdma.trans_dir == 0) {
+                        TPU_LOG_DEBUG ("src_base_addr_high=%x, src_base_addr_low=%x\n", reg_tdma.src_base_addr_high, reg_tdma.src_base_addr_low);
+                        tdma_address = ((uint64_t)reg_tdma.src_base_addr_high) << 32 | (uint64_t)reg_tdma.src_base_addr_low;
+                    } else if (reg_tdma.trans_dir == 1) {
+                        TPU_LOG_DEBUG ("dst_base_addr_high=%x, dst_base_addr_low=%x\n", reg_tdma.dst_base_addr_high, reg_tdma.dst_base_addr_low);
+                        tdma_address = ((uint64_t)reg_tdma.dst_base_addr_high) << 32 | (uint64_t)reg_tdma.dst_base_addr_low;
+                    } else if (reg_tdma.trans_dir == 2) {
+                        TPU_LOG_DEBUG ("dst_base_addr_high=%x, dst_base_addr_low=%x\n", reg_tdma.dst_base_addr_high, reg_tdma.dst_base_addr_low);
+                        tdma_address = ((uint64_t)reg_tdma.src_base_addr_high) << 32 | (uint64_t)reg_tdma.src_base_addr_low;
+                        tdma_address2 = ((uint64_t)reg_tdma.dst_base_addr_high) << 32 | (uint64_t)reg_tdma.dst_base_addr_low;
+
+                        if (tdma_address2 > tdma_address) {
+                            tdma_address = tdma_address2;
+                        }
+                    }
+
+                    if (tdma_address > address_max) {
+                        address_max = tdma_address;
+                        TPU_LOG_DEBUG("address_max=%llx\n", address_max);
+                    }
+                }
+}
+#endif
+                adjust_desc_tdma((uint32_t *)p_body, tdma_num == 0);
+                break;
+            default:
+                break;
+            }
+            ret = traverse_next(desc, cmdbuf, sz, &desc);
+            if (ret != BM_SUCCESS) {
+                TPU_LOG_WARNING("fill_bd_and_tdma traverse next failed\n");
+                return ret;
+            }
+        }
+
+        // pad zeros after EOD to work around a hardware bug
+        if (arm->num_tiu & 0xFFFF) {
+            void *buf = (void *)(dmabuf + tiu_offset);
+            memset(buf, 0, BD_EOD_PADDING_BYTES);
+            tiu_offset += BD_EOD_PADDING_BYTES;
+        }
+    }
+    return BM_SUCCESS;
+}
+
+bmerr_t cvi180x_dmabuf_convert(uint8_t *cmdbuf, size_t sz, uint8_t *dmabuf)
+{
+    uint64_t tiu_offset = 0;
+    uint64_t tdma_offset = 0;
+    bmerr_t ret = BM_SUCCESS;
+    ret = fill_header_and_arm(cmdbuf, sz, dmabuf, &tiu_offset, &tdma_offset);
+    if (ret != BM_SUCCESS) {
+        TPU_LOG_WARNING("dmabuf_convert fill_header_and_arm failed\n");
+        return ret;
+    }
+    ret = fill_bd_and_tdma(cmdbuf, sz, dmabuf, tiu_offset, tdma_offset);
+    if (ret != BM_SUCCESS) {
+        TPU_LOG_WARNING("dmabuf_convert fill_bd_and_tdma failed\n");
+        return ret;
+    }
+    return BM_SUCCESS;
+}
+
+#define PER_DES_SIZE 16
+#define PADDING_SIZE (1024 * 1024)
+bmerr_t cvi180x_dmabuf_size(uint8_t *cmdbuf, size_t sz, size_t *psize, size_t *pmu_size)
+{
+    size_t tdma_desc_num = 0;
+    size_t counters[BMK1822_ENGINE_NUM] = {0};
+    size_t bd_size = 0;
+    size_t dmabuf_size = 0;
+
+    uint32_t tiu_cnt = 0;
+    uint32_t tdma_cnt = 0;
+    bmerr_t ret = BM_SUCCESS;
+
+    // calculate desc numbers
+    DESC *desc = NULL;
+    ret = traverse_start(cmdbuf, &desc);
+    if (ret != BM_SUCCESS) {
+        TPU_LOG_WARNING("dmabuf_size traverse start failed\n");
+        return ret;
+
} + + while (desc != NULL) { + uint32_t engine_id = (uint32_t)desc->hdr.engine_id; + counters[engine_id]++; + if (engine_id != BMK1822_CPU) { + // a new arm desc inserted to do sync operation + uint32_t sync_id = 0; + ret = desc_sync_id(desc, &sync_id); + if (ret != BM_SUCCESS) { + TPU_LOG_WARNING("dmabuf_size desc_sync_id failed\n"); + return ret; + } + bool is_last = false; + ret = is_last_desc(desc, cmdbuf, sz, &is_last); + if (ret != BM_SUCCESS) { + TPU_LOG_WARNING("dmabuf_size is_last_desc failed\n"); + return ret; + } + if (sync_id == 0xFFFF || is_last) { + counters[BMK1822_CPU]++; + tdma_desc_num += counters[BMK1822_TDMA]; + if (counters[BMK1822_TIU] != 0) { + bd_size = ALIGN(bd_size + counters[BMK1822_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES, + BD_DESC_ALIGN_SIZE); + } + tiu_cnt += counters[BMK1822_TIU] & 0xFFFF; + tdma_cnt += counters[BMK1822_TDMA] & 0xFFFF; + counters[BMK1822_TIU] = 0; + counters[BMK1822_TDMA] = 0; + } + } else { + tdma_desc_num += counters[BMK1822_TDMA]; + if (counters[BMK1822_TIU] != 0) { + bd_size = ALIGN(bd_size + counters[BMK1822_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES, + BD_DESC_ALIGN_SIZE); + } + tiu_cnt += counters[BMK1822_TIU] & 0xFFFF; + tdma_cnt += counters[BMK1822_TDMA] & 0xFFFF; + counters[BMK1822_TIU] = 0; + counters[BMK1822_TDMA] = 0; + } + ret = traverse_next(desc, cmdbuf, sz, &desc); + if (ret != BM_SUCCESS) { + TPU_LOG_WARNING("dmabuf_size traverse next failed\n"); + return ret; + } + } + // dma hdr + arm descs + bd descs + tdma descs + dmabuf_size = sizeof(dma_hdr_t) + counters[BMK1822_CPU] * CPU_ENGINE_BYTES; + dmabuf_size = ALIGN(dmabuf_size, BD_DESC_ALIGN_SIZE) + bd_size; + dmabuf_size = ALIGN(dmabuf_size, GDMA_DESC_ALIGN_SIZE) + tdma_desc_num * GDMA_DESC_ALIGN_SIZE; + + *psize = dmabuf_size; + + *pmu_size = ALIGN((tiu_cnt + tdma_cnt) * PER_DES_SIZE + PADDING_SIZE, 0x1000); + return BM_SUCCESS; +} + +bmerr_t cvi180x_arraybase_set(uint8_t *dmabuf, uint32_t arraybase0L, uint32_t arraybase1L, uint32_t arraybase0H, uint32_t arraybase1H) +{ + if (!dmabuf) { + TPU_LOG_WARNING("arraybase_set dmabuf is null\n"); + return BM_ERR_DATA; + } + dma_hdr_t *header = (dma_hdr_t *)dmabuf; + + if (header->dmabuf_magic_m != TPU_DMABUF_HEADER_M) { + TPU_LOG_WARNING("arraybase_set magic num error!\n"); + return BM_ERR_DATA; + } + header->arraybase_0_L = arraybase0L; + header->arraybase_1_L = arraybase1L; + header->arraybase_0_H = arraybase0H; + header->arraybase_1_H = arraybase1H; + return BM_SUCCESS; +} + +bmerr_t cvi180x_get_pmusize(uint8_t * dmabuf, uint64_t *pmu_size) +{ + uint32_t tiu_cnt = 0, tdma_cnt = 0; + uint64_t tmp_pmu_size = 0; + + if (!dmabuf) { + TPU_LOG_WARNING("get_pmusize dmabuf is null\n"); + return BM_ERR_DATA; + } + dma_hdr_t *header = (dma_hdr_t *)dmabuf; + if (header->dmabuf_magic_m != TPU_DMABUF_HEADER_M) { + TPU_LOG_WARNING("get_pmusize magic num error!\n"); + return BM_ERR_DATA; + } + + cvi_cpu_desc_t *desc = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t)); + + for (uint32_t i = 0; i < header->cpu_desc_count; i++, desc++) { + tiu_cnt += (desc->num_tiu & 0xFFFF); + tdma_cnt += (desc->num_tdma & 0xFFFF); + } + + tmp_pmu_size = ALIGN((tiu_cnt + tdma_cnt) * PER_DES_SIZE + PADDING_SIZE, 0x1000); + //TPU_LOG_DEBUG("cvi180x_get_pmusize pmusize= %" PRIu64 " \n", pmu_size); + *pmu_size = tmp_pmu_size; + return BM_SUCCESS; +} + +void cvi180x_dmabuf_dump(uint8_t *dmabuf) +{ + TPU_ASSERT(dmabuf, NULL); + dma_hdr_t *header = (dma_hdr_t *)dmabuf; + TPU_LOG_DEBUG("bmk1822_dmabuf_dump header->arraybase_0_L = 0x%x\n", header->arraybase_0_L); + 
TPU_LOG_DEBUG("bmk1822_dmabuf_dump header->arraybase_1_L = 0x%x\n", header->arraybase_1_L); + TPU_LOG_DEBUG("bmk1822_dmabuf_dump header->arraybase_0_H = 0x%x\n", header->arraybase_0_H); + TPU_LOG_DEBUG("bmk1822_dmabuf_dump header->arraybase_1_H = 0x%x\n", header->arraybase_1_H); + TPU_LOG_DEBUG("bmk1822_dmabuf_dump header->pmubuf_offset = 0x%x\n", header->pmubuf_offset); + + TPU_ASSERT(header->dmabuf_magic_m == TPU_DMABUF_HEADER_M, NULL); + cvi_cpu_desc_t *desc = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t)); + + for (u32 i = 0; i < header->cpu_desc_count; i++, desc++) { + int bd_num = desc->num_tiu & 0xFFFF; + int tdma_num = desc->num_tdma & 0xFFFF; + u32 bd_offset = desc->offset_tiu; + u32 tdma_offset = desc->offset_tdma; + TPU_LOG_DEBUG("cvi180x_dmabuf_dump num, offset<0x%08x, 0x%08x>\n", bd_num, tdma_num, bd_offset, tdma_offset); + } +} + +#ifdef __cplusplus +} +#endif + diff --git a/cviruntime/src/soc/180x/bmruntime_internal.h b/cviruntime/src/soc/180x/bmruntime_internal.h new file mode 100644 index 000000000..3508c8f7d --- /dev/null +++ b/cviruntime/src/soc/180x/bmruntime_internal.h @@ -0,0 +1,32 @@ +#ifndef _BM_RUNTIME_INTERNAL_H_ +#define _BM_RUNTIME_INTERNAL_H_ + +#include +#include +#include +#include +#include "cvitpu_debug.h" +#include +#include "bm_types.h" + +#ifdef __cplusplus + extern "C" { +#endif + +bmerr_t cvi180x_dmabuf_size(uint8_t *cmdbuf, size_t sz, size_t *psize, size_t *pmu_size); +bmerr_t cvi180x_dmabuf_relocate(uint8_t *dmabuf, uint64_t dmabuf_devaddr, uint32_t original_size, uint32_t pmubuf_size); +bmerr_t cvi180x_dmabuf_convert(uint8_t *cmdbuf, size_t sz, uint8_t *dmabuf); +void cvi180x_dmabuf_dump(uint8_t * dmabuf); +bmerr_t cvi180x_arraybase_set(uint8_t *dmabuf, uint32_t arraybase0L, uint32_t arraybase1L, uint32_t arraybase0H, uint32_t arraybase1H); +bmerr_t cvi180x_get_pmusize(uint8_t * dmabuf, uint64_t *pmu_size); + +uint32_t tpu_pmu_dump_main(uint8_t *v_dma_buf, uint64_t p_dma_buf); + +#define TPU_PMUBUF_SIZE (1024 * 1024 * 2) +#define TPU_DMABUF_HEADER_M 0xB5B5 + +#ifdef __cplusplus +} +#endif + +#endif /* _BM_RUNTIME_INTERNAL_H_ */ diff --git a/cviruntime/src/soc/180x/bmruntime_soc.cpp b/cviruntime/src/soc/180x/bmruntime_soc.cpp new file mode 100644 index 000000000..9827a3ff2 --- /dev/null +++ b/cviruntime/src/soc/180x/bmruntime_soc.cpp @@ -0,0 +1,173 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "bmruntime_internal.h" +#include "cvi180x_device_mem.h" + +Cvi180xDeviceMem cvi_device; + +bmmem_device_t bmmem_device_alloc_raw(bmctx_t ctx, size_t size) { + return cvi_device.mem_alloc_raw(ctx, size); +} + +bmmem_device_t bmmem_device_prealloc_raw(bmctx_t ctx, bmmem_device_t mem, uint64_t offset, + size_t size) { + return cvi_device.mem_prealloc_raw(ctx, mem, offset, size); +} + +void bmmem_device_free(bmctx_t ctx, bmmem_device_t mem) { + cvi_device.mem_free_raw(ctx, mem); +} + +void bmmem_device_free_ex(uint64_t p_addr) { + cvi_device.mem_free_ex(p_addr); +} + +size_t bmmem_device_size(bmmem_device_t mem) { + return cvi_device.mem_size(mem); +} + +uint64_t bmmem_device_addr(bmmem_device_t mem) { + return cvi_device.mem_p_addr(mem); +} + +uint8_t *bmmem_device_v_addr(bmmem_device_t mem) { + return cvi_device.mem_v_addr(mem); +} + +int32_t bmmem_device_inc_ref(bmmem_device_t mem) { + return cvi_device.mem_inc_ref(mem); +} + +int32_t bmmem_device_dec_ref(bmmem_device_t mem) { + return cvi_device.mem_dec_ref(mem); +} + +bmerr_t bm_memcpy_s2d(bmctx_t ctx, bmmem_device_t dst, uint8_t *src) { + return 
cvi_device.mem_memcpy_s2d(ctx, dst, src); +} + +bmerr_t bm_memcpy_s2d_ex(bmctx_t ctx, bmmem_device_t dst, uint8_t *src, uint64_t offset, + size_t size) { + return cvi_device.mem_memcpy_s2d_ex(ctx, dst, src, offset, size); +} + +bmerr_t bm_memcpy_d2s(bmctx_t ctx, uint8_t *dst, bmmem_device_t src) { + return cvi_device.mem_memcpy_d2s(ctx, dst, src); +} + +bmerr_t bm_memcpy_d2s_ex(bmctx_t ctx, uint8_t *dst, bmmem_device_t src, uint64_t offset, + size_t size) { + return cvi_device.mem_memcpy_d2s_ex(ctx, dst, src, offset, size); +} + +bmerr_t bm_context_create(bmctx_t *ctx) { + return cvi_device.context_create(ctx); +} + +bmerr_t bm_bind_device(bmctx_t ctx, bmdev_t dev) { + return cvi_device.bind_device(ctx, dev); +} + +void bm_unbind_device(bmctx_t ctx) { + return cvi_device.unbind_device(ctx); +} + +bmdev_t bm_get_device(bmctx_t ctx) { + return cvi_device.get_device(ctx); +} + +bmerr_t bm_init(int index, bmctx_t *ctx) { + return cvi_device.device_init(index, ctx); +} + +void bm_exit(bmctx_t ctx) { + cvi_device.device_exit(ctx); +} + +bmerr_t bm_load_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + uint64_t neuron_gaddr, uint64_t weight_gaddr, + bool enable_pmu, bmmem_device_t *cmdbuf_mem) { + return cvi_device.load_cmdbuf(ctx, cmdbuf, sz, neuron_gaddr, + weight_gaddr, enable_pmu, cmdbuf_mem); +} + +bmerr_t cvi_load_cmdbuf_tee(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + uint64_t neuron_gaddr, uint64_t weight_gaddr, uint32_t weight_len, bmmem_device_t *cmdbuf_mem) +{ + return cvi_device.load_cmdbuf_tee(ctx, cmdbuf, sz, neuron_gaddr, + weight_gaddr, weight_len, cmdbuf_mem); +} + +bmerr_t cvi_run_cmdbuf_tee(bmctx_t ctx, uint16_t *seq_no, uint64_t dmabuf_addr, cvi_array_base *array_base) +{ + return cvi_device.run_cmdbuf_tee(ctx, seq_no, dmabuf_addr, array_base); +} + +bmerr_t bm_run_cmdbuf(bmctx_t ctx, bmmem_device_t cmdbuf_mem, uint16_t *seq_no) { + return cvi_device.run_cmdbuf(ctx, cmdbuf_mem, seq_no); +} + +bmerr_t bm_run_cmdbuf_ex(bmctx_t ctx, bmmem_device_t cmdbuf_mem, uint16_t *seq_no, + uint64_t input_base_addr, uint64_t output_base_addr) { + return cvi_device.run_cmdbuf_ex(ctx, cmdbuf_mem, seq_no, input_base_addr, output_base_addr); +} + +bmerr_t bm_run_cmdbuf_ex2(bmctx_t ctx, bmmem_device_t cmdbuf_mem, uint16_t *seq_no, + cvi_array_base *p_array_base) { + return cvi_device.run_cmdbuf_ex2(ctx, cmdbuf_mem, seq_no, p_array_base); +} + +bmerr_t cvi_run_async(bmctx_t ctx, bmmem_device_t cmdbuf_mem) +{ + return cvi_device.run_async(ctx, cmdbuf_mem); +} + +bmerr_t bm_send_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, uint16_t *seq_no) { + return cvi_device.send_cmdbuf(ctx, cmdbuf, sz, seq_no); +} + +bmerr_t bm_wait_cmdbuf_done(bmctx_t ctx, uint16_t seq_no) { + return cvi_device.wait_cmdbuf_done(ctx, seq_no); +} + +bmerr_t cvi_wait_cmdbuf_all(bmctx_t ctx) { + return cvi_device.wait_cmdbuf_all(ctx); +} + +bmerr_t bm_run_cmdbuf_pio(bmctx_t ctx, uint8_t *cmdbuf, size_t sz) { + return cvi_device.run_cmdbuf_pio(ctx, cmdbuf, sz); +} + +void bm_device_set_base_reg(bmctx_t ctx, uint32_t inx, uint64_t addr) { + cvi_device.set_base_reg(ctx, inx, addr); +} + +uint64_t bm_device_read_base_reg(bmctx_t ctx, u32 inx) { + return cvi_device.read_base_reg(ctx, inx); +} + +int bm_device_get_chip_ver(bmdev_t dev) { + return cvi_device.get_chip_ver(dev); +} + +bmerr_t bm_parse_pmubuf(bmmem_device_t cmdbuf_mem, uint8_t **buf_start, uint32_t *buf_len) { + return cvi_device.parse_pmubuf(cmdbuf_mem, buf_start, buf_len); +} + +void cviruntime_cvikernel_create(bmctx_t ctx, void **p_bk_ctx) { + 
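  /* Editor's note: like every entry point in this file, this is a thin
+   * C-ABI wrapper forwarding to the file-scope Cvi180xDeviceMem instance
+   * (cvi_device); the actual device logic lives in that class. */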
cvi_device.cvikernel_create(ctx, p_bk_ctx);
+}
+
+void cviruntime_cvikernel_submit(bmctx_t ctx) {
+  cvi_device.cvikernel_submit(ctx);
+}
+
+void cviruntime_cvikernel_destroy(bmctx_t ctx) {
+  cvi_device.cvikernel_destroy(ctx);
+}
diff --git a/cviruntime/src/soc/180x/cvi180x_device_mem.cpp b/cviruntime/src/soc/180x/cvi180x_device_mem.cpp
new file mode 100644
index 000000000..f8b66ada1
--- /dev/null
+++ b/cviruntime/src/soc/180x/cvi180x_device_mem.cpp
@@ -0,0 +1,291 @@
+#include <fcntl.h>
+#include <unistd.h>
+#include <cstdlib>
+#include <cstring>
+#include "cvi180x_device_mem.h"
+
+Cvi180xDeviceMem::Cvi180xDeviceMem() {
+  GLOBAL_MEM_START_ADDR = 0x00;
+  g_gmem_size = 1ULL << 30; // 1GB
+  tpu_dmabuf_header_m = 0xB5B5;
+}
+
+Cvi180xDeviceMem::~Cvi180xDeviceMem() {}
+
+
+bmerr_t Cvi180xDeviceMem::device_open(int index, bmdev_t *dev)
+{
+  bm_device_t *pdev = new bm_device_t;
+
+  BMDEV_LOCK_INIT(pdev);
+  pdev->index = index;
+  pdev->info.info182x = bmk1822_chip_info();
+  pdev->gmem_size = g_gmem_size;
+
+  const char* tpu_dev_name_default = TPU_DEV_NAME;
+  const char* tpu_dev_name_env = std::getenv("TPU_DEV");
+  const char *tpu_dev_name = tpu_dev_name_default;
+  if (tpu_dev_name_env) {
+    tpu_dev_name = tpu_dev_name_env;
+  }
+
+  pdev->dev_fd = open(tpu_dev_name, O_RDWR);
+  if (pdev->dev_fd <= 0) {
+    TPU_LOG_WARNING("open %s failed\n", tpu_dev_name);
+    return BM_ERR_FAILURE;
+  }
+
+  pdev->ion_fd = open(ION_DEV_NAME, O_RDWR);
+  if (pdev->ion_fd <= 0) {
+    TPU_LOG_WARNING("open %s failed\n", ION_DEV_NAME);
+    return BM_ERR_FAILURE;
+  }
+
+  int ret = ion_query_heap(pdev);
+  TPU_ASSERT(ret == BM_SUCCESS, NULL);
+
+  *dev = pdev;
+
+  return BM_SUCCESS;
+}
+
+void Cvi180xDeviceMem::device_close(bmdev_t dev)
+{
+  close(dev->ion_fd);
+  close(dev->dev_fd);
+
+  // TPU_LOG_WARNING("device[%d] closed\n", dev->index);
+
+  BMDEV_LOCK_DEINIT(dev);
+  delete dev;
+}
+
+int Cvi180xDeviceMem::get_chip_ver(bmdev_t dev) {
+  return dev->info.info182x.version;
+}
+
+void Cvi180xDeviceMem::mem_free_raw(bmctx_t ctx, bmmem_device_t mem) {
+  char array_got = 0;
+  bm_memory_t *device_mem = (bm_memory_t *)mem;
+  TPU_ASSERT(device_mem->flags.u.type == BMMEM_TYPE_DEVICE, NULL);
+
+  if (!device_mem->flags.u.is_prealloc) {
+    mem_free(device_mem->v_addr, device_mem->size, device_mem->dma_fd);
+
+    for (int i = 0; i < MEMARRAY_MAX_CNT; i ++) {
+      if (ctx->root_mem_array[i].p_addr == device_mem->p_addr) {
+        ctx->root_mem_array[i].p_addr = 0;
+        ctx->root_mem_array[i].mem = NULL;
+        array_got = 1;
+        break;
+      }
+    }
+
+    if (!array_got)
+      TPU_LOG_WARNING("bmmem_device_free() cannot find match\n");
+  }
+
+  BMEMEM_DUMP();
+  delete device_mem;
+}
+
+bmerr_t Cvi180xDeviceMem::load_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz,
+                                      uint64_t neuron_gaddr, uint64_t weight_gaddr,
+                                      bool enable_pmu, bmmem_device_t *cmdbuf_mem) {
+  bmerr_t ret;
+  size_t dmabuf_size = 0;
+  size_t pmubuf_size = 0;
+  bmmem_device_t dmabuf_mem;
+
+  ret = cvi180x_dmabuf_size(cmdbuf, sz, &dmabuf_size, &pmubuf_size);
+  if (ret != BM_SUCCESS) {
+    TPU_LOG_WARNING("load_cmdbuf dmabuf_size fail\n");
+    return ret;
+  }
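+  /*
+   * Illustrative note (layout drawing is an editor's sketch, not vendor docs):
+   * cvi180x_dmabuf_size() returns both the converted dmabuf size and a
+   * worst-case PMU record area. When the PMU is enabled, both regions are
+   * carved from one contiguous device allocation,
+   *
+   *   base                            base + dmabuf_size
+   *   | dma_hdr | cpu/tiu/tdma descs  | PMU records ... |
+   *
+   * so the pmubuf offset recorded later by cvi180x_dmabuf_relocate() is
+   * simply the original dmabuf size.
+   */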
+  //calculate pmu size
+  pmubuf_size = enable_pmu ? pmubuf_size : 0;
+  //TPU_LOG_DEBUG("pmubuf_size = 0x%lx\n", pmubuf_size);
+  if (protect) {
+    dmabuf_mem = mem_alloc_pagesize(ctx, dmabuf_size + pmubuf_size);
+  } else {
+    dmabuf_mem = mem_alloc_raw(ctx, dmabuf_size + pmubuf_size);
+  }
+  uint64_t dmabuf_devaddr = mem_p_addr(dmabuf_mem);
+
+  ret = cvi180x_dmabuf_convert(cmdbuf, sz, dmabuf_mem->v_addr);
+  if (ret != BM_SUCCESS) {
+    TPU_LOG_WARNING("load_cmdbuf dmabuf_convert fail\n");
+    mem_free_raw(ctx, dmabuf_mem);
+    return ret;
+  }
+  set_base_reg(ctx, 0, neuron_gaddr);
+  set_base_reg(ctx, 1, weight_gaddr);
+  ret = cvi180x_arraybase_set(dmabuf_mem->v_addr, (u32)neuron_gaddr, (u32)weight_gaddr, 0, 0);
+  if (ret != BM_SUCCESS) {
+    TPU_LOG_WARNING("load_cmdbuf arraybase_set fail\n");
+    mem_free_raw(ctx, dmabuf_mem);
+    return ret;
+  }
+
+  ret = cvi180x_dmabuf_relocate(dmabuf_mem->v_addr, dmabuf_devaddr, dmabuf_size, pmubuf_size);
+  if (ret != BM_SUCCESS) {
+    TPU_LOG_WARNING("load_cmdbuf relocate fail\n");
+    mem_free_raw(ctx, dmabuf_mem);
+    return ret;
+  }
+
+  ret = mem_flush_ext(ctx->dev, dmabuf_mem->dma_fd, dmabuf_mem->p_addr, dmabuf_size);
+  if (ret != BM_SUCCESS) {
+    TPU_LOG_WARNING("load_cmdbuf flush_ext fail\n");
+    mem_free_raw(ctx, dmabuf_mem);
+    return ret;
+  }
+
+  // record dmabuf crc32
+  // dmabuf_mem->crc32 = bm_crc32(dmabuf, dmabuf_size);
+  *cmdbuf_mem = dmabuf_mem;
+
+  // if (0) {
+  //   cvi180x_dmabuf_dump(dmabuf);
+  //}
+  return ret;
+}
+
+bmerr_t Cvi180xDeviceMem::load_dmabuf(bmctx_t ctx, bmmem_device_t dmabuf,
+                                      size_t sz, uint64_t neuron_gaddr, uint64_t weight_gaddr,
+                                      bool enable_pmu, bmmem_device_t *dmabuf_mem) {
+  size_t pmubuf_size = 0;
+  bmerr_t ret = BM_SUCCESS;
+  if (enable_pmu) {
+    ret = cvi180x_get_pmusize(dmabuf->v_addr, &pmubuf_size);
+    if (ret != BM_SUCCESS) {
+      TPU_LOG_WARNING("load_dmabuf get_pmusize fail\n");
+      return ret;
+    }
+    *dmabuf_mem = mem_alloc_raw(ctx, sz + pmubuf_size);
+    if (*dmabuf_mem == nullptr) {
+      TPU_LOG_ERROR("alloc dmabuf mem fail!\n");
+      return BM_ERR_FAILURE;
+    }
+    std::memcpy((*dmabuf_mem)->v_addr, dmabuf->v_addr, sz);
+  } else {
+    *dmabuf_mem = dmabuf;
+  }
+  uint64_t dmabuf_devaddr = mem_p_addr(*dmabuf_mem);
+
+  //set_base_reg(ctx, 0, neuron_gaddr);
+  //set_base_reg(ctx, 1, weight_gaddr);
+  ret = cvi180x_arraybase_set((*dmabuf_mem)->v_addr, (u32)neuron_gaddr, (u32)weight_gaddr, 0, 0);
+  if (ret != BM_SUCCESS) {
+    if (enable_pmu) {
+      mem_free_raw(ctx, *dmabuf_mem);
+    }
+    TPU_LOG_WARNING("load_dmabuf arraybase_set fail\n");
+    return ret;
+  }
+
+  ret = cvi180x_dmabuf_relocate((*dmabuf_mem)->v_addr, dmabuf_devaddr, sz,
+                                pmubuf_size);
+  if (ret != BM_SUCCESS) {
+    if (enable_pmu) {
+      mem_free_raw(ctx, *dmabuf_mem);
+    }
+    TPU_LOG_WARNING("load_dmabuf dmabuf_relocate fail\n");
+    return ret;
+  }
+  ret = mem_flush_ext(ctx->dev, (*dmabuf_mem)->dma_fd, (*dmabuf_mem)->p_addr, sz);
+  if (ret != BM_SUCCESS) {
+    if (enable_pmu) {
+      mem_free_raw(ctx, *dmabuf_mem);
+    }
+    TPU_LOG_WARNING("load_dmabuf flush_ext fail\n");
+    return ret;
+  }
+
+  // if (0) {
+  //   cvi180x_dmabuf_dump(dmabuf);
+  //}
+  return BM_SUCCESS;
+}
+
+
+bmerr_t Cvi180xDeviceMem::load_cmdbuf_tee(bmctx_t ctx, uint8_t *cmdbuf, size_t sz,
+                                          uint64_t neuron_gaddr, uint64_t weight_gaddr,
+                                          uint32_t weight_len, bmmem_device_t *cmdbuf_mem)
+{
+  //bmerr_t ret;
+  bmmem_device_t dmabuf_mem;
+
+  //malloc double size buffer, because TEE needs 2nd space to calculate dmabuf
+  dmabuf_mem = mem_alloc_raw(ctx, sz + sz);
+
+  //transfer encrypted cmdbuf to TEE
+  memcpy(dmabuf_mem->v_addr, cmdbuf, sz);
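+  /*
+   * Editor's sketch of the 2x allocation above (illustrative arithmetic only):
+   *
+   *   size_t total = sz + sz;  // [0, sz) ciphertext in, [sz, 2*sz) TEE scratch
+   *
+   * the second half gives the secure world room to build the relocated dmabuf
+   * next to the encrypted input; the exact split is owned by the TEE and
+   * nothing in REE reads that region after load_tee() returns.
+   */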
+  TPU_ASSERT((int)mem_flush_ext(ctx->dev, dmabuf_mem->dma_fd,
+             dmabuf_mem->p_addr, sz) == BM_SUCCESS, NULL);
+
+  //ioctl to get secure dma buffer
+  load_tee(ctx, dmabuf_mem->p_addr, sz, weight_gaddr, weight_len, neuron_gaddr);
+
+  //this region should be protected, can't be touched in REE
+  *cmdbuf_mem = dmabuf_mem;
+  return BM_SUCCESS;
+}
+
+bmerr_t Cvi180xDeviceMem::unload_tee(bmctx_t ctx, uint64_t paddr, size_t size) {
+  TPU_ASSERT(0, NULL); // not supported
+  return BM_SUCCESS;
+}
+
+bmerr_t Cvi180xDeviceMem::parse_pmubuf(bmmem_device_t cmdbuf_mem, uint8_t **buf_start, uint32_t *buf_len) {
+  dma_hdr_t *header = (dma_hdr_t *)(cmdbuf_mem->v_addr);
+  //TPU_LOG_DEBUG("header->arraybase_0_L = 0x%x\n", header->arraybase_0_L);
+  //TPU_LOG_DEBUG("header->arraybase_1_L = 0x%x\n", header->arraybase_1_L);
+  //TPU_LOG_DEBUG("header->arraybase_0_H = 0x%x\n", header->arraybase_0_H);
+  //TPU_LOG_DEBUG("header->arraybase_1_H = 0x%x\n", header->arraybase_1_H);
+  //TPU_LOG_DEBUG("header->pmubuf_offset = 0x%x\n", header->pmubuf_offset);
+  //TPU_LOG_DEBUG("header->pmubuf_size = 0x%x\n", header->pmubuf_size);
+  if (header->pmubuf_size && header->pmubuf_offset) {
+    tpu_pmu_dump_main(cmdbuf_mem->v_addr, cmdbuf_mem->p_addr);
+  }
+  *buf_start = cmdbuf_mem->v_addr;
+  *buf_len = cmdbuf_mem->size;
+  return BM_SUCCESS;
+}
+
+void Cvi180xDeviceMem::cvikernel_create(bmctx_t ctx, void **p_bk_ctx) {
+  TPU_ASSERT(ctx != nullptr, nullptr);
+  TPU_ASSERT(ctx->dev != nullptr, nullptr);
+
+  bmk1822_chip_info_t info = bmk1822_chip_info();
+  bmk1822_chip_info_t *dev_info = &info;
+
+  bmk_info_t bmk_info;
+  bmk_info.chip_version = dev_info->version;
+  bmk_info.cmdbuf_size = 0x100000;
+  bmk_info.cmdbuf = (u8 *)malloc(bmk_info.cmdbuf_size);
+  TPU_ASSERT(bmk_info.cmdbuf, "create cvikernel, malloc failed\n");
+
+  ctx->cvik_context.ctx182x = bmk1822_register(&bmk_info);
+  ctx->cvik_cmdbuf = (void *)bmk_info.cmdbuf;
+
+  *p_bk_ctx = ctx->cvik_context.ctx182x;
+}
+
+void Cvi180xDeviceMem::cvikernel_submit(bmctx_t ctx) {
+  u32 len;
+  u8 *cmdbuf = bmk1822_acquire_cmdbuf(ctx->cvik_context.ctx182x, &len);
+
+  uint16_t seq_no;
+  bmerr_t ret = send_cmdbuf(ctx, cmdbuf, (size_t)len, &seq_no);
+  TPU_ASSERT(ret == BM_SUCCESS, NULL);
+  bmk1822_reset(ctx->cvik_context.ctx182x);
+}
+
+void Cvi180xDeviceMem::cvikernel_destroy(bmctx_t ctx) {
+  TPU_ASSERT(ctx->cvik_context.ctx182x, NULL);
+  TPU_ASSERT(ctx->cvik_cmdbuf, NULL);
+
+  bmk1822_cleanup(ctx->cvik_context.ctx182x);
+  free(ctx->cvik_cmdbuf);
+}
diff --git a/cviruntime/src/soc/180x/cvi180x_device_mem.h b/cviruntime/src/soc/180x/cvi180x_device_mem.h
new file mode 100644
index 000000000..4bf8a1a0a
--- /dev/null
+++ b/cviruntime/src/soc/180x/cvi180x_device_mem.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <stdint.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include "cvi_device_mem.h"
+#include "bmruntime_internal.h"
+
+class Cvi180xDeviceMem : public CviDeviceMem {
+public:
+  Cvi180xDeviceMem();
+  ~Cvi180xDeviceMem() override;
+  virtual bmerr_t device_open(int index, bmdev_t *dev) override;
+  virtual void device_close(bmdev_t dev) override;
+  virtual int get_chip_ver(bmdev_t dev) override;
+  virtual void mem_free_raw(bmctx_t ctx, bmmem_device_t mem);
+  virtual bmerr_t load_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz,
+                              uint64_t neuron_gaddr, uint64_t weight_gaddr,
+                              bool enable_pmu, bmmem_device_t *cmdbuf_mem) override;
+  virtual bmerr_t load_dmabuf(bmctx_t ctx, bmmem_device_t dmabuf,
+                              size_t sz, uint64_t neuron_gaddr,
+                              uint64_t weight_gaddr, bool enable_pmu,
+                              bmmem_device_t *dmabuf_mem) override;
+
+  virtual bmerr_t load_cmdbuf_tee(bmctx_t ctx, uint8_t *cmdbuf, size_t sz,
+                                  uint64_t neuron_gaddr, uint64_t weight_gaddr,
+                                  uint32_t weight_len,
+                                  bmmem_device_t *cmdbuf_mem);
+  virtual bmerr_t unload_tee(bmctx_t ctx, uint64_t paddr, size_t size);
+  virtual bmerr_t parse_pmubuf(bmmem_device_t cmdbuf_mem, uint8_t **buf_start, uint32_t *buf_len);
+  virtual void cvikernel_create(bmctx_t ctx, void **p_bk_ctx) override;
+  virtual void cvikernel_submit(bmctx_t ctx) override;
+  virtual void cvikernel_destroy(bmctx_t ctx) override;
+};
diff --git a/cviruntime/src/soc/180x/cvi_rt_180x.cpp b/cviruntime/src/soc/180x/cvi_rt_180x.cpp
new file mode 100644
index 000000000..5220e1a89
--- /dev/null
+++ b/cviruntime/src/soc/180x/cvi_rt_180x.cpp
@@ -0,0 +1,82 @@
+#include <memory>
+#include "cvi_rt_180x.h"
+
+std::unique_ptr<CviRTSoc> cvi_chip(new CviRT180x());
+
+CviRT180x::CviRT180x() {
+  chip_name_ = "cv180x";
+  submit_magic_ = 0x18225678;
+  cvi_device = std::move(std::unique_ptr<CviDeviceMem>(new Cvi180xDeviceMem()));
+}
+
+CviRT180x::~CviRT180x() {}
+
+CVI_RT_KHANDLE CviRT180x::GetKHandleBK(CVI_RT_HANDLE rt_handle) {
+  bmctx_t ctx = (bmctx_t)rt_handle;
+  return (CVI_RT_KHANDLE)(ctx->cvik_context.ctx182x);
+}
+
+CVI_RC CviRT180x::DeInitBK(CVI_RT_HANDLE rt_handle) {
+  bmctx_t ctx = (bmctx_t)rt_handle;
+
+  //deinit kernel related
+  if (ctx->cvik_context.ctx182x) {
+    bmk1822_cleanup(ctx->cvik_context.ctx182x);
+  }
+
+  if (ctx->cvik_cmdbuf) {
+    free(ctx->cvik_cmdbuf);
+  }
+
+  //deinit basic context
+  bm_exit(ctx);
+  return CVI_SUCCESS;
+}
+
+CVI_RC CviRT180x::InitWithKernelBK(CVI_RT_HANDLE *rt_handle, uint32_t cmdbuf_size) {
+  bmctx_t *ctx = (bmctx_t *)rt_handle;
+
+  //init basic context
+  bm_init(DEVICE_INDEX_NUM, ctx);
+
+  //init cvikernel related
+  bmk1822_chip_info_t info = bmk1822_chip_info();
+  bmk1822_chip_info_t *dev_info = &info;
+
+  bmk_info_t bmk_info;
+  bmk_info.chip_version = dev_info->version;
+  bmk_info.cmdbuf_size = cmdbuf_size;
+  bmk_info.cmdbuf = (u8 *)malloc(bmk_info.cmdbuf_size);
+  if (!bmk_info.cmdbuf) {
+    TPU_ASSERT(bmk_info.cmdbuf, "malloc kernel buffer failed");
+    return CVI_FAILURE;
+  }
+
+  (*ctx)->cvik_context.ctx182x = bmk1822_register(&bmk_info);
+  (*ctx)->cvik_cmdbuf = (void *)bmk_info.cmdbuf;
+
+  return CVI_SUCCESS;
+}
+
+CVI_RC CviRT180x::LoadCmdbufTee(CVI_RT_HANDLE rt_handle, uint8_t *cmdbuf,
+                                size_t sz, uint64_t neuron_gaddr,
+                                uint64_t weight_gaddr, uint32_t weight_len,
+                                CVI_RT_MEM *cmdbuf_mem) {
+  (void)rt_handle;
+  (void)cmdbuf;
+  (void)sz;
+  (void)neuron_gaddr;
+  (void)weight_gaddr;
+  (void)weight_len;
+  (void)cmdbuf_mem;
+  TPU_ASSERT(0, NULL); // not supported
+  return CVI_SUCCESS;
+}
+
+CVI_RC CviRT180x::RunCmdbufTee(CVI_RT_HANDLE rt_handle, CVI_RT_MEM cmdbuf_mem,
+                               CVI_RT_ARRAYBASE *p_array_base) {
+  (void)rt_handle;
+  (void)p_array_base;
+  (void)cmdbuf_mem;
+  TPU_ASSERT(0, NULL); // not supported
+  return CVI_SUCCESS;
+}
diff --git a/cviruntime/src/soc/180x/cvi_rt_180x.h b/cviruntime/src/soc/180x/cvi_rt_180x.h
new file mode 100644
index 000000000..d5dd03beb
--- /dev/null
+++ b/cviruntime/src/soc/180x/cvi_rt_180x.h
@@ -0,0 +1,19 @@
+#pragma once
+#include "cvi_rt_base.h"
+#include "cvi180x_device_mem.h"
+
+class CviRT180x : public CviRTSoc {
+public:
+  CviRT180x();
+  virtual ~CviRT180x() override;
+
+  virtual CVI_RT_KHANDLE GetKHandleBK(CVI_RT_HANDLE rt_handle) override;
+  virtual CVI_RC DeInitBK(CVI_RT_HANDLE rt_handle) override;
+  virtual CVI_RC InitWithKernelBK(CVI_RT_HANDLE *rt_handle, uint32_t cmdbuf_size) override;
+  virtual CVI_RC LoadCmdbufTee(CVI_RT_HANDLE rt_handle, uint8_t *cmdbuf,
+                               size_t sz, uint64_t neuron_gaddr,
+                               uint64_t weight_gaddr, uint32_t weight_len,
+                               CVI_RT_MEM
*cmdbuf_mem) override;
+  virtual CVI_RC RunCmdbufTee(CVI_RT_HANDLE rt_handle, CVI_RT_MEM cmdbuf_mem,
+                              CVI_RT_ARRAYBASE *p_array_base) override;
+};
diff --git a/cviruntime/src/soc/180x/tpu_pmu.cpp b/cviruntime/src/soc/180x/tpu_pmu.cpp
new file mode 100644
index 000000000..93cad9741
--- /dev/null
+++ b/cviruntime/src/soc/180x/tpu_pmu.cpp
@@ -0,0 +1,903 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <stddef.h>
+#include <inttypes.h>
+#include "bmruntime_internal.h"
+
+
+struct TPU_PMU_DOUBLEEVENT {
+  unsigned long long type : 4;
+  unsigned long long desID : 16;
+  unsigned long long eventCnt0 : 22;
+  unsigned long long eventCnt1 : 22;
+  uint32_t endTime;
+  uint32_t startTime;
+};
+
+typedef enum _EXCEL_TYPE {
+  EXCEL_TYPE_0 = 0,
+  EXCEL_TYPE_1 = 1,
+  EXCEL_TYPE_2 = 2,
+  EXCEL_TYPE_3 = 3,
+  EXCEL_TYPE_4 = 4,
+} EXCEL_TYPE;
+
+enum TPU_PMUTYPE {
+  TPU_PMUTYPE_TDMALOAD = 1,
+  TPU_PMUTYPE_TDMASTORE = 2,
+  TPU_PMUTYPE_TDMAMOVE = 3,
+  TPU_PMUTYPE_TIU = 4,
+};
+
+typedef struct _TPU_DES_ELEMENT {
+  TPU_PMU_DOUBLEEVENT pmuEvent;
+  tiu_reg_t tiuReg;
+  tdma_reg_t tdmaReg;
+  char typeStr[50];
+} TPU_DES_ELEMENT;
+
+typedef struct _TPU_LAYERID_ELEMENT {
+  uint32_t layerID;
+  TPU_PMUTYPE last_desType;
+  uint32_t last_mapping_desID;
+  uint32_t endTime;
+  uint32_t startTime;
+//  uint8_t layerName[50];
+  uint32_t u32StartAddr;
+  uint32_t u32OutputLen;
+
+  uint32_t u32LoadNeuronTime;
+  uint32_t u32LoadWeightTime;
+  uint32_t u32StoreNeuronTime;
+  uint32_t u32TIUTime;
+  uint32_t u32TDMATime;
+  uint32_t u32byteCnt;
+
+  double parallelism;
+  double duration_percent;
+  double loadNeuron_percent;
+  double loadWeight_percent;
+  double storeNeuron_percent;
+  double tiu_percent;
+  double throughput_MB;
+} TPU_LAYERID_ELEMENT;
+
+#define FILE_OUT_LINE_LEN 2048
+#define TPUPMU_DES_FILENAME "_des.csv"
+#define TPUPMU_LAYER_FILENAME "_layer.csv"
+const char *pmubuf_output_file_env = NULL;
+
+
+#define TPU_CLOCK_DEFAULT (750000000)
+#define TPU_WRAP_LIMIT 0xFFFFFFFF
+#define TPU_BURST_SIZE 16
+#define DES_MAX (65535 * 6) //hardcoded for now; the real count could be queried from the dmabuf
+TPU_DES_ELEMENT *p_element = NULL;
+TPU_LAYERID_ELEMENT *p_layer = NULL;
+
+static void tpu_pmu_fill_cmdbuf(uint8_t *v_dma_buf);
+
+static void reorder_back_tiu_cmdbuf_reg(uint8_t *cmdbuf)
+{
+  int total_bits = BD_REG_BYTES * 8;
+
+  uint8_t tmp[128 / 8];
+  uint8_t *last = &cmdbuf[(total_bits - 128) / 8];
+  memcpy(tmp, last, sizeof(tmp));
+  memcpy(last, cmdbuf, sizeof(tmp));
+  memcpy(cmdbuf, tmp, sizeof(tmp));
+}
+
+static void tdma_des_fill_str(TPU_DES_ELEMENT *element)
+{
+  char str1[50] = {0};
+  char tmpStr[10] = {0};
+
+  switch(element->pmuEvent.type) {
+    case TPU_PMUTYPE_TDMALOAD:
+      sprintf(tmpStr, "%s", "Load");
+      break;
+    case TPU_PMUTYPE_TDMASTORE:
+      sprintf(tmpStr, "%s", "Store");
+      break;
+    case TPU_PMUTYPE_TDMAMOVE:
+      sprintf(tmpStr, "%s", "Move");
+      break;
+    default:
+      break;
+  }
+
+  if (element->tdmaReg.compress_en)
+    sprintf(str1, "%s %s", tmpStr, "Compression");
+  else
+    sprintf(str1, "%s", tmpStr);
+
+  if (element->tdmaReg.sys_dtype)
+    sprintf(element->typeStr, "%s %s", "TDMA Matrix", str1);
+  else
+    sprintf(element->typeStr, "%s %s", "TDMA Tensor", str1);
+}
+
+static void tpu_pmu_fill_cmdbuf(uint8_t *v_dma_buf)
+{
+  cvi_cpu_desc_t *desc = (cvi_cpu_desc_t *)(v_dma_buf + sizeof(dma_hdr_t));
+
+  uint64_t tiu_offset = 0, tdma_offset = 0;
+  uint32_t tiu_cnt = 0, tdma_cnt = 0, i = 0, offset = 0;
+  uint32_t start_index_tdma = 0, start_index_tiu = 0;
+  uint32_t index = 0;
+  tdma_reg_t tmpTDMA_Reg;
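+  /*
+   * Editor's sketch of the data this routine consumes: each PMU record is 16
+   * bytes (PER_DES_SIZE), packing 4+16+22+22 bits of type/desID/event counters
+   * into one 64-bit word, followed by the end and start timestamps:
+   *
+   *   TPU_PMU_DOUBLEEVENT ev;                        // defined above
+   *   uint32_t ticks = ev.endTime - ev.startTime;    // ignoring wrap here
+   *
+   * The desID is then matched against cmd_id / cmd_id_tpu parsed out of the
+   * original TDMA/TIU descriptors so each event can be labelled. Field widths
+   * are read off the struct above, not from a hardware manual.
+   */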
+  tiu_reg_t tmpTIU_Reg;
+  uint8_t tiu_recorded_buf[BD_REG_BYTES];
+  uint32_t tdma_id_previous = 0, tdma_start_pre = 0, tdma_end_pre = 0;
+
+  //get tiu/tdma descriptor start address
+  tiu_offset = desc->offset_tiu_ori_bk;
+  tdma_offset = desc->offset_tdma_ori_bk;
+  TPU_LOG_DEBUG("tpu_pmu_fill_cmdbuf() tiu_offset=0x%" PRIx64 ", tdma_offset=0x%" PRIx64 "\n", tiu_offset, tdma_offset);
+
+  tiu_cnt = desc->num_tiu;
+  tdma_cnt = desc->num_tdma;
+  TPU_LOG_DEBUG("tpu_pmu_fill_cmdbuf() tiu_cnt=%d, tdma_cnt=%d\n", tiu_cnt, tdma_cnt);
+
+  while (p_element[index].pmuEvent.type) {
+    if (p_element[index].pmuEvent.type != TPU_PMUTYPE_TIU) { //tdma
+
+      if ((p_element[index].pmuEvent.desID != tdma_id_previous) ||
+          (p_element[index].pmuEvent.startTime != tdma_start_pre) ||
+          (p_element[index].pmuEvent.endTime != tdma_end_pre)) {
+        for (i = start_index_tdma; i < tdma_cnt; i ++) {
+          offset = tdma_offset + ((1 << TDMA_DESCRIPTOR_ALIGNED_BIT) * i);
+          parse_tdma_reg(&tmpTDMA_Reg, (uint32_t *)(v_dma_buf + offset));
+
+          if (p_element[index].pmuEvent.desID == tmpTDMA_Reg.cmd_id) {
+            memcpy(&p_element[index].tdmaReg, &tmpTDMA_Reg, sizeof(tmpTDMA_Reg));
+            tdma_des_fill_str(&p_element[index]);
+            start_index_tdma = i + 1;
+            tdma_id_previous = p_element[index].pmuEvent.desID;
+            tdma_start_pre = p_element[index].pmuEvent.startTime;
+            tdma_end_pre = p_element[index].pmuEvent.endTime;
+            break;
+          }
+        }
+      } else { //tdma g2g case, copy 1st to 2nd tdma descriptor
+        memcpy(&p_element[index].tdmaReg, &p_element[index - 1].tdmaReg, sizeof(tmpTDMA_Reg));
+        tdma_des_fill_str(&p_element[index]);
+      }
+    } else { //tiu
+      for (i = start_index_tiu; i < tiu_cnt; i ++) {
+        offset = tiu_offset + (BD_REG_BYTES * i);
+        uint8_t *tiu_cmdbuf = v_dma_buf + offset;
+
+        //get tiu_reg struct
+        memcpy(tiu_recorded_buf, tiu_cmdbuf, BD_REG_BYTES);
+        reorder_back_tiu_cmdbuf_reg(tiu_recorded_buf);
+        parse_tiu_reg(&tmpTIU_Reg, (uint32_t *)tiu_recorded_buf);
+
+        if (p_element[index].pmuEvent.desID == tmpTIU_Reg.cmd_id_tpu) {
+          memcpy(&p_element[index].tiuReg, &tmpTIU_Reg, sizeof(tmpTIU_Reg));
+
+#if 1
+          switch (tmpTIU_Reg.tsk_typ) {
+            case DCR_TYPE_CONV_FIX8B:
+              if (!tmpTIU_Reg.opt_chl_quan) {
+                if (tmpTIU_Reg.opd_typ) {
+                  strcpy(p_element[index].typeStr, "TIU BF16 Convolution");
+                } else {
+                  strcpy(p_element[index].typeStr, "TIU Convolution");
+                }
+              } else {
+                strcpy(p_element[index].typeStr, "TIU PerChannel Convolution");
+              }
+              break;
+            case DCR_TYPE_DEPTHWISE_POOL_FIX8B:
+              switch (tmpTIU_Reg.tsk_eu_typ) {
+                case 0:
+                  if (tmpTIU_Reg.opd_typ) {
+                    strcpy(p_element[index].typeStr, "TIU BF16 Max Pooling");
+                  } else {
+                    strcpy(p_element[index].typeStr, "TIU Max Pooling");
+                  }
+                  break;
+                case 1:
+                  if (tmpTIU_Reg.opd_typ) {
+                    strcpy(p_element[index].typeStr, "TIU BF16 Average Pooling");
+                  } else {
+                    strcpy(p_element[index].typeStr, "TIU Average Pooling");
+                  }
+                  break;
+                case 2:
+                  if (!tmpTIU_Reg.opt_chl_quan) {
+                    if (tmpTIU_Reg.opd_typ) {
+                      strcpy(p_element[index].typeStr, "TIU BF16 Depthwise Convolution");
+                    } else {
+                      strcpy(p_element[index].typeStr, "TIU Depthwise Convolution");
+                    }
+                  } else {
+                    strcpy(p_element[index].typeStr, "TIU Depthwise PerChannel Convolution");
+                  }
+                  break;
+                case 3:
+                  if (tmpTIU_Reg.opd_typ) {
+                    strcpy(p_element[index].typeStr, "TIU BF16 Min Pooling");
+                  } else {
+                    strcpy(p_element[index].typeStr, "TIU Min Pooling");
+                  }
+                  break;
+                default:
+                  break;
+              }
+              break;
+            case DCR_TYPE_FC_FIX8B:
+              if (!tmpTIU_Reg.opt_chl_quan) {
+                if (tmpTIU_Reg.opd_typ) {
+                  strcpy(p_element[index].typeStr, "TIU BF16 Matrix Multiplication");
+                } else {
strcpy(p_element[index].typeStr, "TIU Matrix Multiplication"); + } + } else { + strcpy(p_element[index].typeStr, "TIU PerChannel Matrix Multiplication"); + } + break; + case DCR_TYPE_TENSOR_ARITH_FIX8B: + switch(tmpTIU_Reg.tsk_eu_typ) { + case 0: + if (!tmpTIU_Reg.opt_chl_quan) { + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Mul"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Mul"); + } + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Mul(QDM)"); + } + break; + case 1: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Mac"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Mac"); + } + break; + case 2: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Add"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Add"); + } + break; + case 3: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Sub"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Sub"); + } + break; + case 4: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Max"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Max"); + } + break; + case 5: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Min"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Min"); + } + break; + case 6: + strcpy(p_element[index].typeStr, "TIU Element-wise Shift"); + break; + case 7: + strcpy(p_element[index].typeStr, "TIU Element-wise AND"); + break; + case 8: + strcpy(p_element[index].typeStr, "TIU Element-wise OR"); + break; + case 9: + strcpy(p_element[index].typeStr, "TIU Element-wise XOR"); + break; + case 10: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Copy"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Copy"); + } + break; + case 11: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Ge"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Ge"); + } + break; + case 12: + strcpy(p_element[index].typeStr, "TIU Lookup Table"); + break; + default: + break; + } + default: + break; + } + +#else + switch(tmpTIU_Reg.tsk_typ) { + case DCR_TYPE_CONV_FIX8B: + if (!tmpTIU_Reg.opt_chl_quan) { + if (tmpTIU_Reg.opd_typ) + strcpy(p_element[index].typeStr, "TIU BF16 Convolution"); + else + strcpy(p_element[index].typeStr, "TIU Convolution"); + } else { + strcpy(p_element[index].typeStr, "TIU PerChannel Convolution"); + } + break; + case DCR_TYPE_DEPTHWISE_POOL_FIX8B: + switch (tmpTIU_Reg.tsk_eu_typ) { + case 0: + if (tmpTIU_Reg.opd_typ) + strcpy(p_element[index].typeStr, "TIU BF16 Max Pooling"); + else + strcpy(p_element[index].typeStr, "TIU Max Pooling"); + break; + case 1: + if (tmpTIU_Reg.opd_typ) + strcpy(p_element[index].typeStr, "TIU BF16 Average Pooling"); + else + strcpy(p_element[index].typeStr, "TIU Average Pooling"); + break; + case 2: + if (!tmpTIU_Reg.opt_chl_quan) { + if (tmpTIU_Reg.opd_typ) + strcpy(p_element[index].typeStr, "TIU BF16 Depthwise Convolution"); + else + strcpy(p_element[index].typeStr, "TIU Depthwise Convolution"); + } else { + strcpy(p_element[index].typeStr, "TIU Depthwise PerChannel Convolution"); + } + break; + default: + break; + } + break; + case DCR_TYPE_FC_FIX8B: + if (!tmpTIU_Reg.opt_chl_quan) { + if (tmpTIU_Reg.opd_typ) + strcpy(p_element[index].typeStr, "TIU BF16 Matrix Multiplication"); + else + 
strcpy(p_element[index].typeStr, "TIU Matrix Multiplication"); + } else { + strcpy(p_element[index].typeStr, "TIU PerChannel Matrix Multiplication"); + } + break; + case DCR_TYPE_TENSOR_ARITH_FIX8B: + if (tmpTIU_Reg.tens_mdsum) { + strcpy(p_element[index].typeStr, "TIU Mdsum"); + } else if (tmpTIU_Reg.tens_lookup) { + strcpy(p_element[index].typeStr, "TIU Lookup Table"); + } else { + switch (tmpTIU_Reg.tsk_eu_typ) { + case 0: + if (!tmpTIU_Reg.opt_chl_quan) { + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Mul"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Mul"); + } + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Mul(QDM)"); + } + break; + case 1: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Mac"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Mac"); + } + break; + case 2: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Add"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Add"); + } + break; + case 3: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Sub"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Sub"); + } + break; + case 4: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Max"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Max"); + } + break; + case 5: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Min"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Min"); + } + break; + case 6: + strcpy(p_element[index].typeStr, "TIU Element-wise Shift"); + break; + case 7: + strcpy(p_element[index].typeStr, "TIU Element-wise AND"); + break; + case 8: + strcpy(p_element[index].typeStr, "TIU Element-wise OR"); + break; + case 9: + strcpy(p_element[index].typeStr, "TIU Element-wise XOR"); + break; + case 10: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Copy"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Copy"); + } + break; + default: + break; + } + } + break; + } +#endif + start_index_tiu = i + 1; + break; + } + } + } + index ++; + } + +} + +#include +#include +using namespace std; + +static void tpu_pmu_fwrite_des() +{ + uint32_t index = 0; + uint64_t srcAddr = 0, dstAddr = 0; + + char lineStr[FILE_OUT_LINE_LEN] = {0}; + EXCEL_TYPE excelType = EXCEL_TYPE_0; + + std::fstream fout_element; + sprintf(lineStr, "%s%s", pmubuf_output_file_env, TPUPMU_DES_FILENAME); + TPU_LOG_DEBUG("out file_des name=%s\n", lineStr); + fout_element.open(lineStr, std::ios::out | std::ios::trunc); + + strcpy(lineStr, "pmutype, desID, event0, event1, , start, duration, end, layerID, desType, \ + srcAddr, dstAddr, trans_fmt, transpose_md, cmd_id, wait_id_tpu, dst_h_stride, dst_c_stride_low, \ + dst_n_stride, src_h_stride, src_c_stride_low, src_n_stride, dst_c, src_c, dst_w, dst_h, src_w, src_h, src_n\n"); + fout_element << lineStr; + + //dump descriptor content related + while (p_element[index].pmuEvent.type) + { + switch (p_element[index].pmuEvent.type) { + case TPU_PMUTYPE_TDMALOAD: + excelType = EXCEL_TYPE_1; + break; + case TPU_PMUTYPE_TDMASTORE: + case TPU_PMUTYPE_TDMAMOVE: + excelType = EXCEL_TYPE_2; + break; + case TPU_PMUTYPE_TIU: + excelType = EXCEL_TYPE_3; + break; + } + + if (p_element[index].pmuEvent.type == TPU_PMUTYPE_TIU) { + sprintf(lineStr, "%u, %u, %u, %u, %u, %u, %u, %u, %u, %s\n", + 
+      sprintf(lineStr, "%u, %u, %u, %u, %u, %u, %u, %u, %u, %s\n",
+              p_element[index].pmuEvent.type,
+              p_element[index].pmuEvent.desID,
+              p_element[index].pmuEvent.eventCnt0,
+              p_element[index].pmuEvent.eventCnt1,
+              excelType,
+              p_element[index].pmuEvent.startTime,
+              p_element[index].pmuEvent.endTime - p_element[index].pmuEvent.startTime,
+              p_element[index].pmuEvent.endTime,
+              p_element[index].tiuReg.layer_info,
+              p_element[index].typeStr);
+    } else {
+      srcAddr = ((uint64_t)(p_element[index].tdmaReg.src_base_addr_high) << 32) |
+                (uint64_t)(p_element[index].tdmaReg.src_base_addr_low);
+      dstAddr = ((uint64_t)(p_element[index].tdmaReg.dst_base_addr_high) << 32) |
+                (uint64_t)(p_element[index].tdmaReg.dst_base_addr_low);
+
+      sprintf(lineStr, "%u, %u, %u, %u, %u, %u, %u, %u, %u, %s, 0x%" PRIx64 ", 0x%" PRIx64 ", \
+  %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u\n",
+              p_element[index].pmuEvent.type,
+              p_element[index].pmuEvent.desID,
+              p_element[index].pmuEvent.eventCnt0,
+              p_element[index].pmuEvent.eventCnt1,
+              excelType,
+              p_element[index].pmuEvent.startTime,
+              p_element[index].pmuEvent.endTime - p_element[index].pmuEvent.startTime,
+              p_element[index].pmuEvent.endTime,
+              p_element[index].tdmaReg.layer_ID,
+              p_element[index].typeStr,
+              srcAddr,
+              dstAddr,
+              p_element[index].tdmaReg.trans_fmt,
+              p_element[index].tdmaReg.transpose_md,
+              p_element[index].tdmaReg.cmd_id,
+              p_element[index].tdmaReg.wait_id_tpu,
+              p_element[index].tdmaReg.dst_h_stride,
+              p_element[index].tdmaReg.dst_c_stride_low,
+              p_element[index].tdmaReg.dst_n_stride,
+              p_element[index].tdmaReg.src_h_stride,
+              p_element[index].tdmaReg.src_c_stride_low,
+              p_element[index].tdmaReg.src_n_stride,
+              p_element[index].tdmaReg.dst_c,
+              p_element[index].tdmaReg.src_c,
+              p_element[index].tdmaReg.dst_w,
+              p_element[index].tdmaReg.dst_h,
+              p_element[index].tdmaReg.src_w,
+              p_element[index].tdmaReg.src_h,
+              p_element[index].tdmaReg.src_n);
+    }
+
+    fout_element << lineStr;
+    index ++;
+  }
+
+  fout_element.close();
+}
+
+static void tpu_pmu_getlayerInfo(void)
+{
+  uint32_t index = 0, layIDIndex = 0;
+  uint32_t curLayID = 0;
+  uint32_t u32SingleDuration = 0;
+
+  TPU_LOG_DEBUG("tpu_pmu_getlayerInfo() start\n");
+  while (p_element[index].pmuEvent.type) {
+    if (!curLayID) {
+      //record current layerID
+      curLayID = p_element[index].pmuEvent.type == TPU_PMUTYPE_TIU ?
+                 p_element[index].tiuReg.layer_info : p_element[index].tdmaReg.layer_ID;
+
+      p_layer[layIDIndex].last_desType = (TPU_PMUTYPE)p_element[index].pmuEvent.type;
+      p_layer[layIDIndex].layerID = curLayID;
+      p_layer[layIDIndex].endTime = p_element[index].pmuEvent.endTime;
+      p_layer[layIDIndex].startTime = p_element[index].pmuEvent.startTime;
+      p_layer[layIDIndex].last_mapping_desID = p_element[index].pmuEvent.desID;
+    } else {
+      //if next layer ID is identical
+      if (curLayID == (p_element[index].pmuEvent.type == TPU_PMUTYPE_TIU ?
+                       p_element[index].tiuReg.layer_info : p_element[index].tdmaReg.layer_ID)) {
+        p_layer[layIDIndex].endTime = (p_element[index].pmuEvent.endTime > p_layer[layIDIndex].endTime) ?
+                                      (p_element[index].pmuEvent.endTime) : (p_layer[layIDIndex].endTime);
+
+        p_layer[layIDIndex].last_mapping_desID = p_element[index].pmuEvent.desID;
+
+      } else {
+        layIDIndex ++;
+        curLayID = p_element[index].pmuEvent.type == TPU_PMUTYPE_TIU ?
+                   p_element[index].tiuReg.layer_info : p_element[index].tdmaReg.layer_ID;
+
+        p_layer[layIDIndex].last_desType = (TPU_PMUTYPE)p_element[index].pmuEvent.type;
+        p_layer[layIDIndex].layerID = curLayID;
+        p_layer[layIDIndex].endTime = p_element[index].pmuEvent.endTime;
+        p_layer[layIDIndex].startTime = p_element[index].pmuEvent.startTime;
+        p_layer[layIDIndex].last_mapping_desID = p_element[index].pmuEvent.desID;
+      }
+    }
+
+    //get each duration and then classify by type
+    u32SingleDuration = p_element[index].pmuEvent.endTime - p_element[index].pmuEvent.startTime;
+    switch (p_element[index].pmuEvent.type) {
+      case TPU_PMUTYPE_TIU:
+        p_layer[layIDIndex].u32TIUTime += u32SingleDuration;
+        break;
+
+      case TPU_PMUTYPE_TDMALOAD:
+        if (p_element[index].tdmaReg.src_base_reg_sel == 0)
+          p_layer[layIDIndex].u32LoadNeuronTime += u32SingleDuration;
+        else if (p_element[index].tdmaReg.src_base_reg_sel == 1)
+          p_layer[layIDIndex].u32LoadWeightTime += u32SingleDuration;
+
+        p_layer[layIDIndex].u32TDMATime += u32SingleDuration;
+        break;
+
+      case TPU_PMUTYPE_TDMASTORE:
+        if (p_element[index].tdmaReg.src_base_reg_sel == 0)
+          p_layer[layIDIndex].u32StoreNeuronTime += u32SingleDuration;
+
+        p_layer[layIDIndex].u32TDMATime += u32SingleDuration;
+        break;
+
+      default:
+        break;
+    }
+
+    //accumulate byte counts, one burst count = 16bytes
+    p_layer[layIDIndex].u32byteCnt += (p_element[index].pmuEvent.eventCnt1 * 16);
+    index ++;
+  }
+}
+
+static void tpu_pmu_fwrite_layer(uint64_t tpu_clock)
+{
+  uint32_t index = 0;
+  char lineStr[FILE_OUT_LINE_LEN] = {0};
+  uint64_t u64totalDuration = 0, u64singleDuration = 0;
+  std::fstream fout_layer;
+
+  sprintf(lineStr, "%s%s", pmubuf_output_file_env, TPUPMU_LAYER_FILENAME);
+  TPU_LOG_DEBUG("out file_layer name=%s\n", lineStr);
+  fout_layer.open(lineStr, std::ios::out | std::ios::trunc);
+
+  //pre-process once to get the total duration
+  index = 0;
+  while (p_layer[index].layerID) {
+    u64totalDuration += p_layer[index].endTime - p_layer[index].startTime;
+    index ++;
+  }
+
+  index = 0;
+  while (p_layer[index].layerID) {
+    u64singleDuration = p_layer[index].endTime - p_layer[index].startTime;
+    p_layer[index].parallelism = (double)(p_layer[index].u32TDMATime + p_layer[index].u32TIUTime) / (double)u64singleDuration * 100;
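+    /*
+     * Editor's note: "parallelism" here is (TDMA busy + TIU busy) / wall time.
+     * With overlap it exceeds 100, e.g. 600 TDMA ticks plus 900 TIU ticks
+     * inside a 1000-tick window gives 150%. The clamp below floors the value
+     * at 100 so fully serial layers read as 100% rather than less; the sample
+     * numbers are an illustration, not measured data.
+     */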
+    p_layer[index].parallelism = p_layer[index].parallelism < 100 ? 100 : p_layer[index].parallelism;
+
+    p_layer[index].duration_percent = (double)u64singleDuration / (double)u64totalDuration * 100;
+    p_layer[index].tiu_percent = (double)p_layer[index].u32TIUTime / (double)u64singleDuration * 100;
+    p_layer[index].loadNeuron_percent = (double)p_layer[index].u32LoadNeuronTime / (double)u64singleDuration * 100;
+    p_layer[index].loadWeight_percent = (double)p_layer[index].u32LoadWeightTime / (double)u64singleDuration * 100;
+    p_layer[index].storeNeuron_percent = (double)p_layer[index].u32StoreNeuronTime / (double)u64singleDuration * 100;
+    p_layer[index].throughput_MB = (double)p_layer[index].u32byteCnt * tpu_clock / (double)u64singleDuration / 1024 / 1024;
+    index ++;
+  }
+
+  strcpy(lineStr, "layerID, start, duration, end, duration(%), parallelism(%), TIU(%), \
+  loadNeuron(%), loadWeight(%), storeNeuron(%), throughput(MB/s), last_tdmaID, dumpStart, dumpLen, TIU, loadNeuron, \
+  loadWeight, storeNeuron, byteCnt\n");
+
+  fout_layer << lineStr;
+
+  index = 0;
+  while (p_layer[index].layerID) {
+    sprintf(lineStr, "%d, %d, %d, %d, %lf%%, %lf%%, %lf%%, %lf%%, %lf%%, %lf%%, %.2lfMB/s, %d, 0x%x, 0x%x, %d, %d, %d, %d, %d\n",
+            p_layer[index].layerID,
+            p_layer[index].startTime,
+            p_layer[index].endTime - p_layer[index].startTime,
+            p_layer[index].endTime,
+
+            p_layer[index].duration_percent,
+            p_layer[index].parallelism,
+            p_layer[index].tiu_percent,
+            p_layer[index].loadNeuron_percent,
+            p_layer[index].loadWeight_percent,
+            p_layer[index].storeNeuron_percent,
+            p_layer[index].throughput_MB,
+
+            p_layer[index].last_mapping_desID,
+            p_layer[index].u32StartAddr,
+            p_layer[index].u32OutputLen,
+            p_layer[index].u32TIUTime,
+            p_layer[index].u32LoadNeuronTime,
+            p_layer[index].u32LoadWeightTime,
+            p_layer[index].u32StoreNeuronTime,
+            p_layer[index].u32byteCnt);
+    fout_layer << lineStr;
+    index ++;
+  }
+
+  fout_layer.close();
+}
+
+static int tpu_pmu_time(uint8_t *v_dma_buf, uint64_t p_dma_buf, uint8_t all_info)
+{
+  dma_hdr_t *header = (dma_hdr_t *)(v_dma_buf);
+  struct TPU_PMU_DOUBLEEVENT *pCurrent = (struct TPU_PMU_DOUBLEEVENT *)(v_dma_buf + header->pmubuf_offset);
+
+  uint64_t bmnet_p_total = 0;
+  uint64_t bmnet_p_duration = 0;
+
+  uint64_t u64TDMATotal = 0;
+  uint64_t u64TIUTotal = 0;
+  uint64_t u64_des_start = 0, u64_des_end = 0;
+  uint32_t u32TDMACnt = 0, u32TIUCnt = 0;
+  uint32_t index = 0, diff = 0, wrap_cnt = 0;
+  uint32_t tpu_clk_rate = header->tpu_clk_rate;
+  uint64_t u64_load_bytes = 0, u64_store_bytes = 0;
+  uint32_t tdma_id_previous = 0, tdma_start_pre = 0, tdma_end_pre = 0;
+  double percent_tdma = 0, percent_tiu = 0, percent_parallelism = 0;
+  double ms_tdma = 0, ms_tiu = 0, ms_inference = 0;
+  double load_mb = 0, store_mb = 0;
+  double bandwidth = 0;
+
+  TPU_LOG_DEBUG("tpu_pmu_time() all_info=%x\n", all_info);
+  //traverse pmu buffer
+  while (*(uint32_t *)pCurrent) {
+    if (pCurrent->type >= TPU_PMUTYPE_TDMALOAD && pCurrent->type <= TPU_PMUTYPE_TIU) {
+      if (index == 0) {
+        u64_des_start = pCurrent->startTime;
+        u64_des_end = pCurrent->endTime;
+      } else {
+        u64_des_end = pCurrent->endTime;
+      }
+
+      if (all_info)
+        memcpy(&p_element[index].pmuEvent, pCurrent, sizeof(TPU_PMU_DOUBLEEVENT));
+
+    } else {
+      TPU_LOG_ERROR("pmubuf record type incorrect, skip to next\n");
+      index ++;
+      pCurrent++;
+      continue;
+    }
+
+    if (pCurrent->type == TPU_PMUTYPE_TIU) { //tiu case
+      if (pCurrent->endTime > pCurrent->startTime) {
+        diff = pCurrent->endTime - pCurrent->startTime;
+      } else {
+        diff = TPU_WRAP_LIMIT - pCurrent->startTime + pCurrent->endTime;
+        wrap_cnt ++;
+      }
+
+      u64TIUTotal += diff;
+      u32TIUCnt++;
+    } else { //tdma case
+
+      //g2g will generate two des loadx1+storex1, we only accumulate one of them
+      if ((pCurrent->desID != tdma_id_previous) ||
+          (pCurrent->startTime != tdma_start_pre) ||
+          (pCurrent->endTime != tdma_end_pre)) {
+
+        if (pCurrent->endTime > pCurrent->startTime) {
+          diff = pCurrent->endTime - pCurrent->startTime;
+        } else {
+          diff = TPU_WRAP_LIMIT - pCurrent->startTime + pCurrent->endTime;
+          wrap_cnt ++;
+        }
+        u64TDMATotal += diff;
+        u32TDMACnt++;
+      }
+
+      if (pCurrent->type == TPU_PMUTYPE_TDMALOAD) {
+        u64_load_bytes += TPU_BURST_SIZE * pCurrent->eventCnt1;
+      } else if (pCurrent->type == TPU_PMUTYPE_TDMASTORE) {
+        u64_store_bytes += TPU_BURST_SIZE * pCurrent->eventCnt1;
+      }
+
+      tdma_id_previous = pCurrent->desID;
+      tdma_start_pre = pCurrent->startTime;
+      tdma_end_pre = pCurrent->endTime;
+    }
+
+    index ++;
+    pCurrent++;
+  }
+
+  bmnet_p_total = u64TDMATotal + u64TIUTotal;
+  if (wrap_cnt)
+    bmnet_p_duration = TPU_WRAP_LIMIT * (wrap_cnt - 1) + TPU_WRAP_LIMIT - u64_des_start + u64_des_end;
+  else
+    bmnet_p_duration = u64_des_end - u64_des_start;
+
+  percent_tdma = (double)u64TDMATotal / (double)bmnet_p_duration * (double)100;
+  percent_tiu = (double)u64TIUTotal / (double)bmnet_p_duration * (double)100;
+  percent_parallelism = (double)(bmnet_p_total) / (double)bmnet_p_duration * (double)100;
+  percent_parallelism = percent_parallelism < 100 ? 100 : percent_parallelism;
+
+  if (!tpu_clk_rate) {
+    tpu_clk_rate = TPU_CLOCK_DEFAULT;
+    printf("can't get tpu clock, assuming %dMHz\n", tpu_clk_rate / 1000000);
+  }
+
+  ms_tdma = (double)u64TDMATotal / (double)tpu_clk_rate * (double)1000;
+  ms_tiu = (double)u64TIUTotal / (double)tpu_clk_rate * (double)1000;
+  ms_inference = (double)bmnet_p_duration / (double)tpu_clk_rate * (double)1000;
+
+  load_mb = (double)u64_load_bytes / (double)1024 / (double)1024;
+  store_mb = (double)u64_store_bytes / (double)1024 / (double)1024;
+
+  bandwidth = (double)(load_mb + store_mb) / (double)ms_inference * (double)1000;
+
+  printf("=======================inference total info ==========================\n");
+  //printf("cv183x tpu clock: %dMhz\n", header->tpu_clk_rate / 1000000);
+  printf("%-20s %8dMHz, %-20s %9.2fMB, %-20s %7.2fMB/s\n",
+         "cv180x_tpu_clock:", tpu_clk_rate / 1000000, "inference_data:", load_mb + store_mb, "inference_bw:", bandwidth);
+
+  printf("%-20s %10" PRIu64 "t, %-20s %10" PRIu64 "t, %-20s %10" PRIu64 "t\n",
+         "tdma_exe_tick:", u64TDMATotal, "tiu_exe_tick:", u64TIUTotal, "inference_tick:", bmnet_p_duration);
+  printf("%-20s %10.2f%%, %-20s %10.2f%%, %-20s %10.2f%%\n",
+         "tdma_exe_percent:", percent_tdma, "tiu_exe_percent:", percent_tiu, "parallelism_percent:", percent_parallelism);
+  printf("%-20s %9.2fms, %-20s %9.2fms, %-20s %9.2fms\n",
+         "tdma_exe_ms:", ms_tdma, "tiu_exe_ms:", ms_tiu, "inference_ms:", ms_inference);
+
+  if (all_info) {
+    tpu_pmu_fill_cmdbuf(v_dma_buf);
+    tpu_pmu_fwrite_des();
+    tpu_pmu_getlayerInfo();
+    tpu_pmu_fwrite_layer(tpu_clk_rate);
+  }
+
+  return 0;
+}
+
+uint32_t tpu_pmu_get_des_cnt(uint8_t *v_dma_buf)
+{
+  uint32_t tiu_cnt = 0, tdma_cnt = 0;
+  dma_hdr_t *header = (dma_hdr_t *)v_dma_buf;
+  cvi_cpu_desc_t *desc = (cvi_cpu_desc_t *)(v_dma_buf + sizeof(dma_hdr_t));
+
+  for (uint32_t i = 0; i < header->cpu_desc_count; i++, desc++) {
+    tiu_cnt += (desc->num_tiu & 0xFFFF);
+    tdma_cnt += (desc->num_tdma & 0xFFFF);
+  }
+
+  //assume the worst case: every tdma g2g transfer generates a double descriptor
+  return (tiu_cnt + tdma_cnt + tdma_cnt);
+}
+
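+/*
+ * Editor's sketch of the conversions done in tpu_pmu_time(): counters are raw
+ * TPU ticks, so with the 750 MHz default clock
+ *
+ *   ms        = ticks / clk_rate * 1000            // 1,500,000 ticks -> 2 ms
+ *   bandwidth = (load_MB + store_MB) / ms * 1000   // MB per second
+ *
+ * and each TDMA event counter counts 16-byte bursts (TPU_BURST_SIZE), so
+ * bytes = eventCnt1 * 16. The sample numbers are illustrative only.
+ */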
+#define TPU_PMU_MALLOC_PADDING 1024
+uint32_t tpu_pmu_dump_main(uint8_t *v_dma_buf, uint64_t p_dma_buf)
+{
+  dma_hdr_t *dma_header = (dma_hdr_t *)v_dma_buf;
+  uint8_t all_info = 0;
+
+  //check header first
+  if (dma_header->dmabuf_magic_m != TPU_DMABUF_HEADER_M) {
+    TPU_LOG_NOTICE("pmu buffer header incorrect\n");
+    return CVI_RC_FAILURE;
+  }
+
+  //check if we need output pmubuf
+  pmubuf_output_file_env = std::getenv("TPU_PMUBUF_OUTPUT_FILE");
+  if (pmubuf_output_file_env) {
+    all_info = 1;
+  }
+
+  //alloc element arrays; calloc guarantees the zero terminator
+  //that the traversal loops in tpu_pmu_fill_cmdbuf()/tpu_pmu_getlayerInfo() rely on
+  if (all_info) {
+    p_element = (TPU_DES_ELEMENT *)calloc(1, tpu_pmu_get_des_cnt(v_dma_buf) * sizeof(TPU_DES_ELEMENT) + TPU_PMU_MALLOC_PADDING);
+    p_layer = (TPU_LAYERID_ELEMENT *)calloc(1, tpu_pmu_get_des_cnt(v_dma_buf) * sizeof(TPU_LAYERID_ELEMENT) + TPU_PMU_MALLOC_PADDING);
+
+    if (!p_element || !p_layer) {
+      free(p_element);
+      free(p_layer);
+      p_element = NULL;
+      p_layer = NULL;
+      TPU_LOG_INFO("tpu pmu des array alloc failed\n");
+      return CVI_RC_FAILURE;
+    }
+  }
+
+  //get pmu overview data
+  tpu_pmu_time(v_dma_buf, p_dma_buf, all_info);
+
+  //free element array
+  if (all_info) {
+    if (p_element) {
+      free(p_element);
+      p_element = NULL;
+    }
+
+    if (p_layer) {
+      free(p_layer);
+      p_layer = NULL;
+    }
+  }
+
+  return CVI_RC_SUCCESS;
+}
+
diff --git a/cviruntime/src/soc/181x/CMakeLists.txt b/cviruntime/src/soc/181x/CMakeLists.txt
new file mode 100644
index 000000000..f455c3a6b
--- /dev/null
+++ b/cviruntime/src/soc/181x/CMakeLists.txt
@@ -0,0 +1,33 @@
+cmake_minimum_required(VERSION 2.8.0)
+
+include_directories(./)
+include_directories(../common)
+include_directories(${CMAKE_SYSROOT}/include)
+add_definitions(-DION_CACHE_OPEN)
+add_definitions(-DMULTI_PROCESS)
+
+set(RUNTIME_SOURCES ${RUNTIME_SOURCES}
+    ${CMAKE_CURRENT_SOURCE_DIR}/../runtime_bmkernel.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/bmruntime_soc.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/bm_dmabuf.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/cvi_rt_181x.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/cvi181x_device_mem.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/../common/cvi_device_mem.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/../common/cvi_rt_base.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/../common/cviruntime_context.cpp)
+
+
+if (${ENABLE_PMU})
+  set(RUNTIME_SOURCES ${RUNTIME_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/tpu_pmu.cpp)
+endif()
+
+set(EXTRA_LIBS ${EXTRA_LIBS} rt dl pthread)
+
+add_library(cviruntime-static STATIC ${RUNTIME_SOURCES})
+set_property(TARGET cviruntime-static PROPERTY POSITION_INDEPENDENT_CODE ON)
+
+add_library(cviruntime SHARED ${RUNTIME_SOURCES})
+target_link_libraries(cviruntime cvikernel ${EXTRA_LIBS})
+
+install(TARGETS cviruntime DESTINATION lib)
+install(TARGETS cviruntime-static DESTINATION lib)
diff --git a/cviruntime/src/soc/181x/bm_dmabuf.c b/cviruntime/src/soc/181x/bm_dmabuf.c
new file mode 100644
index 000000000..ec9bda65b
--- /dev/null
+++ b/cviruntime/src/soc/181x/bm_dmabuf.c
@@ -0,0 +1,432 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+
+#include <stdint.h>
+#include <stddef.h>
+#include <inttypes.h>
+#include "bmruntime_internal.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask))
+#define ALIGN(x,a) __ALIGN_MASK(x,(__typeof__(x))(a)-1)
+
+#define BD_DESC_ALIGN_SIZE (1 << BDC_ENGINE_CMD_ALIGNED_BIT)
+#define GDMA_DESC_ALIGN_SIZE (1 << TDMA_DESCRIPTOR_ALIGNED_BIT)
+#define BD_EOD_PADDING_BYTES (128)
+
+typedef struct {
+  cmd_hdr_t hdr;
+  uint32_t body[0];
+} DESC;
+
+static DESC *traverse_start(uint8_t *cmdbuf)
+{
+  TPU_ASSERT(cmdbuf, NULL);
+  DESC *desc = (DESC *)cmdbuf;
+  TPU_ASSERT(desc->hdr.magic == CMDBUF_HDR_MAGIC_181X, NULL);
+  return desc;
+}
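+
+/*
+ * Editor's sketch: a cmdbuf is a packed stream of variable-length records,
+ * each a cmd_hdr_t followed by the engine payload, so walking it looks like
+ *
+ *   for (DESC *d = traverse_start(buf); d; d = traverse_next(d, buf, size))
+ *     handle(d->hdr.engine_id, d->body);
+ *
+ * traverse_start() only validates the first magic; the per-record magic check
+ * in traverse_next() below guards against a corrupt or truncated buffer.
+ */
+static DESC *traverse_next(DESC *desc, uint8_t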
*cmdbuf, size_t size) +{ + DESC *next_desc = (DESC *)((uint8_t *)desc + cmd_hdr_len(&desc->hdr) + sizeof(cmd_hdr_t)); + if ((uint8_t *)next_desc >= cmdbuf + size) + return NULL; + TPU_ASSERT(next_desc->hdr.magic == CMDBUF_HDR_MAGIC_181X, NULL); + return next_desc; +} + +static bool is_last_desc(DESC *desc, uint8_t *cmdbuf, size_t size) +{ + DESC *next_desc = traverse_next(desc, cmdbuf, size); + return next_desc ? false : true; +} + +static void reorder_bd_cmdbuf_reg(uint8_t *cmdbuf) +{ + int total_bits = BD_REG_BYTES * 8; + + for (int i = 0; i < total_bits; i += 128) + cmdbuf[(i + 128 - 8) / 8] |= (i / 128) << 4; + + uint8_t tmp[128 / 8]; + uint8_t *last = &cmdbuf[(total_bits - 128) / 8]; + memcpy(tmp, last, sizeof(tmp)); + memcpy(last, cmdbuf, sizeof(tmp)); + memcpy(cmdbuf, tmp, sizeof(tmp)); +} + +static void adjust_desc_tdma(uint32_t *body, bool eod) +{ + if (eod) { + body[0] |= (1 << TDMA_ACCPI0_EOD_BIT); + body[0] |= (1 << TDMA_ACCPI0_INTERRUPT_BIT); // interrupt + } + body[0] |= (1 << TDMA_ACCPI0_BARRIER_ENABLE_BIT); +} + +static void adjust_desc_bd(uint32_t *body, bool eod) +{ + if (eod) { + tiu_reg_t reg; + parse_tiu_reg(®, body); + reg.cmd_end = 1; + reg.cmd_intr_en = 1; + emit_tiu_reg(®, body); + } + reorder_bd_cmdbuf_reg((uint8_t *)body); +} + +bmerr_t cvi181x_dmabuf_relocate(uint8_t *dmabuf, uint64_t dmabuf_devaddr, uint32_t original_size, uint32_t pmubuf_size) +{ + dma_hdr_t *header = (dma_hdr_t *)dmabuf; + uint64_t tmpAddress = 0; + + TPU_ASSERT(header->dmabuf_magic_m == TPU_DMABUF_HEADER_M, NULL); + cvi_cpu_desc_t *desc = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t)); + + for (uint32_t i = 0; i < header->cpu_desc_count; i++, desc++) { + uint32_t tiu_num = desc->num_tiu & 0xFFFF; + uint32_t tdma_num = desc->num_tdma & 0xFFFF; + + if (tiu_num) { + tmpAddress = dmabuf_devaddr + desc->offset_tiu; + //TPU_LOG_DEBUG("bd tmpAddress = 0x%lu\n", tmpAddress); + desc->offset_tiu_ori_bk = desc->offset_tiu; + desc->offset_tiu = tmpAddress >> BDC_ENGINE_CMD_ALIGNED_BIT; + } + + if (tdma_num) { + tmpAddress = dmabuf_devaddr + desc->offset_tdma; + //TPU_LOG_DEBUG("tdma tmpAddress = 0x%lu\n", tmpAddress); + desc->offset_tdma_ori_bk = desc->offset_tdma; + desc->offset_tdma = tmpAddress >> TDMA_DESCRIPTOR_ALIGNED_BIT; + } + + //set pmubuf_addr_p to enable pmu kick + header->pmubuf_size = pmubuf_size; + header->pmubuf_offset = original_size; + } + return BM_SUCCESS; +} + +static uint32_t desc_sync_id(DESC *desc) +{ + switch (desc->hdr.engine_id) { + case BMK1822_TIU: { + tiu_reg_t reg; + parse_tiu_reg(®, desc->body); + return reg.cmd_id_tpu; + } + case BMK1822_TDMA: { + tdma_reg_t reg; + parse_tdma_reg(®, desc->body); + return reg.cmd_id; + } + default: + TPU_ASSERT(0, NULL); + return 1; + } +} + +static void fill_header_and_arm(uint8_t *cmdbuf, size_t sz, uint8_t *dmabuf, uint64_t *tiu_offset, uint64_t *tdma_offset) +{ + dma_hdr_t header = {0}; + header.dmabuf_magic_m = TPU_DMABUF_HEADER_M; + header.dmabuf_magic_s = 0x1822; + + cvi_cpu_desc_t *segments = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t)); + DESC *desc = NULL; + size_t desc_nums[BMK1822_ENGINE_NUM] = {0}; + size_t counters[BMK1822_ENGINE_NUM] = {0}; + size_t desc_size[BMK1822_ENGINE_NUM] = {0}; + + TPU_ASSERT(segments, NULL); + // fill arm descs + desc = traverse_start(cmdbuf); + + while (desc != NULL) { + uint32_t engine_id = (uint32_t)desc->hdr.engine_id; + counters[engine_id]++; + desc_nums[engine_id]++; + if (engine_id != BMK1822_CPU) { + // a new arm desc inserted to do sync operation + if (desc_sync_id(desc) == 0xFFFF || 
is_last_desc(desc, cmdbuf, sz)) { + desc_nums[BMK1822_CPU]++; + cvi_cpu_desc_t *arm = segments + desc_nums[BMK1822_CPU] - 1; + memset(arm, 0, sizeof(cvi_cpu_desc_t)); + arm->op_type = CPU_OP_SYNC; + arm->num_tiu = counters[BMK1822_TIU]; + arm->num_tdma = counters[BMK1822_TDMA]; + strncpy(arm->str, "layer_end", sizeof(arm->str) - 1); + if (counters[BMK1822_TIU] != 0) { + desc_size[BMK1822_TIU] = + ALIGN(desc_size[BMK1822_TIU] + counters[BMK1822_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES, + BD_DESC_ALIGN_SIZE); + } + counters[BMK1822_TIU] = 0; + counters[BMK1822_TDMA] = 0; + } + } else { + cvi_cpu_desc_t *arm = segments + desc_nums[BMK1822_CPU] - 1; + memcpy(arm, &(desc->body), sizeof(cvi_cpu_desc_t)); + arm->num_tiu = counters[BMK1822_TIU]; + arm->num_tdma = counters[BMK1822_TDMA]; + if (counters[BMK1822_TIU] != 0) { + desc_size[BMK1822_TIU] = + ALIGN(desc_size[BMK1822_TIU] + counters[BMK1822_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES, + BD_DESC_ALIGN_SIZE); + } + counters[BMK1822_TIU] = 0; + counters[BMK1822_TDMA] = 0; + } + desc = traverse_next(desc, cmdbuf, sz); + } + desc_size[BMK1822_CPU] = desc_nums[BMK1822_CPU] * CPU_ENGINE_BYTES; + desc_size[BMK1822_TDMA] = desc_nums[BMK1822_TDMA] * GDMA_DESC_ALIGN_SIZE; + + (*tiu_offset) = ALIGN(sizeof(header) + desc_size[BMK1822_CPU], BD_DESC_ALIGN_SIZE); + (*tdma_offset) = ALIGN((*tiu_offset) + desc_size[BMK1822_TIU], GDMA_DESC_ALIGN_SIZE); + + // dma hdr + arm descs + bd descs + tdma descs + header.dmabuf_size = (*tdma_offset) + desc_size[BMK1822_TDMA]; + header.cpu_desc_count = desc_nums[BMK1822_CPU]; + header.bd_desc_count = desc_nums[BMK1822_TIU]; + header.tdma_desc_count = desc_nums[BMK1822_TDMA]; + + //TPU_LOG_DEBUG("header.dmabuf_size = %d\n", header.dmabuf_size); + //TPU_LOG_DEBUG("header.cpu_desc_count = %d\n", header.cpu_desc_count); + //TPU_LOG_DEBUG("header.bd_desc_count = %d\n", header.bd_desc_count); + //TPU_LOG_DEBUG("header.tdma_desc_count = %d\n", header.tdma_desc_count); + + memcpy(dmabuf, &header, sizeof(header)); +} + +static void fill_bd_and_tdma(uint8_t *cmdbuf, size_t sz, uint8_t *dmabuf, uint64_t tiu_offset, uint64_t tdma_offset) +{ + dma_hdr_t *p_header = (dma_hdr_t *)dmabuf; + cvi_cpu_desc_t *segments = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t)); + DESC *desc = traverse_start(cmdbuf); + //uint64_t address_max = 0x0; + + for (uint32_t i = 0; i < p_header->cpu_desc_count; i++) { + + cvi_cpu_desc_t *arm = segments + i; + + uint32_t tiu_num = arm->num_tiu & 0xFFFF; + uint32_t tdma_num = arm->num_tdma & 0xFFFF; + + if (tiu_num) { + tiu_offset = ALIGN(tiu_offset, 1 << BDC_ENGINE_CMD_ALIGNED_BIT); + arm->offset_tiu = tiu_offset; + //TPU_LOG_DEBUG("arm->offset_tiu = 0x%x \n", arm->offset_tiu); + } + + if (tdma_num) { + tdma_offset = ALIGN(tdma_offset, 1 << TDMA_DESCRIPTOR_ALIGNED_BIT); + arm->offset_tdma = tdma_offset; + //TPU_LOG_DEBUG("arm->offset_tdma = 0x%x \n", arm->offset_tdma); + } + + while (tiu_num || tdma_num) { + uint32_t engine_id = (uint32_t)desc->hdr.engine_id; + void *p_body = NULL; + + switch (engine_id) { + case BMK1822_TIU: + tiu_num--; + p_body = (void *)(dmabuf + tiu_offset); + tiu_offset += BD_REG_BYTES; + memcpy(p_body, desc->body, desc->hdr.len); + adjust_desc_bd((uint32_t *)p_body, tiu_num == 0); + break; + case BMK1822_TDMA: + tdma_num--; + tdma_offset = ALIGN(tdma_offset, GDMA_DESC_ALIGN_SIZE); + p_body = (void *)(dmabuf + tdma_offset); + tdma_offset += GDMA_DESC_ALIGN_SIZE; + memcpy(p_body, desc->body, desc->hdr.len); + +#if 0 //debug feature, for checking if neuron overshoot +{ + tdma_reg_t 
reg_tdma = {0};
+  uint64_t tdma_address = 0, tdma_address2 = 0;
+
+  parse_tdma_reg(&reg_tdma, p_body);
+
+  if (reg_tdma.src_base_reg_sel == 0) {
+    // reg.trans_dir = 2; // 0:tg2l, 1:l2tg, 2:g2g, 3:l2l
+    if (reg_tdma.trans_dir == 0) {
+      TPU_LOG_DEBUG ("src_base_addr_high=%x, src_base_addr_low=%x\n", reg_tdma.src_base_addr_high, reg_tdma.src_base_addr_low);
+      tdma_address = ((uint64_t)reg_tdma.src_base_addr_high) << 32 | (uint64_t)reg_tdma.src_base_addr_low;
+    } else if (reg_tdma.trans_dir == 1) {
+      TPU_LOG_DEBUG ("dst_base_addr_high=%x, dst_base_addr_low=%x\n", reg_tdma.dst_base_addr_high, reg_tdma.dst_base_addr_low);
+      tdma_address = ((uint64_t)reg_tdma.dst_base_addr_high) << 32 | (uint64_t)reg_tdma.dst_base_addr_low;
+    } else if (reg_tdma.trans_dir == 2) {
+      TPU_LOG_DEBUG ("dst_base_addr_high=%x, dst_base_addr_low=%x\n", reg_tdma.dst_base_addr_high, reg_tdma.dst_base_addr_low);
+      tdma_address = ((uint64_t)reg_tdma.src_base_addr_high) << 32 | (uint64_t)reg_tdma.src_base_addr_low;
+      tdma_address2 = ((uint64_t)reg_tdma.dst_base_addr_high) << 32 | (uint64_t)reg_tdma.dst_base_addr_low;
+
+      if (tdma_address2 > tdma_address) {
+        tdma_address = tdma_address2;
+      }
+    }
+
+    if (tdma_address > address_max) {
+      address_max = tdma_address;
+      TPU_LOG_DEBUG("address_max=%llx\n", address_max);
+    }
+  }
+}
+#endif
+          adjust_desc_tdma((uint32_t *)p_body, tdma_num == 0);
+          break;
+        default:
+          break;
+      }
+      desc = traverse_next(desc, cmdbuf, sz);
+    }
+
+    // padding zero after eod to workaround hardware bug
+    if (arm->num_tiu & 0xFFFF) {
+      void *buf = (void *)(dmabuf + tiu_offset);
+      memset(buf, 0, BD_EOD_PADDING_BYTES);
+      tiu_offset += BD_EOD_PADDING_BYTES;
+    }
+  }
+
+}
+
+bmerr_t cvi181x_dmabuf_convert(uint8_t *cmdbuf, size_t sz, uint8_t *dmabuf)
+{
+  uint64_t tiu_offset = 0;
+  uint64_t tdma_offset = 0;
+  fill_header_and_arm(cmdbuf, sz, dmabuf, &tiu_offset, &tdma_offset);
+  fill_bd_and_tdma(cmdbuf, sz, dmabuf, tiu_offset, tdma_offset);
+  return BM_SUCCESS;
+}
+
+#define PER_DES_SIZE 16
+#define PADDING_SIZE (1024 * 1024)
+bmerr_t cvi181x_dmabuf_size(uint8_t *cmdbuf, size_t sz, size_t *psize, size_t *pmu_size)
+{
+  size_t tdma_desc_num = 0;
+  size_t counters[BMK1822_ENGINE_NUM] = {0};
+  size_t bd_size = 0;
+  size_t dmabuf_size = 0;
+
+  uint32_t tiu_cnt = 0;
+  uint32_t tdma_cnt = 0;
+
+  // calculate desc numbers
+  DESC *desc = traverse_start(cmdbuf);
+
+  while (desc != NULL) {
+    uint32_t engine_id = (uint32_t)desc->hdr.engine_id;
+    counters[engine_id]++;
+    if (engine_id != BMK1822_CPU) {
+      // a new arm desc inserted to do sync operation
+      if (desc_sync_id(desc) == 0xFFFF || is_last_desc(desc, cmdbuf, sz)) {
+        counters[BMK1822_CPU]++;
+        tdma_desc_num += counters[BMK1822_TDMA];
+        if (counters[BMK1822_TIU] != 0) {
+          bd_size = ALIGN(bd_size + counters[BMK1822_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES,
+                          BD_DESC_ALIGN_SIZE);
+        }
+        tiu_cnt += counters[BMK1822_TIU] & 0xFFFF;
+        tdma_cnt += counters[BMK1822_TDMA] & 0xFFFF;
+        counters[BMK1822_TIU] = 0;
+        counters[BMK1822_TDMA] = 0;
+      }
+    } else {
+      tdma_desc_num += counters[BMK1822_TDMA];
+      if (counters[BMK1822_TIU] != 0) {
+        bd_size = ALIGN(bd_size + counters[BMK1822_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES,
+                        BD_DESC_ALIGN_SIZE);
+      }
+      tiu_cnt += counters[BMK1822_TIU] & 0xFFFF;
+      tdma_cnt += counters[BMK1822_TDMA] & 0xFFFF;
+      counters[BMK1822_TIU] = 0;
+      counters[BMK1822_TDMA] = 0;
+    }
+    desc = traverse_next(desc, cmdbuf, sz);
+  }
+  // dma hdr + arm descs + bd descs + tdma descs
+  dmabuf_size = sizeof(dma_hdr_t) + counters[BMK1822_CPU] * CPU_ENGINE_BYTES;
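+  /*
+   * Editor's note: ALIGN(x, a) rounds x up to a multiple of a via
+   * (x + a - 1) & ~(a - 1), e.g. ALIGN(100, 16) == 112. The running total
+   * below is therefore: header plus CPU descs, rounded to the TIU alignment,
+   * plus TIU descs, rounded to the TDMA alignment, plus TDMA descs.
+   */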
+  dmabuf_size = ALIGN(dmabuf_size, BD_DESC_ALIGN_SIZE) + bd_size;
+  dmabuf_size = ALIGN(dmabuf_size, GDMA_DESC_ALIGN_SIZE) + tdma_desc_num * GDMA_DESC_ALIGN_SIZE;
+
+  *psize = dmabuf_size;
+
+  *pmu_size = ALIGN((tiu_cnt + tdma_cnt) * PER_DES_SIZE + PADDING_SIZE, 0x1000);
+  return BM_SUCCESS;
+}
+
+void cvi181x_arraybase_set(uint8_t *dmabuf, uint32_t arraybase0L, uint32_t arraybase1L, uint32_t arraybase0H, uint32_t arraybase1H)
+{
+  TPU_ASSERT(dmabuf, NULL);
+  dma_hdr_t *header = (dma_hdr_t *)dmabuf;
+
+  TPU_ASSERT(header->dmabuf_magic_m == TPU_DMABUF_HEADER_M, NULL);
+  header->arraybase_0_L = arraybase0L;
+  header->arraybase_1_L = arraybase1L;
+  header->arraybase_0_H = arraybase0H;
+  header->arraybase_1_H = arraybase1H;
+  return;
+}
+
+uint64_t cvi181x_get_pmusize(uint8_t * dmabuf)
+{
+  uint32_t tiu_cnt = 0, tdma_cnt = 0;
+  uint64_t pmu_size = 0;
+
+  TPU_ASSERT(dmabuf, NULL);
+  dma_hdr_t *header = (dma_hdr_t *)dmabuf;
+  TPU_ASSERT(header->dmabuf_magic_m == TPU_DMABUF_HEADER_M, NULL);
+
+  cvi_cpu_desc_t *desc = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t));
+
+  for (uint32_t i = 0; i < header->cpu_desc_count; i++, desc++) {
+    tiu_cnt += (desc->num_tiu & 0xFFFF);
+    tdma_cnt += (desc->num_tdma & 0xFFFF);
+  }
+
+  pmu_size = ALIGN((tiu_cnt + tdma_cnt) * PER_DES_SIZE + PADDING_SIZE, 0x1000);
+  //TPU_LOG_DEBUG("cvi181x_get_pmusize pmusize= %" PRIu64 " \n", pmu_size);
+  return pmu_size;
+}
+void cvi181x_dmabuf_dump(uint8_t *dmabuf)
+{
+  TPU_ASSERT(dmabuf, NULL);
+  dma_hdr_t *header = (dma_hdr_t *)dmabuf;
+  //TPU_LOG_DEBUG("bmk1822_dmabuf_dump header->arraybase_0_L = 0x%x\n", header->arraybase_0_L);
+  //TPU_LOG_DEBUG("bmk1822_dmabuf_dump header->arraybase_1_L = 0x%x\n", header->arraybase_1_L);
+  //TPU_LOG_DEBUG("bmk1822_dmabuf_dump header->arraybase_0_H = 0x%x\n", header->arraybase_0_H);
+  //TPU_LOG_DEBUG("bmk1822_dmabuf_dump header->arraybase_1_H = 0x%x\n", header->arraybase_1_H);
+  //TPU_LOG_DEBUG("bmk1822_dmabuf_dump header->pmubuf_offset = 0x%x\n", header->pmubuf_offset);
+
+  TPU_ASSERT(header->dmabuf_magic_m == TPU_DMABUF_HEADER_M, NULL);
+  cvi_cpu_desc_t *desc = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t));
+
+  for (u32 i = 0; i < header->cpu_desc_count; i++, desc++) {
+    int bd_num = desc->num_tiu & 0xFFFF;
+    int tdma_num = desc->num_tdma & 0xFFFF;
+    u32 bd_offset = desc->offset_tiu;
+    u32 tdma_offset = desc->offset_tdma;
+    TPU_LOG_DEBUG("cvi181x_dmabuf_dump num<%d, %d>, offset<0x%08x, 0x%08x>\n", bd_num, tdma_num, bd_offset, tdma_offset);
+  }
+}
+
+#ifdef __cplusplus
+}
+#endif
+
diff --git a/cviruntime/src/soc/181x/bmruntime_internal.h b/cviruntime/src/soc/181x/bmruntime_internal.h
new file mode 100644
index 000000000..6a32e7156
--- /dev/null
+++ b/cviruntime/src/soc/181x/bmruntime_internal.h
@@ -0,0 +1,32 @@
+#ifndef _BM_RUNTIME_INTERNAL_H_
+#define _BM_RUNTIME_INTERNAL_H_
+
+#include <stdint.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include "cvitpu_debug.h"
+#include "bm_types.h"
+
+#ifdef __cplusplus
+  extern "C" {
+#endif
+
+bmerr_t cvi181x_dmabuf_size(uint8_t *cmdbuf, size_t sz, size_t *psize, size_t *pmu_size);
+bmerr_t cvi181x_dmabuf_relocate(uint8_t *dmabuf, uint64_t dmabuf_devaddr, uint32_t original_size, uint32_t pmubuf_size);
+bmerr_t cvi181x_dmabuf_convert(uint8_t *cmdbuf, size_t sz, uint8_t *dmabuf);
+void cvi181x_dmabuf_dump(uint8_t * dmabuf);
+void cvi181x_arraybase_set(uint8_t *dmabuf, uint32_t arraybase0L, uint32_t arraybase1L, uint32_t arraybase0H, uint32_t arraybase1H);
+uint64_t cvi181x_get_pmusize(uint8_t * dmabuf);
+
+uint32_t tpu_pmu_dump_main(uint8_t *v_dma_buf, uint64_t p_dma_buf);
+
TPU_PMUBUF_SIZE (1024 * 1024 * 2) +#define TPU_DMABUF_HEADER_M 0xB5B5 + +#ifdef __cplusplus +} +#endif + +#endif /* _BM_RUNTIME_INTERNAL_H_ */ diff --git a/cviruntime/src/soc/181x/bmruntime_soc.cpp b/cviruntime/src/soc/181x/bmruntime_soc.cpp new file mode 100644 index 000000000..fdf556408 --- /dev/null +++ b/cviruntime/src/soc/181x/bmruntime_soc.cpp @@ -0,0 +1,173 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "bmruntime_internal.h" +#include "cvi181x_device_mem.h" + +Cvi181xDeviceMem cvi_device; + +bmmem_device_t bmmem_device_alloc_raw(bmctx_t ctx, size_t size) { + return cvi_device.mem_alloc_raw(ctx, size); +} + +bmmem_device_t bmmem_device_prealloc_raw(bmctx_t ctx, bmmem_device_t mem, uint64_t offset, + size_t size) { + return cvi_device.mem_prealloc_raw(ctx, mem, offset, size); +} + +void bmmem_device_free(bmctx_t ctx, bmmem_device_t mem) { + cvi_device.mem_free_raw(ctx, mem); +} + +void bmmem_device_free_ex(uint64_t p_addr) { + cvi_device.mem_free_ex(p_addr); +} + +size_t bmmem_device_size(bmmem_device_t mem) { + return cvi_device.mem_size(mem); +} + +uint64_t bmmem_device_addr(bmmem_device_t mem) { + return cvi_device.mem_p_addr(mem); +} + +uint8_t *bmmem_device_v_addr(bmmem_device_t mem) { + return cvi_device.mem_v_addr(mem); +} + +int32_t bmmem_device_inc_ref(bmmem_device_t mem) { + return cvi_device.mem_inc_ref(mem); +} + +int32_t bmmem_device_dec_ref(bmmem_device_t mem) { + return cvi_device.mem_dec_ref(mem); +} + +bmerr_t bm_memcpy_s2d(bmctx_t ctx, bmmem_device_t dst, uint8_t *src) { + return cvi_device.mem_memcpy_s2d(ctx, dst, src); +} + +bmerr_t bm_memcpy_s2d_ex(bmctx_t ctx, bmmem_device_t dst, uint8_t *src, uint64_t offset, + size_t size) { + return cvi_device.mem_memcpy_s2d_ex(ctx, dst, src, offset, size); +} + +bmerr_t bm_memcpy_d2s(bmctx_t ctx, uint8_t *dst, bmmem_device_t src) { + return cvi_device.mem_memcpy_d2s(ctx, dst, src); +} + +bmerr_t bm_memcpy_d2s_ex(bmctx_t ctx, uint8_t *dst, bmmem_device_t src, uint64_t offset, + size_t size) { + return cvi_device.mem_memcpy_d2s_ex(ctx, dst, src, offset, size); +} + +bmerr_t bm_context_create(bmctx_t *ctx) { + return cvi_device.context_create(ctx); +} + +bmerr_t bm_bind_device(bmctx_t ctx, bmdev_t dev) { + return cvi_device.bind_device(ctx, dev); +} + +void bm_unbind_device(bmctx_t ctx) { + return cvi_device.unbind_device(ctx); +} + +bmdev_t bm_get_device(bmctx_t ctx) { + return cvi_device.get_device(ctx); +} + +bmerr_t bm_init(int index, bmctx_t *ctx) { + return cvi_device.device_init(index, ctx); +} + +void bm_exit(bmctx_t ctx) { + cvi_device.device_exit(ctx); +} + +bmerr_t bm_load_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + uint64_t neuron_gaddr, uint64_t weight_gaddr, + bool enable_pmu, bmmem_device_t *cmdbuf_mem) { + return cvi_device.load_cmdbuf(ctx, cmdbuf, sz, neuron_gaddr, + weight_gaddr, enable_pmu, cmdbuf_mem); +} + +bmerr_t cvi_load_cmdbuf_tee(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + uint64_t neuron_gaddr, uint64_t weight_gaddr, uint32_t weight_len, bmmem_device_t *cmdbuf_mem) +{ + return cvi_device.load_cmdbuf_tee(ctx, cmdbuf, sz, neuron_gaddr, + weight_gaddr, weight_len, cmdbuf_mem); +} + +bmerr_t cvi_run_cmdbuf_tee(bmctx_t ctx, uint16_t *seq_no, uint64_t dmabuf_addr, cvi_array_base *array_base) +{ + return cvi_device.run_cmdbuf_tee(ctx, seq_no, dmabuf_addr, array_base); +} + +bmerr_t bm_run_cmdbuf(bmctx_t ctx, bmmem_device_t cmdbuf_mem, uint16_t *seq_no) { + return cvi_device.run_cmdbuf(ctx, cmdbuf_mem, seq_no); +} + +bmerr_t bm_run_cmdbuf_ex(bmctx_t ctx, 
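
bmruntime_soc.cpp above is a thin C-linkage facade that forwards every bm_* entry point to the global Cvi181xDeviceMem instance. A minimal host-side sketch of the intended call sequence, assuming some public bmruntime header declares these entry points (the header name, device index, and gaddr values here are illustrative placeholders, not taken from this diff):

#include <cstddef>
#include <cstdint>
#include "bmruntime.h"  // hypothetical public header declaring the bm_* API used below

// Sketch: load a compiled cmdbuf, run it once, and tear everything down.
int run_once(uint8_t *cmdbuf, size_t sz, uint64_t neuron_gaddr, uint64_t weight_gaddr) {
  bmctx_t ctx;
  if (bm_init(0, &ctx) != BM_SUCCESS)        // device index 0 assumed
    return -1;

  bmmem_device_t cmdbuf_mem;
  if (bm_load_cmdbuf(ctx, cmdbuf, sz, neuron_gaddr, weight_gaddr,
                     /*enable_pmu=*/false, &cmdbuf_mem) != BM_SUCCESS) {
    bm_exit(ctx);
    return -1;
  }

  uint16_t seq_no;
  bm_run_cmdbuf(ctx, cmdbuf_mem, &seq_no);   // enqueue the converted dmabuf
  bm_wait_cmdbuf_done(ctx, seq_no);          // block until the TPU signals completion

  bmmem_device_free(ctx, cmdbuf_mem);
  bm_exit(ctx);
  return 0;
}
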
bmmem_device_t cmdbuf_mem, uint16_t *seq_no, + uint64_t input_base_addr, uint64_t output_base_addr) { + return cvi_device.run_cmdbuf_ex(ctx, cmdbuf_mem, seq_no, input_base_addr, output_base_addr); +} + +bmerr_t bm_run_cmdbuf_ex2(bmctx_t ctx, bmmem_device_t cmdbuf_mem, uint16_t *seq_no, + cvi_array_base *p_array_base) { + return cvi_device.run_cmdbuf_ex2(ctx, cmdbuf_mem, seq_no, p_array_base); +} + +bmerr_t cvi_run_async(bmctx_t ctx, bmmem_device_t cmdbuf_mem) +{ + return cvi_device.run_async(ctx, cmdbuf_mem); +} + +bmerr_t bm_send_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, uint16_t *seq_no) { + return cvi_device.send_cmdbuf(ctx, cmdbuf, sz, seq_no); +} + +bmerr_t bm_wait_cmdbuf_done(bmctx_t ctx, uint16_t seq_no) { + return cvi_device.wait_cmdbuf_done(ctx, seq_no); +} + +bmerr_t cvi_wait_cmdbuf_all(bmctx_t ctx) { + return cvi_device.wait_cmdbuf_all(ctx); +} + +bmerr_t bm_run_cmdbuf_pio(bmctx_t ctx, uint8_t *cmdbuf, size_t sz) { + return cvi_device.run_cmdbuf_pio(ctx, cmdbuf, sz); +} + +void bm_device_set_base_reg(bmctx_t ctx, uint32_t inx, uint64_t addr) { + cvi_device.set_base_reg(ctx, inx, addr); +} + +uint64_t bm_device_read_base_reg(bmctx_t ctx, u32 inx) { + return cvi_device.read_base_reg(ctx, inx); +} + +int bm_device_get_chip_ver(bmdev_t dev) { + return cvi_device.get_chip_ver(dev); +} + +bmerr_t bm_parse_pmubuf(bmmem_device_t cmdbuf_mem, uint8_t **buf_start, uint32_t *buf_len) { + return cvi_device.parse_pmubuf(cmdbuf_mem, buf_start, buf_len); +} + +void cviruntime_cvikernel_create(bmctx_t ctx, void **p_bk_ctx) { + cvi_device.cvikernel_create(ctx, p_bk_ctx); +} + +void cviruntime_cvikernel_submit(bmctx_t ctx) { + cvi_device.cvikernel_submit(ctx); +} + +void cviruntime_cvikernel_destroy(bmctx_t ctx) { + cvi_device.cvikernel_destroy(ctx); +} diff --git a/cviruntime/src/soc/181x/cvi181x_device_mem.cpp b/cviruntime/src/soc/181x/cvi181x_device_mem.cpp new file mode 100644 index 000000000..4c8a82a66 --- /dev/null +++ b/cviruntime/src/soc/181x/cvi181x_device_mem.cpp @@ -0,0 +1,260 @@ +#include +#include +#include +#include "cvi181x_device_mem.h" + +Cvi181xDeviceMem::Cvi181xDeviceMem() { + GLOBAL_MEM_START_ADDR = 0x00; + g_gmem_size = 1ULL << 30; // 1GB + tpu_dmabuf_header_m = 0xB5B5; +} + +Cvi181xDeviceMem::~Cvi181xDeviceMem() {} + + +bmerr_t Cvi181xDeviceMem::device_open(int index, bmdev_t *dev) +{ + bm_device_t *pdev = new bm_device_t; + + BMDEV_LOCK_INIT(pdev); + pdev->index = index; + pdev->info.info182x = bmk1822_chip_info(); + pdev->gmem_size = g_gmem_size; + + const char* tpu_dev_name_defalut = TPU_DEV_NAME; + const char* tpu_dev_name_env = std::getenv("TPU_DEV"); + const char *tpu_dev_name = tpu_dev_name_defalut; + if (tpu_dev_name_env) { + tpu_dev_name = tpu_dev_name_env; + } + + pdev->dev_fd = open(tpu_dev_name, O_RDWR); + if (pdev->dev_fd <= 0) { + TPU_LOG_WARNING("open %s failed\n", tpu_dev_name); + return BM_ERR_FAILURE; + } + + pdev->ion_fd = open(ION_DEV_NAME, O_RDWR); + if (pdev->ion_fd <= 0) { + TPU_LOG_WARNING("open %s failed\n", ION_DEV_NAME); + return BM_ERR_FAILURE; + } + + int ret = ion_query_heap(pdev); + TPU_ASSERT(ret == BM_SUCCESS, NULL); + + *dev = pdev; + + return BM_SUCCESS; +} + +void Cvi181xDeviceMem::device_close(bmdev_t dev) +{ + close(dev->ion_fd); + close(dev->dev_fd); + + // TPU_LOG_WARNING("device[%d] closed\n", dev->index); + + BMDEV_LOCK_DEINIT(dev); + delete dev; +} + +int Cvi181xDeviceMem::get_chip_ver(bmdev_t dev) { + return dev->info.info182x.version; +} + +void Cvi181xDeviceMem::mem_free_raw(bmctx_t ctx, bmmem_device_t mem) { + char 
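
device_open above resolves the TPU character device from the TPU_DEV environment variable before falling back to the compiled-in TPU_DEV_NAME, so tests can redirect the runtime without rebuilding. Note that open() returns -1 on failure, so the `<= 0` checks additionally treat fd 0 as an error, and the ion open failure path returns without closing dev_fd or freeing pdev. A sketch of exercising the override (the node path is a placeholder):

#include <cstdlib>

// Hypothetical test helper: must run before the first bm_init()/device_open(),
// since the path is read once via std::getenv("TPU_DEV").
void use_alternate_tpu_node() {
  setenv("TPU_DEV", "/dev/cvi-tpu1", /*overwrite=*/1);  // placeholder node path
}
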
array_got = 0; + bm_memory_t *device_mem = (bm_memory_t *)mem; + TPU_ASSERT(device_mem->flags.u.type == BMMEM_TYPE_DEVICE, NULL); + + if (!device_mem->flags.u.is_prealloc) { + mem_free(device_mem->v_addr, device_mem->size, device_mem->dma_fd); + + for (int i = 0; i < MEMARRAY_MAX_CNT; i ++) { + if (ctx->root_mem_array[i].p_addr == device_mem->p_addr) { + ctx->root_mem_array[i].p_addr = 0; + ctx->root_mem_array[i].mem = NULL; + array_got = 1; + break; + } + } + + if (!array_got) + TPU_LOG_WARNING("bmmem_device_free() can not find match\n"); + } + + BMEMEM_DUMP(); + delete device_mem; +} + +bmerr_t Cvi181xDeviceMem::load_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + uint64_t neuron_gaddr, uint64_t weight_gaddr, + bool enable_pmu, bmmem_device_t *cmdbuf_mem) { + bmerr_t ret; + size_t dmabuf_size = 0; + size_t pmubuf_size = 0; + bmmem_device_t dmabuf_mem; + + ret = cvi181x_dmabuf_size(cmdbuf, sz, &dmabuf_size, &pmubuf_size); + + //calculate pmu size +#ifdef ENABLE_PMU + pmubuf_size = enable_pmu ? pmubuf_size : 0; +#else + pmubuf_size = 0; +#endif + //TPU_LOG_DEBUG("pmubuf_size = 0x%lx\n", pmubuf_size); + if (protect) { + dmabuf_mem = mem_alloc_pagesize(ctx, dmabuf_size + pmubuf_size); + } else { + dmabuf_mem = mem_alloc_raw(ctx, dmabuf_size + pmubuf_size); + } + if (!dmabuf_mem) { + TPU_LOG_ERROR("alloc dmabuf mem fail!\n"); + return BM_ERR_NOMEM; + } + uint64_t dmabuf_devaddr = mem_p_addr(dmabuf_mem); + + ret = cvi181x_dmabuf_convert(cmdbuf, sz, dmabuf_mem->v_addr); + set_base_reg(ctx, 0, neuron_gaddr); + set_base_reg(ctx, 1, weight_gaddr); + cvi181x_arraybase_set(dmabuf_mem->v_addr, (u32)neuron_gaddr, (u32)weight_gaddr, 0, 0); + + cvi181x_dmabuf_relocate(dmabuf_mem->v_addr, dmabuf_devaddr, dmabuf_size, pmubuf_size); + + TPU_ASSERT(mem_flush_ext(ctx->dev, dmabuf_mem->dma_fd, + dmabuf_mem->p_addr, dmabuf_size) == BM_SUCCESS, NULL); + + // record dmabuf crc32 + // dmabuf_mem->crc32 = bm_crc32(dmabuf, dmabuf_size); + *cmdbuf_mem = dmabuf_mem; + // if (0) { + // cvi181x_dmabuf_dump(dmabuf); + //} + return ret; +} + +bmerr_t Cvi181xDeviceMem::load_dmabuf(bmctx_t ctx, bmmem_device_t dmabuf, + size_t sz, uint64_t neuron_gaddr, uint64_t weight_gaddr, + bool enable_pmu, bmmem_device_t *dmabuf_mem) { + size_t pmubuf_size = 0; + if (enable_pmu) { + pmubuf_size = cvi181x_get_pmusize(dmabuf->v_addr); + *dmabuf_mem = mem_alloc_raw(ctx, sz + pmubuf_size); + if (*dmabuf_mem == nullptr) { + TPU_LOG_ERROR("alloc dmabuf mem fail!\n"); + return BM_ERR_NOMEM; + } + std::memcpy((*dmabuf_mem)->v_addr, dmabuf->v_addr, sz); + } else { + *dmabuf_mem = dmabuf; + } + uint64_t dmabuf_devaddr = mem_p_addr(*dmabuf_mem); + + //set_base_reg(ctx, 0, neuron_gaddr); + //set_base_reg(ctx, 1, weight_gaddr); + cvi181x_arraybase_set((*dmabuf_mem)->v_addr, (u32)neuron_gaddr, (u32)weight_gaddr, 0, 0); + + cvi181x_dmabuf_relocate((*dmabuf_mem)->v_addr, dmabuf_devaddr, sz, + pmubuf_size); + TPU_ASSERT(mem_flush_ext(ctx->dev, (*dmabuf_mem)->dma_fd, + (*dmabuf_mem)->p_addr, sz) == BM_SUCCESS, + NULL); + // if (0) { + //cvi181x_dmabuf_dump(dmabuf); + //} + return BM_SUCCESS; +} + + +bmerr_t Cvi181xDeviceMem::load_cmdbuf_tee(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + uint64_t neuron_gaddr, uint64_t weight_gaddr, + uint32_t weight_len, bmmem_device_t *cmdbuf_mem) +{ + //bmerr_t ret; + bmmem_device_t dmabuf_mem; + + //malloc double size buffer, because TEE needs 2nd space to calculate dmabuf + if (protect) { + dmabuf_mem = mem_alloc_pagesize(ctx, sz + sz); + } else { + dmabuf_mem = mem_alloc_raw(ctx, sz + sz); + } + if (!dmabuf_mem) { + 
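
load_cmdbuf above makes a single contiguous device allocation for the converted dmabuf plus, optionally, the PMU region that the hardware appends profiling records to; that region is reserved only when both the ENABLE_PMU compile-time option and the enable_pmu runtime flag are set. A condensed sketch of that sizing decision, with the two sizes assumed to come from cvi181x_dmabuf_size:

#include <cstddef>

// Sketch of the allocation-size rule used by load_cmdbuf.
size_t total_alloc_size(size_t dmabuf_size, size_t pmubuf_size, bool enable_pmu) {
#ifdef ENABLE_PMU
  return dmabuf_size + (enable_pmu ? pmubuf_size : 0);
#else
  (void)enable_pmu;   // PMU support compiled out entirely
  (void)pmubuf_size;
  return dmabuf_size;
#endif
}
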
TPU_LOG_ERROR("alloc dmabuf mem fail!\n"); + return BM_ERR_NOMEM; + } + + //transfer encrypted cmdbuf to TEE + memcpy(dmabuf_mem->v_addr, cmdbuf, sz); + TPU_ASSERT((int)mem_flush_ext(ctx->dev, dmabuf_mem->dma_fd, + dmabuf_mem->p_addr, sz) == BM_SUCCESS, NULL); + + //ioctl to get secure dma buffer + load_tee(ctx, dmabuf_mem->p_addr, sz, weight_gaddr, weight_len, neuron_gaddr); + + //this region should be protected, can't touch in REE + *cmdbuf_mem = dmabuf_mem; + return 0; +} + +bmerr_t Cvi181xDeviceMem::unload_tee(bmctx_t ctx, uint64_t paddr, size_t size) { + TPU_ASSERT(0, NULL); // not support + return BM_SUCCESS; +} + +bmerr_t Cvi181xDeviceMem::parse_pmubuf(bmmem_device_t cmdbuf_mem, uint8_t **buf_start, uint32_t *buf_len) { +#ifdef ENABLE_PMU + dma_hdr_t *header = (dma_hdr_t *)(cmdbuf_mem->v_addr); + //TPU_LOG_DEBUG("header->arraybase_0_L = 0x%x\n", header->arraybase_0_L); + //TPU_LOG_DEBUG("header->arraybase_1_L = 0x%x\n", header->arraybase_1_L); + //TPU_LOG_DEBUG("header->arraybase_0_H = 0x%x\n", header->arraybase_0_H); + //TPU_LOG_DEBUG("header->arraybase_1_H = 0x%x\n", header->arraybase_1_H); + //TPU_LOG_DEBUG("header->pmubuf_offset = 0x%x\n", header->pmubuf_offset); + //TPU_LOG_DEBUG("header->pmubuf_size = 0x%x\n", header->pmubuf_size); + if (header->pmubuf_size && header->pmubuf_offset) { + tpu_pmu_dump_main(cmdbuf_mem->v_addr, cmdbuf_mem->p_addr); + } + *buf_start = cmdbuf_mem->v_addr; + *buf_len = cmdbuf_mem->size; +#endif + return BM_SUCCESS; +} + +void Cvi181xDeviceMem::cvikernel_create(bmctx_t ctx, void **p_bk_ctx) { + TPU_ASSERT(ctx != nullptr, nullptr); + TPU_ASSERT(ctx->dev != nullptr, nullptr); + + bmk1822_chip_info_t info = bmk1822_chip_info(); + bmk1822_chip_info_t *dev_info = &info; + + bmk_info_t bmk_info; + bmk_info.chip_version = dev_info->version; + bmk_info.cmdbuf_size = 0x100000; + bmk_info.cmdbuf = (u8 *)malloc(bmk_info.cmdbuf_size); + TPU_ASSERT(bmk_info.cmdbuf, "create cvikernel, malloc failed\n"); + + ctx->cvik_context.ctx182x = bmk1822_register(&bmk_info); + ctx->cvik_cmdbuf = (void *)bmk_info.cmdbuf; + + *p_bk_ctx = ctx->cvik_context.ctx182x; +} + +void Cvi181xDeviceMem::cvikernel_submit(bmctx_t ctx) { + u32 len; + u8 *cmdbuf = bmk1822_acquire_cmdbuf(ctx->cvik_context.ctx182x, &len); + + uint16_t seq_no; + bmerr_t ret = send_cmdbuf(ctx, cmdbuf, (size_t)len, &seq_no); + TPU_ASSERT(ret == BM_SUCCESS, NULL); + bmk1822_reset(ctx->cvik_context.ctx182x); +} + +void Cvi181xDeviceMem::cvikernel_destroy(bmctx_t ctx) { + TPU_ASSERT(ctx->cvik_context.ctx182x, NULL); + TPU_ASSERT(ctx->cvik_cmdbuf, NULL); + + bmk1822_cleanup(ctx->cvik_context.ctx182x); + free(ctx->cvik_cmdbuf); +} diff --git a/cviruntime/src/soc/181x/cvi181x_device_mem.h b/cviruntime/src/soc/181x/cvi181x_device_mem.h new file mode 100644 index 000000000..bf758fbd0 --- /dev/null +++ b/cviruntime/src/soc/181x/cvi181x_device_mem.h @@ -0,0 +1,36 @@ +#pragma once + +#include +#include +#include +#include +#include +#include "cvi_device_mem.h" +#include "bmruntime_internal.h" + +class Cvi181xDeviceMem : public CviDeviceMem { +public: + Cvi181xDeviceMem(); + ~Cvi181xDeviceMem() override; + virtual bmerr_t device_open(int index, bmdev_t *dev) override; + virtual void device_close(bmdev_t dev) override; + virtual int get_chip_ver(bmdev_t dev) override; + virtual void mem_free_raw(bmctx_t ctx, bmmem_device_t mem); + virtual bmerr_t load_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + uint64_t neuron_gaddr, uint64_t weight_gaddr, + bool enable_pmu, bmmem_device_t *cmdbuf_mem) override; + virtual bmerr_t 
load_dmabuf(bmctx_t ctx, bmmem_device_t dmabuf, + size_t sz, uint64_t neuron_gaddr, + uint64_t weight_gaddr, bool enable_pmu, + bmmem_device_t *dmabuf_mem) override; + + virtual bmerr_t load_cmdbuf_tee(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + uint64_t neuron_gaddr, uint64_t weight_gaddr, + uint32_t weight_len, + bmmem_device_t *cmdbuf_mem); + virtual bmerr_t unload_tee(bmctx_t ctx, uint64_t paddr, size_t size); + virtual bmerr_t parse_pmubuf(bmmem_device_t cmdbuf_mem, uint8_t **buf_start, uint32_t *buf_len); + virtual void cvikernel_create(bmctx_t ctx, void **p_bk_ctx) override; + virtual void cvikernel_submit(bmctx_t ctx) override; + virtual void cvikernel_destroy(bmctx_t ctx) override; +}; diff --git a/cviruntime/src/soc/181x/cvi_rt_181x.cpp b/cviruntime/src/soc/181x/cvi_rt_181x.cpp new file mode 100644 index 000000000..b0888a4f2 --- /dev/null +++ b/cviruntime/src/soc/181x/cvi_rt_181x.cpp @@ -0,0 +1,82 @@ +#include "cvi_rt_181x.h" + +std::unique_ptr cvi_chip(new CviRT181x()); + +CviRT181x::CviRT181x() { + chip_name_ = "cv181x"; + submit_magic_ = 0x18225678; + cvi_device = std::move(std::unique_ptr(new Cvi181xDeviceMem())); +} + +CviRT181x::~CviRT181x() {} + +CVI_RT_KHANDLE CviRT181x::GetKHandleBK(CVI_RT_HANDLE rt_handle) { + bmctx_t ctx = (bmctx_t)rt_handle; + return (CVI_RT_KHANDLE)(ctx->cvik_context.ctx182x); +} + +CVI_RC CviRT181x::DeInitBK(CVI_RT_HANDLE rt_handle) { + bmctx_t ctx = (bmctx_t)rt_handle; + + //deinit kernel related + if (ctx->cvik_context.ctx182x) { + bmk1822_cleanup(ctx->cvik_context.ctx182x); + } + + if (ctx->cvik_cmdbuf) { + free(ctx->cvik_cmdbuf); + } + + //deinit basic context + bm_exit(ctx); + return CVI_SUCCESS; +} + +CVI_RC CviRT181x::InitWithKernelBK(CVI_RT_HANDLE *rt_handle, uint32_t cmdbuf_size) { + bmctx_t *ctx = (bmctx_t *)rt_handle; + + //init basic context + bm_init(DEVICE_INDEX_NUM, ctx); + + //init cvikernel related + bmk1822_chip_info_t info = bmk1822_chip_info(); + bmk1822_chip_info_t *dev_info = &info; + + bmk_info_t bmk_info; + bmk_info.chip_version = dev_info->version; + bmk_info.cmdbuf_size = cmdbuf_size; + bmk_info.cmdbuf = (u8 *)malloc(bmk_info.cmdbuf_size); + if (!bmk_info.cmdbuf) { + TPU_ASSERT(bmk_info.cmdbuf, "malloc kernel buffer failed"); + return CVI_FAILURE; + } + + (*ctx)->cvik_context.ctx182x = bmk1822_register(&bmk_info); + (*ctx)->cvik_cmdbuf = (void *)bmk_info.cmdbuf; + + return CVI_SUCCESS; +} + +CVI_RC CviRT181x::LoadCmdbufTee(CVI_RT_HANDLE rt_handle, uint8_t *cmdbuf, + size_t sz, uint64_t neuron_gaddr, + uint64_t weight_gaddr, uint32_t weight_len, + CVI_RT_MEM *cmdbuf_mem) { + (void)rt_handle; + (void)cmdbuf; + (void)sz; + (void)neuron_gaddr; + (void)weight_gaddr; + (void)weight_len; + (void)cmdbuf_mem; + TPU_ASSERT(0, NULL); // not support + return CVI_SUCCESS; +} + +CVI_RC CviRT181x::RunCmdbufTee(CVI_RT_HANDLE rt_handle, CVI_RT_MEM cmdbuf_mem, + CVI_RT_ARRAYBASE *p_array_base) { + (void)rt_handle; + (void)p_array_base; + (void)cmdbuf_mem; + TPU_ASSERT(0, NULL); // not support + return CVI_SUCCESS; +} diff --git a/cviruntime/src/soc/181x/cvi_rt_181x.h b/cviruntime/src/soc/181x/cvi_rt_181x.h new file mode 100644 index 000000000..71556a13e --- /dev/null +++ b/cviruntime/src/soc/181x/cvi_rt_181x.h @@ -0,0 +1,19 @@ +#pragma once +#include "cvi_rt_base.h" +#include "cvi181x_device_mem.h" + +class CviRT181x : public CviRTSoc { +public: + CviRT181x(); + virtual ~CviRT181x() override; + + virtual CVI_RT_KHANDLE GetKHandleBK(CVI_RT_HANDLE rt_handle) override; + virtual CVI_RC DeInitBK(CVI_RT_HANDLE rt_handle) override; + virtual CVI_RC 
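
CviRT181x above layers the CVI_RT_* handle API over the bm_* context: InitWithKernelBK is bm_init plus cvikernel registration, DeInitBK is the matching bmk1822_cleanup/free/bm_exit, and the TEE entry points deliberately assert as unsupported on this chip. A pairing sketch, treating CVI_RT_HANDLE as the opaque bmctx_t it wraps (cmdbuf_size is caller-chosen):

// Sketch: bracket a kernel session with InitWithKernelBK/DeInitBK.
CVI_RC with_runtime(CviRT181x &rt, uint32_t cmdbuf_size) {
  CVI_RT_HANDLE handle = nullptr;
  if (rt.InitWithKernelBK(&handle, cmdbuf_size) != CVI_SUCCESS)
    return CVI_FAILURE;

  CVI_RT_KHANDLE khandle = rt.GetKHandleBK(handle);  // bmk1822 kernel context
  (void)khandle;  // ... emit kernel operations here ...

  return rt.DeInitBK(handle);  // kernel cleanup, cmdbuf free, then bm_exit
}
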
InitWithKernelBK(CVI_RT_HANDLE *rt_handle, uint32_t cmdbuf_size) override; + virtual CVI_RC LoadCmdbufTee(CVI_RT_HANDLE rt_handle, uint8_t *cmdbuf, + size_t sz, uint64_t neuron_gaddr, + uint64_t weight_gaddr, uint32_t weight_len, + CVI_RT_MEM *cmdbuf_mem) override; + virtual CVI_RC RunCmdbufTee(CVI_RT_HANDLE rt_handle, CVI_RT_MEM cmdbuf_mem, + CVI_RT_ARRAYBASE *p_array_base); +}; diff --git a/cviruntime/src/soc/181x/tpu_pmu.cpp b/cviruntime/src/soc/181x/tpu_pmu.cpp new file mode 100644 index 000000000..06b9aacad --- /dev/null +++ b/cviruntime/src/soc/181x/tpu_pmu.cpp @@ -0,0 +1,912 @@ +#include +#include +#include +#include +#include +#include +#include +#include "bmruntime_internal.h" +#include +#include +#include + + +struct TPU_PMU_DOUBLEEVENT { + unsigned long long type : 4; + unsigned long long desID : 16; + unsigned long long eventCnt0 : 22; + unsigned long long eventCnt1 : 22; + uint32_t endTime; + uint32_t startTime; +}; + +typedef enum _EXCEL_TYPE { + EXCEL_TYPE_0 = 0, + EXCEL_TYPE_1 = 1, + EXCEL_TYPE_2 = 2, + EXCEL_TYPE_3 = 3, + EXCEL_TYPE_4 = 4, +} EXCEL_TYPE; + +enum TPU_PMUTYPE { + TPU_PMUTYPE_TDMALOAD = 1, + TPU_PMUTYPE_TDMASTORE = 2, + TPU_PMUTYPE_TDMAMOVE = 3, + TPU_PMUTYPE_TIU = 4, +}; + +typedef struct _TPU_DES_ELEMENT { + TPU_PMU_DOUBLEEVENT pmuEvent; + tiu_reg_t tiuReg; + tdma_reg_t tdmaReg; + char typeStr[50]; +} TPU_DES_ELEMENT; + +typedef struct _TPU_LAYERID_ELEMENT { + uint32_t layerID; + TPU_PMUTYPE last_desType; + uint32_t last_mapping_desID; + uint32_t endTime; + uint32_t startTime; +// uint8_t layerName[50]; + uint32_t u32StartAddr; + uint32_t u32OutputLen; + + uint32_t u32LoadNueronTime; + uint32_t u32LoadWeightTime; + uint32_t u32StoreNueronTime; + uint32_t u32TIUTime; + uint32_t u32TDMATime; + uint32_t u32byteCnt; + + double parallelism; + double duration_percent; + double loadNeuron_percent; + double loadWeight_percent; + double storeNeuron_percent; + double tiu_percent; + double throughput_MB; +} TPU_LAYERID_ELEMENT; + +#define FILE_OUT_LINE_LEN 2048 +#define TPUPMU_DES_FILENAME "_des.csv" +#define TPUPMU_LAYER_FILENAME "_layer.csv" +const char *pmubuf_output_file_env = NULL; + + +#define TPU_CLOCK_DEFAULT (750000000) +#define TPU_WRAP_LIMIT 0xFFFFFFFF +#define TPU_BURST_SIZE 16 +#define DES_MAX (65535 * 6) //hardcore firstly, real count could be queried from dmabuf +TPU_DES_ELEMENT *p_element = NULL; +TPU_LAYERID_ELEMENT *p_layer = NULL; + +static void tpu_pmu_fill_cmdbuf(uint8_t *v_dma_buf); + +static void reorder_back_tiu_cmdbuf_reg(uint8_t *cmdbuf) +{ + int total_bits = BD_REG_BYTES * 8; + + uint8_t tmp[128 / 8]; + uint8_t *last = &cmdbuf[(total_bits - 128) / 8]; + memcpy(tmp, last, sizeof(tmp)); + memcpy(last, cmdbuf, sizeof(tmp)); + memcpy(cmdbuf, tmp, sizeof(tmp)); +} + +static void tdma_des_fill_str(TPU_DES_ELEMENT *element) +{ + char str1[50] = {0}; + char tmpStr[10] = {0}; + + switch(element->pmuEvent.type) { + case TPU_PMUTYPE_TDMALOAD: + sprintf(tmpStr, "%s", "Load"); + break; + case TPU_PMUTYPE_TDMASTORE: + sprintf(tmpStr, "%s", "Store"); + break; + case TPU_PMUTYPE_TDMAMOVE: + sprintf(tmpStr, "%s", "Move"); + break; + default: + break; + } + + if (element->tdmaReg.compress_en) + sprintf(str1 , "%s %s", tmpStr , "Compression"); + else + sprintf(str1 , "%s" , tmpStr); + + if (element->tdmaReg.sys_dtype) + sprintf(element->typeStr, "%s %s", "TDMA Matrix", str1); + else + sprintf(element->typeStr, "%s %s", "TDMA Tensor", str1); +} + +static void tpu_pmu_fill_cmdbuf(uint8_t *v_dma_buf) +{ + cvi_cpu_desc_t *desc = (cvi_cpu_desc_t *)(v_dma_buf + 
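
TPU_PMU_DOUBLEEVENT above is the on-device profiling record: 4+16+22+22 bits of type, descriptor id, and two event counters packed into one 64-bit word, followed by two 32-bit timestamps. On the LP64/GCC targets this runtime builds for, that packs to 16 bytes, which matches the PER_DES_SIZE the dmabuf code uses when reserving one record per descriptor. A standalone mirror with the size pinned down (names are local to the sketch):

#include <cstdint>

// Mirror of TPU_PMU_DOUBLEEVENT to illustrate the packing.
struct pmu_event {
  unsigned long long type      : 4;   // TPU_PMUTYPE_*
  unsigned long long desID     : 16;  // matches cmd_id / cmd_id_tpu
  unsigned long long eventCnt0 : 22;
  unsigned long long eventCnt1 : 22;  // counts 16-byte bursts for TDMA records
  uint32_t endTime;                   // TPU clock ticks
  uint32_t startTime;
};

static_assert(sizeof(pmu_event) == 16, "one PMU record per PER_DES_SIZE slot");
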
sizeof(dma_hdr_t)); + + uint64_t tiu_offset = 0, tdma_offset = 0; + uint32_t tiu_cnt = 0, tdma_cnt = 0, i = 0, offset = 0; + uint32_t start_index_tdma = 0, start_index_tiu = 0; + uint32_t index = 0; + tdma_reg_t tmpTDMA_Reg; + tiu_reg_t tmpTIU_Reg; + uint8_t tiu_recorded_buf[BD_REG_BYTES]; + uint32_t tdma_id_previous = 0, tdma_start_pre= 0, tdma_end_pre = 0; + + //get tiu/tdma descriptor start address + tiu_offset = desc->offset_tiu_ori_bk; + tdma_offset = desc->offset_tdma_ori_bk; + //TPU_LOG_DEBUG("tpu_pmu_fill_cmdbuf() tiu_offset=0x%" PRIx64", tdma_offset=0x%" PRIx64 "\n", tiu_offset, tdma_offset); + + tiu_cnt = desc->num_tiu; + tdma_cnt = desc->num_tdma; + //TPU_LOG_DEBUG("tpu_pmu_fill_cmdbuf() tiu_cnt=%d, tdma_cnt=%d\n", tiu_cnt, tdma_cnt); + + while (p_element[index].pmuEvent.type) { + if (p_element[index].pmuEvent.type != TPU_PMUTYPE_TIU) { //tdma + + if ((p_element[index].pmuEvent.desID != tdma_id_previous) || + (p_element[index].pmuEvent.startTime != tdma_start_pre) || + (p_element[index].pmuEvent.endTime != tdma_end_pre)) { + for (i = start_index_tdma; i < tdma_cnt; i ++) { + offset = tdma_offset + ((1 << TDMA_DESCRIPTOR_ALIGNED_BIT) * i); + parse_tdma_reg(&tmpTDMA_Reg, (uint32_t *)(v_dma_buf + offset)); + + if (p_element[index].pmuEvent.desID == tmpTDMA_Reg.cmd_id) { + memcpy(&p_element[index].tdmaReg, &tmpTDMA_Reg, sizeof(tmpTDMA_Reg)); + tdma_des_fill_str(&p_element[index]); + start_index_tdma = i + 1; + tdma_id_previous = p_element[index].pmuEvent.desID; + tdma_start_pre = p_element[index].pmuEvent.startTime; + tdma_end_pre = p_element[index].pmuEvent.endTime; + break; + } + } + } else { //tdma g2g case, copy 1st to 2nd tdma descriptor + memcpy(&p_element[index].tdmaReg, &p_element[index - 1].tdmaReg, sizeof(tmpTDMA_Reg)); + tdma_des_fill_str(&p_element[index]); + } + } else { //tiu + for (i = start_index_tiu; i < tiu_cnt; i ++) { + offset = tiu_offset + (BD_REG_BYTES * i); + uint8_t *tiu_cmdbuf = v_dma_buf + offset; + + //get tiu_reg struc + memcpy(tiu_recorded_buf, tiu_cmdbuf, BD_REG_BYTES); + reorder_back_tiu_cmdbuf_reg(tiu_recorded_buf); + parse_tiu_reg(&tmpTIU_Reg, (uint32_t *)tiu_recorded_buf); + + if (p_element[index].pmuEvent.desID == tmpTIU_Reg.cmd_id_tpu) { + memcpy(&p_element[index].tiuReg, &tmpTIU_Reg, sizeof(tmpTIU_Reg)); + +#if 1 + switch (tmpTIU_Reg.tsk_typ) { + case DCR_TYPE_CONV_FIX8B: + if (!tmpTIU_Reg.opt_chl_quan) { + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Convolution"); + } else { + strcpy(p_element[index].typeStr, "TIU Convolution"); + } + } else { + strcpy(p_element[index].typeStr, "TIU PerChannel Convolution"); + } + break; + case DCR_TYPE_DEPTHWISE_POOL_FIX8B: + switch (tmpTIU_Reg.tsk_eu_typ) { + case 0: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Max Pooling"); + } else { + strcpy(p_element[index].typeStr, "TIU Max Pooling"); + } + break; + case 1: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Average Pooling"); + } else { + strcpy(p_element[index].typeStr, "TIU Average Pooling"); + } + break; + case 2: + if (!tmpTIU_Reg.opt_chl_quan) { + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Depthwise Convolution"); + } else { + strcpy(p_element[index].typeStr, "TIU Depthwise Convolution"); + } + } else { + strcpy(p_element[index].typeStr, "TIU Depthwise PerChannel Convolution"); + } + break; + case 3: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Min Pooling"); + } else { + strcpy(p_element[index].typeStr, "TIU Min 
Pooling"); + } + break; + default: + break; + } + break; + case DCR_TYPE_FC_FIX8B: + if (!tmpTIU_Reg.opt_chl_quan) { + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Matrix Multiplication"); + } else { + strcpy(p_element[index].typeStr, "TIU Matrix Multiplication"); + } + } else { + strcpy(p_element[index].typeStr, "TIU PerChannel Matrix Multiplication"); + } + break; + case DCR_TYPE_TENSOR_ARITH_FIX8B: + switch(tmpTIU_Reg.tsk_eu_typ) { + case 0: + if (!tmpTIU_Reg.opt_chl_quan) { + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Mul"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Mul"); + } + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Mul(QDM)"); + } + break; + case 1: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Mac"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Mac"); + } + break; + case 2: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Add"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Add"); + } + break; + case 3: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Sub"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Sub"); + } + break; + case 4: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Max"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Max"); + } + break; + case 5: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Min"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Min"); + } + break; + case 6: + strcpy(p_element[index].typeStr, "TIU Element-wise Shift"); + break; + case 7: + strcpy(p_element[index].typeStr, "TIU Element-wise AND"); + break; + case 8: + strcpy(p_element[index].typeStr, "TIU Element-wise OR"); + break; + case 9: + strcpy(p_element[index].typeStr, "TIU Element-wise XOR"); + break; + case 10: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Copy"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Copy"); + } + break; + case 11: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Ge"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Ge"); + } + break; + case 12: + strcpy(p_element[index].typeStr, "TIU Lookup Table"); + break; + default: + break; + } + default: + break; + } + +#else + switch(tmpTIU_Reg.tsk_typ) { + case DCR_TYPE_CONV_FIX8B: + if (!tmpTIU_Reg.opt_chl_quan) { + if (tmpTIU_Reg.opd_typ) + strcpy(p_element[index].typeStr, "TIU BF16 Convolution"); + else + strcpy(p_element[index].typeStr, "TIU Convolution"); + } else { + strcpy(p_element[index].typeStr, "TIU PerChannel Convolution"); + } + break; + case DCR_TYPE_DEPTHWISE_POOL_FIX8B: + switch (tmpTIU_Reg.tsk_eu_typ) { + case 0: + if (tmpTIU_Reg.opd_typ) + strcpy(p_element[index].typeStr, "TIU BF16 Max Pooling"); + else + strcpy(p_element[index].typeStr, "TIU Max Pooling"); + break; + case 1: + if (tmpTIU_Reg.opd_typ) + strcpy(p_element[index].typeStr, "TIU BF16 Average Pooling"); + else + strcpy(p_element[index].typeStr, "TIU Average Pooling"); + break; + case 2: + if (!tmpTIU_Reg.opt_chl_quan) { + if (tmpTIU_Reg.opd_typ) + strcpy(p_element[index].typeStr, "TIU BF16 Depthwise Convolution"); + else + strcpy(p_element[index].typeStr, "TIU Depthwise Convolution"); + } else { + strcpy(p_element[index].typeStr, "TIU Depthwise 
PerChannel Convolution"); + } + break; + default: + break; + } + break; + case DCR_TYPE_FC_FIX8B: + if (!tmpTIU_Reg.opt_chl_quan) { + if (tmpTIU_Reg.opd_typ) + strcpy(p_element[index].typeStr, "TIU BF16 Matrix Multiplication"); + else + strcpy(p_element[index].typeStr, "TIU Matrix Multiplication"); + } else { + strcpy(p_element[index].typeStr, "TIU PerChannel Matrix Multiplication"); + } + break; + case DCR_TYPE_TENSOR_ARITH_FIX8B: + if (tmpTIU_Reg.tens_mdsum) { + strcpy(p_element[index].typeStr, "TIU Mdsum"); + } else if (tmpTIU_Reg.tens_lookup) { + strcpy(p_element[index].typeStr, "TIU Lookup Table"); + } else { + switch (tmpTIU_Reg.tsk_eu_typ) { + case 0: + if (!tmpTIU_Reg.opt_chl_quan) { + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Mul"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Mul"); + } + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Mul(QDM)"); + } + break; + case 1: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Mac"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Mac"); + } + break; + case 2: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Add"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Add"); + } + break; + case 3: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Sub"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Sub"); + } + break; + case 4: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Max"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Max"); + } + break; + case 5: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Min"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Min"); + } + break; + case 6: + strcpy(p_element[index].typeStr, "TIU Element-wise Shift"); + break; + case 7: + strcpy(p_element[index].typeStr, "TIU Element-wise AND"); + break; + case 8: + strcpy(p_element[index].typeStr, "TIU Element-wise OR"); + break; + case 9: + strcpy(p_element[index].typeStr, "TIU Element-wise XOR"); + break; + case 10: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Copy"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Copy"); + } + break; + default: + break; + } + } + break; + } +#endif + start_index_tiu = i + 1; + break; + } + } + } + index ++; + } + +} + +#include +#include +using namespace std; + +static void tpu_pmu_fwrite_des() +{ + uint32_t index = 0; + uint64_t srcAddr = 0, dstAddr = 0; + + char lineStr[FILE_OUT_LINE_LEN] = {0}; + EXCEL_TYPE excelType = EXCEL_TYPE_0; + + std::fstream fout_element; + sprintf(lineStr, "%s%s", pmubuf_output_file_env, TPUPMU_DES_FILENAME); + //TPU_LOG_DEBUG("out file_des name=%s\n", lineStr); + fout_element.open(lineStr, std::ios::out | std::ios::trunc); + + strcpy(lineStr, "pmutype, desID, event0, event1, , start, duration, end, layerID, desType, \ + srcAddr, dstAddr, trans_fmt, transpose_md, cmd_id, wait_id_tpu, dst_h_stride, dst_c_stride_low, \ + dst_n_stride, src_h_stride, src_c_stride_low, src_n_stride, dst_c, src_c, dst_w, dst_h, src_w, src_h, src_n\n"); + fout_element << lineStr; + + //dump descriptor content related + while (p_element[index].pmuEvent.type) + { + switch (p_element[index].pmuEvent.type) { + case TPU_PMUTYPE_TDMALOAD: + excelType = EXCEL_TYPE_1; + break; + case TPU_PMUTYPE_TDMASTORE: + case TPU_PMUTYPE_TDMAMOVE: 
+ excelType = EXCEL_TYPE_2; + break; + case TPU_PMUTYPE_TIU: + excelType = EXCEL_TYPE_3; + break; + } + + if (p_element[index].pmuEvent.type == TPU_PMUTYPE_TIU) { +#ifdef __riscv + sprintf(lineStr, "%u, %u, %u, %u, %u, %u, %u, %u, %u, %s\n", +#else + sprintf(lineStr, "%llu, %llu, %llu, %llu, %u, %u, %u, %u, %u, %s\n", +#endif + p_element[index].pmuEvent.type, + p_element[index].pmuEvent.desID, + p_element[index].pmuEvent.eventCnt0, + p_element[index].pmuEvent.eventCnt1, + excelType, + p_element[index].pmuEvent.startTime, + p_element[index].pmuEvent.endTime - p_element[index].pmuEvent.startTime, + p_element[index].pmuEvent.endTime, + p_element[index].tiuReg.layer_info, + p_element[index].typeStr); + } else { + srcAddr = ((uint64_t)(p_element[index].tdmaReg.src_base_addr_high) << 32) | + (uint64_t)(p_element[index].tdmaReg.src_base_addr_low); + dstAddr = ((uint64_t)(p_element[index].tdmaReg.dst_base_addr_high) << 32) | + (uint64_t)(p_element[index].tdmaReg.dst_base_addr_low); + +#ifdef __riscv + sprintf(lineStr, "%u, %u, %u, %u, %u, %u, %u, %u, %u, %s, 0x%" PRIu64 ", 0x%" PRIu64 ", \ + %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u\n", +#else + sprintf(lineStr, "%llu, %llu, %llu, %llu, %u, %u, %u, %u, %u, %s, 0x%" PRIu64 ", 0x%" PRIu64 ", \ + %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u\n", +#endif + p_element[index].pmuEvent.type, + p_element[index].pmuEvent.desID, + p_element[index].pmuEvent.eventCnt0, + p_element[index].pmuEvent.eventCnt1, + excelType, + p_element[index].pmuEvent.startTime, + p_element[index].pmuEvent.endTime - p_element[index].pmuEvent.startTime, + p_element[index].pmuEvent.endTime, + p_element[index].tdmaReg.layer_ID, + p_element[index].typeStr, + srcAddr, + dstAddr, + p_element[index].tdmaReg.trans_fmt, + p_element[index].tdmaReg.transpose_md, + p_element[index].tdmaReg.cmd_id, + p_element[index].tdmaReg.wait_id_tpu, + p_element[index].tdmaReg.dst_h_stride, + p_element[index].tdmaReg.dst_c_stride_low, + p_element[index].tdmaReg.dst_n_stride, + p_element[index].tdmaReg.src_h_stride, + p_element[index].tdmaReg.src_c_stride_low, + p_element[index].tdmaReg.src_n_stride, + p_element[index].tdmaReg.dst_c, + p_element[index].tdmaReg.src_c, + p_element[index].tdmaReg.dst_w, + p_element[index].tdmaReg.dst_h, + p_element[index].tdmaReg.src_w, + p_element[index].tdmaReg.src_h, + p_element[index].tdmaReg.src_n); + } + + fout_element << lineStr; + index ++; + } + + fout_element.close(); +} + +static void tpu_pmu_getlayerInfo(void) +{ + uint32_t index = 0, layIDIndex = 0; + uint32_t curLayID = 0; + uint32_t u32SingleDuration = 0; + + //TPU_LOG_DEBUG("tpu_pmu_getlayerInfo() start\n"); + while (p_element[index].pmuEvent.type) { + if (!curLayID) { + //record current layerID + curLayID = p_element[index].pmuEvent.type == TPU_PMUTYPE_TIU ? + p_element[index].tiuReg.layer_info : p_element[index].tdmaReg.layer_ID; + + p_layer[layIDIndex].last_desType = (TPU_PMUTYPE)p_element[index].pmuEvent.type; + p_layer[layIDIndex].layerID = curLayID; + p_layer[layIDIndex].endTime = p_element[index].pmuEvent.endTime; + p_layer[layIDIndex].startTime = p_element[index].pmuEvent.startTime; + p_layer[layIDIndex].last_mapping_desID = p_element[index].pmuEvent.desID; + } else { + //if next layer ID is identical + if (curLayID == (p_element[index].pmuEvent.type == TPU_PMUTYPE_TIU ? + p_element[index].tiuReg.layer_info : p_element[index].tdmaReg.layer_ID)) { + p_layer[layIDIndex].endTime = (p_element[index].pmuEvent.endTime > p_layer[layIDIndex].endTime) ? 
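
tpu_pmu_fwrite_des above duplicates each format string under #ifdef __riscv because the toolchains disagree about how the unsigned long long bit-field values should be printed. A portable alternative is to cast to a fixed-width type and let <cinttypes> pick the conversion; a sketch:

#include <cinttypes>
#include <cstdint>
#include <cstdio>

// One format string for all targets: cast bit-fields to uint64_t + PRIu64.
void print_event_head(unsigned long long type, unsigned long long desID) {
  printf("%" PRIu64 ", %" PRIu64, (uint64_t)type, (uint64_t)desID);
}
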
+ (p_element[index].pmuEvent.endTime) : (p_layer[layIDIndex].endTime); + + p_layer[layIDIndex].last_mapping_desID = p_element[index].pmuEvent.desID; + + } else { + layIDIndex ++; + curLayID = p_element[index].pmuEvent.type == TPU_PMUTYPE_TIU ? + p_element[index].tiuReg.layer_info : p_element[index].tdmaReg.layer_ID; + + p_layer[layIDIndex].last_desType = (TPU_PMUTYPE)p_element[index].pmuEvent.type; + p_layer[layIDIndex].layerID = curLayID; + p_layer[layIDIndex].endTime = p_element[index].pmuEvent.endTime; + p_layer[layIDIndex].startTime = p_element[index].pmuEvent.startTime; + p_layer[layIDIndex].last_mapping_desID = p_element[index].pmuEvent.desID; + } + } + + //get each duration and then classfy by type + u32SingleDuration = p_element[index].pmuEvent.endTime - p_element[index].pmuEvent.startTime; + switch (p_element[index].pmuEvent.type) { + case TPU_PMUTYPE_TIU: + p_layer[layIDIndex].u32TIUTime += u32SingleDuration; + break; + + case TPU_PMUTYPE_TDMALOAD: + if (p_element[index].tdmaReg.src_base_reg_sel == 0) + p_layer[layIDIndex].u32LoadNueronTime += u32SingleDuration; + else if (p_element[index].tdmaReg.src_base_reg_sel == 1) + p_layer[layIDIndex].u32LoadWeightTime += u32SingleDuration; + + p_layer[layIDIndex].u32TDMATime += u32SingleDuration; + break; + + case TPU_PMUTYPE_TDMASTORE: + if (p_element[index].tdmaReg.src_base_reg_sel == 0) + p_layer[layIDIndex].u32StoreNueronTime += u32SingleDuration; + + p_layer[layIDIndex].u32TDMATime += u32SingleDuration; + break; + + default: + break; + } + + //accumulate byte counts, one burst count = 16bytes + p_layer[layIDIndex].u32byteCnt += (p_element[index].pmuEvent.eventCnt1 * 16); + index ++; + } +} + +static void tpu_pmu_fwrite_layer(uint64_t tpu_clock) +{ + uint32_t index = 0; + char lineStr[FILE_OUT_LINE_LEN] = {0}; + uint64_t u64totalDuration = 0, u64singleDuration = 0; + std::fstream fout_layer; + + sprintf(lineStr, "%s%s", pmubuf_output_file_env, TPUPMU_LAYER_FILENAME); + //TPU_LOG_DEBUG("out file_des name=%s\n", lineStr); + fout_layer.open(lineStr, std::ios::out | std::ios::trunc); + + //pre-processing once, and we can get total duration + index = 0; + while (p_layer[index].layerID) { + u64totalDuration += p_layer[index].endTime - p_layer[index].startTime; + index ++; + } + + index = 0; + while (p_layer[index].layerID) { + u64singleDuration = p_layer[index].endTime - p_layer[index].startTime; + p_layer[index].parallelism = (double)(p_layer[index].u32TDMATime + p_layer[index].u32TIUTime) / (double)u64singleDuration * 100; + p_layer[index].parallelism = p_layer[index].parallelism < 100 ? 
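
The per-layer byte counts above accumulate eventCnt1 * 16 because each TDMA event counter ticks once per 16-byte burst (TPU_BURST_SIZE); tpu_pmu_fwrite_layer below turns that into MB/s by dividing bytes by the layer's duration in clock ticks scaled by the TPU clock. The same arithmetic as a standalone helper; a sketch:

#include <cstdint>

// bytes = bursts * 16; seconds = ticks / clk_hz; MB/s = bytes / seconds / 2^20.
double throughput_mb_per_s(uint64_t byte_cnt, uint32_t duration_ticks,
                           uint64_t tpu_clk_hz) {
  if (duration_ticks == 0)
    return 0.0;
  const double seconds = (double)duration_ticks / (double)tpu_clk_hz;
  return (double)byte_cnt / seconds / 1024.0 / 1024.0;
}
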
100 : p_layer[index].parallelism; + + p_layer[index].duration_percent = (double)u64singleDuration / (double)u64totalDuration * 100; + p_layer[index].tiu_percent = (double)p_layer[index].u32TIUTime / (double)u64singleDuration * 100; + p_layer[index].loadNeuron_percent = (double)p_layer[index].u32LoadNueronTime / (double)u64singleDuration * 100; + p_layer[index].loadWeight_percent = (double)p_layer[index].u32LoadWeightTime / (double)u64singleDuration * 100; + p_layer[index].storeNeuron_percent = (double)p_layer[index].u32StoreNueronTime / (double)u64singleDuration * 100; + p_layer[index].throughput_MB = (double)p_layer[index].u32byteCnt * tpu_clock / (double)u64singleDuration / 1024 / 1024; + index ++; + } + + strcpy(lineStr, "layerID, start, duration, end, duration(%), parallelism(%), TIU(%), \ + loadNeuron(%), loadWeight(%), storeNeuron(%), throughput(MB/s), last_tdmaID, dumpStart, dumpLen, TIU, loadNeuron, \ + loadWeight, storeNeuron, byteCnt\n"); + + fout_layer << lineStr; + + index = 0; + while (p_layer[index].layerID) { + sprintf(lineStr, "%d, %d, %d, %d, %lf%%, %lf%%, %lf%%, %lf%%, %lf%%, %lf%%, %.2lfMB/s, %d, 0x%x, 0x%x, %d, %d, %d, %d, %d\n", + p_layer[index].layerID, + p_layer[index].startTime, + p_layer[index].endTime - p_layer[index].startTime, + p_layer[index].endTime, + + p_layer[index].duration_percent, + p_layer[index].parallelism, + p_layer[index].tiu_percent, + p_layer[index].loadNeuron_percent, + p_layer[index].loadWeight_percent, + p_layer[index].storeNeuron_percent, + p_layer[index].throughput_MB, + + p_layer[index].last_mapping_desID, + p_layer[index].u32StartAddr, + p_layer[index].u32OutputLen, + p_layer[index].u32TIUTime, + p_layer[index].u32LoadNueronTime, + p_layer[index].u32LoadWeightTime, + p_layer[index].u32StoreNueronTime, + p_layer[index].u32byteCnt); + fout_layer << lineStr; + index ++; + } + + fout_layer.close(); +} + +static int tpu_pmu_time(uint8_t *v_dma_buf, uint64_t p_dma_buf, uint8_t all_info) +{ + dma_hdr_t *header = (dma_hdr_t *)(v_dma_buf); + struct TPU_PMU_DOUBLEEVENT *pCurrent = (struct TPU_PMU_DOUBLEEVENT *)(v_dma_buf + header->pmubuf_offset); + + uint64_t bmnet_p_total = 0; + uint64_t bmnet_p_duration = 0; + + uint64_t u64TDMATotal = 0; + uint64_t u64TIUTotal = 0; + uint64_t u64_des_start = 0, u64_des_end = 0; + uint32_t u32TDMACnt = 0, u32TIUCnt = 0; + uint32_t index = 0, diff = 0, wrap_cnt = 0; + uint32_t tpu_clk_rate = header->tpu_clk_rate; + uint64_t u64_load_bytes = 0, u64_store_bytes = 0; + uint32_t tdma_id_previous = 0, tdma_start_pre= 0, tdma_end_pre = 0; + double percent_tdma = 0, percent_tiu = 0, percent_paralellism = 0; + double ms_tdma = 0, ms_tiu = 0, ms_influence = 0; + double load_mb = 0, store_mb = 0; + double bandwidth = 0; + + //TPU_LOG_DEBUG("TPU_LOG_DEBUG tpu_pmu_time() all_info=%x\n", all_info); + //traverse pmu buffer + while (*(uint32_t *)pCurrent) { + if (pCurrent->type >= TPU_PMUTYPE_TDMALOAD && pCurrent->type <= TPU_PMUTYPE_TIU) { + if (index == 0) { + u64_des_start = pCurrent->startTime; + u64_des_end = pCurrent->endTime; + } else { + u64_des_end = pCurrent->endTime; + } + + if (all_info) + memcpy(&p_element[index].pmuEvent, pCurrent, sizeof(TPU_PMU_DOUBLEEVENT)); + + } else { + TPU_LOG_ERROR("pmubuf content header type incorrect, just next\n"); + index ++; + pCurrent++; + continue; + } + + if (pCurrent->type == TPU_PMUTYPE_TIU) { //tiu case + if (pCurrent->endTime > pCurrent->startTime) { + diff = pCurrent->endTime - pCurrent->startTime; + } else { + diff = 0xFFFFFFFF - pCurrent->startTime + pCurrent->endTime; + 
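
The tick counters are 32-bit, so endTime can be smaller than startTime after a wrap; the branch above reconstructs the distance as 0xFFFFFFFF - start + end, which is one tick short of the true modular distance 2^32 - start + end. For at most one wrap, plain unsigned subtraction is already exact; a sketch:

#include <cstdint>

// Unsigned subtraction is modulo 2^32, so this is exact whether or not
// end_time wrapped past start_time (once).
uint32_t elapsed_ticks(uint32_t start_time, uint32_t end_time) {
  return end_time - start_time;
}
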
wrap_cnt ++; + } + + u64TIUTotal += diff; + u32TIUCnt++; + } else { //tdma case + + //g2g generates two descriptors (load x1 + store x1); accumulate only one of them + if ((pCurrent->desID != tdma_id_previous) || + (pCurrent->startTime != tdma_start_pre) || + (pCurrent->endTime != tdma_end_pre)) { + + if (pCurrent->endTime > pCurrent->startTime) { + diff = pCurrent->endTime - pCurrent->startTime; + } else { + diff = TPU_WRAP_LIMIT - pCurrent->startTime + pCurrent->endTime; + wrap_cnt ++; + } + u64TDMATotal += diff; + u32TDMACnt++; + } + + if (pCurrent->type == TPU_PMUTYPE_TDMALOAD) { + u64_load_bytes += TPU_BURST_SIZE * pCurrent->eventCnt1; + } else if (pCurrent->type == TPU_PMUTYPE_TDMASTORE) { + u64_store_bytes += TPU_BURST_SIZE * pCurrent->eventCnt1; + } + + tdma_id_previous = pCurrent->desID; + tdma_start_pre = pCurrent->startTime; + tdma_end_pre = pCurrent->endTime; + } + + index ++; + pCurrent++; + } + + bmnet_p_total = u64TDMATotal + u64TIUTotal; + if (wrap_cnt) + bmnet_p_duration = TPU_WRAP_LIMIT * (wrap_cnt - 1) + TPU_WRAP_LIMIT - u64_des_start + u64_des_end; + else + bmnet_p_duration = u64_des_end - u64_des_start; + + percent_tdma = (double)u64TDMATotal / (double)bmnet_p_duration * (double)100; + percent_tiu = (double)u64TIUTotal / (double)bmnet_p_duration * (double)100; + percent_paralellism = (double)(bmnet_p_total) / (double)bmnet_p_duration * (double)100; + percent_paralellism = percent_paralellism < 100 ? 100 : percent_paralellism; + + if (!tpu_clk_rate) { + tpu_clk_rate = TPU_CLOCK_DEFAULT; + printf("can't get tpu clock, assuming %dMHz\n", tpu_clk_rate / 1000000); + } + + ms_tdma = (double)u64TDMATotal / (double)tpu_clk_rate * (double)1000; + ms_tiu = (double)u64TIUTotal / (double)tpu_clk_rate * (double)1000; + ms_influence = (double)bmnet_p_duration / (double)tpu_clk_rate * (double)1000; + + load_mb = (double)u64_load_bytes / (double)1024 / (double)1024; + store_mb = (double)u64_store_bytes / (double)1024 / (double)1024; + + bandwidth = (double)(load_mb + store_mb) / (double)ms_influence * (double)1000; + + printf("=======================inference total info ==========================\n"); + //printf("cv183x tpu clock: %dMhz\n", header->tpu_clk_rate / 1000000); + printf("%-20s %8dMHz, %-20s %9.2fMB, %-20s %7.2fMB/s\n", + "cv181x_tpu_clock:", tpu_clk_rate / 1000000, "inference_data:", load_mb + store_mb, "inference_bw:", bandwidth); + + printf("%-20s %10" PRIu64 "t, %-20s %10" PRIu64 "t, %-20s %10" PRIu64 "t\n", + "tdma_exe_tick:", u64TDMATotal, "tiu_exe_tick", u64TIUTotal, "inference_tick", bmnet_p_duration); + printf("%-20s %10.2f%%, %-20s %10.2f%%, %-20s %10.2f%%\n", + "tdma_exe_percent:", percent_tdma, "tiu_exe_percent:", percent_tiu, "parallelism_percent", percent_paralellism); + printf("%-20s %9.2fms, %-20s %9.2fms, %-20s %9.2fms\n", + "tdma_exe_ms:", ms_tdma, "tiu_exe_ms:", ms_tiu, "inference_ms:", ms_influence); + + if (all_info) { + tpu_pmu_fill_cmdbuf(v_dma_buf); + tpu_pmu_fwrite_des(); + tpu_pmu_getlayerInfo(); + tpu_pmu_fwrite_layer(tpu_clk_rate); + } + + return 0; +} + +uint32_t tpu_pmu_get_des_cnt(uint8_t *v_dma_buf) +{ + uint32_t tiu_cnt = 0, tdma_cnt = 0; + dma_hdr_t *header = (dma_hdr_t *)v_dma_buf; + cvi_cpu_desc_t *desc = (cvi_cpu_desc_t *)(v_dma_buf + sizeof(dma_hdr_t)); + + for (uint32_t i = 0; i < header->cpu_desc_count; i++, desc++) { + tiu_cnt += (desc->num_tiu & 0xFFFF); + tdma_cnt += (desc->num_tdma & 0xFFFF); + } + + //assume worst case: a tdma g2g transfer generates double descriptors + return (tiu_cnt + tdma_cnt + tdma_cnt); +} + +#define 
TPU_PMU_MALLOC_PADDING 1024 +uint32_t tpu_pmu_dump_main(uint8_t *v_dma_buf, uint64_t p_dma_buf) +{ + dma_hdr_t *dma_header = (dma_hdr_t *)v_dma_buf; + uint8_t all_info = 0; + + //check header first + if (dma_header->dmabuf_magic_m != TPU_DMABUF_HEADER_M) { + TPU_LOG_NOTICE("pmu buffer header incorrect\n"); + return CVI_RC_FAILURE; + } + + //check if we need output pmubuf + pmubuf_output_file_env = std::getenv("TPU_PMUBUF_OUTPUT_FILE"); + if (pmubuf_output_file_env) { + all_info = 1; + } + + //malloc element array + if (all_info) { + p_element = (TPU_DES_ELEMENT *)malloc(tpu_pmu_get_des_cnt(v_dma_buf) * sizeof(TPU_DES_ELEMENT) + TPU_PMU_MALLOC_PADDING); + p_layer = (TPU_LAYERID_ELEMENT *)malloc(tpu_pmu_get_des_cnt(v_dma_buf) * sizeof(TPU_LAYERID_ELEMENT) + TPU_PMU_MALLOC_PADDING); + + if (!p_element || !p_layer) { + TPU_LOG_INFO("tpu pmu des array malloc failed\n"); + return CVI_RC_FAILURE; + } + } + + //get pmu overview data + tpu_pmu_time(v_dma_buf, p_dma_buf, all_info); + + //free element array + if (all_info) { + if (p_element) { + free(p_element); + p_element = NULL; + } + + if (p_layer) { + free(p_layer); + p_layer = NULL; + } + } + + return CVI_RC_SUCCESS; +} + diff --git a/cviruntime/src/soc/182x/CMakeLists.txt b/cviruntime/src/soc/182x/CMakeLists.txt new file mode 100644 index 000000000..b79dab96f --- /dev/null +++ b/cviruntime/src/soc/182x/CMakeLists.txt @@ -0,0 +1,29 @@ +cmake_minimum_required(VERSION 2.8.0) + +include_directories(./) +include_directories(../common) +include_directories(${CMAKE_SYSROOT}/include) +add_definitions(-DION_CACHE_OPEN) +add_definitions(-DMULTI_PROCESS) + +set(RUNTIME_SOURCES ${RUNTIME_SOURCES} + ${CMAKE_CURRENT_SOURCE_DIR}/../runtime_bmkernel.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/bmruntime_soc.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/bm_dmabuf.c + ${CMAKE_CURRENT_SOURCE_DIR}/tpu_pmu.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cvi_rt_182x.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cvi182x_device_mem.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../common/cvi_device_mem.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../common/cvi_rt_base.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../common/cviruntime_context.cpp) + +set(EXTRA_LIBS ${EXTRA_LIBS} rt dl pthread) + +add_library(cviruntime-static STATIC ${RUNTIME_SOURCES}) +set_property(TARGET cviruntime-static PROPERTY POSITION_INDEPENDENT_CODE ON) + +add_library(cviruntime SHARED ${RUNTIME_SOURCES}) +target_link_libraries(cviruntime cvikernel ${EXTRA_LIBS}) + +install(TARGETS cviruntime DESTINATION lib) +install(TARGETS cviruntime-static DESTINATION lib) diff --git a/cviruntime/src/soc/182x/bm_dmabuf.c b/cviruntime/src/soc/182x/bm_dmabuf.c new file mode 100644 index 000000000..81d73912a --- /dev/null +++ b/cviruntime/src/soc/182x/bm_dmabuf.c @@ -0,0 +1,432 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "bmruntime_internal.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask)) +#define ALIGN(x,a) __ALIGN_MASK(x,(__typeof__(x))(a)-1) + +#define BD_DESC_ALIGN_SIZE (1 << BDC_ENGINE_CMD_ALIGNED_BIT) +#define GDMA_DESC_ALIGN_SIZE (1 << TDMA_DESCRIPTOR_ALIGNED_BIT) +#define BD_EOD_PADDING_BYTES (128) + +typedef struct { + cmd_hdr_t hdr; + uint32_t body[0]; +} DESC; + +static DESC *traverse_start(uint8_t *cmdbuf) +{ + TPU_ASSERT(cmdbuf, NULL); + DESC *desc = (DESC *)cmdbuf; + TPU_ASSERT(desc->hdr.magic == CMDBUF_HDR_MAGIC_1822, NULL); + return desc; +} + +static DESC *traverse_next(DESC *desc, uint8_t *cmdbuf, size_t size) +{ + DESC *next_desc = (DESC 
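
The ALIGN macro at the top of bm_dmabuf.c rounds x up to the next multiple of a power-of-two a by adding a-1 and masking the low bits; every descriptor-region offset below is built with it. A standalone check of the same definition (the __typeof__ extension is what the file itself relies on):

#include <cstdint>

#define __ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask))
#define ALIGN(x, a) __ALIGN_MASK(x, (__typeof__(x))(a) - 1)

static_assert(ALIGN(0x1001u, 0x1000u) == 0x2000u, "rounds up to the next boundary");
static_assert(ALIGN(0x1000u, 0x1000u) == 0x1000u, "aligned values are unchanged");
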
*)((uint8_t *)desc + cmd_hdr_len(&desc->hdr) + sizeof(cmd_hdr_t)); + if ((uint8_t *)next_desc >= cmdbuf + size) + return NULL; + TPU_ASSERT(next_desc->hdr.magic == CMDBUF_HDR_MAGIC_1822, NULL); + return next_desc; +} + +static bool is_last_desc(DESC *desc, uint8_t *cmdbuf, size_t size) +{ + DESC *next_desc = traverse_next(desc, cmdbuf, size); + return next_desc ? false : true; +} + +static void reorder_bd_cmdbuf_reg(uint8_t *cmdbuf) +{ + int total_bits = BD_REG_BYTES * 8; + + for (int i = 0; i < total_bits; i += 128) + cmdbuf[(i + 128 - 8) / 8] |= (i / 128) << 4; + + uint8_t tmp[128 / 8]; + uint8_t *last = &cmdbuf[(total_bits - 128) / 8]; + memcpy(tmp, last, sizeof(tmp)); + memcpy(last, cmdbuf, sizeof(tmp)); + memcpy(cmdbuf, tmp, sizeof(tmp)); +} + +static void adjust_desc_tdma(uint32_t *body, bool eod) +{ + if (eod) { + body[0] |= (1 << TDMA_ACCPI0_EOD_BIT); + body[0] |= (1 << TDMA_ACCPI0_INTERRUPT_BIT); // interrupt + } + body[0] |= (1 << TDMA_ACCPI0_BARRIER_ENABLE_BIT); +} + +static void adjust_desc_bd(uint32_t *body, bool eod) +{ + if (eod) { + tiu_reg_t reg; + parse_tiu_reg(®, body); + reg.cmd_end = 1; + reg.cmd_intr_en = 1; + emit_tiu_reg(®, body); + } + reorder_bd_cmdbuf_reg((uint8_t *)body); +} + +bmerr_t cvi182x_dmabuf_relocate(uint8_t *dmabuf, uint64_t dmabuf_devaddr, uint32_t original_size, uint32_t pmubuf_size) +{ + dma_hdr_t *header = (dma_hdr_t *)dmabuf; + uint64_t tmpAddress = 0; + + TPU_ASSERT(header->dmabuf_magic_m == TPU_DMABUF_HEADER_M, NULL); + cvi_cpu_desc_t *desc = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t)); + + for (uint32_t i = 0; i < header->cpu_desc_count; i++, desc++) { + uint32_t tiu_num = desc->num_tiu & 0xFFFF; + uint32_t tdma_num = desc->num_tdma & 0xFFFF; + + if (tiu_num) { + tmpAddress = dmabuf_devaddr + desc->offset_tiu; + //TPU_LOG_DEBUG("bd tmpAddress = 0x%lu\n", tmpAddress); + desc->offset_tiu_ori_bk = desc->offset_tiu; + desc->offset_tiu = tmpAddress >> BDC_ENGINE_CMD_ALIGNED_BIT; + } + + if (tdma_num) { + tmpAddress = dmabuf_devaddr + desc->offset_tdma; + //TPU_LOG_DEBUG("tdma tmpAddress = 0x%lu\n", tmpAddress); + desc->offset_tdma_ori_bk = desc->offset_tdma; + desc->offset_tdma = tmpAddress >> TDMA_DESCRIPTOR_ALIGNED_BIT; + } + + //set pmubuf_addr_p to enable pmu kick + header->pmubuf_size = pmubuf_size; + header->pmubuf_offset = original_size; + } + return BM_SUCCESS; +} + +static uint32_t desc_sync_id(DESC *desc) +{ + switch (desc->hdr.engine_id) { + case BMK1822_TIU: { + tiu_reg_t reg; + parse_tiu_reg(®, desc->body); + return reg.cmd_id_tpu; + } + case BMK1822_TDMA: { + tdma_reg_t reg; + parse_tdma_reg(®, desc->body); + return reg.cmd_id; + } + default: + TPU_ASSERT(0, NULL); + return 1; + } +} + +static void fill_header_and_arm(uint8_t *cmdbuf, size_t sz, uint8_t *dmabuf, uint64_t *tiu_offset, uint64_t *tdma_offset) +{ + dma_hdr_t header = {0}; + header.dmabuf_magic_m = TPU_DMABUF_HEADER_M; + header.dmabuf_magic_s = 0x1822; + + cvi_cpu_desc_t *segments = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t)); + DESC *desc = NULL; + size_t desc_nums[BMK1822_ENGINE_NUM] = {0}; + size_t counters[BMK1822_ENGINE_NUM] = {0}; + size_t desc_size[BMK1822_ENGINE_NUM] = {0}; + + TPU_ASSERT(segments, NULL); + // fill arm descs + desc = traverse_start(cmdbuf); + + while (desc != NULL) { + uint32_t engine_id = (uint32_t)desc->hdr.engine_id; + counters[engine_id]++; + desc_nums[engine_id]++; + if (engine_id != BMK1822_CPU) { + // a new arm desc inserted to do sync operation + if (desc_sync_id(desc) == 0xFFFF || is_last_desc(desc, cmdbuf, sz)) { + 
desc_nums[BMK1822_CPU]++; + cvi_cpu_desc_t *arm = segments + desc_nums[BMK1822_CPU] - 1; + memset(arm, 0, sizeof(cvi_cpu_desc_t)); + arm->op_type = CPU_OP_SYNC; + arm->num_tiu = counters[BMK1822_TIU]; + arm->num_tdma = counters[BMK1822_TDMA]; + strncpy(arm->str, "layer_end", sizeof(arm->str) - 1); + if (counters[BMK1822_TIU] != 0) { + desc_size[BMK1822_TIU] = + ALIGN(desc_size[BMK1822_TIU] + counters[BMK1822_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES, + BD_DESC_ALIGN_SIZE); + } + counters[BMK1822_TIU] = 0; + counters[BMK1822_TDMA] = 0; + } + } else { + cvi_cpu_desc_t *arm = segments + desc_nums[BMK1822_CPU] - 1; + memcpy(arm, &(desc->body), sizeof(cvi_cpu_desc_t)); + arm->num_tiu = counters[BMK1822_TIU]; + arm->num_tdma = counters[BMK1822_TDMA]; + if (counters[BMK1822_TIU] != 0) { + desc_size[BMK1822_TIU] = + ALIGN(desc_size[BMK1822_TIU] + counters[BMK1822_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES, + BD_DESC_ALIGN_SIZE); + } + counters[BMK1822_TIU] = 0; + counters[BMK1822_TDMA] = 0; + } + desc = traverse_next(desc, cmdbuf, sz); + } + desc_size[BMK1822_CPU] = desc_nums[BMK1822_CPU] * CPU_ENGINE_BYTES; + desc_size[BMK1822_TDMA] = desc_nums[BMK1822_TDMA] * GDMA_DESC_ALIGN_SIZE; + + (*tiu_offset) = ALIGN(sizeof(header) + desc_size[BMK1822_CPU], BD_DESC_ALIGN_SIZE); + (*tdma_offset) = ALIGN((*tiu_offset) + desc_size[BMK1822_TIU], GDMA_DESC_ALIGN_SIZE); + + // dma hdr + arm descs + bd descs + tdma descs + header.dmabuf_size = (*tdma_offset) + desc_size[BMK1822_TDMA]; + header.cpu_desc_count = desc_nums[BMK1822_CPU]; + header.bd_desc_count = desc_nums[BMK1822_TIU]; + header.tdma_desc_count = desc_nums[BMK1822_TDMA]; + + //TPU_LOG_DEBUG("header.dmabuf_size = %d\n", header.dmabuf_size); + //TPU_LOG_DEBUG("header.cpu_desc_count = %d\n", header.cpu_desc_count); + //TPU_LOG_DEBUG("header.bd_desc_count = %d\n", header.bd_desc_count); + //TPU_LOG_DEBUG("header.tdma_desc_count = %d\n", header.tdma_desc_count); + + memcpy(dmabuf, &header, sizeof(header)); +} + +static void fill_bd_and_tdma(uint8_t *cmdbuf, size_t sz, uint8_t *dmabuf, uint64_t tiu_offset, uint64_t tdma_offset) +{ + dma_hdr_t *p_header = (dma_hdr_t *)dmabuf; + cvi_cpu_desc_t *segments = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t)); + DESC *desc = traverse_start(cmdbuf); + //uint64_t address_max = 0x0; + + for (uint32_t i = 0; i < p_header->cpu_desc_count; i++) { + + cvi_cpu_desc_t *arm = segments + i; + + uint32_t tiu_num = arm->num_tiu & 0xFFFF; + uint32_t tdma_num = arm->num_tdma & 0xFFFF; + + if (tiu_num) { + tiu_offset = ALIGN(tiu_offset, 1 << BDC_ENGINE_CMD_ALIGNED_BIT); + arm->offset_tiu = tiu_offset; + //TPU_LOG_DEBUG("arm->offset_tiu = 0x%x \n", arm->offset_tiu); + } + + if (tdma_num) { + tdma_offset = ALIGN(tdma_offset, 1 << TDMA_DESCRIPTOR_ALIGNED_BIT); + arm->offset_tdma = tdma_offset; + //TPU_LOG_DEBUG("arm->offset_tdma = 0x%x \n", arm->offset_tdma); + } + + while (tiu_num || tdma_num) { + uint32_t engine_id = (uint32_t)desc->hdr.engine_id; + void *p_body = NULL; + + switch (engine_id) { + case BMK1822_TIU: + tiu_num--; + p_body = (void *)(dmabuf + tiu_offset); + tiu_offset += BD_REG_BYTES; + memcpy(p_body, desc->body, desc->hdr.len); + adjust_desc_bd((uint32_t *)p_body, tiu_num == 0); + break; + case BMK1822_TDMA: + tdma_num--; + tdma_offset = ALIGN(tdma_offset, GDMA_DESC_ALIGN_SIZE); + p_body = (void *)(dmabuf + tdma_offset); + tdma_offset += GDMA_DESC_ALIGN_SIZE; + memcpy(p_body, desc->body, desc->hdr.len); + +#if 0 //debug feature, for checking if neuron overshoot +{ + tdma_reg_t reg_tdma = {0}; + uint64_t tdma_address = 
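
fill_header_and_arm and fill_bd_and_tdma above lay the converted dmabuf out as header, CPU/ARM sync segments, all TIU descriptors, then all TDMA descriptors, each region aligned to its engine's requirement; cvi182x_dmabuf_relocate later rewrites the segment offsets into device addresses and points the PMU region at the end. The layout those offsets describe, as a comment sketch:

// dmabuf layout produced by cvi182x_dmabuf_convert:
//   +0                          dma_hdr_t
//   +sizeof(dma_hdr_t)          cpu_desc_count * cvi_cpu_desc_t (sync segments)
//   ALIGN(., BD_DESC_ALIGN_SIZE)
//                               TIU descs, BD_REG_BYTES each, plus
//                               BD_EOD_PADDING_BYTES of zeros per segment
//   ALIGN(., GDMA_DESC_ALIGN_SIZE)
//                               TDMA descs, one GDMA_DESC_ALIGN_SIZE slot each
//   +original_size              optional PMU region (pmubuf_offset / pmubuf_size)
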
0, tdma_address2 = 0; + + parse_tdma_reg(®_tdma, p_body); + + if (reg_tdma.src_base_reg_sel == 0) { + // reg.trans_dir = 2; // 0:tg2l, 1:l2tg, 2:g2g, 3:l2l + if (reg_tdma.trans_dir == 0) { + TPU_LOG_DEBUG ("src_base_addr_high=%x, src_base_addr_low=%x\n", reg_tdma.src_base_addr_high, reg_tdma.src_base_addr_low); + tdma_address = ((uint64_t)reg_tdma.src_base_addr_high) << 32 | (uint64_t)reg_tdma.src_base_addr_low; + } else if (reg_tdma.trans_dir == 1) { + TPU_LOG_DEBUG ("dst_base_addr_high=%x, dst_base_addr_low=%x\n", reg_tdma.dst_base_addr_high, reg_tdma.dst_base_addr_low); + tdma_address = ((uint64_t)reg_tdma.dst_base_addr_high) << 32 | (uint64_t)reg_tdma.dst_base_addr_low; + } else if (reg_tdma.trans_dir == 2) { + TPU_LOG_DEBUG ("dst_base_addr_high=%x, dst_base_addr_low=%x\n", reg_tdma.dst_base_addr_high, reg_tdma.dst_base_addr_low); + tdma_address = ((uint64_t)reg_tdma.src_base_addr_high) << 32 | (uint64_t)reg_tdma.src_base_addr_low; + tdma_address2 = ((uint64_t)reg_tdma.dst_base_addr_high) << 32 | (uint64_t)reg_tdma.dst_base_addr_low; + + if (tdma_address2 > tdma_address) { + tdma_address = tdma_address2; + } + } + + if (tdma_address > address_max) { + address_max = tdma_address; + TPU_LOG_DEBUG("address_max=%llx\n", address_max); + } + } +} +#endif + adjust_desc_tdma((uint32_t *)p_body, tdma_num == 0); + break; + default: + break; + } + desc = traverse_next(desc, cmdbuf, sz); + } + + // padding zero after eod to workaroud hardware bug + if (arm->num_tiu & 0xFFFF) { + void *buf = (void *)(dmabuf + tiu_offset); + memset(buf, 0, BD_EOD_PADDING_BYTES); + tiu_offset += BD_EOD_PADDING_BYTES; + } + } + +} + +bmerr_t cvi182x_dmabuf_convert(uint8_t *cmdbuf, size_t sz, uint8_t *dmabuf) +{ + uint64_t tiu_offset = 0; + uint64_t tdma_offset = 0; + fill_header_and_arm(cmdbuf, sz, dmabuf, &tiu_offset, &tdma_offset); + fill_bd_and_tdma(cmdbuf, sz, dmabuf, tiu_offset, tdma_offset); + return BM_SUCCESS; +} + +#define PER_DES_SIZE 16 +#define PADDING_SIZE (1024 * 1024) +bmerr_t cvi182x_dmabuf_size(uint8_t *cmdbuf, size_t sz, size_t *psize, size_t *pmu_size) +{ + size_t tdma_desc_num = {0}; + size_t counters[BMK1822_ENGINE_NUM] = {0}; + size_t bd_size = 0; + size_t dmabuf_size = 0; + + uint32_t tiu_cnt = 0; + uint32_t tdma_cnt = 0; + + // calculate desc numbers + DESC *desc = traverse_start(cmdbuf); + + while (desc != NULL) { + uint32_t engine_id = (uint32_t)desc->hdr.engine_id; + counters[engine_id]++; + if (engine_id != BMK1822_CPU) { + // a new arm desc inserted to do sync operation + if (desc_sync_id(desc) == 0xFFFF || is_last_desc(desc, cmdbuf, sz)) { + counters[BMK1822_CPU]++; + tdma_desc_num += counters[BMK1822_TDMA]; + if (counters[BMK1822_TIU] != 0) { + bd_size = ALIGN(bd_size + counters[BMK1822_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES, + BD_DESC_ALIGN_SIZE); + } + tiu_cnt += counters[BMK1822_TIU] & 0xFFFF; + tdma_cnt += counters[BMK1822_TDMA] & 0xFFFF; + counters[BMK1822_TIU] = 0; + counters[BMK1822_TDMA] = 0; + } + } else { + tdma_desc_num += counters[BMK1822_TDMA]; + if (counters[BMK1822_TIU] != 0) { + bd_size = ALIGN(bd_size + counters[BMK1822_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES, + BD_DESC_ALIGN_SIZE); + } + tiu_cnt += counters[BMK1822_TIU] & 0xFFFF; + tdma_cnt += counters[BMK1822_TDMA] & 0xFFFF; + counters[BMK1822_TIU] = 0; + counters[BMK1822_TDMA] = 0; + } + desc = traverse_next(desc, cmdbuf, sz); + } + // dma hdr + arm descs + bd descs + tdma descs + dmabuf_size = sizeof(dma_hdr_t) + counters[BMK1822_CPU] * CPU_ENGINE_BYTES; + dmabuf_size = ALIGN(dmabuf_size, BD_DESC_ALIGN_SIZE) + 
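
cvi182x_dmabuf_size, whose final additions continue below, reserves PMU space as ALIGN((tiu + tdma) * PER_DES_SIZE + PADDING_SIZE, 0x1000): one 16-byte record per descriptor, a megabyte of slack, rounded up to a 4 KB page. The same rule as a standalone helper; a sketch mirroring the constants defined above:

#include <cstddef>

// PMU reservation rule: 16 bytes per descriptor record + 1MB padding,
// rounded up to a 4KB page (mirrors PER_DES_SIZE / PADDING_SIZE above).
size_t pmu_buffer_size(size_t tiu_cnt, size_t tdma_cnt) {
  const size_t per_des = 16;           // PER_DES_SIZE
  const size_t padding = 1024 * 1024;  // PADDING_SIZE
  const size_t raw = (tiu_cnt + tdma_cnt) * per_des + padding;
  return (raw + 0xFFF) & ~(size_t)0xFFF;
}
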
bd_size; + dmabuf_size = ALIGN(dmabuf_size, GDMA_DESC_ALIGN_SIZE) + tdma_desc_num * GDMA_DESC_ALIGN_SIZE; + + *psize = dmabuf_size; + + *pmu_size = ALIGN((tiu_cnt + tdma_cnt) * PER_DES_SIZE + PADDING_SIZE, 0x1000); + return BM_SUCCESS; +} + +void cvi182x_arraybase_set(uint8_t *dmabuf, uint32_t arraybase0L, uint32_t arraybase1L, uint32_t arraybase0H, uint32_t arraybase1H) +{ + TPU_ASSERT(dmabuf, NULL); + dma_hdr_t *header = (dma_hdr_t *)dmabuf; + + TPU_ASSERT(header->dmabuf_magic_m == TPU_DMABUF_HEADER_M, NULL); + header->arraybase_0_L = arraybase0L; + header->arraybase_1_L = arraybase1L; + header->arraybase_0_H = arraybase0H; + header->arraybase_1_H = arraybase1H; + return; +} + +uint64_t cvi182x_get_pmusize(uint8_t * dmabuf) +{ + uint32_t tiu_cnt = 0, tdma_cnt = 0; + uint64_t pmu_size = 0; + + TPU_ASSERT(dmabuf, NULL); + dma_hdr_t *header = (dma_hdr_t *)dmabuf; + TPU_ASSERT(header->dmabuf_magic_m == TPU_DMABUF_HEADER_M, NULL); + + cvi_cpu_desc_t *desc = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t)); + + for (uint32_t i = 0; i < header->cpu_desc_count; i++, desc++) { + tiu_cnt += (desc->num_tiu & 0xFFFF); + tdma_cnt += (desc->num_tdma & 0xFFFF); + } + + pmu_size = ALIGN((tiu_cnt + tdma_cnt) * PER_DES_SIZE + PADDING_SIZE, 0x1000); + //TPU_LOG_DEBUG("cvi182x_get_pmusize pmusize= %" PRIu64 " \n", pmu_size); + return pmu_size; +} +void cvi182x_dmabuf_dump(uint8_t *dmabuf) +{ + TPU_ASSERT(dmabuf, NULL); + dma_hdr_t *header = (dma_hdr_t *)dmabuf; + //TPU_LOG_DEBUG("bmk1822_dmabuf_dump header->arraybase_0_L = 0x%x\n", header->arraybase_0_L); + //TPU_LOG_DEBUG("bmk1822_dmabuf_dump header->arraybase_1_L = 0x%x\n", header->arraybase_1_L); + //TPU_LOG_DEBUG("bmk1822_dmabuf_dump header->arraybase_0_H = 0x%x\n", header->arraybase_0_H); + //TPU_LOG_DEBUG("bmk1822_dmabuf_dump header->arraybase_1_H = 0x%x\n", header->arraybase_1_H); + //TPU_LOG_DEBUG("bmk1822_dmabuf_dump header->pmubuf_offset = 0x%x\n", header->pmubuf_offset); + + TPU_ASSERT(header->dmabuf_magic_m == TPU_DMABUF_HEADER_M, NULL); + cvi_cpu_desc_t *desc = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t)); + + for (u32 i = 0; i < header->cpu_desc_count; i++, desc++) { + int bd_num = desc->num_tiu & 0xFFFF; + int tdma_num = desc->num_tdma & 0xFFFF; + u32 bd_offset = desc->offset_tiu; + u32 tdma_offset = desc->offset_tdma; + TPU_LOG_DEBUG("cvi182x_dmabuf_dump num<%d, %d>, offset<0x%08x, 0x%08x>\n", bd_num, tdma_num, bd_offset, tdma_offset); + } +} + +#ifdef __cplusplus +} +#endif + diff --git a/cviruntime/src/soc/182x/bmruntime_internal.h b/cviruntime/src/soc/182x/bmruntime_internal.h new file mode 100644 index 000000000..fa4ad5315 --- /dev/null +++ b/cviruntime/src/soc/182x/bmruntime_internal.h @@ -0,0 +1,32 @@ +#ifndef _BM_RUNTIME_INTERNAL_H_ +#define _BM_RUNTIME_INTERNAL_H_ + +#include +#include +#include +#include +#include "cvitpu_debug.h" +#include +#include "bm_types.h" + +#ifdef __cplusplus + extern "C" { +#endif + +bmerr_t cvi182x_dmabuf_size(uint8_t *cmdbuf, size_t sz, size_t *psize, size_t *pmu_size); +bmerr_t cvi182x_dmabuf_relocate(uint8_t *dmabuf, uint64_t dmabuf_devaddr, uint32_t original_size, uint32_t pmubuf_size); +bmerr_t cvi182x_dmabuf_convert(uint8_t *cmdbuf, size_t sz, uint8_t *dmabuf); +void cvi182x_dmabuf_dump(uint8_t * dmabuf); +void cvi182x_arraybase_set(uint8_t *dmabuf, uint32_t arraybase0L, uint32_t arraybase1L, uint32_t arraybase0H, uint32_t arraybase1H); +uint64_t cvi182x_get_pmusize(uint8_t * dmabuf); + +uint32_t tpu_pmu_dump_main(uint8_t *v_dma_buf, uint64_t p_dma_buf); + +#define TPU_PMUBUF_SIZE (1024 * 1024 * 2) +#define 
TPU_DMABUF_HEADER_M 0xB5B5 + +#ifdef __cplusplus +} +#endif + +#endif /* _BM_RUNTIME_INTERNAL_H_ */ diff --git a/cviruntime/src/soc/182x/bmruntime_soc.cpp b/cviruntime/src/soc/182x/bmruntime_soc.cpp new file mode 100644 index 000000000..522264657 --- /dev/null +++ b/cviruntime/src/soc/182x/bmruntime_soc.cpp @@ -0,0 +1,173 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "bmruntime_internal.h" +#include "cvi182x_device_mem.h" + +Cvi182xDeviceMem cvi_device; + +bmmem_device_t bmmem_device_alloc_raw(bmctx_t ctx, size_t size) { + return cvi_device.mem_alloc_raw(ctx, size); +} + +bmmem_device_t bmmem_device_prealloc_raw(bmctx_t ctx, bmmem_device_t mem, uint64_t offset, + size_t size) { + return cvi_device.mem_prealloc_raw(ctx, mem, offset, size); +} + +void bmmem_device_free(bmctx_t ctx, bmmem_device_t mem) { + cvi_device.mem_free_raw(ctx, mem); +} + +void bmmem_device_free_ex(uint64_t p_addr) { + cvi_device.mem_free_ex(p_addr); +} + +size_t bmmem_device_size(bmmem_device_t mem) { + return cvi_device.mem_size(mem); +} + +uint64_t bmmem_device_addr(bmmem_device_t mem) { + return cvi_device.mem_p_addr(mem); +} + +uint8_t *bmmem_device_v_addr(bmmem_device_t mem) { + return cvi_device.mem_v_addr(mem); +} + +int32_t bmmem_device_inc_ref(bmmem_device_t mem) { + return cvi_device.mem_inc_ref(mem); +} + +int32_t bmmem_device_dec_ref(bmmem_device_t mem) { + return cvi_device.mem_dec_ref(mem); +} + +bmerr_t bm_memcpy_s2d(bmctx_t ctx, bmmem_device_t dst, uint8_t *src) { + return cvi_device.mem_memcpy_s2d(ctx, dst, src); +} + +bmerr_t bm_memcpy_s2d_ex(bmctx_t ctx, bmmem_device_t dst, uint8_t *src, uint64_t offset, + size_t size) { + return cvi_device.mem_memcpy_s2d_ex(ctx, dst, src, offset, size); +} + +bmerr_t bm_memcpy_d2s(bmctx_t ctx, uint8_t *dst, bmmem_device_t src) { + return cvi_device.mem_memcpy_d2s(ctx, dst, src); +} + +bmerr_t bm_memcpy_d2s_ex(bmctx_t ctx, uint8_t *dst, bmmem_device_t src, uint64_t offset, + size_t size) { + return cvi_device.mem_memcpy_d2s_ex(ctx, dst, src, offset, size); +} + +bmerr_t bm_context_create(bmctx_t *ctx) { + return cvi_device.context_create(ctx); +} + +bmerr_t bm_bind_device(bmctx_t ctx, bmdev_t dev) { + return cvi_device.bind_device(ctx, dev); +} + +void bm_unbind_device(bmctx_t ctx) { + return cvi_device.unbind_device(ctx); +} + +bmdev_t bm_get_device(bmctx_t ctx) { + return cvi_device.get_device(ctx); +} + +bmerr_t bm_init(int index, bmctx_t *ctx) { + return cvi_device.device_init(index, ctx); +} + +void bm_exit(bmctx_t ctx) { + cvi_device.device_exit(ctx); +} + +bmerr_t bm_load_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + uint64_t neuron_gaddr, uint64_t weight_gaddr, + bool enable_pmu, bmmem_device_t *cmdbuf_mem) { + return cvi_device.load_cmdbuf(ctx, cmdbuf, sz, neuron_gaddr, + weight_gaddr, enable_pmu, cmdbuf_mem); +} + +bmerr_t cvi_load_cmdbuf_tee(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + uint64_t neuron_gaddr, uint64_t weight_gaddr, uint32_t weight_len, bmmem_device_t *cmdbuf_mem) +{ + return cvi_device.load_cmdbuf_tee(ctx, cmdbuf, sz, neuron_gaddr, + weight_gaddr, weight_len, cmdbuf_mem); +} + +bmerr_t cvi_run_cmdbuf_tee(bmctx_t ctx, uint16_t *seq_no, uint64_t dmabuf_addr, cvi_array_base *array_base) +{ + return cvi_device.run_cmdbuf_tee(ctx, seq_no, dmabuf_addr, array_base); +} + +bmerr_t bm_run_cmdbuf(bmctx_t ctx, bmmem_device_t cmdbuf_mem, uint16_t *seq_no) { + return cvi_device.run_cmdbuf(ctx, cmdbuf_mem, seq_no); +} + +bmerr_t bm_run_cmdbuf_ex(bmctx_t ctx, bmmem_device_t cmdbuf_mem, uint16_t *seq_no, + 
uint64_t input_base_addr, uint64_t output_base_addr) { + return cvi_device.run_cmdbuf_ex(ctx, cmdbuf_mem, seq_no, input_base_addr, output_base_addr); +} + +bmerr_t bm_run_cmdbuf_ex2(bmctx_t ctx, bmmem_device_t cmdbuf_mem, uint16_t *seq_no, + cvi_array_base *p_array_base) { + return cvi_device.run_cmdbuf_ex2(ctx, cmdbuf_mem, seq_no, p_array_base); +} + +bmerr_t cvi_run_async(bmctx_t ctx, bmmem_device_t cmdbuf_mem) +{ + return cvi_device.run_async(ctx, cmdbuf_mem); +} + +bmerr_t bm_send_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, uint16_t *seq_no) { + return cvi_device.send_cmdbuf(ctx, cmdbuf, sz, seq_no); +} + +bmerr_t bm_wait_cmdbuf_done(bmctx_t ctx, uint16_t seq_no) { + return cvi_device.wait_cmdbuf_done(ctx, seq_no); +} + +bmerr_t cvi_wait_cmdbuf_all(bmctx_t ctx) { + return cvi_device.wait_cmdbuf_all(ctx); +} + +bmerr_t bm_run_cmdbuf_pio(bmctx_t ctx, uint8_t *cmdbuf, size_t sz) { + return cvi_device.run_cmdbuf_pio(ctx, cmdbuf, sz); +} + +void bm_device_set_base_reg(bmctx_t ctx, uint32_t inx, uint64_t addr) { + cvi_device.set_base_reg(ctx, inx, addr); +} + +uint64_t bm_device_read_base_reg(bmctx_t ctx, u32 inx) { + return cvi_device.read_base_reg(ctx, inx); +} + +int bm_device_get_chip_ver(bmdev_t dev) { + return cvi_device.get_chip_ver(dev); +} + +bmerr_t bm_parse_pmubuf(bmmem_device_t cmdbuf_mem, uint8_t **buf_start, uint32_t *buf_len) { + return cvi_device.parse_pmubuf(cmdbuf_mem, buf_start, buf_len); +} + +void cviruntime_cvikernel_create(bmctx_t ctx, void **p_bk_ctx) { + cvi_device.cvikernel_create(ctx, p_bk_ctx); +} + +void cviruntime_cvikernel_submit(bmctx_t ctx) { + cvi_device.cvikernel_submit(ctx); +} + +void cviruntime_cvikernel_destroy(bmctx_t ctx) { + cvi_device.cvikernel_destroy(ctx); +} diff --git a/cviruntime/src/soc/182x/cvi182x_device_mem.cpp b/cviruntime/src/soc/182x/cvi182x_device_mem.cpp new file mode 100644 index 000000000..c9e5339ef --- /dev/null +++ b/cviruntime/src/soc/182x/cvi182x_device_mem.cpp @@ -0,0 +1,260 @@ +#include +#include +#include +#include "cvi182x_device_mem.h" + +Cvi182xDeviceMem::Cvi182xDeviceMem() { + GLOBAL_MEM_START_ADDR = 0x00; + g_gmem_size = 1ULL << 30; // 1GB + tpu_dmabuf_header_m = 0xB5B5; +} + +Cvi182xDeviceMem::~Cvi182xDeviceMem() {} + + +bmerr_t Cvi182xDeviceMem::device_open(int index, bmdev_t *dev) +{ + bm_device_t *pdev = new bm_device_t; + + BMDEV_LOCK_INIT(pdev); + pdev->index = index; + pdev->info.info182x = bmk1822_chip_info(); + pdev->gmem_size = g_gmem_size; + + const char* tpu_dev_name_default = TPU_DEV_NAME; + const char* tpu_dev_name_env = std::getenv("TPU_DEV"); + const char *tpu_dev_name = tpu_dev_name_default; + if (tpu_dev_name_env) { + tpu_dev_name = tpu_dev_name_env; + } + + pdev->dev_fd = open(tpu_dev_name, O_RDWR); + if (pdev->dev_fd <= 0) { + TPU_LOG_WARNING("open %s failed\n", tpu_dev_name); + return BM_ERR_FAILURE; + } + + pdev->ion_fd = open(ION_DEV_NAME, O_RDWR | O_DSYNC); + if (pdev->ion_fd <= 0) { + TPU_LOG_WARNING("open %s failed\n", ION_DEV_NAME); + return BM_ERR_FAILURE; + } + + int ret = ion_query_heap(pdev); + TPU_ASSERT(ret == BM_SUCCESS, NULL); + + *dev = pdev; + + return BM_SUCCESS; +} + +void Cvi182xDeviceMem::device_close(bmdev_t dev) +{ + close(dev->ion_fd); + close(dev->dev_fd); + + // TPU_LOG_WARNING("device[%d] closed\n", dev->index); + + BMDEV_LOCK_DEINIT(dev); + delete dev; +} + +int Cvi182xDeviceMem::get_chip_ver(bmdev_t dev) { + return dev->info.info182x.version; +} + +void Cvi182xDeviceMem::mem_free_raw(bmctx_t ctx, bmmem_device_t mem) { + char array_got = 0; + bm_memory_t *device_mem = 
(bm_memory_t *)mem; + TPU_ASSERT(device_mem->flags.u.type == BMMEM_TYPE_DEVICE, NULL); + + if (!device_mem->flags.u.is_prealloc) { + mem_free(device_mem->v_addr, device_mem->size, device_mem->dma_fd); + + for (int i = 0; i < MEMARRAY_MAX_CNT; i ++) { + if (ctx->root_mem_array[i].p_addr == device_mem->p_addr) { + ctx->root_mem_array[i].p_addr = 0; + ctx->root_mem_array[i].mem = NULL; + array_got = 1; + break; + } + } + + if (!array_got) + TPU_LOG_WARNING("bmmem_device_free() can not find match\n"); + } + + BMEMEM_DUMP(); + delete device_mem; +} + + +bmerr_t Cvi182xDeviceMem::load_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + uint64_t neuron_gaddr, uint64_t weight_gaddr, + bool enable_pmu, bmmem_device_t *cmdbuf_mem) { + bmerr_t ret; + size_t dmabuf_size = 0; + size_t pmubuf_size = 0; + bmmem_device_t dmabuf_mem; + + ret = cvi182x_dmabuf_size(cmdbuf, sz, &dmabuf_size, &pmubuf_size); + + //calculate pmu size + pmubuf_size = enable_pmu ? pmubuf_size : 0; + //TPU_LOG_DEBUG("pmubuf_size = 0x%lx\n", pmubuf_size); + if (protect) { + dmabuf_mem = mem_alloc_pagesize(ctx, dmabuf_size + pmubuf_size); + } else { + dmabuf_mem = mem_alloc_raw(ctx, dmabuf_size + pmubuf_size); + } + if (!dmabuf_mem) { + TPU_LOG_ERROR("alloc dmabuf mem fail!\n"); + return BM_ERR_NOMEM; + } + uint64_t dmabuf_devaddr = mem_p_addr(dmabuf_mem); + + ret = cvi182x_dmabuf_convert(cmdbuf, sz, dmabuf_mem->v_addr); + set_base_reg(ctx, 0, neuron_gaddr); + set_base_reg(ctx, 1, weight_gaddr); + cvi182x_arraybase_set(dmabuf_mem->v_addr, (u32)neuron_gaddr, (u32)weight_gaddr, 0, 0); + + cvi182x_dmabuf_relocate(dmabuf_mem->v_addr, dmabuf_devaddr, dmabuf_size, pmubuf_size); + + ret = mem_flush_ext(ctx->dev, dmabuf_mem->dma_fd, + dmabuf_mem->p_addr, dmabuf_size); + if (ret) { + mem_free_raw(ctx, dmabuf_mem); + return ret; + } + + // record dmabuf crc32 + // dmabuf_mem->crc32 = bm_crc32(dmabuf, dmabuf_size); + *cmdbuf_mem = dmabuf_mem; + + // if (0) { + // cvi182x_dmabuf_dump(dmabuf); + //} + return ret; +} + +bmerr_t Cvi182xDeviceMem::load_dmabuf(bmctx_t ctx, bmmem_device_t dmabuf, + size_t sz, uint64_t neuron_gaddr, uint64_t weight_gaddr, + bool enable_pmu, bmmem_device_t *dmabuf_mem) { + size_t pmubuf_size = 0; + if (enable_pmu) { + pmubuf_size = cvi182x_get_pmusize(dmabuf->v_addr); + *dmabuf_mem = mem_alloc_raw(ctx, sz + pmubuf_size); + if (*dmabuf_mem == nullptr) { + TPU_LOG_ERROR("alloc dmabuf mem fail!\n"); + return BM_ERR_NOMEM; + } + std::memcpy((*dmabuf_mem)->v_addr, dmabuf->v_addr, sz); + } else { + *dmabuf_mem = dmabuf; + } + uint64_t dmabuf_devaddr = mem_p_addr(*dmabuf_mem); + + //set_base_reg(ctx, 0, neuron_gaddr); + //set_base_reg(ctx, 1, weight_gaddr); + cvi182x_arraybase_set((*dmabuf_mem)->v_addr, (u32)neuron_gaddr, (u32)weight_gaddr, 0, 0); + + cvi182x_dmabuf_relocate((*dmabuf_mem)->v_addr, dmabuf_devaddr, sz, + pmubuf_size); + TPU_ASSERT(mem_flush_ext(ctx->dev, (*dmabuf_mem)->dma_fd, + (*dmabuf_mem)->p_addr, sz) == BM_SUCCESS, + NULL); + // if (0) { + //cvi182x_dmabuf_dump(dmabuf); + //} + return BM_SUCCESS; +} + + +bmerr_t Cvi182xDeviceMem::load_cmdbuf_tee(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + uint64_t neuron_gaddr, uint64_t weight_gaddr, + uint32_t weight_len, bmmem_device_t *cmdbuf_mem) +{ + //bmerr_t ret; + bmmem_device_t dmabuf_mem; + + //malloc double size buffer, because TEE needs 2nd space to calculate dmabuf + if (protect) { + dmabuf_mem = mem_alloc_pagesize(ctx, sz + sz); + } else { + dmabuf_mem = mem_alloc_raw(ctx, sz + sz); + } + if (!dmabuf_mem) { + TPU_LOG_ERROR("alloc dmabuf mem fail!\n"); + return 
BM_ERR_NOMEM; + } + + //transfer encrypted cmdbuf to TEE + memcpy(dmabuf_mem->v_addr, cmdbuf, sz); + TPU_ASSERT((int)mem_flush_ext(ctx->dev, dmabuf_mem->dma_fd, + dmabuf_mem->p_addr, sz) == BM_SUCCESS, NULL); + + //ioctl to get secure dma buffer + load_tee(ctx, dmabuf_mem->p_addr, sz, weight_gaddr, weight_len, neuron_gaddr); + + //this region should be protected, can't touch in REE + *cmdbuf_mem = dmabuf_mem; + return 0; +} + +bmerr_t Cvi182xDeviceMem::unload_tee(bmctx_t ctx, uint64_t paddr, size_t size) { + TPU_ASSERT(0, NULL); // not support + return BM_SUCCESS; +} + +bmerr_t Cvi182xDeviceMem::parse_pmubuf(bmmem_device_t cmdbuf_mem, uint8_t **buf_start, uint32_t *buf_len) { + dma_hdr_t *header = (dma_hdr_t *)(cmdbuf_mem->v_addr); + //TPU_LOG_DEBUG("header->arraybase_0_L = 0x%x\n", header->arraybase_0_L); + //TPU_LOG_DEBUG("header->arraybase_1_L = 0x%x\n", header->arraybase_1_L); + //TPU_LOG_DEBUG("header->arraybase_0_H = 0x%x\n", header->arraybase_0_H); + //TPU_LOG_DEBUG("header->arraybase_1_H = 0x%x\n", header->arraybase_1_H); + //TPU_LOG_DEBUG("header->pmubuf_offset = 0x%x\n", header->pmubuf_offset); + //TPU_LOG_DEBUG("header->pmubuf_size = 0x%x\n", header->pmubuf_size); + if (header->pmubuf_size && header->pmubuf_offset) { + tpu_pmu_dump_main(cmdbuf_mem->v_addr, cmdbuf_mem->p_addr); + } + *buf_start = cmdbuf_mem->v_addr; + *buf_len = cmdbuf_mem->size; + return BM_SUCCESS; +} + +void Cvi182xDeviceMem::cvikernel_create(bmctx_t ctx, void **p_bk_ctx) { + TPU_ASSERT(ctx != nullptr, nullptr); + TPU_ASSERT(ctx->dev != nullptr, nullptr); + + bmk1822_chip_info_t info = bmk1822_chip_info(); + bmk1822_chip_info_t *dev_info = &info; + + bmk_info_t bmk_info; + bmk_info.chip_version = dev_info->version; + bmk_info.cmdbuf_size = 0x100000; + bmk_info.cmdbuf = (u8 *)malloc(bmk_info.cmdbuf_size); + TPU_ASSERT(bmk_info.cmdbuf, "create cvikernel, malloc failed\n"); + + ctx->cvik_context.ctx182x = bmk1822_register(&bmk_info); + ctx->cvik_cmdbuf = (void *)bmk_info.cmdbuf; + + *p_bk_ctx = ctx->cvik_context.ctx182x; +} + +void Cvi182xDeviceMem::cvikernel_submit(bmctx_t ctx) { + u32 len; + u8 *cmdbuf = bmk1822_acquire_cmdbuf(ctx->cvik_context.ctx182x, &len); + + uint16_t seq_no; + bmerr_t ret = send_cmdbuf(ctx, cmdbuf, (size_t)len, &seq_no); + TPU_ASSERT(ret == BM_SUCCESS, NULL); + bmk1822_reset(ctx->cvik_context.ctx182x); +} + +void Cvi182xDeviceMem::cvikernel_destroy(bmctx_t ctx) { + TPU_ASSERT(ctx->cvik_context.ctx182x, NULL); + TPU_ASSERT(ctx->cvik_cmdbuf, NULL); + + bmk1822_cleanup(ctx->cvik_context.ctx182x); + free(ctx->cvik_cmdbuf); +} \ No newline at end of file diff --git a/cviruntime/src/soc/182x/cvi182x_device_mem.h b/cviruntime/src/soc/182x/cvi182x_device_mem.h new file mode 100644 index 000000000..c33b744a9 --- /dev/null +++ b/cviruntime/src/soc/182x/cvi182x_device_mem.h @@ -0,0 +1,36 @@ +#pragma once + +#include +#include +#include +#include +#include +#include "cvi_device_mem.h" +#include "bmruntime_internal.h" + +class Cvi182xDeviceMem : public CviDeviceMem { +public: + Cvi182xDeviceMem(); + ~Cvi182xDeviceMem() override; + virtual bmerr_t device_open(int index, bmdev_t *dev) override; + virtual void device_close(bmdev_t dev) override; + virtual int get_chip_ver(bmdev_t dev) override; + virtual void mem_free_raw(bmctx_t ctx, bmmem_device_t mem); + virtual bmerr_t load_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + uint64_t neuron_gaddr, uint64_t weight_gaddr, + bool enable_pmu, bmmem_device_t *cmdbuf_mem) override; + virtual bmerr_t load_dmabuf(bmctx_t ctx, bmmem_device_t dmabuf, + size_t 
sz, uint64_t neuron_gaddr, + uint64_t weight_gaddr, bool enable_pmu, + bmmem_device_t *dmabuf_mem) override; + + virtual bmerr_t load_cmdbuf_tee(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + uint64_t neuron_gaddr, uint64_t weight_gaddr, + uint32_t weight_len, + bmmem_device_t *cmdbuf_mem); + virtual bmerr_t unload_tee(bmctx_t ctx, uint64_t paddr, size_t size); + virtual bmerr_t parse_pmubuf(bmmem_device_t cmdbuf_mem, uint8_t **buf_start, uint32_t *buf_len); + virtual void cvikernel_create(bmctx_t ctx, void **p_bk_ctx) override; + virtual void cvikernel_submit(bmctx_t ctx) override; + virtual void cvikernel_destroy(bmctx_t ctx) override; +}; diff --git a/cviruntime/src/soc/182x/cvi_rt_182x.cpp b/cviruntime/src/soc/182x/cvi_rt_182x.cpp new file mode 100644 index 000000000..b1cf03b8d --- /dev/null +++ b/cviruntime/src/soc/182x/cvi_rt_182x.cpp @@ -0,0 +1,82 @@ +#include "cvi_rt_182x.h" + +std::unique_ptr cvi_chip(new CviRT182x()); + +CviRT182x::CviRT182x() { + chip_name_ = "cv182x"; + submit_magic_ = 0x18225678; + cvi_device = std::move(std::unique_ptr(new Cvi182xDeviceMem())); +} + +CviRT182x::~CviRT182x() {} + +CVI_RT_KHANDLE CviRT182x::GetKHandleBK(CVI_RT_HANDLE rt_handle) { + bmctx_t ctx = (bmctx_t)rt_handle; + return (CVI_RT_KHANDLE)(ctx->cvik_context.ctx182x); +} + +CVI_RC CviRT182x::DeInitBK(CVI_RT_HANDLE rt_handle) { + bmctx_t ctx = (bmctx_t)rt_handle; + + //deinit kernel related + if (ctx->cvik_context.ctx182x) { + bmk1822_cleanup(ctx->cvik_context.ctx182x); + } + + if (ctx->cvik_cmdbuf) { + free(ctx->cvik_cmdbuf); + } + + //deinit basic context + bm_exit(ctx); + return CVI_SUCCESS; +} + +CVI_RC CviRT182x::InitWithKernelBK(CVI_RT_HANDLE *rt_handle, uint32_t cmdbuf_size) { + bmctx_t *ctx = (bmctx_t *)rt_handle; + + //init basic context + bm_init(DEVICE_INDEX_NUM, ctx); + + //init cvikernel related + bmk1822_chip_info_t info = bmk1822_chip_info(); + bmk1822_chip_info_t *dev_info = &info; + + bmk_info_t bmk_info; + bmk_info.chip_version = dev_info->version; + bmk_info.cmdbuf_size = cmdbuf_size; + bmk_info.cmdbuf = (u8 *)malloc(bmk_info.cmdbuf_size); + if (!bmk_info.cmdbuf) { + TPU_ASSERT(bmk_info.cmdbuf, "malloc kernel buffer failed"); + return CVI_FAILURE; + } + + (*ctx)->cvik_context.ctx182x = bmk1822_register(&bmk_info); + (*ctx)->cvik_cmdbuf = (void *)bmk_info.cmdbuf; + + return CVI_SUCCESS; +} + +CVI_RC CviRT182x::LoadCmdbufTee(CVI_RT_HANDLE rt_handle, uint8_t *cmdbuf, + size_t sz, uint64_t neuron_gaddr, + uint64_t weight_gaddr, uint32_t weight_len, + CVI_RT_MEM *cmdbuf_mem) { + (void)rt_handle; + (void)cmdbuf; + (void)sz; + (void)neuron_gaddr; + (void)weight_gaddr; + (void)weight_len; + (void)cmdbuf_mem; + TPU_ASSERT(0, NULL); // not support + return CVI_SUCCESS; +} + +CVI_RC CviRT182x::RunCmdbufTee(CVI_RT_HANDLE rt_handle, CVI_RT_MEM cmdbuf_mem, + CVI_RT_ARRAYBASE *p_array_base) { + (void)rt_handle; + (void)p_array_base; + (void)cmdbuf_mem; + TPU_ASSERT(0, NULL); // not support + return CVI_SUCCESS; +} \ No newline at end of file diff --git a/cviruntime/src/soc/182x/cvi_rt_182x.h b/cviruntime/src/soc/182x/cvi_rt_182x.h new file mode 100644 index 000000000..3da3939b1 --- /dev/null +++ b/cviruntime/src/soc/182x/cvi_rt_182x.h @@ -0,0 +1,19 @@ +#pragma once +#include "cvi_rt_base.h" +#include "cvi182x_device_mem.h" + +class CviRT182x : public CviRTSoc { +public: + CviRT182x(); + virtual ~CviRT182x() override; + + virtual CVI_RT_KHANDLE GetKHandleBK(CVI_RT_HANDLE rt_handle) override; + virtual CVI_RC DeInitBK(CVI_RT_HANDLE rt_handle) override; + virtual CVI_RC 
InitWithKernelBK(CVI_RT_HANDLE *rt_handle, uint32_t cmdbuf_size) override; + virtual CVI_RC LoadCmdbufTee(CVI_RT_HANDLE rt_handle, uint8_t *cmdbuf, + size_t sz, uint64_t neuron_gaddr, + uint64_t weight_gaddr, uint32_t weight_len, + CVI_RT_MEM *cmdbuf_mem) override; + virtual CVI_RC RunCmdbufTee(CVI_RT_HANDLE rt_handle, CVI_RT_MEM cmdbuf_mem, + CVI_RT_ARRAYBASE *p_array_base); +}; diff --git a/cviruntime/src/soc/182x/tpu_pmu.cpp b/cviruntime/src/soc/182x/tpu_pmu.cpp new file mode 100644 index 000000000..e14db6840 --- /dev/null +++ b/cviruntime/src/soc/182x/tpu_pmu.cpp @@ -0,0 +1,899 @@ +#include +#include +#include +#include +#include +#include +#include +#include "bmruntime_internal.h" +#include +#include +#include + + +struct TPU_PMU_DOUBLEEVENT { + unsigned long long type : 4; + unsigned long long desID : 16; + unsigned long long eventCnt0 : 22; + unsigned long long eventCnt1 : 22; + uint32_t endTime; + uint32_t startTime; +}; + +typedef enum _EXCEL_TYPE { + EXCEL_TYPE_0 = 0, + EXCEL_TYPE_1 = 1, + EXCEL_TYPE_2 = 2, + EXCEL_TYPE_3 = 3, + EXCEL_TYPE_4 = 4, +} EXCEL_TYPE; + +enum TPU_PMUTYPE { + TPU_PMUTYPE_TDMALOAD = 1, + TPU_PMUTYPE_TDMASTORE = 2, + TPU_PMUTYPE_TDMAMOVE = 3, + TPU_PMUTYPE_TIU = 4, +}; + +typedef struct _TPU_DES_ELEMENT { + TPU_PMU_DOUBLEEVENT pmuEvent; + tiu_reg_t tiuReg; + tdma_reg_t tdmaReg; + char typeStr[50]; +} TPU_DES_ELEMENT; + +typedef struct _TPU_LAYERID_ELEMENT { + uint32_t layerID; + TPU_PMUTYPE last_desType; + uint32_t last_mapping_desID; + uint32_t endTime; + uint32_t startTime; +// uint8_t layerName[50]; + uint32_t u32StartAddr; + uint32_t u32OutputLen; + + uint32_t u32LoadNueronTime; + uint32_t u32LoadWeightTime; + uint32_t u32StoreNueronTime; + uint32_t u32TIUTime; + uint32_t u32TDMATime; + uint32_t u32byteCnt; + + double parallelism; + double duration_percent; + double loadNeuron_percent; + double loadWeight_percent; + double storeNeuron_percent; + double tiu_percent; + double throughput_MB; +} TPU_LAYERID_ELEMENT; + +#define FILE_OUT_LINE_LEN 2048 +#define TPUPMU_DES_FILENAME "_des.csv" +#define TPUPMU_LAYER_FILENAME "_layer.csv" +const char *pmubuf_output_file_env = NULL; + + +#define TPU_CLOCK_DEFAULT (750000000) +#define TPU_WRAP_LIMIT 0xFFFFFFFF +#define TPU_BURST_SIZE 16 +#define DES_MAX (65535 * 6) //hardcoded for now; the real count could be queried from dmabuf +TPU_DES_ELEMENT *p_element = NULL; +TPU_LAYERID_ELEMENT *p_layer = NULL; + +static void tpu_pmu_fill_cmdbuf(uint8_t *v_dma_buf); + +static void reorder_back_tiu_cmdbuf_reg(uint8_t *cmdbuf) +{ + int total_bits = BD_REG_BYTES * 8; + + uint8_t tmp[128 / 8]; + uint8_t *last = &cmdbuf[(total_bits - 128) / 8]; + memcpy(tmp, last, sizeof(tmp)); + memcpy(last, cmdbuf, sizeof(tmp)); + memcpy(cmdbuf, tmp, sizeof(tmp)); +} + +static void tdma_des_fill_str(TPU_DES_ELEMENT *element) +{ + char str1[50]; + + switch(element->pmuEvent.type) { + case TPU_PMUTYPE_TDMALOAD: + sprintf(str1, "%s", "Load"); + break; + case TPU_PMUTYPE_TDMASTORE: + sprintf(str1, "%s", "Store"); + break; + case TPU_PMUTYPE_TDMAMOVE: + sprintf(str1, "%s", "Move"); + break; + default: + break; + } + + if (element->tdmaReg.compress_en) + strcat(str1, " Compression"); + + if (element->tdmaReg.sys_dtype) + sprintf(element->typeStr, "%s %s", "TDMA Matrix", str1); + else + sprintf(element->typeStr, "%s %s", "TDMA Tensor", str1); +} + +static void tpu_pmu_fill_cmdbuf(uint8_t *v_dma_buf) +{ + cvi_cpu_desc_t *desc = (cvi_cpu_desc_t *)(v_dma_buf + sizeof(dma_hdr_t)); + + uint64_t tiu_offset = 0, tdma_offset = 0; + uint32_t tiu_cnt 
= 0, tdma_cnt = 0, i = 0, offset = 0; + uint32_t start_index_tdma = 0, start_index_tiu = 0; + uint32_t index = 0; + tdma_reg_t tmpTDMA_Reg; + tiu_reg_t tmpTIU_Reg; + uint8_t tiu_recorded_buf[BD_REG_BYTES]; + uint32_t tdma_id_previous = 0, tdma_start_pre= 0, tdma_end_pre = 0; + + //get tiu/tdma descriptor start address + tiu_offset = desc->offset_tiu_ori_bk; + tdma_offset = desc->offset_tdma_ori_bk; + //TPU_LOG_DEBUG("tpu_pmu_fill_cmdbuf() tiu_offset=0x%" PRIx64", tdma_offset=0x%" PRIx64 "\n", tiu_offset, tdma_offset); + + tiu_cnt = desc->num_tiu; + tdma_cnt = desc->num_tdma; + //TPU_LOG_DEBUG("tpu_pmu_fill_cmdbuf() tiu_cnt=%d, tdma_cnt=%d\n", tiu_cnt, tdma_cnt); + + while (p_element[index].pmuEvent.type) { + if (p_element[index].pmuEvent.type != TPU_PMUTYPE_TIU) { //tdma + + if ((p_element[index].pmuEvent.desID != tdma_id_previous) || + (p_element[index].pmuEvent.startTime != tdma_start_pre) || + (p_element[index].pmuEvent.endTime != tdma_end_pre)) { + for (i = start_index_tdma; i < tdma_cnt; i ++) { + offset = tdma_offset + ((1 << TDMA_DESCRIPTOR_ALIGNED_BIT) * i); + parse_tdma_reg(&tmpTDMA_Reg, (uint32_t *)(v_dma_buf + offset)); + + if (p_element[index].pmuEvent.desID == tmpTDMA_Reg.cmd_id) { + memcpy(&p_element[index].tdmaReg, &tmpTDMA_Reg, sizeof(tmpTDMA_Reg)); + tdma_des_fill_str(&p_element[index]); + start_index_tdma = i + 1; + tdma_id_previous = p_element[index].pmuEvent.desID; + tdma_start_pre = p_element[index].pmuEvent.startTime; + tdma_end_pre = p_element[index].pmuEvent.endTime; + break; + } + } + } else { //tdma g2g case, copy 1st to 2nd tdma descriptor + memcpy(&p_element[index].tdmaReg, &p_element[index - 1].tdmaReg, sizeof(tmpTDMA_Reg)); + tdma_des_fill_str(&p_element[index]); + } + } else { //tiu + for (i = start_index_tiu; i < tiu_cnt; i ++) { + offset = tiu_offset + (BD_REG_BYTES * i); + uint8_t *tiu_cmdbuf = v_dma_buf + offset; + + //get tiu_reg struct + memcpy(tiu_recorded_buf, tiu_cmdbuf, BD_REG_BYTES); + reorder_back_tiu_cmdbuf_reg(tiu_recorded_buf); + parse_tiu_reg(&tmpTIU_Reg, (uint32_t *)tiu_recorded_buf); + + if (p_element[index].pmuEvent.desID == tmpTIU_Reg.cmd_id_tpu) { + memcpy(&p_element[index].tiuReg, &tmpTIU_Reg, sizeof(tmpTIU_Reg)); + +#if 1 + switch (tmpTIU_Reg.tsk_typ) { + case DCR_TYPE_CONV_FIX8B: + if (!tmpTIU_Reg.opt_chl_quan) { + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Convolution"); + } else { + strcpy(p_element[index].typeStr, "TIU Convolution"); + } + } else { + strcpy(p_element[index].typeStr, "TIU PerChannel Convolution"); + } + break; + case DCR_TYPE_DEPTHWISE_POOL_FIX8B: + switch (tmpTIU_Reg.tsk_eu_typ) { + case 0: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Max Pooling"); + } else { + strcpy(p_element[index].typeStr, "TIU Max Pooling"); + } + break; + case 1: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Average Pooling"); + } else { + strcpy(p_element[index].typeStr, "TIU Average Pooling"); + } + break; + case 2: + if (!tmpTIU_Reg.opt_chl_quan) { + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Depthwise Convolution"); + } else { + strcpy(p_element[index].typeStr, "TIU Depthwise Convolution"); + } + } else { + strcpy(p_element[index].typeStr, "TIU Depthwise PerChannel Convolution"); + } + break; + case 3: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Min Pooling"); + } else { + strcpy(p_element[index].typeStr, "TIU Min Pooling"); + } + break; + default: + break; + } + break; + case DCR_TYPE_FC_FIX8B: + if 
(!tmpTIU_Reg.opt_chl_quan) { + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Matrix Multiplication"); + } else { + strcpy(p_element[index].typeStr, "TIU Matrix Multiplication"); + } + } else { + strcpy(p_element[index].typeStr, "TIU PerChannel Matrix Multiplication"); + } + break; + case DCR_TYPE_TENSOR_ARITH_FIX8B: + switch(tmpTIU_Reg.tsk_eu_typ) { + case 0: + if (!tmpTIU_Reg.opt_chl_quan) { + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Mul"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Mul"); + } + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Mul(QDM)"); + } + break; + case 1: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Mac"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Mac"); + } + break; + case 2: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Add"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Add"); + } + break; + case 3: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Sub"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Sub"); + } + break; + case 4: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Max"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Max"); + } + break; + case 5: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Min"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Min"); + } + break; + case 6: + strcpy(p_element[index].typeStr, "TIU Element-wise Shift"); + break; + case 7: + strcpy(p_element[index].typeStr, "TIU Element-wise AND"); + break; + case 8: + strcpy(p_element[index].typeStr, "TIU Element-wise OR"); + break; + case 9: + strcpy(p_element[index].typeStr, "TIU Element-wise XOR"); + break; + case 10: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Copy"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Copy"); + } + break; + case 11: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Ge"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Ge"); + } + break; + case 12: + strcpy(p_element[index].typeStr, "TIU Lookup Table"); + break; + default: + break; + } + default: + break; + } + +#else + switch(tmpTIU_Reg.tsk_typ) { + case DCR_TYPE_CONV_FIX8B: + if (!tmpTIU_Reg.opt_chl_quan) { + if (tmpTIU_Reg.opd_typ) + strcpy(p_element[index].typeStr, "TIU BF16 Convolution"); + else + strcpy(p_element[index].typeStr, "TIU Convolution"); + } else { + strcpy(p_element[index].typeStr, "TIU PerChannel Convolution"); + } + break; + case DCR_TYPE_DEPTHWISE_POOL_FIX8B: + switch (tmpTIU_Reg.tsk_eu_typ) { + case 0: + if (tmpTIU_Reg.opd_typ) + strcpy(p_element[index].typeStr, "TIU BF16 Max Pooling"); + else + strcpy(p_element[index].typeStr, "TIU Max Pooling"); + break; + case 1: + if (tmpTIU_Reg.opd_typ) + strcpy(p_element[index].typeStr, "TIU BF16 Average Pooling"); + else + strcpy(p_element[index].typeStr, "TIU Average Pooling"); + break; + case 2: + if (!tmpTIU_Reg.opt_chl_quan) { + if (tmpTIU_Reg.opd_typ) + strcpy(p_element[index].typeStr, "TIU BF16 Depthwise Convolution"); + else + strcpy(p_element[index].typeStr, "TIU Depthwise Convolution"); + } else { + strcpy(p_element[index].typeStr, "TIU Depthwise PerChannel Convolution"); + } + break; + default: + break; + } + break; + case 
DCR_TYPE_FC_FIX8B: + if (!tmpTIU_Reg.opt_chl_quan) { + if (tmpTIU_Reg.opd_typ) + strcpy(p_element[index].typeStr, "TIU BF16 Matrix Multiplication"); + else + strcpy(p_element[index].typeStr, "TIU Matrix Multiplication"); + } else { + strcpy(p_element[index].typeStr, "TIU PerChannel Matrix Multiplication"); + } + break; + case DCR_TYPE_TENSOR_ARITH_FIX8B: + if (tmpTIU_Reg.tens_mdsum) { + strcpy(p_element[index].typeStr, "TIU Mdsum"); + } else if (tmpTIU_Reg.tens_lookup) { + strcpy(p_element[index].typeStr, "TIU Lookup Table"); + } else { + switch (tmpTIU_Reg.tsk_eu_typ) { + case 0: + if (!tmpTIU_Reg.opt_chl_quan) { + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Mul"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Mul"); + } + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Mul(QDM)"); + } + break; + case 1: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Mac"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Mac"); + } + break; + case 2: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Add"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Add"); + } + break; + case 3: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Sub"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Sub"); + } + break; + case 4: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Max"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Max"); + } + break; + case 5: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Min"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Min"); + } + break; + case 6: + strcpy(p_element[index].typeStr, "TIU Element-wise Shift"); + break; + case 7: + strcpy(p_element[index].typeStr, "TIU Element-wise AND"); + break; + case 8: + strcpy(p_element[index].typeStr, "TIU Element-wise OR"); + break; + case 9: + strcpy(p_element[index].typeStr, "TIU Element-wise XOR"); + break; + case 10: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Copy"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Copy"); + } + break; + default: + break; + } + } + break; + } +#endif + start_index_tiu = i + 1; + break; + } + } + } + index ++; + } + +} + +#include +#include +using namespace std; + +static void tpu_pmu_fwrite_des() +{ + uint32_t index = 0; + uint64_t srcAddr = 0, dstAddr = 0; + + char lineStr[FILE_OUT_LINE_LEN] = {0}; + EXCEL_TYPE excelType = EXCEL_TYPE_0; + + std::fstream fout_element; + sprintf(lineStr, "%s%s", pmubuf_output_file_env, TPUPMU_DES_FILENAME); + TPU_LOG_DEBUG("out file_des name=%s\n", lineStr); + fout_element.open(lineStr, std::ios::out | std::ios::trunc); + + strcpy(lineStr, "pmutype, desID, event0, event1, , start, duration, end, layerID, desType, \ + srcAddr, dstAddr, trans_fmt, transpose_md, cmd_id, wait_id_tpu, dst_h_stride, dst_c_stride_low, \ + dst_n_stride, src_h_stride, src_c_stride_low, src_n_stride, dst_c, src_c, dst_w, dst_h, src_w, src_h, src_n\n"); + fout_element << lineStr; + + //dump descriptor content related + while (p_element[index].pmuEvent.type) + { + switch (p_element[index].pmuEvent.type) { + case TPU_PMUTYPE_TDMALOAD: + excelType = EXCEL_TYPE_1; + break; + case TPU_PMUTYPE_TDMASTORE: + case TPU_PMUTYPE_TDMAMOVE: + excelType = EXCEL_TYPE_2; + break; + case TPU_PMUTYPE_TIU: + excelType = 
EXCEL_TYPE_3; + break; + } + + if (p_element[index].pmuEvent.type == TPU_PMUTYPE_TIU) { + sprintf(lineStr, "%llu, %llu, %llu, %llu, %u, %u, %u, %u, %u, %s\n", + p_element[index].pmuEvent.type, + p_element[index].pmuEvent.desID, + p_element[index].pmuEvent.eventCnt0, + p_element[index].pmuEvent.eventCnt1, + excelType, + p_element[index].pmuEvent.startTime, + p_element[index].pmuEvent.endTime - p_element[index].pmuEvent.startTime, + p_element[index].pmuEvent.endTime, + p_element[index].tiuReg.layer_info, + p_element[index].typeStr); + } else { + srcAddr = ((uint64_t)(p_element[index].tdmaReg.src_base_addr_high) << 32) | + (uint64_t)(p_element[index].tdmaReg.src_base_addr_low); + dstAddr = ((uint64_t)(p_element[index].tdmaReg.dst_base_addr_high) << 32) | + (uint64_t)(p_element[index].tdmaReg.dst_base_addr_low); + + sprintf(lineStr, "%llu, %llu, %llu, %llu, %u, %u, %u, %u, %u, %s, 0x%" PRIu64 ", 0x%" PRIu64 ", \ + %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u\n", + p_element[index].pmuEvent.type, + p_element[index].pmuEvent.desID, + p_element[index].pmuEvent.eventCnt0, + p_element[index].pmuEvent.eventCnt1, + excelType, + p_element[index].pmuEvent.startTime, + p_element[index].pmuEvent.endTime - p_element[index].pmuEvent.startTime, + p_element[index].pmuEvent.endTime, + p_element[index].tdmaReg.layer_ID, + p_element[index].typeStr, + srcAddr, + dstAddr, + p_element[index].tdmaReg.trans_fmt, + p_element[index].tdmaReg.transpose_md, + p_element[index].tdmaReg.cmd_id, + p_element[index].tdmaReg.wait_id_tpu, + p_element[index].tdmaReg.dst_h_stride, + p_element[index].tdmaReg.dst_c_stride_low, + p_element[index].tdmaReg.dst_n_stride, + p_element[index].tdmaReg.src_h_stride, + p_element[index].tdmaReg.src_c_stride_low, + p_element[index].tdmaReg.src_n_stride, + p_element[index].tdmaReg.dst_c, + p_element[index].tdmaReg.src_c, + p_element[index].tdmaReg.dst_w, + p_element[index].tdmaReg.dst_h, + p_element[index].tdmaReg.src_w, + p_element[index].tdmaReg.src_h, + p_element[index].tdmaReg.src_n); + } + + fout_element << lineStr; + index ++; + } + + fout_element.close(); +} + +static void tpu_pmu_getlayerInfo(void) +{ + uint32_t index = 0, layIDIndex = 0; + uint32_t curLayID = 0; + uint32_t u32SingleDuration = 0; + + TPU_LOG_DEBUG("tpu_pmu_getlayerInfo() start\n"); + while (p_element[index].pmuEvent.type) { + if (!curLayID) { + //record current layerID + curLayID = p_element[index].pmuEvent.type == TPU_PMUTYPE_TIU ? + p_element[index].tiuReg.layer_info : p_element[index].tdmaReg.layer_ID; + + p_layer[layIDIndex].last_desType = (TPU_PMUTYPE)p_element[index].pmuEvent.type; + p_layer[layIDIndex].layerID = curLayID; + p_layer[layIDIndex].endTime = p_element[index].pmuEvent.endTime; + p_layer[layIDIndex].startTime = p_element[index].pmuEvent.startTime; + p_layer[layIDIndex].last_mapping_desID = p_element[index].pmuEvent.desID; + } else { + //if next layer ID is identical + if (curLayID == (p_element[index].pmuEvent.type == TPU_PMUTYPE_TIU ? + p_element[index].tiuReg.layer_info : p_element[index].tdmaReg.layer_ID)) { + p_layer[layIDIndex].endTime = (p_element[index].pmuEvent.endTime > p_layer[layIDIndex].endTime) ? + (p_element[index].pmuEvent.endTime) : (p_layer[layIDIndex].endTime); + + p_layer[layIDIndex].last_mapping_desID = p_element[index].pmuEvent.desID; + + } else { + layIDIndex ++; + curLayID = p_element[index].pmuEvent.type == TPU_PMUTYPE_TIU ? 
+ p_element[index].tiuReg.layer_info : p_element[index].tdmaReg.layer_ID; + + p_layer[layIDIndex].last_desType = (TPU_PMUTYPE)p_element[index].pmuEvent.type; + p_layer[layIDIndex].layerID = curLayID; + p_layer[layIDIndex].endTime = p_element[index].pmuEvent.endTime; + p_layer[layIDIndex].startTime = p_element[index].pmuEvent.startTime; + p_layer[layIDIndex].last_mapping_desID = p_element[index].pmuEvent.desID; + } + } + + //get each duration and then classify by type + u32SingleDuration = p_element[index].pmuEvent.endTime - p_element[index].pmuEvent.startTime; + switch (p_element[index].pmuEvent.type) { + case TPU_PMUTYPE_TIU: + p_layer[layIDIndex].u32TIUTime += u32SingleDuration; + break; + + case TPU_PMUTYPE_TDMALOAD: + if (p_element[index].tdmaReg.src_base_reg_sel == 0) + p_layer[layIDIndex].u32LoadNueronTime += u32SingleDuration; + else if (p_element[index].tdmaReg.src_base_reg_sel == 1) + p_layer[layIDIndex].u32LoadWeightTime += u32SingleDuration; + + p_layer[layIDIndex].u32TDMATime += u32SingleDuration; + break; + + case TPU_PMUTYPE_TDMASTORE: + if (p_element[index].tdmaReg.src_base_reg_sel == 0) + p_layer[layIDIndex].u32StoreNueronTime += u32SingleDuration; + + p_layer[layIDIndex].u32TDMATime += u32SingleDuration; + break; + + default: + break; + } + + //accumulate byte counts, one burst count = 16bytes + p_layer[layIDIndex].u32byteCnt += (p_element[index].pmuEvent.eventCnt1 * 16); + index ++; + } +} + +static void tpu_pmu_fwrite_layer(uint64_t tpu_clock) +{ + uint32_t index = 0; + char lineStr[FILE_OUT_LINE_LEN] = {0}; + uint64_t u64totalDuration = 0, u64singleDuration = 0; + std::fstream fout_layer; + + sprintf(lineStr, "%s%s", pmubuf_output_file_env, TPUPMU_LAYER_FILENAME); + TPU_LOG_DEBUG("out file_layer name=%s\n", lineStr); + fout_layer.open(lineStr, std::ios::out | std::ios::trunc); + + //pre-processing once, and we can get total duration + index = 0; + while (p_layer[index].layerID) { + u64totalDuration += p_layer[index].endTime - p_layer[index].startTime; + index ++; + } + + index = 0; + while (p_layer[index].layerID) { + u64singleDuration = p_layer[index].endTime - p_layer[index].startTime; + p_layer[index].parallelism = (double)(p_layer[index].u32TDMATime + p_layer[index].u32TIUTime) / (double)u64singleDuration * 100; + p_layer[index].parallelism = p_layer[index].parallelism < 100 ? 
100 : p_layer[index].parallelism; + + p_layer[index].duration_percent = (double)u64singleDuration / (double)u64totalDuration * 100; + p_layer[index].tiu_percent = (double)p_layer[index].u32TIUTime / (double)u64singleDuration * 100; + p_layer[index].loadNeuron_percent = (double)p_layer[index].u32LoadNueronTime / (double)u64singleDuration * 100; + p_layer[index].loadWeight_percent = (double)p_layer[index].u32LoadWeightTime / (double)u64singleDuration * 100; + p_layer[index].storeNeuron_percent = (double)p_layer[index].u32StoreNueronTime / (double)u64singleDuration * 100; + p_layer[index].throughput_MB = (double)p_layer[index].u32byteCnt * tpu_clock / (double)u64singleDuration / 1024 / 1024; + index ++; + } + + strcpy(lineStr, "layerID, start, duration, end, duration(%), parallelism(%), TIU(%), \ + loadNeuron(%), loadWeight(%), storeNeuron(%), throughput(MB/s), last_tdmaID, dumpStart, dumpLen, TIU, loadNeuron, \ + loadWeight, storeNeuron, byteCnt\n"); + + fout_layer << lineStr; + + index = 0; + while (p_layer[index].layerID) { + sprintf(lineStr, "%d, %d, %d, %d, %lf%%, %lf%%, %lf%%, %lf%%, %lf%%, %lf%%, %.2lfMB/s, %d, 0x%x, 0x%x, %d, %d, %d, %d, %d\n", + p_layer[index].layerID, + p_layer[index].startTime, + p_layer[index].endTime - p_layer[index].startTime, + p_layer[index].endTime, + + p_layer[index].duration_percent, + p_layer[index].parallelism, + p_layer[index].tiu_percent, + p_layer[index].loadNeuron_percent, + p_layer[index].loadWeight_percent, + p_layer[index].storeNeuron_percent, + p_layer[index].throughput_MB, + + p_layer[index].last_mapping_desID, + p_layer[index].u32StartAddr, + p_layer[index].u32OutputLen, + p_layer[index].u32TIUTime, + p_layer[index].u32LoadNueronTime, + p_layer[index].u32LoadWeightTime, + p_layer[index].u32StoreNueronTime, + p_layer[index].u32byteCnt); + fout_layer << lineStr; + index ++; + } + + fout_layer.close(); +} + +static int tpu_pmu_time(uint8_t *v_dma_buf, uint64_t p_dma_buf, uint8_t all_info) +{ + dma_hdr_t *header = (dma_hdr_t *)(v_dma_buf); + struct TPU_PMU_DOUBLEEVENT *pCurrent = (struct TPU_PMU_DOUBLEEVENT *)(v_dma_buf + header->pmubuf_offset); + + uint64_t bmnet_p_total = 0; + uint64_t bmnet_p_duration = 0; + + uint64_t u64TDMATotal = 0; + uint64_t u64TIUTotal = 0; + uint64_t u64_des_start = 0, u64_des_end = 0; + uint32_t u32TDMACnt = 0, u32TIUCnt = 0; + uint32_t index = 0, diff = 0, wrap_cnt = 0; + uint32_t tpu_clk_rate = header->tpu_clk_rate; + uint64_t u64_load_bytes = 0, u64_store_bytes = 0; + uint32_t tdma_id_previous = 0, tdma_start_pre= 0, tdma_end_pre = 0; + double percent_tdma = 0, percent_tiu = 0, percent_parallelism = 0; + double ms_tdma = 0, ms_tiu = 0, ms_inference = 0; + double load_mb = 0, store_mb = 0; + double bandwidth = 0; + + TPU_LOG_DEBUG("TPU_LOG_DEBUG tpu_pmu_time() all_info=%x\n", all_info); + //traverse pmu buffer + while (*(uint32_t *)pCurrent) { + if (pCurrent->type >= TPU_PMUTYPE_TDMALOAD && pCurrent->type <= TPU_PMUTYPE_TIU) { + if (index == 0) { + u64_des_start = pCurrent->startTime; + u64_des_end = pCurrent->endTime; + } else { + u64_des_end = pCurrent->endTime; + } + + if (all_info) + memcpy(&p_element[index].pmuEvent, pCurrent, sizeof(TPU_PMU_DOUBLEEVENT)); + + } else { + TPU_LOG_ERROR("pmubuf content header type incorrect, skip to next entry\n"); + index ++; + pCurrent++; + continue; + } + + if (pCurrent->type == TPU_PMUTYPE_TIU) { //tiu case + if (pCurrent->endTime > pCurrent->startTime) { + diff = pCurrent->endTime - pCurrent->startTime; + } else { + diff = TPU_WRAP_LIMIT - pCurrent->startTime + pCurrent->endTime; + wrap_cnt 
++; + u64TIUTotal += diff; + u32TIUCnt++; + } else { //tdma case + + //g2g generates two descriptors (one load + one store); we only accumulate one of them + if ((pCurrent->desID != tdma_id_previous) || + (pCurrent->startTime != tdma_start_pre) || + (pCurrent->endTime != tdma_end_pre)) { + + if (pCurrent->endTime > pCurrent->startTime) { + diff = pCurrent->endTime - pCurrent->startTime; + } else { + diff = TPU_WRAP_LIMIT - pCurrent->startTime + pCurrent->endTime; + wrap_cnt ++; + } + u64TDMATotal += diff; + u32TDMACnt++; + } + + if (pCurrent->type == TPU_PMUTYPE_TDMALOAD) { + u64_load_bytes += TPU_BURST_SIZE * pCurrent->eventCnt1; + } else if (pCurrent->type == TPU_PMUTYPE_TDMASTORE) { + u64_store_bytes += TPU_BURST_SIZE * pCurrent->eventCnt1; + } + + tdma_id_previous = pCurrent->desID; + tdma_start_pre = pCurrent->startTime; + tdma_end_pre = pCurrent->endTime; + } + + index ++; + pCurrent++; + } + + bmnet_p_total = u64TDMATotal + u64TIUTotal; + if (wrap_cnt) + bmnet_p_duration = TPU_WRAP_LIMIT * (wrap_cnt - 1) + TPU_WRAP_LIMIT - u64_des_start + u64_des_end; + else + bmnet_p_duration = u64_des_end - u64_des_start; + + percent_tdma = (double)u64TDMATotal / (double)bmnet_p_duration * (double)100; + percent_tiu = (double)u64TIUTotal / (double)bmnet_p_duration * (double)100; + percent_parallelism = (double)(bmnet_p_total) / (double)bmnet_p_duration * (double)100; + percent_parallelism = percent_parallelism < 100 ? 100 : percent_parallelism; + + if (!tpu_clk_rate) { + tpu_clk_rate = TPU_CLOCK_DEFAULT; + printf("cannot get tpu clock, assuming %dMHz\n", tpu_clk_rate / 1000000); + } + + ms_tdma = (double)u64TDMATotal / (double)tpu_clk_rate * (double)1000; + ms_tiu = (double)u64TIUTotal / (double)tpu_clk_rate * (double)1000; + ms_inference = (double)bmnet_p_duration / (double)tpu_clk_rate * (double)1000; + + load_mb = (double)u64_load_bytes / (double)1024 / (double)1024; + store_mb = (double)u64_store_bytes / (double)1024 / (double)1024; + + bandwidth = (double)(load_mb + store_mb) / (double)ms_inference * (double)1000; + + printf("=======================inference total info ==========================\n"); + //printf("cv183x tpu clock: %dMhz\n", header->tpu_clk_rate / 1000000); + printf("%-20s %8dMHz, %-20s %9.2fMB, %-20s %7.2fMB/s\n", + "cv182x_tpu_clock:", tpu_clk_rate / 1000000, "inference_data:", load_mb + store_mb, "inference_bw:", bandwidth); + + printf("%-20s %10" PRIu64 "t, %-20s %10" PRIu64 "t, %-20s %10" PRIu64 "t\n", + "tdma_exe_tick:", u64TDMATotal, "tiu_exe_tick:", u64TIUTotal, "inference_tick:", bmnet_p_duration); + printf("%-20s %10.2f%%, %-20s %10.2f%%, %-20s %10.2f%%\n", + "tdma_exe_percent:", percent_tdma, "tiu_exe_percent:", percent_tiu, "parallelism_percent:", percent_parallelism); + printf("%-20s %9.2fms, %-20s %9.2fms, %-20s %9.2fms\n", + "tdma_exe_ms:", ms_tdma, "tiu_exe_ms:", ms_tiu, "inference_ms:", ms_inference); + + if (all_info) { + tpu_pmu_fill_cmdbuf(v_dma_buf); + tpu_pmu_fwrite_des(); + tpu_pmu_getlayerInfo(); + tpu_pmu_fwrite_layer(tpu_clk_rate); + } + + return 0; +} + +uint32_t tpu_pmu_get_des_cnt(uint8_t *v_dma_buf) +{ + uint32_t tiu_cnt = 0, tdma_cnt = 0; + dma_hdr_t *header = (dma_hdr_t *)v_dma_buf; + cvi_cpu_desc_t *desc = (cvi_cpu_desc_t *)(v_dma_buf + sizeof(dma_hdr_t)); + + for (uint32_t i = 0; i < header->cpu_desc_count; i++, desc++) { + tiu_cnt += (desc->num_tiu & 0xFFFF); + tdma_cnt += (desc->num_tdma & 0xFFFF); + } + + //assume worst case tdma g2g case will generate double descriptor + return (tiu_cnt + tdma_cnt + tdma_cnt); +} + +#define TPU_PMU_MALLOC_PADDING 
1024 +uint32_t tpu_pmu_dump_main(uint8_t *v_dma_buf, uint64_t p_dma_buf) +{ + dma_hdr_t *dma_header = (dma_hdr_t *)v_dma_buf; + uint8_t all_info = 0; + + //check header first + if (dma_header->dmabuf_magic_m != TPU_DMABUF_HEADER_M) { + TPU_LOG_NOTICE("pmu buffer header incorrect\n"); + return CVI_RC_FAILURE; + } + + //check if we need output pmubuf + pmubuf_output_file_env = std::getenv("TPU_PMUBUF_OUTPUT_FILE"); + if (pmubuf_output_file_env) { + all_info = 1; + } + + //malloc element array + if (all_info) { + p_element = (TPU_DES_ELEMENT *)malloc(tpu_pmu_get_des_cnt(v_dma_buf) * sizeof(TPU_DES_ELEMENT) + TPU_PMU_MALLOC_PADDING); + p_layer = (TPU_LAYERID_ELEMENT *)malloc(tpu_pmu_get_des_cnt(v_dma_buf) * sizeof(TPU_LAYERID_ELEMENT) + TPU_PMU_MALLOC_PADDING); + + if (!p_element || !p_layer) { + TPU_LOG_INFO("tpu pmu des array malloc failed\n"); + return CVI_RC_FAILURE; + } + } + + //get pmu overview data + tpu_pmu_time(v_dma_buf, p_dma_buf, all_info); + + //free element array + if (all_info) { + if (p_element) { + free(p_element); + p_element = NULL; + } + + if (p_layer) { + free(p_layer); + p_layer = NULL; + } + } + + return CVI_RC_SUCCESS; +} + diff --git a/cviruntime/src/soc/183x/CMakeLists.txt b/cviruntime/src/soc/183x/CMakeLists.txt new file mode 100644 index 000000000..8c0b4b3b6 --- /dev/null +++ b/cviruntime/src/soc/183x/CMakeLists.txt @@ -0,0 +1,29 @@ +cmake_minimum_required(VERSION 2.8.0) + +include_directories(./) +include_directories(../common) +include_directories(${CMAKE_SYSROOT}/include) +add_definitions(-DION_CACHE_OPEN) +add_definitions(-DMULTI_PROCESS) + +set(RUNTIME_SOURCES ${RUNTIME_SOURCES} + ${CMAKE_CURRENT_SOURCE_DIR}/../runtime_bmkernel.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/bmruntime_soc.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/bm_dmabuf.c + ${CMAKE_CURRENT_SOURCE_DIR}/tpu_pmu.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cvi_rt_183x.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cvi183x_device_mem.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../common/cvi_device_mem.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../common/cvi_rt_base.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/../common/cviruntime_context.cpp) + +set(EXTRA_LIBS ${EXTRA_LIBS} rt dl pthread) + +add_library(cviruntime-static STATIC ${RUNTIME_SOURCES}) +set_property(TARGET cviruntime-static PROPERTY POSITION_INDEPENDENT_CODE ON) + +add_library(cviruntime SHARED ${RUNTIME_SOURCES}) +target_link_libraries(cviruntime cvikernel ${EXTRA_LIBS}) + +install(TARGETS cviruntime DESTINATION lib) +install(TARGETS cviruntime-static DESTINATION lib) diff --git a/cviruntime/src/soc/183x/bm_dmabuf.c b/cviruntime/src/soc/183x/bm_dmabuf.c new file mode 100644 index 000000000..8df5a6c43 --- /dev/null +++ b/cviruntime/src/soc/183x/bm_dmabuf.c @@ -0,0 +1,432 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "bmruntime_internal.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask)) +#define ALIGN(x,a) __ALIGN_MASK(x,(__typeof__(x))(a)-1) + +#define BD_DESC_ALIGN_SIZE (1 << BDC_ENGINE_CMD_ALIGNED_BIT) +#define GDMA_DESC_ALIGN_SIZE (1 << TDMA_DESCRIPTOR_ALIGNED_BIT) +#define BD_EOD_PADDING_BYTES (128) + +typedef struct { + cmd_hdr_t hdr; + uint32_t body[0]; +} DESC; + +static DESC *traverse_start(uint8_t *cmdbuf) +{ + TPU_ASSERT(cmdbuf, NULL); + DESC *desc = (DESC *)cmdbuf; + TPU_ASSERT(desc->hdr.magic == CMDBUF_HDR_MAGIC_1880v2, NULL); + return desc; +} + +static DESC *traverse_next(DESC *desc, uint8_t *cmdbuf, size_t size) +{ + DESC *next_desc = (DESC *)((uint8_t *)desc + 
cmd_hdr_len(&desc->hdr) + sizeof(cmd_hdr_t)); + if ((uint8_t *)next_desc >= cmdbuf + size) + return NULL; + TPU_ASSERT(next_desc->hdr.magic == CMDBUF_HDR_MAGIC_1880v2, NULL); + return next_desc; +} + +static bool is_last_desc(DESC *desc, uint8_t *cmdbuf, size_t size) +{ + DESC *next_desc = traverse_next(desc, cmdbuf, size); + return next_desc ? false : true; +} + +static void reorder_bd_cmdbuf_reg(uint8_t *cmdbuf) +{ + int total_bits = BD_REG_BYTES * 8; + + for (int i = 0; i < total_bits; i += 128) + cmdbuf[(i + 128 - 8) / 8] |= (i / 128) << 4; + + uint8_t tmp[128 / 8]; + uint8_t *last = &cmdbuf[(total_bits - 128) / 8]; + memcpy(tmp, last, sizeof(tmp)); + memcpy(last, cmdbuf, sizeof(tmp)); + memcpy(cmdbuf, tmp, sizeof(tmp)); +} + +static void adjust_desc_tdma(uint32_t *body, bool eod) +{ + if (eod) { + body[0] |= (1 << TDMA_ACCPI0_EOD_BIT); + body[0] |= (1 << TDMA_ACCPI0_INTERRUPT_BIT); // interrupt + } + body[0] |= (1 << TDMA_ACCPI0_BARRIER_ENABLE_BIT); +} + +static void adjust_desc_bd(uint32_t *body, bool eod) +{ + if (eod) { + tiu_reg_t reg; + parse_tiu_reg(&reg, body); + reg.cmd_end = 1; + reg.cmd_intr_en = 1; + emit_tiu_reg(&reg, body); + } + reorder_bd_cmdbuf_reg((uint8_t *)body); +} + +bmerr_t cvi183x_dmabuf_relocate(uint8_t *dmabuf, uint64_t dmabuf_devaddr, uint32_t original_size, uint32_t pmubuf_size) +{ + dma_hdr_t *header = (dma_hdr_t *)dmabuf; + uint64_t tmpAddress = 0; + + TPU_ASSERT(header->dmabuf_magic_m == TPU_DMABUF_HEADER_M, NULL); + cvi_cpu_desc_t *desc = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t)); + + for (uint32_t i = 0; i < header->cpu_desc_count; i++, desc++) { + uint32_t tiu_num = desc->num_tiu & 0xFFFF; + uint32_t tdma_num = desc->num_tdma & 0xFFFF; + + if (tiu_num) { + tmpAddress = dmabuf_devaddr + desc->offset_tiu; + //TPU_LOG_INFO("bd tmpAddress = 0x%lu\n", tmpAddress); + desc->offset_tiu_ori_bk = desc->offset_tiu; + desc->offset_tiu = tmpAddress >> BDC_ENGINE_CMD_ALIGNED_BIT; + } + + if (tdma_num) { + tmpAddress = dmabuf_devaddr + desc->offset_tdma; + //TPU_LOG_INFO("tdma tmpAddress = 0x%lu\n", tmpAddress); + desc->offset_tdma_ori_bk = desc->offset_tdma; + desc->offset_tdma = tmpAddress >> TDMA_DESCRIPTOR_ALIGNED_BIT; + } + + //set pmubuf_addr_p to enable pmu kick + header->pmubuf_size = pmubuf_size; + header->pmubuf_offset = original_size; + } + return BM_SUCCESS; +} + +static uint32_t desc_sync_id(DESC *desc) +{ + switch (desc->hdr.engine_id) { + case BMK1880v2_TIU: { + tiu_reg_t reg; + parse_tiu_reg(&reg, desc->body); + return reg.cmd_id_tpu; + } + case BMK1880v2_TDMA: { + tdma_reg_t reg; + parse_tdma_reg(&reg, desc->body); + return reg.cmd_id; + } + default: + TPU_ASSERT(0, NULL); + return 1; + } +} + +static void fill_header_and_arm(uint8_t *cmdbuf, size_t sz, uint8_t *dmabuf, uint64_t *tiu_offset, uint64_t *tdma_offset) +{ + dma_hdr_t header = {0}; + header.dmabuf_magic_m = TPU_DMABUF_HEADER_M; + header.dmabuf_magic_s = 0x1835; + + cvi_cpu_desc_t *segments = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t)); + DESC *desc = NULL; + size_t desc_nums[BMK1880v2_ENGINE_NUM] = {0}; + size_t counters[BMK1880v2_ENGINE_NUM] = {0}; + size_t desc_size[BMK1880v2_ENGINE_NUM] = {0}; + + TPU_ASSERT(segments, NULL); + // fill arm descs + desc = traverse_start(cmdbuf); + + while (desc != NULL) { + uint32_t engine_id = (uint32_t)desc->hdr.engine_id; + counters[engine_id]++; + desc_nums[engine_id]++; + if (engine_id != BMK1880v2_CPU) { + // a new arm desc inserted to do sync operation + if (desc_sync_id(desc) == 0xFFFF || is_last_desc(desc, cmdbuf, sz)) { + desc_nums[BMK1880v2_CPU]++; 
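+ // A sync id of 0xFFFF (or the last descriptor in the cmdbuf) closes the + // current segment: the CPU_OP_SYNC ARM descriptor filled in below records + // how many TIU/TDMA descriptors that segment contained (num_tiu/num_tdma), + // which is what lets the engines be fenced segment by segment.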
+ cvi_cpu_desc_t *arm = segments + desc_nums[BMK1880v2_CPU] - 1; + memset(arm, 0, sizeof(cvi_cpu_desc_t)); + arm->op_type = CPU_OP_SYNC; + arm->num_tiu = counters[BMK1880v2_TIU]; + arm->num_tdma = counters[BMK1880v2_TDMA]; + strncpy(arm->str, "layer_end", sizeof(arm->str) - 1); + if (counters[BMK1880v2_TIU] != 0) { + desc_size[BMK1880v2_TIU] = + ALIGN(desc_size[BMK1880v2_TIU] + counters[BMK1880v2_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES, + BD_DESC_ALIGN_SIZE); + } + counters[BMK1880v2_TIU] = 0; + counters[BMK1880v2_TDMA] = 0; + } + } else { + cvi_cpu_desc_t *arm = segments + desc_nums[BMK1880v2_CPU] - 1; + memcpy(arm, &(desc->body), sizeof(cvi_cpu_desc_t)); + arm->num_tiu = counters[BMK1880v2_TIU]; + arm->num_tdma = counters[BMK1880v2_TDMA]; + if (counters[BMK1880v2_TIU] != 0) { + desc_size[BMK1880v2_TIU] = + ALIGN(desc_size[BMK1880v2_TIU] + counters[BMK1880v2_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES, + BD_DESC_ALIGN_SIZE); + } + counters[BMK1880v2_TIU] = 0; + counters[BMK1880v2_TDMA] = 0; + } + desc = traverse_next(desc, cmdbuf, sz); + } + desc_size[BMK1880v2_CPU] = desc_nums[BMK1880v2_CPU] * CPU_ENGINE_BYTES; + desc_size[BMK1880v2_TDMA] = desc_nums[BMK1880v2_TDMA] * GDMA_DESC_ALIGN_SIZE; + + (*tiu_offset) = ALIGN(sizeof(header) + desc_size[BMK1880v2_CPU], BD_DESC_ALIGN_SIZE); + (*tdma_offset) = ALIGN((*tiu_offset) + desc_size[BMK1880v2_TIU], GDMA_DESC_ALIGN_SIZE); + + // dma hdr + arm descs + bd descs + tdma descs + header.dmabuf_size = (*tdma_offset) + desc_size[BMK1880v2_TDMA]; + header.cpu_desc_count = desc_nums[BMK1880v2_CPU]; + header.bd_desc_count = desc_nums[BMK1880v2_TIU]; + header.tdma_desc_count = desc_nums[BMK1880v2_TDMA]; + + //TPU_LOG_DEBUG("header.dmabuf_size = %d\n", header.dmabuf_size); + //TPU_LOG_DEBUG("header.cpu_desc_count = %d\n", header.cpu_desc_count); + //TPU_LOG_DEBUG("header.bd_desc_count = %d\n", header.bd_desc_count); + //TPU_LOG_DEBUG("header.tdma_desc_count = %d\n", header.tdma_desc_count); + memcpy(dmabuf, &header, sizeof(header)); +} + +static void fill_bd_and_tdma(uint8_t *cmdbuf, size_t sz, uint8_t *dmabuf, uint64_t tiu_offset, uint64_t tdma_offset) +{ + dma_hdr_t *p_header = (dma_hdr_t *)dmabuf; + cvi_cpu_desc_t *segments = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t)); + DESC *desc = traverse_start(cmdbuf); + //uint64_t address_max = 0x100000000; + + for (uint32_t i = 0; i < p_header->cpu_desc_count; i++) { + + cvi_cpu_desc_t *arm = segments + i; + + uint32_t tiu_num = arm->num_tiu & 0xFFFF; + uint32_t tdma_num = arm->num_tdma & 0xFFFF; + + if (tiu_num) { + tiu_offset = ALIGN(tiu_offset, 1 << BDC_ENGINE_CMD_ALIGNED_BIT); + arm->offset_tiu = tiu_offset; + //TPU_LOG_INFO("arm->offset_tiu = 0x%x \n", arm->offset_tiu); + } + + if (tdma_num) { + tdma_offset = ALIGN(tdma_offset, 1 << TDMA_DESCRIPTOR_ALIGNED_BIT); + arm->offset_tdma = tdma_offset; + //TPU_LOG_INFO("arm->offset_tdma = 0x%x \n", arm->offset_tdma); + } + + while (tiu_num || tdma_num) { + uint32_t engine_id = (uint32_t)desc->hdr.engine_id; + void *p_body = NULL; + + switch (engine_id) { + case BMK1880v2_TIU: + tiu_num--; + p_body = (void *)(dmabuf + tiu_offset); + tiu_offset += BD_REG_BYTES; + memcpy(p_body, desc->body, desc->hdr.len); + adjust_desc_bd((uint32_t *)p_body, tiu_num == 0); + break; + case BMK1880v2_TDMA: + tdma_num--; + tdma_offset = ALIGN(tdma_offset, GDMA_DESC_ALIGN_SIZE); + p_body = (void *)(dmabuf + tdma_offset); + tdma_offset += GDMA_DESC_ALIGN_SIZE; + memcpy(p_body, desc->body, desc->hdr.len); + +#if 0 //debug feature, for checking if neuron overshoot +{ + tdma_reg_t 
reg_tdma = {0};
+ uint64_t tdma_address = 0, tdma_address2 = 0;
+
+ parse_tdma_reg(&reg_tdma, p_body);
+
+ if (reg_tdma.src_base_reg_sel == 0) {
+ // reg.trans_dir = 2; // 0:tg2l, 1:l2tg, 2:g2g, 3:l2l
+ if (reg_tdma.trans_dir == 0) {
+ TPU_LOG_INFO ("src_base_addr_high=%x, src_base_addr_low=%x\n", reg_tdma.src_base_addr_high, reg_tdma.src_base_addr_low);
+ tdma_address = ((uint64_t)reg_tdma.src_base_addr_high) << 32 | (uint64_t)reg_tdma.src_base_addr_low;
+ } else if (reg_tdma.trans_dir == 1) {
+ TPU_LOG_INFO ("dst_base_addr_high=%x, dst_base_addr_low=%x\n", reg_tdma.dst_base_addr_high, reg_tdma.dst_base_addr_low);
+ tdma_address = ((uint64_t)reg_tdma.dst_base_addr_high) << 32 | (uint64_t)reg_tdma.dst_base_addr_low;
+ } else if (reg_tdma.trans_dir == 2) {
+ TPU_LOG_INFO ("dst_base_addr_high=%x, dst_base_addr_low=%x\n", reg_tdma.dst_base_addr_high, reg_tdma.dst_base_addr_low);
+ tdma_address = ((uint64_t)reg_tdma.src_base_addr_high) << 32 | (uint64_t)reg_tdma.src_base_addr_low;
+ tdma_address2 = ((uint64_t)reg_tdma.dst_base_addr_high) << 32 | (uint64_t)reg_tdma.dst_base_addr_low;
+
+ if (tdma_address2 > tdma_address) {
+ tdma_address = tdma_address2;
+ }
+ }
+
+ if (tdma_address > address_max) {
+ address_max = tdma_address;
+ TPU_LOG_INFO("address_max=%llx\n", address_max);
+ }
+ }
+}
+#endif
+ adjust_desc_tdma((uint32_t *)p_body, tdma_num == 0);
+ break;
+ default:
+ break;
+ }
+ desc = traverse_next(desc, cmdbuf, sz);
+ }
+
+ // padding zero after eod to workaround a hardware bug
+ if (arm->num_tiu & 0xFFFF) {
+ void *buf = (void *)(dmabuf + tiu_offset);
+ memset(buf, 0, BD_EOD_PADDING_BYTES);
+ tiu_offset += BD_EOD_PADDING_BYTES;
+ }
+ }
+
+}
+
+bmerr_t cvi183x_dmabuf_convert(uint8_t *cmdbuf, size_t sz, uint8_t *dmabuf)
+{
+ uint64_t tiu_offset = 0;
+ uint64_t tdma_offset = 0;
+ fill_header_and_arm(cmdbuf, sz, dmabuf, &tiu_offset, &tdma_offset);
+ fill_bd_and_tdma(cmdbuf, sz, dmabuf, tiu_offset, tdma_offset);
+ return BM_SUCCESS;
+}
+
+#define PER_DES_SIZE 16
+#define PADDING_SIZE (1024 * 1024)
+bmerr_t cvi183x_dmabuf_size(uint8_t *cmdbuf, size_t sz, size_t *psize, size_t *pmu_size)
+{
+ size_t tdma_desc_num = {0};
+ size_t counters[BMK1880v2_ENGINE_NUM] = {0};
+ size_t bd_size = 0;
+ size_t dmabuf_size = 0;
+
+ uint32_t tiu_cnt = 0;
+ uint32_t tdma_cnt = 0;
+
+ // calculate desc numbers
+ DESC *desc = traverse_start(cmdbuf);
+
+ while (desc != NULL) {
+ uint32_t engine_id = (uint32_t)desc->hdr.engine_id;
+ counters[engine_id]++;
+ if (engine_id != BMK1880v2_CPU) {
+ // a new arm desc inserted to do sync operation
+ if (desc_sync_id(desc) == 0xFFFF || is_last_desc(desc, cmdbuf, sz)) {
+ counters[BMK1880v2_CPU]++;
+ tdma_desc_num += counters[BMK1880v2_TDMA];
+ if (counters[BMK1880v2_TIU] != 0) {
+ bd_size = ALIGN(bd_size + counters[BMK1880v2_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES,
+ BD_DESC_ALIGN_SIZE);
+ }
+ tiu_cnt += counters[BMK1880v2_TIU] & 0xFFFF;
+ tdma_cnt += counters[BMK1880v2_TDMA] & 0xFFFF;
+ counters[BMK1880v2_TIU] = 0;
+ counters[BMK1880v2_TDMA] = 0;
+ }
+ } else {
+ tdma_desc_num += counters[BMK1880v2_TDMA];
+ if (counters[BMK1880v2_TIU] != 0) {
+ bd_size = ALIGN(bd_size + counters[BMK1880v2_TIU] * BD_REG_BYTES + BD_EOD_PADDING_BYTES,
+ BD_DESC_ALIGN_SIZE);
+ }
+ tiu_cnt += counters[BMK1880v2_TIU] & 0xFFFF;
+ tdma_cnt += counters[BMK1880v2_TDMA] & 0xFFFF;
+ counters[BMK1880v2_TIU] = 0;
+ counters[BMK1880v2_TDMA] = 0;
+ }
+ desc = traverse_next(desc, cmdbuf, sz);
+ }
+ // dma hdr + arm descs + bd descs + tdma descs
+ dmabuf_size = sizeof(dma_hdr_t) + counters[BMK1880v2_CPU] * CPU_ENGINE_BYTES;
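+ // each section is aligned before it is added: BD (TIU) descriptors and TDMA
+ // descriptors have different hardware alignment requirements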
+ dmabuf_size = ALIGN(dmabuf_size, BD_DESC_ALIGN_SIZE) + bd_size;
+ dmabuf_size = ALIGN(dmabuf_size, GDMA_DESC_ALIGN_SIZE) + tdma_desc_num * GDMA_DESC_ALIGN_SIZE;
+
+ *pmu_size = ALIGN((tiu_cnt + tdma_cnt) * PER_DES_SIZE + PADDING_SIZE, 0x1000);
+
+ *psize = dmabuf_size;
+ return BM_SUCCESS;
+}
+
+void cvi183x_arraybase_set(uint8_t *dmabuf, uint32_t arraybase0L, uint32_t arraybase1L, uint32_t arraybase0H, uint32_t arraybase1H)
+{
+ TPU_ASSERT(dmabuf, NULL);
+ dma_hdr_t *header = (dma_hdr_t *)dmabuf;
+
+ TPU_ASSERT(header->dmabuf_magic_m == TPU_DMABUF_HEADER_M, NULL);
+ header->arraybase_0_L = arraybase0L;
+ header->arraybase_1_L = arraybase1L;
+ header->arraybase_0_H = arraybase0H;
+ header->arraybase_1_H = arraybase1H;
+ return;
+}
+
+uint64_t cvi183x_get_pmusize(uint8_t * dmabuf)
+{
+ uint32_t tiu_cnt = 0, tdma_cnt = 0;
+ uint64_t pmu_size = 0;
+
+ TPU_ASSERT(dmabuf, NULL);
+ dma_hdr_t *header = (dma_hdr_t *)dmabuf;
+ TPU_ASSERT(header->dmabuf_magic_m == TPU_DMABUF_HEADER_M, NULL);
+
+ cvi_cpu_desc_t *desc = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t));
+
+ for (uint32_t i = 0; i < header->cpu_desc_count; i++, desc++) {
+ tiu_cnt += (desc->num_tiu & 0xFFFF);
+ tdma_cnt += (desc->num_tdma & 0xFFFF);
+ }
+
+ pmu_size = ALIGN((tiu_cnt + tdma_cnt) * PER_DES_SIZE + PADDING_SIZE, 0x1000);
+ //TPU_LOG_INFO("cvi183x_get_pmusize pmusize= %" PRIu64 " \n", pmu_size);
+ return pmu_size;
+}
+
+void cvi183x_dmabuf_dump(uint8_t *dmabuf)
+{
+ TPU_ASSERT(dmabuf, NULL);
+ dma_hdr_t *header = (dma_hdr_t *)dmabuf;
+ //TPU_LOG_INFO("cvi183x_dmabuf_dump header->arraybase_0_L = 0x%x\n", header->arraybase_0_L);
+ //TPU_LOG_INFO("cvi183x_dmabuf_dump header->arraybase_1_L = 0x%x\n", header->arraybase_1_L);
+ //TPU_LOG_INFO("cvi183x_dmabuf_dump header->arraybase_0_H = 0x%x\n", header->arraybase_0_H);
+ //TPU_LOG_INFO("cvi183x_dmabuf_dump header->arraybase_1_H = 0x%x\n", header->arraybase_1_H);
+ //TPU_LOG_INFO("cvi183x_dmabuf_dump header->pmubuf_offset = 0x%x\n", header->pmubuf_offset);
+
+ TPU_ASSERT(header->dmabuf_magic_m == TPU_DMABUF_HEADER_M, NULL);
+ cvi_cpu_desc_t *desc = (cvi_cpu_desc_t *)(dmabuf + sizeof(dma_hdr_t));
+
+ for (uint32_t i = 0; i < header->cpu_desc_count; i++, desc++) {
+ int tiu_num = desc->num_tiu & 0xFFFF;
+ int tdma_num = desc->num_tdma & 0xFFFF;
+ uint32_t tiu_offset = desc->offset_tiu;
+ uint32_t tdma_offset = desc->offset_tdma;
+ TPU_LOG_INFO("cvi183x_dmabuf_dump num<%d, %d>, offset<0x%08x, 0x%08x>\n", tiu_num, tdma_num, tiu_offset, tdma_offset);
+ }
+}
+
+#ifdef __cplusplus
+}
+#endif
+
diff --git a/cviruntime/src/soc/183x/bmruntime_internal.h b/cviruntime/src/soc/183x/bmruntime_internal.h
new file mode 100644
index 000000000..e3a70f9c7
--- /dev/null
+++ b/cviruntime/src/soc/183x/bmruntime_internal.h
@@ -0,0 +1,32 @@
+#ifndef _BM_RUNTIME_INTERNAL_H_
+#define _BM_RUNTIME_INTERNAL_H_
+
+#include
+#include
+#include
+#include
+#include "cvitpu_debug.h"
+#include
+#include "bm_types.h"
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+bmerr_t cvi183x_dmabuf_size(uint8_t *cmdbuf, size_t sz, size_t *psize, size_t *pmu_size);
+bmerr_t cvi183x_dmabuf_relocate(uint8_t *dmabuf, uint64_t dmabuf_devaddr, uint32_t original_size, uint32_t pmubuf_size);
+bmerr_t cvi183x_dmabuf_convert(uint8_t *cmdbuf, size_t sz, uint8_t *dmabuf);
+void cvi183x_dmabuf_dump(uint8_t * dmabuf);
+void cvi183x_arraybase_set(uint8_t *dmabuf, uint32_t arraybase0L, uint32_t arraybase1L, uint32_t arraybase0H, uint32_t arraybase1H);
+uint64_t cvi183x_get_pmusize(uint8_t * dmabuf);
+
+uint32_t 
tpu_pmu_dump_main(uint8_t *v_dma_buf, uint64_t p_dma_buf); + +#define TPU_PMUBUF_SIZE (1024 * 1024 * 2) +#define TPU_DMABUF_HEADER_M 0xB5B5 + +#ifdef __cplusplus +} +#endif + +#endif /* _BM_RUNTIME_INTERNAL_H_ */ diff --git a/cviruntime/src/soc/183x/bmruntime_soc.cpp b/cviruntime/src/soc/183x/bmruntime_soc.cpp new file mode 100644 index 000000000..fe415851e --- /dev/null +++ b/cviruntime/src/soc/183x/bmruntime_soc.cpp @@ -0,0 +1,173 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include "bmruntime_internal.h" +#include "cvi183x_device_mem.h" + +Cvi183xDeviceMem cvi_device; + +bmmem_device_t bmmem_device_alloc_raw(bmctx_t ctx, size_t size) { + return cvi_device.mem_alloc_raw(ctx, size); +} + +bmmem_device_t bmmem_device_prealloc_raw(bmctx_t ctx, bmmem_device_t mem, uint64_t offset, + size_t size) { + return cvi_device.mem_prealloc_raw(ctx, mem, offset, size); +} + +void bmmem_device_free(bmctx_t ctx, bmmem_device_t mem) { + cvi_device.mem_free_raw(ctx, mem); +} + +void bmmem_device_free_ex(uint64_t p_addr) { + cvi_device.mem_free_ex(p_addr); +} + +size_t bmmem_device_size(bmmem_device_t mem) { + return cvi_device.mem_size(mem); +} + +uint64_t bmmem_device_addr(bmmem_device_t mem) { + return cvi_device.mem_p_addr(mem); +} + +uint8_t *bmmem_device_v_addr(bmmem_device_t mem) { + return cvi_device.mem_v_addr(mem); +} + +int32_t bmmem_device_inc_ref(bmmem_device_t mem) { + return cvi_device.mem_inc_ref(mem); +} + +int32_t bmmem_device_dec_ref(bmmem_device_t mem) { + return cvi_device.mem_dec_ref(mem); +} + +bmerr_t bm_memcpy_s2d(bmctx_t ctx, bmmem_device_t dst, uint8_t *src) { + return cvi_device.mem_memcpy_s2d(ctx, dst, src); +} + +bmerr_t bm_memcpy_s2d_ex(bmctx_t ctx, bmmem_device_t dst, uint8_t *src, uint64_t offset, + size_t size) { + return cvi_device.mem_memcpy_s2d_ex(ctx, dst, src, offset, size); +} + +bmerr_t bm_memcpy_d2s(bmctx_t ctx, uint8_t *dst, bmmem_device_t src) { + return cvi_device.mem_memcpy_d2s(ctx, dst, src); +} + +bmerr_t bm_memcpy_d2s_ex(bmctx_t ctx, uint8_t *dst, bmmem_device_t src, uint64_t offset, + size_t size) { + return cvi_device.mem_memcpy_d2s_ex(ctx, dst, src, offset, size); +} + +bmerr_t bm_context_create(bmctx_t *ctx) { + return cvi_device.context_create(ctx); +} + +bmerr_t bm_bind_device(bmctx_t ctx, bmdev_t dev) { + return cvi_device.bind_device(ctx, dev); +} + +void bm_unbind_device(bmctx_t ctx) { + return cvi_device.unbind_device(ctx); +} + +bmdev_t bm_get_device(bmctx_t ctx) { + return cvi_device.get_device(ctx); +} + +bmerr_t bm_init(int index, bmctx_t *ctx) { + return cvi_device.device_init(index, ctx); +} + +void bm_exit(bmctx_t ctx) { + cvi_device.device_exit(ctx); +} + +bmerr_t bm_load_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + uint64_t neuron_gaddr, uint64_t weight_gaddr, + bool enable_pmu, bmmem_device_t *cmdbuf_mem) { + return cvi_device.load_cmdbuf(ctx, cmdbuf, sz, neuron_gaddr, + weight_gaddr, enable_pmu, cmdbuf_mem); +} + +bmerr_t cvi_load_cmdbuf_tee(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + uint64_t neuron_gaddr, uint64_t weight_gaddr, uint32_t weight_len, bmmem_device_t *cmdbuf_mem) +{ + return cvi_device.load_cmdbuf_tee(ctx, cmdbuf, sz, neuron_gaddr, + weight_gaddr, weight_len, cmdbuf_mem); +} + +bmerr_t cvi_run_cmdbuf_tee(bmctx_t ctx, uint16_t *seq_no, uint64_t dmabuf_addr, cvi_array_base *array_base) +{ + return cvi_device.run_cmdbuf_tee(ctx, seq_no, dmabuf_addr, array_base); +} + +bmerr_t bm_run_cmdbuf(bmctx_t ctx, bmmem_device_t cmdbuf_mem, uint16_t *seq_no) { + return cvi_device.run_cmdbuf(ctx, 
cmdbuf_mem, seq_no); +} + +bmerr_t bm_run_cmdbuf_ex(bmctx_t ctx, bmmem_device_t cmdbuf_mem, uint16_t *seq_no, + uint64_t input_base_addr, uint64_t output_base_addr) { + return cvi_device.run_cmdbuf_ex(ctx, cmdbuf_mem, seq_no, input_base_addr, output_base_addr); +} + +bmerr_t bm_run_cmdbuf_ex2(bmctx_t ctx, bmmem_device_t cmdbuf_mem, uint16_t *seq_no, + cvi_array_base *p_array_base) { + return cvi_device.run_cmdbuf_ex2(ctx, cmdbuf_mem, seq_no, p_array_base); +} + +bmerr_t cvi_run_async(bmctx_t ctx, bmmem_device_t cmdbuf_mem) +{ + return cvi_device.run_async(ctx, cmdbuf_mem); +} + +bmerr_t bm_send_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, uint16_t *seq_no) { + return cvi_device.send_cmdbuf(ctx, cmdbuf, sz, seq_no); +} + +bmerr_t bm_wait_cmdbuf_done(bmctx_t ctx, uint16_t seq_no) { + return cvi_device.wait_cmdbuf_done(ctx, seq_no); +} + +bmerr_t cvi_wait_cmdbuf_all(bmctx_t ctx) { + return cvi_device.wait_cmdbuf_all(ctx); +} + +bmerr_t bm_run_cmdbuf_pio(bmctx_t ctx, uint8_t *cmdbuf, size_t sz) { + return cvi_device.run_cmdbuf_pio(ctx, cmdbuf, sz); +} + +void bm_device_set_base_reg(bmctx_t ctx, uint32_t inx, uint64_t addr) { + cvi_device.set_base_reg(ctx, inx, addr); +} + +uint64_t bm_device_read_base_reg(bmctx_t ctx, u32 inx) { + return cvi_device.read_base_reg(ctx, inx); +} + +int bm_device_get_chip_ver(bmdev_t dev) { + return cvi_device.get_chip_ver(dev); +} + +bmerr_t bm_parse_pmubuf(bmmem_device_t cmdbuf_mem, uint8_t **buf_start, uint32_t *buf_len) { + return cvi_device.parse_pmubuf(cmdbuf_mem, buf_start, buf_len); +} + +void cviruntime_cvikernel_create(bmctx_t ctx, void **p_bk_ctx) { + cvi_device.cvikernel_create(ctx, p_bk_ctx); +} + +void cviruntime_cvikernel_submit(bmctx_t ctx) { + cvi_device.cvikernel_submit(ctx); +} + +void cviruntime_cvikernel_destroy(bmctx_t ctx) { + cvi_device.cvikernel_destroy(ctx); +} diff --git a/cviruntime/src/soc/183x/cvi183x_device_mem.cpp b/cviruntime/src/soc/183x/cvi183x_device_mem.cpp new file mode 100644 index 000000000..d734a34aa --- /dev/null +++ b/cviruntime/src/soc/183x/cvi183x_device_mem.cpp @@ -0,0 +1,298 @@ +#include +#include +#include +#include "cvi183x_device_mem.h" + +Cvi183xDeviceMem::Cvi183xDeviceMem() { + GLOBAL_MEM_START_ADDR = 0x100000000; + g_gmem_size = 1ULL << 30; // 1GB + tpu_dmabuf_header_m = 0xB5B5; +} + +Cvi183xDeviceMem::~Cvi183xDeviceMem() {} + + +bmerr_t Cvi183xDeviceMem::device_open(int index, bmdev_t *dev) +{ + bm_device_t *pdev = new bm_device_t; + + BMDEV_LOCK_INIT(pdev); + pdev->index = index; + pdev->info.info183x = bmk1880v2_chip_info(); + pdev->gmem_size = g_gmem_size; + + const char* tpu_dev_name_defalut = TPU_DEV_NAME; + const char* tpu_dev_name_env = std::getenv("TPU_DEV"); + const char *tpu_dev_name = tpu_dev_name_defalut; + if (tpu_dev_name_env) { + tpu_dev_name = tpu_dev_name_env; + } + + pdev->dev_fd = open(tpu_dev_name, O_RDWR); + if (pdev->dev_fd <= 0) { + TPU_LOG_WARNING("open %s failed\n", tpu_dev_name); + return BM_ERR_FAILURE; + } + + pdev->ion_fd = open(ION_DEV_NAME, O_RDWR); + if (pdev->ion_fd <= 0) { + TPU_LOG_WARNING("open %s failed\n", ION_DEV_NAME); + return BM_ERR_FAILURE; + } + + int ret = ion_query_heap(pdev); + TPU_ASSERT(ret == BM_SUCCESS, NULL); + + *dev = pdev; + + return BM_SUCCESS; +} + +void Cvi183xDeviceMem::device_close(bmdev_t dev) +{ + close(dev->ion_fd); + close(dev->dev_fd); + + // TPU_LOG_WARNING("device[%d] closed\n", dev->index); + + BMDEV_LOCK_DEINIT(dev); + delete dev; +} + +int Cvi183xDeviceMem::get_chip_ver(bmdev_t dev) { + return dev->info.info183x.version; +} + +void 
Cvi183xDeviceMem::mem_free_raw(bmctx_t ctx, bmmem_device_t mem) { + char array_got = 0; + bm_memory_t *device_mem = (bm_memory_t *)mem; + TPU_ASSERT(device_mem->flags.u.type == BMMEM_TYPE_DEVICE, NULL); + + if (!device_mem->flags.u.is_prealloc) { + //checking if it needs to unload firstly + for (uint32_t i = 0; i < TEE_FIREWALL_MAX; i ++) { + if (root_tee_firewall_info[i].address == device_mem->p_addr) { + unload_tee(ctx, device_mem->p_addr, device_mem->size); + memset(&root_tee_firewall_info[i], 0, sizeof(tee_firewall_info)); + break; + } + } + + mem_free(device_mem->v_addr, device_mem->size, device_mem->dma_fd); + + for (int i = 0; i < MEMARRAY_MAX_CNT; i ++) { + if (ctx->root_mem_array[i].p_addr == device_mem->p_addr) { + ctx->root_mem_array[i].p_addr = 0; + ctx->root_mem_array[i].mem = NULL; + array_got = 1; + break; + } + } + + if (!array_got) + TPU_LOG_WARNING("bmmem_device_free() can not find match\n"); + } + + BMEMEM_DUMP(); + delete device_mem; +} + + +bmerr_t Cvi183xDeviceMem::load_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + uint64_t neuron_gaddr, uint64_t weight_gaddr, + bool enable_pmu, bmmem_device_t *cmdbuf_mem) { + bmerr_t ret; + size_t dmabuf_size = 0; + size_t pmubuf_size = 0; + bmmem_device_t dmabuf_mem; + ret = cvi183x_dmabuf_size(cmdbuf, sz, &dmabuf_size, &pmubuf_size); + + //calculate pmu size + pmubuf_size = enable_pmu ? pmubuf_size : 0; + //TPU_LOG_DEBUG("pmubuf_size = 0x%lx\n", pmubuf_size); + if (protect) { + dmabuf_mem = mem_alloc_pagesize(ctx, dmabuf_size + pmubuf_size); + } else { + dmabuf_mem = mem_alloc_raw(ctx, dmabuf_size + pmubuf_size); + } + if (!dmabuf_mem) { + return BM_ERR_NOMEM; + } + uint64_t dmabuf_devaddr = mem_p_addr(dmabuf_mem); + + ret = cvi183x_dmabuf_convert(cmdbuf, sz, dmabuf_mem->v_addr); + set_base_reg(ctx, 0, neuron_gaddr); + set_base_reg(ctx, 1, weight_gaddr); + cvi183x_arraybase_set(dmabuf_mem->v_addr, (u32)neuron_gaddr, (u32)weight_gaddr, 0, 0); + + cvi183x_dmabuf_relocate(dmabuf_mem->v_addr, dmabuf_devaddr + GLOBAL_MEM_START_ADDR, dmabuf_size, + pmubuf_size); + TPU_ASSERT(mem_flush_ext(ctx->dev, dmabuf_mem->dma_fd, + dmabuf_mem->p_addr, dmabuf_size) == BM_SUCCESS, NULL); + // record dmabuf crc32 + // dmabuf_mem->crc32 = bm_crc32(dmabuf, dmabuf_size); + + *cmdbuf_mem = dmabuf_mem; + + // if (0) { + //cvi183x_dmabuf_dump(dmabuf); + //} + return ret; +} + +bmerr_t Cvi183xDeviceMem::load_dmabuf(bmctx_t ctx, bmmem_device_t dmabuf, + size_t sz, uint64_t neuron_gaddr, uint64_t weight_gaddr, + bool enable_pmu, bmmem_device_t *dmabuf_mem) { + size_t pmubuf_size = 0; + if (enable_pmu) { + pmubuf_size = cvi183x_get_pmusize(dmabuf->v_addr); + *dmabuf_mem = mem_alloc_raw(ctx, sz + pmubuf_size); + if (*dmabuf_mem == nullptr) { + TPU_LOG_ERROR("alloc dmabuf mem fail!\n"); + return BM_ERR_NOMEM; + } + std::memcpy((*dmabuf_mem)->v_addr, dmabuf->v_addr, sz); + } else { + *dmabuf_mem = dmabuf; + } + uint64_t dmabuf_devaddr = mem_p_addr(*dmabuf_mem); + + + //set_base_reg(ctx, 0, neuron_gaddr); + //set_base_reg(ctx, 1, weight_gaddr); + cvi183x_arraybase_set((*dmabuf_mem)->v_addr, (u32)neuron_gaddr, (u32)weight_gaddr, 0, 0); + + cvi183x_dmabuf_relocate((*dmabuf_mem)->v_addr, dmabuf_devaddr + GLOBAL_MEM_START_ADDR, sz, + pmubuf_size); + TPU_ASSERT(mem_flush_ext(ctx->dev, (*dmabuf_mem)->dma_fd, + (*dmabuf_mem)->p_addr, sz) == BM_SUCCESS, + NULL); + // if (0) { + //cvi183x_dmabuf_dump(dmabuf); + //} + return BM_SUCCESS; +} + +bmerr_t Cvi183xDeviceMem::load_cmdbuf_tee(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + uint64_t neuron_gaddr, uint64_t weight_gaddr, 
+ uint32_t weight_len, bmmem_device_t *cmdbuf_mem) +{ + bmmem_device_t dmabuf_mem; + uint8_t i = 0, firewall_hit = 0; + + //malloc double size buffer, because TEE needs 2nd space to calculate dmabuf + if (protect) { + dmabuf_mem = mem_alloc_pagesize(ctx, sz + sz); + } else { + dmabuf_mem = mem_alloc_raw(ctx, sz + sz); + } + + if (!dmabuf_mem) { + TPU_LOG_ERROR("alloc dmabuf mem fail!\n"); + return BM_ERR_NOMEM; + } + + //transfer encrypted cmdbuf to TEE + memcpy(dmabuf_mem->v_addr, cmdbuf, sz); + TPU_ASSERT((int)mem_flush_ext(ctx->dev, dmabuf_mem->dma_fd, + dmabuf_mem->p_addr, sz) == BM_SUCCESS, NULL); + + //firewall setting is in driver, only record info + for (i = 0; i < TEE_FIREWALL_MAX; i ++) { + if (root_tee_firewall_info[i].address == weight_gaddr) { + firewall_hit = 1; + break; + } + } + + if (!firewall_hit) { + for (i = 0; i < TEE_FIREWALL_MAX; i ++) { + if (!root_tee_firewall_info[i].address) { + root_tee_firewall_info[i].address = weight_gaddr; + firewall_hit = 1; + break; + } + } + } + + if (!firewall_hit) { + TPU_LOG_ERROR("tee only supported %d firewall, not enough\n", TEE_FIREWALL_MAX); + return BM_ERR_FAILURE; + } + + //ioctl to get secure dma buffer + load_tee(ctx, dmabuf_mem->p_addr, sz, weight_gaddr, weight_len, neuron_gaddr); + + //this region should be protected, can't touch in REE + *cmdbuf_mem = dmabuf_mem; + return BM_SUCCESS; +} + + +bmerr_t Cvi183xDeviceMem::unload_tee(bmctx_t ctx, uint64_t paddr, size_t size) +{ + int ret; + struct cvi_unload_tee_arg unload_arg; + unload_arg.paddr = paddr + GLOBAL_MEM_START_ADDR; + unload_arg.size = size; + + ret = ioctl(ctx->dev->dev_fd, CVITPU_UNLOAD_TEE, &unload_arg); + if (ret != 0) { + TPU_LOG_WARNING("unload firewall failed, ret=%x\n", ret); + return BM_ERR_FAILURE; + } + return BM_SUCCESS; +} + +bmerr_t Cvi183xDeviceMem::parse_pmubuf(bmmem_device_t cmdbuf_mem, uint8_t **buf_start, uint32_t *buf_len) { + dma_hdr_t *header = (dma_hdr_t *)(cmdbuf_mem->v_addr); + //TPU_LOG_DEBUG("header->arraybase_0_L = 0x%x\n", header->arraybase_0_L); + //TPU_LOG_DEBUG("header->arraybase_1_L = 0x%x\n", header->arraybase_1_L); + //TPU_LOG_DEBUG("header->arraybase_0_H = 0x%x\n", header->arraybase_0_H); + //TPU_LOG_DEBUG("header->arraybase_1_H = 0x%x\n", header->arraybase_1_H); + //TPU_LOG_DEBUG("header->pmubuf_offset = 0x%x\n", header->pmubuf_offset); + //TPU_LOG_DEBUG("header->pmubuf_size = 0x%x\n", header->pmubuf_size); + if (header->pmubuf_size && header->pmubuf_offset) { + tpu_pmu_dump_main(cmdbuf_mem->v_addr, cmdbuf_mem->p_addr); + } + *buf_start = cmdbuf_mem->v_addr; + *buf_len = cmdbuf_mem->size; + return BM_SUCCESS; +} + +void Cvi183xDeviceMem::cvikernel_create(bmctx_t ctx, void **p_bk_ctx) { + TPU_ASSERT(ctx != nullptr, nullptr); + TPU_ASSERT(ctx->dev != nullptr, nullptr); + + bmk1880v2_chip_info_t info = bmk1880v2_chip_info(); + bmk1880v2_chip_info_t *dev_info = &info; + + bmk_info_t bmk_info; + bmk_info.chip_version = dev_info->version; + bmk_info.cmdbuf_size = 0x100000; + bmk_info.cmdbuf = (u8 *)malloc(bmk_info.cmdbuf_size); + TPU_ASSERT(bmk_info.cmdbuf, "create cvikernel, malloc failed\n"); + + ctx->cvik_context.ctx183x = bmk1880v2_register(&bmk_info); + ctx->cvik_cmdbuf = (void *)bmk_info.cmdbuf; + + *p_bk_ctx = ctx->cvik_context.ctx183x; +} + +void Cvi183xDeviceMem::cvikernel_submit(bmctx_t ctx) { + u32 len; + u8 *cmdbuf = bmk1880v2_acquire_cmdbuf(ctx->cvik_context.ctx183x, &len); + + uint16_t seq_no; + bmerr_t ret = send_cmdbuf(ctx, cmdbuf, (size_t)len, &seq_no); + TPU_ASSERT(ret == BM_SUCCESS, NULL); + 
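+ // reset the kernel context after submission so its command buffer can be reused for the next sequence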
bmk1880v2_reset(ctx->cvik_context.ctx183x); +} + +void Cvi183xDeviceMem::cvikernel_destroy(bmctx_t ctx) { + TPU_ASSERT(ctx->cvik_context.ctx183x, NULL); + TPU_ASSERT(ctx->cvik_cmdbuf, NULL); + + bmk1880v2_cleanup(ctx->cvik_context.ctx183x); + free(ctx->cvik_cmdbuf); +} \ No newline at end of file diff --git a/cviruntime/src/soc/183x/cvi183x_device_mem.h b/cviruntime/src/soc/183x/cvi183x_device_mem.h new file mode 100644 index 000000000..82fcbf551 --- /dev/null +++ b/cviruntime/src/soc/183x/cvi183x_device_mem.h @@ -0,0 +1,35 @@ +#pragma once + +#include +#include +#include +#include +#include +#include "cvi_device_mem.h" +#include "bmruntime_internal.h" + +class Cvi183xDeviceMem : public CviDeviceMem { +public: + Cvi183xDeviceMem(); + ~Cvi183xDeviceMem() override; + virtual bmerr_t device_open(int index, bmdev_t *dev) override; + virtual void device_close(bmdev_t dev) override; + virtual int get_chip_ver(bmdev_t dev) override; + virtual void mem_free_raw(bmctx_t ctx, bmmem_device_t mem); + virtual bmerr_t load_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + uint64_t neuron_gaddr, uint64_t weight_gaddr, + bool enable_pmu, bmmem_device_t *cmdbuf_mem) override; + virtual bmerr_t load_dmabuf(bmctx_t ctx, bmmem_device_t dmabuf, + size_t sz, uint64_t neuron_gaddr, + uint64_t weight_gaddr, bool enable_pmu, + bmmem_device_t *dmabuf_mem) override; + virtual bmerr_t load_cmdbuf_tee(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, + uint64_t neuron_gaddr, uint64_t weight_gaddr, + uint32_t weight_len, + bmmem_device_t *cmdbuf_mem); + virtual bmerr_t unload_tee(bmctx_t ctx, uint64_t paddr, size_t size); + virtual bmerr_t parse_pmubuf(bmmem_device_t cmdbuf_mem, uint8_t **buf_start, uint32_t *buf_len); + virtual void cvikernel_create(bmctx_t ctx, void **p_bk_ctx) override; + virtual void cvikernel_submit(bmctx_t ctx) override; + virtual void cvikernel_destroy(bmctx_t ctx) override; +}; diff --git a/cviruntime/src/soc/183x/cvi_rt_183x.cpp b/cviruntime/src/soc/183x/cvi_rt_183x.cpp new file mode 100644 index 000000000..0cd1f99a9 --- /dev/null +++ b/cviruntime/src/soc/183x/cvi_rt_183x.cpp @@ -0,0 +1,83 @@ +#include "cvi_rt_183x.h" +std::unique_ptr cvi_chip(new CviRT183x()); + +CviRT183x::CviRT183x() { + chip_name_ = "cv183x"; + submit_magic_ = 0x18325678; + cvi_device = std::move(std::unique_ptr(new Cvi183xDeviceMem())); +} + +CviRT183x::~CviRT183x() {} + +CVI_RT_KHANDLE CviRT183x::GetKHandleBK(CVI_RT_HANDLE rt_handle) { + bmctx_t ctx = (bmctx_t)rt_handle; + return (CVI_RT_KHANDLE)(ctx->cvik_context.ctx183x); +} + +CVI_RC CviRT183x::DeInitBK(CVI_RT_HANDLE rt_handle) { + bmctx_t ctx = (bmctx_t)rt_handle; + + //deinit kernel related + if (ctx->cvik_context.ctx183x) { + bmk1880v2_cleanup(ctx->cvik_context.ctx183x); + } + + if (ctx->cvik_cmdbuf) { + free(ctx->cvik_cmdbuf); + } + + //deinit basic context + bm_exit(ctx); + return CVI_SUCCESS; +} + +CVI_RC CviRT183x::InitWithKernelBK(CVI_RT_HANDLE *rt_handle, uint32_t cmdbuf_size) { + bmctx_t *ctx = (bmctx_t *)rt_handle; + + //init basic context + bm_init(DEVICE_INDEX_NUM, ctx); + + //init cvikernel related + bmk1880v2_chip_info_t info = bmk1880v2_chip_info(); + bmk1880v2_chip_info_t *dev_info = &info; + + bmk_info_t bmk_info; + bmk_info.chip_version = dev_info->version; + bmk_info.cmdbuf_size = cmdbuf_size; + bmk_info.cmdbuf = (u8 *)malloc(bmk_info.cmdbuf_size); + if (!bmk_info.cmdbuf) { + TPU_ASSERT(bmk_info.cmdbuf, "malloc kernel buffer failed"); + return CVI_FAILURE; + } + + (*ctx)->cvik_context.ctx183x = bmk1880v2_register(&bmk_info); + (*ctx)->cvik_cmdbuf = (void 
*)bmk_info.cmdbuf; + + return CVI_SUCCESS; +} + +CVI_RC CviRT183x::LoadCmdbufTee(CVI_RT_HANDLE rt_handle, uint8_t *cmdbuf, + size_t sz, uint64_t neuron_gaddr, + uint64_t weight_gaddr, uint32_t weight_len, + CVI_RT_MEM *cmdbuf_mem) { + return (CVI_RC)cvi_device->load_cmdbuf_tee((bmctx_t)rt_handle, cmdbuf, + sz, neuron_gaddr, weight_gaddr, + weight_len, (bmmem_device_t *)cmdbuf_mem); +} + +CVI_RC CviRT183x::RunCmdbufTee( + CVI_RT_HANDLE rt_handle, CVI_RT_MEM cmdbuf_mem, + CVI_RT_ARRAYBASE *p_array_base) +{ + CVI_RC ret; + uint16_t seq_no; + bm_memory_t *mem = (bm_memory_t *)cmdbuf_mem; + + ret = (CVI_RC)cvi_device->run_cmdbuf_tee( + (bmctx_t)rt_handle, + &seq_no, mem->p_addr, (cvi_array_base *)p_array_base); + if (ret != 0) + return ret; + + return (CVI_RC)cvi_device->wait_cmdbuf_done((bmctx_t)rt_handle, seq_no); +} \ No newline at end of file diff --git a/cviruntime/src/soc/183x/cvi_rt_183x.h b/cviruntime/src/soc/183x/cvi_rt_183x.h new file mode 100644 index 000000000..eeede1759 --- /dev/null +++ b/cviruntime/src/soc/183x/cvi_rt_183x.h @@ -0,0 +1,19 @@ +#pragma once +#include "cvi_rt_base.h" +#include "cvi183x_device_mem.h" + +class CviRT183x : public CviRTSoc { +public: + CviRT183x(); + virtual ~CviRT183x() override; + + virtual CVI_RT_KHANDLE GetKHandleBK(CVI_RT_HANDLE rt_handle) override; + virtual CVI_RC DeInitBK(CVI_RT_HANDLE rt_handle) override; + virtual CVI_RC InitWithKernelBK(CVI_RT_HANDLE *rt_handle, uint32_t cmdbuf_size) override; + virtual CVI_RC LoadCmdbufTee(CVI_RT_HANDLE rt_handle, uint8_t *cmdbuf, + size_t sz, uint64_t neuron_gaddr, + uint64_t weight_gaddr, uint32_t weight_len, + CVI_RT_MEM *cmdbuf_mem) override; + virtual CVI_RC RunCmdbufTee(CVI_RT_HANDLE rt_handle, CVI_RT_MEM cmdbuf_mem, + CVI_RT_ARRAYBASE *p_array_base); +}; diff --git a/cviruntime/src/soc/183x/tpu_pmu.cpp b/cviruntime/src/soc/183x/tpu_pmu.cpp new file mode 100644 index 000000000..dcab3eda1 --- /dev/null +++ b/cviruntime/src/soc/183x/tpu_pmu.cpp @@ -0,0 +1,753 @@ +#include +#include +#include +#include +#include +#include +#include +#include "bmruntime_internal.h" +#include +#include +#include + + +struct TPU_PMU_DOUBLEEVENT { + unsigned long long type : 4; + unsigned long long desID : 16; + unsigned long long eventCnt0 : 22; + unsigned long long eventCnt1 : 22; + uint32_t endTime; + uint32_t startTime; +}; + +typedef enum _EXCEL_TYPE { + EXCEL_TYPE_0 = 0, + EXCEL_TYPE_1 = 1, + EXCEL_TYPE_2 = 2, + EXCEL_TYPE_3 = 3, + EXCEL_TYPE_4 = 4, +} EXCEL_TYPE; + +enum TPU_PMUTYPE { + TPU_PMUTYPE_TDMALOAD = 1, + TPU_PMUTYPE_TDMASTORE = 2, + TPU_PMUTYPE_TDMAMOVE = 3, + TPU_PMUTYPE_TIU = 4, +}; + +typedef struct _TPU_DES_ELEMENT { + TPU_PMU_DOUBLEEVENT pmuEvent; + tiu_reg_t tiuReg; + tdma_reg_t tdmaReg; + char typeStr[50]; +} TPU_DES_ELEMENT; + +typedef struct _TPU_LAYERID_ELEMENT { + uint32_t layerID; + TPU_PMUTYPE last_desType; + uint32_t last_mapping_desID; + uint32_t endTime; + uint32_t startTime; +// uint8_t layerName[50]; + uint32_t u32StartAddr; + uint32_t u32OutputLen; + + uint32_t u32LoadNueronTime; + uint32_t u32LoadWeightTime; + uint32_t u32StoreNueronTime; + uint32_t u32TIUTime; + uint32_t u32TDMATime; + uint32_t u32byteCnt; + + double parallelism; + double duration_percent; + double loadNeuron_percent; + double loadWeight_percent; + double storeNeuron_percent; + double tiu_percent; + double throughput_MB; +} TPU_LAYERID_ELEMENT; + +#define FILE_OUT_LINE_LEN 2048 +#define TPUPMU_DES_FILENAME "_des.csv" +#define TPUPMU_LAYER_FILENAME "_layer.csv" +const char *pmubuf_output_file_env = NULL; + + +#define 
TPU_CLOCK_DEFAULT (750000000)
+#define TPU_WRAP_LIMIT 0xFFFFFFFF
+#define TPU_BURST_SIZE 16
+#define DES_MAX (65535 * 6) //hardcoded for now; the real count could be queried from the dmabuf
+TPU_DES_ELEMENT *p_element = NULL;
+TPU_LAYERID_ELEMENT *p_layer = NULL;
+
+static void tpu_pmu_fill_cmdbuf(uint8_t *v_dma_buf);
+
+static void reorder_back_tiu_cmdbuf_reg(uint8_t *cmdbuf)
+{
+ int total_bits = BD_REG_BYTES * 8;
+
+ uint8_t tmp[128 / 8];
+ uint8_t *last = &cmdbuf[(total_bits - 128) / 8];
+ memcpy(tmp, last, sizeof(tmp));
+ memcpy(last, cmdbuf, sizeof(tmp));
+ memcpy(cmdbuf, tmp, sizeof(tmp));
+}
+
+static void tdma_des_fill_str(TPU_DES_ELEMENT *element)
+{
+ char str1[50] = ""; // initialized so an unknown type cannot leave it undefined
+
+ switch(element->pmuEvent.type) {
+ case TPU_PMUTYPE_TDMALOAD:
+ sprintf(str1, "%s", "Load");
+ break;
+ case TPU_PMUTYPE_TDMASTORE:
+ sprintf(str1, "%s", "Store");
+ break;
+ case TPU_PMUTYPE_TDMAMOVE:
+ sprintf(str1, "%s", "Move");
+ break;
+ default:
+ break;
+ }
+
+ if (element->tdmaReg.compress_en)
+ strcat(str1, " Compression"); // appended in place; sprintf() with str1 as both source and destination is undefined behavior
+
+ if (element->tdmaReg.sys_dtype)
+ sprintf(element->typeStr, "%s %s", "TDMA Matrix", str1);
+ else
+ sprintf(element->typeStr, "%s %s", "TDMA Tensor", str1);
+}
+
+static void tpu_pmu_fill_cmdbuf(uint8_t *v_dma_buf)
+{
+ cvi_cpu_desc_t *desc = (cvi_cpu_desc_t *)(v_dma_buf + sizeof(dma_hdr_t));
+
+ uint64_t tiu_offset = 0, tdma_offset = 0;
+ uint32_t tiu_cnt = 0, tdma_cnt = 0, i = 0, offset = 0;
+ uint32_t start_index_tdma = 0, start_index_tiu = 0;
+ uint32_t index = 0;
+ tdma_reg_t tmpTDMA_Reg;
+ tiu_reg_t tmpTIU_Reg;
+ uint8_t tiu_recorded_buf[BD_REG_BYTES];
+ uint32_t tdma_id_previous = 0, tdma_start_pre= 0, tdma_end_pre = 0;
+
+ //get tiu/tdma descriptor start address
+ tiu_offset = desc->offset_tiu_ori_bk;
+ tdma_offset = desc->offset_tdma_ori_bk;
+ //TPU_LOG_DEBUG("tpu_pmu_fill_cmdbuf() tiu_offset=0x%" PRIx64", tdma_offset=0x%" PRIx64 "\n", tiu_offset, tdma_offset);
+
+ tiu_cnt = desc->num_tiu;
+ tdma_cnt = desc->num_tdma;
+ //TPU_LOG_DEBUG("tpu_pmu_fill_cmdbuf() tiu_cnt=%d, tdma_cnt=%d\n", tiu_cnt, tdma_cnt);
+
+ while (p_element[index].pmuEvent.type) {
+ if (p_element[index].pmuEvent.type != TPU_PMUTYPE_TIU) { //tdma
+
+ if ((p_element[index].pmuEvent.desID != tdma_id_previous) ||
+ (p_element[index].pmuEvent.startTime != tdma_start_pre) ||
+ (p_element[index].pmuEvent.endTime != tdma_end_pre)) {
+ for (i = start_index_tdma; i < tdma_cnt; i ++) {
+ offset = tdma_offset + ((1 << TDMA_DESCRIPTOR_ALIGNED_BIT) * i);
+ parse_tdma_reg(&tmpTDMA_Reg, (uint32_t *)(v_dma_buf + offset));
+
+ if (p_element[index].pmuEvent.desID == tmpTDMA_Reg.cmd_id) {
+ memcpy(&p_element[index].tdmaReg, &tmpTDMA_Reg, sizeof(tmpTDMA_Reg));
+ tdma_des_fill_str(&p_element[index]);
+ start_index_tdma = i + 1;
+ tdma_id_previous = p_element[index].pmuEvent.desID;
+ tdma_start_pre = p_element[index].pmuEvent.startTime;
+ tdma_end_pre = p_element[index].pmuEvent.endTime;
+ break;
+ }
+ }
+ } else { //tdma g2g case, copy 1st to 2nd tdma descriptor
+ memcpy(&p_element[index].tdmaReg, &p_element[index - 1].tdmaReg, sizeof(tmpTDMA_Reg));
+ tdma_des_fill_str(&p_element[index]);
+ }
+ } else { //tiu
+ for (i = start_index_tiu; i < tiu_cnt; i ++) {
+ offset = tiu_offset + (BD_REG_BYTES * i);
+ uint8_t *tiu_cmdbuf = v_dma_buf + offset;
+
+ //get the tiu_reg struct
+ memcpy(tiu_recorded_buf, tiu_cmdbuf, BD_REG_BYTES);
+ reorder_back_tiu_cmdbuf_reg(tiu_recorded_buf);
+ parse_tiu_reg(&tmpTIU_Reg, (uint32_t *)tiu_recorded_buf);
+
+ if (p_element[index].pmuEvent.desID == tmpTIU_Reg.cmd_id_tpu) {
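+ // the PMU event is matched to its TIU descriptor: keep the parsed register
+ // set and derive a human-readable operation name from the task type below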
memcpy(&p_element[index].tiuReg, &tmpTIU_Reg, sizeof(tmpTIU_Reg)); + + switch(tmpTIU_Reg.tsk_typ) { + case DCR_TYPE_CONV_FIX8B: + if (!tmpTIU_Reg.opt_chl_quan) { + if (tmpTIU_Reg.opd_typ) + strcpy(p_element[index].typeStr, "TIU BF16 Convolution"); + else + strcpy(p_element[index].typeStr, "TIU Convolution"); + } else { + strcpy(p_element[index].typeStr, "TIU PerChannel Convolution"); + } + break; + case DCR_TYPE_DEPTHWISE_POOL_FIX8B: + switch (tmpTIU_Reg.tsk_eu_typ) { + case 0: + if (tmpTIU_Reg.opd_typ) + strcpy(p_element[index].typeStr, "TIU BF16 Max Pooling"); + else + strcpy(p_element[index].typeStr, "TIU Max Pooling"); + break; + case 1: + if (tmpTIU_Reg.opd_typ) + strcpy(p_element[index].typeStr, "TIU BF16 Average Pooling"); + else + strcpy(p_element[index].typeStr, "TIU Average Pooling"); + break; + case 2: + if (!tmpTIU_Reg.opt_chl_quan) { + if (tmpTIU_Reg.opd_typ) + strcpy(p_element[index].typeStr, "TIU BF16 Depthwise Convolution"); + else + strcpy(p_element[index].typeStr, "TIU Depthwise Convolution"); + } else { + strcpy(p_element[index].typeStr, "TIU Depthwise PerChannel Convolution"); + } + break; + default: + break; + } + break; + case DCR_TYPE_FC_FIX8B: + if (!tmpTIU_Reg.opt_chl_quan) { + if (tmpTIU_Reg.opd_typ) + strcpy(p_element[index].typeStr, "TIU BF16 Matrix Multiplication"); + else + strcpy(p_element[index].typeStr, "TIU Matrix Multiplication"); + } else { + strcpy(p_element[index].typeStr, "TIU PerChannel Matrix Multiplication"); + } + break; + case DCR_TYPE_TENSOR_ARITH_FIX8B: + if (tmpTIU_Reg.tens_mdsum) { + strcpy(p_element[index].typeStr, "TIU Mdsum"); + } else if (tmpTIU_Reg.tens_lookup) { + strcpy(p_element[index].typeStr, "TIU Lookup Table"); + } else { + switch (tmpTIU_Reg.tsk_eu_typ) { + case 0: + if (!tmpTIU_Reg.opt_chl_quan) { + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Mul"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Mul"); + } + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Mul(QDM)"); + } + break; + case 1: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Mac"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Mac"); + } + break; + case 2: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Add"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Add"); + } + break; + case 3: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Sub"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Sub"); + } + break; + case 4: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Max"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Max"); + } + break; + case 5: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Min"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Min"); + } + break; + case 6: + strcpy(p_element[index].typeStr, "TIU Element-wise Shift"); + break; + case 7: + strcpy(p_element[index].typeStr, "TIU Element-wise AND"); + break; + case 8: + strcpy(p_element[index].typeStr, "TIU Element-wise OR"); + break; + case 9: + strcpy(p_element[index].typeStr, "TIU Element-wise XOR"); + break; + case 10: + if (tmpTIU_Reg.opd_typ) { + strcpy(p_element[index].typeStr, "TIU BF16 Element-wise Copy"); + } else { + strcpy(p_element[index].typeStr, "TIU Element-wise Copy"); + } + break; + default: + break; + } + } + break; + } + + start_index_tiu = i + 
1; + break; + } + } + } + index ++; + } + +} + +#include +#include +using namespace std; + +static void tpu_pmu_fwrite_des() +{ + uint32_t index = 0; + uint64_t srcAddr = 0, dstAddr = 0; + + char lineStr[FILE_OUT_LINE_LEN] = {0}; + EXCEL_TYPE excelType = EXCEL_TYPE_0; + + std::fstream fout_element; + sprintf(lineStr, "%s%s", pmubuf_output_file_env, TPUPMU_DES_FILENAME); + //TPU_LOG_DEBUG("out file_des name=%s\n", lineStr); + fout_element.open(lineStr, std::ios::out | std::ios::trunc); + + strcpy(lineStr, "pmutype, desID, event0, event1, , start, duration, end, layerID, desType, \ + srcAddr, dstAddr, trans_fmt, transpose_md, cmd_id, wait_id_tpu, dst_h_stride, dst_c_stride_low, \ + dst_n_stride, src_h_stride, src_c_stride_low, src_n_stride, dst_c, src_c, dst_w, dst_h, src_w, src_h, src_n\n"); + fout_element << lineStr; + + //dump descriptor content related + while (p_element[index].pmuEvent.type) + { + switch (p_element[index].pmuEvent.type) { + case TPU_PMUTYPE_TDMALOAD: + excelType = EXCEL_TYPE_1; + break; + case TPU_PMUTYPE_TDMASTORE: + case TPU_PMUTYPE_TDMAMOVE: + excelType = EXCEL_TYPE_2; + break; + case TPU_PMUTYPE_TIU: + excelType = EXCEL_TYPE_3; + break; + } + + if (p_element[index].pmuEvent.type == TPU_PMUTYPE_TIU) { + sprintf(lineStr, "%llu, %llu, %llu, %llu, %u, %u, %u, %u, %u, %s\n", + p_element[index].pmuEvent.type, + p_element[index].pmuEvent.desID, + p_element[index].pmuEvent.eventCnt0, + p_element[index].pmuEvent.eventCnt1, + excelType, + p_element[index].pmuEvent.startTime, + p_element[index].pmuEvent.endTime - p_element[index].pmuEvent.startTime, + p_element[index].pmuEvent.endTime, + p_element[index].tiuReg.layer_info, + p_element[index].typeStr); + } else { + srcAddr = ((uint64_t)(p_element[index].tdmaReg.src_base_addr_high) << 32) | + (uint64_t)(p_element[index].tdmaReg.src_base_addr_low); + dstAddr = ((uint64_t)(p_element[index].tdmaReg.dst_base_addr_high) << 32) | + (uint64_t)(p_element[index].tdmaReg.dst_base_addr_low); + + sprintf(lineStr, "%llu, %llu, %llu, %llu, %u, %u, %u, %u, %u, %s, 0x%" PRIu64 ", 0x%" PRIu64 ", \ + %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u, %u\n", + p_element[index].pmuEvent.type, + p_element[index].pmuEvent.desID, + p_element[index].pmuEvent.eventCnt0, + p_element[index].pmuEvent.eventCnt1, + excelType, + p_element[index].pmuEvent.startTime, + p_element[index].pmuEvent.endTime - p_element[index].pmuEvent.startTime, + p_element[index].pmuEvent.endTime, + p_element[index].tdmaReg.layer_ID, + p_element[index].typeStr, + srcAddr, + dstAddr, + p_element[index].tdmaReg.trans_fmt, + p_element[index].tdmaReg.transpose_md, + p_element[index].tdmaReg.cmd_id, + p_element[index].tdmaReg.wait_id_tpu, + p_element[index].tdmaReg.dst_h_stride, + p_element[index].tdmaReg.dst_c_stride_low, + p_element[index].tdmaReg.dst_n_stride, + p_element[index].tdmaReg.src_h_stride, + p_element[index].tdmaReg.src_c_stride_low, + p_element[index].tdmaReg.src_n_stride, + p_element[index].tdmaReg.dst_c, + p_element[index].tdmaReg.src_c, + p_element[index].tdmaReg.dst_w, + p_element[index].tdmaReg.dst_h, + p_element[index].tdmaReg.src_w, + p_element[index].tdmaReg.src_h, + p_element[index].tdmaReg.src_n); + } + + fout_element << lineStr; + index ++; + } + + fout_element.close(); +} + +static void tpu_pmu_getlayerInfo(void) +{ + uint32_t index = 0, layIDIndex = 0; + uint32_t curLayID = 0; + uint32_t u32SingleDuration = 0; + + //TPU_LOG_DEBUG("tpu_pmu_getlayerInfo() start\n"); + while (p_element[index].pmuEvent.type) { + if (!curLayID) { + //record current 
layerID
+ curLayID = p_element[index].pmuEvent.type == TPU_PMUTYPE_TIU ?
+ p_element[index].tiuReg.layer_info : p_element[index].tdmaReg.layer_ID;
+
+ p_layer[layIDIndex].last_desType = (TPU_PMUTYPE)p_element[index].pmuEvent.type;
+ p_layer[layIDIndex].layerID = curLayID;
+ p_layer[layIDIndex].endTime = p_element[index].pmuEvent.endTime;
+ p_layer[layIDIndex].startTime = p_element[index].pmuEvent.startTime;
+ p_layer[layIDIndex].last_mapping_desID = p_element[index].pmuEvent.desID;
+ } else {
+ //if the next layer ID is identical
+ if (curLayID == (p_element[index].pmuEvent.type == TPU_PMUTYPE_TIU ?
+ p_element[index].tiuReg.layer_info : p_element[index].tdmaReg.layer_ID)) {
+ p_layer[layIDIndex].endTime = (p_element[index].pmuEvent.endTime > p_layer[layIDIndex].endTime) ?
+ (p_element[index].pmuEvent.endTime) : (p_layer[layIDIndex].endTime);
+
+ p_layer[layIDIndex].last_mapping_desID = p_element[index].pmuEvent.desID;
+
+ } else {
+ layIDIndex ++;
+ curLayID = p_element[index].pmuEvent.type == TPU_PMUTYPE_TIU ?
+ p_element[index].tiuReg.layer_info : p_element[index].tdmaReg.layer_ID;
+
+ p_layer[layIDIndex].last_desType = (TPU_PMUTYPE)p_element[index].pmuEvent.type;
+ p_layer[layIDIndex].layerID = curLayID;
+ p_layer[layIDIndex].endTime = p_element[index].pmuEvent.endTime;
+ p_layer[layIDIndex].startTime = p_element[index].pmuEvent.startTime;
+ p_layer[layIDIndex].last_mapping_desID = p_element[index].pmuEvent.desID;
+ }
+ }
+
+ //get each duration and then classify by type
+ u32SingleDuration = p_element[index].pmuEvent.endTime - p_element[index].pmuEvent.startTime;
+ switch (p_element[index].pmuEvent.type) {
+ case TPU_PMUTYPE_TIU:
+ p_layer[layIDIndex].u32TIUTime += u32SingleDuration;
+ break;
+
+ case TPU_PMUTYPE_TDMALOAD:
+ if (p_element[index].tdmaReg.src_base_reg_sel == 0)
+ p_layer[layIDIndex].u32LoadNueronTime += u32SingleDuration;
+ else if (p_element[index].tdmaReg.src_base_reg_sel == 1)
+ p_layer[layIDIndex].u32LoadWeightTime += u32SingleDuration;
+
+ p_layer[layIDIndex].u32TDMATime += u32SingleDuration;
+ break;
+
+ case TPU_PMUTYPE_TDMASTORE:
+ if (p_element[index].tdmaReg.src_base_reg_sel == 0)
+ p_layer[layIDIndex].u32StoreNueronTime += u32SingleDuration;
+
+ p_layer[layIDIndex].u32TDMATime += u32SingleDuration;
+ break;
+
+ default:
+ break;
+ }
+
+ //accumulate byte counts, one burst count = 16 bytes
+ p_layer[layIDIndex].u32byteCnt += (p_element[index].pmuEvent.eventCnt1 * 16);
+ index ++;
+ }
+}
+
+static void tpu_pmu_fwrite_layer(uint64_t tpu_clock)
+{
+ uint32_t index = 0;
+ char lineStr[FILE_OUT_LINE_LEN] = {0};
+ uint64_t u64totalDuration = 0, u64singleDuration = 0;
+ std::fstream fout_layer;
+
+ sprintf(lineStr, "%s%s", pmubuf_output_file_env, TPUPMU_LAYER_FILENAME);
+ //TPU_LOG_DEBUG("out file_des name=%s\n", lineStr);
+ fout_layer.open(lineStr, std::ios::out | std::ios::trunc);
+
+ //pre-process once so we can get the total duration
+ index = 0;
+ while (p_layer[index].layerID) {
+ u64totalDuration += p_layer[index].endTime - p_layer[index].startTime;
+ index ++;
+ }
+
+ index = 0;
+ while (p_layer[index].layerID) {
+ u64singleDuration = p_layer[index].endTime - p_layer[index].startTime;
+ p_layer[index].parallelism = (double)(p_layer[index].u32TDMATime + p_layer[index].u32TIUTime) / (double)u64singleDuration * 100;
+ p_layer[index].parallelism = p_layer[index].parallelism < 100 ? 
100 : p_layer[index].parallelism; + + p_layer[index].duration_percent = (double)u64singleDuration / (double)u64totalDuration * 100; + p_layer[index].tiu_percent = (double)p_layer[index].u32TIUTime / (double)u64singleDuration * 100; + p_layer[index].loadNeuron_percent = (double)p_layer[index].u32LoadNueronTime / (double)u64singleDuration * 100; + p_layer[index].loadWeight_percent = (double)p_layer[index].u32LoadWeightTime / (double)u64singleDuration * 100; + p_layer[index].storeNeuron_percent = (double)p_layer[index].u32StoreNueronTime / (double)u64singleDuration * 100; + p_layer[index].throughput_MB = (double)p_layer[index].u32byteCnt * tpu_clock / (double)u64singleDuration / 1024 / 1024; + index ++; + } + + strcpy(lineStr, "layerID, start, duration, end, duration(%), parallelism(%), TIU(%), \ + loadNeuron(%), loadWeight(%), storeNeuron(%), throughput(MB/s), last_tdmaID, dumpStart, dumpLen, TIU, loadNeuron, \ + loadWeight, storeNeuron, byteCnt\n"); + + fout_layer << lineStr; + + index = 0; + while (p_layer[index].layerID) { + sprintf(lineStr, "%d, %d, %d, %d, %lf%%, %lf%%, %lf%%, %lf%%, %lf%%, %lf%%, %.2lfMB/s, %d, 0x%x, 0x%x, %d, %d, %d, %d, %d\n", + p_layer[index].layerID, + p_layer[index].startTime, + p_layer[index].endTime - p_layer[index].startTime, + p_layer[index].endTime, + + p_layer[index].duration_percent, + p_layer[index].parallelism, + p_layer[index].tiu_percent, + p_layer[index].loadNeuron_percent, + p_layer[index].loadWeight_percent, + p_layer[index].storeNeuron_percent, + p_layer[index].throughput_MB, + + p_layer[index].last_mapping_desID, + p_layer[index].u32StartAddr, + p_layer[index].u32OutputLen, + p_layer[index].u32TIUTime, + p_layer[index].u32LoadNueronTime, + p_layer[index].u32LoadWeightTime, + p_layer[index].u32StoreNueronTime, + p_layer[index].u32byteCnt); + fout_layer << lineStr; + index ++; + } + + fout_layer.close(); +} + +static int tpu_pmu_time(uint8_t *v_dma_buf, uint64_t p_dma_buf, uint8_t all_info) +{ + dma_hdr_t *header = (dma_hdr_t *)(v_dma_buf); + struct TPU_PMU_DOUBLEEVENT *pCurrent = (struct TPU_PMU_DOUBLEEVENT *)(v_dma_buf + header->pmubuf_offset); + + uint64_t bmnet_p_total = 0; + uint64_t bmnet_p_duration = 0; + + uint64_t u64TDMATotal = 0; + uint64_t u64TIUTotal = 0; + uint64_t u64_des_start = 0, u64_des_end = 0; + uint32_t u32TDMACnt = 0, u32TIUCnt = 0; + uint32_t index = 0, diff = 0, wrap_cnt = 0; + uint32_t tpu_clk_rate = header->tpu_clk_rate; + uint64_t u64_load_bytes = 0, u64_store_bytes = 0; + uint32_t tdma_id_previous = 0, tdma_start_pre= 0, tdma_end_pre = 0; + double percent_tdma = 0, percent_tiu = 0, percent_paralellism = 0; + double ms_tdma = 0, ms_tiu = 0, ms_influence = 0; + double load_mb = 0, store_mb = 0; + double bandwidth = 0; + + //TPU_LOG_DEBUG("TPU_LOG_DEBUG tpu_pmu_time() all_info=%x\n", all_info); + //traverse pmu buffer + while (*(uint32_t *)pCurrent) { + if (pCurrent->type >= TPU_PMUTYPE_TDMALOAD && pCurrent->type <= TPU_PMUTYPE_TIU) { + if (index == 0) { + u64_des_start = pCurrent->startTime; + u64_des_end = pCurrent->endTime; + } else { + u64_des_end = pCurrent->endTime; + } + + if (all_info) + memcpy(&p_element[index].pmuEvent, pCurrent, sizeof(TPU_PMU_DOUBLEEVENT)); + + } else { + TPU_LOG_ERROR("pmubuf content header type incorrect, just next\n"); + index ++; + pCurrent++; + continue; + } + + if (pCurrent->type == TPU_PMUTYPE_TIU) { //tiu case + if (pCurrent->endTime > pCurrent->startTime) { + diff = pCurrent->endTime - pCurrent->startTime; + } else { + diff = 0xFFFFFFFF - pCurrent->startTime + pCurrent->endTime; + 
wrap_cnt ++;
+ }
+
+ u64TIUTotal += diff;
+ u32TIUCnt++;
+ } else { //tdma case
+
+ //g2g generates two descriptors (one load + one store); only accumulate one of them
+ if ((pCurrent->desID != tdma_id_previous) ||
+ (pCurrent->startTime != tdma_start_pre) ||
+ (pCurrent->endTime != tdma_end_pre)) {
+
+ if (pCurrent->endTime > pCurrent->startTime) {
+ diff = pCurrent->endTime - pCurrent->startTime;
+ } else {
+ diff = TPU_WRAP_LIMIT - pCurrent->startTime + pCurrent->endTime;
+ wrap_cnt ++;
+ }
+ u64TDMATotal += diff;
+ u32TDMACnt++;
+ }
+
+ if (pCurrent->type == TPU_PMUTYPE_TDMALOAD) {
+ u64_load_bytes += TPU_BURST_SIZE * pCurrent->eventCnt1;
+ } else if (pCurrent->type == TPU_PMUTYPE_TDMASTORE) {
+ u64_store_bytes += TPU_BURST_SIZE * pCurrent->eventCnt1;
+ }
+
+ tdma_id_previous = pCurrent->desID;
+ tdma_start_pre = pCurrent->startTime;
+ tdma_end_pre = pCurrent->endTime;
+ }
+
+ index ++;
+ pCurrent++;
+ }
+
+ bmnet_p_total = u64TDMATotal + u64TIUTotal;
+ if (wrap_cnt)
+ bmnet_p_duration = TPU_WRAP_LIMIT * (wrap_cnt - 1) + TPU_WRAP_LIMIT - u64_des_start + u64_des_end;
+ else
+ bmnet_p_duration = u64_des_end - u64_des_start;
+
+ percent_tdma = (double)u64TDMATotal / (double)bmnet_p_duration * (double)100;
+ percent_tiu = (double)u64TIUTotal / (double)bmnet_p_duration * (double)100;
+ percent_paralellism = (double)(bmnet_p_total) / (double)bmnet_p_duration * (double)100;
+ percent_paralellism = percent_paralellism < 100 ? 100 : percent_paralellism;
+
+ if (!tpu_clk_rate) {
+ tpu_clk_rate = TPU_CLOCK_DEFAULT;
+ printf("can't get tpu clock, assuming %dMhz\n", tpu_clk_rate / 1000000);
+ }
+
+ ms_tdma = (double)u64TDMATotal / (double)tpu_clk_rate * (double)1000;
+ ms_tiu = (double)u64TIUTotal / (double)tpu_clk_rate * (double)1000;
+ ms_influence = (double)bmnet_p_duration / (double)tpu_clk_rate * (double)1000;
+
+ load_mb = (double)u64_load_bytes / (double)1024 / (double)1024;
+ store_mb = (double)u64_store_bytes / (double)1024 / (double)1024;
+
+ bandwidth = (double)(load_mb + store_mb) / (double)ms_influence * (double)1000;
+
+ printf("=======================inference total info ==========================\n");
+ //printf("cv183x tpu clock: %dMhz\n", header->tpu_clk_rate / 1000000);
+ printf("%-20s %8dMhz, %-20s %9.2fMB, %-20s %7.2fMB/s\n",
+ "cv183x_tpu_clock:", tpu_clk_rate / 1000000, "inference_data:", load_mb + store_mb, "inference_bw:", bandwidth);
+
+ printf("%-20s %10" PRIu64 "t, %-20s %10" PRIu64 "t, %-20s %10" PRIu64 "t\n",
+ "tdma_exe_tick:", u64TDMATotal, "tiu_exe_tick:", u64TIUTotal, "inference_tick:", bmnet_p_duration);
+ printf("%-20s %10.2f%%, %-20s %10.2f%%, %-20s %10.2f%%\n",
+ "tdma_exe_percent:", percent_tdma, "tiu_exe_percent:", percent_tiu, "parallelism_percent:", percent_paralellism);
+ printf("%-20s %9.2fms, %-20s %9.2fms, %-20s %9.2fms\n",
+ "tdma_exe_ms:", ms_tdma, "tiu_exe_ms:", ms_tiu, "inference_ms:", ms_influence);
+
+ if (all_info) {
+ tpu_pmu_fill_cmdbuf(v_dma_buf);
+ tpu_pmu_fwrite_des();
+ tpu_pmu_getlayerInfo();
+ tpu_pmu_fwrite_layer(tpu_clk_rate);
+ }
+
+ return 0;
+}
+
+uint32_t tpu_pmu_get_des_cnt(uint8_t *v_dma_buf)
+{
+ uint32_t tiu_cnt = 0, tdma_cnt = 0;
+ dma_hdr_t *header = (dma_hdr_t *)v_dma_buf;
+ cvi_cpu_desc_t *desc = (cvi_cpu_desc_t *)(v_dma_buf + sizeof(dma_hdr_t));
+
+ for (uint32_t i = 0; i < header->cpu_desc_count; i++, desc++) {
+ tiu_cnt += (desc->num_tiu & 0xFFFF);
+ tdma_cnt += (desc->num_tdma & 0xFFFF);
+ }
+
+ //assume the worst case: each tdma g2g transfer generates two descriptors
+ return (tiu_cnt + tdma_cnt + tdma_cnt);
+}
+
+#define TPU_PMU_MALLOC_PADDING 1024
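+// entry point for PMU post-processing: prints the overall inference summary and,
+// when the TPU_PMUBUF_OUTPUT_FILE environment variable is set, also writes the
+// per-descriptor and per-layer CSV reports using that value as the filename prefix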
+uint32_t tpu_pmu_dump_main(uint8_t *v_dma_buf, uint64_t p_dma_buf)
+{
+ dma_hdr_t *dma_header = (dma_hdr_t *)v_dma_buf;
+ uint8_t all_info = 0;
+
+ //check header first
+ if (dma_header->dmabuf_magic_m != TPU_DMABUF_HEADER_M) {
+ TPU_LOG_NOTICE("pmu buffer header incorrect\n");
+ return CVI_RC_FAILURE;
+ }
+
+ //check if we need to output the pmubuf
+ pmubuf_output_file_env = std::getenv("TPU_PMUBUF_OUTPUT_FILE");
+ if (pmubuf_output_file_env) {
+ all_info = 1;
+ }
+
+ //allocate element arrays; calloc is required because the traversal loops rely on a
+ //zeroed type/layerID terminator and the per-layer statistics are accumulated with +=
+ if (all_info) {
+ p_element = (TPU_DES_ELEMENT *)calloc(tpu_pmu_get_des_cnt(v_dma_buf) * sizeof(TPU_DES_ELEMENT) + TPU_PMU_MALLOC_PADDING, 1);
+ p_layer = (TPU_LAYERID_ELEMENT *)calloc(tpu_pmu_get_des_cnt(v_dma_buf) * sizeof(TPU_LAYERID_ELEMENT) + TPU_PMU_MALLOC_PADDING, 1);
+
+ if (!p_element || !p_layer) {
+ TPU_LOG_INFO("tpu pmu des array alloc failed\n");
+ free(p_element); // free(NULL) is a no-op, so freeing both is safe
+ free(p_layer);
+ p_element = NULL;
+ p_layer = NULL;
+ return CVI_RC_FAILURE;
+ }
+ }
+
+ //get pmu overview data
+ tpu_pmu_time(v_dma_buf, p_dma_buf, all_info);
+
+ //free element arrays
+ if (all_info) {
+ if (p_element) {
+ free(p_element);
+ p_element = NULL;
+ }
+
+ if (p_layer) {
+ free(p_layer);
+ p_layer = NULL;
+ }
+ }
+
+ return CVI_RC_SUCCESS;
+}
+
diff --git a/cviruntime/src/soc/common/bm_types.h b/cviruntime/src/soc/common/bm_types.h
new file mode 100644
index 000000000..d90691a66
--- /dev/null
+++ b/cviruntime/src/soc/common/bm_types.h
@@ -0,0 +1,127 @@
+#pragma once
+#include
+#include
+#include
+#include "cvitpu_debug.h"
+#include
+#include
+#include
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+#define UNUSED(x) (void)(x)
+#define MEMARRAY_MAX_CNT 1000
+
+typedef struct __cvi_mem_pair {
+ uint64_t p_addr;
+ void *mem;
+} cvi_mem_pair;
+
+typedef struct bm_context {
+ bmdev_t dev;
+ u16 seq_no;
+ union {
+ bmk1880v2_context_t *ctx183x;
+ bmk1822_context_t *ctx182x;
+ } cvik_context;
+ cvk_context_t *cvk_context;
+ void *cvik_cmdbuf;
+
+ uint64_t array_base0;
+ uint64_t array_base1;
+ cvi_mem_pair root_mem_array[MEMARRAY_MAX_CNT];
+} bm_context_t;
+
+typedef struct bm_device {
+ int index;
+ int dev_fd;
+ int ion_fd;
+ u32 ion_heap_id;
+ union {
+ bmk1880v2_chip_info_t info183x;
+ bmk1822_chip_info_t info182x;
+ } info;
+ unsigned long long gmem_size;
+ pthread_mutex_t lock;
+#define BMDEV_LOCK_INIT(dev) pthread_mutex_init(&dev->lock, NULL)
+#define BMDEV_LOCK_DEINIT(dev) pthread_mutex_destroy(&dev->lock)
+#define BMDEV_LOCK(dev) pthread_mutex_lock(&dev->lock)
+#define BMDEV_UNLOCK(dev) pthread_mutex_unlock(&dev->lock)
+} bm_device_t;
+
+typedef enum {
+ BMMEM_TYPE_DEVICE = 0,
+ BMMEM_TYPE_DEVICE_NEURON = 1, // obsolete
+ BMMEM_TYPE_DEVICE_COEFF = 2, // obsolete
+ BMMEM_TYPE_HOST = 3,
+ BMMEM_TYPE_SYSTEM = 4, // obsolete
+ BMMEM_TYPE_INVALID = 5
+} bmmem_type_t;
+
+typedef union {
+ struct {
+ bmmem_type_t type : 3;
+ int is_prealloc: 1;
+ unsigned long long reserved : 60;
+ } u;
+ unsigned long long rawflags;
+} bmmem_flags_t;
+
+typedef struct bm_memory {
+ uint8_t *v_addr; // for host, or mapped device in soc mode
+ uint64_t p_addr;
+ size_t size;
+ bmmem_flags_t flags;
+ uint32_t crc32; // for data check if needed
+ int dma_fd;
+ int32_t user_ref_cnt;
+ uint64_t offset;
+} bm_memory_t;
+
+typedef struct __dma_hdr_t {
+ uint16_t dmabuf_magic_m;
+ uint16_t dmabuf_magic_s;
+ uint32_t dmabuf_size;
+ uint32_t cpu_desc_count;
+ uint32_t bd_desc_count; //16bytes
+ uint32_t tdma_desc_count;
+ uint32_t tpu_clk_rate;
+ uint32_t pmubuf_size;
+ uint32_t pmubuf_offset; //32bytes
+ uint32_t arraybase_0_L;
+ uint32_t arraybase_0_H;
+ uint32_t arraybase_1_L;
+ uint32_t 
arraybase_1_H; //48bytes + uint32_t arraybase_2_L; + uint32_t arraybase_2_H; + uint32_t arraybase_3_L; + uint32_t arraybase_3_H; //64bytes + + uint32_t arraybase_4_L; + uint32_t arraybase_4_H; + uint32_t arraybase_5_L; + uint32_t arraybase_5_H; + uint32_t arraybase_6_L; + uint32_t arraybase_6_H; + uint32_t arraybase_7_L; + uint32_t arraybase_7_H; + uint32_t reserve[8]; //128bytes, 128bytes align +} dma_hdr_t; + +// CPU_OP_SYNC structure +typedef struct { + uint32_t op_type; + uint32_t num_tiu; + uint32_t num_tdma; + uint32_t offset_tiu; + uint32_t offset_tdma; + uint32_t offset_tiu_ori_bk; + uint32_t offset_tdma_ori_bk; + char str[CPU_ENGINE_STR_LIMIT_BYTE]; +} __attribute__((packed)) cvi_cpu_desc_t; + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/cviruntime/src/soc/common/cvi_device_mem.cpp b/cviruntime/src/soc/common/cvi_device_mem.cpp new file mode 100644 index 000000000..d875ada08 --- /dev/null +++ b/cviruntime/src/soc/common/cvi_device_mem.cpp @@ -0,0 +1,938 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cvi_device_mem.h" +#include "bmruntime.h" +#include "cvitpu_debug.h" +#include "errno.h" + +bmctx_t CviDeviceMem::root_ctx_array[CTX_MAX_CNT] = {0}; +uint16_t CviDeviceMem::root_submit_array[SUBMIT_QUEUE_MAX] = {0}; +pthread_mutex_t CviDeviceMem::root_daemon_lock = PTHREAD_MUTEX_INITIALIZER; +tee_firewall_info CviDeviceMem::root_tee_firewall_info[TEE_FIREWALL_MAX] = {0}; + +CviDeviceMem::CviDeviceMem() { + if (std::getenv("TPU_ENABLE_PROTECT")) { + printf("TPU_ENABLE_PROTECT, protect=true \n"); + protect = true; + } +} + +CviDeviceMem::~CviDeviceMem() {} +#ifdef ION_CACHE_OPEN +bmerr_t CviDeviceMem::mem_flush_fd(bmdev_t dev, int dma_fd) { + int ret; + ret = ioctl(dev->dev_fd, CVITPU_DMABUF_FLUSH_FD, &dma_fd); + if (ret != 0) { + TPU_LOG_WARNING("memory flush failed, ret=%x\n", ret); + return BM_ERR_FAILURE; + } + return BM_SUCCESS; +} + +bmerr_t CviDeviceMem::mem_invld_fd(bmdev_t dev, int dma_fd) { + int ret; + ret = ioctl(dev->dev_fd, CVITPU_DMABUF_INVLD_FD, &dma_fd); + if (ret != 0) { + TPU_LOG_WARNING("memory invalidate failed, ret=%x\n", ret); + return BM_ERR_FAILURE; + } + return BM_SUCCESS; +} + +#define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask)) +#define ALIGN(x,a) __ALIGN_MASK(x,(__typeof__(x))(a)-1) + +bmerr_t CviDeviceMem::mem_flush_ext(bmdev_t dev, int dma_fd, uint64_t paddr, size_t size) +{ + int ret; + struct bm_cache_op_arg flush_arg; + flush_arg.paddr = paddr + GLOBAL_MEM_START_ADDR; + flush_arg.size = size; + flush_arg.dma_fd = dma_fd; + + uint64_t addr_new = ALIGN(flush_arg.paddr, 0x40); + flush_arg.size = ALIGN(flush_arg.size, 0x40); + + if (addr_new != flush_arg.paddr) { + //TPU_LOG_WARNING("fix flush add_p=0x%lx, len=0x%llx, add_p_ori=0x%llx\n", addr_new - 0x40, flush_arg.size, flush_arg.paddr); + flush_arg.size += 0x40; + flush_arg.paddr = addr_new - 0x40; + } else { + //TPU_LOG_WARNING("ok flush add_p=0x%llx, len=0x%llx\n", flush_arg.paddr, flush_arg.size); + } + + // In some special cases, third-party libraries may cause fd errors + // so if the ioctl fails, then reopen the device + for (int i = 0; i < 3; ++i) { + ret = ioctl(dev->dev_fd, CVITPU_DMABUF_FLUSH, &flush_arg); + if (ret != 0) { + perror("flush ioctl fail:"); + TPU_LOG_WARNING("memory flush failed, ret=%x\n", ret); + reopen_dev(dev, 1); + } else { + break; + } + } + return ret == 0 ? 
BM_SUCCESS : BM_ERR_FAILURE; +} + +bmerr_t CviDeviceMem::mem_invld_ext(bmdev_t dev, int dma_fd, uint64_t paddr, size_t size) +{ + int ret; + struct bm_cache_op_arg invalidate_arg; + invalidate_arg.paddr = paddr + GLOBAL_MEM_START_ADDR; + invalidate_arg.size = size; + invalidate_arg.dma_fd = dma_fd; + + uint64_t addr_new = ALIGN(invalidate_arg.paddr, 0x40); + invalidate_arg.size = ALIGN(invalidate_arg.size, 0x40); + + if (addr_new != invalidate_arg.paddr) { + //TPU_LOG_WARNING("fix invalid add_p=0x%lx, len=0x%llx, add_p_ori=0x%llx\n", addr_new - 0x40, invalidate_arg.size, invalidate_arg.paddr); + invalidate_arg.size += 0x40; + invalidate_arg.paddr = addr_new - 0x40; + } else { + //TPU_LOG_WARNING("ok invalid add_p=0x%llx, len=0x%llx\n", invalidate_arg.paddr, invalidate_arg.size); + } + + // In some special cases, third-party libraries may cause fd errors + // so if the ioctl fails, then reopen the device + for (int i = 0; i < 3; ++i) { + ret = ioctl(dev->dev_fd, CVITPU_DMABUF_INVLD, &invalidate_arg); + if (ret != 0) { + perror("invld ioctl fail:"); + TPU_LOG_WARNING("memory invalidate failed, ret=%x\n", ret); + reopen_dev(dev, 1); + } else { + break; + } + } + return ret == 0 ? BM_SUCCESS : BM_ERR_FAILURE; +} + +#else + +bmerr_t CviDeviceMem::mem_flush_ext(bmdev_t dev, int dma_fd, uint64_t paddr, size_t size) +{ + return BM_SUCCESS; +} + +bmerr_t CviDeviceMem::mem_invld_ext(bmdev_t dev, int dma_fd, uint64_t paddr, size_t size) +{ + return BM_SUCCESS; +} + +#endif + +bmerr_t CviDeviceMem::submit_dmabuf(bmdev_t dev, int dma_fd, uint32_t seq_no) +{ + struct bm_submit_dma_arg submit_dma_arg; + submit_dma_arg.fd = dma_fd; + submit_dma_arg.seq_no = seq_no; + int ret = ioctl(dev->dev_fd, CVITPU_SUBMIT_DMABUF, &submit_dma_arg); + if (ret != 0) { + perror("submit ioctl fail:"); + TPU_LOG_WARNING("submit dmabuf failed err[%d]\n", ret); + reopen_dev(dev, 1); + } + return ret ? 
BM_ERR_FAILURE : BM_SUCCESS;
+}
+
+bmerr_t CviDeviceMem::wait_dmabuf(bmdev_t dev, uint32_t seq_no)
+{
+  struct bm_wait_dma_arg wait_dma_arg;
+  wait_dma_arg.seq_no = seq_no;
+  int ret = 0, loop_cnt = 0;
+
+  do {
+    if (loop_cnt > 10)
+      TPU_LOG_WARNING("bm_device_wait_dmabuf() triggered loop=%d\n", loop_cnt);
+
+    ret = ioctl(dev->dev_fd, CVITPU_WAIT_DMABUF, &wait_dma_arg);
+    loop_cnt++;
+  } while (ret);
+
+  if (wait_dma_arg.ret != 0) {
+    TPU_LOG_WARNING("wait dmabuf failed[%d]\n", wait_dma_arg.ret);
+    return BM_ERR_FAILURE;
+  }
+  return BM_SUCCESS;
+}
+
+// In some special cases, third-party libraries may cause fd errors,
+// so if the ioctl fails, reopen the device
+bmerr_t CviDeviceMem::reopen_dev(bmdev_t dev, int flag) {
+  if (flag == 1) {
+    // reopen tpu
+    const char *tpu_dev_name_default = TPU_DEV_NAME;
+    const char *tpu_dev_name_env = std::getenv("TPU_DEV");
+    const char *tpu_dev_name = tpu_dev_name_default;
+    if (tpu_dev_name_env) {
+      tpu_dev_name = tpu_dev_name_env;
+    }
+    printf("reopen tpu dev\n");
+    TPU_LOG_WARNING("reopen tpu dev");
+    int dev_fd = open(tpu_dev_name, O_RDWR);
+    if (dev_fd <= 0) {
+      TPU_LOG_WARNING("open tpu dev failed\n");
+      return BM_ERR_FAILURE;
+    } else {
+      close(dev->dev_fd);
+      dev->dev_fd = dev_fd;
+    }
+    printf("reopen tpu dev success\n");
+    TPU_LOG_WARNING("reopen tpu dev success");
+  } else if (flag == 2) {
+    // reopen ion
+    printf("reopen ion dev\n");
+    TPU_LOG_WARNING("reopen ion dev");
+    int ion_fd = open(ION_DEV_NAME, O_RDWR | O_DSYNC);
+    if (ion_fd <= 0) {
+      TPU_LOG_WARNING("open ion dev failed\n");
+      return BM_ERR_FAILURE;
+    } else {
+      printf("reopen ion dev success\n");
+      TPU_LOG_WARNING("reopen ion dev success");
+      close(dev->ion_fd);
+      dev->ion_fd = ion_fd;
+      /*
+      if (0 != ion_query_heap(dev)) {
+        TPU_LOG_WARNING("ion_query_heap failed\n");
+      }
+      */
+    }
+  } else {
+    TPU_LOG_WARNING("Input param error in reopen_dev! 
flag:%d\n", flag); + return BM_ERR_INVALID_ARGUMENT; + } + return BM_SUCCESS; +} + +bmerr_t CviDeviceMem::ion_ioctl(int fd, unsigned int heap_id_mask, size_t* size, uint64_t *paddr, int *dma_fd) { + if (!ion_legacy) { + struct ion_allocation_data alloc_data; + int ret; + + /* alloc buffer */ + memset(&alloc_data, 0, sizeof(struct ion_allocation_data)); + alloc_data.len = *size; + alloc_data.heap_id_mask = heap_id_mask; +#ifdef ION_CACHE_OPEN + alloc_data.flags = ION_FLAG_CACHED; +#else + alloc_data.flags = 0; // ION_FLAG_NONCACHED; +#endif + strncpy(alloc_data.name, "tpu", MAX_ION_BUFFER_NAME); + + // In some special cases, third-party libraries may cause fd errors + // so if the ioctl fails, then reopen the device + ret = ioctl(fd, ION_IOC_ALLOC, &alloc_data); + if (0 == ret) { + *paddr = alloc_data.paddr; + *dma_fd = alloc_data.fd; + *size = alloc_data.len; + return BM_SUCCESS; + } else if (errno == EINVAL) { + ion_legacy = true; + TPU_LOG_WARNING("use ion legacy!"); + } else { + perror("ion ioctl fail:"); + TPU_LOG_WARNING("ion alloc failed, size = %zu, ret=%x\n", *size, ret); + return BM_ERR_FAILURE; + } + } + + struct ion_allocation_data_legacy alloc_data; + int ret; + + /* alloc buffer */ + memset(&alloc_data, 0, sizeof(struct ion_allocation_data_legacy)); + alloc_data.len = *size; + alloc_data.heap_id_mask = heap_id_mask; +#ifdef ION_CACHE_OPEN + alloc_data.flags = ION_FLAG_CACHED; +#else + alloc_data.flags = 0; // ION_FLAG_NONCACHED; +#endif + + // In some special cases, third-party libraries may cause fd errors + // so if the ioctl fails, then reopen the device + ret = ioctl(fd, ION_IOC_ALLOC_LEGACY, &alloc_data); + if (0 == ret) { + *paddr = alloc_data.paddr; + *dma_fd = alloc_data.fd; + *size = alloc_data.len; + return BM_SUCCESS; + } else { + perror("ion ioctl fail:"); + TPU_LOG_WARNING("ion alloc failed, size = %zu, ret=%x\n", *size, ret); + return BM_ERR_FAILURE; + } +} + +bmerr_t CviDeviceMem::mem_alloc(bmdev_t dev, size_t size, uint64_t *paddr, + uint8_t **vaddr, int *dma_fd) +{ + void *user_addr = NULL; + int ret; + unsigned int heap_id_mask = (1 << dev->ion_heap_id); + int err_flag = 0; + + // In some special cases, third-party libraries may cause fd errors + // so if the ioctl fails, then reopen the device + for (int i = 0; i < 3; ++i) { + do { + ret = ion_ioctl(dev->ion_fd, heap_id_mask, &size, paddr, dma_fd); + if (ret != BM_SUCCESS) { + err_flag = 2; + break; + } + + /* mmap to user */ + user_addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, + *dma_fd, 0); + if (user_addr == MAP_FAILED) { + perror("mmap fail:"); + TPU_LOG_WARNING("ion map failed phy_addr[%" PRIx64 "] length[%zu] fd[%u]\n", *paddr, size, *dma_fd); + close(*dma_fd); + err_flag = 0; + ret = -1; + break; + } + + *paddr -= GLOBAL_MEM_START_ADDR; + *vaddr = (uint8_t *)user_addr; + ret = mem_invld_ext(dev, *dma_fd, *paddr, size); + if (ret != BM_SUCCESS) { + munmap(user_addr, size); + close(*dma_fd); + TPU_LOG_WARNING("memory invalidate failed, ret=%x\n", ret); + err_flag = 1; + break; + } + } while (0); + if (ret == BM_SUCCESS) { + break; + } else { + if (err_flag > 0) { + reopen_dev(dev, err_flag); + } + } + } + + return ret; +} + +bmerr_t CviDeviceMem::mem_free(uint8_t *vaddr, size_t size, int dma_fd) +{ + munmap(vaddr, size); + close(dma_fd); + return BM_SUCCESS; +} + +bmerr_t CviDeviceMem::ion_query_heap(bmdev_t dev) +{ + unsigned int heap_id; + struct ion_heap_query query; + struct ion_heap_data heap_data[MAX_HEAP_COUNT]; + int ret; + + memset(&query, 0, sizeof(query)); + query.cnt = 
MAX_HEAP_COUNT;
+  query.heaps = (unsigned long int)&heap_data[0];
+  ret = ioctl(dev->ion_fd, ION_IOC_HEAP_QUERY, &query);
+  if (ret != 0) {
+    TPU_LOG_WARNING("ion heap query failed, ret=%x\n", ret);
+    return BM_ERR_FAILURE;
+  }
+
+  heap_id = MAX_HEAP_COUNT + 1;
+  for (unsigned int i = 0; i < query.cnt; i++) {
+    if (heap_data[i].type == ION_HEAP_TYPE_CARVEOUT) {
+      heap_id = heap_data[i].heap_id;
+      break;
+    }
+  }
+
+  if (heap_id > MAX_HEAP_COUNT) {
+    TPU_LOG_WARNING("no carveout heap found\n");
+    return BM_ERR_FAILURE;
+  }
+
+  dev->ion_heap_id = heap_id;
+  return BM_SUCCESS;
+}
+
+bmerr_t CviDeviceMem::load_tee(bmctx_t ctx, uint64_t cmdbuf_addr_ree, uint32_t cmdbuf_len_ree,
+                               uint64_t weight_addr_ree, uint32_t weight_len_ree,
+                               uint64_t neuron_addr_ree)
+{
+  struct cvi_load_tee_arg ioctl_arg;
+
+  // restore the physical address for the TEE's 1:1 memory mapping
+  ioctl_arg.cmdbuf_addr_ree = cmdbuf_addr_ree + GLOBAL_MEM_START_ADDR;
+  ioctl_arg.cmdbuf_len_ree = cmdbuf_len_ree;
+  ioctl_arg.weight_addr_ree = weight_addr_ree + GLOBAL_MEM_START_ADDR;
+  ioctl_arg.weight_len_ree = weight_len_ree;
+  ioctl_arg.neuron_addr_ree = neuron_addr_ree + GLOBAL_MEM_START_ADDR;
+
+  int ret = ioctl(ctx->dev->dev_fd, CVITPU_LOAD_TEE, &ioctl_arg);
+  TPU_ASSERT(ret == 0, "cvi_device_load_tee() failed");
+  return BM_SUCCESS;
+}
+
+bmerr_t CviDeviceMem::run_tee(bmctx_t ctx, uint32_t seq_no, uint64_t dmabuf_addr,
+                              uint64_t array_base2, uint64_t array_base3,
+                              uint64_t array_base4, uint64_t array_base5,
+                              uint64_t array_base6, uint64_t array_base7)
+{
+  struct cvi_submit_tee_arg ioctl_arg;
+
+  ioctl_arg.dmabuf_tee_addr = dmabuf_addr;
+  ioctl_arg.gaddr_base2 = array_base2;
+  ioctl_arg.gaddr_base3 = array_base3;
+  ioctl_arg.gaddr_base4 = array_base4;
+  ioctl_arg.gaddr_base5 = array_base5;
+  ioctl_arg.gaddr_base6 = array_base6;
+  ioctl_arg.gaddr_base7 = array_base7;
+  ioctl_arg.seq_no = seq_no;
+
+  int ret = ioctl(ctx->dev->dev_fd, CVITPU_SUBMIT_TEE, &ioctl_arg);
+  TPU_ASSERT(ret == 0, "cvi_device_run_tee() failed");
+
+  return BM_SUCCESS;
+}
+
+
+void CviDeviceMem::bmmem_dump_mem_array(void) {
+  int total = 0;
+  cvi_mem_pair *root_pair = NULL;
+
+  for (int j = 0; j < CTX_MAX_CNT; j++) {
+
+    if (root_ctx_array[j]) {
+      root_pair = root_ctx_array[j]->root_mem_array;
+
+      for (int i = 0; i < MEMARRAY_MAX_CNT; i++) {
+        if (root_pair[i].p_addr) {
+          TPU_LOG_DEBUG("%" PRIx64 ", index=%x\n", root_pair[i].p_addr, i);
+          total++;
+        }
+      }
+    }
+  }
+
+  TPU_LOG_DEBUG("bmmem_dump_mem_array() cnt=%x\n", total);
+}
+
+bmmem_device_t CviDeviceMem::mem_alloc_raw(bmctx_t ctx, size_t size) {
+  int ret = 0, i = 0;
+  char array_got = 0;
+
+  bm_memory_t *device_mem = new bm_memory_t();
+  device_mem->flags.u.is_prealloc = 0;
+  device_mem->flags.u.type = BMMEM_TYPE_DEVICE;
+  device_mem->size = size;
+  device_mem->user_ref_cnt = 0;
+  ROOTDAEMON_LOCK();
+  ret = mem_alloc(ctx->dev, size, &device_mem->p_addr, &device_mem->v_addr,
+                  &device_mem->dma_fd);
+  ROOTDAEMON_UNLOCK();
+
+  if (ret != BM_SUCCESS) {
+    delete device_mem;
+    TPU_ASSERT(0, "alloc ion failed");
+    return NULL;
+  }
+
+  ROOTDAEMON_LOCK();
+  // alloc only; prealloc'd sub-buffers are not tracked here
+  for (i = 0; i < MEMARRAY_MAX_CNT; i++) {
+    if (ctx->root_mem_array[i].p_addr == 0) {
+      ctx->root_mem_array[i].p_addr = device_mem->p_addr;
+      ctx->root_mem_array[i].mem = (void *)device_mem;
+      array_got = 1;
+      break;
+    }
+  }
+  ROOTDAEMON_UNLOCK();
+
+  if (!array_got)
+    TPU_LOG_WARNING("bmmem_device_alloc_raw() alloc over %d\n", MEMARRAY_MAX_CNT);
+
+  BMEMEM_DUMP();
+  return (bmmem_device_t)device_mem;
+}
+
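For orientation, here is how the pieces above compose from the host side. This is a minimal sketch rather than code from the patch: `mem_impl` stands for any concrete CviDeviceMem subclass instance and `ctx` for a context obtained from device_init(); both names are hypothetical and error handling is omitted.

    // allocate one page of ION-backed device memory
    bmmem_device_t buf = mem_impl->mem_alloc_raw(ctx, 4096);

    // host -> device: memcpy into the cached mapping, then flush so the
    // TPU observes the new bytes (mem_memcpy_s2d flushes internally)
    uint8_t pattern[4096];
    memset(pattern, 0xA5, sizeof(pattern));
    mem_impl->mem_memcpy_s2d(ctx, buf, pattern);

    // ... submit a cmdbuf that consumes buf ...

    // device -> host: invalidate first so stale cache lines are dropped,
    // then copy back (mem_memcpy_d2s invalidates internally)
    uint8_t readback[4096];
    mem_impl->mem_memcpy_d2s(ctx, readback, buf);

    mem_impl->mem_free_raw(ctx, buf);

The flush/invalidate pairing only has an effect when ION_CACHE_OPEN is defined; in the non-cached build, mem_flush_ext and mem_invld_ext are the no-op stubs shown earlier.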
+bmmem_device_t CviDeviceMem::mem_alloc_pagesize(bmctx_t ctx, size_t size) {
+  int ret = 0, i = 0;
+  char array_got = 0;
+  int pagesize = getpagesize();
+  size_t align_size = align_up(size, pagesize);
+  bm_memory_t *device_mem = new bm_memory_t();
+  device_mem->flags.u.is_prealloc = 0;
+  device_mem->flags.u.type = BMMEM_TYPE_DEVICE;
+  device_mem->user_ref_cnt = 0;
+  device_mem->size = align_size;
+  ROOTDAEMON_LOCK();
+  ret = mem_alloc(ctx->dev, align_size, &device_mem->p_addr, &device_mem->v_addr,
+                  &device_mem->dma_fd);
+  ROOTDAEMON_UNLOCK();
+
+  if (ret != BM_SUCCESS) {
+    delete device_mem;
+    TPU_ASSERT(0, "alloc ion failed");
+    return NULL;
+  }
+
+  ROOTDAEMON_LOCK();
+  // alloc only; prealloc'd sub-buffers are not tracked here
+  for (i = 0; i < MEMARRAY_MAX_CNT; i++) {
+    if (ctx->root_mem_array[i].p_addr == 0) {
+      ctx->root_mem_array[i].p_addr = device_mem->p_addr;
+      ctx->root_mem_array[i].mem = (void *)device_mem;
+      array_got = 1;
+      break;
+    }
+  }
+  ROOTDAEMON_UNLOCK();
+
+  if (!array_got)
+    TPU_LOG_WARNING("mem_alloc_pagesize() alloc over %d\n", MEMARRAY_MAX_CNT);
+
+  BMEMEM_DUMP();
+  return (bmmem_device_t)device_mem;
+}
+
+bmmem_device_t CviDeviceMem::mem_prealloc_raw(bmctx_t ctx, bmmem_device_t mem, uint64_t offset,
+                                              size_t size) {
+  (void)ctx;
+  TPU_ASSERT(mem != nullptr, nullptr);
+  TPU_ASSERT(mem->size >= size + offset, nullptr);
+  bm_memory_t *device_mem = new bm_memory_t();
+  device_mem->flags.u.is_prealloc = 1;
+  device_mem->flags.u.type = BMMEM_TYPE_DEVICE;
+  device_mem->p_addr = ((bm_memory_t *)mem)->p_addr + offset;
+  device_mem->v_addr = ((bm_memory_t *)mem)->v_addr + offset;
+  device_mem->dma_fd = ((bm_memory_t *)mem)->dma_fd;
+  device_mem->offset = offset;
+  device_mem->size = size;
+  device_mem->user_ref_cnt = 0;
+
+  BMEMEM_DUMP();
+  return (bmmem_device_t)device_mem;
+}
+
+
+void CviDeviceMem::mem_free_ex(uint64_t p_addr) {
+  bm_memory_t *device_mem = NULL;
+  cvi_mem_pair *root_pair = NULL;
+
+  ROOTDAEMON_LOCK();
+  for (int j = 0; j < CTX_MAX_CNT; j++) {
+
+    if (root_ctx_array[j]) {
+      root_pair = root_ctx_array[j]->root_mem_array;
+
+      for (int i = 0; i < MEMARRAY_MAX_CNT; i++) {
+        if (root_pair[i].p_addr == p_addr) {
+          device_mem = (bm_memory_t *)(root_pair[i].mem);
+          mem_free(device_mem->v_addr, device_mem->size, device_mem->dma_fd);
+
+          root_pair[i].p_addr = 0;
+          root_pair[i].mem = NULL;
+          break;
+        }
+      }
+    }
+  }
+  ROOTDAEMON_UNLOCK();
+
+  BMEMEM_DUMP();
+  delete device_mem;
+}
+
+size_t CviDeviceMem::mem_size(bmmem_device_t mem) {
+  bm_memory_t *device_mem = (bm_memory_t *)mem;
+  if (device_mem == NULL)
+    return 0;
+  TPU_ASSERT(device_mem->flags.u.type == BMMEM_TYPE_DEVICE, NULL);
+  return device_mem->size;
+}
+
+uint64_t CviDeviceMem::mem_p_addr(bmmem_device_t mem) {
+  bm_memory_t *device_mem = (bm_memory_t *)mem;
+  if (device_mem == NULL)
+    return 0;
+  TPU_ASSERT(device_mem->flags.u.type == BMMEM_TYPE_DEVICE, NULL);
+  // tl_gdma will add a GLOBAL_MEM_START_ADDR offset to ga
+  return device_mem->p_addr;
+}
+
+uint8_t *CviDeviceMem::mem_v_addr(bmmem_device_t mem) {
+  bm_memory_t *device_mem = (bm_memory_t *)mem;
+  TPU_ASSERT(device_mem->flags.u.type == BMMEM_TYPE_DEVICE, NULL);
+  return device_mem->v_addr;
+}
+
+int32_t CviDeviceMem::mem_inc_ref(bmmem_device_t mem) {
+  bm_memory_t *device_mem = (bm_memory_t *)mem;
+  TPU_ASSERT(device_mem->flags.u.type == BMMEM_TYPE_DEVICE, NULL);
+  return (++device_mem->user_ref_cnt);
+}
+
+int32_t CviDeviceMem::mem_dec_ref(bmmem_device_t mem) {
+  bm_memory_t *device_mem = (bm_memory_t *)mem;
+  TPU_ASSERT(device_mem->flags.u.type ==
BMMEM_TYPE_DEVICE, NULL); + return (--device_mem->user_ref_cnt); +} + +bmerr_t CviDeviceMem::mem_memcpy_s2d(bmctx_t ctx, bmmem_device_t dst, uint8_t *src) { + bm_memory_t *device_mem = (bm_memory_t *)dst; + memcpy(device_mem->v_addr, src, device_mem->size); + TPU_ASSERT((int)mem_flush_ext(ctx->dev, device_mem->dma_fd, + device_mem->p_addr, device_mem->size) == BM_SUCCESS, NULL); + return BM_SUCCESS; +} + +bmerr_t CviDeviceMem::mem_memcpy_s2d_ex(bmctx_t ctx, bmmem_device_t dst, uint8_t *src, uint64_t offset, + size_t size) { + bm_memory_t *device_mem = (bm_memory_t *)dst; + memcpy(device_mem->v_addr + offset, src, size); + TPU_ASSERT(mem_flush_ext(ctx->dev, device_mem->dma_fd, + device_mem->p_addr + offset, size) == BM_SUCCESS, NULL); + return BM_SUCCESS; +} + +bmerr_t CviDeviceMem::mem_memcpy_d2s(bmctx_t ctx, uint8_t *dst, bmmem_device_t src) { + bm_memory_t *device_mem = (bm_memory_t *)src; + TPU_ASSERT(mem_invld_ext(ctx->dev, device_mem->dma_fd, + device_mem->p_addr, device_mem->size) == BM_SUCCESS, NULL); + memcpy(dst, device_mem->v_addr, device_mem->size); + return BM_SUCCESS; +} + +bmerr_t CviDeviceMem::mem_memcpy_d2s_ex(bmctx_t ctx, uint8_t *dst, bmmem_device_t src, uint64_t offset, + size_t size) { + bm_memory_t *device_mem = (bm_memory_t *)src; + TPU_ASSERT(mem_invld_ext(ctx->dev, device_mem->dma_fd, + device_mem->p_addr + offset, size) == BM_SUCCESS, NULL); + memcpy(dst, (device_mem->v_addr + offset), size); + return BM_SUCCESS; +} + +bmerr_t CviDeviceMem::mem_device_flush(bmctx_t ctx, bmmem_device_t mem) { + return mem_flush_ext(ctx->dev, mem->dma_fd, mem->p_addr, mem->size); +} + +bmerr_t CviDeviceMem::mem_device_flush_len(bmctx_t ctx, bmmem_device_t mem, size_t len) { + return mem_flush_ext(ctx->dev, mem->dma_fd, mem->p_addr, len); +} + +bmerr_t CviDeviceMem::mem_device_invld(bmctx_t ctx, bmmem_device_t mem) { + return mem_invld_ext(ctx->dev, mem->dma_fd, mem->p_addr, mem->size); +} + +bmerr_t CviDeviceMem::mem_device_invld_len(bmctx_t ctx, bmmem_device_t mem, size_t len) { + return mem_invld_ext(ctx->dev, mem->dma_fd, mem->p_addr, len); +} + +bmerr_t CviDeviceMem::context_create(bmctx_t *ctx) { + char array_got = 0; + bm_context_t *pctx = new bm_context_t; + + // memset context + memset(pctx, 0, sizeof(bm_context_t)); + pctx->dev = NULL; + pctx->seq_no = 0; + *ctx = pctx; + + ROOTDAEMON_LOCK(); + //assign into root + for (int i = 0; i < CTX_MAX_CNT; i ++) { + if (!root_ctx_array[i]) { + root_ctx_array[i] = pctx; + array_got = 1; + break; + } + } + ROOTDAEMON_UNLOCK(); + + if (!array_got) + TPU_LOG_WARNING("bm_context_create() over %d\n", CTX_MAX_CNT); + + return BM_SUCCESS; +} + +void CviDeviceMem::context_destroy(bmctx_t ctx) { + TPU_ASSERT(ctx != nullptr,nullptr); + + ROOTDAEMON_LOCK(); + //remove from root + for (int i = 0; i < CTX_MAX_CNT; i ++) { + if (root_ctx_array[i] == ctx) { + root_ctx_array[i] = NULL; + break; + } + } + ROOTDAEMON_UNLOCK(); + + delete ctx; +} + +bmerr_t CviDeviceMem::bind_device(bmctx_t ctx, bmdev_t dev) { + TPU_ASSERT(ctx != nullptr, nullptr); + ctx->dev = dev; + return BM_SUCCESS; +} + +void CviDeviceMem::unbind_device(bmctx_t ctx) { + TPU_ASSERT(ctx != nullptr, nullptr); + ctx->dev = NULL; +} + +bmdev_t CviDeviceMem::get_device(bmctx_t ctx) { + TPU_ASSERT(ctx->dev != nullptr, NULL); + return ctx->dev; +} + + +void CviDeviceMem::device_exit(bmctx_t ctx) { + bmdev_t dev = ctx->dev; + unbind_device(ctx); + context_destroy(ctx); + device_close(dev); +} + +bmerr_t CviDeviceMem::device_init(int index, bmctx_t *ctx) { + bmerr_t ret; + TPU_ASSERT(index 
== 0, NULL);
+
+  bmdev_t dev;
+  ret = device_open(index, &dev);
+  TPU_ASSERT(ret == BM_SUCCESS, NULL);
+
+  ret = context_create(ctx);
+  TPU_ASSERT(ret == BM_SUCCESS, NULL);
+
+  ret = bind_device(*ctx, dev);
+  TPU_ASSERT(ret == BM_SUCCESS, NULL);
+  return BM_SUCCESS;
+}
+
+
+bmerr_t CviDeviceMem::send_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, uint16_t *seq_no) {
+  int ret;
+  bmmem_device_t cmdbuf_mem;
+
+  ret = load_cmdbuf(ctx, cmdbuf, sz, 0, 0, false, &cmdbuf_mem);
+  if (ret != BM_SUCCESS) {
+    TPU_LOG_WARNING("load cmdbuf error\n");
+    return BM_ERR_FAILURE;
+  }
+
+  ret = run_cmdbuf(ctx, cmdbuf_mem, seq_no);
+  if (ret == BM_SUCCESS) {
+    ret = wait_cmdbuf_done(ctx, *seq_no);
+  }
+  mem_free_raw(ctx, cmdbuf_mem);
+
+  return ret;
+}
+
+bmerr_t CviDeviceMem::wait_cmdbuf_done(bmctx_t ctx, uint16_t seq_no) {
+  return wait_dmabuf(ctx->dev, seq_no);
+}
+
+bmerr_t CviDeviceMem::wait_cmdbuf_all(bmctx_t ctx) {
+  int i, ret;
+
+  for (i = 0; i < SUBMIT_QUEUE_MAX; i++) {
+    if (root_submit_array[i]) {
+      ret = wait_dmabuf(ctx->dev, root_submit_array[i]);
+      TPU_ASSERT(ret == BM_SUCCESS, NULL);
+      root_submit_array[i] = 0;
+    }
+  }
+  return BM_SUCCESS;
+}
+
+bmerr_t CviDeviceMem::run_cmdbuf_pio(bmctx_t ctx, uint8_t *cmdbuf, size_t sz) {
+  (void)ctx;
+  (void)cmdbuf;
+  (void)sz;
+  TPU_ASSERT(0, NULL); // not supported
+  return BM_SUCCESS;
+}
+
+void CviDeviceMem::set_base_reg(bmctx_t ctx, uint32_t inx, uint64_t addr) {
+  // currently we set base_select0 = neuron
+  //                base_select1 = weight
+  // WARNING: this API is not thread-safe; it is only used in verification.
+  if (inx == 0) {
+    ctx->array_base0 = addr;
+  } else if (inx == 1) {
+    ctx->array_base1 = addr;
+  } else {
+    TPU_ASSERT(0, NULL); // not supported
+  }
+  return;
+}
+
+uint64_t CviDeviceMem::read_base_reg(bmctx_t ctx, u32 inx) {
+  // currently we set base_select0 = neuron
+  //                base_select1 = weight
+  if (inx == 0) {
+    return ctx->array_base0;
+  } else if (inx == 1) {
+    return ctx->array_base1;
+  } else {
+    TPU_ASSERT(0, NULL); // not supported
+  }
+  return 0;
+}
+
+bmerr_t CviDeviceMem::run_cmdbuf_tee(bmctx_t ctx, uint16_t *seq_no, uint64_t dmabuf_addr, cvi_array_base *array_base)
+{
+  // seq_no must be protected
+  BMDEV_LOCK(ctx->dev);
+
+  dmabuf_addr += GLOBAL_MEM_START_ADDR;
+  bmerr_t ret = run_tee(ctx, ctx->seq_no, dmabuf_addr,
+                        array_base->gaddr_base2, array_base->gaddr_base3,
+                        array_base->gaddr_base4, array_base->gaddr_base5,
+                        array_base->gaddr_base6, array_base->gaddr_base7);
+
+  *seq_no = ctx->seq_no++;
+  BMDEV_UNLOCK(ctx->dev);
+  return ret;
+}
+
+bmerr_t CviDeviceMem::run_cmdbuf(bmctx_t ctx, bmmem_device_t cmdbuf_mem, uint16_t *seq_no) {
+  // seq_no must be protected
+  BMDEV_LOCK(ctx->dev);
+
+  bm_memory_t *device_mem = (bm_memory_t *)cmdbuf_mem;
+  bmerr_t ret = submit_dmabuf(ctx->dev, device_mem->dma_fd, ctx->seq_no);
+
+  if (ret != BM_SUCCESS) {
+    // ret = bmmem_device_crc32_check(ctx, cmdbuf_mem);
+    // TPU_LOG_WARNING("run dmabuf failed, crc32 check: %d\n", ret);
+  }
+
+  *seq_no = ctx->seq_no++;
+  BMDEV_UNLOCK(ctx->dev);
+
+  return ret ? BM_ERR_FAILURE : BM_SUCCESS;
+}
+
+bmerr_t CviDeviceMem::run_cmdbuf_ex(bmctx_t ctx, bmmem_device_t cmdbuf_mem, uint16_t *seq_no,
+                                    uint64_t input_base_addr, uint64_t output_base_addr) {
+  // seq_no must be protected
+  BMDEV_LOCK(ctx->dev);
+  if (protect) {
+    mem_unprotect(cmdbuf_mem->v_addr, cmdbuf_mem->size);
+  }
+  bm_memory_t *device_mem = (bm_memory_t *)cmdbuf_mem;
+
+  // assign input/output base selection
+  dma_hdr_t *header = (dma_hdr_t *)(device_mem->v_addr);
+  if (header->dmabuf_magic_m != tpu_dmabuf_header_m) {
+    TPU_LOG_ERROR("run_cmdbuf_ex: cmdbuf magic check failed!\n");
+    BMDEV_UNLOCK(ctx->dev);
+    return BM_ERR_FAILURE;
+  }
+
+  // chip defines arraybase_0 activation/neuron
+  //              arraybase_1 weight
+  //              arraybase_2 input
+  //              arraybase_3 output
+  header->arraybase_2_L = (uint32_t)input_base_addr;
+  header->arraybase_2_H = 0;
+  header->arraybase_3_L = (uint32_t)output_base_addr;
+  header->arraybase_3_H = 0;
+  if (protect) {
+    mem_protect(cmdbuf_mem->v_addr, cmdbuf_mem->size);
+  }
+  // no flush needed, because a cmdbuf submitted to the kernel goes through
+  // a remap mechanism
+  //bm_device_mem_flush_ext(ctx->dev, device_mem->dma_fd, device_mem->p_addr, device_mem->size);
+
+  // submit
+  bmerr_t ret = submit_dmabuf(ctx->dev, device_mem->dma_fd, ctx->seq_no);
+  *seq_no = ctx->seq_no++;
+  BMDEV_UNLOCK(ctx->dev);
+
+  return ret ? BM_ERR_FAILURE : BM_SUCCESS;
+}
+
+bmerr_t CviDeviceMem::run_cmdbuf_ex2(bmctx_t ctx, bmmem_device_t cmdbuf_mem, uint16_t *seq_no,
+                                     cvi_array_base *p_array_base) {
+  // seq_no must be protected
+  BMDEV_LOCK(ctx->dev);
+  if (protect) {
+    mem_unprotect(cmdbuf_mem->v_addr, cmdbuf_mem->size);
+  }
+  bm_memory_t *device_mem = (bm_memory_t *)cmdbuf_mem;
+
+  // assign all eight array base selections from the caller
+  dma_hdr_t *header = (dma_hdr_t *)(device_mem->v_addr);
+  if (header->dmabuf_magic_m != tpu_dmabuf_header_m) {
+    TPU_ASSERT(0, NULL);
+    return BM_ERR_FAILURE;
+  }
+
+  header->arraybase_0_L = (uint32_t)p_array_base->gaddr_base0;
+  header->arraybase_1_L = (uint32_t)p_array_base->gaddr_base1;
+  header->arraybase_2_L = (uint32_t)p_array_base->gaddr_base2;
+  header->arraybase_3_L = (uint32_t)p_array_base->gaddr_base3;
+  header->arraybase_4_L = (uint32_t)p_array_base->gaddr_base4;
+  header->arraybase_5_L = (uint32_t)p_array_base->gaddr_base5;
+  header->arraybase_6_L = (uint32_t)p_array_base->gaddr_base6;
+  header->arraybase_7_L = (uint32_t)p_array_base->gaddr_base7;
+  if (protect) {
+    mem_protect(cmdbuf_mem->v_addr, cmdbuf_mem->size);
+  }
+  // submit
+  bmerr_t ret = submit_dmabuf(ctx->dev, device_mem->dma_fd, ctx->seq_no);
+  *seq_no = ctx->seq_no++;
+  BMDEV_UNLOCK(ctx->dev);
+
+  return ret ? BM_ERR_FAILURE : BM_SUCCESS;
+}
+
+bmerr_t CviDeviceMem::run_async(bmctx_t ctx, bmmem_device_t cmdbuf_mem)
+{
+  uint16_t seq_no_current = 0;
+  int i;
+
+  // seq_no must be protected
+  BMDEV_LOCK(ctx->dev);
+  bm_memory_t *device_mem = (bm_memory_t *)cmdbuf_mem;
+
+  // submit
+  bmerr_t ret = submit_dmabuf(ctx->dev, device_mem->dma_fd, ctx->seq_no);
+  seq_no_current = ctx->seq_no++;
+  BMDEV_UNLOCK(ctx->dev);
+
+  ROOTDAEMON_LOCK();
+  for (i = 0; i < SUBMIT_QUEUE_MAX; i++) {
+    if (!root_submit_array[i]) {
+      root_submit_array[i] = seq_no_current;
+      break;
+    }
+  }
+  ROOTDAEMON_UNLOCK();
+
+  return ret ?
BM_ERR_FAILURE : BM_SUCCESS; +} diff --git a/cviruntime/src/soc/common/cvi_device_mem.h b/cviruntime/src/soc/common/cvi_device_mem.h new file mode 100644 index 000000000..792053ca4 --- /dev/null +++ b/cviruntime/src/soc/common/cvi_device_mem.h @@ -0,0 +1,133 @@ +#pragma once +#include +#include +#include +#include +#include "bmruntime.h" +#include "linux/ion.h" +#include "linux/bm_npu_ioctl.h" +#include "bm_types.h" + +#define TPU_DEV_NAME "/dev/cvi-tpu0" +#define ION_DEV_NAME "/dev/ion" +#define MAX_HEAP_COUNT ION_HEAP_TYPE_CUSTOM + +typedef struct __tee_firewall_info { + uint64_t address; +} tee_firewall_info; + +#define CTX_MAX_CNT 100 +#define SUBMIT_QUEUE_MAX 100 //kernel driver queue is 100 as well +#define TEE_FIREWALL_MAX 6 + +#define ROOTDAEMON_LOCK() pthread_mutex_lock(&root_daemon_lock) +#define ROOTDAEMON_UNLOCK() pthread_mutex_unlock(&root_daemon_lock) + +#define MEMARRAY_DUMP 0 + +#if MEMARRAY_DUMP + #define BMEMEM_DUMP() bmmem_dump_mem_array() +#else + #define BMEMEM_DUMP() +#endif + +class CviDeviceMem { +public: + CviDeviceMem(); + virtual ~CviDeviceMem(); + // bmruntime_soc.cpp + virtual bmmem_device_t mem_alloc_raw(bmctx_t ctx, size_t size); + virtual bmmem_device_t mem_alloc_pagesize(bmctx_t ctx, size_t size); + virtual bmmem_device_t mem_prealloc_raw(bmctx_t ctx, bmmem_device_t mem, uint64_t offset, + size_t size); + virtual void mem_free_raw(bmctx_t ctx, bmmem_device_t mem) = 0; + virtual void mem_free_ex(uint64_t p_addr); + virtual size_t mem_size(bmmem_device_t mem); + virtual uint64_t mem_p_addr(bmmem_device_t mem); + virtual uint8_t *mem_v_addr(bmmem_device_t mem); + virtual int32_t mem_inc_ref(bmmem_device_t mem); + virtual int32_t mem_dec_ref(bmmem_device_t mem); + virtual bmerr_t mem_memcpy_s2d(bmctx_t ctx, bmmem_device_t dst, uint8_t *src); + virtual bmerr_t mem_memcpy_s2d_ex(bmctx_t ctx, bmmem_device_t dst, uint8_t * src, + uint64_t offset, size_t size); + virtual bmerr_t mem_memcpy_d2s(bmctx_t ctx, uint8_t *dst, bmmem_device_t src); + virtual bmerr_t mem_memcpy_d2s_ex(bmctx_t ctx, uint8_t * dst, bmmem_device_t src, + uint64_t offset, size_t size); + virtual bmerr_t mem_device_flush(bmctx_t ctx, bmmem_device_t mem); + virtual bmerr_t mem_device_flush_len(bmctx_t ctx, bmmem_device_t mem, size_t len); + virtual bmerr_t mem_device_invld(bmctx_t ctx, bmmem_device_t mem); + virtual bmerr_t mem_device_invld_len(bmctx_t ctx, bmmem_device_t mem, size_t len); + + virtual bmerr_t context_create(bmctx_t *ctx); + virtual void context_destroy(bmctx_t ctx); + virtual bmerr_t bind_device(bmctx_t ctx, bmdev_t dev); + virtual void unbind_device(bmctx_t ctx); + virtual bmdev_t get_device(bmctx_t ctx); + virtual bmerr_t device_init(int index, bmctx_t *ctx); + virtual void device_exit(bmctx_t ctx); + virtual bmerr_t load_cmdbuf(bmctx_t ctx, uint8_t * cmdbuf, size_t sz, + uint64_t neuron_gaddr, uint64_t weight_gaddr, + bool enable_pmu, bmmem_device_t *cmdbuf_mem) = 0; + virtual bmerr_t load_cmdbuf_tee(bmctx_t ctx, uint8_t * cmdbuf, size_t sz, + uint64_t neuron_gaddr, uint64_t weight_gaddr, + uint32_t weight_len, bmmem_device_t * cmdbuf_mem) = 0; + virtual bmerr_t load_dmabuf(bmctx_t ctx, bmmem_device_t in_mem, + size_t sz, uint64_t neuron_gaddr, uint64_t weight_gaddr, + bool enable_pmu, bmmem_device_t *dmabuf_mem) = 0; + virtual bmerr_t run_cmdbuf_tee(bmctx_t ctx, uint16_t * seq_no, uint64_t dmabuf_addr, + cvi_array_base * array_base); + virtual bmerr_t run_cmdbuf(bmctx_t ctx, bmmem_device_t cmdbuf_mem, uint16_t *seq_no); + virtual bmerr_t run_cmdbuf_ex(bmctx_t ctx, 
bmmem_device_t cmdbuf_mem, uint16_t *seq_no, + uint64_t input_base_addr, uint64_t output_base_addr); + virtual bmerr_t run_cmdbuf_ex2(bmctx_t ctx, bmmem_device_t cmdbuf_mem, uint16_t *seq_no, + cvi_array_base *p_array_base); + virtual bmerr_t run_async(bmctx_t ctx, bmmem_device_t cmdbuf_mem); + virtual bmerr_t send_cmdbuf(bmctx_t ctx, uint8_t *cmdbuf, size_t sz, uint16_t *seq_no); + virtual bmerr_t wait_cmdbuf_done(bmctx_t ctx, uint16_t seq_no); + virtual bmerr_t wait_cmdbuf_all(bmctx_t ctx); + virtual bmerr_t run_cmdbuf_pio(bmctx_t ctx, uint8_t *cmdbuf, size_t sz); + virtual void set_base_reg(bmctx_t ctx, uint32_t inx, uint64_t addr); + virtual uint64_t read_base_reg(bmctx_t ctx, u32 inx); + virtual int get_chip_ver(bmdev_t dev) = 0; + virtual bmerr_t parse_pmubuf(bmmem_device_t cmdbuf_mem, uint8_t **buf_start, uint32_t *buf_len) = 0; + virtual void cvikernel_create(bmctx_t ctx, void **p_bk_ctx) = 0; + virtual void cvikernel_submit(bmctx_t ctx) = 0; + virtual void cvikernel_destroy(bmctx_t ctx) = 0; + +public: + virtual void bmmem_dump_mem_array(void); + bmerr_t ion_query_heap(bmdev_t dev); +#ifdef ION_CACHE_OPEN + virtual bmerr_t mem_flush_fd(bmdev_t dev, int dma_fd); + virtual bmerr_t mem_invld_fd(bmdev_t dev, int dma_fd); +#endif + virtual bmerr_t mem_flush_ext(bmdev_t dev, int dma_fd, uint64_t paddr, size_t size); + virtual bmerr_t mem_invld_ext(bmdev_t dev, int dma_fd, uint64_t paddr, size_t size); + virtual bmerr_t submit_dmabuf(bmdev_t dev, int dma_fd, uint32_t seq_no); + virtual bmerr_t wait_dmabuf(bmdev_t dev, uint32_t seq_no); + virtual bmerr_t mem_alloc(bmdev_t dev, size_t size, uint64_t *paddr, uint8_t **vaddr, int *dma_fd); + virtual bmerr_t mem_free(uint8_t *vaddr, size_t size, int dma_fd); + virtual bmerr_t device_open(int index, bmdev_t *dev) = 0; + virtual void device_close(bmdev_t dev) = 0; + virtual bmerr_t load_tee(bmctx_t ctx, uint64_t cmdbuf_addr_ree, uint32_t cmdbuf_len_ree, + uint64_t weight_addr_ree, uint32_t weight_len_ree, uint64_t neuron_addr_ree); + virtual bmerr_t unload_tee(bmctx_t ctx, uint64_t paddr, size_t size) = 0; + virtual bmerr_t run_tee(bmctx_t ctx, uint32_t seq_no, uint64_t dmabuf_addr, + uint64_t array_base2, uint64_t array_base3, + uint64_t array_base4, uint64_t array_base5, + uint64_t array_base6, uint64_t array_base7); + virtual bmerr_t reopen_dev(bmdev_t dev, int flag); + bmerr_t ion_ioctl(int fd, unsigned int heap_id_mask, size_t* size, uint64_t *paddr, int *dma_fd); + + protected: + uint64_t GLOBAL_MEM_START_ADDR; + uint64_t g_gmem_size; + uint16_t tpu_dmabuf_header_m; + bool ion_legacy = false; + bool protect = false; //if cmdbuf_mem protect +public: + static bmctx_t root_ctx_array[CTX_MAX_CNT]; + static uint16_t root_submit_array[SUBMIT_QUEUE_MAX]; + static pthread_mutex_t root_daemon_lock; + static tee_firewall_info root_tee_firewall_info[TEE_FIREWALL_MAX]; +}; diff --git a/cviruntime/src/soc/common/cvi_rt_base.cpp b/cviruntime/src/soc/common/cvi_rt_base.cpp new file mode 100644 index 000000000..1fd86825f --- /dev/null +++ b/cviruntime/src/soc/common/cvi_rt_base.cpp @@ -0,0 +1,389 @@ +#include +#include +#include +#include + +#include +#include "cvi_rt_base.h" +#include "bmruntime.h" + +CviRTBase::CviRTBase() {} +CviRTBase::~CviRTBase() {} +CviRTSoc::~CviRTSoc() {} + +CVI_RC CviRTSoc::SubmitBK(CVI_RT_HANDLE rt_handle) { + cvi_device->cvikernel_submit((bmctx_t)rt_handle); + return CVI_SUCCESS; +} + +CVI_RC CviRTSoc::SubmitPio(CVI_RT_HANDLE rt_handle) { + (void)rt_handle; + TPU_ASSERT(0, NULL); // not support + return CVI_SUCCESS; +} + 
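Before the per-method implementations, it helps to see the call order the class is designed for; clients normally reach it through the C wrappers in cviruntime_context.cpp further below. A minimal lifecycle sketch, with error handling omitted and the command-buffer size purely illustrative:

    CVI_RT_HANDLE rt;
    CVI_RT_Init(&rt);                      // device_open + context_create + bind_device
    CVI_RT_KHANDLE k = CVI_RT_RegisterKernel(rt, 0x100000);  // 1 MiB cmdbuf

    // ... emit TIU/TDMA operations through the cvikernel context in k ...

    CVI_RT_Submit(k);                      // acquire cmdbuf, load, run, wait for done
    CVI_RT_UnRegisterKernel(k);
    CVI_RT_DeInit(rt);

Submit() is the synchronous path (send_cmdbuf waits on the sequence number); SubmitAsync()/WaitForAsync() instead queue the dmabuf and collect completions later.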
+CVI_RC CviRTSoc::Init(CVI_RT_HANDLE *rt_handle) {
+  bmctx_t *ctx = (bmctx_t *)rt_handle;
+  cvi_device->device_init(DEVICE_INDEX_NUM, ctx);
+  return CVI_SUCCESS;
+}
+
+CVI_RC CviRTSoc::DeInit(CVI_RT_HANDLE rt_handle) {
+  bmctx_t ctx = (bmctx_t)rt_handle;
+
+  // deinit basic context
+  cvi_device->device_exit(ctx);
+  return CVI_SUCCESS;
+}
+
+CVI_RT_KHANDLE CviRTSoc::RegisterKernel(CVI_RT_HANDLE rt_handle, uint32_t cmdbuf_size)
+{
+  bmctx_t ctx = (bmctx_t)rt_handle;
+  cvk_reg_info_t req_info;
+  cvk_context_t *tmp_cvk_context;
+  cvi_rt_submit *submit_handle;
+
+  // fill cvikernel request info
+  memset(&req_info, 0, sizeof(cvk_reg_info_t));
+  strncpy(req_info.chip_ver_str, chip_name_.c_str(), sizeof(req_info.chip_ver_str)-1);
+  req_info.cmdbuf_size = cmdbuf_size;
+  req_info.cmdbuf = (uint8_t *)malloc(req_info.cmdbuf_size);
+  if (!req_info.cmdbuf) {
+    TPU_ASSERT(req_info.cmdbuf, "Expect allocated cmdbuf");
+    return NULL;
+  }
+
+  // register cvikernel
+  tmp_cvk_context = cvikernel_register(&req_info);
+  submit_handle = (cvi_rt_submit *)malloc(sizeof(cvi_rt_submit));
+  if (!submit_handle) {
+    TPU_ASSERT(submit_handle, "Expect allocated kernel context");
+    free(req_info.cmdbuf);
+    return NULL;
+  }
+
+  memset(submit_handle, 0, sizeof(cvi_rt_submit));
+
+  // assign handle-mapping fields, and reassign the cvikernel handle
+  memcpy(submit_handle, tmp_cvk_context, sizeof(cvk_context_t));
+  submit_handle->rt_ctx = ctx;
+  submit_handle->cmdbuf = req_info.cmdbuf;
+  submit_handle->magic = submit_magic_;
+  free(tmp_cvk_context);
+
+  return submit_handle;
+}
+
+
+CVI_RC CviRTSoc::UnRegisterKernel(CVI_RT_KHANDLE rt_khandle)
+{
+  cvk_context_t *cvk_context = (cvk_context_t *)rt_khandle;
+  cvi_rt_submit *submit_handle = (cvi_rt_submit *)rt_khandle;
+
+  if (!cvk_context) {
+    TPU_ASSERT(0, "CVI_RT_UnRegisterKernel() NULL kernel handle");
+    return CVI_FAILURE;
+  }
+
+  if (submit_handle->dmabuf)
+    cvi_device->mem_free_raw(submit_handle->rt_ctx, submit_handle->dmabuf);
+
+  cvk_context->ops->cleanup(cvk_context);
+
+  if (cvk_context->priv_data)
+    free(cvk_context->priv_data);
+
+  if (submit_handle->cmdbuf) {
+    free(submit_handle->cmdbuf);
+  }
+
+  free(rt_khandle);
+  return CVI_SUCCESS;
+}
+
+CVI_RC CviRTSoc::SubmitAsync(CVI_RT_KHANDLE rt_khandle, uint8_t submit_previous)
+{
+  cvi_rt_submit *submit_handle = (cvi_rt_submit *)rt_khandle;
+  uint32_t len;
+  bmctx_t ctx = submit_handle->rt_ctx;
+
+  if (submit_handle->magic != submit_magic_) {
+    TPU_LOG_WARNING("incorrect submit handle input\n");
+    return CVI_FAILURE;
+  }
+
+  if (submit_previous) {
+    if (submit_handle->dmabuf) {
+      cvi_run_async(ctx, submit_handle->dmabuf);
+    } else {
+      TPU_LOG_WARNING("CVI_RT_SubmitAsync() previous cmdbuf is NULL!\n");
+      return CVI_FAILURE;
+    }
+
+  } else {
+    cvk_context_t *cvk_context = &submit_handle->cvk_ctx;
+    uint8_t *cmdbuf = cvk_context->ops->acquire_cmdbuf(cvk_context, &len);
+    bmmem_device_t dmabuf_mem;
+
+    // free the previous dmabuf
+    if (submit_handle->dmabuf)
+      cvi_device->mem_free_raw(ctx, submit_handle->dmabuf);
+
+    // load and run
+    cvi_device->load_cmdbuf(ctx, cmdbuf, (size_t)len, 0, 0, false, &dmabuf_mem);
+    cvi_device->run_async(ctx, dmabuf_mem);
+
+    // record the latest dmabuf
+    submit_handle->dmabuf = dmabuf_mem;
+    cvk_context->ops->reset(cvk_context);
+  }
+
+  return CVI_SUCCESS;
+}
+
+CVI_RC CviRTSoc::WaitForAsync(CVI_RT_KHANDLE rt_khandle)
+{
+  cvi_rt_submit *submit_handle = (cvi_rt_submit *)rt_khandle;
+  return (CVI_RC)cvi_device->wait_cmdbuf_all(submit_handle->rt_ctx);
+}
+
+CVI_RC CviRTSoc::Submit(CVI_RT_KHANDLE rt_khandle)
+{
+  cvi_rt_submit
*submit_handle = (cvi_rt_submit *)rt_khandle; + uint32_t len; + uint16_t seq_no; + + if (submit_handle->magic != submit_magic_) { + TPU_LOG_WARNING("incorrect submit handle input\n"); + return CVI_FAILURE; + } + + cvk_context_t *cvk_context = &submit_handle->cvk_ctx; + uint8_t *cmdbuf = cvk_context->ops->acquire_cmdbuf(cvk_context, &len); + + int ret = cvi_device->send_cmdbuf(submit_handle->rt_ctx, cmdbuf, (size_t)len, &seq_no); + if (ret != 0) { + TPU_LOG_WARNING("send_cmdbuf failed\n"); + } + cvk_context->ops->reset(cvk_context); + return ret; +} + +CVI_RC CviRTSoc::LoadCmdbuf( + CVI_RT_HANDLE rt_handle, uint8_t *cmdbuf, + uint64_t cmdbuf_sz, uint64_t gaddr_base0, + uint64_t gaddr_base1, bool enable_pmu, + CVI_RT_MEM *cmdbuf_mem) +{ + return (CVI_RC)cvi_device->load_cmdbuf( + (bmctx_t)rt_handle, cmdbuf, (size_t)cmdbuf_sz, + (uint64_t)gaddr_base0, + (uint64_t)gaddr_base1, + enable_pmu, (bmmem_device_t *)cmdbuf_mem); +} + +CVI_RC CviRTSoc::LoadDmabuf( + CVI_RT_HANDLE rt_handle, CVI_RT_MEM dmabuf, + uint64_t dmabuf_sz, uint64_t gaddr_base0, + uint64_t gaddr_base1, bool enable_pmu, CVI_RT_MEM *dmabuf_mem) +{ + return (CVI_RC)cvi_device->load_dmabuf( + (bmctx_t)rt_handle, (bmmem_device_t)dmabuf, + (size_t)dmabuf_sz, + (uint64_t)gaddr_base0, + (uint64_t)gaddr_base1, + enable_pmu, + (bmmem_device_t *)dmabuf_mem); +} + +CVI_RC CviRTSoc::RunCmdbuf( + CVI_RT_HANDLE rt_handle, CVI_RT_MEM cmdbuf_mem, + uint64_t gaddr_base2, uint64_t gaddr_base3) +{ + CVI_RC ret; + uint16_t seq_no; + ret = (CVI_RC)cvi_device->run_cmdbuf_ex( + (bmctx_t)rt_handle, (bmmem_device_t)cmdbuf_mem, + &seq_no, gaddr_base2, gaddr_base3); + if (ret != 0) { + TPU_LOG_ERROR("RunCmdbuf fail!"); + return ret; + } + + return (CVI_RC)cvi_device->wait_cmdbuf_done((bmctx_t)rt_handle, seq_no); +} + +CVI_RC CviRTSoc::RunCmdbufEx( + CVI_RT_HANDLE rt_handle, CVI_RT_MEM cmdbuf_mem, + CVI_RT_ARRAYBASE *p_array_base) +{ + CVI_RC ret; + uint16_t seq_no; + + ret = (CVI_RC)cvi_device->run_cmdbuf_ex2( + (bmctx_t)rt_handle, (bmmem_device_t)cmdbuf_mem, + &seq_no, (cvi_array_base *)p_array_base); + if (ret != 0) + return ret; + + return (CVI_RC)cvi_device->wait_cmdbuf_done((bmctx_t)rt_handle, seq_no); +} + +CVI_RT_MEM CviRTSoc::MemAlloc(CVI_RT_HANDLE rt_handle, uint64_t size) +{ + return (CVI_RT_MEM)cvi_device->mem_alloc_raw((bmctx_t)rt_handle, size); +} + +CVI_RT_MEM CviRTSoc::MemPreAlloc(CVI_RT_MEM mem, uint64_t offset, uint64_t size) +{ + bm_memory_t *dev_mem = (bm_memory_t*)mem; + + TPU_ASSERT(dev_mem != nullptr, nullptr); + TPU_ASSERT(dev_mem->size >= size + offset, nullptr); + bm_memory_t *preAlloc_mem = new bm_memory_t(); + preAlloc_mem->flags.u.is_prealloc = 1; + preAlloc_mem->flags.u.type = BMMEM_TYPE_DEVICE; + preAlloc_mem->p_addr = ((bm_memory_t *)dev_mem)->p_addr + offset; + preAlloc_mem->v_addr = ((bm_memory_t *)dev_mem)->v_addr + offset; + preAlloc_mem->dma_fd = ((bm_memory_t *)dev_mem)->dma_fd; + preAlloc_mem->size = size; + return (CVI_RT_MEM)preAlloc_mem; +} + +void CviRTSoc::MemFree(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem) +{ + cvi_device->mem_free_raw((bmctx_t)rt_handle, (bmmem_device_t)mem); +} + +void CviRTSoc::MemFreeEx(uint64_t p_addr) +{ + cvi_device->mem_free_ex(p_addr); +} + +uint64_t CviRTSoc::MemGetSize(CVI_RT_MEM mem) +{ + if (!mem) + return 0; + bm_memory_t *dev_mem = (bm_memory_t *)mem; + TPU_ASSERT(dev_mem->flags.u.type == BMMEM_TYPE_DEVICE, NULL); + return dev_mem->size; +} + +uint64_t CviRTSoc::MemGetPAddr(CVI_RT_MEM mem) +{ + if (!mem) + return 0; + bm_memory_t *dev_mem = (bm_memory_t *)mem; + 
TPU_ASSERT(dev_mem->flags.u.type == BMMEM_TYPE_DEVICE, NULL); + return dev_mem->p_addr; +} + +uint8_t* CviRTSoc::MemGetVAddr(CVI_RT_MEM mem) +{ + if (!mem) + return 0; + bm_memory_t *dev_mem = (bm_memory_t *)mem; + TPU_ASSERT(dev_mem->flags.u.type == BMMEM_TYPE_DEVICE, NULL); + return dev_mem->v_addr; +} + +int32_t CviRTSoc::MemIncRef(CVI_RT_MEM mem) { + bm_memory_t *device_mem = (bm_memory_t *)mem; + TPU_ASSERT(device_mem->flags.u.type == BMMEM_TYPE_DEVICE, NULL); + return (++device_mem->user_ref_cnt); +} + +int32_t CviRTSoc::MemDecRef(CVI_RT_MEM mem) { + bm_memory_t *device_mem = (bm_memory_t *)mem; + TPU_ASSERT(device_mem->flags.u.type == BMMEM_TYPE_DEVICE, NULL); + return (--device_mem->user_ref_cnt); +} + +CVI_RC CviRTSoc::MemFlush(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem) +{ + bmctx_t ctx = (bmctx_t)rt_handle; + bm_memory_t *device_mem = (bm_memory_t *)mem; + TPU_ASSERT(cvi_device->mem_flush_ext(ctx->dev, device_mem->dma_fd, + device_mem->p_addr, device_mem->size) == BM_SUCCESS, NULL); + return CVI_SUCCESS; +} + +CVI_RC CviRTSoc::MemInvld(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem) +{ + bmctx_t ctx = (bmctx_t)rt_handle; + bm_memory_t *device_mem = (bm_memory_t *)mem; + TPU_ASSERT(cvi_device->mem_invld_ext(ctx->dev, device_mem->dma_fd, + device_mem->p_addr, device_mem->size) == BM_SUCCESS, NULL); + return CVI_SUCCESS; +} + +CVI_RC CviRTSoc::MemFlushEx(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem, uint64_t len) +{ + bmctx_t ctx = (bmctx_t)rt_handle; + bm_memory_t *device_mem = (bm_memory_t *)mem; + TPU_ASSERT(cvi_device->mem_flush_ext(ctx->dev, device_mem->dma_fd, + device_mem->p_addr, len) == BM_SUCCESS, NULL); + return CVI_SUCCESS; +} + +CVI_RC CviRTSoc::MemInvldEx(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem, uint64_t len) +{ + bmctx_t ctx = (bmctx_t)rt_handle; + bm_memory_t *device_mem = (bm_memory_t *)mem; + TPU_ASSERT(cvi_device->mem_invld_ext(ctx->dev, device_mem->dma_fd, + device_mem->p_addr, len) == BM_SUCCESS, NULL); + return CVI_SUCCESS; +} + +CVI_RC CviRTSoc::MemCopyS2D(CVI_RT_HANDLE rt_handle, CVI_RT_MEM dst, uint8_t* src) +{ + bmctx_t ctx = (bmctx_t)rt_handle; + bm_memory_t *device_mem = (bm_memory_t *)dst; + memcpy(device_mem->v_addr, src, device_mem->size); + TPU_ASSERT((int)cvi_device->mem_flush_ext(ctx->dev, device_mem->dma_fd, + device_mem->p_addr, device_mem->size) == BM_SUCCESS, NULL); + return CVI_SUCCESS; +} + +CVI_RC CviRTSoc::MemCopyS2DEx( + CVI_RT_HANDLE rt_handle, + CVI_RT_MEM dst, uint64_t offset, + uint64_t len, uint8_t* src) +{ + bmctx_t ctx = (bmctx_t)rt_handle; + bm_memory_t *device_mem = (bm_memory_t *)dst; + TPU_ASSERT((size_t)(offset + len) <= device_mem->size, nullptr); + memcpy(device_mem->v_addr + offset, src, len); + TPU_ASSERT((int)cvi_device->mem_flush_ext(ctx->dev, device_mem->dma_fd, + device_mem->p_addr + offset, len) == BM_SUCCESS, NULL); + return CVI_SUCCESS; +} + +CVI_RC CviRTSoc::MemCopyD2S(CVI_RT_HANDLE rt_handle, uint8_t* dst, CVI_RT_MEM src) +{ + bmctx_t ctx = (bmctx_t)rt_handle; + bm_memory_t *device_mem = (bm_memory_t *)src; + TPU_ASSERT(cvi_device->mem_invld_ext(ctx->dev, device_mem->dma_fd, + device_mem->p_addr, device_mem->size) == BM_SUCCESS, NULL); + memcpy(dst, device_mem->v_addr, device_mem->size); + return CVI_SUCCESS; +} + +CVI_RC CviRTSoc::ParsePmuBuf(CVI_RT_MEM cmdbuf_mem, uint8_t **buf_start, uint32_t *buf_len) +{ + return (CVI_RC)cvi_device->parse_pmubuf((bmmem_device_t)cmdbuf_mem, buf_start, buf_len); +} + +CVI_RC CviRTSoc::SetBaseReg(CVI_RT_HANDLE rt_handle, uint32_t inx, uint64_t base_addr) +{ + bmctx_t ctx = (bmctx_t)rt_handle; + 
+ if (inx == 0) + ctx->array_base0 = base_addr; + else if (inx == 1) + ctx->array_base1 = base_addr; + else + TPU_ASSERT(0, NULL); // not support + + return CVI_SUCCESS; +} \ No newline at end of file diff --git a/cviruntime/src/soc/common/cvi_rt_base.h b/cviruntime/src/soc/common/cvi_rt_base.h new file mode 100644 index 000000000..ad8d00596 --- /dev/null +++ b/cviruntime/src/soc/common/cvi_rt_base.h @@ -0,0 +1,134 @@ +#pragma once +#include +#include "bmruntime.h" +#include "cvi_device_mem.h" +#include "cviruntime_context.h" +#include "bm_types.h" +#include + +#define DEVICE_INDEX_NUM 0 +typedef struct _cvi_rt_submit { + cvk_context_t cvk_ctx; + bmctx_t rt_ctx; + uint8_t *cmdbuf; + bmmem_device_t dmabuf; + uint32_t magic; +} cvi_rt_submit; + +class CviRTBase { +public: + CviRTBase(); + CviRTBase(const CviRTBase&) = delete; + CviRTBase(CviRTBase&&) = delete; + CviRTBase& operator=(const CviRTBase&) = delete; + CviRTBase& operator=(CviRTBase&&) = delete; + virtual ~CviRTBase() = 0; + virtual CVI_RC DeInitBK(CVI_RT_HANDLE rt_handle) = 0; + virtual CVI_RC InitWithKernelBK(CVI_RT_HANDLE *rt_handle, uint32_t cmdbuf_size) = 0; + virtual CVI_RC SubmitBK(CVI_RT_HANDLE rt_handle) = 0; + virtual CVI_RT_KHANDLE GetKHandleBK(CVI_RT_HANDLE rt_handle) = 0; + virtual CVI_RC SubmitPio(CVI_RT_HANDLE rt_handle) = 0; + virtual CVI_RC Init(CVI_RT_HANDLE *rt_handle) = 0; + virtual CVI_RC DeInit(CVI_RT_HANDLE rt_handle) = 0; + virtual CVI_RT_KHANDLE RegisterKernel(CVI_RT_HANDLE rt_handle, uint32_t cmdbuf_size) = 0; + virtual CVI_RC UnRegisterKernel(CVI_RT_KHANDLE rt_khandle) = 0; + virtual CVI_RC SubmitAsync(CVI_RT_KHANDLE rt_khandle, uint8_t submit_previous) = 0; + virtual CVI_RC WaitForAsync(CVI_RT_KHANDLE rt_khandle) = 0; + virtual CVI_RC Submit(CVI_RT_KHANDLE rt_khandle) = 0; + virtual CVI_RC LoadCmdbuf(CVI_RT_HANDLE rt_handle, uint8_t *cmdbuf, + uint64_t cmdbuf_sz, uint64_t gaddr_base0, + uint64_t gaddr_base1, bool enable_pmu, + CVI_RT_MEM *cmdbuf_mem) = 0; + virtual CVI_RC LoadDmabuf( + CVI_RT_HANDLE rt_handle, CVI_RT_MEM dmabuf, + uint64_t dmabuf_sz, uint64_t gaddr_base0, + uint64_t gaddr_base1, bool enable_pmu, CVI_RT_MEM *dmabuf_mem) = 0; + virtual CVI_RC RunCmdbuf(CVI_RT_HANDLE rt_handle, CVI_RT_MEM cmdbuf_mem, + uint64_t gaddr_base2, uint64_t gaddr_base3) = 0; + virtual CVI_RC RunCmdbufEx(CVI_RT_HANDLE rt_handle, CVI_RT_MEM cmdbuf_mem, + CVI_RT_ARRAYBASE *p_array_base) = 0; + virtual CVI_RC LoadCmdbufTee(CVI_RT_HANDLE rt_handle, uint8_t *cmdbuf, + size_t sz, uint64_t neuron_gaddr, + uint64_t weight_gaddr, uint32_t weight_len, + CVI_RT_MEM *cmdbuf_mem) = 0; + virtual CVI_RC RunCmdbufTee(CVI_RT_HANDLE rt_handle, CVI_RT_MEM cmdbuf_mem, + CVI_RT_ARRAYBASE *p_array_base) = 0; + virtual CVI_RT_MEM MemAlloc(CVI_RT_HANDLE rt_handle, uint64_t size) = 0; + virtual CVI_RT_MEM MemPreAlloc(CVI_RT_MEM mem, uint64_t offset, uint64_t size) = 0; + virtual void MemFree(CVI_RT_HANDLE rt_hanlde, CVI_RT_MEM mem) = 0; + virtual void MemFreeEx(uint64_t p_addr) = 0; + virtual uint64_t MemGetSize(CVI_RT_MEM mem) = 0; + virtual uint64_t MemGetPAddr(CVI_RT_MEM mem) = 0; + virtual uint8_t *MemGetVAddr(CVI_RT_MEM mem) = 0; + virtual int32_t MemIncRef(CVI_RT_MEM mem) = 0; + virtual int32_t MemDecRef(CVI_RT_MEM mem) = 0; + virtual CVI_RC MemFlush(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem) = 0; + virtual CVI_RC MemInvld(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem) = 0; + virtual CVI_RC MemFlushEx(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem, uint64_t len) = 0; + virtual CVI_RC MemInvldEx(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem, uint64_t len) 
= 0;
+  virtual CVI_RC MemCopyS2D(CVI_RT_HANDLE rt_handle, CVI_RT_MEM dst, uint8_t *src) = 0;
+  virtual CVI_RC MemCopyS2DEx(CVI_RT_HANDLE rt_handle,
+                              CVI_RT_MEM dst, uint64_t offset,
+                              uint64_t len, uint8_t *src) = 0;
+  virtual CVI_RC MemCopyD2S(CVI_RT_HANDLE rt_handle, uint8_t *dst, CVI_RT_MEM src) = 0;
+  virtual CVI_RC ParsePmuBuf(CVI_RT_MEM cmdbuf_mem, uint8_t **buf_start,
+                             uint32_t *buf_len) = 0;
+  virtual CVI_RC SetBaseReg(CVI_RT_HANDLE rt_handle, uint32_t inx,
+                            uint64_t base_addr) = 0;
+
+ public:
+  std::string chip_name_;
+  std::unique_ptr<CviDeviceMem> cvi_device;
+};
+
+
+class CviRTSoc : public CviRTBase {
+public:
+  ~CviRTSoc() override;
+  virtual CVI_RC SubmitBK(CVI_RT_HANDLE rt_handle) override;
+  virtual CVI_RC SubmitPio(CVI_RT_HANDLE rt_handle) override;
+  virtual CVI_RC Init(CVI_RT_HANDLE *rt_handle) override;
+  virtual CVI_RC DeInit(CVI_RT_HANDLE rt_handle) override;
+  virtual CVI_RT_KHANDLE RegisterKernel(CVI_RT_HANDLE rt_handle, uint32_t cmdbuf_size) override;
+  virtual CVI_RC UnRegisterKernel(CVI_RT_KHANDLE rt_khandle) override;
+  virtual CVI_RC SubmitAsync(CVI_RT_KHANDLE rt_khandle, uint8_t submit_previous) override;
+  virtual CVI_RC WaitForAsync(CVI_RT_KHANDLE rt_khandle) override;
+  virtual CVI_RC Submit(CVI_RT_KHANDLE rt_khandle) override;
+  virtual CVI_RC LoadCmdbuf(CVI_RT_HANDLE rt_handle, uint8_t *cmdbuf,
+                            uint64_t cmdbuf_sz, uint64_t gaddr_base0,
+                            uint64_t gaddr_base1, bool enable_pmu,
+                            CVI_RT_MEM *cmdbuf_mem) override;
+  virtual CVI_RC LoadDmabuf(
+      CVI_RT_HANDLE rt_handle, CVI_RT_MEM dmabuf,
+      uint64_t dmabuf_sz, uint64_t gaddr_base0,
+      uint64_t gaddr_base1, bool enable_pmu, CVI_RT_MEM *dmabuf_mem) override;
+  virtual CVI_RC RunCmdbuf(CVI_RT_HANDLE rt_handle, CVI_RT_MEM cmdbuf_mem,
+                           uint64_t gaddr_base2, uint64_t gaddr_base3) override;
+  virtual CVI_RC RunCmdbufEx(
+      CVI_RT_HANDLE rt_handle, CVI_RT_MEM cmdbuf_mem,
+      CVI_RT_ARRAYBASE *p_array_base) override;
+  virtual CVI_RT_MEM MemAlloc(CVI_RT_HANDLE rt_handle, uint64_t size) override;
+  virtual CVI_RT_MEM MemPreAlloc(CVI_RT_MEM mem, uint64_t offset, uint64_t size) override;
+  virtual void MemFree(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem) override;
+  virtual void MemFreeEx(uint64_t p_addr) override;
+  virtual uint64_t MemGetSize(CVI_RT_MEM mem) override;
+  virtual uint64_t MemGetPAddr(CVI_RT_MEM mem) override;
+  virtual uint8_t* MemGetVAddr(CVI_RT_MEM mem) override;
+  virtual int32_t MemIncRef(CVI_RT_MEM mem) override;
+  virtual int32_t MemDecRef(CVI_RT_MEM mem) override;
+  virtual CVI_RC MemFlush(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem) override;
+  virtual CVI_RC MemInvld(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem) override;
+  virtual CVI_RC MemFlushEx(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem, uint64_t len) override;
+  virtual CVI_RC MemInvldEx(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem, uint64_t len) override;
+  virtual CVI_RC MemCopyS2D(CVI_RT_HANDLE rt_handle, CVI_RT_MEM dst, uint8_t *src) override;
+  virtual CVI_RC MemCopyS2DEx(CVI_RT_HANDLE rt_handle,
+                              CVI_RT_MEM dst, uint64_t offset,
+                              uint64_t len, uint8_t *src) override;
+  virtual CVI_RC MemCopyD2S(CVI_RT_HANDLE rt_handle, uint8_t* dst, CVI_RT_MEM src) override;
+  virtual CVI_RC ParsePmuBuf(CVI_RT_MEM cmdbuf_mem, uint8_t **buf_start,
+                             uint32_t *buf_len) override;
+  virtual CVI_RC SetBaseReg(CVI_RT_HANDLE rt_handle, uint32_t inx,
+                            uint64_t base_addr) override;
+public:
+  uint32_t submit_magic_;
+};
diff --git a/cviruntime/src/soc/common/cviruntime_context.cpp b/cviruntime/src/soc/common/cviruntime_context.cpp
new file mode 100644
index 000000000..5db190749
--- /dev/null
+++ 
b/cviruntime/src/soc/common/cviruntime_context.cpp
@@ -0,0 +1,212 @@
+#include
+#include
+#include
+#include "cviruntime_context.h"
+#include "bmruntime.h"
+#include "bmruntime_internal.h"
+
+#include
+#include
+#include "cvi_rt_base.h"
+
+#define DEVICE_INDEX_NUM 0
+
+extern std::unique_ptr<CviRTBase> cvi_chip;
+
+CVI_RC CVI_RT_DeInitBK(CVI_RT_HANDLE rt_handle)
+{
+  return cvi_chip->DeInitBK(rt_handle);
+}
+
+CVI_RC CVI_RT_InitWithKernelBK(CVI_RT_HANDLE *rt_handle, uint32_t cmdbuf_size)
+{
+  return cvi_chip->InitWithKernelBK(rt_handle, cmdbuf_size);
+}
+
+CVI_RC CVI_RT_SubmitBK(CVI_RT_HANDLE rt_handle)
+{
+  return cvi_chip->SubmitBK(rt_handle);
+}
+
+CVI_RT_KHANDLE CVI_RT_GetKHandleBK(CVI_RT_HANDLE rt_handle)
+{
+  return cvi_chip->GetKHandleBK(rt_handle);
+}
+
+CVI_RC CVI_RT_SubmitPio(CVI_RT_HANDLE rt_handle)
+{
+  return cvi_chip->SubmitPio(rt_handle);
+}
+
+CVI_RC CVI_RT_Init(CVI_RT_HANDLE *rt_handle)
+{
+  return cvi_chip->Init(rt_handle);
+}
+
+CVI_RC CVI_RT_DeInit(CVI_RT_HANDLE rt_handle)
+{
+  return cvi_chip->DeInit(rt_handle);
+}
+
+CVI_RT_KHANDLE CVI_RT_RegisterKernel(CVI_RT_HANDLE rt_handle, uint32_t cmdbuf_size)
+{
+  return cvi_chip->RegisterKernel(rt_handle, cmdbuf_size);
+}
+
+CVI_RC CVI_RT_UnRegisterKernel(CVI_RT_KHANDLE rt_khandle)
+{
+  return cvi_chip->UnRegisterKernel(rt_khandle);
+}
+
+CVI_RC CVI_RT_SubmitAsync(CVI_RT_KHANDLE rt_khandle, uint8_t submit_previous)
+{
+  return cvi_chip->SubmitAsync(rt_khandle, submit_previous);
+}
+
+CVI_RC CVI_RT_WaitForAsync(CVI_RT_KHANDLE rt_khandle)
+{
+  return cvi_chip->WaitForAsync(rt_khandle);
+}
+
+CVI_RC CVI_RT_Submit(CVI_RT_KHANDLE rt_khandle)
+{
+  return cvi_chip->Submit(rt_khandle);
+}
+
+CVI_RC CVI_RT_LoadCmdbuf(
+    CVI_RT_HANDLE rt_handle, uint8_t *cmdbuf,
+    uint64_t cmdbuf_sz, uint64_t gaddr_base0,
+    uint64_t gaddr_base1, bool enable_pmu,
+    CVI_RT_MEM *cmdbuf_mem)
+{
+  return cvi_chip->LoadCmdbuf(rt_handle, cmdbuf, cmdbuf_sz, gaddr_base0,
+                              gaddr_base1, enable_pmu, cmdbuf_mem);
+}
+
+CVI_RC CVI_RT_LoadDmabuf(
+    CVI_RT_HANDLE rt_handle, CVI_RT_MEM dmabuf,
+    uint64_t dmabuf_sz, uint64_t gaddr_base0,
+    uint64_t gaddr_base1, bool enable_pmu, CVI_RT_MEM *dmabuf_mem)
+{
+  return cvi_chip->LoadDmabuf(rt_handle, dmabuf, dmabuf_sz, gaddr_base0,
+                              gaddr_base1, enable_pmu, dmabuf_mem);
+}
+
+CVI_RC CVI_RT_RunCmdbuf(
+    CVI_RT_HANDLE rt_handle, CVI_RT_MEM cmdbuf_mem,
+    uint64_t gaddr_base2, uint64_t gaddr_base3)
+{
+  return cvi_chip->RunCmdbuf(rt_handle, cmdbuf_mem, gaddr_base2, gaddr_base3);
+}
+
+CVI_RC CVI_RT_RunCmdbufEx(
+    CVI_RT_HANDLE rt_handle, CVI_RT_MEM cmdbuf_mem,
+    CVI_RT_ARRAYBASE *p_array_base)
+{
+  return cvi_chip->RunCmdbufEx(rt_handle, cmdbuf_mem, p_array_base);
+}
+
+CVI_RC CVI_RT_LoadCmdbufTee(
+    CVI_RT_HANDLE rt_handle, uint8_t *cmdbuf,
+    size_t sz, uint64_t neuron_gaddr, uint64_t weight_gaddr, uint32_t weight_len, CVI_RT_MEM *cmdbuf_mem)
+{
+  return cvi_chip->LoadCmdbufTee(rt_handle, cmdbuf, sz, neuron_gaddr, weight_gaddr, weight_len, cmdbuf_mem);
+}
+
+CVI_RC CVI_RT_RunCmdbufTee(
+    CVI_RT_HANDLE rt_handle, CVI_RT_MEM cmdbuf_mem,
+    CVI_RT_ARRAYBASE *p_array_base)
+{
+  return cvi_chip->RunCmdbufTee(rt_handle, cmdbuf_mem, p_array_base);
+}
+
+CVI_RT_MEM CVI_RT_MemAlloc(CVI_RT_HANDLE rt_handle, uint64_t size)
+{
+  return cvi_chip->MemAlloc(rt_handle, size);
+}
+
+CVI_RT_MEM CVI_RT_MemPreAlloc(CVI_RT_MEM mem, uint64_t offset, uint64_t size)
+{
+  return cvi_chip->MemPreAlloc(mem, offset, size);
+}
+
+void CVI_RT_MemFree(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem)
+{
+  return cvi_chip->MemFree(rt_handle, mem);
+}
+
+void 
CVI_RT_MemFreeEx(uint64_t p_addr) +{ + return cvi_chip->MemFreeEx(p_addr); +} + +uint64_t CVI_RT_MemGetSize(CVI_RT_MEM mem) +{ + return cvi_chip->MemGetSize(mem); +} + +uint64_t CVI_RT_MemGetPAddr(CVI_RT_MEM mem) +{ + return cvi_chip->MemGetPAddr(mem); +} + +uint8_t* CVI_RT_MemGetVAddr(CVI_RT_MEM mem) +{ + return cvi_chip->MemGetVAddr(mem); +} + +int32_t CVI_RT_MemIncRef(CVI_RT_MEM mem) { + return cvi_chip->MemIncRef(mem); +} + +int32_t CVI_RT_MemDecRef(CVI_RT_MEM mem) { + return cvi_chip->MemDecRef(mem); +} + +CVI_RC CVI_RT_MemFlush(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem) +{ + return cvi_chip->MemFlush(rt_handle, mem); +} + +CVI_RC CVI_RT_MemInvld(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem) +{ + return cvi_chip->MemInvld(rt_handle, mem); +} + +CVI_RC CVI_RT_MemFlushEx(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem, uint64_t len) +{ + return cvi_chip->MemFlushEx(rt_handle, mem, len); +} + +CVI_RC CVI_RT_MemInvldEx(CVI_RT_HANDLE rt_handle, CVI_RT_MEM mem, uint64_t len) +{ + return cvi_chip->MemInvldEx(rt_handle, mem, len); +} + +CVI_RC CVI_RT_MemCopyS2D(CVI_RT_HANDLE rt_handle, CVI_RT_MEM dst, uint8_t* src) +{ + return cvi_chip->MemCopyS2D(rt_handle, dst, src); +} + +CVI_RC CVI_RT_MemCopyS2DEx( + CVI_RT_HANDLE rt_handle, + CVI_RT_MEM dst, uint64_t offset, + uint64_t len, uint8_t* src) +{ + return cvi_chip->MemCopyS2DEx(rt_handle, dst, offset, len, src); +} + +CVI_RC CVI_RT_MemCopyD2S(CVI_RT_HANDLE rt_handle, uint8_t* dst, CVI_RT_MEM src) +{ + return cvi_chip->MemCopyD2S(rt_handle, dst, src); +} + +CVI_RC CVI_RT_ParsePmuBuf(CVI_RT_MEM cmdbuf_mem, uint8_t **buf_start, uint32_t *buf_len) +{ + return cvi_chip->ParsePmuBuf(cmdbuf_mem, buf_start, buf_len); +} + +CVI_RC CVI_RT_SetBaseReg(CVI_RT_HANDLE rt_handle, uint32_t inx, uint64_t base_addr) +{ + return cvi_chip->SetBaseReg(rt_handle, inx, base_addr); +} diff --git a/cviruntime/src/soc/common/linux/bm_npu_ioctl.h b/cviruntime/src/soc/common/linux/bm_npu_ioctl.h new file mode 100644 index 000000000..6cb7b3d19 --- /dev/null +++ b/cviruntime/src/soc/common/linux/bm_npu_ioctl.h @@ -0,0 +1,78 @@ +/* + * Bitmain SoC NPU Controller Driver + * + * Copyright (c) 2018 Bitmain Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#ifndef __BM_NPU_IOCTL_H__ +#define __BM_NPU_IOCTL_H__ + +#include + +struct bm_cache_op_arg { + unsigned long long paddr; + unsigned long long size; + int dma_fd; +}; + +struct bm_submit_dma_arg { + int fd; + unsigned int seq_no; +}; + +struct bm_wait_dma_arg { + unsigned int seq_no; + int ret; +}; + +struct bm_pio_mode { + unsigned long long cmdbuf; + unsigned long long sz; +}; + +struct cvi_load_tee_arg { + //ree domain + unsigned long long cmdbuf_addr_ree; + unsigned int cmdbuf_len_ree; + unsigned long long weight_addr_ree; + unsigned int weight_len_ree; + unsigned long long neuron_addr_ree; + + //tee domain + unsigned long long dmabuf_addr_tee; +}; + +struct cvi_submit_tee_arg { + unsigned long long dmabuf_tee_addr; + unsigned long long gaddr_base2; + unsigned long long gaddr_base3; + unsigned long long gaddr_base4; + unsigned long long gaddr_base5; + unsigned long long gaddr_base6; + unsigned long long gaddr_base7; + unsigned int seq_no; +}; + +struct cvi_unload_tee_arg { + unsigned long long paddr; + unsigned long long size; +}; + +#define CVITPU_SUBMIT_DMABUF _IOW('p', 0x01, unsigned long long) +#define CVITPU_DMABUF_FLUSH_FD _IOW('p', 0x02, unsigned long long) +#define CVITPU_DMABUF_INVLD_FD _IOW('p', 0x03, unsigned long long) +#define CVITPU_DMABUF_FLUSH _IOW('p', 0x04, unsigned long long) +#define CVITPU_DMABUF_INVLD _IOW('p', 0x05, unsigned long long) +#define CVITPU_WAIT_DMABUF _IOWR('p', 0x06, unsigned long long) +#define CVITPU_PIO_MODE _IOW('p', 0x07, unsigned long long) + +#define CVITPU_LOAD_TEE _IOWR('p', 0x08, unsigned long long) +#define CVITPU_SUBMIT_TEE _IOW('p', 0x09, unsigned long long) +#define CVITPU_UNLOAD_TEE _IOW('p', 0x0A, unsigned long long) + +#endif /* __BM_NPU_IOCTL_H__ */ diff --git a/cviruntime/src/soc/common/linux/ion.h b/cviruntime/src/soc/common/linux/ion.h new file mode 100644 index 000000000..805aac04b --- /dev/null +++ b/cviruntime/src/soc/common/linux/ion.h @@ -0,0 +1,151 @@ +/* + * drivers/staging/android/uapi/ion.h + * + * Copyright (C) 2011 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#ifndef _UAPI_LINUX_ION_H +#define _UAPI_LINUX_ION_H + +#include +#include + +/** + * enum ion_heap_types - list of all possible types of heaps + * @ION_HEAP_TYPE_SYSTEM: memory allocated via vmalloc + * @ION_HEAP_TYPE_SYSTEM_CONTIG: memory allocated via kmalloc + * @ION_HEAP_TYPE_CARVEOUT: memory allocated from a prereserved + * carveout heap, allocations are physically + * contiguous + * @ION_HEAP_TYPE_DMA: memory allocated via DMA API + * @ION_NUM_HEAPS: helper for iterating over heaps, a bit mask + * is used to identify the heaps, so only 32 + * total heap types are supported + */ +enum ion_heap_type { + ION_HEAP_TYPE_SYSTEM, + ION_HEAP_TYPE_SYSTEM_CONTIG, + ION_HEAP_TYPE_CARVEOUT, + ION_HEAP_TYPE_CHUNK, + ION_HEAP_TYPE_DMA, + ION_HEAP_TYPE_CUSTOM, /* + * must be last so device specific heaps always + * are at the end of this enum + */ +}; + +#define ION_NUM_HEAP_IDS (sizeof(unsigned int) * 8) + +/** + * allocation flags - the lower 16 bits are used by core ion, the upper 16 + * bits are reserved for use by the heaps themselves. + */ + +/* + * mappings of this buffer should be cached, ion will do cache maintenance + * when the buffer is mapped for dma + */ +#define ION_FLAG_CACHED 1 + +/** + * DOC: Ion Userspace API + * + * create a client by opening /dev/ion + * most operations handled via following ioctls + * + */ + +#define MAX_ION_BUFFER_NAME 32 + +/** + * struct ion_allocation_data - metadata passed from userspace for allocations + * @len: size of the allocation + * @heap_id_mask: mask of heap ids to allocate from + * @flags: flags passed to heap + * @handle: pointer that will be populated with a cookie to use to + * refer to this allocation + * + * Provided by userspace as an argument to the ioctl + */ +struct ion_allocation_data { + __u64 len; + __u32 heap_id_mask; + __u32 flags; + __u32 fd; + __u32 unused; + __u64 paddr; + char name[MAX_ION_BUFFER_NAME]; +}; + +struct ion_allocation_data_legacy { + __u64 len; + __u32 heap_id_mask; + __u32 flags; + __u32 fd; + __u32 unused; + __u64 paddr; +}; + +#define MAX_HEAP_NAME 32 + +/** + * struct ion_heap_data - data about a heap + * @name - first 32 characters of the heap name + * @type - heap type + * @heap_id - heap id for the heap + */ +struct ion_heap_data { + char name[MAX_HEAP_NAME]; + __u32 type; + __u32 heap_id; + __u32 reserved0; + __u32 reserved1; + __u32 reserved2; +}; + +/** + * struct ion_heap_query - collection of data about all heaps + * @cnt - total number of heaps to be copied + * @heaps - buffer to copy heap data + */ +struct ion_heap_query { + __u32 cnt; /* Total number of heaps to be copied */ + __u32 reserved0; /* align to 64bits */ + __u64 heaps; /* buffer to be populated */ + __u32 reserved1; + __u32 reserved2; +}; + +#define ION_IOC_MAGIC 'I' + +/** + * DOC: ION_IOC_ALLOC - allocate memory + * + * Takes an ion_allocation_data struct and returns it with the handle field + * populated with the opaque handle for the allocation. + */ +#define ION_IOC_ALLOC _IOWR(ION_IOC_MAGIC, 0, \ + struct ion_allocation_data) +#define ION_IOC_ALLOC_LEGACY _IOWR(ION_IOC_MAGIC, 0, \ + struct ion_allocation_data_legacy) + +/** + * DOC: ION_IOC_HEAP_QUERY - information about available heaps + * + * Takes an ion_heap_query structure and populates information about + * available Ion heaps. 
+ */ +#define ION_IOC_HEAP_QUERY _IOWR(ION_IOC_MAGIC, 8, \ + struct ion_heap_query) + +#endif /* _UAPI_LINUX_ION_H */ diff --git a/cviruntime/src/soc/runtime_bmkernel.cpp b/cviruntime/src/soc/runtime_bmkernel.cpp new file mode 100644 index 000000000..1f66da1b2 --- /dev/null +++ b/cviruntime/src/soc/runtime_bmkernel.cpp @@ -0,0 +1,22 @@ +#include +#include +#include +#include +#include + +void bmruntime_bmkernel_create(bmctx_t ctx, void **p_bk_ctx) { + cviruntime_cvikernel_create(ctx, p_bk_ctx); +} + +void bmruntime_bmkernel_submit(bmctx_t ctx) { + cviruntime_cvikernel_submit(ctx); +} + +void bmruntime_bmkernel_destroy(bmctx_t ctx) { + cviruntime_cvikernel_destroy(ctx); +} + +void bmruntime_bmkernel_submit_pio(bmctx_t ctx) { + (void)ctx; + TPU_ASSERT(0, NULL); +} diff --git a/cviruntime/test/180x/test_180x_avg_pooling.c b/cviruntime/test/180x/test_180x_avg_pooling.c new file mode 100644 index 000000000..78817d577 --- /dev/null +++ b/cviruntime/test/180x/test_180x_avg_pooling.c @@ -0,0 +1,272 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" +#include "test_native_ref.h" + +#define INVALIDE_STRIDE (-1) + +static void print_pooling_param(const cvk_tiu_average_pooling_param_t *p) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + + printf(" Pooling parameters:\n"); + printf(" ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw); + printf(" opd0_sign = %d\n", p->ifmap->fmt == CVK_FMT_I8); + printf(" weight = (%d, %d)\n", p->kh, p->kw); + printf(" padding = (%d, %d, %d, %d)\n", + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right); + printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w); + printf(" ins0 = (%d, %d, %d, %d)\n", + p->ins_h, p->ins_last_h, p->ins_w, p->ins_last_w); + printf(" avg_pooling_const = %d\n", p->avg_pooling_const); + printf(" rshift_bits = %d\n", p->rshift_bits); +} + +static int8_t *alloc_input(cvk_tiu_average_pooling_param_t *p) +{ + uint64_t size = tl_shape_size(&p->ifmap->shape, p->ifmap->fmt); + int8_t *data = (int8_t *)malloc(size); + if (!data) + return NULL; + + for (uint64_t i = 0; i < size; i++) + data[i] = rand() % 256 - 128; + return data; +} + +static int8_t *alloc_output(cvk_tiu_average_pooling_param_t *p) +{ + uint64_t size = tl_shape_size(&p->ofmap->shape, p->ofmap->fmt); + return (int8_t *)malloc(size); +} + +static int pooling_ih_ext(cvk_tiu_average_pooling_param_t *p, int ih) +{ + int ins = p->ins_h; + int ins_last = p->ins_last_h; + int pad = p->pad_top + p->pad_bottom; + return (ih - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_iw_ext(cvk_tiu_average_pooling_param_t *p, int iw) +{ + int ins = p->ins_w; + int ins_last = p->ins_last_w; + int pad = p->pad_left + p->pad_right; + return (iw - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_oh(cvk_tiu_average_pooling_param_t *p, int ih) +{ + int ih_ext = pooling_ih_ext(p, ih); + return (ih_ext - p->kh) / p->stride_h + 1; +} + +static int pooling_ow(cvk_tiu_average_pooling_param_t *p, int iw) +{ + int iw_ext = pooling_iw_ext(p, iw); + return (iw_ext - p->kw) / p->stride_w + 1; +} + +static void free_pooling_param( + cvk_context_t *cvk_ctx, + cvk_tiu_average_pooling_param_t *p) +{ + if (p->ifmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->ifmap); + if (p->ofmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->ofmap); +} + +static cvk_tiu_average_pooling_param_t random_pooling_param(cvk_context_t *cvk_ctx, int stride_w, int stride_h) +{ + int retry_cnt = 100; + 
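+  /* Randomized parameter search: each attempt below draws a pooling config
+   * and keeps the first one whose pads fit the 4-bit hardware pad fields
+   * (the (1 << 4) checks) and whose ifmap/ofmap both allocate in local
+   * memory; after retry_cnt misses the last draw is returned even though it
+   * failed every check, so callers inherit NULL ifmap/ofmap in that case. */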
srand(clock()); + cvk_tiu_average_pooling_param_t p; + + for (int i = 0; i < retry_cnt; i++) { + int in = rand() % 5 + 1; + int ic = rand() % (3 * cvk_ctx->info.npu_num) + 1; + int ih = rand() % 30 + 3 + (INVALIDE_STRIDE == stride_h ? 0 : stride_h); + int iw = rand() % 30 + 6 + (INVALIDE_STRIDE == stride_w ? 0 : stride_w); + int opd0_sign = rand() % 2; + + memset(&p, 0, sizeof(p)); + p.kh = rand() % 7 + 1; + p.kw = rand() % 7 + 1; + p.stride_h = INVALIDE_STRIDE == stride_h ? rand() % (p.kh) + 1 : stride_h; + p.stride_w = INVALIDE_STRIDE == stride_w ? rand() % (p.kh) + 1 : stride_w; + p.ins_h = rand() % p.kh; + p.ins_w = rand() % p.kw; + p.ins_last_h = rand() % p.kh; + p.ins_last_w = rand() % p.kw; + p.pad_top = rand() % p.kh; + p.pad_bottom = rand() % p.kh; + p.pad_left = rand() % p.kw; + p.pad_right= rand() % p.kw; + p.avg_pooling_const = rand() % 256; + p.rshift_bits = rand() % 32; + + cvk_tl_shape_t ifmap_shape; + ifmap_shape.n = in; + ifmap_shape.c = ic; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + + int on = in; + int oc = ic; + int oh = pooling_oh(&p, ih); + int ow = pooling_ow(&p, iw); + cvk_tl_shape_t ofmap_shape; + ofmap_shape.n = on; + ofmap_shape.c = oc; + ofmap_shape.h = oh; + ofmap_shape.w = ow; + + cvk_fmt_t fmt = opd0_sign? CVK_FMT_I8: CVK_FMT_U8; + p.ofmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ofmap_shape, CVK_FMT_I8, 1); + p.ifmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ifmap_shape, fmt, 1); + + if ((p.kh > pooling_ih_ext(&p, ih)) + || (p.kw > pooling_iw_ext(&p, iw)) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || !p.ofmap + || !p.ifmap) { + printf("retry init_pooling_param\n"); + free_pooling_param(cvk_ctx, &p); + } else + break; + } + + return p; +} + +static int compare_results( + cvk_tiu_average_pooling_param_t *p, + int8_t input[], + int8_t output[]) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int opd0_sign = (p->ifmap->fmt == CVK_FMT_I8); + + int8_t *output_ref = alloc_output(p); + int ret = native_pooling_ave_int8( + input, &p->avg_pooling_const, NULL, output_ref, + in, ic, ih, iw, p->kh, p->kw, + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right, + p->stride_h, p->stride_w, + p->ins_h, p->ins_w, p->ins_last_h, p->ins_last_w, + opd0_sign, opd0_sign, p->rshift_bits, 1); + if (ret) + return ret; + + ret = array_cmp_int8( + "Comparing results ...\n", output_ref, output, + tl_shape_size(&p->ofmap->shape, p->ofmap->fmt)); + + if (ret != 0) { + printf("Comparison FAILED!!!\n"); + print_pooling_param(p); + } + + free(output_ref); + + return ret; +} + +static int _test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int stride_w, int stride_h) +{ + int ret; + cvk_tiu_average_pooling_param_t p = random_pooling_param(cvk_ctx, stride_w, stride_h); + int8_t *input = alloc_input(&p); + if (!input) + return -1; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, p.ifmap, (uint8_t *)input); + + cvk_ctx->ops->tiu_average_pooling(cvk_ctx, &p); + CVI_RT_Submit(cvk_ctx); + + int8_t *output = (int8_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, p.ofmap); + if (!output) + return -1; + + ret = compare_results(&p, input, output); + + free_pooling_param(cvk_ctx, &p); + free(output); + free(input); + + return ret; +} + +static int test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) { + return _test_pooling(rt_handle, cvk_ctx, INVALIDE_STRIDE, INVALIDE_STRIDE); +} + +static int test_avg_pooling(CVI_RT_HANDLE 
rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + for (uint64_t i = 0; i < 16; i++) + ret |= test_pooling(rt_handle, cvk_ctx); + + // test stride extend (0, 31] + int stride_list[] = {15, 16, 31}; + int stride_list_len = sizeof(stride_list) / sizeof(stride_list[0]); + + for (int stride_w_idx = 0; stride_w_idx < stride_list_len; stride_w_idx++) { + for (int stride_h_idx = 0; stride_h_idx < stride_list_len; stride_h_idx++) { + int stride_w = stride_list[stride_w_idx]; + int stride_h = stride_list[stride_h_idx]; + + ret |= _test_pooling(rt_handle, cvk_ctx, stride_w, stride_h); + } + } + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret = test_avg_pooling(rt_handle, cvk_ctx); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + if (!ret) + printf("%s pass\n", __FILENAME__); + else + printf("%s fail\n", __FILENAME__); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_bf16_avg_pooling.c b/cviruntime/test/180x/test_180x_bf16_avg_pooling.c new file mode 100644 index 000000000..7f3d29c6d --- /dev/null +++ b/cviruntime/test/180x/test_180x_bf16_avg_pooling.c @@ -0,0 +1,353 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" +#include "test_native_ref.h" + +#define INVALIDE_STRIDE (-1) +typedef cvk_tiu_average_pooling_param_t param_t; +int random_seed; +static void print_pooling_param(const param_t *p) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + + printf(" Pooling parameters:\n"); + printf(" random_seed : %d \n", random_seed); + printf(" ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw); + printf(" opd0_sign = %d\n", p->ifmap->fmt == CVK_FMT_I8); + printf(" weight = (%d, %d)\n", p->kh, p->kw); + printf(" padding = (%d, %d, %d, %d)\n", + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right); + printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w); + printf(" ins0 = (%d, %d, %d, %d)\n", + p->ins_h, p->ins_last_h, p->ins_w, p->ins_last_w); + printf(" avg_pooling_const = %d\n", p->avg_pooling_const); + printf(" rshift_bits = %d\n", p->rshift_bits); +} +static int index_get(int h, int w1, int w2) +{ + return h * w1 + w2; +} + +int native_pooling_avg_bf16( + const uint16_t* i_fmap, + const void* weight, + const uint32_t *bias, + uint16_t * o_fmap, + int input_n, int input_c, int input_h, int input_w, + int kh, int kw, + int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, + int stride_h, int stride_w, + int ins_h, int ins_w, + int ins_h_last, int ins_w_last, + int const_weight) +{ + if (kh * kw <= 0) + return -1; + + float *avg_pooling_mac_a = (float *)malloc(kh * kw * sizeof(float)); + float *avg_pooling_mac_b = (float *)malloc(kh * kw * sizeof(float)); + + uint16_t avg_const_weight = *(uint16_t *)weight; + const uint16_t *weight_arr = (uint16_t*)weight; + + int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); + int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); + + int output_h = calc_output_hw(h_after, kh, stride_h); + int output_w = calc_output_hw(w_after, kw, stride_w); + uint16_t *i_fmap_pad = NULL; + for (int n = 0; n < input_n; n++) { + if (const_weight == 0) + weight_arr = 
(uint16_t*)weight; + + for (int c = 0; c < input_c; ++c) { + fill_pad_fmap_bf16(i_fmap, &i_fmap_pad, cvk_convert_fp32_bf16(0), + pad_w_l, pad_w_r, pad_h_t, pad_h_b, + ins_h, ins_w, ins_h_last, ins_w_last, + input_h, input_w); + + for (int ph = 0; ph < output_h; ++ph) { + for (int pw = 0; pw < output_w; ++pw) { + int hstart = ph * stride_h; + int wstart = pw * stride_w; + int pool_index = index_get(ph, output_w, pw); + int mac_index = 0; + float avg_pool_result=0; + for (int h = 0; h < kh; h++) { + for (int w = 0; w < kw; w++) { + int index = index_get((hstart+h), w_after, (w+wstart)); + mac_index = index_get(h, kw, w); + float a = cvk_convert_bf16_fp32(i_fmap_pad[index]); + float b = const_weight ? + cvk_convert_bf16_fp32(avg_const_weight) : cvk_convert_bf16_fp32(weight_arr[mac_index]); + + avg_pool_result += a*b; + } + } + + if(bias) { + avg_pool_result += cvk_convert_hex_fp32(bias[c]); + } + *(o_fmap+pool_index) = cvk_convert_fp32_bf16(avg_pool_result); + } + } + i_fmap += input_w * input_h; + if (const_weight == 0) + weight_arr += kh * kw; + + o_fmap += output_w * output_h; + } + } + free(i_fmap_pad); + + free(avg_pooling_mac_a); + free(avg_pooling_mac_b); + + return 0; +} + +static uint16_t *alloc_input(param_t *p) +{ + uint64_t size = tl_shape_size(&p->ifmap->shape, p->ifmap->fmt); + uint16_t *data = (uint16_t *)malloc(size); + if (!data) + return NULL; + + for (uint64_t i = 0; i < size / sizeof(uint16_t); i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX/2; //100 ~ -100 + val = (float)(rand()-RAND_MAX2)*1000 / (float)RAND_MAX; + data[i] = cvk_convert_fp32_bf16(val);//rand() % 256 - 128; + } + return data; +} + +static uint16_t *alloc_output(param_t *p) +{ + uint64_t size = tl_shape_size(&p->ofmap->shape, p->ofmap->fmt); + return (uint16_t *)malloc(size); +} + +static int pooling_ih_ext(param_t *p, int ih) +{ + int ins = p->ins_h; + int ins_last = p->ins_last_h; + int pad = p->pad_top + p->pad_bottom; + return (ih - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_iw_ext(param_t *p, int iw) +{ + int ins = p->ins_w; + int ins_last = p->ins_last_w; + int pad = p->pad_left + p->pad_right; + return (iw - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_oh(param_t *p, int ih) +{ + int ih_ext = pooling_ih_ext(p, ih); + return (ih_ext - p->kh) / p->stride_h + 1; +} + +static int pooling_ow(param_t *p, int iw) +{ + int iw_ext = pooling_iw_ext(p, iw); + return (iw_ext - p->kw) / p->stride_w + 1; +} + +static void free_pooling_param( + cvk_context_t *cvk_ctx, + param_t *p) +{ + if (p->ifmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->ifmap); + if (p->ofmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->ofmap); +} + +static param_t random_pooling_param(cvk_context_t *cvk_ctx, int stride_w, int stride_h) +{ + param_t p; + +retry: + random_seed = clock(); + srand(random_seed); + + int in = rand() % 5 + 1; + int ic = rand() % (3 * cvk_ctx->info.npu_num) + 1; + int ih = rand() % 30 + 3 + (INVALIDE_STRIDE == stride_h ? 0 : stride_h); + int iw = rand() % 30 + 6 + (INVALIDE_STRIDE == stride_w ? 0 : stride_w); + + memset(&p, 0, sizeof(p)); + p.kh = rand() % 7 + 1; + p.kw = rand() % 7 + 1; + p.stride_h = INVALIDE_STRIDE == stride_h ? rand() % (p.kh) + 1 : stride_h; + p.stride_w = INVALIDE_STRIDE == stride_w ? 
rand() % (p.kh) + 1 : stride_w; + p.ins_h = rand() % p.kh; + p.ins_w = rand() % p.kw; + p.ins_last_h = rand() % p.kh; + p.ins_last_w = rand() % p.kw; + p.pad_top = rand() % p.kh; + p.pad_bottom = rand() % p.kh; + p.pad_left = rand() % p.kw; + p.pad_right= rand() % p.kw; + p.rshift_bits = rand() % 32; + p.avg_pooling_const = cvk_convert_fp32_bf16(rand()%0x1000);//rand() % 256; + cvk_tl_shape_t ifmap_shape; + ifmap_shape.n = in; + ifmap_shape.c = ic; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + + int on = in; + int oc = ic; + int oh = pooling_oh(&p, ih); + int ow = pooling_ow(&p, iw); + cvk_tl_shape_t ofmap_shape; + ofmap_shape.n = on; + ofmap_shape.c = oc; + ofmap_shape.h = oh; + ofmap_shape.w = ow; + + p.ofmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ofmap_shape, CVK_FMT_BF16, 1); + p.ifmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ifmap_shape, CVK_FMT_BF16, 1); + + if ((p.kh > pooling_ih_ext(&p, ih)) + || (p.kw > pooling_iw_ext(&p, iw)) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || !p.ofmap + || !p.ifmap) { + printf("retry init_pooling_param\n"); + free_pooling_param(cvk_ctx, &p); + goto retry; + } + + return p; +} + +static int compare_results( + param_t *p, + uint16_t input[], + uint16_t output[]) +{ + int ret = 0; + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + + uint16_t *output_ref = alloc_output(p); + p->avg_pooling_const = cvk_convert_fp32_bf16(cvk_convert_bf16_fp32(p->avg_pooling_const)/(p->kh * p->kw)); + ret = native_pooling_avg_bf16( + input, &p->avg_pooling_const, NULL, output_ref, + in, ic, ih, iw, p->kh, p->kw, + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right, + p->stride_h, p->stride_w, + p->ins_h, p->ins_w, p->ins_last_h, p->ins_last_w,1 + ); + if (ret) { + free(output_ref); + return ret; + } + + ret = array_cmp_int8( + "Comparing results ...\n", (int8_t*)output_ref, (int8_t*) output, + tl_shape_size(&p->ofmap->shape, p->ofmap->fmt)); + + if (ret) { + printf("Comparison FAILED!!!\n"); + print_pooling_param(p); + ret = -1; + } + + free(output_ref); + + return ret; +} + +static int _test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int stride_w, int stride_h) +{ + param_t p = random_pooling_param(cvk_ctx, stride_w, stride_h); +// print_pooling_param(&p); + + uint16_t *input = alloc_input(&p); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, p.ifmap, (uint8_t *)input); + cvk_ctx->ops->tiu_average_pooling(cvk_ctx, &p); + uint16_t *output = (uint16_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, p.ofmap); + + int ret = compare_results(&p, input, output); + + free_pooling_param(cvk_ctx, &p); + free(output); + free(input); + + return ret; +} + + +static int test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) { + return _test_pooling(rt_handle, cvk_ctx, INVALIDE_STRIDE, INVALIDE_STRIDE); +} + +static int test_avg_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + for (uint64_t i = 0; i < 20; i++) + ret |= test_pooling(rt_handle, cvk_ctx); + + // test stride extend (0, 31] + int stride_list[] = {15, 16, 31}; + int stride_list_len = sizeof(stride_list) / sizeof(stride_list[0]); + + for (int stride_w_idx = 0; stride_w_idx < stride_list_len; stride_w_idx++) { + for (int stride_h_idx = 0; stride_h_idx < stride_list_len; stride_h_idx++) { + int stride_w = stride_list[stride_w_idx]; + int stride_h = stride_list[stride_h_idx]; + + ret |= _test_pooling(rt_handle, cvk_ctx, stride_w, 
stride_h); + } + } + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + int round_mode; + round_mode = cvk_set_store_feround(); + ret = test_avg_pooling(rt_handle, cvk_ctx); + cvk_restore_feround(round_mode); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_bf16_conv.c b/cviruntime/test/180x/test_180x_bf16_conv.c new file mode 100644 index 000000000..acefb4130 --- /dev/null +++ b/cviruntime/test/180x/test_180x_bf16_conv.c @@ -0,0 +1,780 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" +#include "test_native_ref.h" + +#define INVALIDE_STRIDE (-1) +typedef struct { + int random_seed; + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int stride_w; + int output_c; + int using_bias; + int bReLU_EN; + int r_shift_m; + int opd0_sign; + int opd1_sign; + int opd2_sign; + int bf16_enable; +} conv_param_t; + +static void print_conv_param(const conv_param_t *p); + +static inline void bf16_relu(float *buf, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) + if (buf[i] < 0) + buf[i] = 0; +} + +static int conv_ref( + const conv_param_t *p_param, + const uint16_t *ifmap, + const uint16_t *weight, + const uint32_t *bias, + uint16_t *ofmap) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int do_relu = p_param->bReLU_EN; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + float *result = (float *)malloc(sizeof(float) * in * oc * oh * ow); + if (!result) + return -1; + + memset(result, 0, sizeof(float) * in * oc * oh * ow); + int ret = 0; + + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + uint16_t *i_fmap_pad[ic]; + uint16_t *kernel_pad[ic]; + + for (int iic = 0; iic < ic; ++iic) { + i_fmap_pad[iic] = NULL; + kernel_pad[iic] = NULL; + fill_pad_fmap_bf16( + (ifmap + n*ic*ih*iw + iic*ih*iw), &i_fmap_pad[iic], cvk_convert_fp32_bf16(0), + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + //kernel_dilation( + fill_pad_fmap_bf16( + (weight + c*ic*kh*kw + iic*kh*kw), &kernel_pad[iic], cvk_convert_fp32_bf16(0), + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + } + 
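+      /* Reference accumulation (per batch n and output channel c): each
+       * output element is a plain fp32 dot product over the dilated kernel
+       * and the padded/dilated input prepared above, with
+       * oh = (ih_ext - kh_ext)/stride_h + 1 per calc_output_hw, so rounding
+       * to bf16 happens only once, at the final store. */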
+      for (int ph = 0; ph < oh; ++ph) {
+        for (int pw = 0; pw < ow; ++pw) {
+          float result_val = result[n*oc*oh*ow + c*oh*ow + ph*ow + pw];
+          for (int idxh = 0; idxh < kh_ext; ++idxh) {
+            for (int idxw = 0; idxw < kw_ext; ++idxw) {
+              for (int iic = 0; iic < ic; ++iic) {
+                float ifv = cvk_convert_bf16_fp32(i_fmap_pad[iic][(idxh+ph*stride_h) * iw_ext + idxw + pw*stride_w]);
+                float ikv = cvk_convert_bf16_fp32(kernel_pad[iic][idxh * kw_ext + idxw]);
+                result_val += ifv*ikv;
+              }
+            }
+          }
+          result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] = result_val;
+        }
+      }
+
+      if (p_param->using_bias) {
+        for (int ph = 0; ph < oh; ++ph) {
+          for (int pw = 0; pw < ow; ++pw) {
+            result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += cvk_convert_hex_fp32(bias[c]); //bias+c ;
+          }
+        }
+      }
+
+      if (do_relu)
+        bf16_relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow);
+
+      for (int i = 0; i < ic; i++) {
+        if (i_fmap_pad[i]) {
+          free(i_fmap_pad[i]);
+          i_fmap_pad[i] = NULL;
+        }
+        if (kernel_pad[i]) {
+          free(kernel_pad[i]);
+          kernel_pad[i] = NULL;
+        }
+      }
+    } //end for (int c = 0; c < oc; ++c)
+  }
+
+  for (int i = 0; i < in * oc * oh * ow; i++)
+    ofmap[i] = cvk_convert_fp32_bf16(result[i]);
+
+  free(result);
+  return ret;
+}
+
+static uint16_t * transform_weight(const cvk_tl_shape_t *s, uint16_t before[])
+{
+  uint32_t ic = s->n;
+  uint32_t oc = s->c;
+  uint32_t kh = s->h;
+  uint32_t kw = s->w;
+
+  uint32_t size = ic * oc * kh * kw;
+  uint16_t *after = (uint16_t *)malloc(sizeof(uint16_t) * size);
+  if (!after)
+    return NULL;
+
+  /*
+   * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic)
+   */
+  for (uint32_t oci = 0; oci < oc; oci++) {
+    for (uint32_t ici = 0; ici < ic; ici++) {
+      for (uint32_t khi = 0; khi < kh; khi++) {
+        for (uint32_t kwi = 0; kwi < kw; kwi++) {
+          uint32_t src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi;
+          uint32_t dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici;
+          after[dst_i] = before[src_i];
+        }
+      }
+    }
+  }
+
+  return after;
+}
+
+static void put_conv_weight(
+    CVI_RT_HANDLE rt_handle,
+    cvk_context_t *cvk_ctx,
+    const cvk_tl_t *tl,
+    uint16_t *data)
+{
+#if 0
+  const cvk_tl_shape_t *s = &tl->shape;
+  uint32_t ic = s->n;
+  uint32_t oc = s->c;
+  uint32_t kh = s->h;
+  uint32_t kw = s->w;
+
+  bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw*2);
+  bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms));
+  gaddr_t gaddr = bmmem_device_addr(dev_mem);
+  uint16_t *transformed_data = transform_weight(s, data);
+
+  /* we put the weight into region 1. bm_memcpy_s2d regards dev_mem as an
+   * absolute address, so we should pass the absolute address to copy the
+   * weight to the right place.
+ */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //bmmem_device_t ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = bm_memcpy_s2d(*ctx, ab_dev_mem, (u8*)transformed_data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (u8*)transformed_data); + + assert(ret == BM_SUCCESS); + cvk_tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_BF16; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1822_tensor_tgmem_default_stride(tdma_tg.shape, FMT_BF16); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1822_tensor_lmem_default_stride(bk_ctx, tdma_shape, FMT_BF16, 0); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1822_tdma_g2l_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + bmmem_device_free(*ctx, dev_mem); + //delete[] transformed_data; + return transformed_data; +#endif + + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint16_t *transformed_data = transform_weight(s, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(1, oc, kh * kw, ic); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, (uint8_t *)transformed_data); + + free(transformed_data); +} + + +static uint16_t * transform_bias(int oc, uint32_t before[]) +{ + uint16_t *after = (uint16_t *)malloc(2 * sizeof(uint16_t) * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = (before[i] >> 16) & 0xffff; + after[i + oc] = before[i] & 0xffff; + } + return after; +} + +static void put_conv_bias( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + uint32_t *data) +{ +#if 0 + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2 * sizeof(short), oc, 1, 1); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + uint16_t *transformed_data = transform_bias(oc, data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (u8 *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_BF16; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1822_tensor_tgmem_default_stride(tg.shape, FMT_BF16); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, dev_mem); +#endif + + int oc = tl->shape.c; + uint16_t *transformed_data = transform_bias(oc, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(2, oc, 1, 1); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, (uint8_t *)transformed_data); + + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const 
conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static uint16_t * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + uint16_t *buf = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX/2; //5 ~ -5 + val = (float)(rand()-RAND_MAX2)*5 / (float)RAND_MAX; + buf[i] = cvk_convert_fp32_bf16(val); + } + return buf; +} + +static uint16_t * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + uint16_t *buf = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX/2; // 5 ~ -5 + val = (float)(rand()-RAND_MAX2)*5 / (float)RAND_MAX; + buf[i] = cvk_convert_fp32_bf16(val); + } + + return buf; +} + +static uint32_t * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + uint32_t *bias = (uint32_t *)malloc(sizeof(uint32_t) * oc); + if (!bias) + return NULL; + + for (int i = 0; i < oc; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX/2; // 5 ~ -5 + val = (float)(rand()-RAND_MAX2)*5 / (float)RAND_MAX; + bias[i] = cvk_convert_fp32_hex(val); + } + + return bias; +} + +static cvk_tl_t * conv_ifmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = CVK_FMT_BF16;//p->opd0_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 1); +} + +static cvk_tl_t * conv_weight_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = CVK_FMT_BF16;//p->opd1_sign? 
CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static cvk_tl_t * conv_ofmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_tl_shape_t s; + cvk_fmt_t fmt = CVK_FMT_BF16; + s.n = p->input_n; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 1); +} + +static cvk_tl_t * conv_bias_tensor( + cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = CVK_FMT_BF16; + cvk_tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const cvk_tiu_pt_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + + dst->ifmap = conv_ifmap_tensor(cvk_ctx, p); + dst->weight = conv_weight_tensor(cvk_ctx, p); + dst->ofmap = conv_ofmap_tensor(cvk_ctx, p); + dst->bias = NULL; + dst->ps32_mode = 0; + if (p->using_bias) + dst->bias = conv_bias_tensor(cvk_ctx, p); + dst->w_is_const = 0; +} + +static void free_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *r, + const conv_param_t *p) +{ + if (p->using_bias && r->bias) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->bias); + if (r->ofmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ofmap); + if (r->weight) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->weight); + if (r->ifmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ifmap); +} + +static void _init_conv_param(conv_param_t *p, int stride_w, int stride_h) +{ + printf("init_conv_param\n"); + memset(p, 0, sizeof(*p)); + p->random_seed = clock(); + srand(p->random_seed); + +retry: + + p->input_n = rand() % 5 + 1; + p->input_c = rand() % (5 * 32) + 1; + p->kh = rand() % 7 + 1; + p->kw = rand() % 7 + 1; + p->input_h = rand() % 40 + p->kh + (INVALIDE_STRIDE == stride_h ? 0 : stride_h); + p->input_w = rand() % 40 + p->kw + (INVALIDE_STRIDE == stride_w ? 0 : stride_w); + p->output_c = rand() % 10 + 3; + p->stride_h = INVALIDE_STRIDE == stride_h ? rand() % (p->kh) + 1 : stride_h; + p->stride_w = INVALIDE_STRIDE == stride_w ? 
rand() % (p->kh) + 1 : stride_w; + p->ins_h = rand() % p->kh; + p->ins_w = rand() % p->kw; + p->ins_h_last = rand() % p->kh;; + p->ins_w_last = rand() % p->kw;; + p->dh = rand() % 3 + 1; + p->dw = rand() % 3 + 1; + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + p->pad_top = rand() % kh_ext; + p->pad_bot = rand() % kh_ext; + p->pad_left = rand() % kw_ext; + p->pad_right = rand() % kw_ext; + + if (!conv_param_is_ok(p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p->using_bias = rand() % 2; + p->bReLU_EN = rand() % 2; + + p->opd0_sign = rand() % 2; + p->opd1_sign = 1; + p->opd2_sign = 1; + + assert(p->opd1_sign == 1 && p->opd2_sign == 1); + + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); +} + +static void init_conv_param(conv_param_t *p) { + _init_conv_param(p, INVALIDE_STRIDE, INVALIDE_STRIDE); +} + +#if 1 +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", "p->input_w = ", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} +#endif + +static int test_conv( + conv_param_t *p_param, CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + uint16_t *input = alloc_input(p_param); + uint16_t *weight = alloc_weight(p_param); + uint32_t *bias = alloc_bias(p_param); + uint16_t *output_ref = (uint16_t *)malloc(sizeof(uint16_t) * conv_output_size(p_param)); + if (!input || !weight || !bias || !output_ref) { + ret = -1; + goto fail_exit; + } + + //print_conv_param(p_param); + + ret = conv_ref(p_param, input, weight, bias, output_ref); + if (ret) + goto fail_exit; + + cvk_tiu_pt_convolution_param_t conv_param; + make_bmk_conv_param(cvk_ctx, &conv_param, p_param); + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param); + if (tl_alloc_success) { + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, conv_param.ifmap, (uint8_t *)input); + put_conv_weight(rt_handle, cvk_ctx, conv_param.weight, weight); + if 
(p_param->using_bias) + put_conv_bias(rt_handle, cvk_ctx, conv_param.bias, bias); + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &conv_param); + uint16_t *output = (uint16_t *) tensor_copy_l2g_d2s(rt_handle, cvk_ctx, conv_param.ofmap); + + ret = array_cmp_int8( + "Comparing results ...\n", + (int8_t*)output_ref, (int8_t*)output, conv_output_size(p_param)*2); + + if (ret) { + print_conv_param(p_param); + printf("Comparison FAILED\n"); + ret = -1; + } + free(output); + } + + free_bmk_conv_param(cvk_ctx, &conv_param, p_param); + +fail_exit: + free(input); + free(weight); + free(output_ref); + free(bias); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + int round_mode; + round_mode = cvk_set_store_feround(); + + // 20 -> 5 for 1810 + for (int i = 0; i < 5; i++) { + printf("random_test_conv iteration: %d\n", i); + conv_param_t test_conv_param; + init_conv_param(&test_conv_param); + + ret |= test_conv(&test_conv_param, rt_handle, cvk_ctx); + + if (!test_conv_param.using_bias) + test_conv_param.using_bias = 1; + + if (test_conv_param.output_c <= 32) + { + test_conv_param.output_c += 32; + } + ret |= test_conv(&test_conv_param, rt_handle, cvk_ctx); + } + + // test stride extend (0, 31] + int stride_list[] = {15, 16, 31}; + int stride_list_len = sizeof(stride_list) / sizeof(stride_list[0]); + + for (int stride_w_idx = 0; stride_w_idx < stride_list_len; stride_w_idx++) { + for (int stride_h_idx = 0; stride_h_idx < stride_list_len; stride_h_idx++) { + int stride_w = stride_list[stride_w_idx]; + int stride_h = stride_list[stride_h_idx]; + conv_param_t test_conv_param; + _init_conv_param(&test_conv_param, stride_w, stride_h); + + ret |= test_conv(&test_conv_param, rt_handle, cvk_ctx); + } + } + + cvk_restore_feround(round_mode); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_bf16_conv_ps32.c b/cviruntime/test/180x/test_180x_bf16_conv_ps32.c new file mode 100644 index 000000000..5061e5f98 --- /dev/null +++ b/cviruntime/test/180x/test_180x_bf16_conv_ps32.c @@ -0,0 +1,1193 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" +#include "test_native_ref.h" + +typedef struct { + int random_seed; + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int stride_w; + int output_c; + int using_bias; + int bReLU_EN; + int r_shift_m; + int opd0_sign; + int opd1_sign; + int opd2_sign; + int bf16_enable; +} conv_param_t; + +static inline void bf16_relu(float *buf, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) + if (buf[i] < 0) + buf[i] = 0; +} + +static int ps32_conv_ref( + const conv_param_t *p_param, + const uint16_t *ifmap, + const uint16_t *weight, + const uint32_t *bias, + uint16_t *ofmap, int ps32_mode) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; 
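+  /* ps32_mode picks the partial-sum phase (cf. the M2/M3/M1 passes in
+   * test_ps32_ut below): 2 = begin (zero the fp32 accumulator, dump it as
+   * two bf16 halves), 3 = intermediate (reload the halves, accumulate, dump
+   * again), 1 = end (reload, accumulate, then apply bias/ReLU and round to
+   * bf16), 0 = ordinary single-pass convolution. */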
+ int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + float *result = (float *)malloc(sizeof(float) * in * oc * oh * ow); + if (!result) + return -1; + + uint32_t bstride = in * oc * oh * ow; + int ret = 0; + + if (ps32_mode == 2 || ps32_mode == 0) + memset(result, 0, sizeof(float) * in * oc * oh * ow); + else { + for (int i = 0; i < in * oc * oh * ow; i++) { + result[i] = cvk_convert_hex_fp32((ofmap[i + bstride * 0] << 16) | ofmap[i + bstride * 1]); + } + } + + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + uint16_t *i_fmap_pad[ic]; + uint16_t *kernel_pad[ic]; + + for (int iic = 0; iic < ic; ++iic) { + i_fmap_pad[iic] = NULL; + kernel_pad[iic] = NULL; + fill_pad_fmap_bf16( + (ifmap + n*ic*ih*iw + iic*ih*iw), &i_fmap_pad[iic], cvk_convert_fp32_bf16(0), + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + //kernel_dilation( + fill_pad_fmap_bf16( + (weight + c*ic*kh*kw + iic*kh*kw), &kernel_pad[iic], cvk_convert_fp32_bf16(0), + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + } + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + float result_val= result[n*oc*oh*ow + c*oh*ow + ph*ow + pw]; + for (int idxh = 0; idxh < kh_ext; ++idxh) { + for (int idxw = 0; idxw < kw_ext; ++idxw) { + for (int iic = 0; iic < ic; ++iic){ + float ifv = cvk_convert_bf16_fp32(i_fmap_pad[iic][(idxh+ph*stride_h) * iw_ext + idxw + pw*stride_w]); + float ikv = cvk_convert_bf16_fp32(kernel_pad[iic][idxh* kw_ext + idxw]); + result_val += ifv*ikv; + } + } + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] = result_val; + } + } + for(int i = 0; i < ic; i++) { + if (i_fmap_pad[i]) { + free(i_fmap_pad[i]); + i_fmap_pad[i] = NULL; + } + if (kernel_pad[i]) { + free(kernel_pad[i]); + kernel_pad[i] = NULL; + } + } + } //end for (int c = 0; c < oc; ++c) + } + + if( ps32_mode & 0x2) { + for (int i = 0; i < in * oc * oh * ow; i ++) { + ofmap[i] = cvk_convert_fp32_hex(result[i]) >> 16; + ofmap[bstride + i] = cvk_convert_fp32_hex(result[i]) & 0xFFFF; + } + } else { + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += cvk_convert_hex_fp32(bias[c]); //bias+c ; + } + } + } + if (p_param->bReLU_EN) + bf16_relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + } + } + for (int i = 0; i < in * oc * oh * ow; i++) { + ofmap[i] = cvk_convert_fp32_bf16(result[i]); + } + } + free(result); + return ret; +} + +static uint16_t * transform_weight(const cvk_tl_shape_t *s, uint16_t before[]) +{ + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint32_t size = ic * oc * kh * kw; + uint16_t *after = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!after) + return NULL; + + /* + * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) + */ + for (uint32_t oci = 0; oci 
< oc; oci++) { + for (uint32_t ici = 0; ici < ic; ici++) { + for (uint32_t khi = 0; khi < kh; khi++) { + for (uint32_t kwi = 0; kwi < kw; kwi++) { + uint32_t src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + uint32_t dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + uint16_t *data) +{ +#if 0 + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw*2); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + uint16_t *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. bm_memcpy_s2d regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. + */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //bmmem_device_t ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = bm_memcpy_s2d(*ctx, ab_dev_mem, (u8*)transformed_data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (u8*)transformed_data); + + assert(ret == BM_SUCCESS); + cvk_tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_BF16; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1822_tensor_tgmem_default_stride(tdma_tg.shape, FMT_BF16); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1822_tensor_lmem_default_stride(bk_ctx, tdma_shape, FMT_BF16, 0); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1822_tdma_g2l_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + bmmem_device_free(*ctx, dev_mem); + //delete[] transformed_data; + return transformed_data; +#endif + + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint16_t *transformed_data = transform_weight(s, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(1, oc, kh * kw, ic); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, (uint8_t *)transformed_data); + + free(transformed_data); +} + + +static uint16_t * transform_bias(int oc, uint32_t before[]) +{ + uint16_t *after = (uint16_t *)malloc(sizeof(uint16_t) * 2 * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = (before[i] >> 16) & 0xffff; + after[i + oc] = before[i] & 0xffff; + } + return after; +} + +static void put_conv_bias( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + uint32_t *data) +{ +#if 0 + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2 * sizeof(short), oc, 1, 1); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + uint16_t *transformed_data = transform_bias(oc, data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (u8 *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; 
+ tg.start_address = gaddr; + tg.fmt = FMT_BF16; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1822_tensor_tgmem_default_stride(tg.shape, FMT_BF16); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, dev_mem); +#endif + + int oc = tl->shape.c; + uint16_t *transformed_data = transform_bias(oc, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(2, oc, 1, 1); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, (uint8_t *)transformed_data); + + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static uint16_t * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + uint16_t *buf = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) { + buf[i] = cvk_convert_fp32_bf16(i); + } + + return buf; +} + +static uint16_t * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + uint16_t *buf = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) { + buf[i] = cvk_convert_fp32_bf16(i); + } + + return buf; +} + +static uint32_t * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + uint32_t *bias = (uint32_t *)malloc(sizeof(uint32_t) * oc); + if (!bias) + return NULL; + + float val = 100; + for (int i = 0; i < oc; i++) { + bias[i] = cvk_convert_fp32_hex(val); + val += 1; + } + return bias; +} + +static cvk_tl_t * conv_ifmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = CVK_FMT_BF16; + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 1); +} + +static uint32_t conv_ifmap_tensor_size(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = CVK_FMT_BF16; + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, s, fmt, 1); +} + +static cvk_tl_t * conv_weight_tensor(cvk_context_t 
*cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = CVK_FMT_BF16; //p->opd1_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static uint32_t conv_weight_tensor_to_size(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = CVK_FMT_BF16; //p->opd1_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, s, fmt, 0); +} + +static cvk_tl_t * conv_ofmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_tl_shape_t s; + s.n = p->input_n * 4; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + cvk_tl_t *tl = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, CVK_FMT_BF16, 1); + if (tl) + tl->shape.n = p->input_n; + return tl; +} + +static uint32_t conv_ofmap_tensor_to_size(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_tl_shape_t s; + s.n = p->input_n * sizeof(uint32_t) / sizeof(uint8_t); + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, s, CVK_FMT_BF16, 1); +} + +static cvk_tl_t * conv_bias_tensor( + cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = CVK_FMT_BF16;//p->opd2_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static uint32_t conv_bias_tensor_size( + cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = CVK_FMT_BF16;//p->opd2_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const cvk_tiu_pt_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + if(p->ps32_mode==1) + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param_ps32( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *dst, + const conv_param_t *p, uint32_t ps32_mode) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + if(ps32_mode==2) + { + uint32_t ifmap_size = conv_ifmap_tensor_size(cvk_ctx, p); + uint32_t weight_size = conv_weight_tensor_to_size(cvk_ctx, p); + uint32_t ofmap_size = conv_ofmap_tensor_to_size(cvk_ctx, p); + uint32_t bias_size = p->using_bias ? conv_bias_tensor_size(cvk_ctx, p) : 0; + uint32_t total_size = ifmap_size + weight_size + ofmap_size + bias_size; + + // Allocation if size fit. 
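+  // (Probing the summed footprint against lmem_size up front lets an
+  // oversized random config be skipped instead of failing one of the
+  // lmem_alloc_tensor calls partway through; note the ofmap budget above
+  // reserves 4x n so local memory can hold the widened ps32 partial sums.)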
+ if (total_size <= cvk_ctx->info.lmem_size) { + dst->ifmap = conv_ifmap_tensor(cvk_ctx, p); + dst->weight = conv_weight_tensor(cvk_ctx, p); + dst->ofmap = conv_ofmap_tensor(cvk_ctx, p); + } else { + dst->ifmap = NULL; + dst->weight = NULL; + dst->ofmap = NULL; + } + } + + dst->ps32_mode = ps32_mode; + + dst->bias = NULL; + dst->relu_enable = 0; + dst->rshift_bits = 0; + if(ps32_mode==1) + { + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + // only mode=1 can use bias + if (p->using_bias) + dst->bias = conv_bias_tensor(cvk_ctx, p); + } +} + +static void make_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(*dst)); + + uint32_t ifmap_size = conv_ifmap_tensor_size(cvk_ctx, p); + uint32_t weight_size = conv_weight_tensor_to_size(cvk_ctx, p); + uint32_t ofmap_size = conv_ofmap_tensor_to_size(cvk_ctx, p); + uint32_t bias_size = p->using_bias ? conv_bias_tensor_size(cvk_ctx, p) : 0; + uint32_t total_size = ifmap_size + weight_size + ofmap_size + bias_size; + + // Allocation if size fit. + if (total_size <= cvk_ctx->info.lmem_size) { + dst->ifmap = conv_ifmap_tensor(cvk_ctx, p); + dst->weight = conv_weight_tensor(cvk_ctx, p); + dst->ofmap = conv_ofmap_tensor(cvk_ctx, p); + + if (p->using_bias) + dst->bias = conv_bias_tensor(cvk_ctx, p); + } else { + dst->ifmap = NULL; + dst->weight = NULL; + dst->ofmap = NULL; + } + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + // dst->ifmap = conv_ifmap_tensor(cvk_ctx, p); + // dst->weight = conv_weight_tensor(cvk_ctx, p); + // dst->ofmap = conv_ofmap_tensor(cvk_ctx, p); + // dst->bias = NULL; + dst->ps32_mode = 0; + // if (p->using_bias) + // dst->bias = conv_bias_tensor(cvk_ctx, p); +} + +static void free_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *r, + const conv_param_t *p) +{ + if (p->using_bias && r->bias) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->bias); + + if (r->ofmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ofmap); + + if (r->weight) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->weight); + + if (r->ifmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ifmap); +} + +static void init_conv_param(conv_param_t *p) +{ + printf("init_conv_param\n"); + p->random_seed = clock(); + srand(p->random_seed); + +retry: + + memset(p, 0, sizeof(*p)); + p->input_n = 1; + p->input_c = rand() % (10) + 2; + p->kh = rand() % 6 + 1; + p->kw = rand() % 6 + 1; + p->input_h = rand() % 10 + p->kh; + p->input_w = rand() % 10 + p->kw; + p->output_c = rand() % 10 + 3; + p->stride_h = rand() % (p->kh) + 1; + p->stride_w = rand() % (p->kw) + 1; + p->ins_h = rand() % p->kh; + p->ins_w = rand() % p->kw; + p->ins_h_last = rand() % p->kh;; + p->ins_w_last = rand() % p->kw;; + p->dh = rand() % 3 + 1; + p->dw = rand() % 3 + 1; + + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + p->pad_top = rand() % kh_ext; + p->pad_bot = rand() % kh_ext; + p->pad_left = rand() % kw_ext; + p->pad_right = rand() % kw_ext; + + if (!conv_param_is_ok(p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p->using_bias = rand() % 2; + p->r_shift_m = rand() % 8; + 
p->bReLU_EN = rand() % 2; + p->opd0_sign = rand() % 2; + p->opd1_sign = 1; + p->opd2_sign = 1; + + assert(p->opd1_sign == 1 && p->opd2_sign == 1); + + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); +} + +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", "p->input_w = ", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} + +static int test_ps32_ut( + conv_param_t *p_param, CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + printf("test_ps32_ut\n"); + int ret = 0; + uint16_t *input = alloc_input(p_param); + uint16_t *weight = alloc_weight(p_param); + uint32_t *bias = alloc_bias(p_param); + uint16_t *output_ref = (uint16_t *)malloc(sizeof(uint32_t) * conv_output_size(p_param)); + if (!input || !weight || !bias || !output_ref) { + ret = -1; + goto fail_exit; + } + + ret = ps32_conv_ref(p_param, input, weight, bias, output_ref, 2); + if (ret) + goto fail_exit; + + cvk_tiu_pt_convolution_param_t conv_param; + make_bmk_conv_param_ps32(cvk_ctx, &conv_param, p_param, 2); + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param); + if (tl_alloc_success) { + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, conv_param.ifmap, (uint8_t *)input); + put_conv_weight(rt_handle, cvk_ctx, conv_param.weight, weight); + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &conv_param); + cvk_tl_t ps32_ofmap; + ps32_ofmap = *conv_param.ofmap; + ps32_ofmap.shape.n = ps32_ofmap.shape.n * sizeof(short); + uint16_t *output = (uint16_t*) tensor_copy_l2g_d2s(rt_handle, cvk_ctx, &ps32_ofmap); + + ret = array_cmp_int8( + "Comparing M2 begin_mode results ...\n", + (int8_t*)output_ref, (int8_t *)output, conv_output_size(p_param) * sizeof(int)); + + if (ret) { + print_conv_param(p_param); + printf("Comparison M2 FAILED\n"); + exit(-1); + } else + printf("Comparison M2 PASS\n"); + + free(output); + } + free_bmk_conv_param(cvk_ctx, &conv_param, p_param); + + 
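+  /*
+   * ps32_mode 3 ("intermediate" pass): keeps accumulating into the
+   * 32-bit partial-sum buffer; bias, relu and rshift stay disabled and
+   * the ofmap is read back in the same widened 32-bit layout as mode 2.
+   */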
printf("test_ps32_intermediate_mode\n"); + for (int i=0; i < conv_input_size(p_param); i++) + input[i] = cvk_convert_fp32_bf16(i); + + for (int i=0; i < conv_weight_size(p_param); i++) + weight[i] = cvk_convert_fp32_bf16(i); + + ret = ps32_conv_ref(p_param, input, weight, bias, output_ref, 3); + if (ret) + return ret; + + make_bmk_conv_param_ps32(cvk_ctx, &conv_param, p_param, 3); + tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param); + + if (tl_alloc_success) { + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, conv_param.ifmap, (uint8_t *)input); + put_conv_weight(rt_handle, cvk_ctx, conv_param.weight, weight); + + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &conv_param); + + cvk_tl_t ps32_ofmap; + ps32_ofmap = *conv_param.ofmap; + ps32_ofmap.shape.n = ps32_ofmap.shape.n * sizeof(short); + + uint16_t *output = (uint16_t*) tensor_copy_l2g_d2s(rt_handle, cvk_ctx, &ps32_ofmap); + + ret = array_cmp_int8( + "Comparing M3 intermediate results ...\n", + (int8_t*)output_ref, (int8_t *)output, conv_output_size(p_param) * sizeof(int)); + + if (ret) { + print_conv_param(p_param); + printf("Comparison M3 FAILED\n"); + exit(-1); + } else + printf("Comparison M3 PASS\n"); + + free(output); + } + free_bmk_conv_param(cvk_ctx, &conv_param, p_param); + + printf("test_ps32_end_mode\n"); + for (int i=0; i < conv_input_size(p_param); i++) + input[i] = cvk_convert_fp32_bf16(i); + + for (int i=0; i < conv_weight_size(p_param); i++) + weight[i] = cvk_convert_fp32_bf16(i); + + ret = ps32_conv_ref(p_param, input, weight, bias, output_ref, 1); + if (ret) + return ret; + + make_bmk_conv_param_ps32(cvk_ctx, &conv_param, p_param, 1); + + tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param); + + if (tl_alloc_success) { + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, conv_param.ifmap, (uint8_t *)input); + put_conv_weight(rt_handle, cvk_ctx, conv_param.weight, weight); + if (p_param->using_bias) { + put_conv_bias(rt_handle, cvk_ctx, conv_param.bias, bias); + } + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &conv_param); + uint16_t *output = (uint16_t*) tensor_copy_l2g_d2s(rt_handle, cvk_ctx, conv_param.ofmap); + + ret = array_cmp_int8( + "Comparing M1 end results ...\n", + (int8_t*)output_ref, (int8_t *)output, conv_output_size(p_param) * 2); + + if (ret) { + print_conv_param(p_param); + printf("Comparison M1 FAILED\n"); + exit(-1); + } else + printf("Comparison M1 PASS\n"); + + free(output); + } + free_bmk_conv_param(cvk_ctx, &conv_param, p_param); + +fail_exit: + free(input); + free(weight); + free(bias); + free(output_ref); + + return ret; +} + +static int test_ic_tiling_conv( + conv_param_t *p_param, CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + printf("test tiled ps32 conv\n"); + int ret = 0; + uint32_t output_size = sizeof(uint16_t) * conv_output_size(p_param); + uint16_t *input = alloc_input(p_param); + uint16_t *weight = alloc_weight(p_param); + uint32_t *bias = alloc_bias(p_param); + uint16_t *output_ref = (uint16_t *)malloc(output_size); + if (!input || !weight || !bias || !output_ref) { + ret = -1; + goto fail_exit; + } + + p_param->r_shift_m = 0; + memset((uint8_t*)output_ref, 0, conv_output_size(p_param)*2); + ret = ps32_conv_ref(p_param, input, weight, bias, output_ref, 0); + if (ret) + goto fail_exit; + + cvk_tiu_pt_convolution_param_t conv_tmp_param; + cvk_tiu_pt_convolution_param_t conv_param; + make_bmk_conv_param(cvk_ctx, &conv_param, p_param); + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param); + if (tl_alloc_success) { + if (p_param->using_bias) { + 
conv_tmp_param.bias = conv_param.bias; + put_conv_bias(rt_handle, cvk_ctx, conv_param.bias, bias); + } + // for ps32_md[1] = 1, relu_enable & rshift_bits need to set to 0 + // so we store those parameters to conv_tmp_para + conv_tmp_param.relu_enable = conv_param.relu_enable; + conv_tmp_param.rshift_bits = conv_param.rshift_bits; + conv_tmp_param.bias = conv_param.bias; + + uint32_t ic_step = 1; + uint32_t n_step = 1; + cvk_tl_t ifmap = *conv_param.ifmap; + cvk_tl_t ofmap = *conv_param.ofmap; + cvk_tg_shape_t s; + s.n = conv_param.ifmap->shape.n; + s.c = conv_param.ifmap->shape.c; + s.h = conv_param.ifmap->shape.h; + s.w = conv_param.ifmap->shape.w; + cvk_tg_t *tg_ifmap = alloc_tensor_dev_mem(rt_handle, cvk_ctx, s, CVK_FMT_BF16); + tensor_copy_s2d(rt_handle, tg_ifmap, (uint8_t *)input); + + s.n = conv_param.weight->shape.n; + s.c = conv_param.weight->shape.c; + s.h = conv_param.weight->shape.h; + s.w = conv_param.weight->shape.w; + uint16_t *transformed_weight = + transform_weight(&conv_param.weight->shape, (uint16_t *)weight); + cvk_tg_t *tg_weight = alloc_tensor_dev_mem(rt_handle, cvk_ctx, s, CVK_FMT_BF16); + tensor_copy_s2d(rt_handle, tg_weight, (uint8_t *)transformed_weight); + free(transformed_weight); + + cvk_tl_shape_t cur_tl_ifmap_shape = { + n_step, + ic_step, + ifmap.shape.h, + ifmap.shape.w + }; + + cvk_tg_shape_t cur_tg_ifmap_shape = { + cur_tl_ifmap_shape.n, + cur_tl_ifmap_shape.c, + cur_tl_ifmap_shape.h, + cur_tl_ifmap_shape.w + }; + + cvk_tg_stride_t cur_tg_ifmap_stride = { + tg_ifmap->stride.n, + tg_ifmap->stride.c, + tg_ifmap->stride.h, + fmt_size(CVK_FMT_BF16) + }; + + cvk_tg_t cur_tg_ifmap; + cur_tg_ifmap.base_reg_index = 0; + cur_tg_ifmap.start_address = tg_ifmap->start_address; + cur_tg_ifmap.shape = cur_tg_ifmap_shape; + cur_tg_ifmap.stride = cur_tg_ifmap_stride; + cur_tg_ifmap.fmt = CVK_FMT_BF16; + + cvk_tl_t cur_tl_ifmap; + cur_tl_ifmap.shape = cur_tl_ifmap_shape; + cur_tl_ifmap.stride = + cvk_ctx->ops->tl_default_stride(cvk_ctx, cur_tl_ifmap_shape, CVK_FMT_BF16, 1); + cur_tl_ifmap.start_address = ifmap.start_address; + cur_tl_ifmap.fmt = ifmap.fmt; + + cvk_tl_t cur_tl_ofmap; + cur_tl_ofmap.start_address = ofmap.start_address; + cur_tl_ofmap.shape = ofmap.shape; + cur_tl_ofmap.shape.n = n_step; + cur_tl_ofmap.stride = + cvk_ctx->ops->tl_default_stride(cvk_ctx, cur_tl_ofmap.shape, CVK_FMT_BF16, 1); + cur_tl_ofmap.fmt = ofmap.fmt; + + cvk_tl_t cur_tl_weight; + cur_tl_weight.start_address = conv_param.weight->start_address; + cur_tl_weight.shape = conv_param.weight->shape; + cur_tl_weight.shape.n = ic_step; + cur_tl_weight.stride.n = 2; + cur_tl_weight.stride.c = cur_tl_weight.shape.n * cur_tl_weight.shape.h * cur_tl_weight.shape.w * 2; + cur_tl_weight.stride.h = cur_tl_weight.shape.n * cur_tl_weight.shape.w * 2; + cur_tl_weight.stride.w = cur_tl_weight.shape.n * 2; + cur_tl_weight.fmt = conv_param.weight->fmt; + + const cvk_tl_t *saved_tl_weight = conv_param.weight; + const cvk_tl_t *saved_tl_ifmap = conv_param.ifmap; + for (uint32_t ci = 0; ci < ifmap.shape.c; ci += ic_step) { + { + uint32_t ic = tg_weight->shape.n; + uint32_t oc = tg_weight->shape.c; + uint32_t kh = tg_weight->shape.h; + uint32_t kw = tg_weight->shape.w; + + cvk_tg_t cur_tdma_tg_weight; + cur_tdma_tg_weight.base_reg_index = tg_weight->base_reg_index; + cur_tdma_tg_weight.start_address = tg_weight->start_address + ci * (tg_weight->fmt == CVK_FMT_BF16 ? 
2 : 1); + cur_tdma_tg_weight.fmt = tg_weight->fmt; + cur_tdma_tg_weight.shape = tg_shape_t4(1, oc, kh * kw, ic); + cur_tdma_tg_weight.stride = + cvk_ctx->ops->tg_default_stride(cvk_ctx, cur_tdma_tg_weight.shape, CVK_FMT_BF16); + cur_tdma_tg_weight.shape = tg_shape_t4(1, oc, kh * kw, ic_step); + + cvk_tl_t cur_tdma_tl_weight; + cur_tdma_tl_weight = cur_tl_weight; + cur_tdma_tl_weight.shape.n = cur_tdma_tg_weight.shape.n; + cur_tdma_tl_weight.shape.c = cur_tdma_tg_weight.shape.c; + cur_tdma_tl_weight.shape.h = cur_tdma_tg_weight.shape.h; + cur_tdma_tl_weight.shape.w = cur_tdma_tg_weight.shape.w; + cur_tdma_tl_weight.stride = cvk_ctx->ops->tl_default_stride( + cvk_ctx, cur_tdma_tl_weight.shape, cur_tdma_tl_weight.fmt, 0); + + cvk_tdma_g2l_tensor_copy_param_t p1; + memset(&p1, 0, sizeof(p1)); + p1.src = &cur_tdma_tg_weight; + p1.dst = &cur_tdma_tl_weight; + cvk_ctx->ops->tdma_g2l_bf16_tensor_copy(cvk_ctx, &p1); + CVI_RT_Submit(cvk_ctx); + } + { + cvk_tdma_g2l_tensor_copy_param_t p2; + memset(&p2, 0, sizeof(p2)); + cur_tg_ifmap.start_address = + tg_ifmap->start_address + ci * tg_ifmap->stride.c; + p2.src = &cur_tg_ifmap; + p2.dst = &cur_tl_ifmap; + cvk_ctx->ops->tdma_g2l_bf16_tensor_copy(cvk_ctx, &p2); + CVI_RT_Submit(cvk_ctx); + } + + conv_param.ifmap = &cur_tl_ifmap; + conv_param.weight = &cur_tl_weight; + + // for ps32_md[1] = 1, relu_enable & rshift_bits need to set to 0 + // so we store those parameters to conv_tmp_para + if (ci == (ifmap.shape.c - 1)) + { + conv_param.relu_enable = conv_tmp_param.relu_enable; + conv_param.rshift_bits = conv_tmp_param.rshift_bits; + conv_param.bias = conv_tmp_param.bias; + conv_param.ps32_mode = 1; + } + else if (ci == 0) + { + conv_param.relu_enable = 0; + conv_param.rshift_bits = 0; + conv_param.bias = 0;; + conv_param.ps32_mode = 2; + } + else + { + conv_param.relu_enable = 0; + conv_param.rshift_bits = 0; + conv_param.bias = 0;; + conv_param.ps32_mode = 3; + } + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &conv_param); + conv_param.weight = saved_tl_weight; + conv_param.ifmap = saved_tl_ifmap; + } + + uint16_t *output = (uint16_t*) tensor_copy_l2g_d2s(rt_handle, cvk_ctx, conv_param.ofmap); + + free_tensor_dev_mem(rt_handle, tg_ifmap); + free_tensor_dev_mem(rt_handle, tg_weight); + + ret = array_cmp_int8( + "Comparing results ...\n", + (int8_t*) output_ref, (int8_t *)output, conv_output_size(p_param)*2); + if (ret) { + print_conv_param(p_param); + printf("Comparison FAILED\n"); + ret = -1; + } + free(output); + } + free_bmk_conv_param(cvk_ctx, &conv_param, p_param); + +fail_exit: + free(input); + free(weight); + free(output_ref); + free(bias); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + int round_mode; + round_mode = cvk_set_store_feround(); + + for (int i = 0; i < 15; i++) { + printf("random_test_conv iteration: %d\n", i); + conv_param_t test_conv_param; + init_conv_param(&test_conv_param); + //print_conv_param(&test_conv_param); + ret |= test_ic_tiling_conv(&test_conv_param, rt_handle, cvk_ctx); + if (ret) + break; + ret |= test_ps32_ut(&test_conv_param, rt_handle, cvk_ctx); + if (ret) + break; + + if (!test_conv_param.using_bias) + test_conv_param.using_bias = 1; + if (test_conv_param.output_c <= 9) + test_conv_param.output_c += 3; + 
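+    // Second round of the same iteration: bias is now forced on and
+    // output_c widened, then both the tiled and single-shot ps32 tests
+    // are repeated with the tweaked parameters.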
+    //print_conv_param(&test_conv_param);
+    ret |= test_ic_tiling_conv(&test_conv_param, rt_handle, cvk_ctx);
+    if (ret)
+      break;
+
+    ret |= test_ps32_ut(&test_conv_param, rt_handle, cvk_ctx);
+    if (ret)
+      break;
+  }
+  cvk_restore_feround(round_mode);
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/180x/test_180x_bf16_conv_zero_ratio.c b/cviruntime/test/180x/test_180x_bf16_conv_zero_ratio.c
new file mode 100644
index 000000000..04f4510bc
--- /dev/null
+++ b/cviruntime/test/180x/test_180x_bf16_conv_zero_ratio.c
@@ -0,0 +1,810 @@
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <inttypes.h>
+
+#include "test_cvikernel_util.h"
+#include "test_native_ref.h"
+
+typedef struct {
+  uint16_t *conv_input;
+  uint16_t *conv_weight;
+  uint32_t *conv_bias;
+  uint16_t *conv_output;
+  uint16_t *conv_output_ref;
+} u_test_data;
+
+typedef struct {
+  int random_seed;
+  int input_n;
+  int input_c;
+  int input_h;
+  int input_w;
+  int kw;
+  int kh;
+  int dh;
+  int dw;
+  int pad_top;
+  int pad_bot;
+  int pad_left;
+  int pad_right;
+  int ins_h;
+  int ins_h_last;
+  int ins_w;
+  int ins_w_last;
+  int stride_h;
+  int stride_w;
+  int output_c;
+  int using_bias;
+  int bReLU_EN;
+  int r_shift_m;
+  int opd0_sign;
+  int opd1_sign;
+  int opd2_sign;
+  int izratio;
+  int kzratio;
+} conv_param_t;
+
+conv_param_t conv_param;
+u_test_data u16_test_data;
+cvk_tiu_pt_convolution_param_t bmk_conv_param;
+
+cvk_tl_t *skip_tensor_lmem[10];
+uint32_t skip_tensor_num = 0;
+
+/* Frees must be issued in reverse allocation order; skip_tensor_lmem
+ * records the padding tensor created for each allocation. */
+void skip_tensor_lmem_size(cvk_context_t *cvk_ctx, const cvk_tl_t *p)
+{
+  uint32_t needed = align_up(p->shape.n * p->stride.n, cvk_ctx->info.eu_num);
+  uint32_t start_addr = p->start_address + needed; //src_shape.n*src_shape.c*src_shape.h*src_shape.w/32;
+  uint32_t remain_size = start_addr % cvk_ctx->info.lmem_bank_size ?
(cvk_ctx->info.lmem_bank_size - start_addr % cvk_ctx->info.lmem_bank_size) : 0; // remain size for each lane + if(remain_size) + { + cvk_tl_shape_t src_shape2 = {1, cvk_ctx->info.npu_num, 1, remain_size}; + skip_tensor_lmem[skip_tensor_num] = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, src_shape2, CVK_FMT_BF16, 1); // skip the lmem size and next tl can alignment to bank size + } + skip_tensor_num++; +} + +void free_skip_tensor_lmem(cvk_context_t *cvk_ctx) +{ + if(skip_tensor_lmem[--skip_tensor_num]!=NULL) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, skip_tensor_lmem[skip_tensor_num]); +} + +static inline void bf16_relu(float *buf, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) + if (buf[i] < 0) + buf[i] = 0; +} + +static int conv_ref( + const conv_param_t *p_param, + const uint16_t *ifmap, + const uint16_t *weight, + const uint32_t *bias, + uint16_t *ofmap) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int do_relu = p_param->bReLU_EN; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + float *result = (float *)malloc(sizeof(float) * in * oc * oh * ow); + if (!result) + return -1; + + memset(result, 0, sizeof(float) * in * oc * oh * ow); + int ret = 0; + + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + uint16_t *i_fmap_pad[ic]; + uint16_t *kernel_pad[ic]; + + for (int iic = 0; iic < ic; ++iic) { + i_fmap_pad[iic] = NULL; + kernel_pad[iic] = NULL; + fill_pad_fmap_bf16( + (ifmap + n*ic*ih*iw + iic*ih*iw), &i_fmap_pad[iic], cvk_convert_fp32_bf16(0), + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + //kernel_dilation( + fill_pad_fmap_bf16( + (weight + c*ic*kh*kw + iic*kh*kw), &kernel_pad[iic], cvk_convert_fp32_bf16(0), + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + } + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + float result_val = result[n*oc*oh*ow + c*oh*ow + ph*ow + pw]; + for (int idxh = 0; idxh < kh_ext; idxh += dh) { + for (int idxw = 0; idxw < kw_ext; idxw += dw) { + for (int iic = 0; iic < ic; ++iic){ + float ifv = cvk_convert_bf16_fp32(i_fmap_pad[iic][(idxh+ph*stride_h) * iw_ext + idxw + pw*stride_w]); + float ikv = cvk_convert_bf16_fp32(kernel_pad[iic][idxh* kw_ext + idxw]); + result_val += ifv*ikv; + } + } + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] = result_val; + } + } + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += cvk_convert_hex_fp32(bias[c]); //bias+c ; + } + } + } + if (do_relu) + bf16_relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + for(int i = 0 ;in; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + 
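+  /* Reorder from the framework's (oc, ic, kh, kw) weight layout into the
+   * TIU's (1, oc, kh*kw, ic) layout, making ic the innermost dimension. */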
uint32_t size = ic * oc * kh * kw; + uint16_t *after = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!after) + return NULL; + + /* + * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) + */ + for (uint32_t oci = 0; oci < oc; oci++) { + for (uint32_t ici = 0; ici < ic; ici++) { + for (uint32_t khi = 0; khi < kh; khi++) { + for (uint32_t kwi = 0; kwi < kw; kwi++) { + uint32_t src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + uint32_t dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + uint16_t *data) +{ +#if 0 + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw*2); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + uint16_t *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. bm_memcpy_s2d regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. + */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //bmmem_device_t ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = bm_memcpy_s2d(*ctx, ab_dev_mem, (u8*)transformed_data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (u8*)transformed_data); + + assert(ret == BM_SUCCESS); + cvk_tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_BF16; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1822_tensor_tgmem_default_stride(tdma_tg.shape, FMT_BF16); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1822_tensor_lmem_default_stride(bk_ctx, tdma_shape, FMT_BF16, 0); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1822_tdma_g2l_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + bmmem_device_free(*ctx, dev_mem); + //delete[] transformed_data; + return transformed_data; +#endif + + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint16_t *transformed_data = transform_weight(s, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(1, oc, kh * kw, ic); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, (uint8_t *)transformed_data); + + free(transformed_data); +} + + +static uint16_t * transform_bias(int oc, uint32_t before[]) +{ + uint16_t *after = (uint16_t *)malloc(2 * sizeof(uint16_t) * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = (before[i] >> 16) & 0xffff; + after[i + oc] = before[i] & 0xffff; + } + return after; +} + +static void put_conv_bias( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + uint32_t *data) +{ +#if 0 + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2 * sizeof(short), oc, 1, 1); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = 
bmmem_device_addr(dev_mem); + uint16_t *transformed_data = transform_bias(oc, data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (u8 *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_BF16; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1822_tensor_tgmem_default_stride(tg.shape, FMT_BF16); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, dev_mem); +#endif + + int oc = tl->shape.c; + uint16_t *transformed_data = transform_bias(oc, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(2, oc, 1, 1); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, (uint8_t *)transformed_data); + + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static uint16_t * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + uint16_t *buf = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) { + if (p->izratio == 0) //almost 100% not zero + buf[i] = cvk_convert_fp32_bf16(rand() % 256 - 128); + else if (p->izratio == 1) + buf[i] = cvk_convert_fp32_bf16(rand() % 2 ? rand() % 256 - 128 : 0); + else + buf[i] = 0; + } + return buf; +} + +static uint16_t * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + uint16_t *buf = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) { + if (p->kzratio == 0) //almost 100% not zero + buf[i] = cvk_convert_fp32_bf16(rand() % 256 - 128); + else if (p->kzratio == 1) + buf[i] = cvk_convert_fp32_bf16(rand() % 2 ? 
rand() % 256 - 128 : 0); + else + buf[i] = 0; + } + return buf; +} + +static uint32_t * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + uint32_t *bias = (uint32_t *)malloc(sizeof(uint32_t) * oc); + if (!bias) + return NULL; + + for (int i = 0; i < oc; i++) + bias[i] = cvk_convert_fp32_hex(rand() % 65536 - 32768); + + return bias; +} + +static cvk_tl_t * conv_ifmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + //cvk_fmt_t fmt = p->opd0_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_fmt_t fmt = CVK_FMT_BF16; + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 1); +} + +static cvk_tl_t * conv_weight_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + //cvk_fmt_t fmt = p->opd1_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_fmt_t fmt = CVK_FMT_BF16; + cvk_tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static cvk_tl_t * conv_ofmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, CVK_FMT_BF16, 1); +} + +static cvk_tl_t * conv_bias_tensor( + cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + //cvk_fmt_t fmt = p->opd2_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_fmt_t fmt = CVK_FMT_BF16; + cvk_tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const cvk_tiu_pt_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + + dst->ifmap = conv_ifmap_tensor(cvk_ctx, p); + skip_tensor_lmem_size(cvk_ctx, dst->ifmap); + dst->weight = conv_weight_tensor(cvk_ctx, p); + skip_tensor_lmem_size(cvk_ctx, dst->weight); + dst->ofmap = conv_ofmap_tensor(cvk_ctx, p); + skip_tensor_lmem_size(cvk_ctx, dst->ofmap); + dst->bias = NULL; + dst->ps32_mode = 0; + if (p->using_bias) + { + dst->bias = conv_bias_tensor(cvk_ctx, p); + skip_tensor_lmem_size(cvk_ctx, dst->bias); + } + dst->w_is_const = 0; +} + +static void free_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *r, + const conv_param_t *p) +{ + if 
(p->using_bias && r->bias) + { + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->bias); + } + if (r->ofmap) + { + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ofmap); + } + if (r->weight) + { + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->weight); + } + if (r->ifmap) + { + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ifmap); + } +} + +static void init_conv_param(conv_param_t *p) +{ +retry: + p->input_n = 1; + p->input_c = 16; + p->input_h = 2; + p->input_w = 600; + + p->kh = 2; + p->kw = 16; + p->output_c = 16; + + p->stride_h = 1; + p->stride_w = 15; + p->ins_h = 0; + p->ins_w = 0; + p->ins_h_last = 0;; + p->ins_w_last = 0;; + p->dh = 1; + p->dw = 1; + + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + p->pad_top = 1; + p->pad_bot = 0; + p->pad_left = 0; + p->pad_right = 0; + + if (!conv_param_is_ok(p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p->using_bias = 0; + p->r_shift_m = 7; + p->bReLU_EN = 1; + + p->opd0_sign = 0; + p->opd1_sign = 1; + p->opd2_sign = 1; + + assert(p->opd1_sign == 1 && p->opd2_sign == 1); + + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); + +} + +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", "p->input_w = ", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} + +static int setup_conv( + conv_param_t *p_param, CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + u16_test_data.conv_input = alloc_input(p_param); + u16_test_data.conv_weight = alloc_weight(p_param); + u16_test_data.conv_bias = alloc_bias(p_param); + //p_param->r_shift_m = calc_rshift_m(p_param, s8_test_data.conv_weight); + u16_test_data.conv_output_ref = (uint16_t *)malloc(sizeof(uint16_t) * conv_output_size(p_param)); + if (!u16_test_data.conv_output_ref) + return -1; + + int ret = 
conv_ref(p_param, u16_test_data.conv_input, u16_test_data.conv_weight, u16_test_data.conv_bias, u16_test_data.conv_output_ref); + if (ret) + return ret; + + make_bmk_conv_param(cvk_ctx, &bmk_conv_param , p_param); + + bmk_conv_param_alloc_ok(&bmk_conv_param, p_param); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, bmk_conv_param.ifmap, (uint8_t *)u16_test_data.conv_input); + put_conv_weight(rt_handle, cvk_ctx, bmk_conv_param.weight, u16_test_data.conv_weight); + if (p_param->using_bias) + put_conv_bias(rt_handle, cvk_ctx, bmk_conv_param.bias, u16_test_data.conv_bias); + + return 0; +} + +void get_result(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + u16_test_data.conv_output = (uint16_t*) tensor_copy_l2g_d2s(rt_handle, cvk_ctx, bmk_conv_param.ofmap); +} + +void check_result() +{ + int has_error = array_cmp_int8( + "conv Comparing results ...\n", + (int8_t*)u16_test_data.conv_output_ref, (int8_t *)u16_test_data.conv_output, conv_output_size(&conv_param)*2); + + if (has_error) { + print_conv_param(&conv_param); + printf("Comparison FAILED\n"); + exit(-1); + } + +} + +void trigger_max_power(cvk_context_t *cvk_ctx) +{ + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &bmk_conv_param); + CVI_RT_Submit(cvk_ctx); +} + +void free_s8_data() +{ + free(u16_test_data.conv_input); + free(u16_test_data.conv_weight); + free(u16_test_data.conv_bias); + free(u16_test_data.conv_output); + free(u16_test_data.conv_output_ref); +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + for (int i = 0; i < 3; i++) { + for (int k = 0; k < 3; k++) { + printf("bf16 conv zero ratio test: ( %d ) ( %d )\n",i,k); + init_conv_param(&conv_param); + conv_param.izratio = i; + conv_param.kzratio = k; + ret |= setup_conv(&conv_param, rt_handle, cvk_ctx); + + trigger_max_power(cvk_ctx); + get_result(rt_handle, cvk_ctx); + check_result(); + + free_bmk_conv_param(cvk_ctx, &bmk_conv_param, &conv_param); + free_s8_data(); + } + } + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_bf16_depthwise.c b/cviruntime/test/180x/test_180x_bf16_depthwise.c new file mode 100644 index 000000000..e3339aab7 --- /dev/null +++ b/cviruntime/test/180x/test_180x_bf16_depthwise.c @@ -0,0 +1,472 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" +#include "test_native_ref.h" + +#define INVALIDE_STRIDE (-1) +typedef cvk_tiu_depthwise_pt_convolution_param_t param_t; +int random_seed; +static void print_pooling_param(param_t *p) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int kh = p->weight->shape.h; + int kw = p->weight->shape.w; + + printf(" Pooling parameters:\n"); + printf(" random_seed : %d \n", random_seed); + printf(" ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw); + printf(" opd0_sign = %d\n", p->ifmap->fmt == CVK_FMT_I8); + printf(" weight = (%d, %d)\n", kh, kw); + printf(" padding = (%d, %d, %d, %d)\n", + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right); + printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w); + printf(" ins0 = (%d, %d, %d, %d)\n", + p->ins_h, p->ins_last_h, p->ins_w, p->ins_last_w); + printf(" dilation = (%d, 
%d)\n",p->dilation_h, p->dilation_w); + printf(" rshift_bits = %d\n", p->rshift_bits); + printf(" relu_enable = %d\n", p->relu_enable); + printf(" res0_sign = %d\n", p->ofmap->fmt == CVK_FMT_I8); +} + +static uint16_t *alloc_input(param_t *p) +{ + uint64_t size = tl_shape_size(&p->ifmap->shape, p->ifmap->fmt); + uint16_t *data = (uint16_t *)malloc(size); + if (!data) + return NULL; + + for (uint64_t i = 0; i < size / sizeof(uint16_t); i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX/2; //5 ~ -5 + val = (float)(rand()-RAND_MAX2)*5 / (float)RAND_MAX; + data[i] = cvk_convert_fp32_bf16(val); + } + return data; +} + +static uint16_t *alloc_weight(param_t *p) +{ + uint64_t size = tl_shape_size(&p->weight->shape, p->weight->fmt); + uint16_t *data = (uint16_t *)malloc(size); + if (!data) + return NULL; + + for (uint64_t i = 0; i < size / sizeof(uint16_t); i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX/2; //5 ~ -5 + val = (float)(rand()-RAND_MAX2)*5 / (float)RAND_MAX; + data[i] = cvk_convert_fp32_bf16(val); + } + return data; +} + +static uint32_t *alloc_bias(param_t *p) +{ + int c = p->bias->shape.c; + uint32_t *bias = (uint32_t *)malloc(sizeof(uint32_t) * c); + if (!bias) + return NULL; + + for (int i = 0; i < c; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX/2; //2 ~ -2 + val = (float)(rand()-RAND_MAX2)*2 / (float)RAND_MAX; + bias[i] = cvk_convert_fp32_hex(val); + } + return bias; +} + +static uint16_t *alloc_output(param_t *p) +{ + uint64_t size = tl_shape_size(&p->ofmap->shape, p->ofmap->fmt); + return (uint16_t *)malloc(size * 2); +} + +static inline void bf16_relu(uint16_t *buf, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) + if (cvk_convert_bf16_fp32(buf[i]) < 0) + buf[i] = cvk_convert_fp32_bf16(0); +} + +static int index_get(int h, int w1, int w2) +{ + return h * w1 + w2; +} + +int native_pooling_avg_bf16( + const uint16_t* i_fmap, + const void* weight, + const uint32_t *bias, + uint16_t * o_fmap, + int input_n, int input_c, int input_h, int input_w, + int kh, int kw, + int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, + int stride_h, int stride_w, + int ins_h, int ins_w, + int ins_h_last, int ins_w_last, + int dh, int dw, + int const_weight) +{ + if (kh * kw <= 0) + return -1; + + uint16_t avg_const_weight = *(uint16_t *)weight; + uint16_t *weight_arr = (uint16_t*)weight; + int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); + int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); + int d_kh = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int d_kw = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int output_h = calc_output_hw(h_after, d_kh, stride_h); + int output_w = calc_output_hw(w_after, d_kw, stride_w); + float *avg_pooling_mac_a = (float *)malloc(d_kh * d_kw * sizeof(float)); + float *avg_pooling_mac_b = (float *)malloc(d_kh * d_kw * sizeof(float)); + + uint16_t *i_fmap_pad = NULL; + uint16_t *i_kmap_pad = NULL; + for (int n = 0; n < input_n; n++) { + if (const_weight == 0) + weight_arr = (uint16_t*)weight; + + for (int c = 0; c < input_c; ++c) { + fill_pad_fmap_bf16(i_fmap, &i_fmap_pad, 0, + pad_w_l, pad_w_r, pad_h_t, pad_h_b, + ins_h, ins_w, ins_h_last, ins_w_last, + input_h, input_w); + + //kernel_dilation( + if (const_weight == 0) + fill_pad_fmap_bf16( + (weight_arr ), &i_kmap_pad, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + float avg_pool_result; + for (int ph = 0; ph < output_h; ++ph) { + for (int pw = 0; pw < output_w; ++pw) { + int hstart = ph * stride_h; + int wstart = pw * stride_w; + int 
pool_index = index_get(ph, output_w, pw); + int mac_index = 0; + + for (int h = 0; h < d_kh; h++) { + for (int w = 0; w < d_kw; w++) { + int index = index_get((hstart+h), w_after, (w+wstart)); + mac_index = h*d_kw + w; + + avg_pooling_mac_a[mac_index] = cvk_convert_bf16_fp32(i_fmap_pad[index]); + + avg_pooling_mac_b[h*d_kw+w] = const_weight ? + cvk_convert_bf16_fp32(avg_const_weight) : cvk_convert_bf16_fp32(i_kmap_pad[mac_index]); + } + } + inner_float_product(avg_pooling_mac_a, avg_pooling_mac_b, d_kh * d_kw, + &avg_pool_result); + + if(bias) { + avg_pool_result += cvk_convert_hex_fp32(bias[c]); + } + *(o_fmap+pool_index) = cvk_convert_fp32_bf16(avg_pool_result); + } + } + weight_arr += kh * kw; + i_fmap += input_w * input_h; + o_fmap += output_w * output_h; + } + } + free(i_fmap_pad); + free(i_kmap_pad); + free(avg_pooling_mac_a); + free(avg_pooling_mac_b); + + return 0; +} + +static int compare_results( + param_t *p, + uint16_t input[], + uint16_t weight[], + uint32_t bias[], + uint16_t output[]) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int kh = p->weight->shape.h; + int kw = p->weight->shape.w; + + uint16_t *output_ref = alloc_output(p); + int ret = native_pooling_avg_bf16( + input, weight, p->bias ? bias : NULL, output_ref, + in, ic, ih, iw, kh, kw, + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right, + p->stride_h, p->stride_w, + p->ins_h, p->ins_w, p->ins_last_h, p->ins_last_w, + p->dilation_h, p->dilation_w, 0 + ); + if (ret) + goto fail_exit; + + if(p->relu_enable ) + bf16_relu(output_ref, tl_shape_size(&p->ofmap->shape, p->ofmap->fmt)); + + ret = array_cmp_int8( + "Comparing results ...\n", (int8_t*) output_ref, (int8_t*) output, + tl_shape_size(&p->ofmap->shape, p->ofmap->fmt)); + + if (ret) { + printf("Comparison FAILED!!!\n"); + print_pooling_param(p); + ret = -1; + } + +fail_exit: + free(output_ref); + + return ret; +} + +static int pooling_ih_ext(param_t *p, int ih) +{ + int ins = p->ins_h; + int ins_last = p->ins_last_h; + int pad = p->pad_top + p->pad_bottom; + return (ih - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_iw_ext(param_t *p, int iw) +{ + int ins = p->ins_w; + int ins_last = p->ins_last_w; + int pad = p->pad_left + p->pad_right; + return (iw - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_oh(param_t *p, int ih, int kh, int dh) +{ + int ih_ext = pooling_ih_ext(p, ih); + int d_h = (kh -1) * dh + 1; + return (ih_ext - d_h) / p->stride_h + 1; +} + +static int pooling_ow(param_t *p, int iw, int kw, int dw) +{ + int iw_ext = pooling_iw_ext(p, iw); + int d_w = (kw -1) * dw +1; + return (iw_ext - d_w) / p->stride_w + 1; +} + +static void free_depthwise_param( + cvk_context_t *cvk_ctx, + param_t *p) +{ + if (p->bias) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->bias); + + if (p->weight) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->weight); + + if (p->ifmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->ifmap); + + if (p->ofmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->ofmap); +} + +static param_t random_depthwise_param(cvk_context_t *cvk_ctx, int stride_w, int stride_h) +{ + param_t p; + + memset(&p, 0, sizeof(p)); + +retry: + random_seed = clock(); + srand(random_seed); + int using_bias = rand() % 2; + int n = rand() % 5 + 1; + int c = rand() % (3 * cvk_ctx->info.npu_num) + 1; + int ih = rand() % 30 + 3 + (INVALIDE_STRIDE == stride_h ? 0 : stride_h); + int iw = rand() % 30 + 6 + (INVALIDE_STRIDE == stride_w ? 
0 : stride_w); + int kh = rand() % 7 + 1; + int kw = rand() % 7 + 1; + + p.ins_h = rand() % kh; + p.ins_w = rand() % kw; + p.ins_last_h = rand() % kh; + p.ins_last_w = rand() % kw; + p.stride_h = INVALIDE_STRIDE == stride_h ? rand() % (kh) + 1 : stride_h; + p.stride_w = INVALIDE_STRIDE == stride_w ? rand() % (kh) + 1 : stride_w; + p.pad_top = rand() % kh; + p.pad_bottom = rand() % kh; + p.pad_left = rand() % kw; + p.pad_right = rand() % kw; + p.rshift_bits = rand() % 32; + p.dilation_h = rand()%4 + 1; + p.dilation_w = rand()%4 + 1; + + int oh = pooling_oh(&p, ih, kh, p.dilation_h); + int ow = pooling_ow(&p, iw, kw, p.dilation_w); + int d_kh = calc_dilute_hw(kh, p.dilation_h - 1, 0, 0, 0); + int d_kw = calc_dilute_hw(kw, p.dilation_w - 1, 0, 0, 0); + + cvk_tl_shape_t ofmap_shape; + ofmap_shape.n = n; + ofmap_shape.c = c; + ofmap_shape.h = oh; + ofmap_shape.w = ow; + cvk_tl_shape_t ifmap_shape; + ifmap_shape.n = n; + ifmap_shape.c = c; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + cvk_tl_shape_t weight_shape; + weight_shape.n = 1; + weight_shape.c = c; + weight_shape.h = kh; + weight_shape.w = kw; + cvk_tl_shape_t bias_shape; + bias_shape.n = 2; + bias_shape.c = c; + bias_shape.h = 1; + bias_shape.w = 1; + p.relu_enable = rand()%2; + + cvk_fmt_t ifmt = CVK_FMT_BF16; + p.ofmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ofmap_shape, CVK_FMT_BF16, 1); + p.ifmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ifmap_shape, ifmt, 1); + p.weight = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, weight_shape, CVK_FMT_BF16, 1); + p.bias = NULL; + if (using_bias) + p.bias = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, bias_shape, CVK_FMT_BF16, 0); + + if ((kh > pooling_ih_ext(&p, ih)) + || (kw > pooling_iw_ext(&p, iw)) + || (oh < d_kh) + || (ow < d_kw) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || !p.ofmap + || !p.ifmap + || !p.weight + || (using_bias && !p.bias)) { + printf("retry init_pooling_param\n"); + free_depthwise_param(cvk_ctx, &p); + goto retry; + } + return p; +} + +static void put_bias_tensor( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + uint32_t data[]) +{ + int c = tl->shape.c; + + uint16_t *hi_lo = (uint16_t *)malloc(2 * c * 2); + if (!hi_lo) + return; + + for (int i = 0; i < c; i++) { + hi_lo[i] = (data[i] >> 16) & 0xffff; + hi_lo[i + c] = (data[i] & 0xffff); + } + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl, (uint8_t *)hi_lo); + + free(hi_lo); +} + +static int _test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int stride_w, int stride_h) +{ + param_t param = random_depthwise_param(cvk_ctx, stride_w, stride_h); + //print_pooling_param(¶m); + uint16_t *input = alloc_input(¶m); + uint16_t *weight = alloc_weight(¶m); + uint32_t *bias = NULL; + if (param.bias) + bias = alloc_bias(¶m); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, param.ifmap, (uint8_t *)input); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, param.weight, (uint8_t *)weight); + if (param.bias) + put_bias_tensor(rt_handle, cvk_ctx, param.bias, bias); + + cvk_ctx->ops->tiu_pt_depthwise_convolution(cvk_ctx, ¶m); + uint16_t *output = (uint16_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, param.ofmap); + int ret = compare_results(¶m, input, weight, bias, output); + + free_depthwise_param(cvk_ctx, ¶m); + free(input); + free(weight); + free(bias); + free(output); + + return ret; +} + +static int test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) { + return _test_pooling(rt_handle, cvk_ctx, INVALIDE_STRIDE, INVALIDE_STRIDE); 
+} + +static int test_depthwise_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + for (uint64_t i = 0; i < 20; i++) + ret |= test_pooling(rt_handle, cvk_ctx); + + // test stride extend (0, 31] + int stride_list[] = {15, 16, 31}; + int stride_list_len = sizeof(stride_list) / sizeof(stride_list[0]); + + for (int stride_w_idx = 0; stride_w_idx < stride_list_len; stride_w_idx++) { + for (int stride_h_idx = 0; stride_h_idx < stride_list_len; stride_h_idx++) { + int stride_w = stride_list[stride_w_idx]; + int stride_h = stride_list[stride_h_idx]; + ret |= _test_pooling(rt_handle, cvk_ctx, stride_w, stride_h); + } + } + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + int round_mode; + round_mode = cvk_set_store_feround(); + ret |= test_depthwise_pooling(rt_handle, cvk_ctx); + cvk_restore_feround(round_mode); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_bf16_lut.c b/cviruntime/test/180x/test_180x_bf16_lut.c new file mode 100644 index 000000000..e04cfc9be --- /dev/null +++ b/cviruntime/test/180x/test_180x_bf16_lut.c @@ -0,0 +1,149 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static uint32_t channel = -1; //ops->lmem_alloc_tensor(cvk_ctx,ifmap_shape, fmt, 1); + cvk_tl_t *tl_table = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, table_shape, fmt, /*align*/1); + cvk_tl_t *tl_ofmap = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx,ofmap_shape, fmt, /*align*/1); + uint16_t *ofmap_data = NULL; + if (!tl_ifmap || !tl_table || !tl_ofmap) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_ifmap, (uint8_t *)ifmap_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_table, (uint8_t *)table_data); + + cvk_tiu_lookup_table_param_t p12; + p12.ofmap = tl_ofmap; + p12.ifmap = tl_ifmap; + p12.table = tl_table; + cvk_ctx->ops->tiu_lookup_table(cvk_ctx, &p12); + CVI_RT_Submit(cvk_ctx); + + ofmap_data = (uint16_t*)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_ofmap); + for (uint64_t i = 0; i < ofmap_size; i++) { + if (ofmap_data[i] != ref_data[i]) { + fprintf(stderr, + "comparing failed at ofmap_data[%" PRIu64 "], got %d, exp %d\n", + i, ofmap_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_ofmap); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_table); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_ifmap); + free(ofmap_data); + +fail_exit: + free(ifmap_data); + free(table_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + channel = cvk_ctx->info.npu_num; + + ret |= test_tl_lut(rt_handle, cvk_ctx); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_bf16_matrix_mac.c b/cviruntime/test/180x/test_180x_bf16_matrix_mac.c new file mode 100644 index 
000000000..21ee79017 --- /dev/null +++ b/cviruntime/test/180x/test_180x_bf16_matrix_mac.c @@ -0,0 +1,372 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tiu_matrix_multiplication_param_t param_t; +int random_seed; + +static uint64_t matrix_size(const cvk_ml_t *ml) +{ + + uint64_t row = ml->shape.n; + uint64_t col = ml->shape.col; + return row * col; +} + +static uint64_t res_size(param_t *p) +{ + return matrix_size(p->res); +} + +static uint16_t * alloc_left(param_t *p) +{ + uint64_t size = matrix_size(p->left); + uint16_t *buf = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!buf) + return NULL; + + for (uint64_t i = 0; i < size; i++) { + buf[i] = cvk_convert_fp32_bf16(i); + } + + return buf; +} + +static uint16_t * alloc_right(param_t *p) +{ + uint64_t size = matrix_size(p->right); + uint16_t *buf = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!buf) + return NULL; + + for (uint64_t i = 0; i < size; i++) { + float val = 0.01; + buf[i] = cvk_convert_fp32_bf16(i); + val += 0.01; + } + return buf; +} + +static uint32_t * alloc_bias(param_t *p) +{ + if (!p->bias) + return NULL; + + uint64_t size = matrix_size(p->bias); + uint32_t *buf = (uint32_t *)malloc(sizeof(uint32_t) * size); + if (!buf) + return NULL; + + for (uint64_t i = 0; i < size; i++) { + buf[i] = cvk_convert_fp32_hex(i); + } + return buf; +} + +static uint32_t * alloc_res(param_t *p) +{ + uint64_t size = res_size(p); + uint32_t *buf = (uint32_t *)malloc(sizeof(uint32_t) * size); + if (!buf) + return NULL; + + for (uint64_t i = 0; i < size; i++) { + buf[i] = cvk_convert_fp32_bf16(i); + } + return buf; +} + +static inline void bf16_relu(float *buf, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) + if (buf[i] < 0) + buf[i] = 0; +} + +static void matrix_mac_ref( + param_t *p, uint16_t left[], uint16_t right[], uint32_t bias[], uint32_t res[]) +{ + uint64_t size = res_size(p); + uint32_t left_col = p->left->shape.col; + uint32_t right_col = p->right->shape.col; + uint32_t res_row = p->left->shape.n; + uint32_t res_col = p->res->shape.col; + uint32_t left_c = p->left->shape.c; + uint32_t left_w = p->left->shape.w; + + float *tmp_res = (float *)malloc(sizeof(float) * size); + if (!tmp_res) + return; + + if (p->add_result) { + for (uint32_t i = 0; i < res_row * res_col; i++) + tmp_res[i] = cvk_convert_bf16_fp32(res[i]); + } else { + for (uint32_t i = 0; i < res_row * res_col; i++) + tmp_res[i] = 0; + } + for (uint32_t row = 0; row < res_row; row++) { + for (uint32_t col = 0; col < res_col; col++) { + for (uint32_t wi = 0; wi < left_w; wi++) { + for (uint32_t ci = 0; ci < left_c; ci++) { + if ((wi + (ci*left_w)) >= left_col) + continue; + uint32_t li = row * left_col + left_w * ci + wi; + uint32_t ri = (ci* left_w + wi )* right_col + col; + + float l = cvk_convert_bf16_fp32(left[li]); + float r = cvk_convert_bf16_fp32(right[ri]); + tmp_res[row * res_col + col] += l * r; + } + } + } + } + + if (p->bias && bias) { + for (uint32_t row = 0; row < res_row; row++) { + for (uint32_t col = 0; col < res_col; col++) { + float b = cvk_convert_hex_fp32(bias[col]); + tmp_res[row * res_col + col] += b; + } + } + } + + if (p->relu_enable) + bf16_relu(tmp_res, size); + + for (uint64_t i = 0; i < size; i++) { + res[i] = cvk_convert_fp32_bf16(tmp_res[i]); + } + free(tmp_res); +} + +static void put_bias( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_ml_t *ml, + uint32_t data[]) +{ + uint64_t size = ml->shape.col; + + uint16_t *tmp = (uint16_t 
*)malloc(sizeof(uint16_t) * size * 2); + if (!tmp) + return; + + for (uint64_t i = 0; i < size; i++) { + tmp[i] = (data[i] >> 16) & 0xFFFF; + tmp[i + size] = (data[i] & 0xFFFF); + } + + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, ml, (uint8_t*)tmp); + + free(tmp); +} + +static void put_res( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_ml_t *ml, + uint32_t data[]) +{ + uint64_t size = ml->shape.n * ml->shape.col; + + uint16_t *tmp = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!tmp) + return; + + for (uint64_t i = 0; i < size; i++) { + tmp[i] = (data[i] & 0xFFFF); + } + + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, ml, (uint8_t*)tmp); + + free(tmp); +} + +static uint32_t * get_res( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + param_t *p) +{ + uint64_t size = res_size(p); + uint32_t *res = (uint32_t *)malloc(sizeof(uint32_t) * size); + if (!res) + return NULL; + + uint16_t *tmp = (uint16_t *)matrix_copy_l2g_d2s(rt_handle, cvk_ctx, p->res); + if (tmp) { + for (uint64_t i = 0; i < size; i++) + res[i] = tmp[i]; + + free(tmp); + } else { + free(res); + res = NULL; + } + + return res; +} + +static void test_param(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + uint16_t *left = alloc_left(p); + uint16_t *right = alloc_right(p); + uint32_t *bias = alloc_bias(p); + uint32_t *ref = alloc_res(p); + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, p->left, (uint8_t*)left); + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, p->right, (uint8_t*)right); + if (bias) + put_bias(rt_handle, cvk_ctx, p->bias, bias); + if (p->add_result) + put_res(rt_handle, cvk_ctx, p->res, ref); + + cvk_ctx->ops->tiu_matrix_multiplication(cvk_ctx, p); + uint32_t *res = get_res(rt_handle, cvk_ctx, p); + matrix_mac_ref(p, left, right, bias, ref); + uint64_t size = res_size(p); + for (uint64_t i = 0; i < size; i++) { + if (res[i] != ref[i]) { + fprintf(stderr, "comparing failed at out[%" PRIu64 "], got %x, exp %x\n", + i, res[i], ref[i]); + fprintf(stderr, "random_seed=%d\n", random_seed); + exit(-1); + } + } + free(left); + free(right); + free(bias); + free(ref); + free(res); +} + +static void destroy_param(cvk_context_t *cvk_ctx, param_t *p) +{ + if (p->bias) + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->bias); + if (p->res) + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->res); + if (p->right) + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->right); + if (p->left) + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->left); +} + +static cvk_ml_t *alloc_param_res( + cvk_context_t *cvk_ctx, param_t *p) +{ + cvk_ml_shape_t s; + + s.n = p->left->shape.n; + s.c = p->right->shape.c; + s.w = p->right->shape.w; + s.col = p->right->shape.col; + cvk_fmt_t fmt = CVK_FMT_BF16; + return cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, s, fmt, 1); +} + +static param_t param_0(cvk_context_t *cvk_ctx) +{ + +retry: + random_seed = clock(); + srand(random_seed); + + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = rand()%2; + p.add_result = 0; /*bf16 HW does not support add_result*/ + p.ps32_mode = 0; + + uint32_t left_row = rand() % 100 +1; + uint32_t left_col = rand() % 100 + 1; + uint32_t left_w = rand() % (left_col/5+1) + 1; // c is generate by w, and make c is larger + uint32_t left_c = left_col / left_w + (left_col % left_w ? 1: 0); + + uint32_t right_row = left_col; + uint32_t right_col = rand() % 100 + 1; + uint32_t right_w = (rand() % (right_col/5+1) + 1); // make c is larger + uint32_t right_c = right_col / right_w + (right_col % right_w ? 
1: 0) ; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + uint32_t bias = rand()%2; + p.bias = NULL; + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_BF16, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_BF16, 1); + if (!p.left || !p.right) { + printf("retry init_matrix_param\n"); + destroy_param(cvk_ctx, &p); + goto retry; + } + + p.res = alloc_param_res(cvk_ctx, &p); + if (bias) { + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_BF16, 1); + } + + if (!p.res || (bias && !p.bias)) { + printf("retry init_matrix_param\n"); + destroy_param(cvk_ctx, &p); + goto retry; + } + + return p; +} + + +#define test_one_param(n) \ + do { \ + param_t p = param_##n(cvk_ctx); \ + test_param(rt_handle, cvk_ctx, &p); \ + destroy_param(cvk_ctx, &p); \ + } while (0) + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + int round_mode; + round_mode = cvk_set_store_feround(); + + for (int i = 0 ; i < 30 ; i++) + test_one_param(0); + + cvk_restore_feround(round_mode); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return 0; +} diff --git a/cviruntime/test/180x/test_180x_bf16_matrix_mac_ps32.c b/cviruntime/test/180x/test_180x_bf16_matrix_mac_ps32.c new file mode 100644 index 000000000..fa11bd117 --- /dev/null +++ b/cviruntime/test/180x/test_180x_bf16_matrix_mac_ps32.c @@ -0,0 +1,583 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" +#include "test_native_ref.h" + +typedef cvk_tiu_matrix_multiplication_param_t param_t; + +typedef struct{ + uint32_t left_sign; + uint32_t left_row ; + uint32_t left_col ; + uint32_t left_c ; + uint32_t left_w ; + uint32_t right_sign; + uint32_t right_row ; + uint32_t right_col ; + uint32_t right_c ; + uint32_t right_w ; + uint32_t lshift_bits ; + uint32_t rshift_bits ; + uint32_t relu_enable ; + uint32_t using_bias; + uint32_t bias_sign; +} matrix_init_para_t; + +uint32_t random_seed; +matrix_init_para_t matrix_para_t; + +static void make_bmk_matrix_param_ps32(cvk_context_t *cvk_ctx, param_t *p, int ps32_mode); +static param_t param_init(); + +void print_param(param_t *p) +{ + printf("random_seed =%d\n", random_seed); + printf("ps32_mode =%d\n",p->ps32_mode); + printf("left_shape.n =%d\n",p->left->shape.n); + printf("left_shape.col =%d\n",p->left->shape.col); + printf("left_shape.c =%d\n",p->left->shape.c); + printf("left_shape.w =%d\n",p->left->shape.w); + printf("left_fmt =%d\n",p->left->fmt); + printf("right_shape.n =%d\n",p->right->shape.n); + printf("right_shape.col =%d\n",p->right->shape.col); + printf("right_shape.c =%d\n",p->right->shape.c); + printf("right_shape.w =%d\n",p->right->shape.w); + printf("right_fmt =%d\n",p->right->fmt); + if(p->bias) + { + printf("bias_shape.n =%d\n",p->bias->shape.n); + printf("bias_shape.col =%d\n",p->bias->shape.col); + printf("bias_shape.c =%d\n",p->bias->shape.c); + printf("bias_shape.w =%d\n",p->bias->shape.w); + 
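+  /* Dumping the shapes together with random_seed makes a failing case
+   * reproducible: hard-code the printed seed before srand() to regenerate
+   * the exact same random configuration. */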
printf("bias_fmt =%d\n",p->bias->fmt); + } + printf("result_shape.n =%d\n",p->res->shape.n); + printf("result_shape.col =%d\n",p->res->shape.col); + printf("result_shape.c =%d\n",p->res->shape.c); + printf("result_shape.w =%d\n",p->res->shape.w); + printf("result_fmt =%d\n",p->res->fmt); + printf("relu_enable=%d\n",p->relu_enable); + printf("rshift_bits=%d\n",p->rshift_bits); +} + + +static uint64_t matrix_size(const cvk_ml_t *ml) +{ + uint64_t row = ml->shape.n; + uint64_t col = ml->shape.col; + return row * col; +} + +static uint64_t res_ps32_size(param_t *p) +{ + return matrix_size(p->res); +} + +static uint64_t res_size(param_t *p) +{ + return matrix_size(p->res); +} + +static uint16_t * alloc_left(param_t *p) +{ + uint64_t size = matrix_size(p->left); + uint16_t *buf = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!buf) + return NULL; + + for (uint64_t i = 0; i < size; i++) + buf[i] = cvk_convert_fp32_bf16(i); + + return buf; +} + +static uint16_t * alloc_right(param_t *p) +{ + uint64_t size = matrix_size(p->right); + + uint16_t *buf = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!buf) + return NULL; + + for (uint64_t i = 0; i < size; i++) + buf[i] = cvk_convert_fp32_bf16(i); + + return buf; +} +static uint32_t * alloc_bias(param_t *p) +{ + if (!p->bias) + return NULL; + + uint64_t size = matrix_size(p->bias) / 2; + + uint32_t *buf = (uint32_t *)malloc(sizeof(uint32_t) * size); + if (!buf) + return NULL; + + for (uint64_t i = 0; i < size; i++) + buf[i] = cvk_convert_fp32_hex(i); + + return buf; +} + +static uint16_t * alloc_ps32_res(param_t *p) +{ + uint64_t size = res_ps32_size(p)*2; + uint16_t *buf = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!buf) + return NULL; + + for (uint64_t i = 0; i < size; i++) + buf[i] = cvk_convert_fp32_bf16(i); + + return buf; +} + +static inline void bf16_relu(float *buf, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) + if (buf[i] < 0) + buf[i] = 0; +} + +static int ps32_m2_matrix_mac_ref( + param_t *p, + uint16_t *left, + uint16_t *right, + uint16_t *res) +{ + uint64_t size = res_ps32_size(p); + uint32_t left_col = p->left->shape.col; + uint32_t right_col = p->right->shape.col; + uint32_t res_row = p->left->shape.n; + uint32_t res_col = p->res->shape.col; + uint32_t left_c = p->left->shape.c; + uint32_t left_w = p->left->shape.w; + + int ret = 0; + int bstride = res_row * res_col; + + float *tmp_res = (float *)malloc(sizeof(float) * size); + if (!tmp_res) + return -1; + + for (uint32_t i = 0; i < res_row * res_col; i++) + tmp_res[i] = 0; + for (uint32_t row = 0; row < res_row; row++) { + for (uint32_t col = 0; col < res_col; col++) { + for (uint32_t wi = 0; wi < left_w; wi++) { + for (uint32_t ci = 0; ci < left_c; ci++) { + if ((wi + (ci*left_w)) >= left_col) + continue; + uint32_t li = row * left_col + left_w * ci + wi; + uint32_t ri = (ci* left_w + wi )* right_col + col; + + float l = cvk_convert_bf16_fp32(left[li]); + float r = cvk_convert_bf16_fp32(right[ri]); + tmp_res[row * res_col + col] += l * r; + } + } + } + } + + for (uint64_t i = 0; i < size; i++) + res[i + bstride*0] = (cvk_convert_fp32_hex(tmp_res[i]) >> 16) & 0xFFFF; + for (uint64_t i = 0; i < size; i++) + res[i + bstride*1] = (cvk_convert_fp32_hex(tmp_res[i]) >> 0) & 0xFFFF; + + free(tmp_res); + + return ret; +} + +static int ps32_m3_matrix_mac_ref( + param_t *p, + uint16_t *left, + uint16_t *right, + uint16_t *res) +{ + uint64_t size = res_ps32_size(p); + uint32_t left_col = p->left->shape.col; + uint32_t right_col = p->right->shape.col; + uint32_t res_row = 
p->left->shape.n; + uint32_t res_col = p->res->shape.col; + uint32_t left_c = p->left->shape.c; + uint32_t left_w = p->left->shape.w; + + int ret = 0; + int bstride = res_row * res_col; + + float *tmp_res = (float *)malloc(sizeof(float) * size); + if (!tmp_res) + return -1; + + for (uint64_t i = 0; i < size; i++) + tmp_res[i] = cvk_convert_hex_fp32((res[i + bstride*0] << 16) | res[i + bstride*1]); + + for (uint32_t row = 0; row < res_row; row++) { + for (uint32_t col = 0; col < res_col; col++) { + for (uint32_t wi = 0; wi < left_w; wi++) { + for (uint32_t ci = 0; ci < left_c; ci++) { + if ((wi + (ci*left_w)) >= left_col) + continue; + uint32_t li = row * left_col + left_w * ci + wi; + uint32_t ri = (ci* left_w + wi )* right_col + col; + + float l = cvk_convert_bf16_fp32(left[li]); + float r = cvk_convert_bf16_fp32(right[ri]); + tmp_res[row * res_col + col] += l * r; + } + } + } + } + + for (uint64_t i = 0; i < size; i++) + res[i + bstride*0] = (cvk_convert_fp32_hex(tmp_res[i]) >> 16) & 0xFFFF; + for (uint64_t i = 0; i < size; i++) + res[i + bstride*1] = (cvk_convert_fp32_hex(tmp_res[i]) >> 0) & 0xFFFF; + + free(tmp_res); + + return ret; +} + +static int ps32_m1_matrix_mac_ref( + param_t *p, + uint16_t *left, + uint16_t *right, + uint32_t * bias, + uint16_t *res) +{ + uint64_t size = res_ps32_size(p); + uint32_t left_col = p->left->shape.col; + uint32_t right_col = p->right->shape.col; + uint32_t res_row = p->left->shape.n; + uint32_t res_col = p->res->shape.col; + uint32_t left_c = p->left->shape.c; + uint32_t left_w = p->left->shape.w; + + int ret = 0; + int bstride = res_row * res_col; + + float *tmp_res = (float *)malloc(sizeof(float) * size); + if (!tmp_res) + return -1; + + for (uint64_t i = 0; i < size; i++) { + tmp_res[i] = cvk_convert_hex_fp32((res[i + bstride*0] << 16) | res[i + bstride*1]); + } + + for (uint32_t row = 0; row < res_row; row++) { + for (uint32_t col = 0; col < res_col; col++) { + for (uint32_t wi = 0; wi < left_w; wi++) { + for (uint32_t ci = 0; ci < left_c; ci++) { + if ((wi + (ci*left_w)) >= left_col) + continue; + uint32_t li = row * left_col + left_w * ci + wi; + uint32_t ri = (ci* left_w + wi )* right_col + col; + + float l = cvk_convert_bf16_fp32(left[li]); + float r = cvk_convert_bf16_fp32(right[ri]); + tmp_res[row * res_col + col] += l * r; + } + } + } + } + + if (p->bias && bias) { + for (uint32_t row = 0; row < res_row; row++) { + for (uint32_t col = 0; col < res_col; col++) { + float b = cvk_convert_hex_fp32(bias[col]); + tmp_res[row * res_col + col] += b; + } + } + } + + if (p->relu_enable) + bf16_relu(tmp_res, size); + + for (uint64_t i = 0; i < size; i++) + res[i] = cvk_convert_fp32_bf16(tmp_res[i]); + + free(tmp_res); + + return ret; +} + +static void put_bias( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_ml_t *ml, + uint32_t data[]) +{ + uint64_t size = ml->shape.col; + + uint16_t *tmp = (uint16_t *)malloc(sizeof(uint16_t) * size * 2); + if (!tmp) + return; + + for (uint64_t i = 0; i < size; i++) { + tmp[i] = data[i] >> 16; + tmp[i + size] = data[i] & 0xFFFF; + } + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, ml, (uint8_t*) tmp); + + free(tmp); +} + + +static int test_matrix_ps32_ut(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + int ret = 0; + make_bmk_matrix_param_ps32(cvk_ctx, p, 2); + if (!p->left || !p->right || !p->res) { + // bypass, not compare fail + return 0; + } + + uint16_t *left = alloc_left(p); + uint16_t *right = alloc_right(p); + uint16_t *ref = alloc_ps32_res(p); + if (!left || !right || !ref) { + 
ret = -1; + goto fail_exit; + } + + { + ret = ps32_m2_matrix_mac_ref(p, left, right, ref); + if (ret) + goto fail_exit; + + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, p->left, (uint8_t*) left); + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, p->right, (uint8_t*) right); + cvk_ctx->ops->tiu_matrix_multiplication(cvk_ctx, p); + cvk_ml_t ps32_res; + ps32_res = *p->res; + ps32_res.shape.n *= sizeof(short); + uint16_t *res = (uint16_t*) matrix_copy_l2g_d2s(rt_handle, cvk_ctx, &ps32_res); + + ret = array_cmp_int8( + "Comparing begin_mode results ...\n", + (int8_t *)ref, (int8_t *)res ,(int)res_ps32_size(p)*sizeof(int)); + if (ret) { + printf("Comparison M2 FAILED\n"); + print_param(p); + ret = -1; + }else + printf("Comparison M2 PASS\n"); + free(res); + } + + { + make_bmk_matrix_param_ps32(cvk_ctx, p, 3); + + ret = ps32_m3_matrix_mac_ref(p, left, right, ref); + if (ret) + goto fail_exit; + + cvk_ctx->ops->tiu_matrix_multiplication(cvk_ctx, p); + cvk_ml_t ps32_res; + ps32_res = *p->res; + ps32_res.shape.n *= sizeof(short); + uint16_t *res = (uint16_t *) matrix_copy_l2g_d2s(rt_handle, cvk_ctx, &ps32_res); + + ret = array_cmp_int8( + "Comparing m3 results ...\n", + (int8_t *)ref, (int8_t *)res ,(int)res_ps32_size(p)*sizeof(int)); + if (ret) { + printf("Comparison M3 FAILED\n"); + print_param(p); + ret = -1; + }else + printf("Comparison M3 PASS\n"); + + free(res); + } + { + make_bmk_matrix_param_ps32(cvk_ctx, p, 1); + if (matrix_para_t.using_bias && !p->bias) { + // bypass, not compare fail + ret = 0; + goto fail_exit; + } + + uint32_t *bias = alloc_bias(p); + + ret = ps32_m1_matrix_mac_ref(p, left, right, bias, ref); + if (ret) + goto fail_exit; + + if(p->bias) + put_bias(rt_handle, cvk_ctx, p->bias, bias); + + cvk_ctx->ops->tiu_matrix_multiplication(cvk_ctx, p); + cvk_ml_t ps32_res; + ps32_res = *p->res; + ps32_res.shape.n *= 2; + + uint16_t *res = (uint16_t *)matrix_copy_l2g_d2s(rt_handle, cvk_ctx, &ps32_res); + + ret = array_cmp_int8( + "Comparing m1 results ...\n", + (int8_t *)ref, (int8_t *)res ,(int)res_size(p)*2); + if (ret) { + printf("Comparison M1 FAILED\n"); + print_param(p); + ret = -1; + }else + printf("Comparison M1 PASS\n"); + + free(res); + free(bias); + } + +fail_exit: + free(left); + free(right); + free(ref); + + return ret; +} + +static void destroy_param(cvk_context_t *cvk_ctx, param_t *p) +{ + if (p->bias) + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->bias); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->res); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->right); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->left); +} + +static cvk_ml_t *alloc_param_res( + cvk_context_t *cvk_ctx, param_t *p) +{ + cvk_ml_shape_t s; + s.n = p->left->shape.n; + s.c = p->right->shape.c; + s.w = p->right->shape.w; + s.col = p->right->shape.col; + + cvk_fmt_t fmt = CVK_FMT_BF16; + return cvk_ctx->ops->lmem_alloc_ps32_matrix(cvk_ctx, s, fmt, 1); +} + + +static void make_bmk_matrix_param_ps32(cvk_context_t *cvk_ctx, param_t *p, int ps32_mode) +{ + + cvk_ml_shape_t left_shape; + cvk_ml_shape_t right_shape; + + p->ps32_mode = ps32_mode; + p->relu_enable = 0; + p->lshift_bits = 0; + p->rshift_bits = 0; + if(ps32_mode==2) + { + left_shape.n = matrix_para_t.left_row; + left_shape.c = matrix_para_t.left_c; + left_shape.w = matrix_para_t.left_w; + left_shape.col = matrix_para_t.left_col; + + right_shape.n = matrix_para_t.right_row; + right_shape.c = matrix_para_t.right_c; + right_shape.w = matrix_para_t.right_w; + right_shape.col = matrix_para_t.right_col; + p->left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, 
CVK_FMT_BF16, 1); + p->right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_BF16, 1); + p->bias = NULL; + p->res = alloc_param_res(cvk_ctx, p); + }else if(ps32_mode==3) + { + + }else if(ps32_mode==1) + { + p->relu_enable = matrix_para_t.relu_enable; + p->rshift_bits = matrix_para_t.rshift_bits; + if(matrix_para_t.using_bias) + { + right_shape.n = matrix_para_t.right_row; + right_shape.c = matrix_para_t.right_c; + right_shape.w = matrix_para_t.right_w; + right_shape.col = matrix_para_t.right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + p->bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_BF16, 1); + } + } + //print_param(p); +} + +static param_t param_init(void) +{ + param_t p; + + random_seed = clock(); + srand(random_seed); + + memset(&p, 0, sizeof(param_t)); + memset(&matrix_para_t, 0, sizeof(matrix_init_para_t)); + + matrix_para_t.using_bias = rand()%2; + matrix_para_t.relu_enable = rand()%2; + + matrix_para_t.left_row = rand()%60+1; + matrix_para_t.left_col = rand()%40+1; + matrix_para_t.left_w = matrix_para_t.left_col/0x10 ? (uint32_t)rand()%8+8 : matrix_para_t.left_col; + matrix_para_t.left_c = + matrix_para_t.left_col%matrix_para_t.left_w? + matrix_para_t.left_col/matrix_para_t.left_w+1 : matrix_para_t.left_col/matrix_para_t.left_w; + + matrix_para_t.right_row = matrix_para_t.left_col; + matrix_para_t.right_col = rand()%50+1; + matrix_para_t.right_w = rand()%16+1; + matrix_para_t.right_c = + matrix_para_t.right_col%matrix_para_t.right_w? + matrix_para_t.right_col/matrix_para_t.right_w+1 : matrix_para_t.right_col/matrix_para_t.right_w; + return p; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + int round_mode; + round_mode = cvk_set_store_feround(); + + for (int i = 0; i < 30; i++) { + printf("random_test_conv iteration: %d\n", i); + param_t p = param_init(); + + ret |= test_matrix_ps32_ut(rt_handle, cvk_ctx, &p); + destroy_param(cvk_ctx, &p); + } + + printf("bf16 matrix mac ps32 test %s\n", ret ? 
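+  /* ret ORs together the outcome of all 30 random iterations above, so a
+   * single failing case makes the final verdict "fail". */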
"fail" : "pass"); + + cvk_restore_feround(round_mode); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_bf16_max_pooling.c b/cviruntime/test/180x/test_180x_bf16_max_pooling.c new file mode 100644 index 000000000..553499ce8 --- /dev/null +++ b/cviruntime/test/180x/test_180x_bf16_max_pooling.c @@ -0,0 +1,345 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" +#include "test_native_ref.h" + +#define INVALIDE_STRIDE (-1) +typedef cvk_tiu_max_pooling_param_t param_t; +int random_seed; +static void print_pooling_param(param_t *p) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int on = p->ofmap->shape.n; + int oc = p->ofmap->shape.c; + int oh = p->ofmap->shape.h; + int ow = p->ofmap->shape.w; + + printf(" Pooling parameters:\n"); + printf(" random_seed : %d \n", random_seed); + printf(" ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw); + printf(" opd0_sign = %d\n", p->ifmap->fmt == CVK_FMT_I8); + printf(" weight = (%d, %d)\n", p->kh, p->kw); + printf(" padding = (%d, %d, %d, %d)\n", + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right); + printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w); + printf(" ofmap = (%d, %d, %d, %d)\n", on, oc, oh, ow); +} + +static int pooling_ih_ext(param_t *p, int ih) +{ + int pad = p->pad_top + p->pad_bottom; + return ih + pad; +} + +static int pooling_iw_ext(param_t *p, int iw) +{ + int pad = p->pad_left + p->pad_right; + return iw + pad; +} + +static int pooling_oh(param_t *p, int ih) +{ + int ih_ext = pooling_ih_ext(p, ih); + return (ih_ext - p->kh) / p->stride_h + 1; +} + +static int pooling_ow(param_t *p, int iw) +{ + int iw_ext = pooling_iw_ext(p, iw); + return (iw_ext - p->kw) / p->stride_w + 1; +} + +static uint16_t *alloc_input(param_t *p) +{ + uint64_t size = tl_shape_size(&p->ifmap->shape, p->ifmap->fmt); + uint16_t *data = (uint16_t *)malloc(size); + if (!data) + return NULL; + + for (uint64_t i = 0; i < size/sizeof(uint16_t); i++) { + float val; + int RAND_MAX2 = RAND_MAX/2; //100 ~ -100 + val = (float)(rand()-RAND_MAX2)*100 / (float)RAND_MAX; + data[i] = cvk_convert_fp32_bf16(val); + } + return data; +} + +static uint16_t *alloc_output(param_t *p) +{ + uint64_t size = tl_shape_size(&p->ofmap->shape, p->ofmap->fmt); + return (uint16_t *)malloc(size); +} + +static void free_pooling_param( + cvk_context_t *cvk_ctx, + param_t *r) +{ + if (r->ifmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ifmap); + if (r->ofmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ofmap); +} + +static param_t random_pooling_param(cvk_context_t *cvk_ctx, int stride_w, int stride_h) +{ + + param_t p; + + memset(&p, 0, sizeof(p)); + +retry: + random_seed = clock(); +// random_seed = 3058538; + srand(random_seed); + +#if 0 + int in = 1; + int ic = 1; + int ih = 6; + int iw = 6; + //int opd0_sign = rand() % 2; + + p.kh = 3; + p.kw = 3; + p.stride_h = p.kh; + p.stride_w = p.kw; + p.pad_top = 3;//rand() % p.kh; + p.pad_bottom = 3;//rand() % p.kh; + p.pad_left = 3;//rand() % p.kw; + p.pad_right = 3;//rand() % p.kw; + + cvk_tl_shape_t ifmap_shape; + ifmap_shape.n = in; + ifmap_shape.c = ic; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + cvk_tl_shape_t ofmap_shape; + ofmap_shape.n = in; + ofmap_shape.c = ic; + ofmap_shape.h = pooling_oh(&p, ih); + ofmap_shape.w = pooling_ow(&p, iw); + +#else + int in = rand() % 5 + 1; + int ic = rand() % (3 * cvk_ctx->info.npu_num) + 1; + 
int ih = rand() % 30 + 3 + (INVALIDE_STRIDE == stride_h ? 0 : stride_h); + int iw = rand() % 30 + 6 + (INVALIDE_STRIDE == stride_w ? 0 : stride_w); + //int opd0_sign = rand() % 2; + + p.kh = rand() % 5 + 1; + p.kw = rand() % 5 + 1; + p.stride_h = INVALIDE_STRIDE == stride_h ? rand() % (p.kh) + 1 : stride_h; + p.stride_w = INVALIDE_STRIDE == stride_w ? rand() % (p.kh) + 1 : stride_w; + p.pad_top = rand() % p.kh; + p.pad_bottom = rand() % p.kh; + p.pad_left = rand() % p.kw; + p.pad_right = rand() % p.kw; + + cvk_tl_shape_t ifmap_shape; + ifmap_shape.n = in; + ifmap_shape.c = ic; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + cvk_tl_shape_t ofmap_shape; + ofmap_shape.n = in; + ofmap_shape.c = ic; + ofmap_shape.h = pooling_oh(&p, ih); + ofmap_shape.w = pooling_ow(&p, iw); +#endif +// cvk_fmt_t fmt = opd0_sign? CVK_FMT_I8: CVK_FMT_U8; + p.ofmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ofmap_shape, CVK_FMT_BF16, 1); + p.ifmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ifmap_shape, CVK_FMT_BF16, 1); + + int RAND_MAX2 = RAND_MAX/2; //20 ~ -20 + float ins_val = (float)(rand()-RAND_MAX2)*20 / (float)RAND_MAX; + p.ins_fp = cvk_convert_fp32_bf16(ins_val); + + if ((p.kh > pooling_ih_ext(&p, ih)) + || (p.kw > pooling_iw_ext(&p, iw)) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || (p.kh * p.kw == 1) + || !p.ofmap || !p.ifmap) { + printf("retry init_pooling_param\n"); + free_pooling_param(cvk_ctx, &p); + goto retry; + } + + return p; +} +static int index_get(int h, int w1, int w2) +{ + return h * w1 + w2; +} + +int native_pooling_max_bf16( + const uint16_t* i_fmap, + uint16_t* o_fmap, + int input_n, int input_c, int input_h, int input_w, + int kh, int kw, + int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, + int stride_h, int stride_w, + int ins_h, int ins_w, + int ins_h_last, int ins_w_last, + uint16_t ins_fp + ) +{ + if (ins_h != 0 || ins_w != 0 || ins_h_last != 0 || ins_w_last !=0) + return -1; + + int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); + int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); + + int output_h = calc_output_hw(h_after, kh, stride_h); + int output_w = calc_output_hw(w_after, kw, stride_w); + + const float max_init = -FLT_MAX;//cvk_convert_bf16_fp32(ins_fp); + uint16_t *i_fmap_pad = NULL; + for (int nc = 0; nc < input_n * input_c; nc++) { + fill_pad_fmap_bf16(i_fmap, &i_fmap_pad, ins_fp, + pad_w_l, pad_w_r, pad_h_t, pad_h_b, + 0, 0, 0, 0, input_h, input_w); + + for (int ph = 0; ph < output_h; ++ph) { + for (int pw = 0; pw < output_w; ++pw) { + int hstart = ph * stride_h; + int wstart = pw * stride_w; + int pool_index = index_get(ph, output_w, pw); + float max = max_init; + for (int h = 0; h < kh; h++) { + for (int w = 0; w < kw; w++) { + int index = index_get((hstart + h), (input_w + pad_w_l + pad_w_r), + (w + wstart)); + float val = cvk_convert_bf16_fp32(i_fmap_pad[index]); + max = (val > max)? 
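+  /* Each bf16 operand is widened to fp32 (exact, since bf16 is a truncated
+   * fp32) before comparing. Padded cells were filled with ins_fp by
+   * fill_pad_fmap_bf16, and the running max is seeded with -FLT_MAX. */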
val: max; + } + } + o_fmap[pool_index] = cvk_convert_fp32_bf16(max); + } + } + i_fmap += input_w * input_h; + o_fmap += output_w * output_h; + } + free(i_fmap_pad); + + return 0; +} + + +static int compare_results( + param_t *p, + uint16_t input[], + uint16_t output[]) +{ + int ret = 0; + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + + uint16_t *output_ref = alloc_output(p); + ret = native_pooling_max_bf16( + input, output_ref, in, ic, ih, iw, p->kh, p->kw, + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right, + p->stride_h, p->stride_w, 0, 0, 0, 0, p->ins_fp); + if (ret) + goto fail_exit; + + ret = array_cmp_int8( + "Comparing results ...\n", (int8_t*) output_ref, (int8_t*)output, + tl_shape_size(&p->ofmap->shape, p->ofmap->fmt)); + if (ret) { + printf("Comparison FAILED!!!\n"); + print_pooling_param(p); + ret = -1; + } + +fail_exit: + free(output_ref); + + return ret; +} + +static int _test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int stride_w, int stride_h) +{ + param_t param = random_pooling_param(cvk_ctx, stride_w, stride_h); + //print_pooling_param(¶m); + uint16_t *input = alloc_input(¶m); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, param.ifmap, (uint8_t *)input); + cvk_ctx->ops->tiu_max_pooling(cvk_ctx, ¶m); + + uint16_t *output = (uint16_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, param.ofmap); + + int ret = compare_results(¶m, input, output); + + free_pooling_param(cvk_ctx, ¶m); + free(output); + free(input); + + return ret; +} + + +static int test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) { + return _test_pooling(rt_handle, cvk_ctx, INVALIDE_STRIDE, INVALIDE_STRIDE); +} + +static int test_max_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + for (uint64_t i = 0; i < 20; i++) + ret |= test_pooling(rt_handle, cvk_ctx); + + // test stride extend (0, 31] + int stride_list[] = {15, 16, 31}; + int stride_list_len = sizeof(stride_list) / sizeof(stride_list[0]); + + for (int stride_w_idx = 0; stride_w_idx < stride_list_len; stride_w_idx++) { + for (int stride_h_idx = 0; stride_h_idx < stride_list_len; stride_h_idx++) { + int stride_w = stride_list[stride_w_idx]; + int stride_h = stride_list[stride_h_idx]; + + ret |= _test_pooling(rt_handle, cvk_ctx, stride_w, stride_h); + } + } + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret = test_max_pooling(rt_handle, cvk_ctx); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_bf16_min_pooling.c b/cviruntime/test/180x/test_180x_bf16_min_pooling.c new file mode 100644 index 000000000..71500317b --- /dev/null +++ b/cviruntime/test/180x/test_180x_bf16_min_pooling.c @@ -0,0 +1,325 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" +#include "test_native_ref.h" + +typedef cvk_tiu_min_pooling_param_t param_t; +int random_seed; +static void print_pooling_param(param_t *p) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int on = p->ofmap->shape.n; + int oc = p->ofmap->shape.c; + int oh = 
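+  /* The ofmap shape was derived at allocation time as
+   * oh = (ih + pad_top + pad_bottom - kh) / stride_h + 1, and likewise for
+   * ow, matching pooling_oh()/pooling_ow() below. */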
p->ofmap->shape.h; + int ow = p->ofmap->shape.w; + + printf(" Pooling parameters:\n"); + printf(" random_seed : %d \n", random_seed); + printf(" ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw); + printf(" opd0_sign = %d\n", p->ifmap->fmt == CVK_FMT_I8); + printf(" weight = (%d, %d)\n", p->kh, p->kw); + printf(" padding = (%d, %d, %d, %d)\n", + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right); + printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w); + printf(" ofmap = (%d, %d, %d, %d)\n", on, oc, oh, ow); +} + +static int pooling_ih_ext(param_t *p, int ih) +{ + int pad = p->pad_top + p->pad_bottom; + return ih + pad; +} + +static int pooling_iw_ext(param_t *p, int iw) +{ + int pad = p->pad_left + p->pad_right; + return iw + pad; +} + +static int pooling_oh(param_t *p, int ih) +{ + int ih_ext = pooling_ih_ext(p, ih); + return (ih_ext - p->kh) / p->stride_h + 1; +} + +static int pooling_ow(param_t *p, int iw) +{ + int iw_ext = pooling_iw_ext(p, iw); + return (iw_ext - p->kw) / p->stride_w + 1; +} + +static uint16_t *alloc_input(param_t *p) +{ + uint64_t size = tl_shape_size(&p->ifmap->shape, p->ifmap->fmt); + uint16_t *data = (uint16_t *)malloc(size); + if (!data) + return NULL; + + for (uint64_t i = 0; i < size/sizeof(uint16_t); i++) { + float val; + int RAND_MAX2 = RAND_MAX/2; //100 ~ -100 + val = (float)(rand()-RAND_MAX2)*100 / (float)RAND_MAX; + data[i] = cvk_convert_fp32_bf16(val); + } + return data; +} + +static uint16_t *alloc_output(param_t *p) +{ + uint64_t size = tl_shape_size(&p->ofmap->shape, p->ofmap->fmt); + return (uint16_t *)malloc(size); +} + +static void free_pooling_param( + cvk_context_t *cvk_ctx, + param_t *r) +{ + if (r->ifmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ifmap); + if (r->ofmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ofmap); +} + +static param_t random_pooling_param(cvk_context_t *cvk_ctx) +{ + + param_t p; + + memset(&p, 0, sizeof(p)); + +retry: + random_seed = clock(); +// random_seed = 3058538; + srand(random_seed); + +#if 0 + int in = 1; + int ic = 1; + int ih = 6; + int iw = 6; + //int opd0_sign = rand() % 2; + + p.kh = 3; + p.kw = 3; + p.stride_h = p.kh; + p.stride_w = p.kw; + p.pad_top = 3;//rand() % p.kh; + p.pad_bottom = 3;//rand() % p.kh; + p.pad_left = 3;//rand() % p.kw; + p.pad_right = 3;//rand() % p.kw; + + cvk_tl_shape_t ifmap_shape; + ifmap_shape.n = in; + ifmap_shape.c = ic; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + cvk_tl_shape_t ofmap_shape; + ofmap_shape.n = in; + ofmap_shape.c = ic; + ofmap_shape.h = pooling_oh(&p, ih); + ofmap_shape.w = pooling_ow(&p, iw); + +#else + int in = rand() % 5 + 1; + int ic = rand() % (3 * cvk_ctx->info.npu_num) + 1; + int ih = rand() % 30 + 3; + int iw = rand() % 30 + 6; + //int opd0_sign = rand() % 2; + + p.kh = rand() % 5 + 1; + p.kw = rand() % 5 + 1; + p.stride_h = rand() % (p.kh) + 1; + p.stride_w = rand() % (p.kw) + 1; + p.pad_top = rand() % p.kh; + p.pad_bottom = rand() % p.kh; + p.pad_left = rand() % p.kw; + p.pad_right = rand() % p.kw; + + cvk_tl_shape_t ifmap_shape; + ifmap_shape.n = in; + ifmap_shape.c = ic; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + cvk_tl_shape_t ofmap_shape; + ofmap_shape.n = in; + ofmap_shape.c = ic; + ofmap_shape.h = pooling_oh(&p, ih); + ofmap_shape.w = pooling_ow(&p, iw); +#endif +// cvk_fmt_t fmt = opd0_sign? 
CVK_FMT_I8: CVK_FMT_U8; + p.ofmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ofmap_shape, CVK_FMT_BF16, 1); + p.ifmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ifmap_shape, CVK_FMT_BF16, 1); + + int RAND_MAX2 = RAND_MAX/2; //20 ~ -20 + float ins_val = (float)(rand()-RAND_MAX2)*20 / (float)RAND_MAX; + p.ins_fp = cvk_convert_fp32_bf16(ins_val); + + if ((p.kh > pooling_ih_ext(&p, ih)) + || (p.kw > pooling_iw_ext(&p, iw)) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || (p.kh * p.kw == 1) + || !p.ofmap || !p.ifmap) { + printf("retry init_pooling_param\n"); + free_pooling_param(cvk_ctx, &p); + goto retry; + } + + return p; +} +static int index_get(int h, int w1, int w2) +{ + return h * w1 + w2; +} + +int native_pooling_min_bf16( + const uint16_t* i_fmap, + uint16_t* o_fmap, + int input_n, int input_c, int input_h, int input_w, + int kh, int kw, + int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, + int stride_h, int stride_w, + int ins_h, int ins_w, + int ins_h_last, int ins_w_last, + uint16_t ins_fp + ) +{ + if (ins_h != 0 || ins_w != 0 || ins_h_last != 0 || ins_w_last !=0) + return -1; + + int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); + int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); + + int output_h = calc_output_hw(h_after, kh, stride_h); + int output_w = calc_output_hw(w_after, kw, stride_w); + + const float min_init = FLT_MAX;//cvk_convert_bf16_fp32(ins_fp); + uint16_t *i_fmap_pad = NULL; + for (int nc = 0; nc < input_n * input_c; nc++) { + fill_pad_fmap_bf16(i_fmap, &i_fmap_pad, ins_fp, + pad_w_l, pad_w_r, pad_h_t, pad_h_b, + 0, 0, 0, 0, input_h, input_w); + + for (int ph = 0; ph < output_h; ++ph) { + for (int pw = 0; pw < output_w; ++pw) { + int hstart = ph * stride_h; + int wstart = pw * stride_w; + int pool_index = index_get(ph, output_w, pw); + float min = min_init; + for (int h = 0; h < kh; h++) { + for (int w = 0; w < kw; w++) { + int index = index_get((hstart + h), (input_w + pad_w_l + pad_w_r), + (w + wstart)); + float val = cvk_convert_bf16_fp32(i_fmap_pad[index]); + min = (val < min)? 
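+  /* The padded plane is (input_w + pad_w_l + pad_w_r) elements per row, so
+   * the window element at (h, w) is addressed as
+   * (hstart + h) * padded_width + (wstart + w). */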
val: min; + } + } + o_fmap[pool_index] = cvk_convert_fp32_bf16(min); + } + } + i_fmap += input_w * input_h; + o_fmap += output_w * output_h; + } + free(i_fmap_pad); + + return 0; +} + + +static int compare_results( + param_t *p, + uint16_t input[], + uint16_t output[]) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + + uint16_t *output_ref = alloc_output(p); + int ret = native_pooling_min_bf16( + input, output_ref, in, ic, ih, iw, p->kh, p->kw, + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right, + p->stride_h, p->stride_w, 0, 0, 0, 0, p->ins_fp); + if (ret) + goto fail_exit; + + ret = array_cmp_int8( + "Comparing results ...\n", (int8_t*) output_ref, (int8_t*)output, + tl_shape_size(&p->ofmap->shape, p->ofmap->fmt)); + if (ret != 0) { + printf("Comparison FAILED!!!\n"); + print_pooling_param(p); + ret = -1;; + } + +fail_exit: + free(output_ref); + + return ret; +} + +static int test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + param_t param = random_pooling_param(cvk_ctx); + //print_pooling_param(¶m); + uint16_t *input = alloc_input(¶m); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, param.ifmap, (uint8_t *)input); + cvk_ctx->ops->tiu_min_pooling(cvk_ctx, ¶m); + + uint16_t *output = (uint16_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, param.ofmap); + + int ret = compare_results(¶m, input, output); + + free_pooling_param(cvk_ctx, ¶m); + free(output); + free(input); + + return ret; +} + +static int test_min_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + for (uint64_t i = 0; i < 20; i++) + ret |= test_pooling(rt_handle, cvk_ctx); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret = test_min_pooling(rt_handle, cvk_ctx); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_bf16_tensor_add.c b/cviruntime/test/180x/test_180x_bf16_tensor_add.c new file mode 100644 index 000000000..51980e504 --- /dev/null +++ b/cviruntime/test/180x/test_180x_bf16_tensor_add.c @@ -0,0 +1,142 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_add_ref( + uint16_t *ref_low, + uint16_t *a_low, + uint16_t *b_low, + uint64_t size, int relu_enable) +{ + for (uint64_t i = 0; i < size; i++) { + float ta = cvk_convert_bf16_fp32(a_low[i]); + float tb = cvk_convert_bf16_fp32(b_low[i]); + float res = ta + tb; + if(relu_enable && res <0) + res = 0; + ref_low[i] = cvk_convert_fp32_bf16(res); + } +} + +static int test_tl_add(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 2; // 3 -> 2 for 1810 + int c = 9; // 39 -> 9 for 180x + int h = 7; + int w = 37; + int rshift_bits; + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + cvk_fmt_t fmt_type = CVK_FMT_BF16; + uint32_t size = n * c * h * w; + uint32_t data_size = size * (fmt_type == CVK_FMT_BF16 ? 
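+  /* bf16 elements take 2 bytes. The bitwise compare at the end of this test
+   * assumes the TIU rounds its bf16 sum exactly like the fp32-based
+   * tl_add_ref() above. */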
2 : 1); + uint16_t *a_low_data = (uint16_t *)malloc(data_size); + uint16_t *b_low_data = (uint16_t *)malloc(data_size); + uint16_t *ref_low_data = (uint16_t *)malloc(data_size); + if (!a_low_data || !b_low_data || !ref_low_data) { + ret = -1; + goto fail_exit; + } + + cvk_tl_t *tl_a_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_b_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_res_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + uint16_t *res_low_data = NULL; + if (!tl_a_low || !tl_b_low || !tl_res_low) { + ret = -1; + goto fail_exit_2; + } + + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + for (uint64_t i = 0; i < size; i++) { + a_low_data[i] = cvk_convert_fp32_bf16(i); + b_low_data[i] = cvk_convert_fp32_bf16(i); + } + rshift_bits = 0; + + tl_add_ref(ref_low_data, + a_low_data, + b_low_data, + size, relu_enable); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_low, (uint8_t *)a_low_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b_low, (uint8_t *)b_low_data); + cvk_tiu_add_param_t p4; + p4.res_high = 0; + p4.res_low = tl_res_low; + p4.a_high = 0; + p4.a_low = tl_a_low; + p4.b_is_const = 0; + p4.b.high = 0; + p4.b.low = tl_b_low; + p4.rshift_bits = rshift_bits; + p4.relu_enable = relu_enable; + cvk_ctx->ops->tiu_add(cvk_ctx, &p4); + res_low_data = (uint16_t*)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_low); + + for (uint32_t i = 0; i < size; i++) { + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%u], got %x, exp %x\n", + i, res_low_data[i], ref_low_data[i]); + ret = -1; + break; + } + } + + free(res_low_data); + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_low); + +fail_exit: + free(a_low_data); + free(b_low_data); + free(ref_low_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + int round_mode; + round_mode = cvk_set_store_feround(); + + ret |= test_tl_add(rt_handle, cvk_ctx, 0); + ret |= test_tl_add(rt_handle, cvk_ctx, 1); + + cvk_restore_feround(round_mode); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_bf16_tensor_add_const.c b/cviruntime/test/180x/test_180x_bf16_tensor_add_const.c new file mode 100644 index 000000000..8b2bc51ff --- /dev/null +++ b/cviruntime/test/180x/test_180x_bf16_tensor_add_const.c @@ -0,0 +1,133 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_add_const_ref( + uint16_t *ref_low, + uint16_t *a_low, + uint16_t b, + uint64_t size, int relu_enable) +{ + for (uint64_t i = 0; i < size; i++) { + float ta = cvk_convert_bf16_fp32(a_low[i]); + float tb = cvk_convert_bf16_fp32(b); + float res = ta + tb; + if(relu_enable && res <0) + res = 0; + ref_low[i] = cvk_convert_fp32_bf16(res); + } +} + +static int test_tl_add_const(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 3; + int c = 9; // 39 -> 9 for 180x + int h = 7; + int w = 37; + cvk_tl_shape_t 
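+  /* Same as the tensor-add test, but b is a scalar bf16 constant broadcast
+   * to every element. Note that p4 below is declared without memset, so the
+   * fields left commented out (b_const.is_signed, rshift_bits) stay
+   * uninitialized. */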
tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + cvk_fmt_t fmt_type = CVK_FMT_BF16; + uint32_t size = n * c * h * w; + uint32_t data_size = size * (fmt_type == CVK_FMT_BF16 ? 2 : 1); + uint16_t *a_low_data = (uint16_t *)malloc(data_size); + uint16_t *ref_low_data = (uint16_t *)malloc(data_size); + if (!a_low_data || !ref_low_data) { + ret = -1; + goto fail_exit; + } + + cvk_tl_t *tl_a_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_res_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + if (!tl_a_low || !tl_res_low) { + ret = -1; + goto fail_exit_2; + } + + uint16_t b = cvk_convert_fp32_bf16(-3); + + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + + for (uint64_t i = 0; i < size; i++) { + a_low_data[i] = cvk_convert_fp32_bf16(i); + } + + tl_add_const_ref(ref_low_data, + a_low_data, + b, size,relu_enable); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_low, (uint8_t*) a_low_data); + + cvk_tiu_add_param_t p4; + p4.res_high = 0; + p4.res_low = tl_res_low; + p4.a_high = 0; + p4.a_low = tl_a_low; + p4.b_is_const = 1; + p4.b_const.val = b; +// p4.b_const.is_signed = b_is_signed; +// p4.rshift_bits = rshift_bits; + p4.relu_enable = relu_enable; + cvk_ctx->ops->tiu_add(cvk_ctx, &p4); + +// uint8_t *res_high_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_high); + uint16_t *res_low_data = (uint16_t *) tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_low); + for (uint64_t i = 0; i < size; i++) { + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + ret = -1; + break; + } + } + + free(res_low_data); + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_low); + +fail_exit: + free(a_low_data); + free(ref_low_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + + ret |= test_tl_add_const(rt_handle, cvk_ctx, 0); + ret |= test_tl_add_const(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_bf16_tensor_ge.c b/cviruntime/test/180x/test_180x_bf16_tensor_ge.c new file mode 100644 index 000000000..8dfbce57d --- /dev/null +++ b/cviruntime/test/180x/test_180x_bf16_tensor_ge.c @@ -0,0 +1,127 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_ge_ref(uint16_t *a, uint16_t *b, uint16_t *result, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) { + float fa = cvk_convert_bf16_fp32(a[i]); + float fb = cvk_convert_bf16_fp32(b[i]); + float fge; + if (fa >= fb) + fge = 1; + else + fge = 0; + result[i] = cvk_convert_fp32_bf16(fge); + } +} + +static int test_tl_ge(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 2; // 3 -> 2 for 1810 + int c = 9; // 39 -> 9 for 180x + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + cvk_fmt_t fmt_type = CVK_FMT_BF16; + uint32_t size = n * c * h * w; + uint32_t data_size = 
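+  /* tiu_ge writes bf16 1.0 where a >= b and bf16 0.0 elsewhere, mirroring
+   * tl_ge_ref() above; the signed ramp inputs make both outcomes occur. */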
size * (fmt_type == CVK_FMT_BF16 ? 2 : 1); + uint16_t *a_data = (uint16_t *)malloc(data_size); + uint16_t *b_data = (uint16_t *)malloc(data_size); + uint16_t *ref_data = (uint16_t *)malloc(data_size); + if (!a_data || !b_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) + a_data[i] = cvk_convert_fp32_bf16((int8_t)(i % 256)); + + for (uint32_t i = 0; i < size; i++) + b_data[i] = cvk_convert_fp32_bf16((int8_t)(100 - i % 256)); + + tl_ge_ref(a_data, b_data, ref_data, size); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_b = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_ge = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + if (!tl_a || !tl_b || !tl_ge) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b, (uint8_t *)b_data); + + cvk_tiu_ge_param_t p; + memset(&p, 0, sizeof(p)); + p.ge = tl_ge; + p.a = tl_a; + p.b_is_const = 0; + p.b = tl_b; + cvk_ctx->ops->tiu_ge(cvk_ctx, &p); + uint16_t *ge_data = (uint16_t*)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_ge); + + for (uint32_t i = 0; i < size; i++) { + if (ge_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%u], got %x, exp %x\n", + i, ge_data[i], ref_data[i]); + ret = -1; + break; + } + } + + free(ge_data); + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_ge); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + +fail_exit: + free(a_data); + free(b_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_ge(rt_handle, cvk_ctx, 0); + ret |= test_tl_ge(rt_handle, cvk_ctx, 1); + + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_bf16_tensor_ge_const.c b/cviruntime/test/180x/test_180x_bf16_tensor_ge_const.c new file mode 100644 index 000000000..b4bb4448a --- /dev/null +++ b/cviruntime/test/180x/test_180x_bf16_tensor_ge_const.c @@ -0,0 +1,124 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_ge_const_ref(uint16_t *a, uint16_t b, uint16_t *result, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) { + float fa = cvk_convert_bf16_fp32(a[i]); + float fb = cvk_convert_bf16_fp32(b); + float fge; + if (fa >= fb) + fge = 1; + else + fge = 0; + result[i] = cvk_convert_fp32_bf16(fge); + } +} + +static int test_tl_ge_const(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 3; + int c = 9; // 39 -> 9 for 180x + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + cvk_fmt_t fmt_type = CVK_FMT_BF16; + uint32_t size = n * c * h * w; + uint32_t data_size = size * (fmt_type == CVK_FMT_BF16 ? 
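+  /* a holds a ramp of bf16-rounded integers and b is the constant 20, so
+   * the expected pattern is 0.0 below the threshold and 1.0 from 20 upward. */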
2 : 1); + + uint16_t *a_data = (uint16_t *)malloc(data_size); + uint16_t *ref_data = (uint16_t *)malloc(data_size); + if (!a_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) + a_data[i] = cvk_convert_fp32_bf16(i); + //a_data[i] = cvk_convert_fp32_bf16(rand()%100 - 50); + + uint16_t b = cvk_convert_fp32_bf16(20); + + tl_ge_const_ref(a_data, b, ref_data, size); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_ge = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + if (!tl_a || !tl_ge) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + cvk_tiu_ge_param_t p; + memset(&p, 0, sizeof(p)); + p.ge = tl_ge; + p.a = tl_a; + p.b_is_const = 1; + p.b_const.val = b; + p.b_const.is_signed = 1; + + cvk_ctx->ops->tiu_ge(cvk_ctx, &p); + + uint16_t *ge_data = (uint16_t*) tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_ge); + + for (uint64_t i = 0; i < size; i++) { + if (ge_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, ge_data[i], ref_data[i]); + ret = -1; + break; + } + } + + free(ge_data); + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_ge); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + +fail_exit: + free(a_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_ge_const(rt_handle, cvk_ctx, 0); + ret |= test_tl_ge_const(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_bf16_tensor_mac.c b/cviruntime/test/180x/test_180x_bf16_tensor_mac.c new file mode 100644 index 000000000..161c2384a --- /dev/null +++ b/cviruntime/test/180x/test_180x_bf16_tensor_mac.c @@ -0,0 +1,147 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_mac_ref( + uint16_t *ref, + uint16_t *a, uint16_t *b, uint16_t *c, + uint64_t size, int relu_enable) +{ + for (uint64_t i = 0; i < size; i++) { + float ta = cvk_convert_bf16_fp32(a[i]); + float tb = cvk_convert_bf16_fp32(b[i]); + float tc = cvk_convert_bf16_fp32(c[i]); + float res = ta * tb + tc; + + if(relu_enable) + if(res<0) + res=0; + ref[i] = cvk_convert_fp32_bf16(res); + } +} + +static int test_tl_mac(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int lshift_bits = 1; + int rshift_bits = 3; + int n = 2; // 3 -> 2 for 1810 + int c = 9; // 39 -> 9 for 180x + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + cvk_fmt_t fmt_type = CVK_FMT_BF16; + + uint32_t size = n * c * h * w; + uint32_t data_size = size * (fmt_type == CVK_FMT_BF16 ? 
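+  /* tiu_mac computes c = a * b + c in place: tl_c is uploaded as the
+   * accumulator and read back as the result. res_is_int8 is toggled in step
+   * with relu_enable in this test. */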
2 : 1); + uint16_t *a_data = (uint16_t *)malloc(data_size); + uint16_t *b_data = (uint16_t *)malloc(data_size); + uint16_t *c_data = (uint16_t *)malloc(data_size); + uint16_t *ref_data = (uint16_t *)malloc(data_size); + if (!a_data || !b_data || !c_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_b = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_c = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + if (!tl_a || !tl_b || !tl_c) { + ret = -1; + goto fail_exit_2; + } + + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + for (uint32_t i = 0; i < size; i++) { + a_data[i] = cvk_convert_fp32_bf16(rand()); + b_data[i] = cvk_convert_fp32_bf16(rand()); + c_data[i] = cvk_convert_fp32_bf16(rand()); + } + + tl_mac_ref(ref_data, + a_data, b_data, c_data, + size, relu_enable); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b, (uint8_t *)b_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_c, (uint8_t *)c_data); + + cvk_tiu_mac_param_t p2; + p2.res_high = 0; + p2.res_low = tl_c; + p2.res_is_int8 = relu_enable; + p2.a = tl_a; + p2.b_is_const = 0; + p2.b = tl_b; + p2.lshift_bits = lshift_bits; + p2.rshift_bits = rshift_bits; + p2.relu_enable = relu_enable; + cvk_ctx->ops->tiu_mac(cvk_ctx, &p2); + uint16_t *mac_data = (uint16_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_c); + + for (uint32_t i = 0; i < size; i++) { + if (mac_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at mac_data[%u], got %d, exp %d\n", + i, mac_data[i], ref_data[i]); + ret = -1; + break; + } + } + + free(mac_data); + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_c); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + +fail_exit: + free(a_data); + free(b_data); + free(c_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + int round_mode; + round_mode = cvk_set_store_feround(); + ret |= test_tl_mac(rt_handle, cvk_ctx, 0); + ret |= test_tl_mac(rt_handle, cvk_ctx, 1); + cvk_restore_feround(round_mode); + + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_bf16_tensor_mac_const.c b/cviruntime/test/180x/test_180x_bf16_tensor_mac_const.c new file mode 100644 index 000000000..e048ab1ff --- /dev/null +++ b/cviruntime/test/180x/test_180x_bf16_tensor_mac_const.c @@ -0,0 +1,143 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_mac_const_ref( + uint16_t *ref_low, + uint16_t *a, uint16_t b_const, + uint16_t *c_low, + uint64_t size, int relu_enable) +{ + for (uint64_t i = 0; i < size; i++) { + float ta = cvk_convert_bf16_fp32(a[i]); + float tb = cvk_convert_bf16_fp32(b_const); + float tc = cvk_convert_bf16_fp32(c_low[i]); + float res = ta * tb + tc; + + if(relu_enable) + { + if(res<0) + res=0; + } + ref_low[i] = cvk_convert_fp32_bf16(res); + } +} + +static int test_tl_mac_const(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) 
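+/* Scalar variant of the mac test: b is the bf16 constant 37 and the
+ * accumulator c starts as a ramp that is overwritten in place. */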
+{ + int ret = 0; + int n = 3; + int c = 9; // 39 -> 9 for 180x + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + cvk_fmt_t fmt_type = CVK_FMT_BF16; + uint32_t size = n * c * h * w; + uint32_t data_size = size * (fmt_type == CVK_FMT_BF16 ? 2 : 1); + uint16_t *a_data = (uint16_t *)malloc(data_size); + uint16_t *c_low_data = (uint16_t *)malloc(data_size); + uint16_t *ref_low_data = (uint16_t *)malloc(data_size); + if (!a_data || !c_low_data || !ref_low_data) { + ret = -1; + goto fail_exit; + } + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_c_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + if (!tl_a || !tl_c_low) { + ret = -1; + goto fail_exit_2; + } + + uint16_t b_const = cvk_convert_fp32_bf16(37); + + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + for (uint32_t i = 0; i < size; i++) { + a_data[i] = cvk_convert_fp32_bf16(rand() % 256); + c_low_data[i] = cvk_convert_fp32_bf16(i); + } + + tl_mac_const_ref(ref_low_data, + a_data, b_const, c_low_data, + size, relu_enable); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t*) a_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_c_low, (uint8_t*) c_low_data); + cvk_tiu_mac_param_t p3; + p3.res_high = 0; + p3.res_low = tl_c_low; + p3.res_is_int8 = 1;//relu_enable; + p3.a = tl_a; + p3.b_is_const = 1; + p3.b_const.val = b_const; + p3.relu_enable = relu_enable; + + cvk_ctx->ops->tiu_mac(cvk_ctx, &p3); + uint16_t *mac_low_data = (uint16_t*) tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_c_low); + for (uint64_t i = 0; i < size; i++) { + if (mac_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at mac_low_data[%" PRIu64 "], got %d, exp %d\n", + i, mac_low_data[i], ref_low_data[i]); + ret = -1; + break; + } + } + + free(mac_low_data); + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_c_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + + +fail_exit: + free(a_data); + free(c_low_data); + free(ref_low_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + int round_mode; + round_mode = cvk_set_store_feround(); + + ret |= test_tl_mac_const(rt_handle, cvk_ctx, 0); + ret |= test_tl_mac_const(rt_handle, cvk_ctx, 1); + + cvk_restore_feround(round_mode); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_bf16_tensor_max.c b/cviruntime/test/180x/test_180x_bf16_tensor_max.c new file mode 100644 index 000000000..a768fcd40 --- /dev/null +++ b/cviruntime/test/180x/test_180x_bf16_tensor_max.c @@ -0,0 +1,126 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_max_ref(uint16_t *a, uint16_t *b, uint16_t *max, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) { + float fa = cvk_convert_bf16_fp32(a[i]); + float fb = cvk_convert_bf16_fp32(b[i]); + float fmax; + if (fa > fb) + fmax = fa; + else + fmax = fb; + max[i] = cvk_convert_fp32_bf16(fmax); + } +} + +static int test_tl_max(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 2; // 3 -> 2 
for 1810 + int c = 9; // 39 -> 9 for 180x + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + cvk_fmt_t fmt_type = CVK_FMT_BF16; + uint32_t size = n * c * h * w; + uint32_t data_size = size * (fmt_type == CVK_FMT_BF16 ? 2 : 1); + uint16_t *a_data = (uint16_t *)malloc(data_size); + uint16_t *b_data = (uint16_t *)malloc(data_size); + uint16_t *ref_data = (uint16_t *)malloc(data_size); + if (!a_data || !b_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) + a_data[i] = cvk_convert_fp32_bf16((int8_t)(i % 256)); + + for (uint32_t i = 0; i < size; i++) + b_data[i] = cvk_convert_fp32_bf16((int8_t)(100 - i % 256)); + + tl_max_ref(a_data, b_data, ref_data, size); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_b = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_max = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + if (!tl_a || !tl_b || !tl_max) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b, (uint8_t *)b_data); + + cvk_tiu_max_param_t p; + memset(&p, 0, sizeof(p)); + p.max = tl_max; + p.a = tl_a; + p.b_is_const = 0; + p.b = tl_b; + cvk_ctx->ops->tiu_max(cvk_ctx, &p); + uint16_t *max_data = (uint16_t*)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_max); + + for (uint32_t i = 0; i < size; i++) { + if (max_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%u], got %x, exp %x\n", + i, max_data[i], ref_data[i]); + ret = -1; + break; + } + } + + free(max_data); + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_max); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + +fail_exit: + free(a_data); + free(b_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_max(rt_handle, cvk_ctx, 0); + ret |= test_tl_max(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_bf16_tensor_max_const.c b/cviruntime/test/180x/test_180x_bf16_tensor_max_const.c new file mode 100644 index 000000000..72ffe85c7 --- /dev/null +++ b/cviruntime/test/180x/test_180x_bf16_tensor_max_const.c @@ -0,0 +1,120 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_max_const_ref(uint16_t *a, uint16_t b, uint16_t *max, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) { + if (cvk_convert_bf16_fp32(a[i]) > cvk_convert_bf16_fp32(b)) + max[i] = a[i]; + else + max[i] = b; + } +} + +static int test_tl_max_const(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 3; + int c = 9; // 39 -> 9 for 180x + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + cvk_fmt_t fmt_type = CVK_FMT_BF16; + uint32_t size = n * c * h * w; + uint32_t data_size = size * (fmt_type == CVK_FMT_BF16 ? 
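+  /* tl_max_const_ref copies either a[i] or b verbatim (no re-rounding), so
+   * the bit-exact compare below is safe even for values that bf16 cannot
+   * represent exactly. */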
2 : 1); + + uint16_t *a_data = (uint16_t *)malloc(data_size); + uint16_t *ref_data = (uint16_t *)malloc(data_size); + if (!a_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) + a_data[i] = cvk_convert_fp32_bf16(i); + //a_data[i] = cvk_convert_fp32_bf16(rand()%100 - 50); + + uint16_t b = cvk_convert_fp32_bf16(20); + + tl_max_const_ref(a_data, b, ref_data, size); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_max = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + uint16_t *max_data = NULL; + if (!tl_a || !tl_max) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + cvk_tiu_max_param_t p; + memset(&p, 0, sizeof(p)); + p.max = tl_max; + p.a = tl_a; + p.b_is_const = 1; + p.b_const.val = b; + p.b_const.is_signed = 1; + + cvk_ctx->ops->tiu_max(cvk_ctx, &p); + + max_data = (uint16_t*) tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_max); + + for (uint32_t i = 0; i < size; i++) { + if (max_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%u], got %x, exp %x\n", + i, max_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_max); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + free(max_data); + +fail_exit: + free(a_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_max_const(rt_handle, cvk_ctx, 0); + ret |= test_tl_max_const(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_bf16_tensor_min.c b/cviruntime/test/180x/test_180x_bf16_tensor_min.c new file mode 100644 index 000000000..8fda0e58e --- /dev/null +++ b/cviruntime/test/180x/test_180x_bf16_tensor_min.c @@ -0,0 +1,124 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_min_ref(uint16_t *a, uint16_t *b, uint16_t *max, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) { + float fa = cvk_convert_bf16_fp32(a[i]); + float fb = cvk_convert_bf16_fp32(b[i]); + float fmax; + if (fa > fb) + fmax = fb; + else + fmax = fa; + max[i] = cvk_convert_fp32_bf16(fmax); + } +} + +static int test_tl_min(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 2; // 3 -> 2 for 1810 + int c = 9; // 39 -> 9 for 180x + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + cvk_fmt_t fmt_type = CVK_FMT_BF16; + uint32_t size = n * c * h * w; + uint32_t data_size = size * (fmt_type == CVK_FMT_BF16 ? 
2 : 1); + + uint16_t *a_data = (uint16_t *)malloc(data_size); + uint16_t *b_data = (uint16_t *)malloc(data_size); + uint16_t *ref_data = (uint16_t *)malloc(data_size); + if (!a_data || !b_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) + a_data[i] = cvk_convert_fp32_bf16(rand()); + + for (uint32_t i = 0; i < size; i++) + b_data[i] = cvk_convert_fp32_bf16(rand()/2); + + tl_min_ref(a_data, b_data, ref_data, size); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_b = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_min = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + uint16_t *min_data = NULL; + if (!tl_a || !tl_b || !tl_min) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b, (uint8_t *)b_data); + cvk_tiu_min_param_t p6; + p6.min = tl_min; + p6.a = tl_a; + p6.b_is_const = 0; + p6.b = tl_b; + cvk_ctx->ops->tiu_min(cvk_ctx, &p6); + min_data = (uint16_t*)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_min); + + for (uint32_t i = 0; i < size; i++) { + if (min_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%u], got %x, exp %x\n", + i, min_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_min); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + free(min_data); + +fail_exit: + free(a_data); + free(b_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_min(rt_handle, cvk_ctx, 0); + ret |= test_tl_min(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_bf16_tensor_min_const.c b/cviruntime/test/180x/test_180x_bf16_tensor_min_const.c new file mode 100644 index 000000000..617431f87 --- /dev/null +++ b/cviruntime/test/180x/test_180x_bf16_tensor_min_const.c @@ -0,0 +1,116 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_min_const_ref(uint16_t *a, uint16_t b, uint16_t *max, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) { + if (cvk_convert_bf16_fp32(a[i]) > cvk_convert_bf16_fp32(b)) + max[i] = b; + else + max[i] = a[i]; + } +} + +static int test_tl_min_const(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 3; + int c = 9; // 39 -> 9 for 180x + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + cvk_fmt_t fmt_type = CVK_FMT_BF16; + uint32_t size = n * c * h * w; + uint32_t data_size = size * (fmt_type == CVK_FMT_BF16 ? 
2 : 1); + + uint16_t *a_data = (uint16_t *)malloc(data_size); + uint16_t *ref_data = (uint16_t *)malloc(data_size); + if (!a_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) + a_data[i] = cvk_convert_fp32_bf16(rand() % 100 -50); + + uint16_t b = cvk_convert_fp32_bf16(20); + + tl_min_const_ref(a_data, b, ref_data, size); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_min = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + uint16_t *min_data = NULL; + if (!tl_a || !tl_min) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + cvk_tiu_min_param_t p7; + p7.min = tl_min; + p7.a = tl_a; + p7.b_is_const = 1; + p7.b_const.val = b; + p7.b_const.is_signed = 1; + cvk_ctx->ops->tiu_min(cvk_ctx, &p7); + min_data = (uint16_t*) tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_min); + + for (uint32_t i = 0; i < size; i++) { + if (min_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%u], got %x, exp %x\n", + i, min_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_min); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + free(min_data); + +fail_exit: + free(a_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_min_const(rt_handle, cvk_ctx, 0); + ret |= test_tl_min_const(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_bf16_tensor_mul.c b/cviruntime/test/180x/test_180x_bf16_tensor_mul.c new file mode 100644 index 000000000..6475d7494 --- /dev/null +++ b/cviruntime/test/180x/test_180x_bf16_tensor_mul.c @@ -0,0 +1,151 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_mul_ref(uint16_t *ofmap, uint16_t *a, uint16_t *b, uint64_t size, int shift_bits, int relu_enable, cvk_fmt_t fmt_type) +{ + if(fmt_type == CVK_FMT_BF16) { + for (uint64_t i = 0; i < size; i++) { + float tmp = cvk_convert_bf16_fp32(a[i]) * cvk_convert_bf16_fp32(b[i]); + if(relu_enable) + if(tmp<0) + tmp=0; + ofmap[i] = cvk_convert_fp32_bf16(tmp); + } + } else { + for (uint64_t i = 0; i < size; i++) { + int32_t tmp = a[i] * b[i]; + tmp += 1 << (shift_bits - 1); + tmp >>= shift_bits; + if (tmp > 127) + tmp = 127; + else if (tmp < -128) + tmp = -128; + if(relu_enable) + if(tmp<0) + tmp=0; + ofmap[i] = tmp; + } + } +} + +static int test_tl_mul(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 2; // 3 -> 2 for 1810 + int c = 9; // 39 -> 9 for 180x + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + cvk_fmt_t fmt_type = CVK_FMT_BF16; + uint32_t size = n * c * h * w; + uint64_t data_size = size * (fmt_type == CVK_FMT_BF16 ? 
2 : 1); + int shift_bits = 1; + + uint16_t *a_data = (uint16_t *)malloc(data_size); + uint16_t *b_data = (uint16_t *)malloc(data_size); + uint16_t *ref_data = (uint16_t *)malloc(data_size); + if (!a_data || !b_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t relu_enable = 0; relu_enable < 2; relu_enable++) + { + for (uint32_t i = 0; i < size; i++) { + a_data[i] = cvk_convert_fp32_bf16(random()%0x10); + b_data[i] = cvk_convert_fp32_bf16(random()); + } + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_b = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_res_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + uint16_t *res_low_data = NULL; + if (!tl_a || !tl_b || !tl_res_low) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b, (uint8_t *)b_data); + + cvk_tiu_mul_param_t p1; + p1.res_high = NULL; + p1.res_low = tl_res_low; + p1.a = tl_a; + p1.b_is_const = 0; + p1.b = tl_b; + p1.rshift_bits = shift_bits; + p1.relu_enable = relu_enable; + cvk_ctx->ops->tiu_mul(cvk_ctx, &p1); + + res_low_data = (uint16_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_low); + + tl_mul_ref(ref_data, a_data, b_data, size, shift_bits, relu_enable, fmt_type); + + for (uint64_t i = 0; i < size; i++) { + if (res_low_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %x, exp %x\n", + i, res_low_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + free(res_low_data); + } + +fail_exit: + free(a_data); + free(b_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + int round_mode; + round_mode = cvk_set_store_feround(); + + ret |= test_tl_mul(rt_handle, cvk_ctx, 0); + ret |= test_tl_mul(rt_handle, cvk_ctx, 1); + + cvk_restore_feround(round_mode); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_bf16_tensor_mul_const.c b/cviruntime/test/180x/test_180x_bf16_tensor_mul_const.c new file mode 100644 index 000000000..ba2e334be --- /dev/null +++ b/cviruntime/test/180x/test_180x_bf16_tensor_mul_const.c @@ -0,0 +1,147 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_mul_const_ref( + uint16_t *ofmap, uint16_t *ifmap, uint64_t size, uint16_t mul_const, int shift_bits, int relu_enable, cvk_fmt_t fmt_type) +{ + + if(fmt_type == CVK_FMT_BF16) { + for (uint64_t i = 0; i < size; i++) { + float tmp = cvk_convert_bf16_fp32(ifmap[i]) * cvk_convert_bf16_fp32(mul_const); + if(relu_enable) + if(tmp<0) + tmp=0; + ofmap[i] = cvk_convert_fp32_bf16(tmp); + } + } else { + for (uint64_t i = 0; i < size; i++) { + int32_t tmp = ifmap[i] * (int16_t) mul_const; + tmp += 1 << (shift_bits - 1); + tmp >>= shift_bits; + if (tmp > 127) + tmp = 127; + else if (tmp < -128) + tmp = -128; + if(relu_enable) + if(tmp<0) + 
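// ReLU: clamp negatives to zero after rounding/saturation +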
tmp=0; + ofmap[i] = tmp; + } + } +} + +static int test_tl_mul_const(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 3; + int c = 9; // 39 -> 9 for 180x + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + cvk_fmt_t fmt_type = CVK_FMT_BF16; + uint32_t data_size = size * (fmt_type == CVK_FMT_BF16 ? 2 : 1); + int shift_bits = 1; + + uint16_t *ifmap_data = (uint16_t *)malloc(data_size); + uint16_t *ref_data = (uint16_t *)malloc(data_size); + cvk_tl_t *tl_ifmap = NULL, *tl_ofmap = NULL; + if (!ifmap_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + tl_ifmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + tl_ofmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + if (!tl_ifmap || !tl_ofmap) { + ret = -1; + goto fail_exit_2; + } + + for (uint32_t relu_enable = 0; relu_enable < 2; relu_enable++) + { + for (uint32_t i = 0; i < size; i++) + ifmap_data[i] = cvk_convert_fp32_bf16(random() % 256); + + uint16_t mul_const = cvk_convert_fp32_bf16(20); + + tl_mul_const_ref(ref_data, ifmap_data, size, mul_const, shift_bits, relu_enable, fmt_type); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_ifmap, (uint8_t *)ifmap_data); + + cvk_tiu_mul_param_t p; + memset(&p, 0, sizeof(p)); + p.res_high = NULL; + p.res_low = tl_ofmap; + p.a = tl_ifmap; + p.b_is_const = 1; + p.b_const.val = mul_const; + p.relu_enable = relu_enable; + + cvk_ctx->ops->tiu_mul(cvk_ctx, &p); + + uint16_t *ofmap_data = (uint16_t*) tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_ofmap); + + for (uint32_t i = 0; i < size; i++) { + if (ofmap_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%u], got %x(%f), exp %x(%f)\n", + i, + ofmap_data[i], cvk_convert_bf16_fp32(ofmap_data[i]), + ref_data[i], cvk_convert_bf16_fp32(ref_data[i])); + ret = -1; + break; + } + } + + free(ofmap_data); + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_ofmap); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_ifmap); + +fail_exit: + free(ifmap_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_mul_const(rt_handle, cvk_ctx, 0); + ret |= test_tl_mul_const(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_conv.c b/cviruntime/test/180x/test_180x_conv.c new file mode 100644 index 000000000..620058b45 --- /dev/null +++ b/cviruntime/test/180x/test_180x_conv.c @@ -0,0 +1,828 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" +#include "test_native_ref.h" + +#define INVALIDE_STRIDE (-1) +typedef struct { + int random_seed; + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int stride_w; + int output_c; + int using_bias; + int bReLU_EN; + int r_shift_m; + int opd0_sign; + int opd1_sign; + int opd2_sign; +} conv_param_t; + +static int 
index_get(int h, int w1, int w2) +{ + return h * w1 + w2; +} + +static int matrix_dot_mult( + int8_t *A, int8_t *B, int dim_n, int dim_m, + int opd0_sign) +{ + int sum = 0; + for (int i=0; iinput_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + int r_shift_bits = p_param->r_shift_m; + int do_relu = p_param->bReLU_EN; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + int8_t *i_fmap_pad_ker = (int8_t *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return -1; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = 0; + + int8_t *i_fmap_pad = NULL; + int8_t *kernel_after = NULL; + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c]; //bias+c ; + } + } + } + + if (do_relu) + relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + + // ofmap is int8_t, signed + ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow, + &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1, + /*sign_unsign=*/1); + + if (ret) + goto error_release; + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + +error_release: + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static uint8_t * transform_weight(const cvk_tl_shape_t *s, uint8_t before[]) +{ + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint32_t size = ic * oc * kh * kw; + uint8_t *after = (uint8_t *)malloc(size); + + /* + * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) + */ + for (uint32_t oci = 0; oci < oc; oci++) { + for (uint32_t ici = 0; ici < ic; ici++) { + for (uint32_t khi = 0; khi < kh; khi++) { + for 
(uint32_t kwi = 0; kwi < kw; kwi++) { + uint32_t src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + uint32_t dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + uint8_t *data) +{ +#if 0 + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + uint8_t *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. bm_memcpy_s2d regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. + */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //bmmem_device_t ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = bm_memcpy_s2d(*ctx, ab_dev_mem, transformed_data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, transformed_data); + assert(ret == BM_SUCCESS); + + cvk_tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_I8; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1822_tensor_tgmem_default_stride(tdma_tg.shape, tdma_tg.fmt); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1822_tensor_lmem_default_stride(cvk_ctx, tdma_shape, FMT_I8, 0); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1822_tdma_g2l_tensor_copy(cvk_ctx, &p); + test_submit(ctx); + bmmem_device_free(*ctx, dev_mem); + delete[] transformed_data; +#endif + + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint8_t *transformed_data = transform_weight(s, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(1, oc, kh * kw, ic); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, transformed_data); + + free(transformed_data); +} + +static int8_t * transform_bias(int oc, int16_t before[]) +{ + int8_t *after = (int8_t *)malloc(2 * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = before[i] & 0xff; + after[i + oc] = (before[i] >> 8) & 0xff; + } + return after; +} + +static void put_conv_bias( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + int16_t *data) +{ +#if 0 + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2, oc, 1, 1); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + int8_t *transformed_data = transform_bias(oc, data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (uint8_t *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1822_tensor_tgmem_default_stride(tg.shape, tg.fmt); + + 
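// legacy bmk1822 path (disabled); the live code below stages data via tensor_copy_s2d_g2l +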
bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_tensor_copy(cvk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, dev_mem); + delete[] transformed_data; +#endif + + int oc = tl->shape.c; + int8_t *transformed_data = transform_bias(oc, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(2, oc, 1, 1); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, (uint8_t *)transformed_data); + + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static int8_t * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + + int8_t *buf = (int8_t *)malloc(sizeof(int8_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static int8_t * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + int8_t *buf = (int8_t *)malloc(sizeof(int8_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static int16_t * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + int16_t *bias = (int16_t *)malloc(sizeof(int16_t) * oc); + if (!bias) + return NULL; + + for (int i = 0; i < oc; i++) + bias[i] = rand() % 65536 - 32768; + + return bias; +} + +static cvk_tl_t * conv_ifmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd0_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 1); +} + +static cvk_tl_t * conv_weight_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd1_sign? 
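/* weights are signed in these tests (opd1_sign is forced to 1) */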
CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static cvk_tl_t * conv_ofmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, CVK_FMT_I8, 1); +} + +static cvk_tl_t * conv_bias_tensor( + cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd2_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const cvk_tiu_pt_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + + dst->ifmap = conv_ifmap_tensor(cvk_ctx, p); + dst->weight = conv_weight_tensor(cvk_ctx, p); + dst->ofmap = conv_ofmap_tensor(cvk_ctx, p); + dst->bias = NULL; + dst->ps32_mode = 0; + if (p->using_bias) + dst->bias = conv_bias_tensor(cvk_ctx, p); + + dst->w_is_const = 0; +} + +static void free_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *r, + const conv_param_t *p) +{ + if (p->using_bias && r->bias) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->bias); + if (r->ofmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ofmap); + if (r->weight) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->weight); + if (r->ifmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ifmap); +} + +static void _init_conv_param(conv_param_t *p, int stride_w, int stride_h) +{ + printf("init_conv_param\n"); + memset(p, 0, sizeof(*p)); + p->random_seed = clock(); + srand(p->random_seed); + +retry: + p->input_n = rand() % 5 + 1; + p->input_c = rand() % (5 * 32) + 1; + p->kh = rand() % 7 + 1; + p->kw = rand() % 7 + 1; + p->input_h = rand() % 40 + p->kh + (INVALIDE_STRIDE == stride_h ? 0 : stride_h); + p->input_w = rand() % 40 + p->kw + (INVALIDE_STRIDE == stride_w ? 0 : stride_w); + p->output_c = rand() % 10 + 3; + p->stride_h = INVALIDE_STRIDE == stride_h ? rand() % (p->kh) + 1 : stride_h; + p->stride_w = INVALIDE_STRIDE == stride_w ? 
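/* no stride given: randomize in [1, kw] */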
rand() % (p->kw) + 1 : stride_w; + p->ins_h = rand() % p->kh; + p->ins_w = rand() % p->kw; + p->ins_h_last = rand() % p->kh; + p->ins_w_last = rand() % p->kw; + p->dh = rand() % 3 + 1; + p->dw = rand() % 3 + 1; + + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + p->pad_top = rand() % kh_ext; + p->pad_bot = rand() % kh_ext; + p->pad_left = rand() % kw_ext; + p->pad_right = rand() % kw_ext; + + if (!conv_param_is_ok(p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p->using_bias = rand() % 2; + p->r_shift_m = rand() % 8; + p->bReLU_EN = rand() % 2; + + p->opd0_sign = rand() % 2; + p->opd1_sign = 1; + p->opd2_sign = 1; + + assert(p->opd1_sign == 1 && p->opd2_sign == 1); + + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); +} + +static void init_conv_param(conv_param_t *p) { + _init_conv_param(p, INVALIDE_STRIDE, INVALIDE_STRIDE); +} + +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", "p->input_w = ", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} + +// Calculate the right shift value, m +// Steps: +// 1. Take the abs() of each weight; +// 2. Sum the abs() values within one kernel; +// 3. Take log2 of each sum; +// 4. Round down; +// Once every r_shift value is known, sort and pick the middle one. +static int calc_rshift_m(const conv_param_t *p, int8_t* weight) +{ + int kernel_cnt = p->output_c * p->input_c; + int kernel_size = p->kh * p->kw; + int *kernel_shifts = (int *)malloc(sizeof(int) * kernel_cnt); + if (!kernel_shifts) + return 1; + + memset(kernel_shifts, 0, sizeof(int) * kernel_cnt); + + // Part 1: + // Get right shift value for each kernel + int sum = 0; + for (int i = 0; i < kernel_cnt; i++) { + // Step 1 & 2: Get the sum of abs() + for (int j = 0; j < kernel_size; j++) { + sum += (int)(*weight < 0 ? 
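/* accumulate |weight| */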
-(*weight) : (*weight)); + weight++; + } + // Step 3 & 4: log2 and downward rounding + sum >>= 1; + while (sum) { + sum >>= 1; + kernel_shifts[i]++; + } + } + + // Part 2: + // Find the middle of all the values + int tag[32] = {0}; + for (int cnt = 0; cnt < kernel_cnt; cnt++) { + if (kernel_shifts[cnt] < 32) + tag[kernel_shifts[cnt]]++; + } + + int rshift_m = 0; + int mid = 0; + do { + mid += tag[rshift_m++]; + } while(mid < (kernel_cnt - 1) >> 1); + + free(kernel_shifts); + + return rshift_m - 1; +} + +static int test_conv( + conv_param_t *p_param, CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + int8_t *input = alloc_input(p_param); + int8_t *weight = alloc_weight(p_param); + int16_t *bias = alloc_bias(p_param); + int8_t *output_ref = (int8_t *)malloc(sizeof(int8_t) * conv_output_size(p_param)); + if (!input || !weight || !bias || !output_ref) { + ret = -1; + goto fail_exit; + } + + p_param->r_shift_m = calc_rshift_m(p_param, weight); + + ret = conv_ref(p_param, input, weight, bias, output_ref); + if (ret) + goto fail_exit; + + cvk_tiu_pt_convolution_param_t conv_param; + make_bmk_conv_param(cvk_ctx, &conv_param, p_param); + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param); + if (tl_alloc_success) { + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, conv_param.ifmap, (uint8_t *)input); + put_conv_weight(rt_handle, cvk_ctx, conv_param.weight, (uint8_t *)weight); + if (p_param->using_bias) + put_conv_bias(rt_handle, cvk_ctx, conv_param.bias, bias); + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &conv_param); + uint8_t *output = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, conv_param.ofmap); + + ret = array_cmp_int8( + "Comparing results ...\n", + output_ref, (int8_t *)output, conv_output_size(p_param)); + + if (ret) { + print_conv_param(p_param); + printf("Comparison FAILED\n"); + ret = -1; + } + free(output); + } + + free_bmk_conv_param(cvk_ctx, &conv_param, p_param); + +fail_exit: + free(input); + free(weight); + free(output_ref); + free(bias); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + for (int i = 0; i < 5; i++) { + printf("random_test_conv iteration: %d\n", i); + conv_param_t test_conv_param; + init_conv_param(&test_conv_param); + ret |= test_conv(&test_conv_param, rt_handle, cvk_ctx); + if (!test_conv_param.using_bias) + test_conv_param.using_bias = 1; + if (test_conv_param.output_c <= 32) + test_conv_param.output_c += 32; + ret |= test_conv(&test_conv_param, rt_handle, cvk_ctx); + } + + // test stride extend (0, 31] + int stride_list[] = {15, 16, 31}; + int stride_list_len = sizeof(stride_list) / sizeof(stride_list[0]); + + for (int stride_w_idx = 0; stride_w_idx < stride_list_len; stride_w_idx++) { + for (int stride_h_idx = 0; stride_h_idx < stride_list_len; stride_h_idx++) { + int stride_w = stride_list[stride_w_idx]; + int stride_h = stride_list[stride_h_idx]; + conv_param_t test_conv_param; + _init_conv_param(&test_conv_param, stride_w, stride_h); + + ret |= test_conv(&test_conv_param, rt_handle, cvk_ctx); + } + } + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_conv_max_power.c b/cviruntime/test/180x/test_180x_conv_max_power.c new file mode 100644 index 
000000000..92686add8 --- /dev/null +++ b/cviruntime/test/180x/test_180x_conv_max_power.c @@ -0,0 +1,1076 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" +#include "test_native_ref.h" + +typedef cvk_tdma_l2g_tensor_copy_cw_transposed_param_t l2g_cw_param_t; +typedef cvk_tdma_g2l_matrix_copy_row_col_transposed_param_t g2l_matrix_param_t; +typedef cvk_tdma_l2l_tensor_copy_param_t l2l_tensor_copy_param_t; + +typedef struct{ + int8_t *conv_input; + int8_t *conv_weight; + int16_t *conv_bias; + uint8_t *conv_output; + int8_t *conv_output_ref; + uint8_t *l2g_cw_src; + uint8_t *l2g_cw_output; + uint8_t *l2g_cw_output_ref; + uint8_t *g2l_matrix_src; + uint8_t *g2l_matrix_output; + uint8_t *g2l_matrix_output_ref; + uint8_t *l2l_tensor_src; + uint8_t *l2l_tensor_output; + uint8_t *l2l_tensor_output_ref; +}s_test_data; + +typedef struct { + int random_seed; + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int stride_w; + int output_c; + int using_bias; + int bReLU_EN; + int r_shift_m; + int opd0_sign; + int opd1_sign; + int opd2_sign; +} conv_param_t; + +conv_param_t conv_param; +l2g_cw_param_t l2g_cw_param; +g2l_matrix_param_t g2l_matrix_param; +l2l_tensor_copy_param_t l2l_tensor_copy_param; +s_test_data s8_test_data; +cvk_tiu_pt_convolution_param_t bmk_conv_param; + +cvk_tl_t *skip_tensor_lmem[10]; +uint32_t skip_tensor_num=0; + +/* need to make sure the free order of test_alloc_tl for skip_tensor_lmem*/ +void skip_tensor_lmem_size(cvk_context_t *cvk_ctx, const cvk_tl_t *p) +{ + uint32_t needed = align_up(p->shape.n * p->stride.n, cvk_ctx->info.eu_num); + uint32_t start_addr = p->start_address + needed; //src_shape.n*src_shape.c*src_shape.h*src_shape.w/32; + uint32_t remain_size = start_addr % cvk_ctx->info.lmem_bank_size ? (cvk_ctx->info.lmem_bank_size - start_addr % cvk_ctx->info.lmem_bank_size) : 0; // remain size for each lane + if(remain_size) + { +// cvk_tl_shape_t src_shape2 = {1, cvk_ctx->info.eu_num, 1, remain_size}; + cvk_tl_shape_t src_shape2 = {1, cvk_ctx->info.npu_num, 1, remain_size}; + skip_tensor_lmem[skip_tensor_num] = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, src_shape2, CVK_FMT_I8, 1); // skip the lmem size and next tl can alignment to bank size + } + skip_tensor_num++; +} + +void skip_matrix_lmem_size(cvk_context_t *cvk_ctx, const cvk_ml_t *p) +{ + uint32_t needed = align_up(p->shape.n * p->stride.n, cvk_ctx->info.eu_num); + uint32_t start_addr = p->start_address + needed; //src_shape.n*src_shape.c*src_shape.h*src_shape.w/32; + uint32_t remain_size = start_addr % cvk_ctx->info.lmem_bank_size ? 
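/* pad to the next bank boundary per lane */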
(cvk_ctx->info.lmem_bank_size - start_addr % cvk_ctx->info.lmem_bank_size) : 0; // remain size for each lane + if(remain_size) + { + cvk_tl_shape_t src_shape2 = {1, cvk_ctx->info.npu_num, 1, remain_size}; + //cvk_tl_shape_t src_shape2 = {1, cvk_ctx->info.eu_num, 1, remain_size}; + skip_tensor_lmem[skip_tensor_num] = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, src_shape2, CVK_FMT_I8, 1); // skip the lmem size and next tl can alignment to bank size + } + skip_tensor_num++; +} + +void free_skip_tensor_lmem(cvk_context_t *cvk_ctx) +{ + if(skip_tensor_lmem[--skip_tensor_num]!=NULL) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, skip_tensor_lmem[skip_tensor_num]); +} + +static int index_get(int h, int w1, int w2) +{ + return h * w1 + w2; +} + +static int matrix_dot_mult( + int8_t *A, int8_t *B, int dim_n, int dim_m, + int opd0_sign) +{ + int sum = 0; + for (int i=0; iinput_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + int r_shift_bits = p_param->r_shift_m; + int do_relu = p_param->bReLU_EN; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + int8_t *i_fmap_pad_ker = (int8_t *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return -1; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = 0; + + int8_t *i_fmap_pad = NULL; + int8_t *kernel_after = NULL; + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c]; //bias+c ; + } + } + } + + if (do_relu) + relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + + // ofmap is int8_t, signed + ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow, + &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1, + /*sign_unsign=*/1); + + if 
(ret) + goto error_release; + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + +error_release: + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + + +static uint8_t * transform_weight(const cvk_tl_shape_t *s, uint8_t before[]) +{ + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint32_t size = ic * oc * kh * kw; + uint8_t *after = (uint8_t *)malloc(size); + if (!after) + return NULL; + + /* + * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) + */ + for (uint32_t oci = 0; oci < oc; oci++) { + for (uint32_t ici = 0; ici < ic; ici++) { + for (uint32_t khi = 0; khi < kh; khi++) { + for (uint32_t kwi = 0; kwi < kw; kwi++) { + uint32_t src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + uint32_t dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + uint8_t *data) +{ + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint8_t *transformed_data = transform_weight(s, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(1, oc, kh * kw, ic); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, transformed_data); + + free(transformed_data); +} + +static int8_t * transform_bias(int oc, int16_t before[]) +{ + int8_t *after = (int8_t *)malloc(2 * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = before[i] & 0xff; + after[i + oc] = (before[i] >> 8) & 0xff; + } + return after; +} + +static void put_conv_bias( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + int16_t *data) +{ + int oc = tl->shape.c; + int8_t *transformed_data = transform_bias(oc, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(2, oc, 1, 1); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, (uint8_t *)transformed_data); + + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; 
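+ // filter element count: oc * ic * kh * kw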
+ int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static int8_t * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + + int8_t *buf = (int8_t *)malloc(sizeof(int8_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static int8_t * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + int8_t *buf = (int8_t *)malloc(sizeof(int8_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static int16_t * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + int16_t *bias = (int16_t *)malloc(sizeof(int16_t) * oc); + if (!bias) + return NULL; + + for (int i = 0; i < oc; i++) + bias[i] = rand() % 65536 - 32768; + + return bias; +} + +static cvk_tl_t * conv_ifmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd0_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 1); +} + +static cvk_tl_t * conv_weight_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd1_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static cvk_tl_t * conv_ofmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, CVK_FMT_I8, 1); +} + +static cvk_tl_t * conv_bias_tensor( + cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd2_sign? 
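/* bias is signed in these tests (opd2_sign is forced to 1) */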
CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const cvk_tiu_pt_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + + printf("ifmap shape(%d, %d, %d, %d)\n", + p->ifmap->shape.n, p->ifmap->shape.c, + p->ifmap->shape.h, p->ifmap->shape.w); + printf("ofmap shape(%d, %d, %d, %d)\n", + p->ofmap->shape.n, p->ofmap->shape.c, + p->ofmap->shape.h, p->ofmap->shape.w); + + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + + dst->ifmap = conv_ifmap_tensor(cvk_ctx, p); + skip_tensor_lmem_size(cvk_ctx, dst->ifmap); + dst->weight = conv_weight_tensor(cvk_ctx, p); + skip_tensor_lmem_size(cvk_ctx, dst->weight); + dst->ofmap = conv_ofmap_tensor(cvk_ctx, p); + skip_tensor_lmem_size(cvk_ctx, dst->ofmap); + dst->bias = NULL; + dst->ps32_mode = 0; + if (p->using_bias) + { + dst->bias = conv_bias_tensor(cvk_ctx, p); + skip_tensor_lmem_size(cvk_ctx, dst->bias); + } + dst->w_is_const = 0; +} + +static void free_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *r, + const conv_param_t *p) +{ + if (p->using_bias && r->bias) + { + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->bias); + } + if (r->ofmap) + { + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ofmap); + } + if (r->weight) + { + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->weight); + } + if (r->ifmap) + { + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ifmap); + } +} + +static void init_conv_param(conv_param_t *p) +{ +retry: + p->input_n = 1; + p->input_c = 8; // 16 -> 8 for 180x + p->input_h = 2; + p->input_w = 600; + + p->kh = 2; + p->kw = 16; + p->output_c = 8; // 16 -> 8 for 180x + + p->stride_h = 1; + p->stride_w = 15; + p->ins_h = 0; + p->ins_w = 0; + p->ins_h_last = 0;; + p->ins_w_last = 0;; + p->dh = 1; + p->dw = 1; + + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + p->pad_top = 1; + p->pad_bot = 0; + p->pad_left = 0; + p->pad_right = 0; + + if (!conv_param_is_ok(p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p->using_bias = 0; + p->r_shift_m = 7; + p->bReLU_EN = 1; + + p->opd0_sign = 0; + p->opd1_sign = 1; + p->opd2_sign = 1; + + assert(p->opd1_sign == 1 && 
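/* the reference model only handles signed weight and bias */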
p->opd2_sign == 1); + + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); + +} + +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", "p->input_w = ", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} + +// Calculate the right shift value, m +// Steps: +// 1. Get the abs() of each weight; +// 2. Summary all the abs() in one kernel; +// 3. Get Log2 of each sum; +// 4. Downward rounding; +// After every r_shift value got, sort and find the middle one. +static int calc_rshift_m(const conv_param_t *p, int8_t* weight) +{ + int kernel_cnt = p->output_c * p->input_c; + int kernel_size = p->kh * p->kw; + int *kernel_shifts = (int *)malloc(sizeof(int) * kernel_cnt); + if (!kernel_shifts) + return 1; + + memset(kernel_shifts, 0, sizeof(int) * kernel_cnt); + + // Part 1: + // Get right shift value for each kernel + int sum = 0; + for (int i = 0; i < kernel_cnt; i++) { + // Step 1 & 2: Get the sum of abs() + for (int j = 0; j < kernel_size; j++) { + sum += (int)(*weight < 0 ? 
-(*weight) : (*weight)); + weight++; + } + // Step 3 & 4: log2 and downward rounding + sum >>= 1; + while (sum) { + sum >>= 1; + kernel_shifts[i]++; + } + } + + // Part 2: + // Find the middle of all the values + int tag[32] = {0}; + for (int cnt = 0; cnt < kernel_cnt; cnt++) { + tag[kernel_shifts[cnt]]++; + } + + int rshift_m = 0; + int mid = 0; + do { + mid += tag[rshift_m++]; + } while(mid < (kernel_cnt - 1) >> 1); + + free(kernel_shifts); + + return rshift_m - 1; +} + + +static void l2g_tensor_copy_cw_transposed_ref( + l2g_cw_param_t *p, uint8_t ref_data[], uint8_t src_data[]) +{ + cvk_tl_shape_t s = p->src->shape; + uint32_t n = s.n; + uint32_t c = s.c; + uint32_t h = s.h; + uint32_t w = s.w; + + for (uint32_t ni = 0; ni < n; ni++) { + for (uint32_t ci = 0; ci < c; ci++) { + for (uint32_t hi = 0; hi < h; hi++) { + for (uint32_t wi = 0; wi < w; wi++) { + uint32_t src_i = ni * c * h * w + ci * h * w + hi * w + wi; + uint32_t dst_i = ni * c * h * w + wi * h * c + hi * c + ci; + ref_data[dst_i] = src_data[src_i]; + } + } + } + } +} + +static void test_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, l2g_cw_param_t *p) +{ + uint64_t size = tl_shape_size(&p->src->shape, p->src->fmt); + + s8_test_data.l2g_cw_src = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!s8_test_data.l2g_cw_src) + return; + + for (uint64_t i = 0; i < size; i++) + s8_test_data.l2g_cw_src[i] = rand()%0x100; + + s8_test_data.l2g_cw_output_ref = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!s8_test_data.l2g_cw_output_ref) + return; + + l2g_tensor_copy_cw_transposed_ref(p, s8_test_data.l2g_cw_output_ref, s8_test_data.l2g_cw_src); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, p->src, s8_test_data.l2g_cw_src); +} + +static void destroy_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, l2g_cw_param_t *p) +{ + free_tensor_dev_mem(rt_handle, p->dst); + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->src); +} + +static void test_l2g_cw_transpose(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, l2g_cw_param_t *p) +{ + cvk_tl_shape_t src_shape = {1, 0x100, 1, 0x020}; + cvk_tg_shape_t dst_shape = {1, 0x020, 1, 0x100}; + +// cvk_tl_shape_t src_shape = {1, 0x100, 1, 0x080}; +// cvk_tg_shape_t dst_shape = {1, 0x080, 1, 0x100}; + + p->src = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, src_shape, CVK_FMT_I8, 1); + p->dst = alloc_tensor_dev_mem(rt_handle, cvk_ctx, dst_shape, CVK_FMT_I8); + + printf("l2g cw src shape(%d, %d, %d, %d)\n", + p->src->shape.n, p->src->shape.c, + p->src->shape.h, p->src->shape.w); + printf("l2g cw dst shape(%d, %d, %d, %d)\n", + p->dst->shape.n, p->dst->shape.c, + p->dst->shape.h, p->dst->shape.w); + + skip_tensor_lmem_size(cvk_ctx, p->src); + test_param_l2g(rt_handle, cvk_ctx, p); +} + +static void g2l_matrix_copy_row_col_transposed_ref( + g2l_matrix_param_t *p, uint8_t ref_data[], uint8_t src_data[]) +{ + uint64_t row = p->src->shape.row; + uint64_t col = p->src->shape.col; + + for (uint64_t ri = 0; ri < row; ri++) { + for (uint64_t ci = 0; ci < col; ci++) { + uint64_t src_i = ri * col + ci; + uint64_t dst_i = ci * row + ri; + ref_data[dst_i] = src_data[src_i]; + } + } +} + +static void test_param_g2l(CVI_RT_HANDLE rt_handle, g2l_matrix_param_t *p) +{ + uint64_t size = ml_shape_size(&p->dst->shape, p->dst->fmt); + + s8_test_data.g2l_matrix_src = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!s8_test_data.g2l_matrix_src) + return; + + for (uint64_t i = 0; i < size; i++) + s8_test_data.g2l_matrix_src[i] = rand()%0x100; + + s8_test_data.g2l_matrix_output_ref = 
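/* host-side reference buffer for the row/col-transpose check */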
(uint8_t *)malloc(size);
+  if (!s8_test_data.g2l_matrix_output_ref)
+    return;
+
+  g2l_matrix_copy_row_col_transposed_ref(p, s8_test_data.g2l_matrix_output_ref, s8_test_data.g2l_matrix_src);
+
+  matrix_copy_s2d(rt_handle, p->src, s8_test_data.g2l_matrix_src);
+}
+
+static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, g2l_matrix_param_t *p)
+{
+  free_matrix_dev_mem(rt_handle, p->src);
+  free_skip_tensor_lmem(cvk_ctx);
+  cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->dst);
+}
+
+static void test_g2l_matrix_transpose(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, g2l_matrix_param_t *p)
+{
+  //g2l_matrix_param_t p;
+  /*
+   * Matrix transpose requires n/c stride alignment
+   * due to a TDMA limitation.
+   */
+  cvk_mg_shape_t src_shape = {0x100, 0x20};
+  cvk_ml_shape_t dst_shape = {0x20, 0x10, 0x10, 0x100};
+
+//  cvk_mg_shape_t src_shape = {0x100, 0x80};
+//  cvk_ml_shape_t dst_shape = {0x80, 0x10, 0x10, 0x100};
+
+  int dst_align = 1;
+  cvk_fmt_t fmt = CVK_FMT_I8;
+
+  p->src = alloc_matrix_dev_mem(rt_handle, src_shape, fmt);
+  p->dst = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, dst_shape, fmt, dst_align);
+
+  printf("g2l matrix tp src shape(row=%d, col=%d)\n",
+         p->src->shape.row, p->src->shape.col);
+  printf("g2l matrix tp dst shape(n=%d, c=%d, w=%d, col=%d)\n",
+         p->dst->shape.n, p->dst->shape.c,
+         p->dst->shape.w, p->dst->shape.col);
+
+  skip_matrix_lmem_size(cvk_ctx, p->dst);
+  test_param_g2l(rt_handle, p);
+}
+
+static void l2l_tensor_copy_ref(l2l_tensor_copy_param_t *p, uint8_t ref_data[], uint8_t src_data[])
+{
+  uint64_t size = tl_shape_size(&p->src->shape, p->src->fmt);
+
+  for (uint64_t i = 0; i < size; i++)
+    ref_data[i] = src_data[i];
+}
+
+static void test_l2l_param(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, l2l_tensor_copy_param_t *p)
+{
+  uint64_t size = tl_shape_size(&p->src->shape, p->src->fmt);
+
+  s8_test_data.l2l_tensor_src = (uint8_t *)malloc(sizeof(uint8_t) * size);
+  if (!s8_test_data.l2l_tensor_src)
+    return;
+
+  for (uint64_t i = 0; i < size; i++)
+    s8_test_data.l2l_tensor_src[i] = rand() % 0x100;
+
+  s8_test_data.l2l_tensor_output_ref = (uint8_t *)malloc(sizeof(uint8_t) * size);
+  if (!s8_test_data.l2l_tensor_output_ref)
+    return;
+
+  l2l_tensor_copy_ref(p, s8_test_data.l2l_tensor_output_ref, s8_test_data.l2l_tensor_src);
+
+  tensor_copy_s2d_g2l(rt_handle, cvk_ctx, p->src, s8_test_data.l2l_tensor_src);
+}
+
+static void destroy_param_l2l(cvk_context_t *cvk_ctx, l2l_tensor_copy_param_t *p)
+{
+  free_skip_tensor_lmem(cvk_ctx);
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->dst);
+  free_skip_tensor_lmem(cvk_ctx);
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->src);
+}
+
+static void test_l2l_tensor_copy(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, l2l_tensor_copy_param_t *p)
+{
+  cvk_tl_shape_t src_shape = {1, 0x4, 0x1, 0x40}; // for 180x
+  cvk_tl_shape_t dst_shape = {1, 0x4, 0x1, 0x40}; // for 180x
+
+  //cvk_tl_shape_t src_shape = {1, 0x10, 0x1, 0x100};
+  //cvk_tl_shape_t dst_shape = {1, 0x10, 0x1, 0x100};
+
+//  cvk_tl_shape_t src_shape = {1, 0x10, 0x1, 0x400};
+//  cvk_tl_shape_t dst_shape = {1, 0x10, 0x1, 0x400};
+
+  p->src = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, src_shape, CVK_FMT_I8, 1);
+
+  printf("l2l src shape(%d, %d, %d, %d)\n",
+         p->src->shape.n, p->src->shape.c,
+         p->src->shape.h, p->src->shape.w);
+
+  skip_tensor_lmem_size(cvk_ctx, p->src);
+  p->dst = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, dst_shape, CVK_FMT_I8, 1);
+  skip_tensor_lmem_size(cvk_ctx, p->dst);
+  test_l2l_param(rt_handle, cvk_ctx, p);
+}
+
+static int setup_conv(
+    conv_param_t 
*p_param, CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + s8_test_data.conv_input = alloc_input(p_param); + s8_test_data.conv_weight = alloc_weight(p_param); + s8_test_data.conv_bias = alloc_bias(p_param); + p_param->r_shift_m = calc_rshift_m(p_param, s8_test_data.conv_weight); + s8_test_data.conv_output_ref = (int8_t *)malloc(sizeof(int8_t) * conv_output_size(p_param)); + if (!s8_test_data.conv_output_ref) + return -1; + + int ret = conv_ref(p_param, s8_test_data.conv_input, s8_test_data.conv_weight, s8_test_data.conv_bias, s8_test_data.conv_output_ref); + if (ret) + return ret; + + make_bmk_conv_param(cvk_ctx, &bmk_conv_param, p_param); + + bmk_conv_param_alloc_ok(&bmk_conv_param, p_param); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, bmk_conv_param.ifmap, (uint8_t *)s8_test_data.conv_input); + put_conv_weight(rt_handle, cvk_ctx, bmk_conv_param.weight, (uint8_t *)s8_test_data.conv_weight); + if (p_param->using_bias) + put_conv_bias(rt_handle, cvk_ctx, bmk_conv_param.bias, s8_test_data.conv_bias); + + return 0; +} + +void get_result(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + s8_test_data.conv_output = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, bmk_conv_param.ofmap); + s8_test_data.l2g_cw_output = tensor_copy_d2s(rt_handle, l2g_cw_param.dst); + s8_test_data.g2l_matrix_output = matrix_copy_l2g_d2s(rt_handle, cvk_ctx, g2l_matrix_param.dst); + s8_test_data.l2l_tensor_output = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, l2l_tensor_copy_param.dst); +} + +int check_result() +{ + int has_error = array_cmp_int8( + "conv Comparing results ...\n", + s8_test_data.conv_output_ref, (int8_t *)s8_test_data.conv_output, conv_output_size(&conv_param)); + + if (has_error) { + print_conv_param(&conv_param); + printf("Comparison FAILED\n"); + return -1; + } + + for (uint64_t i = 0; i < tl_shape_size(&l2g_cw_param.src->shape, l2g_cw_param.src->fmt); i++) { + if (s8_test_data.l2g_cw_output[i] != s8_test_data.l2g_cw_output_ref[i]) { + fprintf(stderr, "l2g_cw comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, s8_test_data.l2g_cw_output[i], s8_test_data.l2g_cw_output_ref[i]); + return -1; + } + } + for (uint64_t i = 0; i < ml_shape_size(&g2l_matrix_param.dst->shape, g2l_matrix_param.dst->fmt); i++) { + if (s8_test_data.g2l_matrix_output[i] != s8_test_data.g2l_matrix_output_ref[i]) { + fprintf(stderr, "g2l_matrix comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, s8_test_data.g2l_matrix_output[i], s8_test_data.g2l_matrix_output_ref[i]); + return -1; + } + } + + for (uint64_t i = 0; i < tl_shape_size(&l2l_tensor_copy_param.src->shape, l2l_tensor_copy_param.src->fmt); i++) { + if (s8_test_data.l2l_tensor_output[i] != s8_test_data.l2l_tensor_output_ref[i]) { + fprintf(stderr, "l2l_tensor comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, s8_test_data.l2l_tensor_output[i], s8_test_data.l2l_tensor_output_ref[i]); + return -1; + } + } + + return 0; +} + +void trigger_max_power(cvk_context_t *cvk_ctx) +{ + cvk_ctx->ops->parallel_enable(cvk_ctx); + cvk_ctx->ops->tdma_l2g_tensor_copy_cw_transposed(cvk_ctx, &l2g_cw_param); + cvk_ctx->ops->tdma_g2l_matrix_copy_row_col_transposed(cvk_ctx, &g2l_matrix_param); + cvk_ctx->ops->tdma_l2l_tensor_copy(cvk_ctx, &l2l_tensor_copy_param); + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &bmk_conv_param); + cvk_ctx->ops->parallel_disable(cvk_ctx); + cvk_ctx->ops->parallel_enable(cvk_ctx); + cvk_ctx->ops->tdma_l2g_tensor_copy_cw_transposed(cvk_ctx, &l2g_cw_param); + cvk_ctx->ops->tdma_g2l_matrix_copy_row_col_transposed(cvk_ctx, &g2l_matrix_param); + 
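+  /* Each parallel_enable()/parallel_disable() window lets the three TDMA
+   * moves and the TIU convolution be issued concurrently; running two such
+   * windows back to back before CVI_RT_Submit() is what generates the
+   * "max power" load on TDMA and TIU at the same time. */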
cvk_ctx->ops->tdma_l2l_tensor_copy(cvk_ctx, &l2l_tensor_copy_param);
+  cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &bmk_conv_param);
+  cvk_ctx->ops->parallel_disable(cvk_ctx);
+  CVI_RT_Submit(cvk_ctx);
+}
+
+void free_s8_data()
+{
+  free(s8_test_data.conv_input);
+  free(s8_test_data.conv_weight);
+  free(s8_test_data.conv_bias);
+  free(s8_test_data.conv_output);
+  free(s8_test_data.conv_output_ref);
+  free(s8_test_data.l2g_cw_src);
+  free(s8_test_data.l2g_cw_output);
+  free(s8_test_data.l2g_cw_output_ref);
+  free(s8_test_data.g2l_matrix_src);
+  free(s8_test_data.g2l_matrix_output);
+  free(s8_test_data.g2l_matrix_output_ref);
+  free(s8_test_data.l2l_tensor_src);
+  free(s8_test_data.l2l_tensor_output);
+  free(s8_test_data.l2l_tensor_output_ref);
+}
+
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  printf("conv max_power test\n");
+  init_conv_param(&conv_param);
+  ret |= setup_conv(&conv_param, rt_handle, cvk_ctx);
+
+  test_l2g_cw_transpose(rt_handle, cvk_ctx, &l2g_cw_param);
+  test_g2l_matrix_transpose(rt_handle, cvk_ctx, &g2l_matrix_param);
+  test_l2l_tensor_copy(rt_handle, cvk_ctx, &l2l_tensor_copy_param);
+
+  trigger_max_power(cvk_ctx);
+  get_result(rt_handle, cvk_ctx);
+  check_result();
+
+  destroy_param_l2l(cvk_ctx, &l2l_tensor_copy_param);
+  destroy_param_g2l(rt_handle, cvk_ctx, &g2l_matrix_param);
+  destroy_param_l2g(rt_handle, cvk_ctx, &l2g_cw_param);
+  free_bmk_conv_param(cvk_ctx, &bmk_conv_param, &conv_param);
+  free_s8_data();
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/180x/test_180x_conv_ps32.c b/cviruntime/test/180x/test_180x_conv_ps32.c
new file mode 100644
index 000000000..0cb7c80fa
--- /dev/null
+++ b/cviruntime/test/180x/test_180x_conv_ps32.c
@@ -0,0 +1,1559 @@
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "test_cvikernel_util.h"
+#include "test_native_ref.h"
+
+typedef struct {
+  int random_seed;
+  int input_n;
+  int input_c;
+  int input_h;
+  int input_w;
+  int kw;
+  int kh;
+  int dh;
+  int dw;
+  int pad_top;
+  int pad_bot;
+  int pad_left;
+  int pad_right;
+  int ins_h;
+  int ins_h_last;
+  int ins_w;
+  int ins_w_last;
+  int stride_h;
+  int stride_w;
+  int output_c;
+  int using_bias;
+  int bReLU_EN;
+  int r_shift_m;
+  int opd0_sign;
+  int opd1_sign;
+  int opd2_sign;
+} conv_param_t;
+
+static int index_get(int h, int w1, int w2)
+{
+  return h * w1 + w2;
+}
+
+static int matrix_dot_mult(
+    int8_t *A, int8_t *B, int dim_n, int dim_m,
+    int opd0_sign)
+{
+  int sum = 0;
+  for (int i = 0; i < dim_n; i++) {
+    for (int j = 0; j < dim_m; j++) {
+      int idx = index_get(i, dim_m, j);
+      int a = opd0_sign ? A[idx] : (uint8_t)A[idx];
+      sum += a * B[idx];
+    }
+  }
+  return sum;
+}
+
+static int ps32_m2_conv_ref(
+    const conv_param_t *p_param,
+    const int8_t *ifmap,
+    const int8_t *weight,
+    int8_t *ofmap)
+{
+  int in = p_param->input_n;
+  int ic = p_param->input_c;
+  int ih = p_param->input_h;
+  int iw = p_param->input_w;
+  int oc = p_param->output_c;
+  int kh = p_param->kh;
+  int kw = p_param->kw;
+  int dh = p_param->dh;
+  int dw = p_param->dw;
+  int stride_h = p_param->stride_h;
+  int stride_w = p_param->stride_w;
+  int pad_top = p_param->pad_top;
+  int pad_bot = p_param->pad_bot;
+  int pad_left = p_param->pad_left;
+  int pad_right = p_param->pad_right;
+  int ins_h = p_param->ins_h;
+  int ins_h_last = p_param->ins_h_last;
+  int ins_w = p_param->ins_w;
+  int ins_w_last = p_param->ins_w_last;
+  int input_sign = p_param->opd0_sign;
+
+  int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot);
+  int iw_ext = 
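+  /* ps32 "begin" mode (mode 2) reference: instead of one int8 plane, the
+   * ofmap keeps the raw 32-bit accumulators split byte-wise over four
+   * planes spaced bstride = in*oc*oh*ow apart, i.e. conceptually
+   *   ofmap[0*bstride + i] =  result[i]        & 0xff;
+   *   ofmap[1*bstride + i] = (result[i] >> 8)  & 0xff;  // ... and so on
+   * which is exactly what the plane-store loops at the end of this
+   * function do. */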
calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + int8_t *i_fmap_pad_ker = (int8_t *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return -1; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = 0; + + int8_t *i_fmap_pad = NULL; + int8_t *kernel_after = NULL; + uint32_t bstride = in * oc * oh * ow; + + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[i] = result[i]; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[bstride + i] = result[i] >> 8; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[2 * bstride + i] = result[i] >> 16; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[3 * bstride + i] = result[i] >> 24; + + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static int ps32_m1_conv_ref( + const conv_param_t *p_param, + const int8_t *ifmap, + const int8_t *weight, + const int16_t *bias, + int8_t *ofmap) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + int r_shift_bits = p_param->r_shift_m; + int do_relu = p_param->bReLU_EN; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + int8_t *i_fmap_pad_ker = (int8_t *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return -1; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int 
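+  /* ps32 "end" mode (mode 1) first reassembles the 32-bit partial sums
+   * from the four byte planes written by modes 2/3, conceptually
+   *   result[i] = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
+   * and only then finishes the convolution: accumulate, add bias,
+   * optional ReLU, and a saturating right shift down to int8. */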
ret = 0; + + int8_t *i_fmap_pad = NULL; + int8_t *kernel_after = NULL; + + uint32_t bstride = in * oc * oh * ow; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] = (uint8_t)ofmap[i]; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] |= (uint8_t)ofmap[bstride + i] << 8; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] |= (uint8_t)ofmap[2 * bstride + i] << 16; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] |= (uint8_t)ofmap[3 * bstride + i] << 24; + + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c]; //bias+c ; + } + } + } + + if (do_relu) + relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + + // ofmap is int8_t, signed + ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow, + &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1, + /*sign_unsign=*/1); + + if (ret) + goto error_release; + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + +error_release: + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + + +static int ps32_m3_conv_ref( + const conv_param_t *p_param, + const int8_t *ifmap, + const int8_t *weight, + int8_t *ofmap) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + int8_t *i_fmap_pad_ker = (int8_t *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return -1; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = 0; + + int8_t *i_fmap_pad = NULL; + int8_t *kernel_after = NULL; + + uint32_t bstride = in * oc * oh * ow; + + for (int i = 0; i < 
in * oc * oh * ow; i++) + result[i] = (uint8_t)ofmap[i]; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] |= (uint8_t)ofmap[bstride + i] << 8; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] |= (uint8_t)ofmap[2 * bstride + i] << 16; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] |= (uint8_t)ofmap[3 * bstride + i] << 24; + + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[i] = result[i]; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[bstride + i] = result[i] >> 8; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[2 * bstride + i] = result[i] >> 16; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[3 * bstride + i] = result[i] >> 24; + + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static int conv_ref( + const conv_param_t *p_param, + const int8_t *ifmap, + const int8_t *weight, + const int16_t *bias, + int8_t *ofmap) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + int r_shift_bits = p_param->r_shift_m; + int do_relu = p_param->bReLU_EN; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + int8_t *i_fmap_pad_ker = (int8_t *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return -1; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = 0; + + int8_t *i_fmap_pad = NULL; + int8_t *kernel_after = NULL; + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, 
ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c]; //bias+c ; + } + } + } + + if (do_relu) + relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + + // ofmap is int8_t, signed + ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow, + &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1, + /*sign_unsign=*/1); + + if (ret) + goto error_release; + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + +error_release: + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static uint8_t * transform_weight(const cvk_tl_shape_t *s, uint8_t before[]) +{ + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint32_t size = ic * oc * kh * kw; + uint8_t *after = (uint8_t *)malloc(size); + + /* + * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) + */ + for (uint32_t oci = 0; oci < oc; oci++) { + for (uint32_t ici = 0; ici < ic; ici++) { + for (uint32_t khi = 0; khi < kh; khi++) { + for (uint32_t kwi = 0; kwi < kw; kwi++) { + uint32_t src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + uint32_t dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + uint8_t *data) +{ +#if 0 + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + uint8_t *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. bm_memcpy_s2d regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. 
+ */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //bmmem_device_t ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = bm_memcpy_s2d(*ctx, ab_dev_mem, transformed_data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, transformed_data); + assert(ret == BM_SUCCESS); + + cvk_tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_I8; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1822_tensor_tgmem_default_stride(tdma_tg.shape, tdma_tg.fmt); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1822_tensor_lmem_default_stride(cvk_ctx, tdma_shape, FMT_I8, 0); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1822_tdma_g2l_tensor_copy(cvk_ctx, &p); + test_submit(ctx); + bmmem_device_free(*ctx, dev_mem); + delete[] transformed_data; +#endif + + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint8_t *transformed_data = transform_weight(s, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(1, oc, kh * kw, ic); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, transformed_data); + + free(transformed_data); +} + +static int8_t * transform_bias(int oc, int16_t before[]) +{ + int8_t *after = (int8_t *)malloc(2 * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = before[i] & 0xff; + after[i + oc] = (before[i] >> 8) & 0xff; + } + return after; +} + +static void put_conv_bias( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + int16_t *data) +{ +#if 0 + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2, oc, 1, 1); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + int8_t *transformed_data = transform_bias(oc, data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (uint8_t *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1822_tensor_tgmem_default_stride(tg.shape, tg.fmt); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_tensor_copy(cvk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, dev_mem); + delete[] transformed_data; +#endif + + int oc = tl->shape.c; + int8_t *transformed_data = transform_bias(oc, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(2, oc, 1, 1); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, (uint8_t *)transformed_data); + + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + 
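+         /* e.g. input_h = 5, ins_h = 1, ins_h_last = 0, pad_top = pad_bot = 1:
+          * ih_ext = (5 - 1) * 2 + 0 + 1 + 1 + 1 = 11 */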
p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static int8_t * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + + int8_t *buf = (int8_t *)malloc(sizeof(int8_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static int8_t * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + int8_t *buf = (int8_t *)malloc(sizeof(int8_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static int16_t * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + int16_t *bias = (int16_t *)malloc(sizeof(int16_t) * oc); + if (!bias) + return NULL; + + for (int i = 0; i < oc; i++) + bias[i] = rand() % 65536 - 32768; + + return bias; +} + +static cvk_tl_t * conv_ifmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd0_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 1); +} + +static uint32_t conv_ifmap_tensor_size(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd0_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, s, fmt, 1); +} + +static cvk_tl_t * conv_weight_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd1_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static uint32_t conv_weight_tensor_size(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd1_sign? 
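+  /* The weight tensor lives in local memory as (n=ic, c=oc, h=kh, w=kw),
+   * while transform_weight() produces the DMA-side layout (1, oc, kh*kw, ic);
+   * e.g. with ic=4, kh=kw=3, element (oci=1, ici=2, khi=0, kwi=1) moves from
+   * src index 1*4*9 + 2*9 + 0*3 + 1 = 55 to dst index 1*9*4 + 0*12 + 1*4 + 2 = 42. */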
CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, s, fmt, 0); +} + +static cvk_tl_t * conv_ofmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_tl_shape_t s; + s.n = p->input_n * 4; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + cvk_tl_t *tl = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, CVK_FMT_I8, 1); + if (tl) + tl->shape.n = p->input_n; + return tl; +} + +static uint32_t conv_ofmap_tensor_size(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_tl_shape_t s; + s.n = p->input_n * sizeof(uint32_t) / sizeof(uint8_t); + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, s, CVK_FMT_I8, 1); +} + +static cvk_tl_t * conv_bias_tensor( + cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd2_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static uint32_t conv_bias_tensor_size( + cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd2_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const cvk_tiu_pt_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + if(p->ps32_mode==1) + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param_ps32( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *dst, + const conv_param_t *p, uint32_t ps32_mode) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + + if(ps32_mode==2) + { + uint32_t ifmap_size = conv_ifmap_tensor_size(cvk_ctx, p); + uint32_t weight_size = conv_weight_tensor_size(cvk_ctx, p); + uint32_t ofmap_size = conv_ofmap_tensor_size(cvk_ctx, p); + uint32_t bias_size = p->using_bias ? conv_bias_tensor_size(cvk_ctx, p) : 0; + uint32_t total_size = ifmap_size + weight_size + ofmap_size + bias_size; + + // Allocation if size fit. 
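+    // Rough local-memory budget check: the sizes come from
+    // lmem_tensor_to_size (which should include any alignment padding),
+    // and the ofmap term is 4x the int8 size because ps32 keeps 32-bit
+    // partial sums resident in local memory.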
+ if (total_size <= cvk_ctx->info.lmem_size) { + dst->ifmap = conv_ifmap_tensor(cvk_ctx, p); + dst->weight = conv_weight_tensor(cvk_ctx, p); + dst->ofmap = conv_ofmap_tensor(cvk_ctx, p); + } else { + dst->ifmap = NULL; + dst->weight = NULL; + dst->ofmap = NULL; + } + } + + dst->ps32_mode = ps32_mode; + dst->bias = NULL; + dst->relu_enable = 0; + dst->rshift_bits = 0; + if(ps32_mode==1) + { + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + // only mode=1 can use bias + if (p->using_bias) + dst->bias = conv_bias_tensor(cvk_ctx, p); + } + + return; +} + +static void make_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + dst->ifmap = conv_ifmap_tensor(cvk_ctx, p); + dst->weight = conv_weight_tensor(cvk_ctx, p); + dst->ofmap = conv_ofmap_tensor(cvk_ctx, p); + dst->bias = NULL; + dst->ps32_mode = 0; + if (p->using_bias) + dst->bias = conv_bias_tensor(cvk_ctx, p); +} + +static void free_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *r, + const conv_param_t *p) +{ + if (p->using_bias && r->bias) { + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->bias); + r->bias = NULL; + } + + if (r->ofmap) { + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ofmap); + r->ofmap = NULL; + } + + if (r->weight) { + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->weight); + r->weight = NULL; + } + + if (r->ifmap) { + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ifmap); + r->ifmap = NULL; + } + +} + +static void init_conv_param(conv_param_t *p) +{ + printf("init_conv_param\n"); + + memset(p, 0, sizeof(*p)); + + p->random_seed = clock(); + srand(p->random_seed); + +retry: + p->input_n = 1; + p->input_c = rand() % (10) + 2; + p->kh = rand() % 7 + 1; + p->kw = rand() % 7 + 1; + p->input_h = rand() % 10 + p->kh; + p->input_w = rand() % 10 + p->kw; + p->output_c = rand() % 10 + 3; + p->stride_h = rand() % (p->kh) + 1; + p->stride_w = rand() % (p->kw) + 1; + p->ins_h = rand() % p->kh; + p->ins_w = rand() % p->kw; + p->ins_h_last = rand() % p->kh;; + p->ins_w_last = rand() % p->kw;; + p->dh = rand() % 3 + 1; + p->dw = rand() % 3 + 1; + + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + p->pad_top = rand() % kh_ext; + p->pad_bot = rand() % kh_ext; + p->pad_left = rand() % kw_ext; + p->pad_right = rand() % kw_ext; + + if (!conv_param_is_ok(p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p->using_bias = rand() % 2; + p->r_shift_m = rand() % 8; + p->bReLU_EN = rand() % 2; + + p->opd0_sign = rand() % 2; + p->opd1_sign = 1; + p->opd2_sign = 1; + + assert(p->opd1_sign == 1 && p->opd2_sign == 1); + + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); +} + +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", "p->input_w = 
", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} + +/* Calculate the right shift value, m + * Steps: + * 1. Get the abs() of each weight; + * 2. Summary all the abs() in one kernel; + * 3. Get Log2 of each sum; + * 4. Downward rounding; + * After every r_shift value got, sort and find the middle one. + */ + +static int calc_rshift_m(const conv_param_t *p, int8_t* weight) +{ + int kernel_cnt = p->output_c * p->input_c; + int kernel_size = p->kh * p->kw; + int *kernel_shifts = (int *)malloc(sizeof(int) * kernel_cnt); + if (!kernel_shifts) + return 1; + + memset(kernel_shifts, 0, sizeof(int) * kernel_cnt); + + // Part 1: + // Get right shift value for each kernel + int sum = 0; + for (int i = 0; i < kernel_cnt; i++) { + // Step 1 & 2: Get the sum of abs() + for (int j = 0; j < kernel_size; j++) { + sum += (int)(*weight < 0 ? 
-(*weight) : (*weight)); + weight++; + } + // Step 3 & 4: log2 and downward rounding + sum >>= 1; + while (sum) { + sum >>= 1; + kernel_shifts[i]++; + } + } + + // Part 2: + // Find the middle of all the values + int tag[32] = {0}; + for (int cnt = 0; cnt < kernel_cnt; cnt++) { + tag[kernel_shifts[cnt]]++; + } + + int rshift_m = 0; + int mid = 0; + do { + mid += tag[rshift_m++]; + } while(mid < (kernel_cnt - 1) >> 1); + + free(kernel_shifts); + + return rshift_m - 1; +} + +static int test_ps32_ut( + conv_param_t *p_param, CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + printf(" test_ps32_ut\n"); + int ret = 0; + int8_t *input = alloc_input(p_param); + int8_t *weight = alloc_weight(p_param); + int16_t *bias = alloc_bias(p_param); + int8_t *output_ref = (int8_t *)malloc(sizeof(int) * conv_output_size(p_param)); + if (!input || !weight || !bias || !output_ref) { + ret = -1; + goto fail_exit; + } + + p_param->r_shift_m = calc_rshift_m(p_param, weight); + ret = ps32_m2_conv_ref(p_param, input, weight, output_ref); + if (ret) + goto fail_exit; + + cvk_tiu_pt_convolution_param_t conv_param; + make_bmk_conv_param_ps32(cvk_ctx, &conv_param, p_param, 2); + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param); + + if (tl_alloc_success) { + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, conv_param.ifmap, (uint8_t *)input); + put_conv_weight(rt_handle, cvk_ctx, conv_param.weight, (uint8_t *)weight); + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &conv_param); + + cvk_tl_t ps32_ofmap; + ps32_ofmap = *conv_param.ofmap; + ps32_ofmap.shape.n = ps32_ofmap.shape.n * sizeof(int); + uint8_t *output = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, &ps32_ofmap); + + ret = array_cmp_int8( + " Comparing begin_mode results ...\n", + output_ref, (int8_t *)output, conv_output_size(p_param) * sizeof(int)); + + if (ret) { + print_conv_param(p_param); + printf(" Comparison FAILED\n"); + } + free(output); + } + free_bmk_conv_param(cvk_ctx, &conv_param, p_param); + + printf(" test_ps32_intermediate_mode\n"); + for (int i=0; i < conv_input_size(p_param); i++) + input[i] = rand() % 256 - 128; + + for (int i=0; i < conv_weight_size(p_param); i++) + weight[i] = rand() % 256 - 128; + + ret = ps32_m3_conv_ref(p_param, input, weight, output_ref); + if (ret) + goto fail_exit; + + make_bmk_conv_param_ps32(cvk_ctx, &conv_param, p_param, 3); + tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param); + + if (tl_alloc_success) { + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, conv_param.ifmap, (uint8_t *)input); + put_conv_weight(rt_handle, cvk_ctx, conv_param.weight, (uint8_t *)weight); + + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &conv_param); + + cvk_tl_t ps32_ofmap; + ps32_ofmap = *conv_param.ofmap; + ps32_ofmap.shape.n = ps32_ofmap.shape.n * sizeof(int); + + uint8_t *output = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, &ps32_ofmap); + + ret = array_cmp_int8( + " Comparing intermediate results ...\n", + output_ref, (int8_t *)output, conv_output_size(p_param) * sizeof(int)); + + if (ret) { + print_conv_param(p_param); + printf(" Comparison FAILED\n"); + } + free(output); + } + free_bmk_conv_param(cvk_ctx, &conv_param, p_param); + + printf(" test_ps32_end_mode\n"); + for (int i=0; i < conv_input_size(p_param); i++) + input[i] = rand() % 256 - 128; + + for (int i=0; i < conv_weight_size(p_param); i++) + weight[i] = rand() % 256 - 128; + + ret = ps32_m1_conv_ref(p_param, input, weight, bias, output_ref); + if (ret) + goto fail_exit; + + make_bmk_conv_param_ps32(cvk_ctx, &conv_param, p_param, 1); + + tl_alloc_success = 
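+      /* Final leg of the three-phase ps32 unit test: mode 2 ("begin") wrote
+       * raw partial sums, mode 3 ("intermediate") accumulated on top of
+       * them, and this mode-1 ("end") pass applies bias/ReLU/right-shift,
+       * so the comparison below is done at plain int8 output size. */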
bmk_conv_param_alloc_ok(&conv_param, p_param); + + if (tl_alloc_success) { + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, conv_param.ifmap, (uint8_t *)input); + put_conv_weight(rt_handle, cvk_ctx, conv_param.weight, (uint8_t *)weight); + if (p_param->using_bias) { + put_conv_bias(rt_handle, cvk_ctx, conv_param.bias, bias); + } + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &conv_param); + uint8_t *output = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, conv_param.ofmap); + + ret = array_cmp_int8( + " Comparing end results ...\n", + output_ref, (int8_t *)output, conv_output_size(p_param)); + + if (ret) { + print_conv_param(p_param); + printf(" Comparison FAILED\n"); + } + free(output); + } + free_bmk_conv_param(cvk_ctx, &conv_param, p_param); + +fail_exit: + free(input); + free(weight); + free(bias); + free(output_ref); + + return ret; +} + +static int test_ic_tiling_conv( + conv_param_t *p_param, CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + printf(" test tiled ps32 conv\n"); + int ret = 0; + int8_t *input = alloc_input(p_param); + int8_t *weight = alloc_weight(p_param); + int16_t *bias = alloc_bias(p_param); + int8_t *output_ref = (int8_t *)malloc(sizeof(int8_t) * conv_output_size(p_param)); + if (!input || !weight || !bias || !output_ref) { + ret = -1; + goto fail_exit_2; + } + + p_param->r_shift_m = calc_rshift_m(p_param, weight); + ret = conv_ref(p_param, input, weight, bias, output_ref); + if (ret) + goto fail_exit_2; + + cvk_tiu_pt_convolution_param_t conv_tmp_param; + cvk_tiu_pt_convolution_param_t conv_param; + make_bmk_conv_param(cvk_ctx, &conv_param, p_param); + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param); + if (tl_alloc_success) { + if (p_param->using_bias) { + conv_tmp_param.bias = conv_param.bias; + put_conv_bias(rt_handle, cvk_ctx, conv_param.bias, bias); + } + // for ps32_md[1] = 1, relu_enable & rshift_bits need to set to 0 + // so we store those parameters to conv_tmp_para + conv_tmp_param.relu_enable = conv_param.relu_enable; + conv_tmp_param.rshift_bits = conv_param.rshift_bits; + conv_tmp_param.bias = conv_param.bias; + + uint32_t ic_step = 1; + uint32_t n_step = 1; + cvk_tl_t ifmap = *conv_param.ifmap; + cvk_tl_t ofmap = *conv_param.ofmap; + cvk_tg_shape_t s; + s.n = conv_param.ifmap->shape.n; + s.c = conv_param.ifmap->shape.c; + s.h = conv_param.ifmap->shape.h; + s.w = conv_param.ifmap->shape.w; + cvk_tg_t *tg_ifmap = alloc_tensor_dev_mem(rt_handle, cvk_ctx, s, CVK_FMT_I8); + tensor_copy_s2d(rt_handle, tg_ifmap, (uint8_t *)input); + + s.n = conv_param.weight->shape.n; + s.c = conv_param.weight->shape.c; + s.h = conv_param.weight->shape.h; + s.w = conv_param.weight->shape.w; + uint8_t *transformed_weight = + transform_weight(&conv_param.weight->shape, (uint8_t *)weight); + cvk_tg_t *tg_weight = alloc_tensor_dev_mem(rt_handle, cvk_ctx, s, CVK_FMT_I8); + tensor_copy_s2d(rt_handle, tg_weight, (uint8_t *)transformed_weight); + free(transformed_weight); + + cvk_tl_shape_t cur_tl_ifmap_shape = { + n_step, + ic_step, + ifmap.shape.h, + ifmap.shape.w + }; + + cvk_tg_shape_t cur_tg_ifmap_shape = { + cur_tl_ifmap_shape.n, + cur_tl_ifmap_shape.c, + cur_tl_ifmap_shape.h, + cur_tl_ifmap_shape.w + }; + + cvk_tg_stride_t cur_tg_ifmap_stride = { + tg_ifmap->stride.n, + tg_ifmap->stride.c, + tg_ifmap->stride.h, + 1 + }; + + cvk_tg_t cur_tg_ifmap; + cur_tg_ifmap.base_reg_index = 0; + cur_tg_ifmap.start_address = tg_ifmap->start_address; + cur_tg_ifmap.shape = cur_tg_ifmap_shape; + cur_tg_ifmap.stride = cur_tg_ifmap_stride; + cur_tg_ifmap.fmt = CVK_FMT_I8; + + 
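+    /* The per-ic-slice loop below re-loads one weight slice and one ifmap
+     * slice per iteration and chains the partial convolutions:
+     * ps32_mode 2 for the first slice, 3 for the middle slices, and 1 for
+     * the last slice, which also restores relu/rshift/bias saved in
+     * conv_tmp_param. */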
cvk_tl_t cur_tl_ifmap; + cur_tl_ifmap.shape = cur_tl_ifmap_shape; + cur_tl_ifmap.stride = + cvk_ctx->ops->tl_default_stride(cvk_ctx, cur_tl_ifmap_shape, CVK_FMT_I8, 1); + cur_tl_ifmap.start_address = ifmap.start_address; + cur_tl_ifmap.fmt = ifmap.fmt; + + cvk_tl_t cur_tl_ofmap; + cur_tl_ofmap.start_address = ofmap.start_address; + cur_tl_ofmap.shape = ofmap.shape; + cur_tl_ofmap.shape.n = n_step; + cur_tl_ofmap.stride = + cvk_ctx->ops->tl_default_stride(cvk_ctx, cur_tl_ofmap.shape, CVK_FMT_I8, 1); + cur_tl_ofmap.fmt = ofmap.fmt; + + cvk_tl_t cur_tl_weight; + cur_tl_weight.start_address = conv_param.weight->start_address; + cur_tl_weight.shape = conv_param.weight->shape; + cur_tl_weight.shape.n = ic_step; + cur_tl_weight.stride.n = 1; + cur_tl_weight.stride.c = cur_tl_weight.shape.n * cur_tl_weight.shape.h * cur_tl_weight.shape.w; + cur_tl_weight.stride.h = cur_tl_weight.shape.n * cur_tl_weight.shape.w; + cur_tl_weight.stride.w = cur_tl_weight.shape.n; + cur_tl_weight.fmt = conv_param.weight->fmt; + + const cvk_tl_t *saved_tl_weight = conv_param.weight; + const cvk_tl_t *saved_tl_ifmap = conv_param.ifmap; + for (uint32_t ci = 0; ci < ifmap.shape.c; ci += ic_step) { + { + uint32_t ic = tg_weight->shape.n; + uint32_t oc = tg_weight->shape.c; + uint32_t kh = tg_weight->shape.h; + uint32_t kw = tg_weight->shape.w; + + cvk_tg_t cur_tdma_tg_weight; + cur_tdma_tg_weight.base_reg_index = tg_weight->base_reg_index; + cur_tdma_tg_weight.start_address = tg_weight->start_address + ci; + cur_tdma_tg_weight.fmt = tg_weight->fmt; + cur_tdma_tg_weight.shape = tg_shape_t4(1, oc, kh * kw, ic); + cur_tdma_tg_weight.stride = + cvk_ctx->ops->tg_default_stride(cvk_ctx, cur_tdma_tg_weight.shape, cur_tdma_tg_weight.fmt); + cur_tdma_tg_weight.shape = tg_shape_t4(1, oc, kh * kw, ic_step); + + cvk_tl_t cur_tdma_tl_weight; + cur_tdma_tl_weight = cur_tl_weight; + cur_tdma_tl_weight.shape.n = cur_tdma_tg_weight.shape.n; + cur_tdma_tl_weight.shape.c = cur_tdma_tg_weight.shape.c; + cur_tdma_tl_weight.shape.h = cur_tdma_tg_weight.shape.h; + cur_tdma_tl_weight.shape.w = cur_tdma_tg_weight.shape.w; + cur_tdma_tl_weight.stride = cvk_ctx->ops->tl_default_stride( + cvk_ctx, cur_tdma_tl_weight.shape, CVK_FMT_I8, 0); + + cvk_tdma_g2l_tensor_copy_param_t p1; + memset(&p1, 0, sizeof(p1)); + p1.src = &cur_tdma_tg_weight; + p1.dst = &cur_tdma_tl_weight; + cvk_ctx->ops->tdma_g2l_tensor_copy(cvk_ctx, &p1); + CVI_RT_Submit(cvk_ctx); + } + { + cvk_tdma_g2l_tensor_copy_param_t p2; + memset(&p2, 0, sizeof(p2)); + cur_tg_ifmap.start_address = + tg_ifmap->start_address + ci * tg_ifmap->stride.c; + p2.src = &cur_tg_ifmap; + p2.dst = &cur_tl_ifmap; + cvk_ctx->ops->tdma_g2l_tensor_copy(cvk_ctx, &p2); + CVI_RT_Submit(cvk_ctx); + } + + conv_param.ifmap = &cur_tl_ifmap; + conv_param.weight = &cur_tl_weight; + + // for ps32_md[1] = 1, relu_enable & rshift_bits need to set to 0 + // so we store those parameters to conv_tmp_para + if (ci == (ifmap.shape.c - 1)) + { + conv_param.relu_enable = conv_tmp_param.relu_enable; + conv_param.rshift_bits = conv_tmp_param.rshift_bits; + conv_param.bias = conv_tmp_param.bias; + conv_param.ps32_mode = 1; + } + else if (ci == 0) + { + conv_param.relu_enable = 0; + conv_param.rshift_bits = 0; + conv_param.bias = 0;; + conv_param.ps32_mode = 2; + } + else + { + conv_param.relu_enable = 0; + conv_param.rshift_bits = 0; + conv_param.bias = 0;; + conv_param.ps32_mode = 3; + } + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &conv_param); + conv_param.weight = saved_tl_weight; + conv_param.ifmap = saved_tl_ifmap; + } + + 
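+    /* At this point every ic slice has been accumulated into the same
+     * ofmap, and only the final mode-1 slice applied relu/rshift/bias, so
+     * the result read back below is directly comparable to conv_ref. */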
uint8_t *output = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, conv_param.ofmap);
+
+    free_tensor_dev_mem(rt_handle, tg_ifmap);
+    free_tensor_dev_mem(rt_handle, tg_weight);
+    ret = array_cmp_int8(
+        " Comparing results ...\n",
+        output_ref, (int8_t *)output, conv_output_size(p_param));
+
+    if (ret) {
+      print_conv_param(p_param);
+      printf(" Comparison FAILED\n");
+    }
+    free(output);
+  }
+  free_bmk_conv_param(cvk_ctx, &conv_param, p_param);
+
+fail_exit_2:
+  free(input);
+  free(weight);
+  free(output_ref);
+  free(bias);
+
+  return ret;
+}
+
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  for (int i = 0; i < 5; i++) {
+    printf("random_test_conv iteration: %d\n", i);
+    conv_param_t test_conv_param;
+    init_conv_param(&test_conv_param);
+    ret |= test_ic_tiling_conv(&test_conv_param, rt_handle, cvk_ctx);
+    ret |= test_ps32_ut(&test_conv_param, rt_handle, cvk_ctx);
+    if (!test_conv_param.using_bias)
+      test_conv_param.using_bias = 1;
+    if (test_conv_param.output_c <= 9)
+      test_conv_param.output_c += 3;
+    ret |= test_ic_tiling_conv(&test_conv_param, rt_handle, cvk_ctx);
+    ret |= test_ps32_ut(&test_conv_param, rt_handle, cvk_ctx);
+  }
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/180x/test_180x_conv_qm.c b/cviruntime/test/180x/test_180x_conv_qm.c
new file mode 100644
index 000000000..9db8764bf
--- /dev/null
+++ b/cviruntime/test/180x/test_180x_conv_qm.c
@@ -0,0 +1,1568 @@
+#include <assert.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "test_cvikernel_util.h"
+#include "test_tf_quant_util.h"
+#include "test_native_ref.h"
+
+// #define ENABLE_DEBUG_MSG
+// #define ENABLE_FULL_REGRESSION
+
+#define MIN_EXEC_TESTS 20
+
+typedef struct {
+  int input_n;
+  int input_c;
+  int input_h;
+  int input_w;
+  int kw;
+  int kh;
+  int dh;
+  int dw;
+  int pad_top;
+  int pad_bot;
+  int pad_left;
+  int pad_right;
+  int ins_h;
+  int ins_h_last;
+  int ins_w;
+  int ins_w_last;
+  int stride_h;
+  int stride_w;
+  int output_c;
+  int output_h;
+  int output_w;
+  int has_bias;
+  int relu_enable;
+  int8_t *input_data;
+  int8_t *filter_data;
+  int8_t *output_data;
+  int32_t *bias_data;
+  uint32_t *multiplier_data;
+  int8_t *shift_data;
+  float float_multiplier;
+  int retry_cnt;
+} conv_test_param_t;
+
+static inline int Offset(cvk_tl_shape_t shape, int n, int c, int h, int w)
+{
+  return n * (shape.c * shape.h * shape.w) + c * (shape.h * shape.w) +
+         h * shape.w + w;
+}
+
+void conv_per_channel_ref(conv_test_param_t *p_param)
+{
+  const int stride_width = p_param->stride_w;
+  const int stride_height = p_param->stride_h;
+  const int dilation_width_factor = 1;
+  const int dilation_height_factor = 1;
+  const int pad_width = p_param->pad_left;
+  const int pad_height = p_param->pad_top;
+
+  const int32_t output_activation_min = -128;
+  const int32_t output_activation_max = 127;
+
+  const int batches = p_param->input_n;
+  const int input_depth = p_param->input_c;
+  const int output_depth = p_param->output_c;
+
+  const int input_height = p_param->input_h;
+  const int input_width = p_param->input_w;
+  const int filter_height = p_param->kh;
+  const int filter_width = p_param->kw;
+  const int output_height = p_param->output_h;
+  const int output_width = 
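+  /* Per-channel requantization reference: each int32 accumulator is scaled
+   * roughly as
+   *   acc = rounding_rshift((acc * multiplier) / 2^31, rshift)
+   * per output channel (TF-Lite-style fixed point -- assuming that is what
+   * MultiplyByQuantizedMultiplier from test_tf_quant_util.h implements),
+   * then clamped to [-128, 127]. */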
p_param->output_w; + int8_t *input_data = p_param->input_data; + int8_t *filter_data = p_param->filter_data; + int8_t *output_data = p_param->output_data; + int32_t *bias_data = p_param->has_bias ? p_param->bias_data : NULL; + uint32_t *output_multiplier = p_param->multiplier_data; + int8_t *output_rshift = p_param->shift_data; + + cvk_tl_shape_t input_shape = { + batches, input_depth, input_height, input_width}; + cvk_tl_shape_t filter_shape = { + output_depth, filter_height, filter_width, input_depth}; + cvk_tl_shape_t output_shape = { + batches, output_depth, output_height, output_width}; + +#ifdef ENABLE_DEBUG_MSG + printf("conv_per_channel_ref: \n" + " input (n=%d, ic=%d, h=%d, w=%d)\n" + " kernel (oc=%d, kh=%d, kw=%d, ic=%d)\n", + batches, input_depth, input_height, input_width, output_depth, + filter_height, filter_width, input_depth); +#endif + + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + if (is_point_inside_image) { + int32_t input_val = input_data[Offset(input_shape, batch, + in_channel, in_y, in_x)]; + // int32_t filter_val = filter_data[Offset(filter_shape, + // out_channel, in_channel, + // filter_y, filter_x)]; + int32_t filter_val = + filter_data[Offset(filter_shape, out_channel, filter_y, + filter_x, in_channel)]; + acc += filter_val * input_val; + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d]" + "[filter_y=%d][filter_x=%d][in_channel=%d] acc(%d) += " + "%d * %d = %d\n", + batch, out_channel, out_y, out_x, filter_y, filter_x, + in_channel, acc - filter_val * input_val, filter_val, + input_val, acc); +#endif + } + } + } + } + + if (bias_data) { + acc += bias_data[out_channel]; + } + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d] acc = %d, " + "bias %d\n", + batch, out_channel, out_y, out_x, acc, + bias_data ? 
bias_data[out_channel] : 0); +#endif + + acc = MultiplyByQuantizedMultiplier( + acc, output_multiplier[out_channel], output_rshift[out_channel]); + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d] acc = %d, " + "multiplier %d, shift %d\n", + batch, out_channel, out_y, out_x, acc, + output_multiplier[out_channel], output_rshift[out_channel]); +#endif + + acc = MAX(acc, output_activation_min); + acc = MIN(acc, output_activation_max); + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d] acc = %d\n", + batch, out_channel, out_y, out_x, acc); +#endif + + output_data[Offset(output_shape, batch, out_channel, out_y, out_x)] = + (uint8_t)acc; + } + } + } + } +} + +void calc_conv_float_multiplier(conv_test_param_t *p_param) +{ + const int stride_width = p_param->stride_w; + const int stride_height = p_param->stride_h; + const int dilation_width_factor = 1; + const int dilation_height_factor = 1; + const int pad_width = p_param->pad_left; + const int pad_height = p_param->pad_top; + + const int batches = p_param->input_n; + const int input_depth = p_param->input_c; + const int output_depth = p_param->output_c; + + const int input_height = p_param->input_h; + const int input_width = p_param->input_w; + const int filter_height = p_param->kh; + const int filter_width = p_param->kw; + const int output_height = p_param->output_h; + const int output_width = p_param->output_w; + int8_t *input_data = p_param->input_data; + int8_t *filter_data = p_param->filter_data; + int32_t *bias_data = p_param->has_bias ? p_param->bias_data : NULL; + + cvk_tl_shape_t input_shape = { + batches, input_depth, input_height, input_width}; + cvk_tl_shape_t filter_shape = { + output_depth, filter_height, filter_width, input_depth}; + + int output_accu_min = INT_MAX; + int output_accu_max = INT_MIN; + +#ifdef ENABLE_DEBUG_MSG + printf("calc_conv_float_multiplier =>\n"); +#endif + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + if (is_point_inside_image) { + int32_t input_val = input_data[Offset(input_shape, batch, + in_channel, in_y, in_x)]; + // int32_t filter_val = filter_data[Offset(filter_shape, + // out_channel, in_channel, + // filter_y, filter_x)]; + int32_t filter_val = + filter_data[Offset(filter_shape, out_channel, filter_y, + filter_x, in_channel)]; + acc += filter_val * input_val; + + // printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d]" + // "[filter_y=%d][filter_x=%d][in_channel=%d] acc(%d) + // += %d * %d = %d\n", batch, out_channel, out_y, + // out_x, filter_y, filter_x, in_channel, acc - + // filter_val * input_val, filter_val, input_val, acc); + } + } + } + } + + if (bias_data) { + acc += bias_data[out_channel]; + } + + output_accu_max = MAX(acc, output_accu_max); + 
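+          /* Once the scan finishes, the code below picks the float
+           * multiplier so the extreme accumulator just fits int8, e.g. a
+           * range of [-5000, 20000] gives 127.0 / 20000 ~= 0.00635. */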
output_accu_min = MIN(acc, output_accu_min); + } + } + } + } + + // Since int8 ranges from -128 to 127, we need to squeeze the accumulator + // MIN/MAX fit in those ranges correspondingly as much as possible. + if (abs(output_accu_max) > abs(output_accu_min)) { + p_param->float_multiplier = 127.0f / abs(output_accu_max); + } else { + p_param->float_multiplier = 128.0f / abs(output_accu_min); + } + +#ifdef ENABLE_DEBUG_MSG + printf(" output_accu_min %d, output_accu_max %d, output_multiplier %f\n", + output_accu_min, output_accu_max, p_param->float_multiplier); +#endif + +#ifdef ENABLE_DEBUG_MSG + printf("<= calc_dw_conv_float_multiplier\n"); +#endif +} + +int simple_test(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + + const int batches = 1; + const int input_depth = 2; + const int input_height = 2; + const int input_width = 3; + cvk_tl_shape_t input_shape = {batches, input_depth, input_height, input_width}; + int8_t input_data[12] = { + 9, 1, -11, // ic = 0, h = 0 + 13, 5, -15, // ic = 0, h = 1 + 5, -7, -15, // ic = 1, h = 0 + 9, -11, -19 // ic = 1, h = 1 + }; + + const int output_depth = 2; + const int kernel_height = 2; + const int kernel_width = 2; + cvk_tl_shape_t filter_shape = {output_depth, input_depth, kernel_height, + kernel_width}; + + // TIU weight layout (1, oc, hw*kc, ic) + cvk_tl_shape_t filter_shape_for_dma = {1, output_depth, + kernel_height * kernel_width, input_depth}; + int8_t filter_data_for_dma[16] = { + 2, 4, 6, 8, 6, 8, 10, 12, // oc = 0 + 28, 32, 20, 24, 12, 16, 4, 8 // oc = 1 + }; + + int32_t bias_data[2] = {12, -16}; + + const int output_height = 1; + const int output_width = 2; + cvk_tl_shape_t output_shape = {1, output_depth, output_height, output_width}; + // zero_point = 0 + int8_t ref_output_data[4] = { + 17, -128, // oc = 0 + 60, -128, // oc = 1 + }; + + uint32_t output_multiplier[] = {1073741824, 1073741824}; + int8_t output_rshift[2] = {1, 2}; // changed to right shift + + int8_t output_data[4]; + + conv_test_param_t params; + memset(¶ms, 0, sizeof(params)); + + params.input_n = batches; + params.input_c = input_depth; + params.input_h = input_height; + params.input_w = input_width; + params.kh = kernel_height; + params.kw = kernel_width; + params.output_c = output_depth; + params.output_h = output_height; + params.output_w = output_width; + params.stride_w = 1; + params.stride_h = 1; + params.input_data = input_data; + params.filter_data = filter_data_for_dma; + params.output_data = output_data; + params.has_bias = 1; + params.bias_data = bias_data; + params.multiplier_data = output_multiplier; + params.shift_data = output_rshift; + conv_per_channel_ref(¶ms); + + printf("Compare ref and golden\n"); + for (int i = 0; i < 4; i++) { + if (output_data[i] != ref_output_data[i]) { + printf("Error ! 
output[%d]=%d != ref_output_data[%d]=%d\n", i, + output_data[i], i, ref_output_data[i]); + ret = -1; + } + } + + // cvk_tl_shape_t per_channel_cal_shape = {1, /*oc=*/2, 1, 9}; + uint8_t per_channel_cal_data[18]; + pack_chl_quan_param(2, /*has_bias=*/true, bias_data, output_multiplier, + output_rshift, per_channel_cal_data); + + cvk_tl_shape_t quan_param_shape = {1, 2, 1, 9}; + cvk_tl_t *tl_per_channel_cal = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, quan_param_shape, CVK_FMT_U8, + /*eu_align*/ 0); + + cvk_tl_t *tl_input = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, input_shape, CVK_FMT_I8, /*eu_align=*/1); + + cvk_tl_t *tl_filter = cvk_ctx->ops->lmem_alloc_tensor( + cvk_ctx, filter_shape_for_dma, CVK_FMT_I8, /*eu_align=*/1); + + cvk_tl_t *tl_output = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, output_shape, CVK_FMT_I8, /*eu_align=*/1); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_per_channel_cal, per_channel_cal_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_input, (uint8_t *)input_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_filter, (uint8_t *)filter_data_for_dma); + + // Restore filter shape for tiu operation + tl_filter->shape = filter_shape; + tl_filter->stride = cvk_ctx->ops->tl_default_stride( + cvk_ctx, tl_filter->shape, CVK_FMT_I8, /*eu_align=*/1); + + { + // Reshape per channel quantization data + tl_per_channel_cal->shape = tl_shape_t4(1, 2, 1, 1); + tl_per_channel_cal->stride = cvk_ctx->ops->tl_default_stride( + cvk_ctx, tl_per_channel_cal->shape, CVK_FMT_I8, /*eu_align=*/0); + + cvk_tiu_convolution_param_t param; + memset(¶m, 0, sizeof(param)); + param.ofmap = tl_output; + param.ifmap = tl_input; + param.weight = tl_filter; + param.chl_quan_param = tl_per_channel_cal; + param.stride_h = 1; + param.stride_w = 1; + param.dilation_h = 1; + param.dilation_w = 1; + param.has_bias = 1; + cvk_ctx->ops->tiu_convolution(cvk_ctx, ¶m); + } + + CVI_RT_Submit(cvk_ctx); + + printf("Compare tiu and golden\n"); + int8_t *conv_output_data = + (int8_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_output); + for (int i = 0; i < (int)sizeof(ref_output_data); i++) { + if (conv_output_data[i] != ref_output_data[i]) { + printf("output_data[%d] %d != %d\n", i, conv_output_data[i], + ref_output_data[i]); + ret = -1; + } + } + + free(conv_output_data); + + // Reverse order + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_filter); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_per_channel_cal); + + return ret; +} + +void fill_random_data_s8(int8_t *input_data, int size) +{ + for (int i = 0; i < size; ++i) { + int is_satured = ((rand() % 1000) == 1) ? 1 : 0; + int is_sign = rand() % 2 ? 1 : -1; + + if (is_satured && is_sign) { + input_data[i] = -128; + } else if (is_satured) { + input_data[i] = 127; + } else { + input_data[i] = is_sign * rand() % 128; + } + } +} + +void fill_random_data_s32(int32_t *input_data, int size) +{ + for (int i = 0; i < size; ++i) { + int is_satured = ((rand() % 1000) == 1) ? 1 : 0; + int is_sign = rand() % 2 ? 
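+    /* is_sign is either +1 or -1 and therefore always truthy, so a
+     * saturated draw always takes the INT_MIN branch below; the INT_MAX
+     * branch is unreachable as written. In the common case,
+     * is_sign * rand() % 128 parses as (is_sign * rand()) % 128, which
+     * yields values in [-127, 127] because C's % keeps the sign of the
+     * left operand. */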
1 : -1; + + if (is_satured && is_sign) { + input_data[i] = INT_MIN; + } else if (is_satured) { + input_data[i] = INT_MAX; + } else { + input_data[i] = is_sign * rand() % 128; + } + } +} + +bool check_valid_test_param(cvk_context_t *cvk_ctx, conv_test_param_t *p_param) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int oh = p_param->output_h; + int ow = p_param->output_w; + int kh = p_param->kh; + int kw = p_param->kw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int chl_quan_per_lane_data_size = + p_param->has_bias ? 9 : 5; // bias(4) + multiplier(4) + shift(1) + + // Skip invalid shape + if ((kh > ih) || (kw > iw) || (stride_h > ih) || (stride_w > iw)) { + return false; + } + + // muliply random-choosen value may exceeded than int32_t + uint32_t input_size = in * ic * ih * iw; + uint32_t kernel_size = oc * ic * kh * kw; + uint32_t output_size = in * oc * oh * ow; + + uint32_t lmem_size_per_lane = cvk_ctx->info.lmem_size; + uint32_t total_lmem_size = cvk_ctx->info.lmem_size * cvk_ctx->info.npu_num; + + uint32_t total_needed_size = input_size + kernel_size + output_size + + chl_quan_per_lane_data_size * cvk_ctx->info.npu_num; + if (total_needed_size > total_lmem_size) { + return false; + } + + cvk_tl_shape_t input_shape = tl_shape_t4(in, ic, ih, iw); + cvk_tl_shape_t filter_shape = tl_shape_t4(1, oc, kh * kw, ic); + cvk_tl_shape_t output_shape = tl_shape_t4(in, oc, oh, ow); + cvk_tl_shape_t cal_shape = tl_shape_t4(1, oc, 1, chl_quan_per_lane_data_size); + + uint32_t needed_size = + cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, input_shape, CVK_FMT_I8, /*eu_align=*/1) + + cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, filter_shape, CVK_FMT_I8, /*eu_align=*/0) + + cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, output_shape, CVK_FMT_I8, /*eu_align=*/1) + + cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, cal_shape, CVK_FMT_I8, /*eu_align=*/0); + + // Skip invalid shape + if (needed_size > lmem_size_per_lane) { + return false; + } + + return true; +} + +int choose_from_range(int table[], int size, int index) +{ + if (index >= size) { + return 0; + } + + int val = table[index]; + if (index < (size - 1)) { + int range = MAX(table[index + 1] - table[index] - 1, 1); + val += rand() % range; + } + + return val; +} + +void dump_test_param(conv_test_param_t *p_param, bool dump_content) +{ + printf("Dump test parameter:\n"); + printf(" input_n %d\n", p_param->input_n); + printf(" input_c %d\n", p_param->input_c); + printf(" input_h %d\n", p_param->input_h); + printf(" input_w %d\n", p_param->input_w); + printf(" kw %d\n", p_param->kw); + printf(" kh %d\n", p_param->kh); + printf(" dh %d\n", p_param->dh); + printf(" dw %d\n", p_param->dw); + printf(" pad_top %d\n", p_param->pad_top); + printf(" pad_bot %d\n", p_param->pad_bot); + printf(" pad_left %d\n", p_param->pad_left); + printf(" pad_right %d\n", p_param->pad_right); + printf(" ins_h %d\n", p_param->ins_h); + printf(" ins_h_last %d\n", p_param->ins_h_last); + printf(" ins_w %d\n", p_param->ins_w); + printf(" ins_w_last %d\n", p_param->ins_w_last); + printf(" stride_h %d\n", p_param->stride_h); + printf(" stride_w %d\n", p_param->stride_w); + printf(" output_c %d\n", p_param->output_c); + printf(" output_h %d\n", p_param->output_h); + printf(" output_w %d\n", p_param->output_w); + printf(" has_bias %d\n", p_param->has_bias); + printf(" relu_enable %d\n", p_param->relu_enable); + + if (dump_content) { + printf("input_data(%d, %d, %d, 
%d) :\n", p_param->input_n, p_param->input_c, + p_param->input_h, p_param->input_w); + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + for (int i = 0; i < in; ++i) { + for (int j = 0; j < ic; ++j) { + for (int k = 0; k < ih; ++k) { + for (int l = 0; l < iw; ++l) { + int offset = i * (ic * ih * iw) + j * (ih * iw) + k * iw + l; + printf("%d, ", p_param->input_data[offset]); + } + printf("\n"); + } + } + } + printf("\n\n"); + + printf("kener_data (oc=%d, kh=%d, kw=%d, ic=%d)\n", p_param->output_c, + p_param->kh, p_param->kw, p_param->input_c); + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + for (int i = 0; i < oc; ++i) { + for (int j = 0; j < kh; ++j) { + for (int k = 0; k < kw; ++k) { + for (int l = 0; l < ic; ++l) { + int offset = i * (kh * kw * ic) + j * (kw * ic) + k * ic + l; + printf("%d, ", p_param->filter_data[offset]); + } + printf("\n"); + } + } + } + printf("\n\n"); + + if (p_param->has_bias) { + printf("bias_data:\n"); + for (int i = 0; i < oc; ++i) { + printf("%d, ", p_param->bias_data[i]); + } + printf("\n\n"); + } + + printf("multiplier_data:\n"); + for (int i = 0; i < oc; ++i) { + printf("%d, ", p_param->multiplier_data[i]); + } + printf("\n\n"); + + printf("shift_data:\n"); + for (int i = 0; i < oc; ++i) { + printf("%d, ", p_param->shift_data[i]); + } + printf("\n\n"); + } +} + + + +static conv_test_param_t keepFailParam;; +static int8_t *keep_input_data = NULL; + +static int keep_kernel_size = 0; +static int8_t *keep_kernel_data = NULL; + +static int keep_output_size = 0; +static int8_t *keep_output_data = NULL; + +static int32_t *keep_bias_data = NULL; +static uint32_t *keep_multiplier_data = NULL; +static int8_t *keep_shift_data = NULL; + + +int keep_fail_param(conv_test_param_t *p_param) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int oh = p_param->output_h; + int ow = p_param->output_w; + int kh = p_param->kh; + int kw = p_param->kw; + //int dh = p_param->dh; + //int dw = p_param->dw; + //int pad_top = p_param->pad_top; + //int pad_bot = p_param->pad_bot; + //int pad_left = p_param->pad_left; + //int pad_right = p_param->pad_right; + //int ins_h = p_param->ins_h; + //int ins_last_h = p_param->ins_h_last; + //int ins_w = p_param->ins_w; + //int ins_last_w = p_param->ins_w_last; + //int stride_h = p_param->stride_h; + //int stride_w = p_param->stride_w; + int has_bias = p_param->has_bias; + //int relu_enable = p_param->relu_enable; + + + memcpy(&keepFailParam, p_param, sizeof(conv_test_param_t)); + + int input_size = in * ic * iw * ih; + keep_input_data = (int8_t *)malloc(input_size); + memcpy(keep_input_data, p_param->input_data, input_size); + + + keep_kernel_size = oc * ic * kh * kw; + keep_kernel_data = (int8_t *)malloc(keep_kernel_size); + memcpy(keep_kernel_data, p_param->filter_data, keep_kernel_size); + + keep_output_size = in * oc * oh * ow; + keep_output_data = (int8_t *)malloc(keep_output_size); + memcpy(keep_output_data, p_param->output_data, keep_output_size); + + keep_bias_data = (int32_t *) malloc(sizeof(int32_t) * oc); + memcpy(keep_bias_data, p_param->bias_data, sizeof(int32_t) * oc); + + keep_multiplier_data = (uint32_t *) malloc(sizeof(uint32_t) * oc); + memcpy(keep_multiplier_data, p_param->multiplier_data, sizeof(int32_t) * oc); + + keep_shift_data = (int8_t *)malloc(oc); + memcpy(keep_shift_data, p_param->shift_data, oc); + + + + 
keepFailParam.input_data = keep_input_data; + keepFailParam.filter_data = keep_kernel_data; + keepFailParam.output_data = keep_output_data; + keepFailParam.has_bias = has_bias; + keepFailParam.bias_data = keep_bias_data; + keepFailParam.multiplier_data = keep_multiplier_data; + keepFailParam.shift_data = keep_shift_data; + + return 0; +} + + +void dump2_test_param(conv_test_param_t *p_param) +{ + printf("dump2_test_param:\n"); + printf(" input_n %d\n", p_param->input_n); + printf(" input_c %d\n", p_param->input_c); + printf(" input_h %d\n", p_param->input_h); + printf(" input_w %d\n", p_param->input_w); + printf(" kw %d\n", p_param->kw); + printf(" kh %d\n", p_param->kh); + printf(" dh %d\n", p_param->dh); + printf(" dw %d\n", p_param->dw); + printf(" pad_top %d\n", p_param->pad_top); + printf(" pad_bot %d\n", p_param->pad_bot); + printf(" pad_left %d\n", p_param->pad_left); + printf(" pad_right %d\n", p_param->pad_right); + printf(" ins_h %d\n", p_param->ins_h); + printf(" ins_h_last %d\n", p_param->ins_h_last); + printf(" ins_w %d\n", p_param->ins_w); + printf(" ins_w_last %d\n", p_param->ins_w_last); + printf(" stride_h %d\n", p_param->stride_h); + printf(" stride_w %d\n", p_param->stride_w); + printf(" output_c %d\n", p_param->output_c); + printf(" output_h %d\n", p_param->output_h); + printf(" output_w %d\n", p_param->output_w); + printf(" has_bias %d\n", p_param->has_bias); + printf(" relu_enable %d\n", p_param->relu_enable); + + keep_fail_param(p_param); + printf("dump2_test_param\n\n"); + assert(0); +} + +int run_compare_conv(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, + conv_test_param_t *p_param) +{ + int ret = 0; + + if (rt_handle == NULL || cvk_ctx == NULL) { + return -1; + } + + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int oh = p_param->output_h; + int ow = p_param->output_w; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_last_h = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_last_w = p_param->ins_w_last; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int has_bias = p_param->has_bias; + int relu_enable = p_param->relu_enable; + + int input_size = in * ic * iw * ih; + int8_t *input_data = (int8_t *)malloc(input_size); + + int kernel_size = oc * ic * kh * kw; + int8_t *kernel_data = (int8_t *)malloc(kernel_size); + + int output_size = in * oc * oh * ow; + int8_t *output_data = (int8_t *)malloc(output_size); + memset(output_data, 0, output_size); + + int32_t *bias_data = (int32_t *) malloc(sizeof(int32_t) * oc); + uint32_t *multiplier_data = (uint32_t *) malloc(sizeof(uint32_t) * oc); + int8_t *shift_data = (int8_t *)malloc(oc); + + p_param->input_data = input_data; + p_param->filter_data = kernel_data; + p_param->output_data = output_data; + p_param->has_bias = has_bias; + p_param->bias_data = bias_data; + p_param->multiplier_data = multiplier_data; + p_param->shift_data = shift_data; + +#ifdef ENABLE_DEBUG_MSG + printf(" run_compare_conv =>\n"); + printf(" input (n=%d, ic=%d, h=%d, w=%d), kernel (oc=%d, ic=%d, h=%d, " + "w=%d), output (, c=%d, h=%d, w=%d), has_bias %d\n", + in, ic, ih, iw, oc, ic, kh, kw, oc, oh, ow, has_bias); +#endif + + int retry_cnt = p_param->retry_cnt; + do { + 
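+    /* Regenerate random inputs until calc_conv_float_multiplier() lands
+     * in (0, 1), the only range QuantizeMultiplierSmallerThanOne() below
+     * can encode; give up after retry_cnt attempts. */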
fill_random_data_s8(input_data, input_size); + fill_random_data_s8(kernel_data, kernel_size); + if (has_bias) { + fill_random_data_s32(bias_data, oc); + } + + p_param->float_multiplier = 100.0; // should be < 1.0 + calc_conv_float_multiplier(p_param); + + if (p_param->float_multiplier > 0.f && p_param->float_multiplier < 1.0) { + break; + } + + } while (--retry_cnt); + + if (p_param->float_multiplier >= 1.0) { + printf(" run_compare_dw_conv: unable to find valid multiplier\n"); + free(input_data); + free(kernel_data); + free(output_data); + free(bias_data); + free(multiplier_data); + free(shift_data); + + return -1; + } + + uint32_t base_multiplier = 0; + int base_shift = 0; + QuantizeMultiplierSmallerThanOne(p_param->float_multiplier, &base_multiplier, + &base_shift); + + for (int i = 0; i < oc; ++i) { + // multipliers typically range in [2^30 ; 2^31 - 1]. + // Values in [0, 2^30 - 1] are normally unused, but harmless. + // Thus a good way to randomize multipliers is to subtract from them + // a random value smaller than 2^30 but still significant compared to it. + p_param->multiplier_data[i] = base_multiplier - (rand() % (1 << 26)); + + int right_shift = base_shift - 1 + (rand() % 4); + p_param->shift_data[i] = + truncate_rshift((int8_t)right_shift, /*allow_lshift*/1); + +#ifdef ENABLE_DEBUG_MSG + printf(" [oc=%d] multiplier_data %d, shift_data %d\n", i, + p_param->multiplier_data[i], p_param->shift_data[i]); +#endif + } + + conv_per_channel_ref(p_param); + + const int chl_quan_per_lane_data_size = + p_param->has_bias ? 9 : 5; // bias(4) + multiplier(4) + shift(1) + const int cal_data_size = oc * chl_quan_per_lane_data_size; + uint8_t *chl_quan_data = (uint8_t *) malloc(cal_data_size); + pack_chl_quan_param(oc, p_param->has_bias, p_param->bias_data, + p_param->multiplier_data, p_param->shift_data, + chl_quan_data); + + cvk_tl_shape_t input_shape = tl_shape_t4(in, ic, ih, iw); + cvk_tl_shape_t filter_shape = tl_shape_t4(1, oc, kh * kw, ic); + cvk_tl_shape_t output_shape = tl_shape_t4(in, oc, oh, ow); + cvk_tl_shape_t cal_shape = tl_shape_t4(1, oc, 1, chl_quan_per_lane_data_size); + + cvk_tl_t *tl_input = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, input_shape, CVK_FMT_I8, /*eu_aign=*/1); + + cvk_tl_t *tl_filter = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, filter_shape, CVK_FMT_I8, /*eu_align=*/0); + + cvk_tl_t *tl_output = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, output_shape, CVK_FMT_I8, /*eu_align=*/1); + + // Shape for TDMA load + cvk_tl_t *tl_cal_data = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, cal_shape, CVK_FMT_U8, /*eu_align*/ 0); + + if (!tl_input || !tl_filter || !tl_output || !tl_cal_data) { + if (tl_input == NULL) { + printf(" fail to alloc tl_input (%d, %d, %d, %d)\n", input_shape.n, + input_shape.c, input_shape.h, input_shape.w); + } + if (tl_filter == NULL) { + printf(" fail to alloc tl_filter (%d, %d, %d, %d)\n", filter_shape.n, + filter_shape.c, filter_shape.h, filter_shape.w); + } + if (tl_output == NULL) { + printf(" fail to alloc tl_output (%d, %d, %d, %d)\n", output_shape.n, + output_shape.c, output_shape.h, output_shape.w); + } + if (tl_cal_data == NULL) { + printf(" fail to alloc tl_cal_data (%d, %d ,%d, %d)\n", cal_shape.n, + cal_shape.c, cal_shape.h, cal_shape.w); + } + + // Reverse order + if (tl_cal_data) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_cal_data); + if (tl_output) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output); + if (tl_filter) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_filter); + if (tl_input) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, 
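+      /* Frees run in reverse allocation order here and on the success
+       * path, matching the apparent stack discipline of the local-memory
+       * allocator. */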
tl_input); + + return -1; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_cal_data, chl_quan_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_input, (uint8_t *)input_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_filter, (uint8_t *)kernel_data); + + { + // Reshape per channel quantization data for TIU + tl_cal_data->shape = tl_shape_t4(1, oc, 1, 1); + tl_cal_data->stride = cvk_ctx->ops->tl_default_stride( + cvk_ctx, tl_cal_data->shape, CVK_FMT_I8, /*eu_align=*/0); + + // Reshape weight for TIU + tl_filter->shape = tl_shape_t4(ic, oc, kh, kw); + + cvk_tiu_convolution_param_t param; + memset(¶m, 0, sizeof(param)); + param.ofmap = tl_output; + param.ifmap = tl_input; + param.weight = tl_filter; + param.chl_quan_param = tl_cal_data; + param.ins_h = ins_h; + param.ins_last_h = ins_last_h; + param.ins_w = ins_w; + param.ins_last_w = ins_last_w; + param.stride_h = stride_h; + param.stride_w = stride_w; + param.dilation_h = dh; + param.dilation_w = dw; + param.pad_top = pad_top; + param.pad_bottom = pad_bot; + param.pad_left = pad_left; + param.pad_right = pad_right; + param.has_bias = has_bias; + param.relu_enable = relu_enable; + +#ifdef ENABLE_DEBUG_MSG + printf(" tiu_conv_qdm:\n"); + printf(" ifmap shape (%d, %d, %d, %d)\n", param.ifmap->shape.n, + param.ifmap->shape.c, param.ifmap->shape.h, param.ifmap->shape.w); + printf(" weight shape (%d, %d, %d, %d)\n", param.weight->shape.n, + param.weight->shape.c, param.weight->shape.h, param.weight->shape.w); + printf(" ofmap shape (%d, %d, %d, %d)\n", param.ofmap->shape.n, + param.ofmap->shape.c, param.ofmap->shape.h, param.ofmap->shape.w); +#endif + + cvk_ctx->ops->tiu_convolution(cvk_ctx, ¶m); + } + + CVI_RT_Submit(cvk_ctx); + +#ifdef ENABLE_DEBUG_MSG + printf(" compare result:\n"); +#endif + int8_t *conv_output_data = + (int8_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_output); + for (int i = 0; i < in; ++i) { + for (int j = 0; j < oc; ++j) { + for (int k = 0; k < oh; ++k) { + for (int l = 0; l < ow; ++l) { + int offset = i * (oc * oh * ow) + j * (oh * ow) + k * ow + l; + if (conv_output_data[offset] != output_data[offset]) { + printf(" [ni=%d][oci=%d][ohi=%d][owi=%d] output %d(tiu) != " + "%d(ref)\n", + i, j, k, l, conv_output_data[offset], output_data[offset]); + ret = -1; + break; + } + } + } + } + } + + if (ret) { + //dump_test_param(p_param, /*dump_content=*/true); + dump2_test_param(p_param); + } + + // Reverse order + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_cal_data); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_filter); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input); + + free(conv_output_data); + + free(input_data); + free(kernel_data); + free(output_data); + free(bias_data); + free(multiplier_data); + free(shift_data); + free(chl_quan_data); + +#ifdef ENABLE_DEBUG_MSG + printf(" <= run_compare_conv\n"); +#endif + + return ret; +} + + + + +int run2_compare_conv(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + + if (rt_handle == NULL || cvk_ctx == NULL) { + return -1; + } + + conv_test_param_t *p_param = &keepFailParam; + + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int oh = p_param->output_h; + int ow = p_param->output_w; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = 
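+  /* run2_compare_conv replays the shapes and flags captured into
+   * keepFailParam by keep_fail_param(), so a failing random case can be
+   * re-executed in isolation (fresh random data, same geometry). */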
p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_last_h = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_last_w = p_param->ins_w_last; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int has_bias = p_param->has_bias; + int relu_enable = p_param->relu_enable; + + int input_size = in * ic * iw * ih; + int8_t *input_data = (int8_t *)malloc(input_size); + + int kernel_size = oc * ic * kh * kw; + int8_t *kernel_data = (int8_t *)malloc(kernel_size); + + int output_size = in * oc * oh * ow; + int8_t *output_data = (int8_t *)malloc(output_size); + if (!input_data || !kernel_data || !output_data) { + free(input_data); + free(kernel_data); + free(output_data); + return -1; + } + + memset(output_data, 0, output_size); + + int32_t *bias_data = (int32_t *) malloc(sizeof(int32_t) * oc); + uint32_t *multiplier_data = (uint32_t *) malloc(sizeof(uint32_t) * oc); + int8_t *shift_data = (int8_t *)malloc(oc); + + //p_param->input_data = input_data; + //p_param->filter_data = kernel_data; + //p_param->output_data = output_data; + //p_param->has_bias = has_bias; + //p_param->bias_data = bias_data; + //p_param->multiplier_data = multiplier_data; + //p_param->shift_data = shift_data; + +#ifdef ENABLE_DEBUG_MSG + printf(" run_compare_conv =>\n"); + printf(" input (n=%d, ic=%d, h=%d, w=%d), kernel (oc=%d, ic=%d, h=%d, " + "w=%d), output (, c=%d, h=%d, w=%d), has_bias %d\n", + in, ic, ih, iw, oc, ic, kh, kw, oc, oh, ow, has_bias); +#endif + + int retry_cnt = p_param->retry_cnt; + do { + fill_random_data_s8(input_data, input_size); + fill_random_data_s8(kernel_data, kernel_size); + if (has_bias) { + fill_random_data_s32(bias_data, oc); + } + + p_param->float_multiplier = 100.0; // should be < 1.0 + calc_conv_float_multiplier(p_param); + + if (p_param->float_multiplier > 0.f && p_param->float_multiplier < 1.0) { + break; + } + + } while (--retry_cnt); + + if (p_param->float_multiplier >= 1.0) { + printf(" run_compare_dw_conv: unable to find valid multiplier\n"); + free(input_data); + free(kernel_data); + free(output_data); + free(bias_data); + free(multiplier_data); + free(shift_data); + + return -1; + } + + uint32_t base_multiplier = 0; + int base_shift = 0; + QuantizeMultiplierSmallerThanOne(p_param->float_multiplier, &base_multiplier, + &base_shift); + + for (int i = 0; i < oc; ++i) { + // multipliers typically range in [2^30 ; 2^31 - 1]. + // Values in [0, 2^30 - 1] are normally unused, but harmless. + // Thus a good way to randomize multipliers is to subtract from them + // a random value smaller than 2^30 but still significant compared to it. + p_param->multiplier_data[i] = base_multiplier - (rand() % (1 << 26)); + + // Our H/W only supports right shift + int right_shift = base_shift - 1 + (rand() % 4); + p_param->shift_data[i] = right_shift > 0 ? right_shift : 0; + +#ifdef ENABLE_DEBUG_MSG + printf(" [oc=%d] multiplier_data %d, shift_data %d\n", i, + p_param->multiplier_data[i], p_param->shift_data[i]); +#endif + } + + conv_per_channel_ref(p_param); + + const int chl_quan_per_lane_data_size = + p_param->has_bias ? 
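+      /* Per-lane packed quantization blob, per output channel:
+       *   with bias:    bias(int32) + multiplier(uint32) + shift(int8) = 9 B
+       *   without bias: multiplier(uint32) + shift(int8)              = 5 B
+       * (field order as packed by pack_chl_quan_param(), stated here as
+       * an assumption). */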
9 : 5; // bias(4) + multiplier(4) + shift(1) + const int cal_data_size = oc * chl_quan_per_lane_data_size; + uint8_t *chl_quan_data = (uint8_t *) malloc(cal_data_size); + pack_chl_quan_param(oc, p_param->has_bias, p_param->bias_data, + p_param->multiplier_data, p_param->shift_data, + chl_quan_data); + + cvk_tl_shape_t input_shape = tl_shape_t4(in, ic, ih, iw); + cvk_tl_shape_t filter_shape = tl_shape_t4(1, oc, kh * kw, ic); + cvk_tl_shape_t output_shape = tl_shape_t4(in, oc, oh, ow); + cvk_tl_shape_t cal_shape = tl_shape_t4(1, oc, 1, chl_quan_per_lane_data_size); + + cvk_tl_t *tl_input = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, input_shape, CVK_FMT_I8, /*eu_aign=*/1); + + cvk_tl_t *tl_filter = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, filter_shape, CVK_FMT_I8, /*eu_align=*/0); + + cvk_tl_t *tl_output = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, output_shape, CVK_FMT_I8, /*eu_align=*/1); + + // Shape for TDMA load + cvk_tl_t *tl_cal_data = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, cal_shape, CVK_FMT_U8, /*eu_align*/ 0); + + if (!tl_input || !tl_filter || !tl_output || !tl_cal_data) { + if (tl_input == NULL) { + printf(" fail to alloc tl_input (%d, %d, %d, %d)\n", input_shape.n, + input_shape.c, input_shape.h, input_shape.w); + } + if (tl_filter == NULL) { + printf(" fail to alloc tl_filter (%d, %d, %d, %d)\n", filter_shape.n, + filter_shape.c, filter_shape.h, filter_shape.w); + } + if (tl_output == NULL) { + printf(" fail to alloc tl_output (%d, %d, %d, %d)\n", output_shape.n, + output_shape.c, output_shape.h, output_shape.w); + } + if (tl_cal_data == NULL) { + printf(" fail to alloc tl_cal_data (%d, %d ,%d, %d)\n", cal_shape.n, + cal_shape.c, cal_shape.h, cal_shape.w); + } + + // Reverse order + if (tl_cal_data) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_cal_data); + if (tl_output) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output); + if (tl_filter) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_filter); + if (tl_input) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input); + + return -1; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_cal_data, chl_quan_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_input, (uint8_t *)input_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_filter, (uint8_t *)kernel_data); + + { + // Reshape per channel quantization data for TIU + tl_cal_data->shape = tl_shape_t4(1, oc, 1, 1); + tl_cal_data->stride = cvk_ctx->ops->tl_default_stride( + cvk_ctx, tl_cal_data->shape, CVK_FMT_I8, /*eu_align=*/0); + + // Reshape weight for TIU + tl_filter->shape = tl_shape_t4(ic, oc, kh, kw); + + cvk_tiu_convolution_param_t param; + memset(¶m, 0, sizeof(param)); + param.ofmap = tl_output; + param.ifmap = tl_input; + param.weight = tl_filter; + param.chl_quan_param = tl_cal_data; + param.ins_h = ins_h; + param.ins_last_h = ins_last_h; + param.ins_w = ins_w; + param.ins_last_w = ins_last_w; + param.stride_h = stride_h; + param.stride_w = stride_w; + param.dilation_h = dh; + param.dilation_w = dw; + param.pad_top = pad_top; + param.pad_bottom = pad_bot; + param.pad_left = pad_left; + param.pad_right = pad_right; + param.has_bias = has_bias; + param.relu_enable = relu_enable; + +#ifdef ENABLE_DEBUG_MSG + printf(" tiu_conv_qdm:\n"); + printf(" ifmap shape (%d, %d, %d, %d)\n", param.ifmap->shape.n, + param.ifmap->shape.c, param.ifmap->shape.h, param.ifmap->shape.w); + printf(" weight shape (%d, %d, %d, %d)\n", param.weight->shape.n, + param.weight->shape.c, param.weight->shape.h, param.weight->shape.w); + printf(" ofmap shape (%d, %d, %d, %d)\n", param.ofmap->shape.n, + 
param.ofmap->shape.c, param.ofmap->shape.h, param.ofmap->shape.w); +#endif + + cvk_ctx->ops->tiu_convolution(cvk_ctx, ¶m); + } + + CVI_RT_Submit(cvk_ctx); + +#ifdef ENABLE_DEBUG_MSG + printf(" compare result:\n"); +#endif + int8_t *conv_output_data = + (int8_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_output); + for (int i = 0; i < in; ++i) { + for (int j = 0; j < oc; ++j) { + for (int k = 0; k < oh; ++k) { + for (int l = 0; l < ow; ++l) { + int offset = i * (oc * oh * ow) + j * (oh * ow) + k * ow + l; + if (conv_output_data[offset] != output_data[offset]) { + printf(" [ni=%d][oci=%d][ohi=%d][owi=%d] output %d(tiu) != " + "%d(ref)\n", + i, j, k, l, conv_output_data[offset], output_data[offset]); + ret = -1; + break; + } + } + } + } + } + + if (ret) { + //dump_test_param(p_param, /*dump_content=*/true); + dump2_test_param(p_param); + } + + // Reverse order + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_cal_data); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_filter); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input); + + free(conv_output_data); + + free(input_data); + free(kernel_data); + free(output_data); + free(bias_data); + free(multiplier_data); + free(shift_data); + free(chl_quan_data); + +#ifdef ENABLE_DEBUG_MSG + printf(" <= run_compare_conv\n"); +#endif + + return ret; +} + +int random_test(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + +#ifndef ENABLE_FULL_REGRESSION + // TV_GEN pattern + // Random Test, total 19683, skipped 118066, executed 32, failed 0, ret 0 + + // n: 12b, c: 12b, h: 12b(4095-32), wb: 12b(4095-32) + int batch_range[] = {1, 1, 32}; + int input_height_range[] = {1, 512, 4095 - 32}; + int input_width_range[] = {1, 512, 4095 - 32}; + int input_depth_range[] = {1, 16, 32, 64, 102, 4095}; + int output_depth_range[] = {1, 16, 32, 64, 1024, 4095}; + + // h: 12b, w: 12b + // stride_h: 4b, strid_w: 4b + int kernel_height_range[] = {1, 11, 2048, 4095}; + int kernel_width_range[] = {1, 11, 2048, 4095}; + int kernel_stride_height_range[] = {1, 5, 16, 31}; + int kernel_stride_width_range[] = {1, 5, 16, 31}; +#else + // n: 12b, c: 12b, h: 12b(4095-32), wb: 12b(4095-32) + int batch_range[] = {1, 2, 4, 8, 16, 32, 64, 4095 - 32}; + int input_height_range[] = {1, 3, 11, 128, 512, 1024, 2048, 4095 - 32}; + int input_width_range[] = {1, 3, 11, 128, 512, 1024, 2048, 4095 - 32}; + int input_depth_range[] = {1, 3, 11, 32, 64, 1024, 2048, 4095}; + int output_depth_range[] = {1, 3, 11, 32, 64, 1024, 2048, 4095}; + + // h: 12b, w: 12b + // stride_h: 4b, strid_w: 4b + int kernel_height_range[] = {1, 3, 11, 511, 4095}; + int kernel_width_range[] = {1, 3, 11, 511, 4095}; + int kernel_stride_height_range[] = {1, 3, 5, 7, 15, 16, 31}; + int kernel_stride_width_range[] = {1, 3, 5, 7, 15, 16, 31}; +#endif /* ENABLE_FULL_REGRESSION */ + + const int batch_range_size = sizeof(batch_range) / sizeof(batch_range[0]); + const int input_height_range_size = + sizeof(input_height_range) / sizeof(input_height_range[0]); + const int input_width_range_size = + sizeof(input_width_range) / sizeof(input_width_range[0]); + const int input_depth_range_size = + sizeof(input_depth_range) / sizeof(input_depth_range[0]); + const int output_depth_range_size = + sizeof(output_depth_range) / sizeof(output_depth_range[0]); + + const int kernel_height_range_size = + sizeof(kernel_height_range) / sizeof(kernel_height_range[0]); + const int kernel_width_range_size = + sizeof(kernel_width_range) / sizeof(kernel_width_range[0]); + const int 
kernel_stride_height_range_size = + sizeof(kernel_stride_height_range) / + sizeof(kernel_stride_height_range[0]); + const int kernel_stride_width_range_size = + sizeof(kernel_stride_width_range) / sizeof(kernel_stride_width_range[0]); + + int random_seed = clock(); + srand(random_seed); + + const int retry_test_count = 100; + + bool stop_at_first_error = true; + + int total_tests = batch_range_size * input_depth_range_size * + input_height_range_size * input_width_range_size * + output_depth_range_size * kernel_height_range_size * + kernel_width_range_size * kernel_stride_height_range_size * + kernel_stride_width_range_size; + int skipped_tests = 0; + int executed_tests = 0; + int failed_tests = 0; + int current_test = 0; + + printf("Random Test =>\n"); + for (int m = 0; m < retry_test_count; ++m) { + for (int i = 0; i < batch_range_size; ++i) { + // random choosed from [range[i] : range[i+1]] + int batch = choose_from_range(batch_range, batch_range_size, i); + + for (int j = 0; j < input_height_range_size; ++j) { + int input_height = + choose_from_range(input_height_range, input_height_range_size, j); + + for (int k = 0; k < input_width_range_size; ++k) { + int input_width = + choose_from_range(input_width_range, input_width_range_size, k); + + for (int l = 0; l < input_depth_range_size; ++l) { + int input_depth = + choose_from_range(input_depth_range, input_depth_range_size, k); + + for (int m = 0; m < kernel_height_range_size; ++m) { + int kernel_height = choose_from_range( + kernel_height_range, kernel_height_range_size, m); + + for (int n = 0; n < kernel_width_range_size; ++n) { + int kernel_width = choose_from_range( + kernel_width_range, kernel_width_range_size, n); + + for (int x = 0; x < kernel_stride_height_range_size; ++x) { + int kernel_stride_height = + choose_from_range(kernel_stride_height_range, + kernel_stride_height_range_size, x); + + for (int y = 0; y < kernel_stride_width_range_size; ++y) { + int kernel_stride_width = + choose_from_range(kernel_stride_width_range, + kernel_stride_width_range_size, y); + + for (int z = 0; z < output_depth_range_size; ++z) { + int output_depth = choose_from_range( + output_depth_range, output_depth_range_size, y); + + current_test++; + + int has_bias = rand() % 2; + int dh = 1; + int dw = 1; + int ins_h = 0; + int ins_h_last = 0; + int ins_w = 0; + int ins_w_last = 0; + int pad_top = 0; + int pad_bot = 0; + int pad_left = 0; + int pad_right = 0; + + int ih_ext = calc_dilute_hw(input_height, ins_h, + ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw( + input_width, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = + calc_dilute_hw(kernel_height, dh - 1, 0, 0, 0); + int kw_ext = + calc_dilute_hw(kernel_width, dw - 1, 0, 0, 0); + + int oh = + calc_output_hw(ih_ext, kh_ext, kernel_stride_height); + int ow = + calc_output_hw(iw_ext, kw_ext, kernel_stride_width); + + conv_test_param_t test_param; + memset(&test_param, 0, sizeof(test_param)); + test_param.input_n = batch; + test_param.input_c = input_depth; + test_param.input_h = input_height; + test_param.input_w = input_width; + test_param.kh = kernel_height; + test_param.kw = kernel_width; + test_param.dh = dh; + test_param.dw = dw; + test_param.pad_top = pad_top; + test_param.pad_bot = pad_bot; + test_param.pad_left = pad_left; + test_param.pad_right = pad_right; + test_param.ins_h = ins_h; + test_param.ins_h_last = ins_h_last; + test_param.ins_w = ins_w; + test_param.ins_w_last = ins_w_last; + test_param.stride_h = kernel_stride_height; + test_param.stride_w = 
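+                    /* With all pads/insertions zero and dh = dw = 1 in
+                     * this sweep, the extended sizes reduce to the plain
+                     * ones, so oh = (ih - kh) / stride_h + 1 and
+                     * ow = (iw - kw) / stride_w + 1. */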
kernel_stride_width; + test_param.output_c = output_depth; + test_param.output_h = oh; + test_param.output_w = ow; + test_param.has_bias = has_bias; + test_param.retry_cnt = 5; + + bool is_valid_param = + check_valid_test_param(cvk_ctx, &test_param); + if (is_valid_param == false) { + skipped_tests++; + continue; + } + + int ret2 = run_compare_conv(rt_handle, cvk_ctx, &test_param); + failed_tests = ret2 ? failed_tests + 1 : failed_tests; + ret |= ret2; + executed_tests++; + +#ifdef ENABLE_DEBUG_MSG + printf( + " [%d/%d] random test: input shape(%d, %d, %d, %d)", + current_test, total_tests, batch, input_depth, + input_height, input_width); + printf(", kernel shape (%d, %d, %d, %d), result %d\n", + output_depth, input_depth, kernel_height, + kernel_width, ret2); +#endif + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + + if (executed_tests >= MIN_EXEC_TESTS) { + break; + } + } + + printf( + "<= Random Test, total %d, skipped %d, executed %d, failed %d, ret %d\n", + total_tests, skipped_tests, executed_tests, failed_tests, ret); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= simple_test(rt_handle, cvk_ctx); + ret |= random_test(rt_handle, cvk_ctx); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_conv_wtiling.c b/cviruntime/test/180x/test_180x_conv_wtiling.c new file mode 100644 index 000000000..5e31a5673 --- /dev/null +++ b/cviruntime/test/180x/test_180x_conv_wtiling.c @@ -0,0 +1,917 @@ +#include +#include +#include +#include + +#include "test_cvikernel_util.h" +#include "test_native_ref.h" + +typedef struct { + int random_seed; + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int stride_w; + int output_c; + int using_bias; + int bReLU_EN; + int r_shift_m; + int opd0_sign; + int opd1_sign; + int opd2_sign; +} conv_param_t; + +typedef struct { + uint32_t n; + uint32_t c; + uint32_t h; + uint32_t w; +}slice_t; + +static int index_get(int h, int w1, int w2) +{ + return h * w1 + w2; +} + +static int matrix_dot_mult( + int8_t *A, int8_t *B, int dim_n, int dim_m, + int opd0_sign) +{ + int sum = 0; + for (int i=0; iinput_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = 
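+  /* Reference model: the input is padded/diluted to ih_ext x iw_ext, the
+   * kernel dilated to kh_ext = (kh - 1) * dh + 1 (likewise for kw_ext),
+   * and the output is oh = (ih_ext - kh_ext) / stride_h + 1 rows by the
+   * analogous ow. Accumulation is int32; bias and optional ReLU are
+   * applied before the saturating right shift down to int8 in
+   * satu_2_8bit(). */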
p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + int r_shift_bits = p_param->r_shift_m; + int do_relu = p_param->bReLU_EN; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + int8_t *i_fmap_pad_ker = (int8_t *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return -1; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = 0; + + int8_t *i_fmap_pad = NULL; + int8_t *kernel_after = NULL; + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c]; //bias+c ; + } + } + } + + if (do_relu) + relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + + // ofmap is int8_t, signed + ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow, + &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1, + /*sign_unsign=*/1); + + if (ret) + goto error_release; + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + +error_release: + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + + +static uint8_t * transform_weight(const cvk_tl_shape_t *s, uint8_t before[]) +{ + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint32_t size = ic * oc * kh * kw; + uint8_t *after = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!after) + return NULL; + + /* + * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) + */ + for (uint32_t oci = 0; oci < oc; oci++) { + for (uint32_t ici = 0; ici < ic; ici++) { + for (uint32_t khi = 0; khi < kh; khi++) { + for (uint32_t kwi = 0; kwi < kw; kwi++) { + uint32_t src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + uint32_t dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + CVI_RT_HANDLE 
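+/* Loads (oc, ic, kh, kw) weights into local memory in the TIU layout
+ * (1, oc, kh * kw, ic) produced by transform_weight(). Worked example for
+ * a 2x2x2x2 kernel: element (oci=1, ici=0, khi=1, kwi=0) moves from flat
+ * source index 1*8 + 0*4 + 1*2 + 0 = 10 to destination index
+ * 1*8 + 1*4 + 0*2 + 0 = 12. */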
rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + uint8_t *data) +{ +#if 0 + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + uint8_t *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. bm_memcpy_s2d regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. + */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //bmmem_device_t ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = bm_memcpy_s2d(*ctx, ab_dev_mem, transformed_data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, transformed_data); + assert(ret == BM_SUCCESS); + + cvk_tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_I8; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1822_tensor_tgmem_default_stride(tdma_tg.shape, tdma_tg.fmt); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1822_tensor_lmem_default_stride(cvk_ctx, tdma_shape, FMT_I8, 0); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1822_tdma_g2l_tensor_copy(cvk_ctx, &p); + test_submit(ctx); + + + bmmem_device_free(*ctx, dev_mem); + + free(transformed_data); +#endif + + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint8_t *transformed_data = transform_weight(s, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(1, oc, kh * kw, ic); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, transformed_data); + + free(transformed_data); +} + +static int8_t * transform_bias(int oc, int16_t before[]) +{ + int8_t *after = (int8_t *)malloc(2 * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = before[i] & 0xff; + after[i + oc] = (before[i] >> 8) & 0xff; + } + return after; +} + +static void put_conv_bias( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + int16_t *data) +{ +#if 0 + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2, oc, 1, 1); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + int8_t *transformed_data = transform_bias(oc, data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (uint8_t *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1822_tensor_tgmem_default_stride(tg.shape, tg.fmt); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_tensor_copy(cvk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, dev_mem); + delete[] transformed_data; +#endif + + int oc = tl->shape.c; + int8_t *transformed_data = transform_bias(oc, 
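+      /* transform_bias() splits each int16 bias into two int8 planes,
+       * low bytes then high bytes, matching the (2, oc, 1, 1) bias
+       * tensor layout loaded just below. */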
data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(2, oc, 1, 1); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, (uint8_t *)transformed_data); + + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static int8_t * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + + int8_t *buf = (int8_t *)malloc(sizeof(int8_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static int8_t * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + int8_t *buf = (int8_t *)malloc(sizeof(int8_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static int16_t * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + int16_t *bias = (int16_t *)malloc(sizeof(int16_t) * oc); + if (!bias) + return NULL; + + for (int i = 0; i < oc; i++) + bias[i] = rand() % 65536 - 32768; + + return bias; +} + +static cvk_tl_t * conv_ifmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd0_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 1); +} + +static cvk_tl_t * conv_weight_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd1_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static cvk_tl_t * conv_ofmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_tl_shape_t s; + s.n = p->input_n * 4; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + cvk_tl_t *tl = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, CVK_FMT_I8, 1); + tl->shape.n = p->input_n; + return tl; +} + +static cvk_tl_t * conv_bias_tensor( + cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd2_sign? 
CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const cvk_tiu_pt_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + dst->ifmap = conv_ifmap_tensor(cvk_ctx, p); + dst->weight = conv_weight_tensor(cvk_ctx, p); + dst->ofmap = conv_ofmap_tensor(cvk_ctx, p); + dst->bias = NULL; + dst->ps32_mode = 0; + if (p->using_bias) + dst->bias = conv_bias_tensor(cvk_ctx, p); + + dst->w_is_const = 0; +} + +static void free_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *r, + const conv_param_t *p) +{ + if (p->using_bias && r->bias) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->bias); + if (r->ofmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ofmap); + if (r->weight) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->weight); + if (r->ifmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ifmap); +} + +static void init_conv_param(conv_param_t *p) +{ + printf("init_conv_param\n"); + p->random_seed = clock(); + srand(p->random_seed); + +retry: + p->input_n = 1; + p->input_c = 1; + p->kh = 3; + p->kw = 3; + p->input_h = 4 + p->kh; + p->input_w = 4 + p->kw ; + p->output_c = 1; + p->stride_h = 1; + p->stride_w = 1; + p->ins_h = 0; + p->ins_w = 0; + p->ins_h_last = 0; + p->ins_w_last = 0; + p->dh = 1; + p->dw = 1; + + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + p->pad_top = 0; + p->pad_bot = 0; + p->pad_left = 0; + p->pad_right = 0; + + if (!conv_param_is_ok(p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p->using_bias = 1; + p->r_shift_m = rand() % 8; + p->bReLU_EN = rand() % 2; + p->opd0_sign = rand() % 2; + + p->opd1_sign = 1; + p->opd2_sign = 1; + + assert(p->opd1_sign == 1 && p->opd2_sign == 1); + + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); +} + +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", "p->input_w = ", p->input_w); + printf(" %s%d;\n", 
"p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} + +/* Calculate the right shift value, m + * Steps: + * 1. Get the abs() of each weight; + * 2. Summary all the abs() in one kernel; + * 3. Get Log2 of each sum; + * 4. Downward rounding; + * After every r_shift value got, sort and find the middle one. + */ + +static int calc_rshift_m(const conv_param_t *p, int8_t* weight) +{ + int kernel_cnt = p->output_c * p->input_c; + int kernel_size = p->kh * p->kw; + int *kernel_shifts = (int *)malloc(sizeof(int) * kernel_cnt); + if (!kernel_shifts) + return 1; + + memset(kernel_shifts, 0, sizeof(int) * kernel_cnt); + + // Part 1: + // Get right shift value for each kernel + int sum = 0; + for (int i = 0; i < kernel_cnt; i++) { + // Step 1 & 2: Get the sum of abs() + for (int j = 0; j < kernel_size; j++) { + sum += (int)(*weight < 0 ? 
-(*weight) : (*weight)); + weight++; + } + // Step 3 & 4: log2 and downward rounding + sum >>= 1; + while (sum) { + sum >>= 1; + kernel_shifts[i]++; + } + } + + // Part 2: + // Find the middle of all the values + int tag[32] = {0}; + for (int cnt = 0; cnt < kernel_cnt; cnt++) { + tag[kernel_shifts[cnt]]++; + } + + int rshift_m = 0; + int mid = 0; + do { + mid += tag[rshift_m++]; + } while(mid < (kernel_cnt - 1) >> 1); + + free(kernel_shifts); + + return rshift_m - 1; +} + +static int test_w_tiling_conv( + conv_param_t *p_param, CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + printf("test w tiled conv\n"); + int ret = 0; + int8_t *input = alloc_input(p_param); + int8_t *weight = alloc_weight(p_param); + int16_t *bias = alloc_bias(p_param); + int8_t *output_ref = (int8_t *)malloc(sizeof(int8_t) * conv_output_size(p_param)); + if (!input || !weight || !bias || !output_ref) { + ret = -1; + goto fail_exit; + } + + p_param->r_shift_m = calc_rshift_m(p_param, weight); + ret = conv_ref(p_param, input, weight, bias, output_ref); + if (ret) + goto fail_exit; + + cvk_tiu_pt_convolution_param_t conv_param; + make_bmk_conv_param(cvk_ctx, &conv_param, p_param); + + /*We tile the finest granule to test w tiling*/ + uint32_t ow_step = 1; + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param); + if (tl_alloc_success) { + put_conv_weight(rt_handle, cvk_ctx, conv_param.weight, (uint8_t *)weight); + if (p_param->using_bias) { + put_conv_bias(rt_handle, cvk_ctx, conv_param.bias, bias); + } + + cvk_tl_t tl_ifmap = *conv_param.ifmap; + cvk_tl_t tl_ofmap = *conv_param.ofmap; + + cvk_tg_shape_t s; + s.n = tl_ifmap.shape.n; + s.c = tl_ifmap.shape.c; + s.h = tl_ifmap.shape.h; + s.w = tl_ifmap.shape.w; + cvk_tg_t *ts_ifmap = alloc_tensor_dev_mem(rt_handle, cvk_ctx, s, CVK_FMT_I8); + tensor_copy_s2d(rt_handle, ts_ifmap, (uint8_t *)input); + + s.n = tl_ofmap.shape.n; + s.c = tl_ofmap.shape.c; + s.h = tl_ofmap.shape.h; + s.w = tl_ofmap.shape.w; + cvk_tg_t *ts_ofmap = alloc_tensor_dev_mem(rt_handle, cvk_ctx, s, CVK_FMT_I8); + + for (uint32_t ow_pos = 0; ow_pos < tl_ofmap.shape.w; ow_pos += ow_step) { + uint32_t cur_ow = math_min(tl_ofmap.shape.w - ow_pos, ow_step); + + cvk_tg_t ts_cur_ofmap; + ts_cur_ofmap.shape.n = ts_ofmap->shape.n; + ts_cur_ofmap.shape.c = ts_ofmap->shape.c; + ts_cur_ofmap.shape.h = ts_ofmap->shape.h; + ts_cur_ofmap.shape.w = cur_ow; + ts_cur_ofmap.stride = ts_ofmap->stride; + ts_cur_ofmap.start_address = ts_ofmap->start_address + ow_pos; + ts_cur_ofmap.fmt = ts_ofmap->fmt; + ts_cur_ofmap.base_reg_index = ts_ofmap->base_reg_index; + + cvk_tl_t tl_cur_ofmap; + tl_cur_ofmap.shape.n = tl_ofmap.shape.n; + tl_cur_ofmap.shape.c = tl_ofmap.shape.c; + tl_cur_ofmap.shape.h = tl_ofmap.shape.h; + tl_cur_ofmap.shape.w = cur_ow; + tl_cur_ofmap.stride = cvk_ctx->ops->tl_default_stride(cvk_ctx, tl_cur_ofmap.shape, CVK_FMT_I8, 1); + tl_cur_ofmap.fmt = tl_ofmap.fmt; + tl_cur_ofmap.start_address = tl_ofmap.start_address; + + cvk_tg_t ts_cur_ifmap; + ts_cur_ifmap.shape.n = ts_ifmap->shape.n; + ts_cur_ifmap.shape.c = ts_ifmap->shape.c; + ts_cur_ifmap.shape.h = ts_ifmap->shape.h; + ts_cur_ifmap.shape.w = (cur_ow - 1) * conv_param.stride_w + conv_kw_ext(p_param); + ts_cur_ifmap.stride = ts_ifmap->stride; + ts_cur_ifmap.start_address = ts_ifmap->start_address + ow_pos; + ts_cur_ifmap.fmt = ts_ifmap->fmt; + ts_cur_ifmap.base_reg_index = ts_ifmap->base_reg_index; + + cvk_tl_t tl_cur_ifmap; + tl_cur_ifmap.shape.n = tl_ifmap.shape.n; + tl_cur_ifmap.shape.c = tl_ifmap.shape.c; + tl_cur_ifmap.shape.h = 
tl_ifmap.shape.h; + tl_cur_ifmap.shape.w = (cur_ow - 1) * conv_param.stride_w + conv_kw_ext(p_param); + tl_cur_ifmap.stride = cvk_ctx->ops->tl_default_stride(cvk_ctx, tl_cur_ifmap.shape, CVK_FMT_I8, 1); + tl_cur_ifmap.fmt = tl_ifmap.fmt; + tl_cur_ifmap.start_address = tl_ifmap.start_address; + + { + cvk_tdma_g2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &ts_cur_ifmap; + p.dst = &tl_cur_ifmap; + cvk_ctx->ops->tdma_g2l_tensor_copy(cvk_ctx, &p); + CVI_RT_Submit(cvk_ctx); + } + { + cvk_tiu_pt_convolution_param_t p; + memset(&p, 0, sizeof(p)); + p = conv_param; + p.ifmap = &tl_cur_ifmap; + p.ofmap = &tl_cur_ofmap; + if(p_param->ins_w_last == 1 && (ow_pos + ow_step) >= tl_ofmap.shape.w) + p.ins_last_w = 1; + else + p.ins_last_w = 0; + + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &p); + } + { + cvk_tdma_l2g_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tl_cur_ofmap; + p.dst = &ts_cur_ofmap; + cvk_ctx->ops->tdma_l2g_tensor_copy(cvk_ctx, &p); + CVI_RT_Submit(cvk_ctx); + } + } + uint8_t *output = tensor_copy_d2s(rt_handle, ts_ofmap); + free_tensor_dev_mem(rt_handle, ts_ifmap); + free_tensor_dev_mem(rt_handle, ts_ofmap); + + ret = array_cmp_int8( + "Comparing results ...\n", + output_ref, (int8_t *)output, conv_output_size(p_param)); + + if (ret) { + print_conv_param(p_param); + printf("Comparison FAILED\n"); + } + free(output); + } + + free_bmk_conv_param(cvk_ctx, &conv_param, p_param); + +fail_exit: + free(input); + free(weight); + free(bias); + free(output_ref); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + for (int i = 0; i < 1; i++) { + printf("random_test_conv iteration: %d\n", i); + conv_param_t test_conv_param; + init_conv_param(&test_conv_param); + ret |= test_w_tiling_conv(&test_conv_param, rt_handle, cvk_ctx); + if (ret) + return ret; + + if (!test_conv_param.using_bias) + test_conv_param.using_bias = 1; + if (test_conv_param.output_c <= 9) + test_conv_param.output_c += 3; + ret |= test_w_tiling_conv(&test_conv_param, rt_handle, cvk_ctx); + if (ret) + return ret; + } + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_depthwise.c b/cviruntime/test/180x/test_180x_depthwise.c new file mode 100644 index 000000000..8cafea92c --- /dev/null +++ b/cviruntime/test/180x/test_180x_depthwise.c @@ -0,0 +1,362 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" +#include "test_native_ref.h" + +#define INVALIDE_STRIDE (-1) +typedef cvk_tiu_depthwise_pt_convolution_param_t param_t; + +static void print_pooling_param(param_t *p) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int kh = p->weight->shape.h; + int kw = p->weight->shape.w; + + printf(" Pooling parameters:\n"); + printf(" ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw); + printf(" opd0_sign = %d\n", p->ifmap->fmt == CVK_FMT_I8); + printf(" weight = (%d, %d)\n", kh, kw); + printf(" padding = (%d, %d, %d, %d)\n", + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right); + printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w); + printf(" ins0 = (%d, %d, %d, %d)\n", + p->ins_h, p->ins_last_h, 
p->ins_w, p->ins_last_w); + printf(" rshift_bits = %d\n", p->rshift_bits); + printf(" relu_enable = %d\n", p->relu_enable); + printf(" res0_sign = %d\n", p->ofmap->fmt == CVK_FMT_I8); +} + +static int8_t *alloc_input(param_t *p) +{ + uint64_t size = tl_shape_size(&p->ifmap->shape, p->ifmap->fmt); + int8_t *data = (int8_t *)malloc(size); + if (!data) + return NULL; + + for (uint64_t i = 0; i < size; i++) + data[i] = rand() % 256 - 128; + return data; +} + +static int8_t *alloc_weight(param_t *p) +{ + int size = tl_shape_size(&p->weight->shape, p->weight->fmt); + int8_t *data = (int8_t *)malloc(size); + if (!data) + return NULL; + + for (int i = 0; i < size; i++) + data[i] = rand() % 256 - 128; + return data; +} + +static int16_t *alloc_bias(param_t *p) +{ + int c = p->bias->shape.c; + int16_t *bias = (int16_t *)malloc(sizeof(int16_t) * c); + if (!bias) + return NULL; + + for (int i = 0; i < c; i++) + bias[i] = rand() % 65536 - 32768; + return bias; +} + +static int8_t *alloc_output(param_t *p) +{ + uint64_t size = tl_shape_size(&p->ofmap->shape, p->ofmap->fmt); + return (int8_t *)malloc(size); +} + +static inline void relu8(int8_t *buf, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) + if (buf[i] < 0) + buf[i] = 0; +} + + +static int compare_results( + param_t *p, + int8_t input[], + int8_t weight[], + int16_t bias[], + int8_t output[]) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int kh = p->weight->shape.h; + int kw = p->weight->shape.w; + int opd0_sign = (p->ifmap->fmt == CVK_FMT_I8); + int res0_sign = (p->ofmap->fmt == CVK_FMT_I8); + int8_t *output_ref = alloc_output(p); + int ret = native_pooling_ave_int8( + input, weight, p->bias ? bias : NULL, output_ref, + in, ic, ih, iw, kh, kw, + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right, + p->stride_h, p->stride_w, + p->ins_h, p->ins_w, p->ins_last_h, p->ins_last_w, + opd0_sign, res0_sign, p->rshift_bits, 0); + if (ret) + return ret; + + if(p->relu_enable ) + relu8(output_ref, tl_shape_size(&p->ofmap->shape, p->ofmap->fmt)); + + ret = array_cmp_int8( + "Comparing results ...\n", output_ref, output, + tl_shape_size(&p->ofmap->shape, p->ofmap->fmt)); + + if (ret) { + printf("Comparison FAILED!!!\n"); + print_pooling_param(p); + } + + free(output_ref); + + return ret; +} + +static int pooling_ih_ext(param_t *p, int ih) +{ + int ins = p->ins_h; + int ins_last = p->ins_last_h; + int pad = p->pad_top + p->pad_bottom; + return (ih - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_iw_ext(param_t *p, int iw) +{ + int ins = p->ins_w; + int ins_last = p->ins_last_w; + int pad = p->pad_left + p->pad_right; + return (iw - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_oh(param_t *p, int ih, int kh) +{ + int ih_ext = pooling_ih_ext(p, ih); + return (ih_ext - kh) / p->stride_h + 1; +} + +static int pooling_ow(param_t *p, int iw, int kw) +{ + int iw_ext = pooling_iw_ext(p, iw); + return (iw_ext - kw) / p->stride_w + 1; +} + +static void free_depthwise_param( + cvk_context_t *cvk_ctx, + param_t *p) +{ + if (p->bias) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->bias); + + if (p->weight) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->weight); + + if (p->ifmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->ifmap); + + if (p->ofmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->ofmap); +} + +static param_t random_depthwise_param(cvk_context_t *cvk_ctx, int stride_w, int stride_h) +{ + srand(clock()); + param_t p; + int retry_cnt = 100; + + for 
(int i = 0; i < retry_cnt; i++) {
+    int using_bias = rand() % 2;
+    int n = rand() % 5 + 1;
+    int c = rand() % (3 * cvk_ctx->info.npu_num) + 1;
+    int ih = rand() % 30 + 3 + (INVALIDE_STRIDE == stride_h ? 0 : stride_h);
+    int iw = rand() % 30 + 6 + (INVALIDE_STRIDE == stride_w ? 0 : stride_w);
+    int kh = rand() % 7 + 1;
+    int kw = rand() % 7 + 1;
+    int opd0_sign = rand() % 2;
+
+    memset(&p, 0, sizeof(p));
+    p.ins_h = rand() % kh;
+    p.ins_w = rand() % kw;
+    p.ins_last_h = rand() % kh;
+    p.ins_last_w = rand() % kw;
+    p.stride_h = INVALIDE_STRIDE == stride_h ? rand() % (kh) + 1 : stride_h;
+    p.stride_w = INVALIDE_STRIDE == stride_w ? rand() % (kw) + 1 : stride_w;
+    p.pad_top = rand() % kh;
+    p.pad_bottom = rand() % kh;
+    p.pad_left = rand() % kw;
+    p.pad_right = rand() % kw;
+    p.rshift_bits = rand() % 32;
+
+    int oh = pooling_oh(&p, ih, kh);
+    int ow = pooling_ow(&p, iw, kw);
+    cvk_tl_shape_t ofmap_shape;
+    ofmap_shape.n = n;
+    ofmap_shape.c = c;
+    ofmap_shape.h = oh;
+    ofmap_shape.w = ow;
+    cvk_tl_shape_t ifmap_shape;
+    ifmap_shape.n = n;
+    ifmap_shape.c = c;
+    ifmap_shape.h = ih;
+    ifmap_shape.w = iw;
+    cvk_tl_shape_t weight_shape;
+    weight_shape.n = 1;
+    weight_shape.c = c;
+    weight_shape.h = kh;
+    weight_shape.w = kw;
+    cvk_tl_shape_t bias_shape;
+    bias_shape.n = 2;
+    bias_shape.c = c;
+    bias_shape.h = 1;
+    bias_shape.w = 1;
+    p.relu_enable = rand() % 2;
+    /* The test-case reference does not support dilation != 1. */
+    p.dilation_h = 1;
+    p.dilation_w = 1;
+    cvk_fmt_t ifmt = opd0_sign ? CVK_FMT_I8 : CVK_FMT_U8;
+
+    p.ofmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ofmap_shape, CVK_FMT_I8, 1);
+    p.ifmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ifmap_shape, ifmt, 1);
+    p.weight = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, weight_shape, CVK_FMT_I8, 1);
+    p.bias = NULL;
+    if (using_bias)
+      p.bias = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, bias_shape, CVK_FMT_I8, 0);
+
+    if ((kh > pooling_ih_ext(&p, ih))
+        || (kw > pooling_iw_ext(&p, iw))
+        || (p.pad_top >= (1 << 4))
+        || (p.pad_bottom >= (1 << 4))
+        || (p.pad_left >= (1 << 4))
+        || (p.pad_right >= (1 << 4))
+        || !p.ofmap
+        || !p.ifmap
+        || !p.weight
+        || (using_bias && !p.bias)) {
+      printf("retry init_pooling_param\n");
+      free_depthwise_param(cvk_ctx, &p);
+    } else
+      break;
+  }
+
+  return p;
+}
+
+static void put_bias_tensor(
+    CVI_RT_HANDLE rt_handle,
+    cvk_context_t *cvk_ctx,
+    const cvk_tl_t *tl,
+    int16_t data[])
+{
+  int c = tl->shape.c;
+
+  uint8_t *lo_hi = (uint8_t *)malloc(2 * c);
+  if (!lo_hi)
+    return;
+
+  for (int i = 0; i < c; i++) {
+    lo_hi[i] = data[i] & 0xff;
+    lo_hi[i + c] = (data[i] >> 8) & 0xff;
+  }
+
+  tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl, (uint8_t *)lo_hi);
+
+  free(lo_hi);
+}
+
+static int _test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int stride_w, int stride_h)
+{
+  param_t param = random_depthwise_param(cvk_ctx, stride_w, stride_h);
+
+  int8_t *input = alloc_input(&param);
+  int8_t *weight = alloc_weight(&param);
+  int16_t *bias = NULL;
+  if (param.bias)
+    bias = alloc_bias(&param);
+
+  tensor_copy_s2d_g2l(rt_handle, cvk_ctx, param.ifmap, (uint8_t *)input);
+  tensor_copy_s2d_g2l(rt_handle, cvk_ctx, param.weight, (uint8_t *)weight);
+  if (param.bias)
+    put_bias_tensor(rt_handle, cvk_ctx, param.bias, bias);
+
+  cvk_ctx->ops->tiu_pt_depthwise_convolution(cvk_ctx, &param);
+  int8_t *output = (int8_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, param.ofmap);
+
+  int ret = compare_results(&param, input, weight, bias, output);
+
+  free_depthwise_param(cvk_ctx, &param);
+  free(input);
+  free(weight);
+  free(bias);
+  free(output);
+
+  return ret;
+}
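+
+/*
+ * Worked example for the shape helpers above (values are illustrative only):
+ * with ih = 5, ins_h = 1, ins_last_h = 0, pad_top = pad_bottom = 1, kh = 3
+ * and stride_h = 2,
+ *   ih_ext = (ih - 1) * (ins_h + 1) + ins_last_h + 1 + pad_top + pad_bottom
+ *          = 4 * 2 + 0 + 1 + 2 = 11
+ *   oh     = (ih_ext - kh) / stride_h + 1 = (11 - 3) / 2 + 1 = 5
+ * which is what pooling_ih_ext()/pooling_oh() compute for the random
+ * parameters generated in random_depthwise_param().
+ */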
+
+
+static int test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) {
+  return _test_pooling(rt_handle, cvk_ctx, INVALIDE_STRIDE, INVALIDE_STRIDE);
+}
+
+static int test_depthwise_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx)
+{
+  int ret = 0;
+  for (uint64_t i = 0; i < 16; i++)
+    ret |= test_pooling(rt_handle, cvk_ctx);
+
+  // test extended strides in (0, 31]
+  int stride_list[] = {15, 16, 31};
+  int stride_list_len = sizeof(stride_list) / sizeof(stride_list[0]);
+
+  for (int stride_w_idx = 0; stride_w_idx < stride_list_len; stride_w_idx++) {
+    for (int stride_h_idx = 0; stride_h_idx < stride_list_len; stride_h_idx++) {
+      int stride_w = stride_list[stride_w_idx];
+      int stride_h = stride_list[stride_h_idx];
+
+      ret |= _test_pooling(rt_handle, cvk_ctx, stride_w, stride_h);
+      if (ret)
+        break;
+    }
+  }
+
+  return ret;
+}
+
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  ret = test_depthwise_pooling(rt_handle, cvk_ctx);
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/180x/test_180x_depthwise_conv_qm.c b/cviruntime/test/180x/test_180x_depthwise_conv_qm.c
new file mode 100644
index 000000000..fcd48dcc4
--- /dev/null
+++ b/cviruntime/test/180x/test_180x_depthwise_conv_qm.c
@@ -0,0 +1,1515 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <time.h>
+#include <limits.h>
+#include <stdbool.h>
+
+#include "test_cvikernel_util.h"
+#include "test_tf_quant_util.h"
+#include "test_native_ref.h"
+
+// #define ENABLE_DEBUG_MSG
+// #define ENABLE_FULL_REGRESSION
+
+#define MIN_EXEC_TESTS 20
+
+typedef struct {
+  int input_n;
+  int input_c;
+  int input_h;
+  int input_w;
+  int kw;
+  int kh;
+  int dh;
+  int dw;
+  int pad_top;
+  int pad_bot;
+  int pad_left;
+  int pad_right;
+  int ins_h;
+  int ins_h_last;
+  int ins_w;
+  int ins_w_last;
+  int stride_h;
+  int stride_w;
+  int output_c;
+  int output_h;
+  int output_w;
+  int has_bias;
+  int relu_enable;
+  int8_t *input_data;
+  int8_t *filter_data;
+  int8_t *output_data;
+  int32_t *bias_data;
+  uint32_t *multiplier_data;
+  int8_t *shift_data;
+  float float_multiplier;
+  int retry_cnt;
+} dw_conv_test_param_t;
+
+static inline int Offset(cvk_tl_shape_t shape, int i0, int i1, int i2, int i3)
+{
+  // return n * (shape.c * shape.h * shape.w) + c * (shape.h * shape.w) + h *
+  // shape.w + w;
+  int dims_data[4] = {shape.n, shape.c, shape.h, shape.w};
+  return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3;
+}
+
+void fill_random_data_s8(int8_t *input_data, int size)
+{
+  for (int i = 0; i < size; ++i) {
+    int is_saturated = ((rand() % 1000) == 1) ? 1 : 0;
+    int is_sign = rand() % 2 ? 1 : -1;
+
+    if (is_saturated && is_sign == -1) {
+      input_data[i] = -128;
+    } else if (is_saturated) {
+      input_data[i] = 127;
+    } else {
+      input_data[i] = is_sign * rand() % 128;
+    }
+  }
+}
+
+void fill_random_data_s32(int32_t *input_data, int size)
+{
+  for (int i = 0; i < size; ++i) {
+    int is_saturated = ((rand() % 1000) == 1) ? 1 : 0;
+    int is_sign = rand() % 2 ? 1 : -1;
+
+    if (is_saturated && is_sign == -1) {
+      input_data[i] = INT_MIN;
+    } else if (is_saturated) {
+      input_data[i] = INT_MAX;
+    } else {
+      input_data[i] = is_sign * rand() % 128;
+    }
+  }
+}
+
+void convert_nhwc_to_nchw(cvk_tl_shape_t tl_shape, int8_t *src, int8_t *dst)
+{
+  // NHWC
+  uint32_t src_shape_n = tl_shape.n;
+  uint32_t src_shape_h = tl_shape.c;
+  uint32_t src_shape_w = tl_shape.h;
+  uint32_t src_shape_c = tl_shape.w;
+  uint32_t src_stride_c = 1;
+  uint32_t src_stride_w = src_shape_c * src_stride_c;
+  uint32_t src_stride_h = src_shape_w * src_stride_w;
+  uint32_t src_stride_n = src_shape_h * src_stride_h;
+
+  // NCHW
+  // uint32_t dst_shape_n = src_shape_n;
+  uint32_t dst_shape_c = src_shape_c;
+  uint32_t dst_shape_h = src_shape_h;
+  uint32_t dst_shape_w = src_shape_w;
+  uint32_t dst_stride_w = 1;
+  uint32_t dst_stride_h = dst_shape_w * dst_stride_w;
+  uint32_t dst_stride_c = dst_shape_h * dst_stride_h;
+  uint32_t dst_stride_n = dst_shape_c * dst_stride_c;
+
+  printf("convert_nhwc_to_nchw:\n");
+  printf(" src shape (%d, %d, %d, %d), stride (%d, %d, %d, %d)\n", src_shape_n,
+         src_shape_c, src_shape_h, src_shape_w, src_stride_n, src_stride_c,
+         src_stride_h, src_stride_w);
+  printf(" dst shape (%d, %d, %d, %d), stride (%d, %d, %d, %d)\n", src_shape_n,
+         dst_shape_c, dst_shape_h, dst_shape_w, dst_stride_n, dst_stride_c,
+         dst_stride_h, dst_stride_w);
+
+  for (uint32_t i = 0; i < src_shape_n; ++i) {
+    for (uint32_t j = 0; j < src_shape_h; ++j) {
+      for (uint32_t k = 0; k < src_shape_w; ++k) {
+        for (uint32_t l = 0; l < src_shape_c; ++l) {
+          uint32_t src_offset = i * src_stride_n + j * src_stride_h +
+                                k * src_stride_w + l * src_stride_c;
+          uint32_t dst_offset = i * dst_stride_n + j * dst_stride_h +
+                                k * dst_stride_w + l * dst_stride_c;
+          dst[dst_offset] = src[src_offset];
+        }
+      }
+    }
+  }
+}
+
+int test_nhwc_to_nchw()
+{
+  int ret = 0;
+
+  cvk_tl_shape_t shape = tl_shape_t4(2, 2, 2, 2);
+  int size = shape.n * shape.c * shape.h * shape.w;
+
+  int8_t src[2 * 2 * 2 * 2] = {1, 5, 2, 6, 3, 7, 4, 8,
+                               11, 15, 12, 16, 13, 17, 14, 18};
+
+  int8_t dst[2 * 2 * 2 * 2] = {0};
+  int8_t ref_dst[2 * 2 * 2 * 2] = {1, 2, 3, 4, 5, 6, 7, 8,
+                                   11, 12, 13, 14, 15, 16, 17, 18};
+
+  convert_nhwc_to_nchw(shape, src, dst);
+  for (int i = 0; i < size; ++i) {
+    if (dst[i] != ref_dst[i]) {
+      printf("Error ! 
dst[%d] %d != %d(expected)\n", i, dst[i], ref_dst[i]); + ret = -1; + } + } + + cvk_tl_shape_t input_shape = {/*n=*/1, /*h=*/5, /*w=*/6, /*c=*/8}; + int input_size = + input_shape.n * input_shape.c * input_shape.h * input_shape.w; + int8_t nhwc_input_data[240] = { + 103, 85, -96, 120, 105, -72, 33, -50, -104, 12, -57, -80, + 12, 126, 117, 127, 119, 119, -88, 57, 120, 123, 117, -100, + -4, 76, 76, -52, -92, -127, -21, -100, 106, 35, 74, 96, + 117, 0, 39, 76, -119, -36, 89, -74, 111, 46, 45, -26, + 65, 61, 62, -7, -28, -20, 39, -84, -85, -51, 52, 76, + -120, -47, -58, 95, -117, -90, -104, 126, 82, 82, 49, -96, + -47, 67, 115, -3, -120, 41, -16, -96, -31, -75, 67, -115, + 75, -119, -81, -24, -3, -11, -14, -4, 37, 75, 53, 107, + 65, 78, -58, 52, 46, -128, 39, 53, -87, 36, -98, -12, + -1, 70, 117, 18, -41, 96, 21, 78, -71, -124, 64, 82, + -63, 82, 1, 112, 50, -23, 100, -20, 117, 20, 12, -88, + -93, 67, -90, -70, -63, 79, 87, 125, -63, -43, 80, -52, + -66, -125, 109, -73, -39, 104, -78, 89, -64, 116, 29, 71, + -7, 124, -38, -111, 84, 75, 21, 24, 12, 59, 106, 49, + -55, 46, 65, -28, 64, 15, -31, -75, 17, 7, -109, -25, + -115, -38, 7, 23, 71, -37, 111, 119, -95, -89, 17, -27, + -8, -29, -125, 58, -42, -29, -87, 109, 75, -17, -49, 92, + 7, 30, -86, -98, 26, -8, -61, -41, 39, 7, 48, 55, + 63, 125, -13, 56, -107, 105, -70, 1, 105, 14, -89, 0, + 83, -10, 9, 11, 127, -14, -108, 90, -15, 26, -101, -1}; + int8_t input_data[240]; + convert_nhwc_to_nchw(input_shape, nhwc_input_data, input_data); + printf("NCHW input_data[%d] = {\n", input_size); + for (int i = 0; i < input_size; ++i) { + printf("%d, ", input_data[i]); + if (i && ((i % 16) == 0)) { + printf("\n"); + } + } + printf("};\n\n"); + + cvk_tl_shape_t filter_shape = {1, 3, 3, 8}; + int filter_size = + filter_shape.n * filter_shape.c * filter_shape.h * filter_shape.w; + int8_t nhwc_filter_data[72] = { + 103, 85, -96, 120, 105, -72, 33, -50, -104, 12, -57, -80, + 12, 126, 117, 127, 119, 119, -88, 57, 120, 123, 117, -100, + -4, 76, 76, -52, -92, -127, -21, -100, 106, 35, 74, 96, + 117, 0, 39, 76, -119, -36, 89, -74, 111, 46, 45, -26, + 65, 61, 62, -7, -28, -20, 39, -84, -85, -51, 52, 76, + -120, -47, -58, 95, -117, -90, -104, 126, 82, 82, 49, -96}; + int8_t filter_data[72]; + convert_nhwc_to_nchw(filter_shape, nhwc_filter_data, filter_data); + printf("NCHW filter_data[%d] = {\n", filter_size); + for (int i = 0; i < filter_size; ++i) { + printf("%d, ", filter_data[i]); + if (i && ((i % 16) == 0)) { + printf("\n"); + } + } + printf("}\n\n"); + + cvk_tl_shape_t output_shape = {1, 3, 4, 8}; + int output_size = + output_shape.n * output_shape.c * output_shape.h * output_shape.w; + int8_t nhwc_output_data[96] = { + 127, 127, 69, 34, 36, 127, 127, 127, -101, -65, 39, 13, + 26, 6, 127, -67, 60, 123, 31, 17, 3, -128, -58, -64, + -128, 26, -128, -21, 72, 55, 127, 94, -46, -128, -37, 1, + -6, 109, 98, -14, -11, 48, -128, -3, -50, 37, -20, 79, + -94, -36, 127, 19, 3, -18, -40, -115, 24, 124, -128, -1, + -52, -123, -54, -1, -62, 95, 127, 24, 10, -74, 127, -128, + -2, 111, 106, 4, 3, -128, 127, 127, -30, 98, -21, -1, + -11, -12, 58, -72, -128, 127, 30, 32, -85, -11, -35, 34}; + int8_t output_data[96] = {0}; + convert_nhwc_to_nchw(output_shape, nhwc_output_data, output_data); + printf("NCHW output_data[%d] = {\n", output_size); + for (int i = 0; i < output_size; ++i) { + printf("%d, ", output_data[i]); + if (i && ((i % 16) == 0)) { + printf("\n"); + } + } + printf("};\n\n"); + + return ret; +} + +int simple_nhwc_dw_conv_test(CVI_RT_HANDLE rt_handle, cvk_context_t 
*cvk_ctx) +{ + int ret = 0; + + const int stride_width = 1; + const int stride_height = 1; + const int dilation_width_factor = 1; + const int dilation_height_factor = 1; + const int pad_width = 0; + const int pad_height = 0; + const int depth_multiplier = 1; + const int input_offset = 0; // symmetric + const int output_offset = 0; // symmetric + const int output_activation_min = -128; + const int output_activation_max = 127; + + if (rt_handle == NULL) { + return -1; + } + if (cvk_ctx == NULL) { + return -1; + } + + cvk_tl_shape_t input_shape = {/*n=*/1, /*h=*/5, /*w=*/6, /*c=*/8}; + int8_t input_data[240] = { + 103, 85, -96, 120, 105, -72, 33, -50, -104, 12, -57, -80, + 12, 126, 117, 127, 119, 119, -88, 57, 120, 123, 117, -100, + -4, 76, 76, -52, -92, -127, -21, -100, 106, 35, 74, 96, + 117, 0, 39, 76, -119, -36, 89, -74, 111, 46, 45, -26, + 65, 61, 62, -7, -28, -20, 39, -84, -85, -51, 52, 76, + -120, -47, -58, 95, -117, -90, -104, 126, 82, 82, 49, -96, + -47, 67, 115, -3, -120, 41, -16, -96, -31, -75, 67, -115, + 75, -119, -81, -24, -3, -11, -14, -4, 37, 75, 53, 107, + 65, 78, -58, 52, 46, -128, 39, 53, -87, 36, -98, -12, + -1, 70, 117, 18, -41, 96, 21, 78, -71, -124, 64, 82, + -63, 82, 1, 112, 50, -23, 100, -20, 117, 20, 12, -88, + -93, 67, -90, -70, -63, 79, 87, 125, -63, -43, 80, -52, + -66, -125, 109, -73, -39, 104, -78, 89, -64, 116, 29, 71, + -7, 124, -38, -111, 84, 75, 21, 24, 12, 59, 106, 49, + -55, 46, 65, -28, 64, 15, -31, -75, 17, 7, -109, -25, + -115, -38, 7, 23, 71, -37, 111, 119, -95, -89, 17, -27, + -8, -29, -125, 58, -42, -29, -87, 109, 75, -17, -49, 92, + 7, 30, -86, -98, 26, -8, -61, -41, 39, 7, 48, 55, + 63, 125, -13, 56, -107, 105, -70, 1, 105, 14, -89, 0, + 83, -10, 9, 11, 127, -14, -108, 90, -15, 26, -101, -1}; + + cvk_tl_shape_t filter_shape = {1, 3, 3, 8}; + int8_t filter_data[72] = { + 103, 85, -96, 120, 105, -72, 33, -50, -104, 12, -57, -80, + 12, 126, 117, 127, 119, 119, -88, 57, 120, 123, 117, -100, + -4, 76, 76, -52, -92, -127, -21, -100, 106, 35, 74, 96, + 117, 0, 39, 76, -119, -36, 89, -74, 111, 46, 45, -26, + 65, 61, 62, -7, -28, -20, 39, -84, -85, -51, 52, 76, + -120, -47, -58, 95, -117, -90, -104, 126, 82, 82, 49, -96}; + + int32_t bias_data[8] = {812, 670, -746, 938, 827, -558, 265, -384}; + + uint32_t output_multiplier[8] = {1155460505, 1210948247, 1203328687, 1166122678, + 1155273687, 1196350022, 1169748238, 1183287581}; + + int8_t output_rshift[8] = {-7, -6, -6, -9, -8, -6, -6, -7}; + + cvk_tl_shape_t output_shape = {1, 3, 4, 8}; + int8_t output_data[96] = {0}; + int8_t ref_output_data[96] = { + 127, 127, 69, 34, 36, 127, 127, 127, -101, -65, 39, 13, + 26, 6, 127, -67, 60, 123, 31, 17, 3, -128, -58, -64, + -128, 26, -128, -21, 72, 55, 127, 94, -46, -128, -37, 1, + -6, 109, 98, -14, -11, 48, -128, -3, -50, 37, -20, 79, + -94, -36, 127, 19, 3, -18, -40, -115, 24, 124, -128, -1, + -52, -123, -54, -1, -62, 95, 127, 24, 10, -74, 127, -128, + -2, 111, 106, 4, 3, -128, 127, 127, -30, 98, -21, -1, + -11, -12, 58, -72, -128, 127, 30, 32, -85, -11, -35, 34}; + + const int batches = input_shape.n; + // const int output_depth = 8; + const int input_height = input_shape.c; + const int input_width = input_shape.h; + const int input_depth = input_shape.w; + const int filter_height = filter_shape.c; + const int filter_width = filter_shape.h; + const int output_height = output_shape.c; + const int output_width = output_shape.h; + + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < 
output_width; ++out_x) { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) { + for (int m = 0; m < depth_multiplier; ++m) { + const int output_channel = m + in_channel * depth_multiplier; + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; + // Zero padding by omitting the areas outside the image. + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + if (is_point_inside_image) { + int32_t input_val = input_data[Offset(input_shape, batch, in_y, + in_x, in_channel)]; + int32_t filter_val = filter_data[Offset( + filter_shape, 0, filter_y, filter_x, output_channel)]; + acc += filter_val * (input_val + input_offset); + + printf(" [batch=%d][out_y=%d][out_x=%d][in_channel=%d][m=%d]" + "[filter_y=%d][filter_x=%d] acc(%d) += %d * (%d + %d) " + "= %d\n", + batch, out_y, out_x, in_channel, m, filter_y, filter_x, + acc - filter_val * (input_val + input_offset), + filter_val, input_val, input_offset, acc); + } + } + } + if (1 /*bias_data*/) { + acc += bias_data[output_channel]; + } + + printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = " + "%d, bias %d\n", + batch, out_y, out_x, output_channel, acc, + bias_data[output_channel]); + + acc = MultiplyByQuantizedMultiplier( + acc, output_multiplier[output_channel], + output_rshift[output_channel]); + + printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = " + "%d, multiplier %d, shift %d\n", + batch, out_y, out_x, output_channel, acc, + output_multiplier[output_channel], + output_rshift[output_channel]); + + acc += output_offset; + acc = MAX(acc, output_activation_min); + acc = MIN(acc, output_activation_max); + + printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = " + "%d\n", + batch, out_y, out_x, output_channel, acc); + + { + int x = Offset(output_shape, batch, out_y, out_x, output_channel); + if (x >= 96) { + printf("Error ! 
shape=(%d, %d, %d, %d), batch %d, out_y %d, " + "out_x %d, output_channel %d, offset %d\n", + output_shape.n, output_shape.c, output_shape.h, + output_shape.w, batch, out_y, out_x, output_channel, x); + } + } + + output_data[Offset(output_shape, batch, out_y, out_x, + output_channel)] = acc; + } + } + } + } + } + + int output_size = + output_shape.n * output_shape.c * output_shape.h * output_shape.w; + for (int i = 0; i < output_size; ++i) { + if (output_data[i] != ref_output_data[i]) { + printf(" output_data[%d] = %d != %d\n", i, output_data[i], + ref_output_data[i]); + ret = -1; + } + } + + return ret; +} + +typedef struct { + int stride_width; + int stride_height; + int dilation_width_factor; + int dilation_height_factor; + int padding_width; + int padding_height; + int depth_multiplier; +} DwConvParams; + +void dw_conv_per_channel_ref(const dw_conv_test_param_t *p_param) +{ + const int input_offset = 0; // symmetric + const int output_offset = 0; // symmetric + const int output_activation_min = -128; + const int output_activation_max = 127; + + const int stride_width = p_param->stride_w; + const int stride_height = p_param->stride_h; + const int dilation_width_factor = 1; // params.dilation_width_factor; + const int dilation_height_factor = 1; // params.dilation_height_factor; + const int pad_width = p_param->pad_left; + const int pad_height = p_param->pad_top; + const int depth_multiplier = 1; // params.depth_multiplier; + + const int batches = p_param->input_n; + const int input_height = p_param->input_h; + const int input_width = p_param->input_w; + const int input_depth = p_param->input_c; + const int filter_height = p_param->kh; + const int filter_width = p_param->kw; + const int output_depth = p_param->output_c; + const int output_height = p_param->output_h; + const int output_width = p_param->output_w; + int8_t *input_data = p_param->input_data; + int8_t *filter_data = p_param->filter_data; + int8_t *output_data = p_param->output_data; + int32_t *bias_data = p_param->has_bias ? 
p_param->bias_data : NULL; + uint32_t *output_multiplier = p_param->multiplier_data; + int8_t *output_rshift = p_param->shift_data; + + cvk_tl_shape_t input_shape = { + batches, input_depth, input_height, input_width}; + cvk_tl_shape_t filter_shape = { + output_depth, input_depth, filter_height, filter_width}; + cvk_tl_shape_t output_shape = { + batches, output_depth, output_height, output_width}; + +#ifdef ENABLE_DEBUG_MSG + printf("dw_conv_per_channel_ref =>\n"); + printf(" input shape (n=%d, c=%d, h=%d, w=%d)\n", batches, input_depth, + input_height, input_width); + // printf(" filter shape (oc=%d, kh=%d, kw=%d\n", + // ); + printf(" output shape (n=%d, c=%d, h=%d, w=%d)\n", batches, output_depth, + output_height, output_width); + printf(" stride_h %d, stride_w %d\n", stride_height, stride_width); +#endif + + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) { + for (int m = 0; m < depth_multiplier; ++m) { + const int output_channel = m + in_channel * depth_multiplier; + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; + // Zero padding by omitting the areas outside the image. + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + if (is_point_inside_image) { + int32_t input_val = input_data[Offset(input_shape, batch, + in_channel, in_y, in_x)]; + int32_t filter_val = filter_data[Offset( + filter_shape, 0, output_channel, filter_y, filter_x)]; + acc += filter_val * (input_val + input_offset); + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_y=%d][out_x=%d][in_channel=%d][m=%d]" + "[filter_y=%d][filter_x=%d] acc(%d) += %d * (%d + %d) " + "= %d, in_x_origin %d, in_x %d\n", + batch, out_y, out_x, in_channel, m, filter_y, filter_x, + acc - filter_val * (input_val + input_offset), + filter_val, input_val, input_offset, acc, in_x_origin, + in_x); +#endif + } + } + } + if (bias_data) { + acc += bias_data[output_channel]; + } + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = " + "%d, bias %d\n", + batch, out_y, out_x, output_channel, acc, + bias_data ? 
bias_data[output_channel] : 0);
+#endif
+
+          acc = MultiplyByQuantizedMultiplier(
+              acc, output_multiplier[output_channel],
+              output_rshift[output_channel]);
+
+#ifdef ENABLE_DEBUG_MSG
+          printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = "
+                 "%d, multiplier %d, shift %d\n",
+                 batch, out_y, out_x, output_channel, acc,
+                 output_multiplier[output_channel],
+                 output_rshift[output_channel]);
+#endif
+
+          acc += output_offset;
+          acc = MAX(acc, output_activation_min);
+          acc = MIN(acc, output_activation_max);
+
+#ifdef ENABLE_DEBUG_MSG
+          printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = "
+                 "%d\n",
+                 batch, out_y, out_x, output_channel, acc);
+#endif
+
+          output_data[Offset(output_shape, batch, output_channel, out_y,
+                             out_x)] = acc;
+          }
+        }
+      }
+    }
+  }
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("<= dw_conv_per_channel_ref\n");
+#endif
+}
+
+void calc_dw_conv_float_multiplier(dw_conv_test_param_t *p_param)
+{
+  const int input_offset = 0; // symmetric
+
+  const int stride_width = p_param->stride_w;
+  const int stride_height = p_param->stride_h;
+  const int dilation_width_factor = 1;  // params.dilation_width_factor;
+  const int dilation_height_factor = 1; // params.dilation_height_factor;
+  const int pad_width = p_param->pad_left;
+  const int pad_height = p_param->pad_top;
+  const int depth_multiplier = 1; // params.depth_multiplier;
+
+  const int batches = p_param->input_n;
+  const int input_height = p_param->input_h;
+  const int input_width = p_param->input_w;
+  const int input_depth = p_param->input_c;
+  const int filter_height = p_param->kh;
+  const int filter_width = p_param->kw;
+  const int output_depth = p_param->output_c;
+  const int output_height = p_param->output_h;
+  const int output_width = p_param->output_w;
+  int8_t *input_data = p_param->input_data;
+  int8_t *filter_data = p_param->filter_data;
+  int32_t *bias_data = p_param->has_bias ? p_param->bias_data : NULL;
+
+  cvk_tl_shape_t input_shape = {
+      batches, input_depth, input_height, input_width};
+  cvk_tl_shape_t filter_shape = {
+      output_depth, input_depth, filter_height, filter_width};
+
+  int output_accu_min = INT_MAX;
+  int output_accu_max = INT_MIN;
+
+  // printf("calc_dw_conv_float_multiplier =>\n");
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+          for (int m = 0; m < depth_multiplier; ++m) {
+            const int output_channel = m + in_channel * depth_multiplier;
+            const int in_x_origin = (out_x * stride_width) - pad_width;
+            const int in_y_origin = (out_y * stride_height) - pad_height;
+            int32_t acc = 0;
+            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
+                // Zero padding by omitting the areas outside the image.
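+                // (This is the same in-bounds walk as dw_conv_per_channel_ref();
+                // this pass only tracks the accumulator min/max so a float
+                // multiplier can be derived from the observed range.)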
+ const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + if (is_point_inside_image) { + int32_t input_val = input_data[Offset(input_shape, batch, + in_channel, in_y, in_x)]; + int32_t filter_val = filter_data[Offset( + filter_shape, 0, output_channel, filter_y, filter_x)]; + acc += filter_val * (input_val + input_offset); + + // printf(" + // [batch=%d][out_y=%d][out_x=%d][in_channel=%d][m=%d]" + // "[filter_y=%d][filter_x=%d] acc(%d) += %d * (%d + + // %d) = %d\n", + // batch, out_y, out_x, in_channel, m, filter_y, + // filter_x, acc - filter_val * (input_val + + // input_offset), filter_val, input_val, input_offset, + // acc); + } + } + } + if (bias_data) { + acc += bias_data[output_channel]; + } + + output_accu_max = MAX(acc, output_accu_max); + output_accu_min = MIN(acc, output_accu_min); + + // printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = + // %d, MIN = %d, MAX = %d\n", + // batch, out_y, out_x, output_channel, acc, + // output_accu_min, output_accu_max); + } + } + } + } + } + + // Since int8 ranges from -128 to 127, we need to squeeze the accumulator + // MIN/MAX fit in those ranges correspondingly as much as possible. + if (abs(output_accu_max) > abs(output_accu_min)) { + p_param->float_multiplier = 127.0f / abs(output_accu_max); + } else { + p_param->float_multiplier = 128.0f / abs(output_accu_min); + } + +#ifdef ENABLE_DEBUG_MSG + printf(" output_accu_min %d, output_accu_max %d, output_multiplier %f\n", + output_accu_min, output_accu_max, p_param->float_multiplier); +#endif + + // printf("<= calc_dw_conv_float_multiplier\n"); +} + +int simple_dw_conv_test(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + + if (rt_handle == NULL) { + return -1; + } + if (cvk_ctx == NULL) { + return -1; + } + + const int batches = 1; + const int input_depth = 8; + const int input_height = 5; + const int input_width = 6; + cvk_tl_shape_t input_shape = {batches, input_depth, input_height, input_width}; + int8_t input_data[240] = { + /* ic = 0 */ + 103, -104, 119, -4, 106, -119, 65, -85, -117, -47, -31, -3, 65, -87, -41, + -63, 117, -63, -66, -64, 84, -55, 17, 71, -8, 75, 26, 63, 105, 127, + + /* ic = 1 */ + 85, 12, 119, 76, 35, -36, 61, -51, -90, 67, -75, -11, 78, 36, 96, 82, 20, + 79, -125, 116, 75, 46, 7, -37, -29, -17, -8, 125, 14, -14, + + /* ic = 2 */ + -96, -57, -88, 76, 74, 89, 62, 52, -104, 115, 67, -14, -58, -98, 21, 1, + 12, 87, 109, 29, 21, 65, -109, 111, -125, -49, -61, -13, -89, -108, + + /* ic = 3 */ + 120, -80, 57, -52, 96, -74, -7, 76, 126, -3, -115, -4, 52, -12, 78, 112, + -88, 125, -73, 71, 24, -28, -25, 119, 58, 92, -41, 56, 0, 90, + + /* ic = 4 */ + 105, 12, 120, -92, 117, 111, -28, -120, 82, -120, 75, 37, 46, -1, -71, 50, + -93, -63, -39, -7, 12, 64, -115, -95, -42, 7, 39, -107, 83, -15, + + /* ic = 5 */ + -72, 126, 123, -127, 0, 46, -20, -47, 82, 41, -119, 75, -128, 70, -124, + -23, 67, -43, 104, 124, 59, 15, -38, -89, -29, 30, 7, 105, -10, 26, + + /* ic = 6 */ + 33, 117, 117, -21, 39, 45, 39, -58, 49, -16, -81, 53, 39, 117, 64, 100, + -90, 80, -78, -38, 106, -31, 7, 17, -87, -86, 48, -70, 9, -101, + + /* ic = 7 */ + -50, 127, -100, -100, 76, -26, -84, 95, -96, -96, -24, 107, 53, 18, 82, + -20, -70, -52, 89, -111, 49, -75, 23, -27, 109, -98, 55, 1, 11, -1}; + + const int kernel_height = 3; + const int kernel_width = 3; + cvk_tl_shape_t filter_shape = {1, input_depth, kernel_height, kernel_width}; + // Global memory layout: OcKhKw + int8_t filter_data[72] = { + 103, -104, 119, -4, 
106, -119, 65, -85, -117, 85, 12, 119,
+      76, 35, -36, 61, -51, -90, -96, -57, -88, 76, 74, 89,
+      62, 52, -104, 120, -80, 57, -52, 96, -74, -7, 76, 126,
+      105, 12, 120, -92, 117, 111, -28, -120, 82, -72, 126, 123,
+      -127, 0, 46, -20, -47, 82, 33, 117, 117, -21, 39, 45,
+      39, -58, 49, -50, 127, -100, -100, 76, -26, -84, 95, -96};
+
+  int32_t bias_data[8] = {812, 670, -746, 938, 827, -558, 265, -384};
+
+  uint32_t output_multiplier[8] = {1155460505, 1210948247, 1203328687, 1166122678,
+                                   1155273687, 1196350022, 1169748238, 1183287581};
+
+  // Change to right shift
+  int8_t output_rshift[8] = {7, 6, 6, 9, 8, 6, 6, 7};
+
+  uint8_t per_channel_cal_data[8 * 4 + 8 * 4 + 8];
+  pack_chl_quan_param(8, /*has_bias=*/true, bias_data, output_multiplier,
+                      output_rshift, per_channel_cal_data);
+
+  const int output_height = 3;
+  const int output_width = 4;
+  cvk_tl_shape_t output_shape = {batches, input_depth, output_height, output_width};
+  int8_t ref_output_data[96] = {
+      /* oc = 0 */
+      127, -101, 60, -128, -46, -11, -94, 24, -62, -2, -30, -128,
+
+      /* oc = 1 */
+      127, -65, 123, 26, -128, 48, -36, 124, 95, 111, 98, 127,
+
+      /* oc = 2 */
+      69, 39, 31, -128, -37, -128, 127, -128, 127, 106, -21, 30,
+
+      /* oc = 3 */
+      34, 13, 17, -21, 1, -3, 19, -1, 24, 4, -1, 32,
+
+      /* oc = 4 */
+      36, 26, 3, 72, -6, -50, 3, -52, 10, 3, -11, -85,
+
+      /* oc = 5 */
+      127, 6, -128, 55, 109, 37, -18, -123, -74, -128, -12, -11,
+
+      /* oc = 6 */
+      127, 127, -58, 127, 98, -20, -40, -54, 127, 127, 58, -35,
+
+      /* oc = 7 */
+      127, -67, -64, 94, -14, 79, -115, -1, -128, 127, -72, 34};
+
+  cvk_tl_t *tl_input =
+      cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, input_shape, CVK_FMT_I8, /*eu_align=*/1);
+
+  cvk_tl_t *tl_filter =
+      cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, filter_shape, CVK_FMT_I8, /*eu_align=*/1);
+
+  cvk_tl_t *tl_output =
+      cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, output_shape, CVK_FMT_I8, /*eu_align=*/1);
+
+  cvk_tl_t *tl_per_channel_cal =
+      cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape_t4(1, 8, 1, 9), CVK_FMT_U8,
+                                      /*eu_align*/ 0);
+
+  tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_per_channel_cal, per_channel_cal_data);
+  tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_input, (uint8_t *)input_data);
+  tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_filter, (uint8_t *)filter_data);
+
+  {
+    // Reshape per channel quantization data
+    tl_per_channel_cal->shape = tl_shape_t4(1, 8, 1, 1);
+    tl_per_channel_cal->stride = cvk_ctx->ops->tl_default_stride(
+        cvk_ctx, tl_per_channel_cal->shape, CVK_FMT_I8, /*eu_align=*/0);
+
+    cvk_tiu_depthwise_convolution_param_t param;
+    memset(&param, 0, sizeof(param));
+    param.ofmap = tl_output;
+    param.ifmap = tl_input;
+    param.weight = tl_filter;
+    param.chl_quan_param = tl_per_channel_cal;
+    param.dilation_h = 1;
+    param.dilation_w = 1;
+    param.stride_h = 1;
+    param.stride_w = 1;
+    param.has_bias = 1;
+    cvk_ctx->ops->tiu_depthwise_convolution(cvk_ctx, &param);
+  }
+
+  CVI_RT_Submit(cvk_ctx);
+
+  printf("Compare tiu and golden\n");
+  int8_t *conv_output_data =
+      (int8_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_output);
+  for (int i = 0; i < (int)sizeof(ref_output_data); i++) {
+    if (conv_output_data[i] != ref_output_data[i]) {
+      printf("output_data[%d] %d != %d\n", i, conv_output_data[i],
+             ref_output_data[i]);
+      ret = -1;
+    }
+  }
+
+  free(conv_output_data);
+
+  int8_t output_data[96] = {0};
+  memset(output_data, 0, sizeof(output_data));
+
+  dw_conv_test_param_t params;
+  memset(&params, 0, sizeof(params));
+  params.input_n = batches;
+  params.input_c = input_depth;
+  params.input_h = input_height;
+  params.input_w = input_width;
+  params.kh = kernel_height;
+  params.kw = kernel_width;
+  params.output_c = input_depth;
+  params.output_h = output_height;
+  params.output_w = output_width;
+  params.stride_w = 1;
+  params.stride_h = 1;
+  params.input_data = input_data;
+  params.filter_data = filter_data;
+  params.output_data = output_data;
+  params.has_bias = 1;
+  params.bias_data = bias_data;
+  params.multiplier_data = output_multiplier;
+  params.shift_data = output_rshift;
+
+  dw_conv_per_channel_ref(&params);
+
+  printf("Compare ref and golden\n");
+  int output_size =
+      output_shape.n * output_shape.c * output_shape.h * output_shape.w;
+  for (int i = 0; i < output_size; ++i) {
+    if (output_data[i] != ref_output_data[i]) {
+      printf(" output_data[%d] = %d != %d\n", i, output_data[i],
+             ref_output_data[i]);
+      ret = -1;
+    }
+  }
+
+  // Reverse order
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_per_channel_cal);
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output);
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_filter);
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input);
+
+  return ret;
+}
+
+int choose_from_range(int table[], int size, int index)
+{
+  if (index >= size) {
+    return 0;
+  }
+
+  int val = table[index];
+  if (index < (size - 1)) {
+    int range = MAX(table[index + 1] - table[index] - 1, 1);
+    val += rand() % range;
+  }
+
+  return val;
+}
+
+void dump_test_param(dw_conv_test_param_t *p_param, bool dump_content)
+{
+  printf("Dump test parameter:\n");
+  printf(" input_n %d\n", p_param->input_n);
+  printf(" input_c %d\n", p_param->input_c);
+  printf(" input_h %d\n", p_param->input_h);
+  printf(" input_w %d\n", p_param->input_w);
+  printf(" kw %d\n", p_param->kw);
+  printf(" kh %d\n", p_param->kh);
+  printf(" dh %d\n", p_param->dh);
+  printf(" dw %d\n", p_param->dw);
+  printf(" pad_top %d\n", p_param->pad_top);
+  printf(" pad_bot %d\n", p_param->pad_bot);
+  printf(" pad_left %d\n", p_param->pad_left);
+  printf(" pad_right %d\n", p_param->pad_right);
+  printf(" ins_h %d\n", p_param->ins_h);
+  printf(" ins_h_last %d\n", p_param->ins_h_last);
+  printf(" ins_w %d\n", p_param->ins_w);
+  printf(" ins_w_last %d\n", p_param->ins_w_last);
+  printf(" stride_h %d\n", p_param->stride_h);
+  printf(" stride_w %d\n", p_param->stride_w);
+  printf(" output_c %d\n", p_param->output_c);
+  printf(" output_h %d\n", p_param->output_h);
+  printf(" output_w %d\n", p_param->output_w);
+  printf(" has_bias %d\n", p_param->has_bias);
+  printf(" relu_enable %d\n", p_param->relu_enable);
+
+  if (dump_content) {
+    printf("input_data(%d, %d, %d, %d) :\n", p_param->input_n, p_param->input_c,
+           p_param->input_h, p_param->input_w);
+    int in = p_param->input_n;
+    int ic = p_param->input_c;
+    int ih = p_param->input_h;
+    int iw = p_param->input_w;
+    for (int i = 0; i < in; ++i) {
+      for (int j = 0; j < ic; ++j) {
+        for (int k = 0; k < ih; ++k) {
+          for (int l = 0; l < iw; ++l) {
+            int offset = i * (ic * ih * iw) + j * (ih * iw) + k * iw + l;
+            printf("%d, ", p_param->input_data[offset]);
+          }
+          printf("\n");
+        }
+      }
+    }
+    printf("\n\n");
+
+    printf("kernel_data (%d, %d, %d)\n", p_param->output_c, p_param->kh,
+           p_param->kw);
+    int kh = p_param->kh;
+    int kw = p_param->kw;
+    for (int i = 0; i < ic; ++i) {
+      for (int j = 0; j < kh; ++j) {
+        for (int k = 0; k < kw; ++k) {
+          int offset = i * (kh * kw) + j * kw + k;
+          printf("%d, ", p_param->filter_data[offset]);
+        }
+      }
+      printf("\n");
+    }
+    printf("\n\n");
+
+    if (p_param->has_bias) {
+      printf("bias_data:\n");
+      for (int i = 0; i < ic; ++i) {
+        printf("%d, ", p_param->bias_data[i]);
+      }
+
printf("\n\n"); + } + + printf("multiplier_data:\n"); + for (int i = 0; i < ic; ++i) { + printf("%d, ", p_param->multiplier_data[i]); + } + printf("\n\n"); + + printf("shift_data:\n"); + for (int i = 0; i < ic; ++i) { + printf("%d, ", p_param->shift_data[i]); + } + printf("\n\n"); + } +} + +int run_compare_dw_conv(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, + dw_conv_test_param_t *p_param) +{ + int ret = 0; + + if (rt_handle == NULL || cvk_ctx == NULL) { + return -1; + } + + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int oh = p_param->output_h; + int ow = p_param->output_w; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_last_h = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_last_w = p_param->ins_w_last; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int has_bias = p_param->has_bias; + int relu_enable = p_param->relu_enable; + + int input_size = in * ic * iw * ih; + int8_t *input_data = (int8_t *)malloc(input_size); + + int kernel_size = oc * ic * kh * kw; + int8_t *kernel_data = (int8_t *)malloc(kernel_size); + + int output_size = in * oc * oh * ow; + int8_t *output_data = (int8_t *)malloc(output_size); + memset(output_data, 0, output_size); + + int32_t *bias_data = (int32_t *)malloc(sizeof(int32_t) * oc); + uint32_t *multiplier_data = (uint32_t *)malloc(sizeof(uint32_t) * oc); + int8_t *shift_data = (int8_t *)malloc(oc); + + p_param->input_data = input_data; + p_param->filter_data = kernel_data; + p_param->output_data = output_data; + p_param->has_bias = has_bias; + p_param->bias_data = bias_data; + p_param->multiplier_data = multiplier_data; + p_param->shift_data = shift_data; + +#ifdef ENABLE_DEBUG_MSG + printf(" run_compare_dw_conv =>\n"); + printf(" input (n=%d, ic=%d, h=%d, w=%d), kernel (oc=%d, ic=%d, h=%d, " + "w=%d), output (, c=%d, h=%d, w=%d), has_bias %d\n", + in, ic, ih, iw, oc, ic, kh, kw, oc, oh, ow, has_bias); +#endif + + int retry_cnt = p_param->retry_cnt; + do { + fill_random_data_s8(input_data, input_size); + fill_random_data_s8(kernel_data, kernel_size); + if (has_bias) { + fill_random_data_s32(bias_data, oc); + } + + p_param->float_multiplier = 100.0; // should be < 1.0 + calc_dw_conv_float_multiplier(p_param); + + if (p_param->float_multiplier > 0.f && p_param->float_multiplier < 1.0) { + break; + } + + } while (--retry_cnt); + + if (p_param->float_multiplier >= 1.0) { + printf(" run_compare_dw_conv: unable to find valid multiplier\n"); + free(input_data); + free(kernel_data); + free(output_data); + free(bias_data); + free(multiplier_data); + free(shift_data); + return -1; + } + + uint32_t base_multiplier = 0; + int base_shift = 0; + QuantizeMultiplierSmallerThanOne(p_param->float_multiplier, &base_multiplier, + &base_shift); + + for (int i = 0; i < oc; ++i) { + // multipliers typically range in [2^30 ; 2^31 - 1]. + // Values in [0, 2^30 - 1] are normally unused, but harmless. + // Thus a good way to randomize multipliers is to subtract from them + // a random value smaller than 2^30 but still significant compared to it. 
+    p_param->multiplier_data[i] = base_multiplier - (rand() % (1 << 26));
+
+    int right_shift = base_shift - 1 + (rand() % 4);
+    p_param->shift_data[i] =
+        truncate_rshift((int8_t)right_shift, /*allow_lshift*/1);
+
+#ifdef ENABLE_DEBUG_MSG
+    printf(" [oc=%d] multiplier_data %d, shift_data %d\n", i,
+           p_param->multiplier_data[i], p_param->shift_data[i]);
+#endif
+  }
+
+  dw_conv_per_channel_ref(p_param);
+
+  const int per_chan_cal_data_size =
+      p_param->has_bias ? 9 : 5; // bias(4) + multiplier(4) + shift(1)
+  const int cal_data_size = oc * per_chan_cal_data_size;
+  uint8_t *cal_data = (uint8_t *)malloc(cal_data_size);
+  pack_chl_quan_param(oc, p_param->has_bias, p_param->bias_data,
+                      p_param->multiplier_data, p_param->shift_data,
+                      cal_data);
+
+  cvk_tl_shape_t input_shape = tl_shape_t4(in, ic, ih, iw);
+  cvk_tl_shape_t filter_shape = tl_shape_t4(1, oc, kh, kw);
+  cvk_tl_shape_t output_shape = tl_shape_t4(in, oc, oh, ow);
+  cvk_tl_shape_t cal_shape = tl_shape_t4(1, oc, 1, per_chan_cal_data_size);
+
+  cvk_tl_t *tl_input =
+      cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, input_shape, CVK_FMT_I8, /*eu_align=*/1);
+
+  cvk_tl_t *tl_filter =
+      cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, filter_shape, CVK_FMT_I8, /*eu_align=*/1);
+
+  cvk_tl_t *tl_output =
+      cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, output_shape, CVK_FMT_I8, /*eu_align=*/1);
+
+  // Shape for TDMA load
+  cvk_tl_t *tl_cal_data =
+      cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, cal_shape, CVK_FMT_U8, /*eu_align*/ 0);
+
+  if (tl_input == NULL) {
+    printf(" fail to alloc tl_input (%d, %d, %d, %d)\n", input_shape.n,
+           input_shape.c, input_shape.h, input_shape.w);
+    return -1;
+  }
+  if (tl_filter == NULL) {
+    printf(" fail to alloc tl_filter (%d, %d, %d, %d)\n", filter_shape.n,
+           filter_shape.c, filter_shape.h, filter_shape.w);
+    return -1;
+  }
+  if (tl_output == NULL) {
+    printf(" fail to alloc tl_output (%d, %d, %d, %d)\n", output_shape.n,
+           output_shape.c, output_shape.h, output_shape.w);
+    return -1;
+  }
+  if (tl_cal_data == NULL) {
+    printf(" fail to alloc tl_cal_data (%d, %d, %d, %d)\n", cal_shape.n,
+           cal_shape.c, cal_shape.h, cal_shape.w);
+    return -1;
+  }
+
+  tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_cal_data, cal_data);
+  tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_input, (uint8_t *)input_data);
+  tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_filter, (uint8_t *)kernel_data);
+
+  {
+    // Reshape per channel quantization data for TIU
+    tl_cal_data->shape = tl_shape_t4(1, oc, 1, 1);
+    tl_cal_data->stride = cvk_ctx->ops->tl_default_stride(
+        cvk_ctx, tl_cal_data->shape, CVK_FMT_I8, /*eu_align=*/0);
+
+    cvk_tiu_depthwise_convolution_param_t param;
+    memset(&param, 0, sizeof(param));
+    param.ofmap = tl_output;
+    param.ifmap = tl_input;
+    param.weight = tl_filter;
+    param.chl_quan_param = tl_cal_data;
+    param.ins_h = ins_h;
+    param.ins_last_h = ins_last_h;
+    param.ins_w = ins_w;
+    param.ins_last_w = ins_last_w;
+    param.stride_h = stride_h;
+    param.stride_w = stride_w;
+    param.dilation_h = dh;
+    param.dilation_w = dw;
+    param.pad_top = pad_top;
+    param.pad_bottom = pad_bot;
+    param.pad_left = pad_left;
+    param.pad_right = pad_right;
+    param.has_bias = has_bias;
+    param.relu_enable = relu_enable;
+
+#ifdef ENABLE_DEBUG_MSG
+    printf(" tiu_dw_conv_qdm:\n");
+    printf(" ifmap shape (%d, %d, %d, %d)\n", param.ifmap->shape.n,
+           param.ifmap->shape.c, param.ifmap->shape.h, param.ifmap->shape.w);
+    printf(" weight shape (%d, %d, %d, %d)\n", param.weight->shape.n,
+           param.weight->shape.c, param.weight->shape.h, param.weight->shape.w);
+    printf(" ofmap shape (%d, %d, %d, %d)\n", param.ofmap->shape.n,
+           param.ofmap->shape.c, param.ofmap->shape.h, param.ofmap->shape.w);
+#endif
+
+    cvk_ctx->ops->tiu_depthwise_convolution(cvk_ctx, &param);
+  }
+
+  CVI_RT_Submit(cvk_ctx);
+
+#ifdef ENABLE_DEBUG_MSG
+  printf(" compare result:\n");
+#endif
+  int8_t *conv_output_data =
+      (int8_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_output);
+  for (int i = 0; i < output_size; i++) {
+    if (conv_output_data[i] != output_data[i]) {
+      printf(" output_data[%d] %d(tiu) != %d(ref)\n", i,
+             conv_output_data[i], output_data[i]);
+      ret = -1;
+      break;
+    }
+  }
+
+  if (ret) {
+    dump_test_param(p_param, /*dump_content=*/true);
+  }
+
+  // Reverse order
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_cal_data);
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output);
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_filter);
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input);
+
+  free(conv_output_data);
+
+  free(input_data);
+  free(kernel_data);
+  free(output_data);
+  free(bias_data);
+  free(multiplier_data);
+  free(shift_data);
+  free(cal_data);
+
+#ifdef ENABLE_DEBUG_MSG
+  printf(" <= run_compare_dw_conv\n");
+#endif
+
+  return ret;
+}
+
+bool check_valid_test_param(cvk_context_t *cvk_ctx, dw_conv_test_param_t *p_param)
+{
+  int in = p_param->input_n;
+  int ic = p_param->input_c;
+  int ih = p_param->input_h;
+  int iw = p_param->input_w;
+  int oc = p_param->output_c;
+  int oh = p_param->output_h;
+  int ow = p_param->output_w;
+  int kh = p_param->kh;
+  int kw = p_param->kw;
+  int stride_h = p_param->stride_h;
+  int stride_w = p_param->stride_w;
+  int per_chan_cal_data_size =
+      p_param->has_bias ? 9 : 5; // bias(4) + multiplier(4) + shift(1)
+
+  // Skip invalid shape
+  if ((kh > ih) || (kw > iw) || (stride_h > ih) || (stride_w > iw)) {
+    return false;
+  }
+
+  // Multiplying the randomly chosen dimensions may exceed the range of int32_t.
+  uint32_t input_size = in * ic * ih * iw;
+  uint32_t kernel_size = ic * kh * kw; // no oc
+  uint32_t output_size = in * oc * oh * ow;
+
+  uint32_t lmem_size_per_lane = cvk_ctx->info.lmem_size;
+  uint32_t total_lmem_size = cvk_ctx->info.lmem_size * cvk_ctx->info.npu_num;
+
+  uint32_t total_needed_size = input_size + kernel_size + output_size +
+                               per_chan_cal_data_size * cvk_ctx->info.npu_num;
+  if (total_needed_size > total_lmem_size) {
+    return false;
+  }
+
+  cvk_tl_shape_t input_shape = {in, ic, ih, iw};
+  cvk_tl_shape_t filter_shape = {1, oc, kh, kw};
+  cvk_tl_shape_t output_shape = {in, oc, oh, ow};
+  cvk_tl_shape_t cal_shape = {1, oc, 1, per_chan_cal_data_size};
+
+  uint32_t needed_size =
+      cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, input_shape, CVK_FMT_I8, /*eu_align=*/1) +
+      cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, filter_shape, CVK_FMT_I8, /*eu_align=*/1) +
+      cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, output_shape, CVK_FMT_I8, /*eu_align=*/1) +
+      cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, cal_shape, CVK_FMT_I8, /*eu_align=*/0);
+
+  // Skip invalid shape
+  if (needed_size > lmem_size_per_lane) {
+    return false;
+  }
+
+  return true;
+}
+
+int random_test(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx)
+{
+  int ret = 0;
+
+  if (rt_handle == NULL || cvk_ctx == NULL) {
+    return -1;
+  }
+
+#ifndef ENABLE_FULL_REGRESSION
+  // Input ranges (all of the same length)
+  // n: 12b, c: 12b, h: 12b (4095 - 32), w: 12b (4095 - 32)
+  int batch_range[] = {1, 1, 3232};
+  int input_height_range[] = {1, 512, 4095 - 32};
+  int input_width_range[] = {1, 512, 4095 - 32};
+  int input_depth_range[] = {1, 16, 4095 - 32};
+
+  // Kernel ranges (all of the same length)
+  // h: 12b, w: 12b
+  // stride_h: 4b, stride_w: 4b
+  int kernel_height_range[] = {1, 11, 2048, 4095};
+int random_test(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx)
+{
+  int ret = 0;
+
+  if (rt_handle == NULL || cvk_ctx == NULL) {
+    return -1;
+  }
+
+#ifndef ENABLE_FULL_REGRESSION
+  // Input ranges share one size
+  // n: 12b, c: 12b, h: 12b(4095-32), w: 12b(4095-32)
+  int batch_range[] = {1, 1, 3232};
+  int input_height_range[] = {1, 512, 4095 - 32};
+  int input_width_range[] = {1, 512, 4095 - 32};
+  int input_depth_range[] = {1, 16, 4095 - 32};
+
+  // Kernel ranges share one size
+  // h: 12b, w: 12b
+  // stride_h: 4b, stride_w: 4b
+  int kernel_height_range[] = {1, 11, 2048, 4095};
+  int kernel_width_range[] = {1, 11, 2048, 4095};
+  int kernel_stride_height_range[] = {1, 5, 16, 31};
+  int kernel_stride_width_range[] = {1, 5, 16, 31};
+#else
+  // Input ranges share one size
+  // n: 12b, c: 12b, h: 12b(4095-32), w: 12b(4095-32)
+  int batch_range[] = {1, 2, 4, 8, 16, 32, 64, 4095 - 32};
+  int input_height_range[] = {1, 3, 11, 128, 512, 1024, 2048, 4095 - 32};
+  int input_width_range[] = {1, 3, 11, 128, 512, 1024, 2048, 4095 - 32};
+  int input_depth_range[] = {1, 3, 11, 32, 64, 1024, 2048, 4095 - 32};
+
+  // Kernel ranges share one size
+  // h: 12b, w: 12b
+  // stride_h: 4b, stride_w: 4b
+  int kernel_height_range[] = {1, 3, 11, 511, 4095};
+  int kernel_width_range[] = {1, 3, 11, 511, 4095};
+  int kernel_stride_height_range[] = {1, 3, 15, 16, 31};
+  int kernel_stride_width_range[] = {1, 3, 15, 16, 31};
+#endif /* ENABLE_FULL_REGRESSION */
+
+  const int input_range_size =
+      sizeof(input_height_range) / sizeof(input_height_range[0]);
+  const int kernel_range_size =
+      sizeof(kernel_height_range) / sizeof(kernel_height_range[0]);
+
+  int random_seed = clock();
+  srand(random_seed);
+
+  const int retry_test_count = 100;
+  bool stop_at_first_error = true;
+
+  int executed_tests = 0;
+  int failed_tests = 0;
+
+  printf("dw-conv-qm: random test =>\n");
+  for (int t = 0; t < retry_test_count; ++t) {
+    for (int i = 0; i < input_range_size; ++i) {
+      // randomly chosen from [range[i], range[i+1]]
+      int batch = choose_from_range(batch_range, input_range_size, i);
+
+      for (int j = 0; j < input_range_size; ++j) {
+        int input_height =
+            choose_from_range(input_height_range, input_range_size, j);
+
+        for (int k = 0; k < input_range_size; ++k) {
+          int input_width =
+              choose_from_range(input_width_range, input_range_size, k);
+
+          for (int l = 0; l < input_range_size; ++l) {
+            int input_depth =
+                choose_from_range(input_depth_range, input_range_size, l);
+
+            for (int m = 0; m < kernel_range_size; ++m) {
+              int kernel_height =
+                  choose_from_range(kernel_height_range, kernel_range_size, m);
+
+              for (int n = 0; n < kernel_range_size; ++n) {
+                int kernel_width =
+                    choose_from_range(kernel_width_range, kernel_range_size, n);
+
+                for (int x = 0; x < kernel_range_size; ++x) {
+                  int kernel_stride_height = choose_from_range(
+                      kernel_stride_height_range, kernel_range_size, x);
+
+                  for (int y = 0; y < kernel_range_size; ++y) {
+                    int kernel_stride_width = choose_from_range(
+                        kernel_stride_width_range, kernel_range_size, y);
+
+                    int has_bias = rand() % 2;
+                    int dh = 1;
+                    int dw = 1;
+                    int ins_h = 0;
+                    int ins_h_last = 0;
+                    int ins_w = 0;
+                    int ins_w_last = 0;
+                    int pad_top = 0;
+                    int pad_bot = 0;
+                    int pad_left = 0;
+                    int pad_right = 0;
+
+                    int ih_ext = calc_dilute_hw(input_height, ins_h, ins_h_last,
+                                                pad_top, pad_bot);
+                    int iw_ext = calc_dilute_hw(input_width, ins_w, ins_w_last,
+                                                pad_left, pad_right);
+                    int kh_ext = calc_dilute_hw(kernel_height, dh - 1, 0, 0, 0);
+                    int kw_ext = calc_dilute_hw(kernel_width, dw - 1, 0, 0, 0);
+
+                    int oh =
+                        calc_output_hw(ih_ext, kh_ext, kernel_stride_height);
+                    int ow =
+                        calc_output_hw(iw_ext, kw_ext, kernel_stride_width);
+
+                    // depthwise, input depth == output depth
+                    int output_depth = input_depth;
+
+                    dw_conv_test_param_t test_param;
+                    memset(&test_param, 0, sizeof(test_param));
+                    test_param.input_n = batch;
+                    test_param.input_c = input_depth;
+                    test_param.input_h = input_height;
+                    test_param.input_w = input_width;
+                    test_param.kh = kernel_height;
+                    test_param.kw = kernel_width;
+                    test_param.dh = dh;
+                    test_param.dw = dw;
+                    test_param.pad_top = pad_top;
+                    test_param.pad_bot = pad_bot;
+                    test_param.pad_left = pad_left;
+                    test_param.pad_right = pad_right;
+                    test_param.ins_h = ins_h;
+                    test_param.ins_h_last = ins_h_last;
+                    test_param.ins_w = ins_w;
+                    test_param.ins_w_last = ins_w_last;
+                    test_param.stride_h = kernel_stride_height;
+                    test_param.stride_w = kernel_stride_width;
+                    test_param.output_c = output_depth;
+                    test_param.output_h = oh;
+                    test_param.output_w = ow;
+                    test_param.has_bias = has_bias;
+                    test_param.retry_cnt = 5;
+
+                    bool is_valid_param =
+                        check_valid_test_param(cvk_ctx, &test_param);
+                    if (is_valid_param == false) {
+                      continue;
+                    }
+
+                    int ret2 = run_compare_dw_conv(rt_handle, cvk_ctx, &test_param);
+                    failed_tests = ret2 ? failed_tests + 1 : failed_tests;
+                    ret |= ret2;
+                    executed_tests++;
+
+#ifdef ENABLE_DEBUG_MSG
+                    printf(" [%d] random test: input shape(%d, %d, %d, %d)",
+                           executed_tests, batch, input_depth,
+                           input_height, input_width);
+                    printf(", kernel shape (%d, %d, %d, %d), result %d\n",
+                           output_depth, input_depth, kernel_height,
+                           kernel_width, ret);
+#endif
+
+                    // Stop at first error
+                    if (ret && stop_at_first_error) {
+                      break;
+                    }
+                  }
+
+                  // Stop at first error
+                  if (ret && stop_at_first_error) {
+                    break;
+                  }
+                }
+
+                // Stop at first error
+                if (ret && stop_at_first_error) {
+                  break;
+                }
+              }
+
+              // Stop at first error
+              if (ret && stop_at_first_error) {
+                break;
+              }
+            }
+
+            // Stop at first error
+            if (ret && stop_at_first_error) {
+              break;
+            }
+          }
+
+          // Stop at first error
+          if (ret && stop_at_first_error) {
+            break;
+          }
+        }
+
+        // Stop at first error
+        if (ret && stop_at_first_error) {
+          break;
+        }
+      }
+
+      // Stop at first error
+      if (ret && stop_at_first_error) {
+        break;
+      }
+    }
+
+    // Stop at first error
+    if (ret && stop_at_first_error) {
+      break;
+    }
+
+    if (executed_tests >= MIN_EXEC_TESTS) {
+      break;
+    }
+  }
+
+  printf("<= dw-conv-qm: random test, total %d, failed %d, ret %d\n",
+         executed_tests, failed_tests, ret);
+
+  return ret;
+}
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  // ret = simple_nhwc_dw_conv_test(rt_handle, cvk_ctx);
+  // ret |= test_nhwc_to_nchw();
+  ret |= simple_dw_conv_test(rt_handle, cvk_ctx);
+  ret |= random_test(rt_handle, cvk_ctx);
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/180x/test_180x_depthwise_max_power.c b/cviruntime/test/180x/test_180x_depthwise_max_power.c
new file mode 100644
index 000000000..7cf5a6cd3
--- /dev/null
+++ b/cviruntime/test/180x/test_180x_depthwise_max_power.c
@@ -0,0 +1,633 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <time.h>
+
+#include "test_cvikernel_util.h"
+#include "test_native_ref.h"
+
+typedef cvk_tiu_depthwise_pt_convolution_param_t depthwise_conv_param_t;
+typedef cvk_tdma_l2g_tensor_copy_cw_transposed_param_t l2g_cw_param_t;
+typedef cvk_tdma_g2l_matrix_copy_row_col_transposed_param_t g2l_matrix_param_t;
+typedef cvk_tdma_l2l_tensor_copy_param_t l2l_tensor_copy_param_t;
+
+typedef struct {
+  int8_t *depthwise_conv_input;
+  int8_t *depthwise_conv_weight;
+  int16_t *depthwise_conv_bias;
+  uint8_t *depthwise_conv_output;
+  int8_t *depthwise_conv_output_ref;
+  uint8_t *l2g_cw_src;
+  uint8_t *l2g_cw_output;
+  uint8_t *l2g_cw_output_ref;
+  uint8_t *g2l_matrix_src;
+  uint8_t *g2l_matrix_output;
+  uint8_t *g2l_matrix_output_ref;
+  uint8_t *l2l_tensor_src;
+  uint8_t *l2l_tensor_output;
+  uint8_t *l2l_tensor_output_ref;
+} s_test_data;
+
+depthwise_conv_param_t depthwise_conv_param;
+l2g_cw_param_t l2g_cw_param;
+g2l_matrix_param_t g2l_matrix_param;
+l2l_tensor_copy_param_t l2l_tensor_copy_param;
+s_test_data s8_test_data;
+
+cvk_tl_t *skip_tensor_lmem[10];
+uint32_t skip_tensor_num = 0;
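+
+// Each skip_* helper below deliberately over-allocates: after a tensor (or
+// matrix) is placed, it grabs whatever is left of the current local-memory
+// bank so the next allocation starts on a bank boundary. Keeping concurrent
+// operands in distinct banks is presumably what lets the TDMA and TIU
+// engines run in parallel at full rate for this max-power pattern.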
+void skip_tensor_lmem_size(cvk_context_t *cvk_ctx, const cvk_tl_t *p)
+{
+  uint32_t needed = align_up(p->shape.n * p->stride.n, cvk_ctx->info.eu_num);
+  uint32_t start_addr = p->start_address + needed;
+  uint32_t remain_size = start_addr % cvk_ctx->info.lmem_bank_size ?
+      (cvk_ctx->info.lmem_bank_size - start_addr % cvk_ctx->info.lmem_bank_size) : 0;  // remaining bytes in this bank, per lane
+  if (remain_size)
+  {
+    cvk_tl_shape_t src_shape2 = tl_shape_t4(1, cvk_ctx->info.npu_num, 1, remain_size);
+    skip_tensor_lmem[skip_tensor_num] = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, src_shape2, CVK_FMT_I8, 1);  // consume the remainder so the next tl is bank-aligned
+  }
+  skip_tensor_num++;
+}
+
+void skip_matrix_lmem_size(cvk_context_t *cvk_ctx, const cvk_ml_t *p)
+{
+  uint32_t needed = align_up(p->shape.n * p->stride.n, cvk_ctx->info.eu_num);
+
+  uint32_t start_addr = p->start_address + needed;
+  uint32_t remain_size = start_addr % cvk_ctx->info.lmem_bank_size ?
+      (cvk_ctx->info.lmem_bank_size - start_addr % cvk_ctx->info.lmem_bank_size) : 0;  // remaining bytes in this bank, per lane
+  if (remain_size)
+  {
+    cvk_tl_shape_t src_shape2 = {1, cvk_ctx->info.npu_num, 1, remain_size};
+    skip_tensor_lmem[skip_tensor_num] = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, src_shape2, CVK_FMT_I8, 1);  // consume the remainder so the next tl is bank-aligned
+  }
+  skip_tensor_num++;
+}
+
+void free_skip_tensor_lmem(cvk_context_t *cvk_ctx)
+{
+  if (skip_tensor_lmem[--skip_tensor_num] != NULL)
+    cvk_ctx->ops->lmem_free_tensor(cvk_ctx, skip_tensor_lmem[skip_tensor_num]);
+}
+
+static int8_t * alloc_input(const depthwise_conv_param_t *p)
+{
+  uint64_t size = tl_shape_size(&p->ifmap->shape, p->ifmap->fmt);
+  int8_t *buf = (int8_t *)malloc(sizeof(int8_t) * size);
+  if (!buf)
+    return NULL;
+
+  for (uint64_t i = 0; i < size; i++)
+    buf[i] = rand() % 256 - 128;
+
+  return buf;
+}
+
+static int8_t * alloc_weight(const depthwise_conv_param_t *p)
+{
+  int size = tl_shape_size(&p->weight->shape, p->weight->fmt);
+  int8_t *buf = (int8_t *)malloc(sizeof(int8_t) * size);
+  if (!buf)
+    return NULL;
+
+  for (int i = 0; i < size; i++)
+    buf[i] = rand() % 256 - 128;
+
+  return buf;
+}
+
+static int16_t * alloc_bias(const depthwise_conv_param_t *p)
+{
+  int c = p->bias->shape.c;
+  int16_t *bias = (int16_t *)malloc(sizeof(int16_t) * c);
+  if (!bias)
+    return NULL;
+
+  for (int i = 0; i < c; i++)
+    bias[i] = rand() % 65536 - 32768;
+
+  return bias;
+}
+
+static int8_t *alloc_output(depthwise_conv_param_t *p)
+{
+  uint64_t size = tl_shape_size(&p->ofmap->shape, p->ofmap->fmt);
+  int8_t *output = (int8_t *)malloc(sizeof(int8_t) * size);
+  return output;
+}
+
+static inline void relu8(int8_t *buf, uint64_t size)
+{
+  for (uint64_t i = 0; i < size; i++)
+    if (buf[i] < 0)
+      buf[i] = 0;
+}
+
+static int generate_results(
+    depthwise_conv_param_t *p,
+    int8_t input[],
+    int8_t weight[],
+    int16_t bias[]
+    )
+{
+  int in = p->ifmap->shape.n;
+  int ic = p->ifmap->shape.c;
+  int ih = p->ifmap->shape.h;
+  int iw = p->ifmap->shape.w;
+  int kh =
p->weight->shape.h; + int kw = p->weight->shape.w; + int opd0_sign = (p->ifmap->fmt == CVK_FMT_I8); + int res0_sign = (p->ofmap->fmt == CVK_FMT_I8); + s8_test_data.depthwise_conv_output_ref = alloc_output(p); + + int ret = native_pooling_ave_int8( + input, weight, p->bias ? bias : NULL, s8_test_data.depthwise_conv_output_ref, + in, ic, ih, iw, kh, kw, + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right, + p->stride_h, p->stride_w, + p->ins_h, p->ins_w, p->ins_last_h, p->ins_last_w, + opd0_sign, res0_sign, p->rshift_bits, 0); + if (ret) + return ret; + + if(p->relu_enable ) + relu8(s8_test_data.depthwise_conv_output_ref, tl_shape_size(&p->ofmap->shape, p->ofmap->fmt)); + + return ret; +} + +static int pooling_ih_ext(depthwise_conv_param_t *p, int ih) +{ + int ins = p->ins_h; + int ins_last = p->ins_last_h; + int pad = p->pad_top + p->pad_bottom; + return (ih - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_iw_ext(depthwise_conv_param_t *p, int iw) +{ + int ins = p->ins_w; + int ins_last = p->ins_last_w; + int pad = p->pad_left + p->pad_right; + return (iw - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_oh(depthwise_conv_param_t *p, int ih, int kh) +{ + int ih_ext = pooling_ih_ext(p, ih); + return (ih_ext - kh) / p->stride_h + 1; +} + +static int pooling_ow(depthwise_conv_param_t *p, int iw, int kw) +{ + int iw_ext = pooling_iw_ext(p, iw); + return (iw_ext - kw) / p->stride_w + 1; +} + +static void free_depthwise_param( + cvk_context_t *cvk_ctx, + depthwise_conv_param_t *p) +{ + if (p->bias) + { + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->bias); + } + if (p->weight) + { + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->weight); + } + if (p->ifmap) + { + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->ifmap); + } + if (p->ofmap) + { + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->ofmap); + } +} + +static void put_bias_tensor( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + int16_t data[]) +{ + int c = tl->shape.c; + + uint8_t *lo_hi = (uint8_t *)malloc(2 * c); + if (!lo_hi) + return; + + for (int i = 0; i < c; i++) { + lo_hi[i] = data[i] & 0xff; + lo_hi[i + c] = (data[i] >> 8) & 0xff; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl, (uint8_t *)lo_hi); + + free(lo_hi); +} + +static depthwise_conv_param_t random_depthwise_param(cvk_context_t *cvk_ctx) +{ + srand(clock()); + depthwise_conv_param_t p; + int retry_cnt = 100; + + for (int i = 0; i < retry_cnt; i++) { + int using_bias = 0; + int n = 1; + int c = 250; // 1000 -> 250 for 180x + int ih = 2; + int iw = 8; + int kh = 1; + int kw = 1; + int opd0_sign = 0; + + memset(&p, 0, sizeof(p)); + p.ins_h = rand() % kh; + p.ins_w = rand() % kw; + p.ins_last_h = rand() % kh; + p.ins_last_w = rand() % kw; + p.stride_h = rand() % kh + 1; + p.stride_w = rand() % kw + 1; + p.pad_top = 0; + p.pad_bottom = 0; + p.pad_left = 0; + p.pad_right = 0; + p.rshift_bits = 2; + int oh = pooling_oh(&p, ih, kh); + int ow = pooling_ow(&p, iw, kw); + cvk_tl_shape_t ofmap_shape; + ofmap_shape.n = n; + ofmap_shape.c = c; + ofmap_shape.h = oh; + ofmap_shape.w = ow; + cvk_tl_shape_t ifmap_shape; + ifmap_shape.n = n; + ifmap_shape.c = c; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + cvk_tl_shape_t weight_shape; + weight_shape.n = 1; + weight_shape.c = c; + weight_shape.h = kh; + weight_shape.w = kw; + cvk_tl_shape_t bias_shape; + bias_shape.n = 2; + bias_shape.c = c; + bias_shape.h = 1; + 
bias_shape.w = 1; + p.relu_enable = 1; + /*test case ref does not support dilation !=1*/ + p.dilation_w = 1; + p.dilation_h = 1; + cvk_fmt_t ifmt = opd0_sign ? CVK_FMT_I8: CVK_FMT_U8; + + p.ofmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ofmap_shape, CVK_FMT_I8, 1); + skip_tensor_lmem_size(cvk_ctx, p.ofmap); + p.ifmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ifmap_shape, ifmt, 1); + skip_tensor_lmem_size(cvk_ctx, p.ifmap); + p.weight = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, weight_shape, CVK_FMT_I8, 1); + skip_tensor_lmem_size(cvk_ctx, p.weight); + p.bias = NULL; + if (using_bias) + { + p.bias = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, bias_shape, CVK_FMT_I8, 0); + skip_tensor_lmem_size(cvk_ctx, p.bias); + } + if ((kh > pooling_ih_ext(&p, ih)) + || (kw > pooling_iw_ext(&p, iw)) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || !p.ofmap + || !p.ifmap + || !p.weight + || (using_bias && !p.bias)) { + printf("retry init_pooling_param\n"); + free_depthwise_param(cvk_ctx, &p); + } else + break; + } + + return p; +} + + +static int test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + depthwise_conv_param = random_depthwise_param(cvk_ctx); + + int8_t *input = alloc_input(&depthwise_conv_param); + int8_t *weight = alloc_weight(&depthwise_conv_param); + int16_t *bias = NULL; + if (depthwise_conv_param.bias) + bias = alloc_bias(&depthwise_conv_param); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, depthwise_conv_param.ifmap, (uint8_t *)input); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, depthwise_conv_param.weight, (uint8_t *)weight); + if (depthwise_conv_param.bias) + put_bias_tensor(rt_handle, cvk_ctx, depthwise_conv_param.bias, bias); + + int ret = generate_results(&depthwise_conv_param, input, weight, bias); + + free(input); + free(weight); + free(bias); + + return ret; +} + +static void l2g_tensor_copy_cw_transposed_ref( + l2g_cw_param_t *p, uint8_t ref_data[], uint8_t src_data[]) +{ + cvk_tl_shape_t s = p->src->shape; + uint32_t n = s.n; + uint32_t c = s.c; + uint32_t h = s.h; + uint32_t w = s.w; + + for (uint32_t ni = 0; ni < n; ni++) { + for (uint32_t ci = 0; ci < c; ci++) { + for (uint32_t hi = 0; hi < h; hi++) { + for (uint32_t wi = 0; wi < w; wi++) { + uint32_t src_i = ni * c * h * w + ci * h * w + hi * w + wi; + uint32_t dst_i = ni * c * h * w + wi * h * c + hi * c + ci; + ref_data[dst_i] = src_data[src_i]; + } + } + } + } +} + +static void test_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, l2g_cw_param_t *p) +{ + uint64_t size = tl_shape_size(&p->src->shape, p->src->fmt); + + s8_test_data.l2g_cw_src = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!s8_test_data.l2g_cw_src) + return; + + for (uint64_t i = 0; i < size; i++) + s8_test_data.l2g_cw_src[i] = rand()%0x100; + + s8_test_data.l2g_cw_output_ref = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!s8_test_data.l2g_cw_output_ref) + return; + + l2g_tensor_copy_cw_transposed_ref(p, s8_test_data.l2g_cw_output_ref, s8_test_data.l2g_cw_src); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, p->src, s8_test_data.l2g_cw_src); +} + +static void destroy_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, l2g_cw_param_t *p) +{ + free_tensor_dev_mem(rt_handle, p->dst); + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->src); +} + +static void test_l2g_cw_transpose(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, l2g_cw_param_t *p) +{ + cvk_tl_shape_t src_shape = {1, 0x40, 1, 0x020}; // 0x100 -> 0x40 for 180x + 
cvk_tg_shape_t dst_shape = {1, 0x020, 1, 0x40}; // 0x100 -> 0x40 for 180x + + p->src = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, src_shape, CVK_FMT_I8, 1); + p->dst = alloc_tensor_dev_mem(rt_handle, cvk_ctx, dst_shape, CVK_FMT_I8); + skip_tensor_lmem_size(cvk_ctx, p->src); + test_param_l2g(rt_handle, cvk_ctx, p); +} + +static void g2l_matrix_copy_row_col_transposed_ref( + g2l_matrix_param_t *p, uint8_t ref_data[], uint8_t src_data[]) +{ + uint64_t row = p->src->shape.row; + uint64_t col = p->src->shape.col; + + for (uint64_t ri = 0; ri < row; ri++) { + for (uint64_t ci = 0; ci < col; ci++) { + uint64_t src_i = ri * col + ci; + uint64_t dst_i = ci * row + ri; + ref_data[dst_i] = src_data[src_i]; + } + } +} + +static void test_param_g2l(CVI_RT_HANDLE rt_handle, g2l_matrix_param_t *p) +{ + uint64_t size = ml_shape_size(&p->dst->shape, p->dst->fmt); + + s8_test_data.g2l_matrix_src = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!s8_test_data.g2l_matrix_src) + return; + + for (uint64_t i = 0; i < size; i++) + s8_test_data.g2l_matrix_src[i] = rand()%0x100; + + s8_test_data.g2l_matrix_output_ref = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!s8_test_data.g2l_matrix_output_ref) + return; + + g2l_matrix_copy_row_col_transposed_ref(p, s8_test_data.g2l_matrix_output_ref, s8_test_data.g2l_matrix_src); + + matrix_copy_s2d(rt_handle, p->src, s8_test_data.g2l_matrix_src); +} + +static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, g2l_matrix_param_t *p) +{ + free_matrix_dev_mem(rt_handle, p->src); + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->dst); +} + + +static void test_g2l_matrix_transpose(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, g2l_matrix_param_t *p) +{ + //g2l_matrix_param_t p; + /* + * Matrix transpose must be n/c stride alignment + * for TDMA limitation + */ + cvk_mg_shape_t src_shape={0x100, 0x20}; + cvk_ml_shape_t dst_shape={0x20, 0x10, 0x10, 0x100}; + + int dst_align = 1; + cvk_fmt_t fmt = CVK_FMT_I8; + p->src = alloc_matrix_dev_mem(rt_handle, src_shape, fmt); + p->dst = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, dst_shape, fmt, dst_align); + skip_matrix_lmem_size(cvk_ctx, p->dst); + test_param_g2l(rt_handle, p); +} + +static void l2l_tensor_copy_ref(l2l_tensor_copy_param_t *p, uint8_t ref_data[], uint8_t src_data[]) +{ + uint64_t size = tl_shape_size(&p->src->shape, p->src->fmt); + + for (uint64_t i = 0; i < size; i++) + ref_data[i] = src_data[i]; +} + +static void test_l2l_param(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, l2l_tensor_copy_param_t *p) +{ + uint64_t size = tl_shape_size(&p->src->shape, p->src->fmt); + + s8_test_data.l2l_tensor_src = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!s8_test_data.l2l_tensor_src) + return; + + for (uint64_t i = 0; i < size; i++) + s8_test_data.l2l_tensor_src[i] = rand()%0x100; + + s8_test_data.l2l_tensor_output_ref = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!s8_test_data.l2l_tensor_output_ref) + return; + + l2l_tensor_copy_ref(p, s8_test_data.l2l_tensor_output_ref, s8_test_data.l2l_tensor_src); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, p->src, s8_test_data.l2l_tensor_src); +} + +static void destroy_param_l2l(cvk_context_t *cvk_ctx, l2l_tensor_copy_param_t *p) +{ + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->dst); + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->src); +} + +static void test_l2l_tensor_copy(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, l2l_tensor_copy_param_t *p) +{ + 
//cvk_tl_shape_t src_shape = {1, 0x10, 0x1, 0x100}; + //cvk_tl_shape_t dst_shape = {1, 0x10, 0x1, 0x100}; + cvk_tl_shape_t src_shape = {1, 0x8, 0x1, 0x40}; // for 180x + cvk_tl_shape_t dst_shape = {1, 0x8, 0x1, 0x40}; // for 180x + + p->src = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, src_shape, CVK_FMT_I8, 1); + skip_tensor_lmem_size(cvk_ctx, p->src); + p->dst = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, dst_shape, CVK_FMT_I8, 1); + skip_tensor_lmem_size(cvk_ctx, p->dst); + test_l2l_param(rt_handle, cvk_ctx, p); +} + +void get_result(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + s8_test_data.depthwise_conv_output = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, depthwise_conv_param.ofmap); + s8_test_data.l2g_cw_output = tensor_copy_d2s(rt_handle, l2g_cw_param.dst); + s8_test_data.g2l_matrix_output = matrix_copy_l2g_d2s(rt_handle, cvk_ctx, g2l_matrix_param.dst); + s8_test_data.l2l_tensor_output = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, l2l_tensor_copy_param.dst); +} + +int check_result(void) +{ + + int cmp_res = array_cmp_int8( + "Comparing results ...\n", s8_test_data.depthwise_conv_output_ref, (int8_t *)s8_test_data.depthwise_conv_output, + tl_shape_size(&depthwise_conv_param.ofmap->shape, depthwise_conv_param.ofmap->fmt)); + if (cmp_res != 0) { + printf("Comparison FAILED!!!\n"); + return -1; + } + + for (uint64_t i = 0; i < tl_shape_size(&l2g_cw_param.src->shape, l2g_cw_param.src->fmt); i++) { + if (s8_test_data.l2g_cw_output[i] != s8_test_data.l2g_cw_output_ref[i]) { + fprintf(stderr, "l2g_cw comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, s8_test_data.l2g_cw_output[i], s8_test_data.l2g_cw_output_ref[i]); + return -1; + } + } + for (uint64_t i = 0; i < ml_shape_size(&g2l_matrix_param.dst->shape, g2l_matrix_param.dst->fmt); i++) { + if (s8_test_data.g2l_matrix_output[i] != s8_test_data.g2l_matrix_output_ref[i]) { + fprintf(stderr, "g2l_matrix comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, s8_test_data.g2l_matrix_output[i], s8_test_data.g2l_matrix_output_ref[i]); + return -1; + } + } + + for (uint64_t i = 0; i < tl_shape_size(&l2l_tensor_copy_param.src->shape, l2l_tensor_copy_param.src->fmt); i++) { + if (s8_test_data.l2l_tensor_output[i] != s8_test_data.l2l_tensor_output_ref[i]) { + fprintf(stderr, "l2l_tensor comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, s8_test_data.l2l_tensor_output[i], s8_test_data.l2l_tensor_output_ref[i]); + return -1; + } + } + + return 0; +} + +void trigger_max_power(cvk_context_t *cvk_ctx) +{ + cvk_ctx->ops->parallel_enable(cvk_ctx); + cvk_ctx->ops->tdma_l2g_tensor_copy_cw_transposed(cvk_ctx, &l2g_cw_param); + cvk_ctx->ops->tdma_g2l_matrix_copy_row_col_transposed(cvk_ctx, &g2l_matrix_param); + cvk_ctx->ops->tdma_l2l_tensor_copy(cvk_ctx, &l2l_tensor_copy_param); + cvk_ctx->ops->tiu_pt_depthwise_convolution(cvk_ctx, &depthwise_conv_param); + cvk_ctx->ops->parallel_disable(cvk_ctx); + CVI_RT_Submit(cvk_ctx); +} + +void free_s8_data() +{ + free(s8_test_data.depthwise_conv_input); + free(s8_test_data.depthwise_conv_weight); + free(s8_test_data.depthwise_conv_bias); + free(s8_test_data.depthwise_conv_output); + free(s8_test_data.depthwise_conv_output_ref); + free(s8_test_data.l2g_cw_src); + free(s8_test_data.l2g_cw_output); + free(s8_test_data.l2g_cw_output_ref); + free(s8_test_data.g2l_matrix_src); + free(s8_test_data.g2l_matrix_output); + free(s8_test_data.g2l_matrix_output_ref); + free(s8_test_data.l2l_tensor_src); + free(s8_test_data.l2l_tensor_output); + free(s8_test_data.l2l_tensor_output_ref); +} + +int main(int argc, 
char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  printf("depthwise max_power test\n");
+
+  ret |= test_pooling(rt_handle, cvk_ctx);
+  test_l2g_cw_transpose(rt_handle, cvk_ctx, &l2g_cw_param);
+  test_g2l_matrix_transpose(rt_handle, cvk_ctx, &g2l_matrix_param);
+  test_l2l_tensor_copy(rt_handle, cvk_ctx, &l2l_tensor_copy_param);
+
+  trigger_max_power(cvk_ctx);
+  get_result(rt_handle, cvk_ctx);
+  ret |= check_result();
+
+  destroy_param_l2l(cvk_ctx, &l2l_tensor_copy_param);
+  destroy_param_g2l(rt_handle, cvk_ctx, &g2l_matrix_param);
+  destroy_param_l2g(rt_handle, cvk_ctx, &l2g_cw_param);
+  free_depthwise_param(cvk_ctx, &depthwise_conv_param);
+  free_s8_data();
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/180x/test_180x_double_conv.c b/cviruntime/test/180x/test_180x_double_conv.c
new file mode 100644
index 000000000..de279cfa0
--- /dev/null
+++ b/cviruntime/test/180x/test_180x_double_conv.c
@@ -0,0 +1,807 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <assert.h>
+#include <time.h>
+
+#include "test_cvikernel_util.h"
+#include "test_native_ref.h"
+
+typedef struct {
+  int random_seed;
+  int input_n;
+  int input_c;
+  int input_h;
+  int input_w;
+  int kw;
+  int kh;
+  int dh;
+  int dw;
+  int pad_top;
+  int pad_bot;
+  int pad_left;
+  int pad_right;
+  int ins_h;
+  int ins_h_last;
+  int ins_w;
+  int ins_w_last;
+  int stride_h;
+  int stride_w;
+  int output_c;
+  int using_bias;
+  int bReLU_EN;
+  int r_shift_m;
+  int opd0_sign;
+  int opd1_sign;
+  int opd2_sign;
+} conv_param_t;
+
+static int index_get(int h, int w1, int w2)
+{
+  return h * w1 + w2;
+}
+
+static int matrix_dot_mult(
+    int8_t *A, int8_t *B, int dim_n, int dim_m,
+    int opd0_sign)
+{
+  int sum = 0;
+  // Loop body reconstructed from context; treating A (the ifmap patch) as
+  // unsigned when opd0_sign == 0 is an assumption.
+  for (int i = 0; i < dim_n; i++) {
+    for (int j = 0; j < dim_m; j++) {
+      int a = opd0_sign ? A[index_get(i, dim_m, j)]
+                        : (uint8_t)A[index_get(i, dim_m, j)];
+      sum += a * B[index_get(i, dim_m, j)];
+    }
+  }
+  return sum;
+}
+
+static int conv_ref(
+    const conv_param_t *p_param,
+    const int8_t *ifmap,
+    const int8_t *weight,
+    const int16_t *bias,
+    int8_t *ofmap)
+{
+  int in = p_param->input_n;
+  int ic = p_param->input_c;
+  int ih = p_param->input_h;
+  int iw = p_param->input_w;
+  int oc = p_param->output_c;
+  int kh = p_param->kh;
+  int kw = p_param->kw;
+  int dh = p_param->dh;
+  int dw = p_param->dw;
+  int stride_h = p_param->stride_h;
+  int stride_w = p_param->stride_w;
+  int pad_top = p_param->pad_top;
+  int pad_bot = p_param->pad_bot;
+  int pad_left = p_param->pad_left;
+  int pad_right = p_param->pad_right;
+  int ins_h = p_param->ins_h;
+  int ins_h_last = p_param->ins_h_last;
+  int ins_w = p_param->ins_w;
+  int ins_w_last = p_param->ins_w_last;
+  int input_sign = p_param->opd0_sign;
+  int r_shift_bits = p_param->r_shift_m;
+  int do_relu = p_param->bReLU_EN;
+
+  int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot);
+  int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right);
+  int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0);
+  int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0);
+
+  int oh = calc_output_hw(ih_ext, kh_ext, stride_h);
+  int ow = calc_output_hw(iw_ext, kw_ext, stride_w);
+
+  int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow);
+  int8_t *i_fmap_pad_ker = (int8_t *)malloc(kh_ext * kw_ext);
+  if (!result || !i_fmap_pad_ker) {
+    free(result);
+    free(i_fmap_pad_ker);
+    return -1;
+  }
+
+  memset(result, 0, sizeof(int) * in * oc * oh * ow);
+
+  int ret = 0;
+
+  int8_t *i_fmap_pad = NULL;
+  int8_t *kernel_after = NULL;
+  for (int n = 0; n < in; ++n) {
+    for (int c = 0; c < oc; ++c) {
+      for (int cc = 0; cc < ic; ++cc) {
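+        // Per input channel: dilate/pad the ifmap and the kernel into their
+        // extended forms, then slide the window and accumulate dot products
+        // into the 32-bit result plane of this output channel.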
fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c]; //bias+c ; + } + } + } + + if (do_relu) + relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + + // ofmap is int8_t, signed + ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow, + &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1, + /*sign_unsign=*/1); + + if (ret) + goto error_release; + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + +error_release: + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static uint8_t * transform_weight(const cvk_tl_shape_t *s, uint8_t before[]) +{ + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint32_t size = ic * oc * kh * kw; + uint8_t *after = (uint8_t *)malloc(size); + if (!after) + return NULL; + + /* + * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) + */ + for (uint32_t oci = 0; oci < oc; oci++) { + for (uint32_t ici = 0; ici < ic; ici++) { + for (uint32_t khi = 0; khi < kh; khi++) { + for (uint32_t kwi = 0; kwi < kw; kwi++) { + uint32_t src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + uint32_t dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + uint8_t *data) +{ +#if 0 + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + uint8_t *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. bm_memcpy_s2d regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. 
+ */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //bmmem_device_t ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = bm_memcpy_s2d(*ctx, ab_dev_mem, transformed_data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, transformed_data); + assert(ret == BM_SUCCESS); + + cvk_tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_I8; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1822_tensor_tgmem_default_stride(tdma_tg.shape, tdma_tg.fmt); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1822_tensor_lmem_default_stride(cvk_ctx, tdma_shape, FMT_I8, 0); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1822_tdma_g2l_tensor_copy(cvk_ctx, &p); + test_submit(ctx); + bmmem_device_free(*ctx, dev_mem); + delete[] transformed_data; +#endif + + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint8_t *transformed_data = transform_weight(s, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(1, oc, kh * kw, ic); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, transformed_data); + + free(transformed_data); +} + +static int8_t * transform_bias(int oc, int16_t before[]) +{ + int8_t *after = (int8_t *)malloc(2 * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = before[i] & 0xff; + after[i + oc] = (before[i] >> 8) & 0xff; + } + return after; +} + +static void put_conv_bias( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + int16_t *data) +{ +#if 0 + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2, oc, 1, 1); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + int8_t *transformed_data = transform_bias(oc, data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (uint8_t *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1822_tensor_tgmem_default_stride(tg.shape, tg.fmt); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_tensor_copy(cvk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, dev_mem); + delete[] transformed_data; +#endif + + int oc = tl->shape.c; + int8_t *transformed_data = transform_bias(oc, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(2, oc, 1, 1); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, (uint8_t *)transformed_data); + + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + 
p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static int8_t * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + + int8_t *buf = (int8_t *)malloc(sizeof(int8_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static int8_t * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + int8_t *buf = (int8_t *)malloc(sizeof(int8_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static int16_t * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + int16_t *bias = (int16_t *)malloc(sizeof(int16_t) * oc); + if (!bias) + return NULL; + + for (int i = 0; i < oc; i++) + bias[i] = rand() % 65536 - 32768; + + return bias; +} + +static cvk_tl_t * conv_ifmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd0_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 1); +} + +static cvk_tl_t * conv_weight_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd1_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static cvk_tl_t * conv_ofmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, CVK_FMT_I8, 1); +} + +static cvk_tl_t * conv_bias_tensor( + cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd2_sign? 
CVK_FMT_I8: CVK_FMT_U8;
+  cvk_tl_shape_t s;
+  s.n = 2;
+  s.c = p->output_c;
+  s.h = 1;
+  s.w = 1;
+  return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0);
+}
+
+static int conv_param_is_ok(const conv_param_t *p)
+{
+  int kh_ext = conv_kh_ext(p);
+  int kw_ext = conv_kw_ext(p);
+  int ih_ext = conv_ih_ext(p);
+  int iw_ext = conv_iw_ext(p);
+
+  if ((kh_ext > ih_ext)
+      || (kw_ext > iw_ext)
+      || (kh_ext <= p->pad_top)
+      || (kh_ext <= p->pad_bot)
+      || (kw_ext <= p->pad_left)
+      || (kw_ext <= p->pad_right)
+      || (p->pad_top >= (1 << 4))
+      || (p->pad_bot >= (1 << 4))
+      || (p->pad_left >= (1 << 4))
+      || (p->pad_right >= (1 << 4))) {
+    return 0;
+  }
+
+  return 1;
+}
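+
+// The (1 << 4) bounds above suggest a 4-bit pad field in the TIU descriptor,
+// so every pad must stay below 16; the kh/kw comparisons simply reject
+// kernels larger than the dilated, padded input.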
+static int bmk_conv_param_alloc_ok(
+    const cvk_tiu_pt_convolution_param_t *p,
+    const conv_param_t *param)
+{
+  if (!p->ifmap || !p->ofmap || !p->weight)
+    return 0;
+
+  if (param->using_bias)
+    if (!p->bias)
+      return 0;
+
+  return 1;
+}
+
+static void make_bmk_conv_param(
+    cvk_context_t *cvk_ctx,
+    cvk_tiu_pt_convolution_param_t *dst,
+    const conv_param_t *p)
+{
+  memset(dst, 0, sizeof(*dst));
+
+  dst->ins_h = p->ins_h;
+  dst->ins_last_h = p->ins_h_last;
+  dst->ins_w = p->ins_w;
+  dst->ins_last_w = p->ins_w_last;
+  dst->pad_top = p->pad_top;
+  dst->pad_bottom = p->pad_bot;
+  dst->pad_left = p->pad_left;
+  dst->pad_right = p->pad_right;
+  dst->stride_h = p->stride_h;
+  dst->stride_w = p->stride_w;
+  dst->dilation_h = p->dh;
+  dst->dilation_w = p->dw;
+  dst->relu_enable = p->bReLU_EN;
+  dst->rshift_bits = p->r_shift_m;
+
+  dst->ifmap = conv_ifmap_tensor(cvk_ctx, p);
+  dst->weight = conv_weight_tensor(cvk_ctx, p);
+  dst->ofmap = conv_ofmap_tensor(cvk_ctx, p);
+  dst->bias = NULL;
+  dst->ps32_mode = 0;
+  if (p->using_bias)
+    dst->bias = conv_bias_tensor(cvk_ctx, p);
+
+  dst->w_is_const = 0;
+}
+
+static void free_bmk_conv_param(
+    cvk_context_t *cvk_ctx,
+    cvk_tiu_pt_convolution_param_t *r,
+    const conv_param_t *p)
+{
+  if (p->using_bias && r->bias)
+    cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->bias);
+  if (r->ofmap)
+    cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ofmap);
+  if (r->weight)
+    cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->weight);
+  if (r->ifmap)
+    cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ifmap);
+}
+
+static void init_conv_param(conv_param_t *p)
+{
+  printf("init_conv_param\n");
+  p->random_seed = clock();
+  srand(p->random_seed);
+
+retry:
+  p->input_n = rand() % 5 + 1;
+  p->input_c = (rand() % (5 * 32) / 2) * 2 + 8;
+  p->kh = rand() % 7 + 1;
+  p->kw = rand() % 7 + 1;
+  p->input_h = rand() % 40 + p->kh;
+  p->input_w = rand() % 40 + p->kw;
+  p->output_c = rand() % 10 + 3;
+  p->stride_h = rand() % (p->kh) + 1;
+  p->stride_w = rand() % (p->kw) + 1;
+  p->ins_h = rand() % p->kh;
+  p->ins_w = rand() % p->kw;
+  p->ins_h_last = rand() % p->kh;
+  p->ins_w_last = rand() % p->kw;
+  p->dh = rand() % 3 + 1;
+  p->dw = rand() % 3 + 1;
+
+  int kh_ext = conv_kh_ext(p);
+  int kw_ext = conv_kw_ext(p);
+  p->pad_top = rand() % kh_ext;
+  p->pad_bot = rand() % kh_ext;
+  p->pad_left = rand() % kw_ext;
+  p->pad_right = rand() % kw_ext;
+
+  if (!conv_param_is_ok(p)) {
+    printf("retry init_conv_param\n");
+    goto retry;
+  }
+
+  p->using_bias = rand() % 2;
+  p->r_shift_m = rand() % 8;
+  p->bReLU_EN = rand() % 2;
+
+  p->opd0_sign = rand() % 2;
+  p->opd1_sign = 1;
+  p->opd2_sign = 1;
+
+  assert(p->opd1_sign == 1 && p->opd2_sign == 1);
+
+  int ih_ext = conv_ih_ext(p);
+  int iw_ext = conv_iw_ext(p);
+  assert(ih_ext >= kh_ext);
+  assert(iw_ext >= kw_ext);
+}
+
+static void print_conv_param(const conv_param_t *p)
+{
+  printf("%s\n", "Conv parameters:");
+  printf(" %s%d;\n", "p->random_seed = ", p->random_seed);
+
+  printf(" %s%d;\n", "p->input_n = ", p->input_n);
+  printf(" %s%d;\n", "p->input_c = ", p->input_c);
+  printf(" %s%d;\n", "p->input_h = ", p->input_h);
+  printf(" %s%d;\n", "p->input_w = ", p->input_w);
+  printf(" %s%d;\n", "p->output_c = ", p->output_c);
+
+  printf(" %s%d;\n", "p->kh = ", p->kh);
+  printf(" %s%d;\n", "p->kw = ", p->kw);
+  printf(" %s%d;\n", "p->dh = ", p->dh);
+  printf(" %s%d;\n", "p->dw = ", p->dw);
+  printf(" %s%d;\n", "p->pad_top = ", p->pad_top);
+  printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot);
+  printf(" %s%d;\n", "p->pad_left = ", p->pad_left);
+  printf(" %s%d;\n", "p->pad_right = ", p->pad_right);
+  printf(" %s%d;\n", "p->stride_h = ", p->stride_h);
+  printf(" %s%d;\n", "p->stride_w = ", p->stride_w);
+  printf(" %s%d;\n", "p->ins_w = ", p->ins_w);
+  printf(" %s%d;\n", "p->ins_h = ", p->ins_h);
+  printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last);
+  printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last);
+
+  printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m);
+  printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign);
+  printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign);
+  printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign);
+  printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN);
+  printf(" %s%d;\n", "p->using_bias = ", p->using_bias);
+
+  printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p));
+  printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p));
+  printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p));
+  printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p));
+  printf(" %s%d\n", "output_h = ", conv_oh(p));
+  printf(" %s%d\n", "output_w = ", conv_ow(p));
+}
+
+// Calculate the right shift value, m.
+// Steps:
+// 1. Take the abs() of each weight;
+// 2. Sum all the abs() values within one kernel;
+// 3. Take log2 of each sum;
+// 4. Round downward.
+// After computing every kernel's r_shift value, sort them and take the
+// middle one.
+static int calc_rshift_m(const conv_param_t *p, int8_t *weight)
+{
+  int kernel_cnt = p->output_c * p->input_c;
+  int kernel_size = p->kh * p->kw;
+  int *kernel_shifts = (int *)malloc(sizeof(int) * kernel_cnt);
+  if (!kernel_shifts)
+    return 1;
+
+  memset(kernel_shifts, 0, sizeof(int) * kernel_cnt);
+
+  // Part 1:
+  // Get right shift value for each kernel
+  int sum = 0;
+  for (int i = 0; i < kernel_cnt; i++) {
+    // Step 1 & 2: Get the sum of abs()
+    for (int j = 0; j < kernel_size; j++) {
+      sum += (int)(*weight < 0 ? -(*weight) : (*weight));
+      weight++;
+    }
+    // Step 3 & 4: log2 and downward rounding
+    // (the while loop also drains sum back to 0 for the next kernel)
+    sum >>= 1;
+    while (sum) {
+      sum >>= 1;
+      kernel_shifts[i]++;
+    }
+  }
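+
+  // Worked example: with kernel_cnt = 4 and per-kernel shifts {2, 3, 3, 5},
+  // the histogram walk below accumulates tag[] until it reaches
+  // (kernel_cnt - 1) >> 1 = 1, which happens at bucket 2, so the function
+  // returns 2 (the lower median).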
+  // Part 2:
+  // Find the middle of all the values
+  int tag[32] = {0};
+  for (int cnt = 0; cnt < kernel_cnt; cnt++) {
+    tag[kernel_shifts[cnt]]++;
+  }
+
+  int rshift_m = 0;
+  int mid = 0;
+  do {
+    mid += tag[rshift_m++];
+  } while (mid < (kernel_cnt - 1) >> 1);
+
+  free(kernel_shifts);
+
+  return rshift_m - 1;
+}
+
+static int test_conv(
+    conv_param_t *p_param, CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx)
+{
+  int ret = 0;
+  int8_t *input = alloc_input(p_param);
+  int8_t *weight = alloc_weight(p_param);
+  int16_t *bias = alloc_bias(p_param);
+  int8_t *output_ref = (int8_t *)malloc(sizeof(int8_t) * conv_output_size(p_param));
+  if (!input || !weight || !bias || !output_ref) {
+    ret = -1;
+    goto fail_exit;
+  }
+
+  p_param->r_shift_m = calc_rshift_m(p_param, weight);
+
+  ret = conv_ref(p_param, input, weight, bias, output_ref);
+  if (ret)
+    goto fail_exit;
+
+  cvk_tiu_pt_convolution_param_t conv_param;
+  make_bmk_conv_param(cvk_ctx, &conv_param, p_param);
+
+  int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param);
+  if (tl_alloc_success) {
+    tensor_copy_s2d_g2l(rt_handle, cvk_ctx, conv_param.ifmap, (uint8_t *)input);
+    put_conv_weight(rt_handle, cvk_ctx, conv_param.weight, (uint8_t *)weight);
+    if (p_param->using_bias)
+      put_conv_bias(rt_handle, cvk_ctx, conv_param.bias, bias);
+    cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &conv_param);
+    uint8_t *output = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, conv_param.ofmap);
+
+    ret = array_cmp_int8(
+        "Comparing results ...\n",
+        output_ref, (int8_t *)output, conv_output_size(p_param));
+
+    if (ret) {
+      print_conv_param(p_param);
+      printf("Comparison FAILED\n");
+    }
+    free(output);
+  }
+
+  free_bmk_conv_param(cvk_ctx, &conv_param, p_param);
+
+fail_exit:
+  free(input);
+  free(weight);
+  free(bias);
+  free(output_ref);
+
+  return ret;
+}
+
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  for (int i = 0; i < 5; i++) {
+    printf("random_test_conv iteration: %d\n", i);
+    conv_param_t test_conv_param;
+    init_conv_param(&test_conv_param);
+    ret |= test_conv(&test_conv_param, rt_handle, cvk_ctx);
+    if (!test_conv_param.using_bias)
+      test_conv_param.using_bias = 1;
+    if (test_conv_param.output_c <= 32)
+      test_conv_param.output_c += 32;
+    ret |= test_conv(&test_conv_param, rt_handle, cvk_ctx);
+  }
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/180x/test_180x_double_conv_ps32.c b/cviruntime/test/180x/test_180x_double_conv_ps32.c
new file mode 100644
index 000000000..735bdcf50
--- /dev/null
+++ b/cviruntime/test/180x/test_180x_double_conv_ps32.c
@@ -0,0 +1,1506 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include "test_cvikernel_util.h"
+#include "test_native_ref.h"
+
+typedef struct {
+  int random_seed;
+  int input_n;
+  int input_c;
+  int input_h;
+  int input_w;
+  int kw;
+  int kh;
+  int dh;
+  int dw;
+  int pad_top;
+  int pad_bot;
+  int pad_left;
+  int pad_right;
+  int ins_h;
+  int ins_h_last;
+  int ins_w;
+  int ins_w_last;
+  int stride_h;
+  int stride_w;
+  int output_c;
+  int using_bias;
+  int bReLU_EN;
+  int r_shift_m;
+  int opd0_sign;
+  int opd1_sign;
+  int opd2_sign;
+} conv_param_t;
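+
+// The ps32 reference helpers below model multi-pass 32-bit accumulation:
+// the first pass computes raw 32-bit partial sums and stores them as four
+// byte-planes spaced bstride apart in the ofmap buffer, intermediate passes
+// read the planes back, accumulate more products, and rewrite them, and the
+// final pass adds bias, applies ReLU and the right shift, then saturates to
+// int8. conv_ref further below is the ordinary single-pass reference.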
+static int index_get(int h, int w1, int w2)
+{
+  return h * w1 + w2;
+}
+
+static int matrix_dot_mult(
+    int8_t *A, int8_t *B, int dim_n, int dim_m,
+    int opd0_sign)
+{
+  int sum = 0;
+  // Loop body reconstructed from context; treating A as unsigned when
+  // opd0_sign == 0 is an assumption.
+  for (int i = 0; i < dim_n; i++) {
+    for (int j = 0; j < dim_m; j++) {
+      int a = opd0_sign ? A[index_get(i, dim_m, j)]
+                        : (uint8_t)A[index_get(i, dim_m, j)];
+      sum += a * B[index_get(i, dim_m, j)];
+    }
+  }
+  return sum;
+}
+
+// Name and signature assumed by analogy with ps32_m1_conv_ref and
+// ps32_m3_conv_ref; this first pass only writes the 32-bit partial sums.
+static int ps32_m2_conv_ref(
+    const conv_param_t *p_param,
+    const int8_t *ifmap,
+    const int8_t *weight,
+    int8_t *ofmap)
+{
+  int in = p_param->input_n;
+  int ic = p_param->input_c;
+  int ih = p_param->input_h;
+  int iw = p_param->input_w;
+  int oc = p_param->output_c;
+  int kh = p_param->kh;
+  int kw = p_param->kw;
+  int dh = p_param->dh;
+  int dw = p_param->dw;
+  int stride_h = p_param->stride_h;
+  int stride_w = p_param->stride_w;
+  int pad_top = p_param->pad_top;
+  int pad_bot = p_param->pad_bot;
+  int pad_left = p_param->pad_left;
+  int pad_right = p_param->pad_right;
+  int ins_h = p_param->ins_h;
+  int ins_h_last = p_param->ins_h_last;
+  int ins_w = p_param->ins_w;
+  int ins_w_last = p_param->ins_w_last;
+  int input_sign = p_param->opd0_sign;
+
+  int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot);
+  int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right);
+  int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0);
+  int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0);
+
+  int oh = calc_output_hw(ih_ext, kh_ext, stride_h);
+  int ow = calc_output_hw(iw_ext, kw_ext, stride_w);
+
+  int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow);
+  int8_t *i_fmap_pad_ker = (int8_t *)malloc(kh_ext * kw_ext);
+  if (!result || !i_fmap_pad_ker) {
+    free(result);
+    free(i_fmap_pad_ker);
+    return -1;
+  }
+
+  memset(result, 0, sizeof(int) * in * oc * oh * ow);
+
+  int ret = 0;
+
+  int8_t *i_fmap_pad = NULL;
+  int8_t *kernel_after = NULL;
+  uint32_t bstride = in * oc * oh * ow;
+
+  for (int n = 0; n < in; ++n) {
+    for (int c = 0; c < oc; ++c) {
+      for (int cc = 0; cc < ic; ++cc) {
+        fill_pad_fmap_int8(
+            (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0,
+            pad_left, pad_right, pad_top, pad_bot,
+            ins_h, ins_w, ins_h_last, ins_w_last, ih, iw);
+
+        //kernel_dilation(
+        fill_pad_fmap_int8(
+            (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0,
+            0, 0, 0, 0,  // no padding
+            dh - 1, dw - 1, 0, 0,
+            kh, kw);
+
+        for (int ph = 0; ph < oh; ++ph) {
+          for (int pw = 0; pw < ow; ++pw) {
+            for (int idxh = 0; idxh < kh_ext; ++idxh)
+              for (int idxw = 0; idxw < kw_ext; ++idxw) {
+                i_fmap_pad_ker[idxh * kw_ext + idxw] =
+                    i_fmap_pad[(idxh + ph*stride_h) * iw_ext +
+                               idxw + pw*stride_w];
+              }
+            result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] +=
+                matrix_dot_mult(i_fmap_pad_ker, kernel_after,
+                                kh_ext, kw_ext, input_sign);
+          }
+        }
+      }
+    } //end for (int c = 0; c < oc; ++c)
+  } //end for (int n = 0; n < in; n++)
+
+  for (int i = 0; i < in * oc * oh * ow; i++)
+    ofmap[i] = result[i];
+
+  for (int i = 0; i < in * oc * oh * ow; i++)
+    ofmap[bstride + i] = result[i] >> 8;
+
+  for (int i = 0; i < in * oc * oh * ow; i++)
+    ofmap[2 * bstride + i] = result[i] >> 16;
+
+  for (int i = 0; i < in * oc * oh * ow; i++)
+    ofmap[3 * bstride + i] = result[i] >> 24;
+
+  free(i_fmap_pad);
+  free(kernel_after);
+  free(i_fmap_pad_ker);
+  free(result);
+
+  return ret;
+}
+
+static int ps32_m1_conv_ref(
+    const conv_param_t *p_param,
+    const int8_t *ifmap,
+    const int8_t *weight,
+    const int16_t *bias,
+    int8_t *ofmap)
+{
+  int in = p_param->input_n;
+  int ic = p_param->input_c;
+  int ih = p_param->input_h;
+  int iw = p_param->input_w;
+  int oc = p_param->output_c;
+  int kh = p_param->kh;
+  int kw = p_param->kw;
+  int dh = p_param->dh;
+  int dw = p_param->dw;
+  int stride_h = p_param->stride_h;
+  int stride_w = p_param->stride_w;
+  int pad_top = p_param->pad_top;
+  int pad_bot = p_param->pad_bot;
+  int
pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + int r_shift_bits = p_param->r_shift_m; + int do_relu = p_param->bReLU_EN; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + int8_t *i_fmap_pad_ker = (int8_t *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return -1; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = 0; + + int8_t *i_fmap_pad = NULL; + int8_t *kernel_after = NULL; + + uint32_t bstride = in * oc * oh * ow; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] = (uint8_t)ofmap[i]; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] |= (uint8_t)ofmap[bstride + i] << 8; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] |= (uint8_t)ofmap[2 * bstride + i] << 16; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] |= (uint8_t)ofmap[3 * bstride + i] << 24; + + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c]; //bias+c ; + } + } + } + + if (do_relu) + relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + + // ofmap is int8_t, signed + ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow, + &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1, + /*sign_unsign=*/1); + + if (ret) + goto error_release; + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + +error_release: + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + + +static int ps32_m3_conv_ref( + const conv_param_t *p_param, + const int8_t *ifmap, + const int8_t *weight, + int8_t *ofmap) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = 
p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + int8_t *i_fmap_pad_ker = (int8_t *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return -1; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = 0; + + int8_t *i_fmap_pad = NULL; + int8_t *kernel_after = NULL; + + if (!result || !i_fmap_pad_ker) { + ret = -1; + goto fail_exit; + } + + uint32_t bstride = in * oc * oh * ow; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] = (uint8_t)ofmap[i]; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] |= (uint8_t)ofmap[bstride + i] << 8; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] |= (uint8_t)ofmap[2 * bstride + i] << 16; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] |= (uint8_t)ofmap[3 * bstride + i] << 24; + + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[i] = result[i]; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[bstride + i] = result[i] >> 8; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[2 * bstride + i] = result[i] >> 16; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[3 * bstride + i] = result[i] >> 24; + +fail_exit: + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static int conv_ref( + const conv_param_t *p_param, + const int8_t *ifmap, + const int8_t *weight, + const int16_t *bias, + int8_t *ofmap) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int 
input_sign = p_param->opd0_sign; + int r_shift_bits = p_param->r_shift_m; + int do_relu = p_param->bReLU_EN; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + int8_t *i_fmap_pad_ker = (int8_t *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return -1; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = 0; + + int8_t *i_fmap_pad = NULL; + int8_t *kernel_after = NULL; + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c]; //bias+c ; + } + } + } + + if (do_relu) + relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + + // ofmap is int8_t, signed + ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow, + &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1, + /*sign_unsign=*/1); + + if (ret) + goto error_release; + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + +error_release: + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static uint8_t * transform_weight(const cvk_tl_shape_t *s, uint8_t before[]) +{ + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint32_t size = ic * oc * kh * kw; + uint8_t *after = (uint8_t *)malloc(size); + + /* + * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) + */ + for (uint32_t oci = 0; oci < oc; oci++) { + for (uint32_t ici = 0; ici < ic; ici++) { + for (uint32_t khi = 0; khi < kh; khi++) { + for (uint32_t kwi = 0; kwi < kw; kwi++) { + uint32_t src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + uint32_t dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + uint8_t *data) +{ +#if 0 + const tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + uint8_t *transformed_data = transform_weight(s, data); + + /* we 
put weight to region 1. bm_memcpy_s2d regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. + */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //bmmem_device_t ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = bm_memcpy_s2d(*ctx, ab_dev_mem, transformed_data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, transformed_data); + assert(ret == BM_SUCCESS); + + tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_I8; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1822_tensor_tgmem_default_stride(tdma_tg.shape, tdma_tg.fmt); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1822_tensor_lmem_default_stride(bk_ctx, tdma_shape, FMT_I8, 0); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1822_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + bmmem_device_free(*ctx, dev_mem); + delete[] transformed_data; +#endif + + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint8_t *transformed_data = transform_weight(s, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(1, oc, kh * kw, ic); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, transformed_data); + + free(transformed_data); +} + +static int8_t * transform_bias(int oc, int16_t before[]) +{ + int8_t *after = (int8_t *)malloc(2 * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = before[i] & 0xff; + after[i + oc] = (before[i] >> 8) & 0xff; + } + return after; +} + +static void put_conv_bias( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + int16_t *data) +{ +#if 0 + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2, oc, 1, 1); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + int8_t *transformed_data = transform_bias(oc, data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (uint8_t *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1822_tensor_tgmem_default_stride(tg.shape, tg.fmt); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, dev_mem); + delete[] transformed_data; +#endif + + int oc = tl->shape.c; + int8_t *transformed_data = transform_bias(oc, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(2, oc, 1, 1); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, (uint8_t *)transformed_data); + + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) 
* p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static int8_t * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + + int8_t *buf = (int8_t *)malloc(sizeof(int8_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static int8_t * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + int8_t *buf = (int8_t *)malloc(sizeof(int8_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static int16_t * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + int16_t *bias = (int16_t *)malloc(sizeof(int16_t) * oc); + if (!bias) + return NULL; + + for (int i = 0; i < oc; i++) + bias[i] = rand() % 65536 - 32768; + + return bias; +} + +static cvk_tl_t * conv_ifmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd0_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 1); +} + +static cvk_tl_t * conv_weight_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd1_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static cvk_tl_t * conv_ofmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_tl_shape_t s; + s.n = p->input_n * 4; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + cvk_tl_t *tl = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, CVK_FMT_I8, 1); + if (tl) + tl->shape.n = p->input_n; + return tl; +} + +static cvk_tl_t * conv_bias_tensor( + cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd2_sign? 
CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const cvk_tiu_pt_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + if(p->ps32_mode==1) + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param_ps32( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *dst, + const conv_param_t *p, uint32_t ps32_mode) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + if(ps32_mode==2) + { + dst->ifmap = conv_ifmap_tensor(cvk_ctx, p); + dst->weight = conv_weight_tensor(cvk_ctx, p); + dst->ofmap = conv_ofmap_tensor(cvk_ctx, p); + } + + dst->ps32_mode = ps32_mode; + + dst->bias = NULL; + dst->relu_enable = 0; + dst->rshift_bits = 0; + if(ps32_mode==1) + { + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + // only mode=1 can use bias + if (p->using_bias) + dst->bias = conv_bias_tensor(cvk_ctx, p); + } + + dst->w_is_const = 0; +} + +static void make_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(cvk_tiu_pt_convolution_param_t)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + dst->ifmap = conv_ifmap_tensor(cvk_ctx, p); + dst->weight = conv_weight_tensor(cvk_ctx, p); + dst->ofmap = conv_ofmap_tensor(cvk_ctx, p); + dst->bias = NULL; + dst->ps32_mode = 0; + if (p->using_bias) + dst->bias = conv_bias_tensor(cvk_ctx, p); + + dst->w_is_const = 0; +} + +static void free_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *r, + const conv_param_t *p) +{ + if (p->using_bias && r->bias) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->bias); + + if (r->ofmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ofmap); + + if (r->weight) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->weight); + + if (r->ifmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ifmap); +} + +static void init_conv_param(conv_param_t *p) +{ + printf("init_conv_param\n"); + p->random_seed = clock(); + srand(p->random_seed); + +retry: + p->input_n = 1; + p->input_c = rand() % (10) + 2; + p->kh = rand() % 7 + 1; + p->kw = rand() % 7 + 1; + p->input_h = 
rand() % 10 + p->kh; + p->input_w = rand() % 10 + p->kw; + p->output_c = rand() % 10 + 3; + p->stride_h = rand() % (p->kh) + 1; + p->stride_w = rand() % (p->kw) + 1; + p->ins_h = rand() % p->kh; + p->ins_w = rand() % p->kw; + p->ins_h_last = rand() % p->kh;; + p->ins_w_last = rand() % p->kw;; + p->dh = rand() % 3 + 1; + p->dw = rand() % 3 + 1; + + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + p->pad_top = rand() % kh_ext; + p->pad_bot = rand() % kh_ext; + p->pad_left = rand() % kw_ext; + p->pad_right = rand() % kw_ext; + + if (!conv_param_is_ok(p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p->using_bias = rand() % 2; + p->r_shift_m = rand() % 8; + p->bReLU_EN = rand() % 2; + + p->opd0_sign = rand() % 2; + p->opd1_sign = 1; + p->opd2_sign = 1; + + assert(p->opd1_sign == 1 && p->opd2_sign == 1); + + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); +} + +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", "p->input_w = ", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} + +/* Calculate the right shift value, m + * Steps: + * 1. Get the abs() of each weight; + * 2. Summary all the abs() in one kernel; + * 3. Get Log2 of each sum; + * 4. Downward rounding; + * After every r_shift value got, sort and find the middle one. + */ + +static int calc_rshift_m(const conv_param_t *p, int8_t* weight) +{ + int kernel_cnt = p->output_c * p->input_c; + int kernel_size = p->kh * p->kw; + int *kernel_shifts = (int *)malloc(sizeof(int) * kernel_cnt); + if (!kernel_shifts) + return 1; + + memset(kernel_shifts, 0, sizeof(int) * kernel_cnt); + + // Part 1: + // Get right shift value for each kernel + int sum = 0; + for (int i = 0; i < kernel_cnt; i++) { + // Step 1 & 2: Get the sum of abs() + for (int j = 0; j < kernel_size; j++) { + sum += (int)(*weight < 0 ? 
-(*weight) : (*weight)); + weight++; + } + // Step 3 & 4: log2 and downward rounding + sum >>= 1; + while (sum) { + sum >>= 1; + kernel_shifts[i]++; + } + } + + // Part 2: + // Find the middle of all the values + int tag[32] = {0}; + for (int cnt = 0; cnt < kernel_cnt; cnt++) { + tag[kernel_shifts[cnt]]++; + } + + int rshift_m = 0; + int mid = 0; + do { + mid += tag[rshift_m++]; + } while(mid < (kernel_cnt - 1) >> 1); + + free(kernel_shifts); + + return rshift_m - 1; +} + +static int test_ps32_ut( + conv_param_t *p_param, CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + printf("test_ps32_ut\n"); + int ret = 0; + int8_t *input = alloc_input(p_param); + int8_t *weight = alloc_weight(p_param); + int16_t *bias = alloc_bias(p_param); + int8_t *output_ref = (int8_t *)malloc(sizeof(int8_t) * conv_output_size(p_param) * sizeof(int)); + if (!input || !weight || !bias || !output_ref) { + ret = -1; + goto fail_exit; + } + + p_param->r_shift_m = calc_rshift_m(p_param, weight); + ret = ps32_m2_conv_ref(p_param, input, weight, output_ref); + if (ret) + goto fail_exit; + + cvk_tiu_pt_convolution_param_t conv_param; + make_bmk_conv_param_ps32(cvk_ctx, &conv_param, p_param, 2); + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param); + + if (tl_alloc_success) { + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, conv_param.ifmap, (uint8_t *)input); + put_conv_weight(rt_handle, cvk_ctx, conv_param.weight, (uint8_t *)weight); + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &conv_param); + + cvk_tl_t ps32_ofmap; + ps32_ofmap = *conv_param.ofmap; + ps32_ofmap.shape.n = ps32_ofmap.shape.n * sizeof(int); + uint8_t *output = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, &ps32_ofmap); + + ret = array_cmp_int8( + "Comparing begin_mode results ...\n", + output_ref, (int8_t *)output, conv_output_size(p_param) * sizeof(int)); + + if (ret) { + print_conv_param(p_param); + printf("Comparison FAILED\n"); + } + + free(output); + } + free_bmk_conv_param(cvk_ctx, &conv_param, p_param); + + printf("test_ps32_intermediate_mode\n"); + for (int i=0; i < conv_input_size(p_param); i++) + input[i] = rand() % 256 - 128; + + for (int i=0; i < conv_weight_size(p_param); i++) + weight[i] = rand() % 256 - 128; + + ret = ps32_m3_conv_ref(p_param, input, weight, output_ref); + if (ret) + goto fail_exit; + + make_bmk_conv_param_ps32(cvk_ctx, &conv_param, p_param, 3); + tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param); + + if (tl_alloc_success) { + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, conv_param.ifmap, (uint8_t *)input); + put_conv_weight(rt_handle, cvk_ctx, conv_param.weight, (uint8_t *)weight); + + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &conv_param); + + cvk_tl_t ps32_ofmap; + ps32_ofmap = *conv_param.ofmap; + ps32_ofmap.shape.n = ps32_ofmap.shape.n * sizeof(int); + + uint8_t *output = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, &ps32_ofmap); + + ret = array_cmp_int8( + "Comparing intermediate results ...\n", + output_ref, (int8_t *)output, conv_output_size(p_param) * sizeof(int)); + + if (ret) { + print_conv_param(p_param); + printf("Comparison FAILED\n"); + } + + free(output); + } + free_bmk_conv_param(cvk_ctx, &conv_param, p_param); + + printf("test_ps32_end_mode\n"); + for (int i=0; i < conv_input_size(p_param); i++) + input[i] = rand() % 256 - 128; + + for (int i=0; i < conv_weight_size(p_param); i++) + weight[i] = rand() % 256 - 128; + + ret = ps32_m1_conv_ref(p_param, input, weight, bias, output_ref); + if (ret) + goto fail_exit; + + make_bmk_conv_param_ps32(cvk_ctx, &conv_param, p_param, 1); + + 
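+  /*
+   * Context for the end-mode phase below: ps32_mode selects the stage of the
+   * 32-bit partial-sum pipeline. The schedule this unit test walks through,
+   * using this file's own helper, is:
+   *
+   *   make_bmk_conv_param_ps32(cvk_ctx, &conv_param, p_param, 2); // begin: write raw 32-bit sums
+   *   make_bmk_conv_param_ps32(cvk_ctx, &conv_param, p_param, 3); // intermediate: keep accumulating
+   *   make_bmk_conv_param_ps32(cvk_ctx, &conv_param, p_param, 1); // end: bias + ReLU + rshift, saturate to int8
+   *
+   * Only mode 1 applies bias/relu_enable/rshift_bits (see
+   * make_bmk_conv_param_ps32), which is why ps32_m1_conv_ref is the only
+   * reference implementation that takes a bias argument.
+   */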
tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param); + + if (tl_alloc_success) { + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, conv_param.ifmap, (uint8_t *)input); + put_conv_weight(rt_handle, cvk_ctx, conv_param.weight, (uint8_t *)weight); + if (p_param->using_bias) { + put_conv_bias(rt_handle, cvk_ctx, conv_param.bias, bias); + } + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &conv_param); + uint8_t *output = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, conv_param.ofmap); + + ret = array_cmp_int8( + "Comparing end results ...\n", + output_ref, (int8_t *)output, conv_output_size(p_param)); + + if (ret) { + print_conv_param(p_param); + printf("Comparison FAILED\n"); + } + + free(output); + } + + free_bmk_conv_param(cvk_ctx, &conv_param, p_param); + +fail_exit: + free(input); + free(weight); + free(bias); + free(output_ref); + + return ret; +} + +static int test_ic_tiling_conv( + conv_param_t *p_param, CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + printf("test tiled ps32 conv\n"); + int ret = 0; + int8_t *input = alloc_input(p_param); + int8_t *weight = alloc_weight(p_param); + int16_t *bias = alloc_bias(p_param); + int8_t *output_ref = (int8_t *)malloc(sizeof(int8_t) * conv_output_size(p_param)); + if (!input || !weight || !bias || !output_ref) { + ret = -1; + goto fail_exit_2; + } + + p_param->r_shift_m = calc_rshift_m(p_param, weight); + ret = conv_ref(p_param, input, weight, bias, output_ref); + if (ret) + goto fail_exit_2; + + cvk_tiu_pt_convolution_param_t conv_tmp_param; + cvk_tiu_pt_convolution_param_t conv_param; + make_bmk_conv_param(cvk_ctx, &conv_param, p_param); + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param); + if (tl_alloc_success) { + if (p_param->using_bias) { + conv_tmp_param.bias = conv_param.bias; + put_conv_bias(rt_handle, cvk_ctx, conv_param.bias, bias); + } + // for ps32_md[1] = 1, relu_enable & rshift_bits need to set to 0 + // so we store those parameters to conv_tmp_para + conv_tmp_param.relu_enable = conv_param.relu_enable; + conv_tmp_param.rshift_bits = conv_param.rshift_bits; + conv_tmp_param.bias = conv_param.bias; + + uint32_t ic_step = 1; + uint32_t n_step = 1; + cvk_tl_t ifmap = *conv_param.ifmap; + cvk_tl_t ofmap = *conv_param.ofmap; + cvk_tg_shape_t s; + s.n = conv_param.ifmap->shape.n; + s.c = conv_param.ifmap->shape.c; + s.h = conv_param.ifmap->shape.h; + s.w = conv_param.ifmap->shape.w; + cvk_tg_t *tg_ifmap = alloc_tensor_dev_mem(rt_handle, cvk_ctx, s, CVK_FMT_I8); + tensor_copy_s2d(rt_handle, tg_ifmap, (uint8_t *)input); + + s.n = conv_param.weight->shape.n; + s.c = conv_param.weight->shape.c; + s.h = conv_param.weight->shape.h; + s.w = conv_param.weight->shape.w; + uint8_t *transformed_weight = + transform_weight(&conv_param.weight->shape, (uint8_t *)weight); + cvk_tg_t *tg_weight = alloc_tensor_dev_mem(rt_handle, cvk_ctx, s, CVK_FMT_I8); + tensor_copy_s2d(rt_handle, tg_weight, (uint8_t *)transformed_weight); + free(transformed_weight); + + cvk_tl_shape_t cur_tl_ifmap_shape = { + n_step, + ic_step, + ifmap.shape.h, + ifmap.shape.w + }; + + cvk_tg_shape_t cur_tg_ifmap_shape = { + cur_tl_ifmap_shape.n, + cur_tl_ifmap_shape.c, + cur_tl_ifmap_shape.h, + cur_tl_ifmap_shape.w + }; + + cvk_tg_stride_t cur_tg_ifmap_stride = { + tg_ifmap->stride.n, + tg_ifmap->stride.c, + tg_ifmap->stride.h, + fmt_size(tg_ifmap->fmt), + }; + + cvk_tg_t cur_tg_ifmap; + cur_tg_ifmap.base_reg_index = 0; + cur_tg_ifmap.start_address = tg_ifmap->start_address; + cur_tg_ifmap.shape = cur_tg_ifmap_shape; + cur_tg_ifmap.stride = cur_tg_ifmap_stride; 
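+  /*
+   * The cur_tg_* / cur_tl_* descriptors built here are per-iteration views of
+   * a single (n_step x ic_step) slice. The loop below advances ci one input
+   * channel at a time, reloading the ifmap slice at offset
+   * ci * tg_ifmap->stride.c plus the matching weight column, then issues one
+   * ps32 convolution per pass:
+   *   ci == 0               -> ps32_mode 2 (begin)
+   *   0 < ci < shape.c - 1  -> ps32_mode 3 (accumulate)
+   *   ci == shape.c - 1     -> ps32_mode 1 (end; relu/rshift/bias restored)
+   */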
+ cur_tg_ifmap.fmt = CVK_FMT_I8; + + cvk_tl_t cur_tl_ifmap; + cur_tl_ifmap.shape = cur_tl_ifmap_shape; + cur_tl_ifmap.stride = + cvk_ctx->ops->tl_default_stride(cvk_ctx, cur_tl_ifmap_shape, CVK_FMT_I8, 1); + cur_tl_ifmap.start_address = ifmap.start_address; + cur_tl_ifmap.fmt = ifmap.fmt; + + cvk_tl_t cur_tl_ofmap; + cur_tl_ofmap.start_address = ofmap.start_address; + cur_tl_ofmap.shape = ofmap.shape; + cur_tl_ofmap.shape.n = n_step; + cur_tl_ofmap.stride = + cvk_ctx->ops->tl_default_stride(cvk_ctx, cur_tl_ofmap.shape, CVK_FMT_I8, 1); + cur_tl_ofmap.fmt = ofmap.fmt; + + cvk_tl_t cur_tl_weight; + memset(&cur_tl_weight, 0, sizeof(cur_tl_weight)); + cur_tl_weight.start_address = conv_param.weight->start_address; + cur_tl_weight.shape = conv_param.weight->shape; + cur_tl_weight.shape.n = ic_step; + cur_tl_weight.stride.n = 1; + cur_tl_weight.stride.c = cur_tl_weight.shape.n * cur_tl_weight.shape.h * cur_tl_weight.shape.w; + cur_tl_weight.stride.h = cur_tl_weight.shape.n * cur_tl_weight.shape.w; + cur_tl_weight.stride.w = cur_tl_weight.shape.n; + cur_tl_weight.fmt = conv_param.weight->fmt; + + const cvk_tl_t *saved_tl_weight = conv_param.weight; + const cvk_tl_t *saved_tl_ifmap = conv_param.ifmap; + for (uint32_t ci = 0; ci < ifmap.shape.c; ci += ic_step) { + { + uint32_t ic = tg_weight->shape.n; + uint32_t oc = tg_weight->shape.c; + uint32_t kh = tg_weight->shape.h; + uint32_t kw = tg_weight->shape.w; + + cvk_tg_t cur_tdma_tg_weight; + cur_tdma_tg_weight.base_reg_index = tg_weight->base_reg_index; + cur_tdma_tg_weight.start_address = tg_weight->start_address + ci; + cur_tdma_tg_weight.fmt = tg_weight->fmt; + cur_tdma_tg_weight.shape = tg_shape_t4(1, oc, kh * kw, ic); + cur_tdma_tg_weight.stride = + cvk_ctx->ops->tg_default_stride(cvk_ctx, cur_tdma_tg_weight.shape, cur_tdma_tg_weight.fmt); + cur_tdma_tg_weight.shape = tg_shape_t4(1, oc, kh * kw, ic_step); + + cvk_tl_t cur_tdma_tl_weight; + cur_tdma_tl_weight = cur_tl_weight; + cur_tdma_tl_weight.shape.n = cur_tdma_tg_weight.shape.n; + cur_tdma_tl_weight.shape.c = cur_tdma_tg_weight.shape.c; + cur_tdma_tl_weight.shape.h = cur_tdma_tg_weight.shape.h; + cur_tdma_tl_weight.shape.w = cur_tdma_tg_weight.shape.w; + cur_tdma_tl_weight.stride = cvk_ctx->ops->tl_default_stride( + cvk_ctx, cur_tdma_tl_weight.shape, CVK_FMT_I8, 0); + + cvk_tdma_g2l_tensor_copy_param_t p1; + memset(&p1, 0, sizeof(p1)); + p1.src = &cur_tdma_tg_weight; + p1.dst = &cur_tdma_tl_weight; + cvk_ctx->ops->tdma_g2l_tensor_copy(cvk_ctx, &p1); + CVI_RT_Submit(cvk_ctx); + } + { + cvk_tdma_g2l_tensor_copy_param_t p2; + memset(&p2, 0, sizeof(p2)); + cur_tg_ifmap.start_address = + tg_ifmap->start_address + ci * tg_ifmap->stride.c; + p2.src = &cur_tg_ifmap; + p2.dst = &cur_tl_ifmap; + cvk_ctx->ops->tdma_g2l_tensor_copy(cvk_ctx, &p2); + CVI_RT_Submit(cvk_ctx); + } + + conv_param.ifmap = &cur_tl_ifmap; + conv_param.weight = &cur_tl_weight; + + // for ps32_md[1] = 1, relu_enable & rshift_bits need to set to 0 + // so we store those parameters to conv_tmp_para + if (ci == (ifmap.shape.c - 1)) + { + conv_param.relu_enable = conv_tmp_param.relu_enable; + conv_param.rshift_bits = conv_tmp_param.rshift_bits; + conv_param.bias = conv_tmp_param.bias; + conv_param.ps32_mode = 1; + } + else if (ci == 0) + { + conv_param.relu_enable = 0; + conv_param.rshift_bits = 0; + conv_param.bias = 0;; + conv_param.ps32_mode = 2; + } + else + { + conv_param.relu_enable = 0; + conv_param.rshift_bits = 0; + conv_param.bias = 0;; + conv_param.ps32_mode = 3; + } + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, 
&conv_param); + conv_param.weight = saved_tl_weight; + conv_param.ifmap = saved_tl_ifmap; + } + + uint8_t *output = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, conv_param.ofmap); + + free_tensor_dev_mem(rt_handle, tg_ifmap); + free_tensor_dev_mem(rt_handle, tg_weight); + ret = array_cmp_int8( + "Comparing results ...\n", + output_ref, (int8_t *)output, conv_output_size(p_param)); + + if (ret) { + print_conv_param(p_param); + printf("Comparison FAILED\n"); + } + free(output); + } + free_bmk_conv_param(cvk_ctx, &conv_param, p_param); + +fail_exit_2: + free(input); + free(weight); + free(output_ref); + free(bias); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + for (int i = 0; i < 5; i++) { + printf("random_test_conv iteration: %d\n", i); + conv_param_t test_conv_param; + init_conv_param(&test_conv_param); + ret |= test_ic_tiling_conv(&test_conv_param, rt_handle, cvk_ctx); + ret |= test_ps32_ut(&test_conv_param, rt_handle, cvk_ctx); + if (!test_conv_param.using_bias) + test_conv_param.using_bias = 1; + if (test_conv_param.output_c <= 9) + test_conv_param.output_c += 3; + ret |= test_ic_tiling_conv(&test_conv_param, rt_handle, cvk_ctx); + ret |= test_ps32_ut(&test_conv_param, rt_handle, cvk_ctx); + } + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + if (!ret) + printf("%s pass\n", __FILENAME__); + else + printf("%s fail\n", __FILENAME__); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_g2g_bf16_tensor_copy.c b/cviruntime/test/180x/test_180x_g2g_bf16_tensor_copy.c new file mode 100644 index 000000000..c8367c045 --- /dev/null +++ b/cviruntime/test/180x/test_180x_g2g_bf16_tensor_copy.c @@ -0,0 +1,159 @@ +#include <assert.h> +#include <inttypes.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_g2g_tensor_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_tg_shape_t src_shape; + cvk_tg_stride_t src_stride; + cvk_tg_shape_t dst_shape; + cvk_tg_stride_t dst_stride; +} case_t; + +typedef struct { + cvk_fmt_t src_fmt; + cvk_fmt_t dst_fmt; +} fmt_type_t; + +static fmt_type_t input_fmt[] = { + {CVK_FMT_BF16, CVK_FMT_BF16}, + {CVK_FMT_I8, CVK_FMT_I8}, +}; + +static case_t g_cases[] = { + { + {1, 3, 3, 3}, {27, 9, 3, 1}, + {1, 3, 3, 3}, {27, 9, 3, 1}, + }, + { + // YOLOv2 concat layer + {1, 256, 19, 19}, {92416, 361, 19, 1}, + {1, 256, 19, 19}, {462080, 361, 19, 1}, + } +}; + +static int test_param_g2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + print_param(stderr, p); + int ret = 0; + uint64_t size = p->src->shape.c * p->src->shape.h * p->src->shape.w; + + uint16_t *u16src_data = (uint16_t *)malloc(sizeof(uint16_t) * size); + uint8_t *u8src_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + uint8_t *src_data, *dst_data = NULL; + if (!u16src_data || !u8src_data) { + ret = -1; + goto fail_exit; + } + + if(p->src->fmt == CVK_FMT_BF16) { + /* bf16 */ + float val = -100; +
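+    /*
+     * bf16 background for the fill loop below: a bf16 value is the top 16
+     * bits of an IEEE-754 float32, so a plain truncating conversion (a
+     * sketch only, not necessarily the helper's exact policy) looks like:
+     *
+     *   uint32_t u;
+     *   memcpy(&u, &val, sizeof(u));        // reinterpret the float bits
+     *   uint16_t bf = (uint16_t)(u >> 16);  // keep sign, exponent, top mantissa
+     *
+     * test_generate_bf16_corner_val (from test_cvikernel_util.h) is assumed
+     * to perform such a conversion while also pinning corner-case patterns.
+     */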
for(uint64_t i = 0; i < size; i++) { + u16src_data[i] = test_generate_bf16_corner_val(val); + val += 0.1; + } + src_data = (uint8_t*)u16src_data; + } else { + /* int8 -> bf16*/ + for(uint64_t i = 0; i < size; i++) { + u8src_data[i] = 200 + i; + } + src_data = u8src_data; + } + + tensor_copy_s2d(rt_handle, p->src, src_data); + + cvk_ctx->ops->tdma_g2g_bf16_tensor_copy(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + + dst_data = tensor_copy_d2s(rt_handle, p->dst); + if (!dst_data) { + ret = -1; + goto fail_exit; + } + + for (uint64_t i = 0; i < size; i++) { + if (dst_data[i] != src_data[i]) { + fprintf(stderr, "comparing(%d->%d) failed at dst[%" PRIu64 "], got %x, exp %x\n", + p->src->fmt, p->dst->fmt, i, dst_data[i], src_data[i]); + ret = -1; + break; + } + } + +fail_exit: + free(u8src_data); + free(u16src_data); + free(dst_data); + + return ret; +} + +static void destroy_param_g2g(CVI_RT_HANDLE rt_handle, param_t *p) +{ + free_tensor_dev_mem(rt_handle, p->src); + free_tensor_dev_mem(rt_handle, p->dst); +} + +static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + int ret = 0; + uint32_t nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + + for (uint32_t i = 0; i < nr_fmt; i++) { + param_t p; + memset(&p, 0, sizeof(p)); + p.src = alloc_tensor_dev_mem(rt_handle, cvk_ctx, c->src_shape, input_fmt[i].src_fmt); + p.dst = alloc_tensor_dev_mem(rt_handle, cvk_ctx, c->dst_shape, input_fmt[i].dst_fmt); + ret |= test_param_g2g(rt_handle, cvk_ctx, &p); + destroy_param_g2g(rt_handle, &p); + } + + return ret; +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + int ret = 0; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} \ No newline at end of file diff --git a/cviruntime/test/180x/test_180x_lut.c b/cviruntime/test/180x/test_180x_lut.c new file mode 100644 index 000000000..5a05c9483 --- /dev/null +++ b/cviruntime/test/180x/test_180x_lut.c @@ -0,0 +1,140 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static uint32_t channel = -1; //ops->lmem_alloc_tensor(cvk_ctx,ifmap_shape, CVK_FMT_I8, 1);; + cvk_tl_t *tl_table = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, table_shape, CVK_FMT_I8, /*align*/1); + cvk_tl_t *tl_ofmap = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx,ofmap_shape, CVK_FMT_I8, /*align*/1); + uint8_t *ofmap_data = NULL; + if (!tl_ifmap || !tl_table || !tl_ofmap) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_ifmap, ifmap_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_table, table_data); + cvk_tiu_lookup_table_param_t p12; + p12.ofmap = tl_ofmap; + p12.ifmap = tl_ifmap; + p12.table = tl_table; + cvk_ctx->ops->tiu_lookup_table(cvk_ctx, &p12); + CVI_RT_Submit(cvk_ctx); + ofmap_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_ofmap); + for (uint64_t i = 0; i < ofmap_size; i++) { + if (ofmap_data[i] != ref_data[i]) { + fprintf(stderr, + "comparing failed at ofmap_data[%" PRIu64 "], got %d, exp %d\n", + i, ofmap_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit_2: + 
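+  /* Local-memory tensors are freed in reverse allocation order (ofmap, then
+   * table, then ifmap): the lmem allocator behaves stack-like in these
+   * tests, so releases are expected to mirror the allocations. */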
cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_ofmap); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_table); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_ifmap); + free(ofmap_data); + +fail_exit: + free(ifmap_data); + free(table_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + // get channel info + channel = cvk_ctx->info.npu_num; + + ret = test_tl_lut(rt_handle, cvk_ctx); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_matrix_mac.c b/cviruntime/test/180x/test_180x_matrix_mac.c new file mode 100644 index 000000000..2a959be1e --- /dev/null +++ b/cviruntime/test/180x/test_180x_matrix_mac.c @@ -0,0 +1,2014 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" +#include "test_native_ref.h" + +typedef cvk_tiu_matrix_multiplication_param_t param_t; + +static uint64_t matrix_size(const cvk_ml_t *ml) +{ + uint64_t row = ml->shape.n; + uint64_t col = ml->shape.col; + return row * col; +} + +static uint64_t res_size(param_t *p) +{ + if (p->res_is_int8 && !p->add_result) + return matrix_size(p->res); + else + return matrix_size(p->res) / 2; +} + +static uint8_t * alloc_left(param_t *p) +{ + uint64_t size = matrix_size(p->left); + + uint8_t *buf = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!buf) + return NULL; + + for (uint64_t i = 0; i < size; i++) + buf[i] = i % 17 - 9; + + return buf; +} + +static uint8_t * alloc_right(param_t *p) +{ + uint64_t size = matrix_size(p->right); + + uint8_t *buf = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!buf) + return NULL; + + for (uint64_t i = 0; i < size; i++) + buf[i] = i % 13 - 6; + + return buf; +} + +static uint16_t * alloc_bias(param_t *p) +{ + if (!p->bias) + return NULL; + + uint64_t size = matrix_size(p->bias) / 2; + + uint16_t *buf = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!buf) + return NULL; + + for (uint64_t i = 0; i < size; i++) + buf[i] = 5 - (i % 7); + + return buf; +} + +static uint16_t * alloc_res(param_t *p) +{ + uint64_t size = res_size(p); + + uint16_t *buf = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!buf) + return NULL; + + for (uint64_t i = 0; i < size; i++) + buf[i] = 17 - (i % 35); + + return buf; +} + +static void right_shift(param_t *p, int32_t *buf, uint64_t size) +{ + int shift_bits = p->rshift_bits; + int round_up = 1; + if (1) + arith_right_shift(buf, size, shift_bits, round_up); + else + logic_right_shift(buf, size, shift_bits, round_up); +} + +static void matrix_mac_ref( + param_t *p, uint8_t left[], uint8_t right[], uint16_t bias[], uint16_t res[]) +{ + uint64_t size = res_size(p); + uint32_t left_col = p->left->shape.col; + uint32_t right_col = p->right->shape.col; + uint32_t res_row = p->left->shape.n; + uint32_t res_col = p->res->shape.col; + int left_sign = (p->left->fmt == CVK_FMT_I8); + int right_sign = (p->right->fmt == CVK_FMT_I8); + int res_sign = (p->res->fmt == CVK_FMT_I8); + + int32_t *tmp_res = (int32_t *)malloc(sizeof(int32_t) * size); + if (!tmp_res) + return; + + if (p->add_result) { + for (uint32_t i = 0; i < res_row * res_col; i++) { + tmp_res[i] = res_sign? 
(int16_t)res[i]: res[i]; + tmp_res[i] <<= p->lshift_bits; + } + } else { + for (uint32_t i = 0; i < res_row * res_col; i++) + tmp_res[i] = 0; + } + + for (uint32_t row = 0; row < res_row; row++) { + for (uint32_t col = 0; col < res_col; col++) { + for (uint32_t i = 0; i < left_col; i++) { + uint32_t li = row * left_col + i; + uint32_t ri = i * right_col + col; + int32_t l = left_sign? (int8_t)left[li]: left[li]; + int32_t r = right_sign? (int8_t)right[ri]: right[ri]; + tmp_res[row * res_col + col] += l * r; + } + } + } + + if (p->bias && bias) { + for (uint32_t row = 0; row < res_row; row++) { + for (uint32_t col = 0; col < res_col; col++) { + int bias_sign = (p->bias->fmt == CVK_FMT_I8); + int32_t b = bias_sign? (int16_t)bias[col]: bias[col]; + tmp_res[row * res_col + col] += b; + } + } + } + + if (p->relu_enable) + relu(tmp_res, size); + + right_shift(p, tmp_res, size); + + if (p->res_is_int8) + saturate_to_int8(tmp_res, size, res_sign); + else + saturate_to_int16(tmp_res, size, res_sign); + + for (uint64_t i = 0; i < size; i++) + res[i] = tmp_res[i]; + + free(tmp_res); +} + +static void put_bias( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_ml_t *ml, + uint16_t data[]) +{ + uint64_t size = ml->shape.col; + + uint8_t *tmp = (uint8_t *)malloc(sizeof(uint8_t) * size * 2); + if (!tmp) + return; + + for (uint64_t i = 0; i < size; i++) { + tmp[i] = data[i] & 0xff; + tmp[i + size] = (data[i] >> 8) & 0xff; + } + + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, ml, tmp); + + free(tmp); +} + +static void put_res( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_ml_t *ml, + uint16_t data[]) +{ + uint64_t size = ml->shape.n / 2 * ml->shape.col; + + uint8_t *tmp = (uint8_t *)malloc(sizeof(uint8_t) * size * 2); + if (!tmp) + return; + + for (uint64_t i = 0; i < size; i++) { + tmp[i] = data[i] & 0xff; + tmp[i + size] = (data[i] >> 8) & 0xff; + } + + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, ml, tmp); + + free(tmp); +} + +static uint16_t * get_res( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + param_t *p) +{ + uint64_t size = res_size(p); + uint16_t *res = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!res) + return NULL; + + uint8_t *tmp = matrix_copy_l2g_d2s(rt_handle, cvk_ctx, p->res); + if (p->res_is_int8) { + int res_sign = (p->res->fmt == CVK_FMT_I8); + for (uint64_t i = 0; i < size; i++) + res[i] = res_sign? 
(int8_t)tmp[i]: tmp[i]; + } else { + for (uint64_t i = 0; i < size; i++) + res[i] = tmp[i] + (tmp[i + size] << 8); + } + + free(tmp); + return res; +} + +static int test_param(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + int ret = 0; + uint8_t *left = alloc_left(p); + uint8_t *right = alloc_right(p); + uint16_t *bias = alloc_bias(p); + uint16_t *ref = alloc_res(p); + if (!left || !right || (p->bias && !bias) || !ref) { + ret = -1; + goto fail_exit; + } + + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, p->left, left); + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, p->right, right); + if (bias) + put_bias(rt_handle, cvk_ctx, p->bias, bias); + if (p->add_result) + put_res(rt_handle, cvk_ctx, p->res, ref); + + cvk_ctx->ops->tiu_matrix_multiplication(cvk_ctx, p); + uint16_t *res = get_res(rt_handle, cvk_ctx, p); + + matrix_mac_ref(p, left, right, bias, ref); + + uint64_t size = res_size(p); + for (uint64_t i = 0; i < size; i++) { + if (res[i] != ref[i]) { + fprintf(stderr, "comparing failed at out[%" PRIu64 "], got %d, exp %d\n", + i, (int16_t)res[i], (int16_t)ref[i]); + ret = -1; + break; + } + } + + free(res); + +fail_exit: + free(left); + free(right); + free(bias); + free(ref); + + return ret; +} + +static void destroy_param(cvk_context_t *cvk_ctx, param_t *p) +{ + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->res); + if (p->bias) + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->bias); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->right); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->left); +} + +static cvk_ml_t *alloc_param_res( + cvk_context_t *cvk_ctx, param_t *p) +{ + cvk_ml_shape_t s; + s.n = p->left->shape.n; + if (p->add_result || !p->res_is_int8) + s.n *= 2; + s.c = p->right->shape.c; + s.w = p->right->shape.w; + s.col = p->right->shape.col; + + cvk_fmt_t fmt = CVK_FMT_U8; + if (p->left->fmt == CVK_FMT_I8) + fmt = CVK_FMT_I8; + if (p->right->fmt == CVK_FMT_I8) + fmt = CVK_FMT_I8; + if (p->bias) + if (p->bias->fmt == CVK_FMT_I8) + fmt = CVK_FMT_I8; + + if (p->relu_enable) + fmt = CVK_FMT_U8; + + return cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, s, fmt, 1); +} + +static param_t param_0(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + p.ps32_mode = 0; + + uint32_t left_row = 1; + uint32_t left_col = 1; + uint32_t left_c = 1; + uint32_t left_w = 1; + + uint32_t right_row = 1; + uint32_t right_col = 1; + uint32_t right_c = 1; + uint32_t right_w = 1; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_1(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 6; + uint32_t left_col = 1; + uint32_t left_c = 1; + uint32_t left_w = 1; + + uint32_t right_row = 1; + uint32_t right_col = 1; + uint32_t right_c = 1; + uint32_t right_w = 1; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + 
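+  /* Layout convention shared by all param_N cases: an n x col matrix lives
+   * in local memory with each row split across c lanes of w elements, so
+   * c * w must cover col (e.g. param_3 below stores col = 25 as c = 2,
+   * w = 18). */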
left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_2(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 6; + uint32_t left_col = 25; + uint32_t left_c = 1; + uint32_t left_w = 25; + + uint32_t right_row = 25; + uint32_t right_col = 1; + uint32_t right_c = 1; + uint32_t right_w = 1; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_3(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 6; + uint32_t left_col = 25; + uint32_t left_c = 2; + uint32_t left_w = 18; + + uint32_t right_row = 25; + uint32_t right_col = 1; + uint32_t right_c = 1; + uint32_t right_w = 1; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_4(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 6; + uint32_t left_col = 39; + uint32_t left_c = 4; + uint32_t left_w = 10; + + uint32_t right_row = 39; + uint32_t right_col = 1; + uint32_t right_c = 1; + uint32_t right_w = 1; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_5(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 1; + uint32_t left_col = 1; + uint32_t left_c = 1; + uint32_t left_w = 1; + + uint32_t 
right_row = 1; + uint32_t right_col = 2; + uint32_t right_c = 1; + uint32_t right_w = 2; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_6(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 1; + uint32_t left_col = 1; + uint32_t left_c = 1; + uint32_t left_w = 1; + + uint32_t right_row = 1; + uint32_t right_col = 2; + uint32_t right_c = 2; + uint32_t right_w = 1; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_7(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 1; + uint32_t left_col = 1; + uint32_t left_c = 1; + uint32_t left_w = 1; + + uint32_t right_row = 1; + uint32_t right_col = 10; + uint32_t right_c = 3; + uint32_t right_w = 4; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_8(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 7; + uint32_t left_col = 1; + uint32_t left_c = 1; + uint32_t left_w = 1; + + uint32_t right_row = 1; + uint32_t right_col = 10; + uint32_t right_c = 3; + uint32_t right_w = 4; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_9(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 
= true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 7; + uint32_t left_col = 23; + uint32_t left_c = 3; + uint32_t left_w = 8; + + uint32_t right_row = 23; + uint32_t right_col = 10; + uint32_t right_c = 3; + uint32_t right_w = 4; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_10(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 7; + uint32_t left_col = 477; + uint32_t left_c = 60; + uint32_t left_w = 8; + + uint32_t right_row = 477; + uint32_t right_col = 10; + uint32_t right_c = 3; + uint32_t right_w = 4; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_11(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 7; + uint32_t left_col = 23; + uint32_t left_c = 3; + uint32_t left_w = 8; + + uint32_t right_row = 23; + uint32_t right_col = 477; + uint32_t right_c = 60; + uint32_t right_w = 8; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_12(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 1; + uint32_t left_col = 1; + uint32_t left_c = 1; + uint32_t left_w = 1; + + uint32_t right_row = 1; + uint32_t right_col = 1; + uint32_t right_c = 1; + uint32_t right_w = 1; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, 
right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_I8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_13(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 1; + uint32_t left_col = 1; + uint32_t left_c = 1; + uint32_t left_w = 1; + + uint32_t right_row = 1; + uint32_t right_col = 2; + uint32_t right_c = 1; + uint32_t right_w = 2; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_I8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_14(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 7; + uint32_t left_col = 477; + uint32_t left_c = 60; + uint32_t left_w = 8; + + uint32_t right_row = 477; + uint32_t right_col = 10; + uint32_t right_c = 3; + uint32_t right_w = 4; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_I8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_15(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 3; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 7; + uint32_t left_col = 477; + uint32_t left_c = 60; + uint32_t left_w = 8; + + uint32_t right_row = 477; + uint32_t right_col = 10; + uint32_t right_c = 3; + uint32_t right_w = 4; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_I8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_16(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 3; + p.res_is_int8 = true; + p.relu_enable = false; + 
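+  /* param_16 and param_17 reuse param_15's geometry but allocate the bias
+   * matrix as CVK_FMT_U8; param_17 additionally sets relu_enable, which
+   * makes alloc_param_res force the result format to CVK_FMT_U8. */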
p.add_result = false; + + uint32_t left_row = 7; + uint32_t left_col = 477; + uint32_t left_c = 60; + uint32_t left_w = 8; + + uint32_t right_row = 477; + uint32_t right_col = 10; + uint32_t right_c = 3; + uint32_t right_w = 4; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_17(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 3; + p.res_is_int8 = true; + p.relu_enable = true; + p.add_result = false; + + uint32_t left_row = 7; + uint32_t left_col = 477; + uint32_t left_c = 60; + uint32_t left_w = 8; + + uint32_t right_row = 477; + uint32_t right_col = 10; + uint32_t right_c = 3; + uint32_t right_w = 4; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_18(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 7; + uint32_t left_col = 23; + uint32_t left_c = 3; + uint32_t left_w = 8; + + uint32_t right_row = 23; + uint32_t right_col = 477; + uint32_t right_c = 60; + uint32_t right_w = 8; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_19(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 1; + uint32_t left_col = 1; + uint32_t left_c = 1; + uint32_t left_w = 1; + + uint32_t right_row = 1; + uint32_t right_col = 1; + uint32_t right_c = 1; + uint32_t right_w = 1; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + 
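+  // shape note: col is the logical column count, w the per-channel slice width, c = ceil(col / w)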
+ cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_I8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_20(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 4; + uint32_t left_col = 1; + uint32_t left_c = 1; + uint32_t left_w = 1; + + uint32_t right_row = 1; + uint32_t right_col = 1; + uint32_t right_c = 1; + uint32_t right_w = 1; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_I8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_21(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 3; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 7; + uint32_t left_col = 477; + uint32_t left_c = 60; + uint32_t left_w = 8; + + uint32_t right_row = 477; + uint32_t right_col = 10; + uint32_t right_c = 3; + uint32_t right_w = 4; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_22(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 2; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 7; + uint32_t left_col = 23; + uint32_t left_c = 3; + uint32_t left_w = 8; + + uint32_t right_row = 23; + uint32_t right_col = 477; + uint32_t right_c = 60; + uint32_t right_w = 8; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, 
CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_23(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 2; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 7; + uint32_t left_col = 23; + uint32_t left_c = 3; + uint32_t left_w = 8; + + uint32_t right_row = 23; + uint32_t right_col = 477; + uint32_t right_c = 60; + uint32_t right_w = 8; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_24(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = true; + + uint32_t left_row = 1; + uint32_t left_col = 1; + uint32_t left_c = 1; + uint32_t left_w = 1; + + uint32_t right_row = 1; + uint32_t right_col = 1; + uint32_t right_c = 1; + uint32_t right_w = 1; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_I8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_25(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = true; + + uint32_t left_row = 4; + uint32_t left_col = 1; + uint32_t left_c = 1; + uint32_t left_w = 1; + + uint32_t right_row = 1; + uint32_t right_col = 1; + uint32_t right_c = 1; + uint32_t right_w = 1; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_I8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_26(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = true; + + 
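+  // like param_25, but the accumulated result stays 16-bit (res_is_int8 = false)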
uint32_t left_row = 4; + uint32_t left_col = 1; + uint32_t left_c = 1; + uint32_t left_w = 1; + + uint32_t right_row = 1; + uint32_t right_col = 1; + uint32_t right_c = 1; + uint32_t right_w = 1; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_I8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_27(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 3; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = true; + + uint32_t left_row = 7; + uint32_t left_col = 477; + uint32_t left_c = 60; + uint32_t left_w = 8; + + uint32_t right_row = 477; + uint32_t right_col = 10; + uint32_t right_c = 3; + uint32_t right_w = 4; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_28(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 2; + p.rshift_bits = 3; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = true; + + uint32_t left_row = 7; + uint32_t left_col = 477; + uint32_t left_c = 60; + uint32_t left_w = 8; + + uint32_t right_row = 477; + uint32_t right_col = 10; + uint32_t right_c = 3; + uint32_t right_w = 4; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_29(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 2; + p.rshift_bits = 3; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = true; + + uint32_t left_row = 7; + uint32_t left_col = 477; + uint32_t left_c = 60; + uint32_t left_w = 8; + + uint32_t right_row = 477; + uint32_t right_col = 10; + uint32_t right_c = 3; + uint32_t right_w = 4; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t 
right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_30(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 2; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = true; + + uint32_t left_row = 7; + uint32_t left_col = 23; + uint32_t left_c = 3; + uint32_t left_w = 8; + + uint32_t right_row = 23; + uint32_t right_col = 477; + uint32_t right_c = 60; + uint32_t right_w = 8; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_31(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 3; + p.rshift_bits = 2; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = true; + + uint32_t left_row = 7; + uint32_t left_col = 23; + uint32_t left_c = 3; + uint32_t left_w = 8; + + uint32_t right_row = 23; + uint32_t right_col = 477; + uint32_t right_c = 60; + uint32_t right_w = 8; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_32(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 6; + p.rshift_bits = 2; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = true; + + uint32_t left_row = 7; + uint32_t left_col = 23; + uint32_t left_c = 3; + uint32_t left_w = 8; + + uint32_t right_row = 23; + uint32_t right_col = 477; + uint32_t right_c = 60; + uint32_t right_w = 8; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + 
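+  // the 16-bit bias spans two 8-bit rows, hence bias_shape.n = 2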
p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_33(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 6; + p.rshift_bits = 2; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = true; + + uint32_t left_row = 7; + uint32_t left_col = 23; + uint32_t left_c = 3; + uint32_t left_w = 8; + + uint32_t right_row = 23; + uint32_t right_col = 477; + uint32_t right_c = 60; + uint32_t right_w = 8; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_34(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 1; + p.rshift_bits = 13; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = true; + + uint32_t left_row = 7; + uint32_t left_col = 23; + uint32_t left_c = 3; + uint32_t left_w = 8; + + uint32_t right_row = 23; + uint32_t right_col = 477; + uint32_t right_c = 60; + uint32_t right_w = 8; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_U8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_U8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_35(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 1; + p.rshift_bits = 10; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = true; + + uint32_t left_row = 7; + uint32_t left_col = 23; + uint32_t left_c = 3; + uint32_t left_w = 8; + + uint32_t right_row = 23; + uint32_t right_col = 477; + uint32_t right_c = 60; + uint32_t right_w = 8; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_U8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_U8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_36(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 1; + p.rshift_bits = 10; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + uint32_t 
left_row = 7; + uint32_t left_col = 23; + uint32_t left_c = 3; + uint32_t left_w = 8; + + uint32_t right_row = 23; + uint32_t right_col = 477; + uint32_t right_c = 60; + uint32_t right_w = 8; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_U8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_U8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_37(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 1; + p.rshift_bits = 10; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 7; + uint32_t left_col = 23; + uint32_t left_c = 3; + uint32_t left_w = 8; + + uint32_t right_row = 23; + uint32_t right_col = 477; + uint32_t right_c = 60; + uint32_t right_w = 8; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_U8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_U8, 1); + p.bias = NULL; + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_38(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 1; + p.rshift_bits = 6; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 7; + uint32_t left_col = 23; + uint32_t left_c = 3; + uint32_t left_w = 8; + + uint32_t right_row = 23; + uint32_t right_col = 477; + uint32_t right_c = 60; + uint32_t right_w = 8; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_U8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +#define test_one_param(n) \ + do { \ + param_t p = param_##n(cvk_ctx); \ + ret |= test_param(rt_handle, cvk_ctx, &p); \ + destroy_param(cvk_ctx, &p); \ + } while (0) + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + test_one_param(0); + test_one_param(1); + test_one_param(2); + test_one_param(3); + test_one_param(4); + test_one_param(5); + test_one_param(6); + test_one_param(7); + test_one_param(8); + test_one_param(9); + test_one_param(10); + test_one_param(11); + test_one_param(12); + test_one_param(13); + test_one_param(14); 
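+  // cases 15-38 vary rshift/lshift, U8 bias, relu, add_result and unsigned operands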
+  test_one_param(15);
+  test_one_param(16);
+  test_one_param(17);
+  test_one_param(18);
+  test_one_param(19);
+  test_one_param(20);
+  test_one_param(21);
+  test_one_param(22);
+  test_one_param(23);
+  test_one_param(24);
+  test_one_param(25);
+  test_one_param(26);
+  test_one_param(27);
+  test_one_param(28);
+  test_one_param(29);
+  test_one_param(30);
+  test_one_param(31);
+  test_one_param(32);
+  test_one_param(33);
+  test_one_param(34);
+  test_one_param(35);
+  test_one_param(36);
+  test_one_param(37);
+  test_one_param(38);
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/180x/test_180x_matrix_mac_ps32.c b/cviruntime/test/180x/test_180x_matrix_mac_ps32.c
new file mode 100644
index 000000000..c09d6701c
--- /dev/null
+++ b/cviruntime/test/180x/test_180x_matrix_mac_ps32.c
@@ -0,0 +1,607 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <assert.h>
+#include <stdint.h>
+
+#include "test_cvikernel_util.h"
+#include "test_native_ref.h"
+
+typedef cvk_tiu_matrix_multiplication_param_t param_t;
+
+typedef struct {
+  cvk_fmt_t left_sign;
+  uint32_t left_row;
+  uint32_t left_col;
+  uint32_t left_c;
+  uint32_t left_w;
+  cvk_fmt_t right_sign;
+  uint32_t right_row;
+  uint32_t right_col;
+  uint32_t right_c;
+  uint32_t right_w;
+  uint32_t lshift_bits;
+  uint32_t rshift_bits;
+  uint32_t relu_enable;
+  uint32_t using_bias;
+  cvk_fmt_t bias_sign;
+} matrix_init_para_t;
+
+matrix_init_para_t matrix_para_t;
+
+static void make_bmk_matrix_param_ps32(cvk_context_t *cvk_ctx, param_t *p, int ps32_mode);
+static param_t param_init(void);
+
+void print_param(param_t *p)
+{
+  printf("ps32_mode =%d\n", p->ps32_mode);
+  printf("left_shape.n =%d\n", p->left->shape.n);
+  printf("left_shape.col =%d\n", p->left->shape.col);
+  printf("left_shape.c =%d\n", p->left->shape.c);
+  printf("left_shape.w =%d\n", p->left->shape.w);
+  printf("left_fmt =%d\n", p->left->fmt);
+  printf("right_shape.n =%d\n", p->right->shape.n);
+  printf("right_shape.col =%d\n", p->right->shape.col);
+  printf("right_shape.c =%d\n", p->right->shape.c);
+  printf("right_shape.w =%d\n", p->right->shape.w);
+  printf("right_fmt =%d\n", p->right->fmt);
+  if (p->bias)
+  {
+    printf("bias_shape.n =%d\n", p->bias->shape.n);
+    printf("bias_shape.col =%d\n", p->bias->shape.col);
+    printf("bias_shape.c =%d\n", p->bias->shape.c);
+    printf("bias_shape.w =%d\n", p->bias->shape.w);
+    printf("bias_fmt =%d\n", p->bias->fmt);
+  }
+  printf("result_shape.n =%d\n", p->res->shape.n);
+  printf("result_shape.col =%d\n", p->res->shape.col);
+  printf("result_shape.c =%d\n", p->res->shape.c);
+  printf("result_shape.w =%d\n", p->res->shape.w);
+  printf("result_fmt =%d\n", p->res->fmt);
+  printf("relu_enable=%d\n", p->relu_enable);
+  printf("rshift_bits=%d\n", p->rshift_bits);
+}
+
+static uint64_t matrix_size(const cvk_ml_t *ml)
+{
+  uint64_t row = ml->shape.n;
+  uint64_t col = ml->shape.col;
+  return row * col;
+}
+
+static uint64_t res_ps32_size(param_t *p)
+{
+  return matrix_size(p->res);
+}
+
+static uint64_t res_size(param_t *p)
+{
+  /* a finalized int8 result is 1 byte per element; otherwise it stays 16-bit (2 bytes) */
+  if (p->res_is_int8 && !p->add_result)
+    return matrix_size(p->res);
+  else
+    return matrix_size(p->res) * 2;
+}
+
+static uint8_t * alloc_left(param_t *p)
+{
+  uint64_t size = matrix_size(p->left);
+  uint8_t *buf = (uint8_t *)malloc(sizeof(uint8_t) * size);
+  if (!buf)
+ return NULL; + + for (uint64_t i = 0; i < size; i++) + buf[i] = i % 13 - 6; + + return buf; +} +static uint16_t * alloc_bias(param_t *p) +{ + if (!p->bias) + return NULL; + + uint64_t size = matrix_size(p->bias) / 2; + + uint16_t *buf = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!buf) + return NULL; + + for (uint64_t i = 0; i < size; i++) + buf[i] = 5 - (i % 7); + + return buf; +} + +static uint8_t * alloc_ps32_res(param_t *p) +{ + uint64_t size = res_ps32_size(p)*4; + uint8_t *buf = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!buf) + return NULL; + + for (uint64_t i = 0; i < size; i++) + buf[i] = 17 - (i % 35); + + return buf; +} + +static void right_shift(param_t *p, int32_t *buf, uint64_t size) +{ + int shift_bits = p->rshift_bits; + int round_up = 1; + if (1) + arith_right_shift(buf, size, shift_bits, round_up); + else + logic_right_shift(buf, size, shift_bits, round_up); +} + +static int ps32_m2_matrix_mac_ref( + param_t *p, + uint8_t *left, + uint8_t *right, + uint8_t *res) +{ + uint64_t size = res_ps32_size(p); + uint32_t left_col = p->left->shape.col; + uint32_t right_col = p->right->shape.col; + uint32_t res_row = p->left->shape.n; + uint32_t res_col = p->res->shape.col; + int left_sign = (p->left->fmt == CVK_FMT_I8); + int right_sign = (p->right->fmt == CVK_FMT_I8); + int ret = 0; + int bstride = res_row * res_col; + + int32_t *tmp_res = (int32_t *)malloc(sizeof(int32_t) * size); + if (!tmp_res) + return -1; + + for (uint32_t i = 0; i < res_row * res_col; i++) + tmp_res[i] = 0; + for (uint32_t row = 0; row < res_row; row++) { + for (uint32_t col = 0; col < res_col; col++) { + for (uint32_t i = 0; i < left_col; i++) { + uint32_t li = row * left_col + i; + uint32_t ri = i * right_col + col; + int32_t l = left_sign? (int8_t)left[li]: left[li]; + int32_t r = right_sign? (int8_t)right[ri]: right[ri]; + tmp_res[row * res_col + col] += l * r; + } + } + } + + for (uint64_t i = 0; i < size; i++) + res[i + bstride*0] = tmp_res[i]>>0; + for (uint64_t i = 0; i < size; i++) + res[i + bstride*1] = tmp_res[i]>>8; + for (uint64_t i = 0; i < size; i++) + res[i + bstride*2] = tmp_res[i]>>16; + for (uint64_t i = 0; i < size; i++) + res[i + bstride*3] = tmp_res[i]>>24; + + free(tmp_res); + + return ret; +} + +static int ps32_m3_matrix_mac_ref( + param_t *p, + uint8_t *left, + uint8_t *right, + uint8_t *res) +{ + uint64_t size = res_ps32_size(p); + uint32_t left_col = p->left->shape.col; + uint32_t right_col = p->right->shape.col; + uint32_t res_row = p->left->shape.n; + uint32_t res_col = p->res->shape.col; + int left_sign = (p->left->fmt == CVK_FMT_I8); + int right_sign = (p->right->fmt == CVK_FMT_I8); + int ret = 0; + int bstride = res_row * res_col; + + uint32_t *tmp_res = (uint32_t *)malloc(sizeof(uint32_t) * size); + if (!tmp_res) + return -1; + + for (uint64_t i = 0; i < size; i++) + tmp_res[i] = res[i + bstride*0]; + for (uint64_t i = 0; i < size; i++) + tmp_res[i] |= res[i + bstride*1]<<8; + for (uint64_t i = 0; i < size; i++) + tmp_res[i] |= res[i + bstride*2]<<16; + for (uint64_t i = 0; i < size; i++) + tmp_res[i] |= res[i + bstride*3]<<24; + + for (uint32_t row = 0; row < res_row; row++) { + for (uint32_t col = 0; col < res_col; col++) { + for (uint32_t i = 0; i < left_col; i++) { + uint32_t li = row * left_col + i; + uint32_t ri = i * right_col + col; + int32_t l = left_sign? (int8_t)left[li]: left[li]; + int32_t r = right_sign? 
(int8_t)right[ri]: right[ri]; + tmp_res[row * res_col + col] += l * r; + } + } + } + + for (uint64_t i = 0; i < size; i++) + res[i + bstride*0] = tmp_res[i]>>0; + for (uint64_t i = 0; i < size; i++) + res[i + bstride*1] = tmp_res[i]>>8; + for (uint64_t i = 0; i < size; i++) + res[i + bstride*2] = tmp_res[i]>>16; + for (uint64_t i = 0; i < size; i++) + res[i + bstride*3] = tmp_res[i]>>24; + + free(tmp_res); + + return ret; +} + +static int ps32_m1_matrix_mac_ref( + param_t *p, + uint8_t *left, + uint8_t *right, + uint16_t * bias, + uint8_t *res) +{ + uint64_t size = res_ps32_size(p); + uint32_t left_col = p->left->shape.col; + uint32_t right_col = p->right->shape.col; + uint32_t res_row = p->left->shape.n; + uint32_t res_col = p->res->shape.col; + int left_sign = (p->left->fmt == CVK_FMT_I8); + int right_sign = (p->right->fmt == CVK_FMT_I8); + int res_sign = (p->res->fmt == CVK_FMT_I8); + int ret = 0; + int bstride = res_row * res_col; + + int32_t *tmp_res = (int32_t *)malloc(sizeof(int32_t) * size); + if (!tmp_res) + return -1; + + for (uint64_t i = 0; i < size; i++) + tmp_res[i] = res[i + bstride*0]; + for (uint64_t i = 0; i < size; i++) + tmp_res[i] |= res[i + bstride*1]<<8; + for (uint64_t i = 0; i < size; i++) + tmp_res[i] |= res[i + bstride*2]<<16; + for (uint64_t i = 0; i < size; i++) + tmp_res[i] |= res[i + bstride*3]<<24; + + for (uint32_t row = 0; row < res_row; row++) { + for (uint32_t col = 0; col < res_col; col++) { + for (uint32_t i = 0; i < left_col; i++) { + uint32_t li = row * left_col + i; + uint32_t ri = i * right_col + col; + int32_t l = left_sign? (int8_t)left[li]: left[li]; + int32_t r = right_sign? (int8_t)right[ri]: right[ri]; + tmp_res[row * res_col + col] += l * r; + } + } + } + + if (p->bias && bias) { + for (uint32_t row = 0; row < res_row; row++) { + for (uint32_t col = 0; col < res_col; col++) { + int bias_sign = (p->bias->fmt == CVK_FMT_I8); + int32_t b = bias_sign? 
(int16_t)bias[col]: bias[col]; + tmp_res[row * res_col + col] += b; + } + } + } + + if (p->relu_enable) + relu(tmp_res, size); + right_shift(p, tmp_res, size); + if (p->res_is_int8) + saturate_to_int8(tmp_res, size, res_sign); + else + saturate_to_int16(tmp_res, size, res_sign); + + for (uint64_t i = 0; i < size; i++) + res[i + bstride*0] = tmp_res[i]>>0; + for (uint64_t i = 0; i < size; i++) + res[i + bstride*1] = tmp_res[i]>>8; + + free(tmp_res); + + return ret; +} + +static void put_bias( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_ml_t *ml, + uint16_t data[]) +{ + uint64_t size = ml->shape.col; + + uint8_t *tmp = (uint8_t *)malloc(sizeof(uint8_t) * size * 2); + if (!tmp) + return; + + for (uint64_t i = 0; i < size; i++) { + tmp[i] = data[i] & 0xff; + tmp[i + size] = (data[i] >> 8) & 0xff; + } + + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, ml, tmp); + + free(tmp); +} + + +static int test_matrix_ps32_ut(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + int ret = 0; + make_bmk_matrix_param_ps32(cvk_ctx, p, 2); + uint8_t *left = alloc_left(p); + uint8_t *right = alloc_right(p); + uint8_t *ref = alloc_ps32_res(p); + if (!left || !right || !ref) { + ret = -1; + goto fail_exit; + } + + { + ret = ps32_m2_matrix_mac_ref(p, left, right, ref); + if (ret) + goto fail_exit; + + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, p->left, left); + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, p->right, right); + cvk_ctx->ops->tiu_matrix_multiplication(cvk_ctx, p); + cvk_ml_t ps32_res; + ps32_res = *p->res; + ps32_res.shape.n *= sizeof(int); + uint8_t *res = matrix_copy_l2g_d2s(rt_handle, cvk_ctx, &ps32_res); + + ret = array_cmp_int8( + "Comparing begin_mode results ...\n", + (int8_t *)ref, (int8_t *)res ,(int)res_ps32_size(p)*sizeof(int)); + if (ret) { + printf("Comparison M2 FAILED\n"); + print_param(p); + }else + printf("Comparison M2 PASS\n"); + free(res); + } + + { + make_bmk_matrix_param_ps32(cvk_ctx, p, 3); + + ret = ps32_m3_matrix_mac_ref(p, left, right, ref); + if (ret) + goto fail_exit; + + cvk_ctx->ops->tiu_matrix_multiplication(cvk_ctx, p); + cvk_ml_t ps32_res; + ps32_res = *p->res; + ps32_res.shape.n *= sizeof(int); + uint8_t *res = matrix_copy_l2g_d2s(rt_handle, cvk_ctx, &ps32_res); + + ret = array_cmp_int8( + "Comparing m3 results ...\n", + (int8_t *)ref, (int8_t *)res ,(int)res_ps32_size(p)*sizeof(int)); + if (ret) { + printf("Comparison M3 FAILED\n"); + print_param(p); + }else + printf("Comparison M3 PASS\n"); + + free(res); + } + { + make_bmk_matrix_param_ps32(cvk_ctx, p, 1); + uint16_t *bias = alloc_bias(p); + + ret = ps32_m1_matrix_mac_ref(p, left, right, bias, ref); + if (ret) + goto fail_exit; + + if(p->bias) + put_bias(rt_handle, cvk_ctx, p->bias, bias); + + cvk_ctx->ops->tiu_matrix_multiplication(cvk_ctx, p); + cvk_ml_t ps32_res; + ps32_res = *p->res; + ps32_res.shape.n *= 2; + + uint8_t *res = matrix_copy_l2g_d2s(rt_handle, cvk_ctx, &ps32_res); + ret = array_cmp_int8( + "Comparing m1 results ...\n", + (int8_t *)ref, (int8_t *)res ,(int)res_size(p)); + if (ret) { + printf("Comparison M1 FAILED\n"); + print_param(p); + }else + printf("Comparison M1 PASS\n"); + + free(res); + free(bias); + } + +fail_exit: + free(left); + free(right); + free(ref); + + return ret; +} + +static void destroy_param(cvk_context_t *cvk_ctx, param_t *p) +{ + if (p->bias) + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->bias); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->res); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->right); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->left); +} + +static 
cvk_fmt_t modify_res_fmt() +{ + cvk_fmt_t fmt = CVK_FMT_U8; + if (matrix_para_t.left_sign == CVK_FMT_I8) + fmt = CVK_FMT_I8; + if (matrix_para_t.right_sign == CVK_FMT_I8) + fmt = CVK_FMT_I8; + if (matrix_para_t.using_bias) + if (matrix_para_t.bias_sign == CVK_FMT_I8) + fmt = CVK_FMT_I8; + +// if (matrix_para_t.relu_enable) +// fmt = CVK_FMT_U8; + + return fmt; +} + +static cvk_ml_t *alloc_param_res( + cvk_context_t *cvk_ctx, param_t *p) +{ + cvk_ml_shape_t s; + s.n = p->left->shape.n; + s.c = p->right->shape.c; + s.w = p->right->shape.w; + s.col = p->right->shape.col; + + cvk_fmt_t fmt = CVK_FMT_U8; + fmt = modify_res_fmt(); + return cvk_ctx->ops->lmem_alloc_ps32_matrix(cvk_ctx, s, fmt, 1); +} + + +static void make_bmk_matrix_param_ps32(cvk_context_t *cvk_ctx, param_t *p, int ps32_mode) +{ + + cvk_ml_shape_t left_shape; + cvk_ml_shape_t right_shape; + + p->ps32_mode = ps32_mode; + p->relu_enable = 0; + p->lshift_bits = 0; + p->rshift_bits = 0; + + if(ps32_mode==2) + { + left_shape.n = matrix_para_t.left_row; + left_shape.c = matrix_para_t.left_c; + left_shape.w = matrix_para_t.left_w; + left_shape.col = matrix_para_t.left_col; + + right_shape.n = matrix_para_t.right_row; + right_shape.c = matrix_para_t.right_c; + right_shape.w = matrix_para_t.right_w; + right_shape.col = matrix_para_t.right_col; + p->left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, matrix_para_t.left_sign , 1); + p->right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, matrix_para_t.right_sign, 1); + p->bias = NULL; + p->res = alloc_param_res(cvk_ctx, p); + }else if(ps32_mode==3) + { + + }else if(ps32_mode==1) + { + p->relu_enable = matrix_para_t.relu_enable; + p->rshift_bits = matrix_para_t.rshift_bits; + if(matrix_para_t.using_bias) + { + right_shape.n = matrix_para_t.right_row; + right_shape.c = matrix_para_t.right_c; + right_shape.w = matrix_para_t.right_w; + right_shape.col = matrix_para_t.right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + p->bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, matrix_para_t.bias_sign, 1); + assert(p->bias); + } + } + +} +static param_t param_init(void) +{ + param_t p; + + //srand(clock()); + + memset(&p, 0, sizeof(param_t)); + memset(&matrix_para_t, 0, sizeof(matrix_init_para_t)); + + matrix_para_t.rshift_bits = rand()%4+2; + matrix_para_t.using_bias = rand()%2; + matrix_para_t.relu_enable = rand()%2; + matrix_para_t.right_sign = rand()%2? CVK_FMT_I8 : CVK_FMT_U8; + matrix_para_t.left_sign = rand()%2? CVK_FMT_I8 : CVK_FMT_U8; + + if(matrix_para_t.using_bias) + matrix_para_t.bias_sign = rand()%2? CVK_FMT_I8 : CVK_FMT_U8; + + if(matrix_para_t.right_sign != CVK_FMT_I8 && matrix_para_t.left_sign != CVK_FMT_I8) + matrix_para_t.relu_enable=0; + + matrix_para_t.left_row = rand()%60+1; + matrix_para_t.left_col = rand()%40+1; + matrix_para_t.left_w = matrix_para_t.left_col/0x10 ? ((uint32_t)rand())%8+8 : matrix_para_t.left_col; + //matrix_para_t.left_w = rand()%16+1; + matrix_para_t.left_c = + matrix_para_t.left_col%matrix_para_t.left_w? + matrix_para_t.left_col/matrix_para_t.left_w+1 : matrix_para_t.left_col/matrix_para_t.left_w; + + matrix_para_t.right_row = matrix_para_t.left_col; + matrix_para_t.right_col = rand()%50+1; + //matrix_para_t.right_w = 16; + matrix_para_t.right_w = rand()%16+1; + matrix_para_t.right_c = + matrix_para_t.right_col%matrix_para_t.right_w? 
+      matrix_para_t.right_col/matrix_para_t.right_w+1 : matrix_para_t.right_col/matrix_para_t.right_w;
+
+  return p;
+}
+
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  for (int i = 0; i < 20; i++) {
+    printf("random_test_matrix_ps32 iteration: %d\n", i);
+    param_t p = param_init();
+
+    ret |= test_matrix_ps32_ut(rt_handle, cvk_ctx, &p);
+    destroy_param(cvk_ctx, &p);
+  }
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/180x/test_180x_matrix_mac_qm.c b/cviruntime/test/180x/test_180x_matrix_mac_qm.c
new file mode 100644
index 000000000..89cf653ae
--- /dev/null
+++ b/cviruntime/test/180x/test_180x_matrix_mac_qm.c
@@ -0,0 +1,829 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "test_cvikernel_util.h"
+#include "test_tf_quant_util.h"
+
+// #define ENABLE_DEBUG_MSG
+
+#define MIN_EXEC_TESTS 20
+
+typedef cvk_tiu_matrix_multiplication_qm_param_t param_t;
+
+typedef struct {
+  int left_row;
+  int left_col;
+  int right_col;
+  int has_bias;
+  int relu_enable;
+  int8_t *input_data;
+  int8_t *filter_data;
+  int8_t *output_data;
+  int32_t *bias_data;
+  uint32_t multiplier;
+  int8_t right_shift;
+  float float_multiplier;
+  int retry_cnt;
+} fc_test_param_t;
+
+void fully_connected_ref(fc_test_param_t *p_param)
+{
+  const int32_t input_offset = 0;
+  const int32_t filter_offset = 0;
+  const int32_t output_offset = 0;
+  const int32_t output_multiplier = p_param->multiplier;
+  const int output_rshift = p_param->right_shift;
+  const int batches = p_param->left_row;
+  const int output_depth = p_param->right_col;
+  const int accum_depth = p_param->left_col;
+  int8_t *input_data = p_param->input_data;
+  int8_t *filter_data = p_param->filter_data;
+  int8_t *output_data = p_param->output_data;
+  int32_t *bias_data = p_param->has_bias ? p_param->bias_data : NULL;
+
+  const int32_t output_activation_min = -128;
+  const int32_t output_activation_max = 127;
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("fully_connected_ref:\n");
+  printf("  batches %d, output_depth %d, accum_depth %d, filter_offset %d, "
+         "input_offset %d\n",
+         batches, output_depth, accum_depth, filter_offset, input_offset);
+  printf("  output_multiplier %d, output_rshift %d\n", output_multiplier,
+         output_rshift);
+#endif
+
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      int32_t acc = 0;
+      for (int d = 0; d < accum_depth; ++d) {
+        int32_t input_val = input_data[b * accum_depth + d];
+        // int32_t filter_val = filter_data[out_c * accum_depth + d];
+        int32_t filter_val = filter_data[output_depth * d + out_c];
+        acc += (filter_val + filter_offset) * (input_val + input_offset);
+
+#ifdef ENABLE_DEBUG_MSG
+        printf("  [%d][%d][%d] acc(%d) += (%d + %d) * (%d + %d) = %d\n", b,
+               out_c, d,
+               acc - (filter_val + filter_offset) * (input_val + input_offset),
+               filter_val, filter_offset, input_val, input_offset, acc);
+#endif
+      }
+      if (bias_data) {
+        acc += bias_data[out_c];
+
+#ifdef ENABLE_DEBUG_MSG
+        printf("  [%d][%d] acc %d, bias %d\n", b, out_c, acc,
+               bias_data ? bias_data[out_c] : 0);
+#endif
+      }
+      acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_rshift);
+
+#ifdef ENABLE_DEBUG_MSG
+      printf("  [%d][%d] acc %d, output_multiplier %d, output_rshift %d\n", b,
+             out_c, acc, output_multiplier, output_rshift);
+#endif
+
+      acc += output_offset;
+
+#ifdef ENABLE_DEBUG_MSG
+      printf("  [%d][%d] acc %d, output_offset %d\n", b, out_c, acc,
+             output_offset);
+#endif
+
+      acc = MAX(acc, output_activation_min);
+      acc = MIN(acc, output_activation_max);
+
+#ifdef ENABLE_DEBUG_MSG
+      printf("  [%d][%d] acc %d, output_activation_min %d, "
+             "output_activation_max %d\n",
+             b, out_c, acc, output_activation_min, output_activation_max);
+#endif
+
+      output_data[out_c + output_depth * b] = acc;
+    }
+  }
+}
+
+void calc_fc_float_multiplier(fc_test_param_t *p_param)
+{
+  const int32_t input_offset = 0;
+  const int32_t filter_offset = 0;
+  const int batches = p_param->left_row;
+  const int output_depth = p_param->right_col;
+  const int accum_depth = p_param->left_col;
+  int8_t *input_data = p_param->input_data;
+  int8_t *filter_data = p_param->filter_data;
+  int32_t *bias_data = p_param->has_bias ? p_param->bias_data : NULL;
+
+  // running min/max of the accumulator, so start from the opposite extremes
+  int output_accu_min = INT_MAX;
+  int output_accu_max = INT_MIN;
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("calc_fc_float_multiplier:\n");
+#endif
+
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      int32_t acc = 0;
+      for (int d = 0; d < accum_depth; ++d) {
+        int32_t input_val = input_data[b * accum_depth + d];
+        // int32_t filter_val = filter_data[out_c * accum_depth + d];
+        int32_t filter_val = filter_data[output_depth * d + out_c];
+        acc += (filter_val + filter_offset) * (input_val + input_offset);
+      }
+      if (bias_data) {
+        acc += bias_data[out_c];
+      }
+
+      output_accu_max = MAX(acc, output_accu_max);
+      output_accu_min = MIN(acc, output_accu_min);
+    }
+  }
+
+  // int8 ranges from -128 to 127, so squeeze the accumulator min/max into
+  // that range as tightly as possible.
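+  // (127 bounds the positive extreme; 128 bounds the negative one, since int8 is asymmetric)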
+ if (abs(output_accu_max) > abs(output_accu_min)) { + p_param->float_multiplier = 127.0f / abs(output_accu_max); + } else { + p_param->float_multiplier = 128.0f / abs(output_accu_min); + } + +#ifdef ENABLE_DEBUG_MSG + printf(" output_accu_min %d, output_accu_max %d, output_multiplier %f\n", + output_accu_min, output_accu_max, p_param->float_multiplier); +#endif + +#ifdef ENABLE_DEBUG_MSG + printf("<= calc_fc_float_multiplier\n"); +#endif +} + +static void put_bias32(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, const cvk_ml_t *ml, + int32_t data[]) +{ + uint64_t size = ml->shape.col; + + uint8_t *tmp = (uint8_t *)malloc(size * 4); + if (!tmp) + return; + + for (uint64_t i = 0; i < size; i++) { + uint32_t val = data[i]; + tmp[i] = val & 0xff; + tmp[i + size] = (val >> 8) & 0xff; + tmp[i + 2 * size] = (val >> 16) & 0xff; + tmp[i + 3 * size] = (val >> 24) & 0xff; + } + + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, ml, tmp); + + free(tmp); +} + +#if 0 +typedef struct { + int32_t input_offset; + int32_t weights_offset; + int32_t output_offset; + int32_t output_multiplier; + int output_rshift; +} FullyConnectedParams; + +int tfl_original_test() +{ + int ret = 0; + + // 2x10 + int8_t input_data[20] = { + 1, 3, 5, 7, 9, 11, 13, 15, -19, -21, + 1, 3, 5, 7, 9, 11, 13, -17, 17, -21}; + + // 3x10 + int8_t filter_data[30] = { + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19}; + + // 1x3 + int32_t bias_data[3] = {4, 8, 12}; + + // 2x3 + int8_t ref_output_data[6] = { + 23, 24, 25, + 57, 58, 59}; + + int8_t output_rshift = 1; // change to right shift + uint32_t output_multiplier = 1073741824; + + int32_t input_offset = 1; + int32_t filter_offset = 1; + int32_t output_offset = 1; // change to right shift + + FullyConnectedParams params; + params.input_offset = input_offset; + params.weights_offset = filter_offset; + params.output_offset = output_offset; + params.output_multiplier = output_multiplier; + params.output_rshift = output_rshift; + + cvk_tl_shape_t input_shape = {2, 10, 1, 1}; + cvk_tl_shape_t filter_shape = {3, 10, 1, 1}; + cvk_tl_shape_t bias_shape = {1, 3, 1, 1}; + cvk_tl_shape_t output_shape = {2, 3, 1, 1}; + + int8_t output_data[6]; + fully_connected_ref(params, input_shape, + input_data, filter_shape, + filter_data, bias_shape, + bias_data, output_shape, + output_data); + for (int i = 0; i < 6; i++) { + if (output_data[i] != ref_output_data[i]) { + printf(" output_data[%d] %d != %d\n", + i, output_data[i], ref_output_data[i]); + ret = -1; + } + } + + return ret; +} +#endif + +int simple_test(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + + // 2x10 + int8_t input_data[20] = {1, 3, 5, 7, 9, 11, 13, 15, -19, -21, + 1, 3, 5, 7, 9, 11, 13, -17, 17, -21}; + +#if 0 + // 3x10 + // tfl use transposed filter + int8_t filter_data_tp[30] = { + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19}; +#endif + + // 10x3 + int8_t filter_data[30] = {1, 1, 1, 3, 3, 3, 5, 5, 5, 7, + 7, 7, 9, 9, 9, 11, 11, 11, 13, 13, + 13, 15, 15, 15, 17, 17, 17, 19, 19, 19}; + + // 1x3 + int32_t bias_data[3] = {4, 8, 12}; + + // 2x3, input/kernel/output zero_point = 0 + int8_t ref_output_data[6] = {-10, -9, -8, 24, 25, 26}; + int8_t output_data[6]; + + int8_t output_rshift = 1; // change to right shift + uint32_t output_multiplier = 1073741824; + + int left_row = 2; + int left_col = 10; + int right_col = 3; + + fc_test_param_t params; + memset(¶ms, 0, sizeof(params)); + params.left_row = 
left_row; + params.left_col = left_col; + params.right_col = right_col; + params.has_bias = 1; + params.relu_enable = 0; + params.input_data = input_data; + params.filter_data = filter_data; + params.output_data = output_data; + params.bias_data = bias_data; + params.multiplier = output_multiplier; + params.right_shift = output_rshift; + fully_connected_ref(¶ms); + +#ifdef ENABLE_DEBUG_MSG + printf("Compare ref and golden\n"); +#endif + for (int i = 0; i < 6; i++) { + if (output_data[i] != ref_output_data[i]) { + printf(" output_data[%d] %d(ref) != %d(golden)\n", i, output_data[i], + ref_output_data[i]); + ret = -1; + } + } + + cvk_ml_shape_t left_shape = + cvk_ctx->ops->ml_default_shape(cvk_ctx, left_row, left_col, CVK_FMT_I8); + + cvk_ml_shape_t right_shape = + cvk_ctx->ops->ml_default_shape(cvk_ctx, left_col, right_col, CVK_FMT_I8); + + cvk_ml_shape_t b_shape = + cvk_ctx->ops->ml_default_shape(cvk_ctx, 4, right_col, CVK_FMT_I8); // 32bit + + cvk_ml_shape_t y_shape = + cvk_ctx->ops->ml_default_shape(cvk_ctx, left_row, right_col, CVK_FMT_I8); + + cvk_ml_t *tl_left = + cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + cvk_ml_t *tl_right = + cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + cvk_ml_t *tl_b = + cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, b_shape, CVK_FMT_I8, 1); + cvk_ml_t *tl_y = + cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, y_shape, CVK_FMT_I8, 1); + + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, tl_left, (uint8_t *)input_data); + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, tl_right, (uint8_t *)filter_data); + put_bias32(rt_handle, cvk_ctx, tl_b, bias_data); + + { + param_t p; + memset(&p, 0, sizeof(p)); + p.left = tl_left; + p.right = tl_right; + p.bias = tl_b; + p.res = tl_y; + p.rshift_bits = output_rshift; + p.res_is_int8 = 1; + p.ps32_mode = 0; + p.quan_m = output_multiplier; + cvk_ctx->ops->tiu_matrix_multiplication_qm(cvk_ctx, &p); + } + + int8_t *tiu_output_data = + (int8_t *)matrix_copy_l2g_d2s(rt_handle, cvk_ctx, tl_y); +#ifdef ENABLE_DEBUG_MSG + printf("Compare tiu and ref\n"); +#endif + for (int i = 0; i < 6; i++) { + if (tiu_output_data[i] != ref_output_data[i]) { + printf(" output_data[%d] %d(tiu) != %d(ref)\n", i, tiu_output_data[i], + ref_output_data[i]); + ret = -1; + } + } + + free(tiu_output_data); + + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, tl_y); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, tl_b); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, tl_right); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, tl_left); + + return ret; +} + +int choose_from_range(int table[], int size, int index) +{ + if (index >= size) { + return 0; + } + + int val = table[index]; + if (index < (size - 1)) { + int range = MAX(table[index + 1] - table[index] - 1, 1); + val += rand() % range; + } + + return val; +} + +bool check_valid_test_param(cvk_context_t *cvk_ctx, fc_test_param_t *p_param) +{ + int left_row = p_param->left_row; + int left_col = p_param->left_col; + int right_col = p_param->right_col; + int has_bias = p_param->has_bias; + + cvk_ml_shape_t tl_input_shape = + cvk_ctx->ops->ml_default_shape(cvk_ctx, left_row, left_col, CVK_FMT_I8); + cvk_ml_stride_t tl_input_stride = + cvk_ctx->ops->ml_default_stride(cvk_ctx, tl_input_shape, CVK_FMT_I8, 1); + + cvk_ml_shape_t tl_filter_shape = + cvk_ctx->ops->ml_default_shape(cvk_ctx, left_col, right_col, CVK_FMT_I8); + cvk_ml_stride_t tl_filter_stride = + cvk_ctx->ops->ml_default_stride(cvk_ctx, tl_filter_shape, CVK_FMT_I8, 1); + + cvk_ml_shape_t tl_output_shape = + cvk_ctx->ops->ml_default_shape(cvk_ctx, 
left_row, right_col, CVK_FMT_I8);
+  cvk_ml_stride_t tl_output_stride =
+      cvk_ctx->ops->ml_default_stride(cvk_ctx, tl_output_shape, CVK_FMT_I8, 1);
+
+  uint32_t bias_size = 0;
+  if (has_bias) {
+    cvk_ml_shape_t tl_bias_shape =
+        cvk_ctx->ops->ml_default_shape(cvk_ctx, 4, right_col, CVK_FMT_I8); // 32bit
+    cvk_ml_stride_t tl_bias_stride =
+        cvk_ctx->ops->ml_default_stride(cvk_ctx, tl_bias_shape, CVK_FMT_I8, 1);
+    bias_size = tl_bias_shape.n * tl_bias_stride.n;
+  }
+
+  uint32_t lmem_size_per_lane = cvk_ctx->info.lmem_size;
+  // uint32_t total_lmem_size = cvk_ctx->info.lmem_size * cvk_ctx->info.npu_num;
+
+  uint32_t needed_size = tl_input_shape.n * tl_input_stride.n +
+                         tl_filter_shape.n * tl_filter_stride.n +
+                         tl_output_shape.n * tl_output_stride.n + bias_size;
+
+  if (needed_size > lmem_size_per_lane) {
+    return false;
+  }
+
+  return true;
+}
+
+void fill_random_data_s8(int8_t *input_data, int size)
+{
+  for (int i = 0; i < size; ++i) {
+    int is_satured = ((rand() % 1000) == 1) ? 1 : 0;
+    int is_sign = rand() % 2 ? 1 : -1;
+
+    // is_sign is +-1, so compare explicitly; only a negative sign saturates low
+    if (is_satured && is_sign == -1) {
+      input_data[i] = -128;
+    } else if (is_satured) {
+      input_data[i] = 127;
+    } else {
+      input_data[i] = is_sign * rand() % 128;
+    }
+  }
+}
+
+void fill_random_data_s32(int32_t *input_data, int size)
+{
+  for (int i = 0; i < size; ++i) {
+    int is_satured = ((rand() % 1000) == 1) ? 1 : 0;
+    int is_sign = rand() % 2 ? 1 : -1;
+
+    if (is_satured && is_sign == -1) {
+      input_data[i] = INT_MIN;
+    } else if (is_satured) {
+      input_data[i] = INT_MAX;
+    } else {
+      input_data[i] = is_sign * rand() % 128;
+    }
+  }
+}
+
+void dump_test_param(fc_test_param_t *p_param, bool dump_content)
+{
+  printf("Dump test parameter:\n");
+  printf("  left_row %d\n", p_param->left_row);
+  printf("  left_col %d\n", p_param->left_col);
+  printf("  right_col %d\n", p_param->right_col);
+  printf("  has_bias %d\n", p_param->has_bias);
+  printf("  multiplier %d\n", p_param->multiplier);
+  printf("  right_shift %d\n", p_param->right_shift);
+
+  if (dump_content) {
+    printf("input_data(%d, %d)\n", p_param->left_row, p_param->left_col);
+    int left_row = p_param->left_row;
+    int left_col = p_param->left_col;
+    for (int i = 0; i < left_row; ++i) {
+      for (int j = 0; j < left_col; ++j) {
+        int offset = i * left_col + j;
+        printf("%d, ", p_param->input_data[offset]);
+      }
+      printf("\n");
+    }
+    printf("\n\n");
+
+    int right_col = p_param->right_col;
+    printf("kernel_data (%d, %d)\n", left_col, right_col);
+    for (int i = 0; i < left_col; ++i) {
+      for (int j = 0; j < right_col; ++j) {
+        int offset = i * right_col + j;
+        printf("%d, ", p_param->filter_data[offset]);
+      }
+      printf("\n");
+    }
+    printf("\n\n");
+
+    if (p_param->has_bias) {
+      for (int i = 0; i < right_col; ++i) {
+        printf("%d, ", p_param->bias_data[i]);
+      }
+      printf("\n\n");
+    }
+  }
+}
+
+int run_compare_fc(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, fc_test_param_t *p_param)
+{
+  int ret = 0;
+
+  int left_row = p_param->left_row;
+  int left_col = p_param->left_col;
+  int right_col = p_param->right_col;
+  int has_bias = p_param->has_bias;
+
+  int input_size = left_row * left_col;
+  int8_t *input_data = (int8_t *)malloc(input_size);
+
+  int kernel_size = left_col * right_col;
+  int8_t *kernel_data = (int8_t *)malloc(kernel_size);
+
+  int output_size = left_row * right_col;
+  int8_t *output_data = (int8_t *)malloc(output_size);
+
+  int32_t *bias_data = (int32_t *) malloc(sizeof(int32_t) * right_col);
+
+  p_param->input_data = input_data;
+  p_param->filter_data = kernel_data;
+  p_param->output_data = output_data;
+  p_param->bias_data = bias_data;
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("    run_compare_fc =>\n");
+  printf("    left (%d, %d), right (%d, %d), has_bias %d\n", left_row,
+         left_col, left_col, right_col, has_bias);
+#endif
+
+  int retry_cnt = p_param->retry_cnt;
+  do {
+    fill_random_data_s8(input_data, input_size);
+    fill_random_data_s8(kernel_data, kernel_size);
+    if (has_bias) {
+      fill_random_data_s32(bias_data, right_col);
+    }
+
+    p_param->float_multiplier = 100.0; // sentinel; a valid multiplier is < 1.0
+    calc_fc_float_multiplier(p_param);
+
+    if (p_param->float_multiplier > 0.f && p_param->float_multiplier < 1.0) {
+      break;
+    }
+
+  } while (--retry_cnt);
+
+  if (p_param->float_multiplier >= 1.0) {
+    printf("    run_compare_fc: unable to find valid multiplier\n");
+    free(input_data);
+    free(kernel_data);
+    free(output_data);
+    free(bias_data);
+    return -1;
+  }
+
+  uint32_t base_multiplier = 0;
+  int base_shift = 0;
+  QuantizeMultiplierSmallerThanOne(p_param->float_multiplier, &base_multiplier,
+                                   &base_shift);
+
+  // multipliers typically range in [2^30 ; 2^31 - 1].
+  // Values in [0, 2^30 - 1] are normally unused, but harmless.
+  // Thus a good way to randomize multipliers is to subtract from them
+  // a random value smaller than 2^30 but still significant compared to it.
+  uint32_t output_multiplier = base_multiplier - (rand() % (1 << 26));
+
+  int right_shift = base_shift - 1 + (rand() % 4);
+  int8_t output_rshift = truncate_rshift((int8_t)right_shift, /*allow_lshift*/1);
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("  multiplier_data %d, shift_data %d\n", output_multiplier,
+         output_rshift);
+#endif
+
+  p_param->multiplier = output_multiplier;
+  p_param->right_shift = output_rshift;
+  fully_connected_ref(p_param);
+
+  cvk_ml_shape_t tl_input_shape =
+      cvk_ctx->ops->ml_default_shape(cvk_ctx, left_row, left_col, CVK_FMT_I8);
+
+  cvk_ml_shape_t tl_filter_shape =
+      cvk_ctx->ops->ml_default_shape(cvk_ctx, left_col, right_col, CVK_FMT_I8);
+
+  cvk_ml_shape_t tl_output_shape =
+      cvk_ctx->ops->ml_default_shape(cvk_ctx, left_row, right_col, CVK_FMT_I8);
+
+  cvk_ml_shape_t tl_bias_shape =
+      cvk_ctx->ops->ml_default_shape(cvk_ctx, 4, right_col, CVK_FMT_I8); // 32bit
+
+  cvk_ml_t *tl_input = cvk_ctx->ops->lmem_alloc_matrix(
+      cvk_ctx, tl_input_shape, CVK_FMT_I8, /*eu_align=*/1);
+  cvk_ml_t *tl_filter = cvk_ctx->ops->lmem_alloc_matrix(
+      cvk_ctx, tl_filter_shape, CVK_FMT_I8, /*eu_align=*/1);
+  cvk_ml_t *tl_output = cvk_ctx->ops->lmem_alloc_matrix(
+      cvk_ctx, tl_output_shape, CVK_FMT_I8, /*eu_align=*/1);
+
+  cvk_ml_t *tl_bias = NULL;
+  if (has_bias) {
+    tl_bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, tl_bias_shape, CVK_FMT_I8,
+                                              /*eu_align=*/1);
+  }
+
+  if (tl_input == NULL) {
+    printf("    fail to alloc tl_input (%d, %d)\n", left_row, left_col);
+    return -1;
+  }
+  if (tl_filter == NULL) {
+    printf("    fail to alloc tl_filter (%d, %d)\n", left_col, right_col);
+    return -1;
+  }
+  if (tl_output == NULL) {
+    printf("    fail to alloc tl_output (%d, %d)\n", left_row, right_col);
+    return -1;
+  }
+  if (has_bias && (tl_bias == NULL)) {
+    printf("    fail to alloc bias (%d, %d)\n", 4, right_col);
+    return -1;
+  }
+
+  matrix_copy_s2d_g2l(rt_handle, cvk_ctx, tl_input, (uint8_t *)input_data);
+  matrix_copy_s2d_g2l(rt_handle, cvk_ctx, tl_filter, (uint8_t *)kernel_data);
+  if (tl_bias) {
+    put_bias32(rt_handle, cvk_ctx, tl_bias, bias_data);
+  }
+
+  {
+    param_t p;
+    memset(&p, 0, sizeof(p));
+    p.left = tl_input;
+    p.right = tl_filter;
+    p.bias = tl_bias;
+    p.res = tl_output;
+    p.rshift_bits = (uint8_t)output_rshift;
+    p.res_is_int8 = 1;
+    p.ps32_mode = 0;
+    p.quan_m = output_multiplier;
+    p.ps32_mode = 0;
+    p.quan_m = output_multiplier;
+    cvk_ctx->ops->tiu_matrix_multiplication_qm(cvk_ctx, &p);
+  }
+
+  int8_t *tiu_output_data =
+      (int8_t *)matrix_copy_l2g_d2s(rt_handle, cvk_ctx, tl_output);
+#ifdef ENABLE_DEBUG_MSG
+  printf("Compare tiu and ref\n");
+#endif
+  for (int i = 0; i < left_row; ++i) {
+    for (int j = 0; j < right_col; ++j) {
+      int offset = i * right_col + j;
+      if (tiu_output_data[offset] != output_data[offset]) {
+        printf("  output_data[%d][%d] %d(tiu) != %d(ref)\n", i, j,
+               tiu_output_data[offset], output_data[offset]);
+        ret = -1;
+      }
+    }
+  }
+
+  if (ret) {
+    dump_test_param(p_param, /*dump_content=*/true);
+  }
+
+  // Free in reverse order of allocation
+  if (tl_bias) {
+    cvk_ctx->ops->lmem_free_matrix(cvk_ctx, tl_bias);
+  }
+
+  cvk_ctx->ops->lmem_free_matrix(cvk_ctx, tl_output);
+  cvk_ctx->ops->lmem_free_matrix(cvk_ctx, tl_filter);
+  cvk_ctx->ops->lmem_free_matrix(cvk_ctx, tl_input);
+
+  free(tiu_output_data);
+
+  free(input_data);
+  free(kernel_data);
+  free(output_data);
+  free(bias_data);
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("  <= run_compare_fc, ret %d\n", ret);
+#endif
+
+  return ret;
+}
+
+int random_test(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx)
+{
+  int ret = 0;
+
+#if 0
+  int left_row_range[] = {1};
+  int left_col_range[] = {1};
+  int right_col_range[] = {1};
+#else
+  int left_row_range[] = {1, 16, 4095};
+  int left_col_range[] = {1, 16, 4095};
+  int right_col_range[] = {1, 16, 4095};
+#endif
+
+  const int left_row_range_size =
+      sizeof(left_row_range) / sizeof(left_row_range[0]);
+  const int left_col_range_size =
+      sizeof(left_col_range) / sizeof(left_col_range[0]);
+  const int right_col_range_size =
+      sizeof(right_col_range) / sizeof(right_col_range[0]);
+
+  int random_seed = clock();
+  srand(random_seed);
+
+  const int retry_test_count = 100;
+  bool stop_at_first_error = true;
+
+  int executed_tests = 0;
+  int failed_tests = 0;
+
+  printf("1822-fc-qm: random test =>\n");
+  for (int m = 0; m < retry_test_count; ++m) {
+    for (int i = 0; i < left_row_range_size; ++i) {
+      // randomly chosen from [range[i], range[i+1]]
+      int left_row = choose_from_range(left_row_range, left_row_range_size, i);
+
+      for (int j = 0; j < left_col_range_size; ++j) {
+        int left_col =
+            choose_from_range(left_col_range, left_col_range_size, j);
+
+        for (int k = 0; k < right_col_range_size; ++k) {
+          int right_col =
+              choose_from_range(right_col_range, right_col_range_size, k);
+
+          int has_bias = rand() % 2;
+
+          fc_test_param_t test_param;
+          memset(&test_param, 0, sizeof(test_param));
+          test_param.left_row = left_row;
+          test_param.left_col = left_col;
+          test_param.right_col = right_col;
+          test_param.has_bias = has_bias;
+          test_param.retry_cnt = 5;
+
+          bool is_valid_param = check_valid_test_param(cvk_ctx, &test_param);
+          if (is_valid_param == false) {
+            continue;
+          }
+
+          int ret2 = run_compare_fc(rt_handle, cvk_ctx, &test_param);
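+          // ret2 reports only this case; failed_tests counts failures while
+          // ret stays sticky so the stop_at_first_error checks below can
+          // unwind all three nested loops.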
+          failed_tests = ret2 ? failed_tests + 1 : failed_tests;
+          ret |= ret2;
+          executed_tests++;
+
+#ifdef ENABLE_DEBUG_MSG
+          printf("  [%d] random test: left(%d, %d), right (%d, %d), result %d\n",
+                 executed_tests, left_row, left_col, left_col,
+                 right_col, ret2);
+#endif
+        }
+
+        // Stop at first error
+        if (ret && stop_at_first_error) {
+          break;
+        }
+      }
+
+      // Stop at first error
+      if (ret && stop_at_first_error) {
+        break;
+      }
+    }
+
+    // Stop at first error
+    if (ret && stop_at_first_error) {
+      break;
+    }
+
+    if (executed_tests >= MIN_EXEC_TESTS) {
+      break;
+    }
+  }
+
+  printf("<= 1822-fc-qm: random test, total %d, failed %d, ret %d\n",
+         executed_tests, failed_tests, ret);
+
+  return ret;
+}
+
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  // ret |= tfl_original_test();
+  ret |= simple_test(rt_handle, cvk_ctx);
+  ret |= random_test(rt_handle, cvk_ctx);
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/180x/test_180x_max_pooling.c b/cviruntime/test/180x/test_180x_max_pooling.c
new file mode 100644
index 000000000..2e22e9275
--- /dev/null
+++ b/cviruntime/test/180x/test_180x_max_pooling.c
@@ -0,0 +1,238 @@
+#include <assert.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "test_cvikernel_util.h"
+#include "test_native_ref.h"
+
+#define INVALIDE_STRIDE (-1)
+typedef cvk_tiu_max_pooling_param_t param_t;
+
+static void print_pooling_param(param_t *p)
+{
+  int in = p->ifmap->shape.n;
+  int ic = p->ifmap->shape.c;
+  int ih = p->ifmap->shape.h;
+  int iw = p->ifmap->shape.w;
+
+  printf("  Pooling parameters:\n");
+  printf("    ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw);
+  printf("    opd0_sign = %d\n", p->ifmap->fmt == CVK_FMT_I8);
+  printf("    weight = (%d, %d)\n", p->kh, p->kw);
+  printf("    padding = (%d, %d, %d, %d)\n",
+         p->pad_top, p->pad_bottom, p->pad_left, p->pad_right);
+  printf("    stride = (%d, %d)\n", p->stride_h, p->stride_w);
+}
+
+static int pooling_ih_ext(param_t *p, int ih)
+{
+  int pad = p->pad_top + p->pad_bottom;
+  return ih + pad;
+}
+
+static int pooling_iw_ext(param_t *p, int iw)
+{
+  int pad = p->pad_left + p->pad_right;
+  return iw + pad;
+}
+
+static int pooling_oh(param_t *p, int ih)
+{
+  int ih_ext = pooling_ih_ext(p, ih);
+  return (ih_ext - p->kh) / p->stride_h + 1;
+}
+
+static int pooling_ow(param_t *p, int iw)
+{
+  int iw_ext = pooling_iw_ext(p, iw);
+  return (iw_ext - p->kw) / p->stride_w + 1;
+}
+
+static int8_t *alloc_input(param_t *p)
+{
+  uint64_t size = tl_shape_size(&p->ifmap->shape, p->ifmap->fmt);
+  int8_t *data = (int8_t *)malloc(size);
+  if (!data)
+    return NULL;
+
+  for (uint64_t i = 0; i < size; i++)
+    data[i] = rand() % 256 - 128;
+  return data;
+}
+
+static int8_t *alloc_output(param_t *p)
+{
+  uint64_t size = tl_shape_size(&p->ofmap->shape, p->ofmap->fmt);
+  return (int8_t *)malloc(size);
+}
+
+static void free_pooling_param(
+    cvk_context_t *cvk_ctx,
+    param_t *r)
+{
+  if (r->ifmap)
+    cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ifmap);
+  if (r->ofmap)
+    cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ofmap);
+}
+
+static param_t random_pooling_param(cvk_context_t *cvk_ctx, int stride_w, int stride_h)
+{
+  srand(clock());
+  param_t p;
+  int retry_cnt = 100;
+
+  for (int i = 0; i < retry_cnt; i++) {
+    int in = rand() % 5 + 1;
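+    // ih/iw get extra headroom (+3/+6, plus the requested stride) so the
+    // kernel below always fits; the checks at the end of this loop reject
+    // pads that overflow the 4-bit hardware field (>= 16), degenerate 1x1
+    // kernels, and shapes that fail local-memory allocation.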
+ int ic = rand() % (3 * cvk_ctx->info.npu_num) + 1; + int ih = rand() % 30 + 3 + (INVALIDE_STRIDE == stride_h ? 0 : stride_h); + int iw = rand() % 30 + 6 + (INVALIDE_STRIDE == stride_w ? 0 : stride_w); + int opd0_sign = rand() % 2; + + memset(&p, 0, sizeof(p)); + p.kh = rand() % 7 + 1; + p.kw = rand() % 7 + 1; + p.stride_h = INVALIDE_STRIDE == stride_h ? rand() % (p.kh) + 1 : stride_h; + p.stride_w = INVALIDE_STRIDE == stride_w ? rand() % (p.kh) + 1 : stride_w; + p.pad_top = rand() % p.kh; + p.pad_bottom = rand() % p.kh; + p.pad_left = rand() % p.kw; + p.pad_right = rand() % p.kw; + + cvk_tl_shape_t ifmap_shape; + ifmap_shape.n = in; + ifmap_shape.c = ic; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + cvk_tl_shape_t ofmap_shape; + ofmap_shape.n = in; + ofmap_shape.c = ic; + ofmap_shape.h = pooling_oh(&p, ih); + ofmap_shape.w = pooling_ow(&p, iw); + + cvk_fmt_t fmt = opd0_sign? CVK_FMT_I8: CVK_FMT_U8; + p.ofmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ofmap_shape, CVK_FMT_I8, 1); + p.ifmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ifmap_shape, fmt, 1); + + if ((p.kh > pooling_ih_ext(&p, ih)) + || (p.kw > pooling_iw_ext(&p, iw)) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || (p.kh * p.kw == 1) + || !p.ofmap || !p.ifmap) { + printf("retry init_pooling_param\n"); + free_pooling_param(cvk_ctx, &p); + } else + break; + } + + return p; +} + +static int compare_results( + param_t *p, + int8_t input[], + int8_t output[]) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int sign = (p->ifmap->fmt == CVK_FMT_I8); + + int8_t *output_ref = alloc_output(p); + int ret = native_pooling_max_int8( + input, output_ref, in, ic, ih, iw, p->kh, p->kw, + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right, + p->stride_h, p->stride_w, 0, 0, 0, 0, sign); + if (ret) + return ret; + + ret = array_cmp_int8( + "Comparing results ...\n", output_ref, output, + tl_shape_size(&p->ofmap->shape, p->ofmap->fmt)); + + if (ret != 0) { + printf("Comparison FAILED!!!\n"); + print_pooling_param(p); + } + + free(output_ref); + + return ret; +} + +static int _test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int stride_w, int stride_h) +{ + param_t param = random_pooling_param(cvk_ctx, stride_w, stride_h); + int8_t *input = alloc_input(¶m); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, param.ifmap, (uint8_t *)input); + cvk_ctx->ops->tiu_max_pooling(cvk_ctx, ¶m); + int8_t *output = (int8_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, param.ofmap); + + int ret = compare_results(¶m, input, output); + + free_pooling_param(cvk_ctx, ¶m); + free(output); + free(input); + + return ret; +} + +static int test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) { + return _test_pooling(rt_handle, cvk_ctx, INVALIDE_STRIDE, INVALIDE_STRIDE); +} + +static int test_max_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + for (uint64_t i = 0; i < 16; i++) + ret |= test_pooling(rt_handle, cvk_ctx); + + // test stride extend (0, 31] + int stride_list[] = {15, 16, 31}; + int stride_list_len = sizeof(stride_list) / sizeof(stride_list[0]); + + for (int stride_w_idx = 0; stride_w_idx < stride_list_len; stride_w_idx++) { + for (int stride_h_idx = 0; stride_h_idx < stride_list_len; stride_h_idx++) { + int stride_w = stride_list[stride_w_idx]; + int stride_h = stride_list[stride_h_idx]; + + ret |= _test_pooling(rt_handle, cvk_ctx, stride_w, stride_h); + } + } + + 
return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret = test_max_pooling(rt_handle, cvk_ctx); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_min_pooling.c b/cviruntime/test/180x/test_180x_min_pooling.c new file mode 100644 index 000000000..4ac76a079 --- /dev/null +++ b/cviruntime/test/180x/test_180x_min_pooling.c @@ -0,0 +1,220 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" +#include "test_native_ref.h" + +typedef cvk_tiu_min_pooling_param_t param_t; + +static void print_pooling_param(param_t *p) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + + printf(" Pooling parameters:\n"); + printf(" ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw); + printf(" opd0_sign = %d\n", p->ifmap->fmt == CVK_FMT_I8); + printf(" weight = (%d, %d)\n", p->kh, p->kw); + printf(" padding = (%d, %d, %d, %d)\n", + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right); + printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w); +} + +static int pooling_ih_ext(param_t *p, int ih) +{ + int pad = p->pad_top + p->pad_bottom; + return ih + pad; +} + +static int pooling_iw_ext(param_t *p, int iw) +{ + int pad = p->pad_left + p->pad_right; + return iw + pad; +} + +static int pooling_oh(param_t *p, int ih) +{ + int ih_ext = pooling_ih_ext(p, ih); + return (ih_ext - p->kh) / p->stride_h + 1; +} + +static int pooling_ow(param_t *p, int iw) +{ + int iw_ext = pooling_iw_ext(p, iw); + return (iw_ext - p->kw) / p->stride_w + 1; +} + +static int8_t *alloc_input(param_t *p) +{ + uint64_t size = tl_shape_size(&p->ifmap->shape, p->ifmap->fmt); + int8_t *data = (int8_t *)malloc(size); + if (!data) + return NULL; + + for (uint64_t i = 0; i < size; i++) + data[i] = rand() % 256 - 128; + return data; +} + +static int8_t *alloc_output(param_t *p) +{ + uint64_t size = tl_shape_size(&p->ofmap->shape, p->ofmap->fmt); + return (int8_t *)malloc(size); +} + +static void free_pooling_param( + cvk_context_t *cvk_ctx, + param_t *r) +{ + if (r->ifmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ifmap); + if (r->ofmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ofmap); +} + +static param_t random_pooling_param(cvk_context_t *cvk_ctx) +{ + srand(clock()); + param_t p; + int retry_cnt = 100; + + for (int i = 0; i < retry_cnt; i++) { + int in = rand() % 5 + 1; + int ic = rand() % (3 * cvk_ctx->info.npu_num) + 1; + int ih = rand() % 30 + 3; + int iw = rand() % 30 + 6; + int opd0_sign = rand() % 2; + + memset(&p, 0, sizeof(p)); + p.kh = rand() % 7 + 1; + p.kw = rand() % 7 + 1; + p.stride_h = rand() % (p.kh) + 1; + p.stride_w = rand() % (p.kw) + 1; + p.pad_top = rand() % p.kh; + p.pad_bottom = rand() % p.kh; + p.pad_left = rand() % p.kw; + p.pad_right = rand() % p.kw; + + cvk_tl_shape_t ifmap_shape; + ifmap_shape.n = in; + ifmap_shape.c = ic; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + cvk_tl_shape_t ofmap_shape; + ofmap_shape.n = in; + ofmap_shape.c = ic; + ofmap_shape.h = pooling_oh(&p, ih); + ofmap_shape.w = pooling_ow(&p, iw); + + cvk_fmt_t fmt = opd0_sign? 
CVK_FMT_I8: CVK_FMT_U8; + p.ofmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ofmap_shape, CVK_FMT_I8, 1); + p.ifmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ifmap_shape, fmt, 1); + + if ((p.kh > pooling_ih_ext(&p, ih)) + || (p.kw > pooling_iw_ext(&p, iw)) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || (p.kh * p.kw == 1) + || !p.ofmap || !p.ifmap) { + printf("retry init_pooling_param\n"); + free_pooling_param(cvk_ctx, &p); + } else + break; + } + + return p; +} + +static int compare_results( + param_t *p, + int8_t input[], + int8_t output[]) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int sign = (p->ifmap->fmt == CVK_FMT_I8); + + int8_t *output_ref = alloc_output(p); + int ret = native_pooling_min_int8( + input, output_ref, in, ic, ih, iw, p->kh, p->kw, + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right, + p->stride_h, p->stride_w, 0, 0, 0, 0, sign); + if (ret) + return ret; + + ret = array_cmp_int8( + "Comparing results ...\n", output_ref, output, + tl_shape_size(&p->ofmap->shape, p->ofmap->fmt)); + + if (ret != 0) { + printf("Comparison FAILED!!!\n"); + print_pooling_param(p); + } + + free(output_ref); + + return ret; +} + +static int test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + param_t param = random_pooling_param(cvk_ctx); + int8_t *input = alloc_input(¶m); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, param.ifmap, (uint8_t *)input); + cvk_ctx->ops->tiu_min_pooling(cvk_ctx, ¶m); + int8_t *output = (int8_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, param.ofmap); + + int ret = compare_results(¶m, input, output); + + free_pooling_param(cvk_ctx, ¶m); + free(output); + free(input); + + return ret; +} + +static int test_min_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + for (uint64_t i = 0; i < 16; i++) + ret |= test_pooling(rt_handle, cvk_ctx); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret = test_min_pooling(rt_handle, cvk_ctx); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_tdma_bf16_matrix_vlc_decompress_compress.c b/cviruntime/test/180x/test_180x_tdma_bf16_matrix_vlc_decompress_compress.c new file mode 100644 index 000000000..e675e277d --- /dev/null +++ b/cviruntime/test/180x/test_180x_tdma_bf16_matrix_vlc_decompress_compress.c @@ -0,0 +1,212 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + + +typedef cvk_tdma_g2l_matrix_copy_decompressed_param_t decompress_param_t; +typedef cvk_tdma_l2g_matrix_copy_compressed_param_t compress_param_t; + +typedef struct{ + decompress_param_t dec_p; + compress_param_t com_p; +} param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => %s\n", + tag, + p->dec_p.dst->shape.n, p->dec_p.dst->shape.c, p->dec_p.dst->shape.w, p->dec_p.dst->shape.col, + (p->dec_p.dst->fmt == CVK_FMT_I8)? 
"signed": "unsigned"); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_mg_shape_t src_shape; + cvk_ml_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 1 }, + { 0, 1, 1, 1 }, + }, + { + { 0, 2 }, + { 0, 1, 2, 2 }, + }, + { + { 0, 2 }, + { 0, 2, 1, 2 }, + }, + { + { 0, 7 }, + { 0, 1, 7, 7 }, + }, +#ifndef ENABEL_SIMPLE_VLC_TEST + { + { 0, 7 }, + { 0, 2, 4, 7 }, + }, + { + { 0, 7 }, + { 0, 7, 1, 7 }, + }, + { + { 0, 17 }, + { 0, 1, 17, 17 }, + }, + { + { 0, 17 }, + { 0, 3, 7, 17 }, + }, + { + { 0, 17 }, + { 0, 17, 1, 17 }, + }, + { + { 0, 60 }, + { 0, 1, 60, 60 }, + }, + { + { 0, 60 }, + { 0, 30, 2, 60 }, + }, + { + { 0, 60 }, + { 0, 60, 1, 60 }, + } +#endif /* ifndef ENABEL_SIMPLE_VLC_TEST*/ +}; + +static void test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p, uint16_t *src_data, + CommandInfo* cmd_info) +{ + print_param(stderr, p); + uint64_t size = ml_shape_size(&p->dec_p.dst->shape, CVK_FMT_I8); + uint64_t bytesize = size * fmt_size(p->dec_p.dst->fmt); + int is_signed = (p->dec_p.dst->fmt == CVK_FMT_I8); + + uint16_t *gmem_data; + size_t bs_size; + size_t data_type = (p->dec_p.dst->fmt == CVK_FMT_BF16) ? 1 : 0; + + gmem_data = (uint16_t* )test_vlc_compress((uint8_t* )src_data, bytesize, is_signed, data_type, &bs_size, cmd_info, NULL); + + //1. send compressed one to gaddr and decompress from gaddr to local + cmpr_matrix_copy_s2d(rt_handle, p->dec_p.src, (uint8_t* ) gmem_data); + cvk_ctx->ops->tdma_g2l_matrix_copy_decompressed(cvk_ctx, &p->dec_p); + CVI_RT_Submit(cvk_ctx); + + //2. decompress from sram + cvk_ctx->ops->tdma_l2g_matrix_copy_compressed(cvk_ctx, &p->com_p); + CVI_RT_Submit(cvk_ctx); + + //3. get final data + uint16_t *dst_data = (uint16_t* )cmpr_matrix_copy_d2s(rt_handle, p->com_p.dst); + + for (uint64_t i = 0; i < bs_size / 2; i++) { + if (dst_data[i] != gmem_data[i]) { + fprintf(stderr, "vlc compress comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], gmem_data[i]); + exit(-1); + } + } + + free(dst_data); + free(gmem_data); +} + +static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + free_cmpr_matrix_dev_mem(rt_handle, p->dec_p.src); + free_cmpr_matrix_dev_mem(rt_handle, p->com_p.dst); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->dec_p.dst); +} + +static void test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + cvk_fmt_t fmts[] = { CVK_FMT_BF16 }; + uint8_t fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + + for (uint32_t row = 1; row < 13; row += 2) { + c->src_shape.row = row; + c->dst_shape.n = row; + for (int dst_align = 0; dst_align < 2; dst_align++) { + for (uint8_t fmt_i = 0; fmt_i < fmts_sz; fmt_i++) { + cvk_fmt_t fmt = fmts[fmt_i]; + param_t p; + memset(&p, 0, sizeof(p)); + //put compressed data to gaddr ->decompress to local -> compress to gaddr + + int is_signed = (fmt == CVK_FMT_I8); + int data_type = (fmt == CVK_FMT_BF16) ? 1 : 0; + + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + cmd_info.signedness = is_signed; + cmd_info.is_bfloat16 = data_type; + cmd_info.bias0 = 127; + + // src_shape, fmt, &cmd_info); + p.dec_p.dst = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, c->dst_shape, fmt, dst_align); + + uint64_t size = ml_shape_size(&p.dec_p.dst->shape, p.dec_p.dst->fmt); + uint16_t *src_data = (uint16_t *)malloc(sizeof(uint16_t) * size); + test_vlc_init_testdata((uint8_t *)src_data, size, fmt == CVK_FMT_I8, fmt == CVK_FMT_BF16); + + assert(p.dec_p.dst); + + //2. 
alloc compress + p.com_p.src = p.dec_p.dst; //cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->lmem_shape, fmt, align); + p.com_p.dst = alloc_cmpr_matrix_dev_mem(rt_handle, c->src_shape, fmt, &cmd_info); + + //3. test: the sequence like below: + //3.1 put compressed data to gaddr + //3.2 decompress to local + //3.3 compress to gaddr + //printf ("row %u is_align %d fmt %d\n", row, dst_align, fmt); + test_param_g2l(rt_handle, cvk_ctx, &p, src_data, &cmd_info); + destroy_param_g2l(rt_handle, cvk_ctx, &p); + free(src_data); + } + } + } +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return 0; +} diff --git a/cviruntime/test/180x/test_180x_tdma_bf16_tensor_vlc_decompress_compress.c b/cviruntime/test/180x/test_180x_tdma_bf16_tensor_vlc_decompress_compress.c new file mode 100644 index 000000000..f6fb63a25 --- /dev/null +++ b/cviruntime/test/180x/test_180x_tdma_bf16_tensor_vlc_decompress_compress.c @@ -0,0 +1,225 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_g2l_tensor_copy_decompressed_param_t decompress_param_t; +typedef cvk_tdma_l2g_tensor_copy_compressed_param_t compress_param_t; + +typedef struct{ + decompress_param_t dec_p; + compress_param_t com_p; +} param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => %d-bit %s\n", + tag, + p->dec_p.dst->shape.n, p->dec_p.dst->shape.c, p->dec_p.dst->shape.h, p->dec_p.dst->shape.w, + p->dec_p.src->bit_length, + (p->dec_p.dst->fmt == CVK_FMT_I8)? "signed": "unsigned"); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_tl_shape_t lmem_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 17, 13 } + }, +#if 0 // No enough local memory for 180x + { + { 3, 39, 17, 23 } + }, + { + { 5, 39, 17, 23 } + }, + { + { 20, 35, 2, 2 } + }, +#endif +#ifndef ENABEL_SIMPLE_VLC_TEST + { + { 1, 1, 1, 1 } + }, + { + { 1, 1, 1, 2 } + }, + { + { 1, 1, 7, 2 } + }, + { + { 1, 1, 10, 60 } + }, + { + { 1, 2, 1, 1 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 3, 16, 1, 1 } + }, + #if 0 // No enough local memory for 180x + { + { 3, 36, 16, 20 } + }, + #endif +#endif /* ifndef ENABEL_SIMPLE_VLC_TEST*/ +}; + +static int test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p, cvk_cmpr_tg_t* dst) +{ + print_param(stderr, p); + int ret = 0; + uint64_t size = tl_shape_size(&p->dec_p.dst->shape, CVK_FMT_I8); + uint64_t bytesize = size * fmt_size(p->dec_p.dst->fmt); + int is_signed = (p->dec_p.dst->fmt == CVK_FMT_I8); + uint16_t *src_data = (uint16_t *)malloc(bytesize); + uint16_t *dst_data = NULL; + uint8_t *gmem_data = NULL; + if (!src_data) { + ret = -1; + goto fail_exit; + } + + test_vlc_init_testdata((uint8_t *)src_data, size, p->dec_p.dst->fmt == CVK_FMT_I8, p->dec_p.dst->fmt == CVK_FMT_BF16); + + size_t total_size; + size_t data_type = (p->dec_p.dst->fmt == CVK_FMT_BF16) ? 
1 : 0;
+  size_t bs_buf_size = get_out_bs_buf_size(bytesize, data_type);
+  gmem_data = (uint8_t *)malloc(bs_buf_size * sizeof(uint8_t));
+  if (!gmem_data) {
+    ret = -1;
+    goto fail_exit;
+  }
+
+  // command info
+  CommandInfo cmd_info;
+  memset(&cmd_info, 0, sizeof(CommandInfo));
+  cmd_info.signedness = is_signed;
+  cmd_info.is_bfloat16 = data_type;
+  cmd_info.bias0 = 127;
+  // TODO: test cmd_info.zero_guard_en = 1 and generate +-inf/+-nan inputs,
+  // see https://en.wikipedia.org/wiki/Bfloat16_floating-point_format
+
+  // Compress on the host into gmem_data as the reference bitstream, upload
+  // it, then let TDMA decompress it into local memory.
+  test_vlc_compress((uint8_t *)src_data, bytesize, is_signed, data_type,
+                    &total_size, &cmd_info, gmem_data);
+  cmpr_tensor_copy_s2d(rt_handle, p->dec_p.src, gmem_data);
+  cvk_ctx->ops->tdma_g2l_tensor_copy_decompressed(cvk_ctx, &p->dec_p);
+  CVI_RT_Submit(cvk_ctx);
+
+  dst->zero_guard_en = cmd_info.zero_guard_en;
+  dst->bias0 = cmd_info.bias0;
+  dst->bias1 = cmd_info.bias1;
+  p->com_p.dst = dst;
+  cvk_ctx->ops->tdma_l2g_tensor_copy_compressed(cvk_ctx, &p->com_p);
+  CVI_RT_Submit(cvk_ctx);
+
+  dst_data = (uint16_t *)cmpr_tensor_copy_d2s(rt_handle, p->com_p.dst);
+  uint16_t *ref_data = (uint16_t *)gmem_data;
+
+  for (uint64_t i = 0; i < total_size / 2; i++) {
+    if (dst_data[i] != ref_data[i]) {
+      fprintf(stderr, "vlc compress comparing failed at dst[%" PRIu64 "], got %d, exp %d\n",
+              i, dst_data[i], ref_data[i]);
+      ret = -1;
+      break;
+    }
+  }
+
+fail_exit:
+  free(src_data);
+  free(gmem_data);
+  free(dst_data);
+
+  return ret;
+}
+
+static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p)
+{
+  free_cmpr_tensor_dev_mem(rt_handle, p->dec_p.src);
+  if (p->com_p.dst)
+    free_cmpr_tensor_dev_mem(rt_handle, p->com_p.dst);
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->dec_p.dst);
+}
+
+static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c)
+{
+  int ret = 0;
+  cvk_fmt_t fmts[] = { CVK_FMT_BF16 };
+  uint8_t fmts_sz = sizeof(fmts) / sizeof(fmts[0]);
+
+  for (int align = 0; align < 2; align++) {
+    for (uint8_t fmt_i = 0; fmt_i < fmts_sz; fmt_i++) {
+      cvk_fmt_t fmt = fmts[fmt_i];
+
+      param_t p;
+      memset(&p, 0, sizeof(p));
+      cvk_tg_shape_t tg_shape =
+          tg_shape_t4(c->lmem_shape.n, c->lmem_shape.c, c->lmem_shape.h, c->lmem_shape.w);
+      p.dec_p.src = alloc_cmpr_tensor_dev_mem(rt_handle, cvk_ctx, tg_shape, fmt, NULL);
+      p.dec_p.dst = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->lmem_shape, fmt, align);
+      if (p.dec_p.src && p.dec_p.dst) {
+        p.com_p.src = p.dec_p.dst;
+        cvk_cmpr_tg_t *dst = alloc_cmpr_tensor_dev_mem(rt_handle, cvk_ctx, tg_shape, fmt, NULL);
+        if (dst)
+          ret |= test_param_g2l(rt_handle, cvk_ctx, &p, dst);
+      } else if (!p.dec_p.src)
+        fprintf(stderr, "allocate tg failed\n");
+      else if (!p.dec_p.dst)
+        fprintf(stderr, "allocate tl shape(%d, %d, %d, %d) failed\n",
+                c->lmem_shape.n, c->lmem_shape.c, c->lmem_shape.h,
+                c->lmem_shape.w);
+      destroy_param_g2l(rt_handle, cvk_ctx, &p);
+    }
+  }
+
+  return ret;
+}
+
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]);
+  for (uint32_t i = 0; i < nr_cases; i++)
+    ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]);
+
+  printf("tdma bf16 tensor vlc test %s\n", ret ?
"fail" : "pass"); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_tdma_g2l_bf16_matrix_vlc_copy_decompressed.c b/cviruntime/test/180x/test_180x_tdma_g2l_bf16_matrix_vlc_copy_decompressed.c new file mode 100644 index 000000000..4f8d8ea70 --- /dev/null +++ b/cviruntime/test/180x/test_180x_tdma_g2l_bf16_matrix_vlc_copy_decompressed.c @@ -0,0 +1,203 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_g2l_matrix_copy_decompressed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->m.shape.row, p->src->m.shape.col, + p->dst->shape.n, p->dst->shape.c, + p->dst->shape.w, p->dst->shape.col); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_mg_shape_t src_shape; + cvk_ml_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 17 }, + { 0, 3, 7, 17 }, + }, + { + { 0, 7 }, + { 0, 7, 1, 7 }, + }, + { + { 0, 60 }, + { 0, 30, 2, 60 }, + }, +#ifndef ENABEL_SIMPLE_VLC_TEST + { + { 0, 1 }, + { 0, 1, 1, 1 }, + }, + { + { 0, 2 }, + { 0, 1, 2, 2 }, + }, + { + { 0, 2 }, + { 0, 2, 1, 2 }, + }, + { + { 0, 7 }, + { 0, 1, 7, 7 }, + }, + { + { 0, 7 }, + { 0, 2, 4, 7 }, + }, + { + { 0, 17 }, + { 0, 1, 17, 17 }, + }, + { + { 0, 17 }, + { 0, 17, 1, 17 }, + }, + { + { 0, 60 }, + { 0, 1, 60, 60 }, + }, + { + { 0, 60 }, + { 0, 60, 1, 60 }, + } +#endif /* ifndef ENABEL_SIMPLE_VLC_TEST*/ +}; + +static void g2l_matrix_copy_ref(param_t *p, uint16_t ref_data[], uint16_t src_data[]) +{ + uint64_t size = ml_shape_size(&p->dst->shape, CVK_FMT_I8); + + for (uint64_t i = 0; i < size; i++) + ref_data[i] = src_data[i]; +} + +static void test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p, uint16_t *src_data, CommandInfo* cmd_info) +{ + print_param(stderr, p); + + uint64_t in_size = ml_shape_size(&p->dst->shape, CVK_FMT_I8); + size_t bs_size = 0; + int is_signed = (p->dst->fmt == CVK_FMT_I8); + size_t data_type = (p->dst->fmt == CVK_FMT_BF16) ? 
1 : 0;
+  uint64_t bytesize = in_size * fmt_size(p->dst->fmt);
+
+  uint8_t *bsbuf = test_vlc_compress((uint8_t *)src_data, bytesize, is_signed, data_type, &bs_size, cmd_info, NULL);
+  cmpr_matrix_copy_s2d(rt_handle, p->src, bsbuf);
+  free(bsbuf);
+
+  cvk_ctx->ops->tdma_g2l_matrix_copy_decompressed(cvk_ctx, p);
+  CVI_RT_Submit(cvk_ctx);
+  uint16_t *dst_data = (uint16_t *)matrix_copy_l2g_d2s(rt_handle, cvk_ctx, p->dst);
+
+  uint16_t *ref_data = (uint16_t *)malloc(sizeof(uint16_t) * in_size);
+  g2l_matrix_copy_ref(p, ref_data, src_data);
+
+  for (uint64_t i = 0; i < in_size; i++) {
+    if (dst_data[i] != ref_data[i]) {
+      fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n",
+              i, dst_data[i], ref_data[i]);
+      exit(-1);
+    }
+  }
+
+  free(dst_data);
+  free(ref_data);
+}
+
+static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p)
+{
+  free_cmpr_matrix_dev_mem(rt_handle, p->src);
+  cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->dst);
+}
+
+static void test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c)
+{
+  cvk_fmt_t fmts[] = { CVK_FMT_BF16 };
+  uint8_t fmts_sz = sizeof(fmts) / sizeof(fmts[0]);
+
+  for (uint32_t row = 1; row < 13; row += 2) {
+    c->src_shape.row = row;
+    c->dst_shape.n = row;
+    for (int mode = 0; mode < VLC_CMP_MODE_MAX; mode++) {
+      for (int dst_align = 0; dst_align < 2; dst_align++) {
+        for (uint8_t fmt_i = 0; fmt_i < fmts_sz; fmt_i++) {
+          cvk_fmt_t fmt = fmts[fmt_i];
+          param_t p;
+          memset(&p, 0, sizeof(p));
+
+          int is_signed = (fmt == CVK_FMT_I8);
+          size_t data_type = (fmt == CVK_FMT_BF16) ? 1 : 0;
+          CommandInfo cmd_info;
+
+          memset(&cmd_info, 0, sizeof(CommandInfo));
+          cmd_info.signedness = is_signed;
+          cmd_info.is_bfloat16 = data_type;
+
+          p.dst = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, c->dst_shape, fmt, dst_align);
+          uint64_t in_size = ml_shape_size(&p.dst->shape, CVK_FMT_I8);
+
+          uint16_t *src_data = (uint16_t *)malloc(sizeof(uint16_t) * in_size);
+          test_vlc_init_testdata((uint8_t *)src_data, in_size, fmt == CVK_FMT_I8, fmt == CVK_FMT_BF16);
+
+          // As in the tensor VLC test, let the compiler path estimate the
+          // compression parameters before allocating the compressed source.
+          if (mode == VLC_CMP_MODE_COMPILER)
+            cvk_vlc_est_weight_bias((uint8_t *)src_data, in_size * sizeof(uint16_t),
+                                    (bool)is_signed, (bool)data_type, &cmd_info);
+
+          p.src = alloc_cmpr_matrix_dev_mem(rt_handle, c->src_shape, fmt, &cmd_info);
+
+          //printf ("row %u mode %d is_align %d fmt %d\n", row, mode, dst_align, fmt);
+          test_param_g2l(rt_handle, cvk_ctx, &p, src_data, &cmd_info);
+
+          free(src_data);
+          destroy_param_g2l(rt_handle, cvk_ctx, &p);
+        }
+      }
+    }
+  }
+}
+
+int main(int argc, char **argv)
+{
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]);
+  for (uint32_t i = 0; i < nr_cases; i++)
+    test_one_case(rt_handle, cvk_ctx, &g_cases[i]);
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return 0;
+}
diff --git a/cviruntime/test/180x/test_180x_tdma_g2l_bf16_tensor_copy_nc_transposed.c b/cviruntime/test/180x/test_180x_tdma_g2l_bf16_tensor_copy_nc_transposed.c
new file mode 100644
index 000000000..e6f9f3ddd
--- /dev/null
+++ b/cviruntime/test/180x/test_180x_tdma_g2l_bf16_tensor_copy_nc_transposed.c
@@ -0,0 +1,310 @@
+#include <assert.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "test_cvikernel_util.h"
+
+typedef cvk_tdma_g2l_tensor_copy_nc_transposed_param_t param_t;
+
+static void __print_param(const char *tag, FILE *f, param_t *p)
+{
+  fprintf(
+      f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n",
+      tag,
+      p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w,
+      p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w);
+}
+
+#define print_param(f, p) __print_param(__func__, f, p)
+
+typedef struct {
cvk_fmt_t src_fmt; + cvk_fmt_t dst_fmt; +} fmt_type_t; + +static fmt_type_t input_fmt[] = { + {CVK_FMT_BF16, CVK_FMT_BF16}, + {CVK_FMT_I8, CVK_FMT_BF16}, + {CVK_FMT_U8, CVK_FMT_BF16}, +}; + +typedef struct { + cvk_tg_shape_t src_shape; + cvk_tl_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 1, 2 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 2, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 7, 2 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 2, 7 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 17, 13 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 13, 17 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 10, 60 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 2, 300 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 3, 200 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 4, 150 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 5, 120 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 60, 10 }, + }, { + { 1, 1, 120, 5 }, + { 1, 1, 120, 5 }, + }, { + { 1, 2, 1, 1 }, + { 2, 1, 1, 1 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 1, 4 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 2, 2 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 4, 1 }, + }, { + { 17, 2, 2, 2 }, + { 2, 17, 2, 2 }, + }, { + { 17, 2, 4, 1 }, + { 2, 17, 4, 1 }, + }, { + { 3, 16, 1, 1 }, + { 16, 3, 1, 1 }, +#if 0 // No enough local memory for 180x + }, { + { 3, 39, 23, 17 }, + { 39, 3, 23, 17 }, + }, { + { 3, 39, 17, 23 }, + { 39, 3, 17, 23 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 16, 20 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 2, 160 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 4, 80 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 8, 40 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 20, 16 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 32, 10 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 64, 5 }, + }, { + { 5, 39, 17, 23 }, + { 39, 5, 17, 23 }, + }, { + { 20, 35, 2, 2 }, + { 35, 20, 2, 2 }, + }, { + { 35, 20, 2, 2 }, + { 20, 35, 2, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 160, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 2, 160 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 4, 80 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 8, 40 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 10, 32 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 20, 16 }, + }, { + { 39, 5, 23, 17 }, + { 5, 39, 23, 17 }, +#endif + } +}; + +static void g2l_tensor_copy_nc_transposed_ref( + param_t *p, uint16_t ref_data[], uint16_t src_data[]) +{ + cvk_tg_shape_t s = p->src->shape; + uint32_t n = s.n; + uint32_t c = s.c; + uint32_t hw = s.h * s.w; + for (uint32_t ni = 0; ni < n; ni++) { + for (uint32_t ci = 0; ci < c; ci++) { + for (uint32_t hwi = 0; hwi < hw; hwi++) { + uint32_t src_i = ni * c * hw + ci * hw + hwi; + uint32_t dst_i = ci * n * hw + ni * hw + hwi; + if(p->src->fmt == CVK_FMT_BF16 && p->dst->fmt == CVK_FMT_BF16) + ref_data[dst_i] = src_data[src_i]; + else { + uint8_t* u8src_data = (uint8_t*)src_data; + uint8_t sign = p->src->fmt == CVK_FMT_I8 ? 
1 : 0; + ref_data[dst_i] = cvk_convert_int8_bf16(u8src_data[src_i], sign); + } + } + } + } +} + +static int test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + print_param(stderr, p); + int ret = 0; + uint64_t size = tl_shape_size(&p->dst->shape, CVK_FMT_I8); + + uint16_t *u16src_data = (uint16_t *)malloc(sizeof(uint16_t) * size); + uint8_t *u8src_data = (uint8_t *)malloc(sizeof(uint16_t) * size); + uint16_t *dst_data = NULL, *ref_data = NULL; + if (!u16src_data || !u8src_data) { + ret = -1; + goto fail_exit; + } + + uint8_t *src_data; + if(p->src->fmt == CVK_FMT_BF16) { + float val = -100; + for(uint64_t i = 0; i < size; i++) { + u16src_data[i] = test_generate_bf16_corner_val(val); + val += 0.1; + } + src_data = (uint8_t*)u16src_data; + } else { + for(uint64_t i = 0; i < size; i++) { + u8src_data[i] = 200 + i; + } + src_data = u8src_data; + } + + tensor_copy_s2d(rt_handle, p->src, (uint8_t*) src_data); + cvk_ctx->ops->tdma_g2l_bf16_tensor_copy_nc_transposed(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + dst_data = (uint16_t *) tensor_copy_l2g_d2s(rt_handle, cvk_ctx, p->dst); + + ref_data = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!dst_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + g2l_tensor_copy_nc_transposed_ref(p, ref_data, (uint16_t*) src_data); + + for (uint64_t i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %x, exp %x\n", + i, dst_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit: + free(u8src_data); + free(u16src_data); + free(dst_data); + free(ref_data); + + return ret; +} + + +static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->dst); + free_tensor_dev_mem(rt_handle, p->src); +} + +static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + int ret = 0; + uint32_t nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (uint32_t i = 0; i < nr_fmt; i++) { + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_tensor_dev_mem(rt_handle, cvk_ctx, c->src_shape, input_fmt[i].src_fmt); + p.dst = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->dst_shape, input_fmt[i].dst_fmt, dst_align); + if (p.src && p.dst) + ret |= test_param_g2l(rt_handle, cvk_ctx, &p); + else if (!p.src) + fprintf(stderr, "allocate tg failed\n"); + else if (!p.dst) + fprintf(stderr, "allocate tl shape(%d, %d, %d, %d) failed\n", + c->dst_shape.n, c->dst_shape.c, c->dst_shape.h, + c->dst_shape.w); + destroy_param_g2l(rt_handle, cvk_ctx, &p); + } + } + + return ret; +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + int ret = 0; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + printf("tdma g2l bf16 tensor copy nc tp test %s\n", ret ? 
"fail" : "pass"); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_tdma_g2l_bf16_tensor_fill_constant.c b/cviruntime/test/180x/test_180x_tdma_g2l_bf16_tensor_fill_constant.c new file mode 100644 index 000000000..d5dc27b71 --- /dev/null +++ b/cviruntime/test/180x/test_180x_tdma_g2l_bf16_tensor_fill_constant.c @@ -0,0 +1,191 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + + +typedef cvk_tdma_g2l_tensor_fill_constant_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: %u => (%u, %u, %u, %u)\n", + tag, p->constant, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + uint16_t constant; + cvk_tl_shape_t dst_shape; +} case_t; + +typedef struct { + cvk_fmt_t src_fmt; + cvk_fmt_t dst_fmt; +} fmt_type_t; + +static fmt_type_t input_fmt[] = { + {CVK_FMT_BF16, CVK_FMT_BF16}, +}; + +static case_t g_cases[] = { + { + 37, { 1, 1, 1, 1 } + }, { + 39, { 1, 1, 1, 2 } + }, { + 23, { 1, 1, 2, 1 } + }, { + 19, { 1, 1, 7, 2 } + }, { + 17, { 1, 1, 2, 7 } + }, { + 13, { 1, 1, 17, 13 } + }, { + 11, { 1, 1, 13, 17 } + }, { + 7, { 1, 1, 10, 60 } + }, { + 9, { 1, 1, 120, 5 } + }, { + 2, { 1, 2, 1, 1 } + }, { + 3, { 1, 1, 1, 2 } + }, { + 5, { 2, 17, 1, 4 } + }, { + 41, { 2, 1, 4, 17 } + }, { + 5, { 2, 17, 1, 4 } + }, { + 9, { 2, 1, 17, 4 } + }, { + 17, { 3, 16, 1, 1 } + }, { + 26, { 3, 1, 2, 8 } +#if 0 // No enough local memory for 180x + }, { + 103, { 3, 39, 17, 23 } + }, { + 255, { 3, 17, 39, 23 } + }, { + 254, { 3, 36, 16, 20 } + }, { + 127, { 3, 18, 1, 640 } + }, { + 128, { 5, 39, 17, 23 } + }, { + 129, { 5, 17, 39, 23 } + }, { + 55, { 20, 35, 2, 2 } + }, { + 1, { 20, 7, 10, 2 } +#endif + } +}; + +static void g2l_tensor_fill_constant_ref(param_t *p, uint16_t ref_data[]) +{ + uint64_t size = tl_shape_size(&p->dst->shape, p->dst->fmt); + + for (uint64_t i = 0; i < size/sizeof(uint16_t); i++) + ref_data[i] = p->constant; +} + +static int test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + print_param(stderr, p); + uint64_t size = tl_shape_size(&p->dst->shape, CVK_FMT_I8); + uint64_t bytesize = size * fmt_size(p->dst->fmt); + int ret = 0; + + cvk_ctx->ops->tdma_g2l_bf16_tensor_fill_constant(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + uint16_t *dst_data = (uint16_t*) tensor_copy_l2g_d2s(rt_handle, cvk_ctx, p->dst); + uint16_t *ref_data = (uint16_t *)malloc(bytesize); + if (!dst_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + g2l_tensor_fill_constant_ref(p, ref_data); + + for (uint64_t i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit: + free(dst_data); + free(ref_data); + + return ret; +} + +static void destroy_param_g2l(cvk_context_t *cvk_ctx, param_t *p) +{ + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->dst); +} + +static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + int ret = 0; + uint32_t nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (uint32_t i = 0; i < nr_fmt; i++) { + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + p.constant = test_generate_bf16_corner_val(c->constant); + p.dst = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->dst_shape, 
input_fmt[i].src_fmt, dst_align); + if (p.dst) + ret |= test_param_g2l(rt_handle, cvk_ctx, &p); + else if (!p.dst) + fprintf(stderr, "allocate tl shape(%d, %d, %d, %d) failed\n", + c->dst_shape.n, c->dst_shape.c, c->dst_shape.h, + c->dst_shape.w); + destroy_param_g2l(cvk_ctx, &p); + } + } + + return ret; +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + int ret = 0; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + printf("tdma g2l bf16 tensor fill const test %s\n", ret ? "fail" : "pass"); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_tdma_g2l_bf16_tensor_vlc_copy_decompressed.c b/cviruntime/test/180x/test_180x_tdma_g2l_bf16_tensor_vlc_copy_decompressed.c new file mode 100644 index 000000000..5b3273524 --- /dev/null +++ b/cviruntime/test/180x/test_180x_tdma_g2l_bf16_tensor_vlc_copy_decompressed.c @@ -0,0 +1,198 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_g2l_tensor_copy_decompressed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => fmt(%d) bias0/1/zero is (%u/%u/%u) %s\n", + tag, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w, + p->dst->fmt, + p->src->bias0, p->src->bias1, p->src->zero_guard_en, + (p->dst->fmt == CVK_FMT_I8)? "signed": "unsigned"); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_tl_shape_t lmem_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 17, 13 } + }, +#if 0 // No enough local memory for 180x + { + { 3, 39, 17, 23 } + }, + + { + { 5, 39, 17, 23 } + }, + { + { 20, 35, 2, 2 } + }, +#endif +#ifndef ENABEL_SIMPLE_VLC_TEST + { + { 1, 1, 1, 1 } + }, + { + { 1, 1, 1, 2 } + }, + { + { 1, 1, 7, 2 } + }, + { + { 1, 1, 10, 60 } + }, + { + { 1, 2, 1, 1 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 3, 16, 1, 1 } + }, + # if 0 // No enough local memory for 180x + { + { 3, 36, 16, 20 } + }, + #endif +#endif /* ifndef ENABEL_SIMPLE_VLC_TEST*/ +}; + +static int test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p, uint16_t *src_data, CommandInfo* cmd_info) +{ + print_param(stderr, p); + uint64_t size = tl_shape_size(&p->dst->shape, CVK_FMT_I8); + uint64_t bytesize = size * fmt_size(p->dst->fmt); + size_t bs_size = 0; + int is_signed = (p->dst->fmt == CVK_FMT_I8); + uint8_t data_type = (p->dst->fmt == CVK_FMT_BF16) ? 
1 : 0; + int ret = 0; + + uint8_t *bsbuf = test_vlc_compress((uint8_t *)src_data, bytesize, is_signed, data_type, &bs_size, cmd_info, NULL); + + uint16_t *ref_data = (uint16_t *)malloc(bytesize); + cvk_vlc_dec_bf16(bsbuf, bytesize, (uint16_t* )ref_data); + + cmpr_tensor_copy_s2d(rt_handle, p->src, bsbuf); + cvk_ctx->ops->tdma_g2l_tensor_copy_decompressed(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + + uint16_t *dst_data = (uint16_t* )tensor_copy_l2g_d2s(rt_handle, cvk_ctx, p->dst); + + for (uint64_t i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "vlc decompress comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + ret = -1; + break; + } + } + + free(bsbuf); + free(dst_data); + free(ref_data); + return ret; +} + +static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + free_cmpr_tensor_dev_mem(rt_handle, p->src); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->dst); +} + +static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + cvk_fmt_t fmts[] = { CVK_FMT_BF16 }; + uint8_t fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + int ret = 0; + + for (int dst_align = 0; dst_align < 2; dst_align++) { + for (int mode = 0; mode < VLC_CMP_MODE_MAX; mode++) { + for (uint8_t fmt_i = 0; fmt_i < fmts_sz; fmt_i++) { + cvk_fmt_t fmt = fmts[fmt_i]; + param_t p; + memset(&p, 0, sizeof(p)); + p.dst = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->lmem_shape, fmt, dst_align); + if (!p.dst) { + fprintf(stderr, "allocate tl shape(%d, %d, %d, %d) failed\n", + c->lmem_shape.n, c->lmem_shape.c, c->lmem_shape.h, + c->lmem_shape.w); + continue; + } + + uint64_t size = tl_shape_size(&p.dst->shape, CVK_FMT_I8); + uint16_t *src_data = (uint16_t *)malloc(sizeof(uint16_t) * size); + test_vlc_init_testdata((uint8_t *)src_data, size, fmt == CVK_FMT_I8, fmt == CVK_FMT_BF16); + + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + int is_signed = (fmt == CVK_FMT_I8); + uint8_t data_type = (fmt == CVK_FMT_BF16) ? 1 : 0; + + cmd_info.signedness = is_signed; + cmd_info.is_bfloat16 = data_type; + + if (mode == VLC_CMP_MODE_COMPILER) { + cvk_vlc_est_weight_bias((uint8_t* )src_data, size * sizeof(uint16_t), (bool)is_signed, (bool)data_type, &cmd_info); + } + + cvk_tg_shape_t tg_shape = + tg_shape_t4(c->lmem_shape.n, c->lmem_shape.c, c->lmem_shape.h, c->lmem_shape.w); + p.src = alloc_cmpr_tensor_dev_mem(rt_handle, cvk_ctx, tg_shape, fmt, &cmd_info); + + ret |= test_param_g2l(rt_handle, cvk_ctx, &p, src_data, &cmd_info); + + free(src_data); + destroy_param_g2l(rt_handle, cvk_ctx, &p); + } + } + } + + return ret; +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + int ret = 0; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + printf("tdma g2l bf16 tensor vlc test %s\n", ret ? 
"fail" : "pass"); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + return 0; +} diff --git a/cviruntime/test/180x/test_180x_tdma_g2l_matrix_copy.c b/cviruntime/test/180x/test_180x_tdma_g2l_matrix_copy.c new file mode 100644 index 000000000..319df190d --- /dev/null +++ b/cviruntime/test/180x/test_180x_tdma_g2l_matrix_copy.c @@ -0,0 +1,182 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_g2l_matrix_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.row, p->src->shape.col, + p->dst->shape.n, p->dst->shape.c, + p->dst->shape.w, p->dst->shape.col); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_mg_shape_t src_shape; + cvk_ml_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 1 }, + { 0, 1, 1, 1 }, + }, { + { 0, 2 }, + { 0, 1, 2, 2 }, + }, { + { 0, 2 }, + { 0, 2, 1, 2 }, + }, { + { 0, 7 }, + { 0, 1, 7, 7 }, + }, { + { 0, 7 }, + { 0, 2, 4, 7 }, + }, { + { 0, 7 }, + { 0, 7, 1, 7 }, + }, { + { 0, 17 }, + { 0, 1, 17, 17 }, + }, { + { 0, 17 }, + { 0, 3, 7, 17 }, + }, { + { 0, 17 }, + { 0, 17, 1, 17 }, + }, { + { 0, 60 }, + { 0, 1, 60, 60 }, + }, { + { 0, 60 }, + { 0, 30, 2, 60 }, + }, { + { 0, 60 }, + { 0, 60, 1, 60 }, + } +}; + +static void g2l_matrix_copy_ref(param_t *p, uint8_t ref_data[], uint8_t src_data[]) +{ + uint64_t size = ml_shape_size(&p->dst->shape, p->dst->fmt); + + for (uint64_t i = 0; i < size; i++) + ref_data[i] = src_data[i]; +} + +static int test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + print_param(stderr, p); + uint64_t size = ml_shape_size(&p->dst->shape, p->dst->fmt); + + uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!src_data) + return -1; + + for (uint64_t i = 0; i < size; i++) + src_data[i] = 200 + i; + + matrix_copy_s2d(rt_handle, p->src, src_data); + + cvk_ctx->ops->tdma_g2l_matrix_copy(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + + uint8_t *dst_data = matrix_copy_l2g_d2s(rt_handle, cvk_ctx, p->dst); + + uint8_t *ref_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!ref_data) + return -1; + + g2l_matrix_copy_ref(p, ref_data, src_data); + + for (uint64_t i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + return -1; + } + } + + free(src_data); + free(dst_data); + free(ref_data); + + return 0; +} + +static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + free_matrix_dev_mem(rt_handle, p->src); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->dst); +} + +static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + int ret = 0; + cvk_fmt_t fmt = CVK_FMT_I8; + + for (uint32_t row = 1; row < 13; row += 2) { + c->src_shape.row = row; + c->dst_shape.n = row; + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_matrix_dev_mem(rt_handle, c->src_shape, fmt); + p.dst = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, c->dst_shape, fmt, dst_align); + ret |= test_param_g2l(rt_handle, cvk_ctx, &p); + destroy_param_g2l(rt_handle, cvk_ctx, &p); + + if (ret) + return ret; + } + } + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + 
return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) { + ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + if (ret) + break; + } + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + if (!ret) + printf("%s pass\n", __FILENAME__); + else + printf("%s fail\n", __FILENAME__); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_tdma_g2l_matrix_copy_row_col_transposed.c b/cviruntime/test/180x/test_180x_tdma_g2l_matrix_copy_row_col_transposed.c new file mode 100644 index 000000000..a14274508 --- /dev/null +++ b/cviruntime/test/180x/test_180x_tdma_g2l_matrix_copy_row_col_transposed.c @@ -0,0 +1,425 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_g2l_matrix_copy_row_col_transposed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.row, p->src->shape.col, + p->dst->shape.n, p->dst->shape.c, + p->dst->shape.w, p->dst->shape.col); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_mg_shape_t src_shape; + cvk_ml_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 2 }, + { 2, 1, 1, 1 }, + }, { + { 1, 7 }, + { 7, 1, 1, 1 }, + }, { + { 1, 17 }, + { 17, 1, 1, 1 }, + }, { + { 1, 60 }, + { 60, 1, 1, 1 }, + }, { + { 1, 139 }, + { 139, 1, 1, 1 }, + }, { + { 2, 1 }, + { 1, 1, 2, 2 }, + }, { + { 2, 1 }, + { 1, 2, 1, 2 }, + }, { + { 2, 2 }, + { 2, 1, 2, 2 }, + }, { + { 2, 2 }, + { 2, 2, 1, 2 }, + }, { + { 2, 7 }, + { 7, 1, 2, 2 }, + }, { + { 2, 7 }, + { 7, 2, 1, 2 }, + }, { + { 2, 17 }, + { 17, 1, 2, 2 }, + }, { + { 2, 17 }, + { 17, 2, 1, 2 }, + }, { + { 2, 60 }, + { 60, 1, 2, 2 }, + }, { + { 2, 60 }, + { 60, 2, 1, 2 }, + }, { + { 2, 139 }, + { 139, 1, 2, 2 }, + }, { + { 2, 139 }, + { 139, 2, 1, 2 }, + }, { + { 7, 1 }, + { 1, 1, 7, 7 }, + }, { + { 7, 1 }, + { 1, 2, 4, 7 }, + }, { + { 7, 1 }, + { 1, 2, 5, 7 }, + }, { + { 7, 1 }, + { 1, 2, 6, 7 }, + }, { + { 7, 1 }, + { 1, 3, 3, 7 }, + }, { + { 7, 1 }, + { 1, 4, 2, 7 }, + }, { + { 7, 1 }, + { 1, 7, 1, 7 }, + }, { + { 7, 2 }, + { 2, 1, 7, 7 }, + }, { + { 7, 2 }, + { 2, 2, 4, 7 }, + }, { + { 7, 2 }, + { 2, 2, 5, 7 }, + }, { + { 7, 2 }, + { 2, 2, 6, 7 }, + }, { + { 7, 2 }, + { 2, 3, 3, 7 }, + }, { + { 7, 2 }, + { 2, 4, 2, 7 }, + }, { + { 7, 2 }, + { 2, 7, 1, 7 }, + }, { + { 7, 7 }, + { 7, 1, 7, 7 }, + }, { + { 7, 7 }, + { 7, 3, 3, 7 }, + }, { + { 7, 7 }, + { 7, 4, 2, 7 }, + }, { + { 7, 7 }, + { 7, 7, 1, 7 }, + }, { + { 7, 17 }, + { 17, 1, 7, 7 }, + }, { + { 7, 17 }, + { 17, 4, 2, 7 }, + }, { + { 7, 17 }, + { 17, 7, 1, 7 }, + }, { + { 7, 60 }, + { 60, 1, 7, 7 }, + }, { + { 7, 60 }, + { 60, 3, 3, 7 }, + }, { + { 7, 60 }, + { 60, 7, 1, 7 }, + }, { + { 7, 139 }, + { 139, 1, 7, 7 }, + }, { + { 7, 139 }, + { 139, 3, 3, 7 }, + }, { + { 7, 139 }, + { 139, 7, 1, 7 }, + }, { + { 43, 1 }, + { 1, 1, 43, 43 }, + }, { + { 43, 1 }, + { 1, 2, 22, 43 }, + }, { + { 43, 1 }, + { 1, 2, 25, 43 }, + }, { + { 43, 1 }, + { 1, 2, 37, 43 }, + }, { + { 43, 1 }, + { 1, 2, 41, 43 }, + }, { + { 43, 1 }, + { 1, 5, 9, 43 }, + }, { + { 43, 1 }, + { 1, 5, 10, 43 }, + }, { + { 43, 1 }, + { 1, 9, 5, 43 }, + }, { + { 43, 1 }, + { 1, 22, 2, 
43 }, + }, { + { 43, 1 }, + { 1, 43, 1, 43 }, + }, { + { 43, 2 }, + { 2, 1, 43, 43 }, + }, { + { 43, 2 }, + { 2, 2, 27, 43 }, + }, { + { 43, 2 }, + { 2, 22, 2, 43 }, + }, { + { 43, 2 }, + { 2, 43, 1, 43 }, + }, { + { 57, 7 }, + { 7, 1, 57, 57 }, + }, { + { 57, 7 }, + { 7, 2, 37, 57 }, + }, { + { 57, 7 }, + { 7, 2, 43, 57 }, + }, { + { 57, 7 }, + { 7, 2, 55, 57 }, + }, { + { 57, 7 }, + { 7, 2, 56, 57 }, + }, { + { 57, 7 }, + { 7, 7, 9, 57 }, + }, { + { 57, 7 }, + { 7, 8, 8, 57 }, + }, { + { 57, 7 }, + { 7, 29, 2, 57 }, + }, { + { 57, 7 }, + { 7, 57, 1, 57 }, + }, { + { 67, 17 }, + { 17, 1, 67, 67 }, + }, { + { 67, 17 }, + { 17, 2, 34, 67 }, + }, { + { 67, 17 }, + { 17, 2, 49, 67 }, + }, { + { 67, 17 }, + { 17, 2, 66, 67 }, + }, { + { 67, 17 }, + { 17, 6, 12, 67 }, + }, { + { 67, 17 }, + { 17, 6, 13, 67 }, + }, { + { 67, 17 }, + { 17, 17, 4, 67 }, + }, { + { 67, 17 }, + { 17, 34, 2, 67 }, + }, { + { 67, 17 }, + { 17, 67, 1, 67 }, + }, { + { 129, 139 }, + { 139, 1, 129, 129 }, + }, { + { 129, 139 }, + { 139, 2, 65, 129 }, + }, { + { 129, 139 }, + { 139, 2, 80, 129 }, + }, { + { 129, 139 }, + { 139, 2, 120, 129 }, + }, { + { 129, 139 }, + { 139, 2, 128, 129 }, + }, { + { 129, 139 }, + { 139, 3, 43, 129 }, + }, { + { 129, 139 }, + { 139, 3, 47, 129 }, + }, { + { 129, 139 }, + { 139, 3, 59, 129 }, + }, { + { 129, 139 }, + { 139, 3, 64, 129 }, + }, { + { 129, 139 }, + { 139, 7, 19, 129 }, + }, { + { 129, 139 }, + { 139, 7, 20, 129 }, + }, { + { 129, 139 }, + { 139, 7, 21, 129 }, +#if 0 // Not enough lmem size for 1810 + }, { + { 129, 139 }, + { 139, 43, 3, 129 }, + }, { + { 129, 139 }, + { 139, 65, 2, 129 }, +#endif + } +// out of lmem size +// , { +// { 129, 139 }, +// { 139, 129, 1, 129 }, +// } +}; + +static void g2l_matrix_copy_row_col_transposed_ref( + param_t *p, uint8_t ref_data[], uint8_t src_data[]) +{ + uint64_t row = p->src->shape.row; + uint64_t col = p->src->shape.col; + + for (uint64_t ri = 0; ri < row; ri++) { + for (uint64_t ci = 0; ci < col; ci++) { + uint64_t src_i = ri * col + ci; + uint64_t dst_i = ci * row + ri; + ref_data[dst_i] = src_data[src_i]; + } + } +} + +static int test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + print_param(stderr, p); + int ret = 0; + uint64_t size = ml_shape_size(&p->dst->shape, p->dst->fmt); + + uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!src_data) + return -1; + + for (uint64_t i = 0; i < size; i++) + src_data[i] = 200 + i; + + matrix_copy_s2d(rt_handle, p->src, src_data); + cvk_ctx->ops->tdma_g2l_matrix_copy_row_col_transposed(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + uint8_t *dst_data = matrix_copy_l2g_d2s(rt_handle, cvk_ctx, p->dst); + + uint8_t *ref_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!dst_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + g2l_matrix_copy_row_col_transposed_ref(p, ref_data, src_data); + + for (uint64_t i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit: + free(src_data); + free(dst_data); + free(ref_data); + + return ret; +} + + +static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + free_matrix_dev_mem(rt_handle, p->src); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->dst); +} + +static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + int ret = 0; + param_t p; + /* + * Matrix transpose must be n/c stride alignment 
+ * for TDMA limitation + */ + int dst_align = 1; + cvk_fmt_t fmt = CVK_FMT_I8; + + memset(&p, 0, sizeof(p)); + + p.src = alloc_matrix_dev_mem(rt_handle, c->src_shape, fmt); + p.dst = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, c->dst_shape, fmt, dst_align); + ret = test_param_g2l(rt_handle, cvk_ctx, &p); + destroy_param_g2l(rt_handle, cvk_ctx, &p); + + return ret; +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + int ret = 0; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_tdma_g2l_matrix_vlc_copy_decompressed.c b/cviruntime/test/180x/test_180x_tdma_g2l_matrix_vlc_copy_decompressed.c new file mode 100644 index 000000000..87d3337da --- /dev/null +++ b/cviruntime/test/180x/test_180x_tdma_g2l_matrix_vlc_copy_decompressed.c @@ -0,0 +1,202 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_g2l_matrix_copy_decompressed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->m.shape.row, p->src->m.shape.col, + p->dst->shape.n, p->dst->shape.c, + p->dst->shape.w, p->dst->shape.col); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_mg_shape_t src_shape; + cvk_ml_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 17 }, + { 0, 3, 7, 17 }, + }, + { + { 0, 7 }, + { 0, 7, 1, 7 }, + }, + { + { 0, 60 }, + { 0, 30, 2, 60 }, + }, +#ifndef ENABEL_SIMPLE_VLC_TEST + { + { 0, 1 }, + { 0, 1, 1, 1 }, + }, + { + { 0, 2 }, + { 0, 1, 2, 2 }, + }, + { + { 0, 2 }, + { 0, 2, 1, 2 }, + }, + { + { 0, 7 }, + { 0, 1, 7, 7 }, + }, + { + { 0, 7 }, + { 0, 2, 4, 7 }, + }, + { + { 0, 17 }, + { 0, 1, 17, 17 }, + }, + { + { 0, 17 }, + { 0, 17, 1, 17 }, + }, + { + { 0, 60 }, + { 0, 1, 60, 60 }, + }, + { + { 0, 60 }, + { 0, 60, 1, 60 }, + } +#endif /* ifndef ENABEL_SIMPLE_VLC_TEST*/ +}; + +static void g2l_matrix_copy_ref(param_t *p, uint8_t ref_data[], uint8_t src_data[]) +{ + uint64_t size = ml_shape_size(&p->dst->shape, p->dst->fmt); + + for (uint64_t i = 0; i < size; i++) + ref_data[i] = src_data[i]; +} + +static void test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p, uint8_t *src_data, CommandInfo* cmd_info) +{ + print_param(stderr, p); + + uint64_t in_size = ml_shape_size(&p->dst->shape, p->dst->fmt); + size_t bs_size = 0; + int is_signed = (p->dst->fmt == CVK_FMT_I8); + size_t data_type = (p->dst->fmt == CVK_FMT_BF16) ? 
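+  /*
+   * data_type selects the VLC payload coding: 0 for int8 data, 1 for
+   * bf16. What follows is a full round trip of one buffer:
+   *
+   *   host:  bsbuf = test_vlc_compress(src, n, is_signed, data_type, ...)
+   *   s2d:   cmpr_matrix_copy_s2d()               -- bitstream to DRAM
+   *   tdma:  tdma_g2l_matrix_copy_decompressed()  -- decompress to lmem
+   *   d2s:   matrix_copy_l2g_d2s()                -- read lmem back
+   *
+   * and the readback is expected to be byte-identical to src_data.
+   */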
1 : 0;
+
+  uint8_t *bsbuf = test_vlc_compress(src_data, in_size, is_signed, data_type, &bs_size, cmd_info, NULL);
+  cmpr_matrix_copy_s2d(rt_handle, p->src, bsbuf);
+  free(bsbuf);
+
+  cvk_ctx->ops->tdma_g2l_matrix_copy_decompressed(cvk_ctx, p);
+  CVI_RT_Submit(cvk_ctx);
+  uint8_t *dst_data = matrix_copy_l2g_d2s(rt_handle, cvk_ctx, p->dst);
+
+  uint8_t *ref_data = (uint8_t *)malloc(sizeof(uint8_t) * in_size);
+  g2l_matrix_copy_ref(p, ref_data, src_data);
+
+  for (uint64_t i = 0; i < in_size; i++) {
+    if (dst_data[i] != ref_data[i]) {
+      fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n",
+              i, dst_data[i], ref_data[i]);
+      exit(-1);
+    }
+  }
+
+  free(dst_data);
+  free(ref_data);
+}
+
+static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p)
+{
+  free_cmpr_matrix_dev_mem(rt_handle, p->src);
+  cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->dst);
+}
+
+static void test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c)
+{
+  cvk_fmt_t fmts[] = { CVK_FMT_I8, CVK_FMT_U8 };
+  uint8_t fmts_sz = sizeof(fmts) / sizeof(fmts[0]);
+
+  for (uint32_t row = 1; row < 13; row += 2) {
+    c->src_shape.row = row;
+    c->dst_shape.n = row;
+    for (int mode = 0; mode < VLC_CMP_MODE_MAX; mode++) {
+      for (int dst_align = 0; dst_align < 2; dst_align++) {
+        for (uint8_t fmt_i = 0; fmt_i < fmts_sz; fmt_i++) {
+          cvk_fmt_t fmt = fmts[fmt_i];
+          param_t p;
+
+          memset(&p, 0, sizeof(p));
+
+          int is_signed = (fmt == CVK_FMT_I8);
+          size_t data_type = (fmt == CVK_FMT_BF16) ? 1 : 0;
+          CommandInfo cmd_info;
+
+          memset(&cmd_info, 0, sizeof(CommandInfo));
+          cmd_info.signedness = is_signed;
+
+          p.dst = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, c->dst_shape, fmt, dst_align);
+          if (!p.dst) {
+            fprintf(stderr, "allocate ml failed\n");
+            continue;
+          }
+          uint64_t in_size = ml_shape_size(&p.dst->shape, p.dst->fmt);
+
+          uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * in_size);
+          test_vlc_init_testdata(src_data, in_size, fmt == CVK_FMT_I8, fmt == CVK_FMT_BF16);
+
+          if (mode == VLC_CMP_MODE_COMPILER)
+            cvk_vlc_est_weight_bias(src_data, in_size, is_signed, data_type, &cmd_info);
+
+          /* assumed helper, mirroring alloc_cmpr_tensor_dev_mem() in the tensor variant */
+          p.src = alloc_cmpr_matrix_dev_mem(rt_handle, c->src_shape, fmt, &cmd_info);
+
+          //printf ("row %u mode %d is_align %d fmt %d\n", row, mode, dst_align, fmt);
+          test_param_g2l(rt_handle, cvk_ctx, &p, src_data, &cmd_info);
+
+          free(src_data);
+          destroy_param_g2l(rt_handle, cvk_ctx, &p);
+        }
+      }
+    }
+  }
+}
+
+int main(int argc, char **argv)
+{
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]);
+  for (uint32_t i = 0; i < nr_cases; i++)
+    test_one_case(rt_handle, cvk_ctx, &g_cases[i]);
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return 0;
+}
diff --git a/cviruntime/test/180x/test_180x_tdma_g2l_tensor_copy.c b/cviruntime/test/180x/test_180x_tdma_g2l_tensor_copy.c
new file mode 100644
index 000000000..2cefdf7d1
--- /dev/null
+++ b/cviruntime/test/180x/test_180x_tdma_g2l_tensor_copy.c
@@ -0,0 +1,176 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+#include "test_cvikernel_util.h"
+
+typedef cvk_tdma_g2l_tensor_copy_param_t param_t;
+
+static void __print_param(const char *tag, FILE *f, param_t *p)
+{
+  fprintf(
+      f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n",
+      tag,
+      p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w,
+      p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w);
+}
+
+#define print_param(f, p) __print_param(__func__, f, p)
+
+typedef struct {
+  cvk_tg_shape_t src_shape;
+  cvk_tl_shape_t dst_shape;
+} case_t;
+
+static case_t g_cases[] = {
+  {
+    { 1, 1, 1, 1 },
+    { 1, 1, 1, 1 },
+  }, {
+    { 1, 1, 1, 2 },
+    { 1, 1, 2, 1 },
+  }, {
+    { 1, 1, 7,
2 }, + { 1, 1, 2, 7 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 13, 17 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 120, 5 }, + }, { + { 1, 2, 1, 1 }, + { 1, 1, 1, 2 }, + }, { + { 2, 17, 1, 4 }, + { 2, 1, 4, 17 }, + }, { + { 2, 17, 1, 4 }, + { 2, 1, 17, 4 }, + }, { + { 3, 16, 1, 1 }, + { 3, 1, 2, 8 }, + }, { + { 3, 39, 17, 23 }, + { 3, 17, 39, 23 }, + }, { + { 3, 36, 16, 20 }, + { 3, 18, 1, 640 }, + }, +#if 0 // for 180x + { + { 5, 39, 17, 23 }, + { 5, 17, 39, 23 }, + }, +#endif + { + { 20, 35, 2, 2 }, + { 20, 7, 10, 2 }, + } +}; + +static void g2l_tensor_copy_ref(param_t *p, uint8_t ref_data[], uint8_t src_data[]) +{ + uint64_t size = tl_shape_size(&p->dst->shape, p->dst->fmt); + + for (uint64_t i = 0; i < size; i++) + ref_data[i] = src_data[i]; +} + +static int test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + int ret = 0; + print_param(stderr, p); + uint64_t size = tl_shape_size(&p->dst->shape, p->dst->fmt); + + uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!src_data) + return -1; + + for (uint64_t i = 0; i < size; i++) + src_data[i] = 200 + i; + + tensor_copy_s2d(rt_handle, p->src, src_data); + + cvk_ctx->ops->tdma_g2l_tensor_copy(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + + uint8_t *dst_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, p->dst); + + uint8_t *ref_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + g2l_tensor_copy_ref(p, ref_data, src_data); + + for (uint64_t i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + ret = -1; + break; + } + } + + free(src_data); + free(dst_data); + free(ref_data); + + return ret; +} + +static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + free_tensor_dev_mem(rt_handle, p->src); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->dst); +} + +static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + int ret = 0; + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_tensor_dev_mem(rt_handle, cvk_ctx, c->src_shape, CVK_FMT_I8); + p.dst = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->dst_shape, CVK_FMT_I8, dst_align); + if (p.src && p.dst) + ret |= test_param_g2l(rt_handle, cvk_ctx, &p); + else if (!p.src) + fprintf(stderr, "allocate tg failed\n"); + else if (!p.dst) + fprintf(stderr, "allocate tl shape(%d, %d, %d, %d) failed\n", + c->dst_shape.n, c->dst_shape.c, c->dst_shape.h, + c->dst_shape.w); + destroy_param_g2l(rt_handle, cvk_ctx, &p); + } + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + printf("tdma g2l tensor copy test %s\n", ret ? 
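+  /*
+   * Note on g_cases: a plain tdma_g2l_tensor_copy apparently only needs
+   * source and destination to describe the same element count, so
+   * (1, 1, 10, 60) may land in local memory as (1, 1, 120, 5). A cheap
+   * guard when adding entries (sketch, using the helpers this file
+   * already links against):
+   *
+   *   assert(tg_shape_size(&c->src_shape, CVK_FMT_I8) ==
+   *          tl_shape_size(&c->dst_shape, CVK_FMT_I8));
+   */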
"fail" : "pass"); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_tdma_g2l_tensor_copy_chw_rotated.c b/cviruntime/test/180x/test_180x_tdma_g2l_tensor_copy_chw_rotated.c new file mode 100644 index 000000000..8de7a5103 --- /dev/null +++ b/cviruntime/test/180x/test_180x_tdma_g2l_tensor_copy_chw_rotated.c @@ -0,0 +1,206 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_g2l_tensor_copy_chw_rotated_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.h, p->src->shape.w, p->src->shape.c, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_tg_shape_t src_shape; + cvk_tl_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 3, 1, 1 }, // nchw for neuron + { 1, 3, 1, 1 }, // nchw for neuron + }, { + { 1, 4, 1, 1 }, + { 1, 4, 1, 1 }, + }, { + { 1, 3, 1, 7 }, + { 1, 3, 1, 7 }, + }, { + { 1, 4, 1, 7 }, + { 1, 4, 1, 7 }, + }, { + { 1, 3, 1, 17 }, + { 1, 3, 1, 17 }, + }, { + { 1, 4, 1, 17 }, + { 1, 4, 1, 17 }, + }, { + { 1, 3, 2, 1 }, + { 1, 3, 2, 1 }, + }, { + { 1, 4, 2, 1 }, + { 1, 4, 2, 1 }, + }, { + { 2, 3, 17, 1 }, + { 2, 3, 17, 1 }, + }, { + { 2, 4, 17, 1 }, + { 2, 4, 17, 1 }, + }, { + { 2, 3, 17, 3 }, + { 2, 3, 17, 3 }, + }, { + { 2, 4, 17, 3 }, + { 2, 4, 17, 3 }, + }, { + { 3, 3, 16, 7 }, + { 3, 3, 16, 7 }, + }, { + { 3, 4, 16, 7 }, + { 3, 4, 16, 7 }, + }, { + { 3, 3, 39, 17 }, + { 3, 3, 39, 17 }, + }, { + { 3, 4, 39, 17 }, + { 3, 4, 39, 17 }, + }, { + { 3, 3, 36, 16 }, + { 3, 3, 36, 16 }, + }, { + { 3, 4, 36, 16 }, + { 3, 4, 36, 16 }, + }, { + { 5, 3, 39, 17 }, + { 5, 3, 39, 17 }, + }, { + { 5, 4, 39, 17 }, + { 5, 4, 39, 17 }, + }, { + { 20, 3, 35, 2 }, + { 20, 3, 35, 2 }, + }, { + { 20, 4, 35, 2 }, + { 20, 4, 35, 2 }, + }, { + { 20, 3, 35, 3 }, + { 20, 3, 35, 3 }, + }, { + { 20, 4, 35, 3 }, + { 20, 4, 35, 3 }, + } +}; + +static void g2l_tensor_copy_chw_rotated_ref( + param_t *p, uint8_t ref_data[], uint8_t src_data[]) +{ + cvk_tg_shape_t s = p->src->shape; + // change nhwc -> nchw by HW design automatically + uint32_t n = s.n; + uint32_t c = s.h; + uint32_t h = s.w; + uint32_t w = s.c; + for (uint32_t ni = 0; ni < n; ni++) { + for (uint32_t ci = 0; ci < c; ci++) { + for (uint32_t hi = 0; hi < h; hi++) { + for (uint32_t wi = 0; wi < w; wi++) { + uint32_t src_i = ni * c * h * w + ci * h * w + hi * w + wi; + uint32_t dst_i = ni * w * c * h + wi * c * h + ci * h + hi; + ref_data[dst_i] = src_data[src_i]; + } + } + } + } +} + +static void test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + print_param(stderr, p); + uint64_t size = tg_shape_size(&p->src->shape, p->src->fmt); + uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!src_data) + return; + + for (uint64_t i = 0; i < size; i++) + src_data[i] = 200 + i; + + tensor_copy_s2d(rt_handle, p->src, src_data); + cvk_ctx->ops->tdma_g2l_tensor_copy_chw_rotated(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + uint8_t *dst_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, p->dst); + + uint8_t *ref_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!dst_data || !ref_data) + goto fail_exit; + + g2l_tensor_copy_chw_rotated_ref(p, ref_data, src_data); + + for (uint64_t i = 0; i < size; i++) { + if (dst_data[i] != 
ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + +fail_exit: + free(src_data); + free(dst_data); + free(ref_data); +} + +static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + free_tensor_dev_mem(rt_handle, p->src); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_tensor_dev_mem(rt_handle, cvk_ctx, c->src_shape, CVK_FMT_I8); + p.dst = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->dst_shape, CVK_FMT_I8, 1); + test_param_g2l(rt_handle, cvk_ctx, &p); + destroy_param_g2l(rt_handle, cvk_ctx, &p); + +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return 0; +} diff --git a/cviruntime/test/180x/test_180x_tdma_g2l_tensor_copy_nc_transposed.c b/cviruntime/test/180x/test_180x_tdma_g2l_tensor_copy_nc_transposed.c new file mode 100644 index 000000000..33bcede13 --- /dev/null +++ b/cviruntime/test/180x/test_180x_tdma_g2l_tensor_copy_nc_transposed.c @@ -0,0 +1,280 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_g2l_tensor_copy_nc_transposed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_tg_shape_t src_shape; + cvk_tl_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 1, 2 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 2, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 7, 2 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 2, 7 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 17, 13 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 13, 17 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 10, 60 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 2, 300 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 3, 200 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 4, 150 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 5, 120 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 60, 10 }, + }, { + { 1, 1, 120, 5 }, + { 1, 1, 120, 5 }, + }, { + { 1, 2, 1, 1 }, + { 2, 1, 1, 1 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 1, 4 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 2, 2 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 4, 1 }, + }, { + { 17, 2, 2, 2 }, + { 2, 17, 2, 2 }, + }, { + { 17, 2, 4, 1 }, + { 2, 17, 4, 1 }, + }, { + { 3, 16, 1, 1 }, + { 16, 3, 1, 1 }, + }, { + { 3, 39, 23, 17 }, + { 39, 3, 23, 17 }, + }, { + { 3, 39, 17, 23 }, + { 39, 3, 17, 23 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 16, 20 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 2, 160 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 4, 80 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 8, 40 }, + }, { + { 3, 36, 16, 
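+  /*
+   * Each pair below is (global src shape) -> (local dst shape): the nc
+   * transpose swaps n and c, while h*w may be re-factored freely
+   * (16*20 = 320 = 2*160 = 4*80 = ...) since the h/w axes stay
+   * contiguous. A new entry should keep dst = (src.c, src.n, h', w')
+   * with h' * w' == src.h * src.w.
+   */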
20 }, + { 36, 3, 20, 16 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 32, 10 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 64, 5 }, + }, +#if 0 // for 180x + { + { 5, 39, 17, 23 }, + { 39, 5, 17, 23 }, + }, +#endif + { + { 20, 35, 2, 2 }, + { 35, 20, 2, 2 }, + }, { + { 35, 20, 2, 2 }, + { 20, 35, 2, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 160, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 2, 160 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 4, 80 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 8, 40 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 10, 32 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 20, 16 }, + } +#if 0 // for 180x + , { + { 39, 5, 23, 17 }, + { 5, 39, 23, 17 }, + } +#endif +}; + +static void g2l_tensor_copy_nc_transposed_ref( + param_t *p, uint8_t ref_data[], uint8_t src_data[]) +{ + cvk_tg_shape_t s = p->src->shape; + uint32_t n = s.n; + uint32_t c = s.c; + uint32_t hw = s.h * s.w; + + for (uint32_t ni = 0; ni < n; ni++) { + for (uint32_t ci = 0; ci < c; ci++) { + for (uint32_t hwi = 0; hwi < hw; hwi++) { + uint32_t src_i = ni * c * hw + ci * hw + hwi; + uint32_t dst_i = ci * n * hw + ni * hw + hwi; + ref_data[dst_i] = src_data[src_i]; + } + } + } +} + +static int test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + print_param(stderr, p); + int ret = 0; + uint64_t size = tl_shape_size(&p->dst->shape, p->dst->fmt); + + uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!src_data) + return -1; + + for (uint64_t i = 0; i < size; i++) + src_data[i] = 200 + i; + + tensor_copy_s2d(rt_handle, p->src, src_data); + cvk_ctx->ops->tdma_g2l_tensor_copy_nc_transposed(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + uint8_t *dst_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, p->dst); + + uint8_t *ref_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!dst_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + g2l_tensor_copy_nc_transposed_ref(p, ref_data, src_data); + + for (uint64_t i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit: + free(src_data); + free(dst_data); + free(ref_data); + + return ret; +} + + +static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->dst); + free_tensor_dev_mem(rt_handle, p->src); +} + +static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + int ret = 0; + + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_tensor_dev_mem(rt_handle, cvk_ctx, c->src_shape, CVK_FMT_I8); + p.dst = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->dst_shape, CVK_FMT_I8, dst_align); + if (p.src && p.dst) + ret |= test_param_g2l(rt_handle, cvk_ctx, &p); + else if (!p.src) + fprintf(stderr, "allocate tg failed\n"); + else if (!p.dst) + fprintf(stderr, "allocate tl shape(%d, %d, %d, %d) failed\n", + c->dst_shape.n, c->dst_shape.c, c->dst_shape.h, + c->dst_shape.w); + destroy_param_g2l(rt_handle, cvk_ctx, &p); + } + + return ret; +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + int ret = 0; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / 
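+  /*
+   * sizeof(array) / sizeof(array[0]) is the usual compile-time element
+   * count; a shared macro in test_cvikernel_util.h could replace the
+   * many copies of this idiom (hypothetical helper, not in the tree):
+   *
+   *   #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
+   */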
sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + printf("tdma g2l tensor copy nc tp test %s\n", ret ? "fail" : "pass"); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_tdma_g2l_tensor_vlc_copy_decompressed.c b/cviruntime/test/180x/test_180x_tdma_g2l_tensor_vlc_copy_decompressed.c new file mode 100644 index 000000000..8e0b0109c --- /dev/null +++ b/cviruntime/test/180x/test_180x_tdma_g2l_tensor_vlc_copy_decompressed.c @@ -0,0 +1,195 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_g2l_tensor_copy_decompressed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => fmt(%d) bias0/1/zero is (%u/%u/%u) %s\n", + tag, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w, + p->dst->fmt, + p->src->bias0, p->src->bias1, p->src->zero_guard_en, + (p->dst->fmt == CVK_FMT_I8)? "signed": "unsigned"); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_tl_shape_t lmem_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 17, 13 } + }, + { + { 3, 39, 17, 23 } + }, +#if 0 // No enough local memory for 180x + { + { 5, 39, 17, 23 } + }, +#endif + { + { 20, 35, 2, 2 } + }, +#ifndef ENABEL_SIMPLE_VLC_TEST + { + { 1, 1, 1, 1 } + }, + { + { 1, 1, 1, 2 } + }, + { + { 1, 1, 7, 2 } + }, + { + { 1, 1, 10, 60 } + }, + { + { 1, 2, 1, 1 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 3, 16, 1, 1 } + }, + { + { 3, 36, 16, 20 } + }, +#endif /* ifndef ENABEL_SIMPLE_VLC_TEST*/ +}; + +static void g2l_tensor_copy_vlc_decompressed_ref( + uint8_t ref_data[], uint64_t ref_size, uint8_t src_data[]) +{ + cvk_vlc_dec_int8(src_data, ref_size, ref_data); +} + +static int test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p, uint8_t *src_data, CommandInfo* cmd_info) +{ + print_param(stderr, p); + uint64_t size = tl_shape_size(&p->dst->shape, p->dst->fmt); + size_t bs_size = 0; + int is_signed = (p->dst->fmt == CVK_FMT_I8); + uint8_t data_type = (p->dst->fmt == CVK_FMT_BF16) ? 
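+  /*
+   * As in the matrix variant, data_type picks int8 (0) vs bf16 (1) VLC
+   * coding; every case below uses CVK_FMT_I8/U8 for dst, so this
+   * evaluates to 0 here. The compressed-tensor descriptor carries
+   * bias0/bias1/zero_guard_en (printed above), and CommandInfo supplies
+   * signedness plus, in VLC_CMP_MODE_COMPILER, estimates from
+   * cvk_vlc_est_weight_bias().
+   */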
1 : 0; + int ret = 0; + + uint8_t *bsbuf = test_vlc_compress(src_data, size, is_signed, data_type, &bs_size, cmd_info, NULL); + + cmpr_tensor_copy_s2d(rt_handle, p->src, bsbuf); + cvk_ctx->ops->tdma_g2l_tensor_copy_decompressed(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + + uint8_t *dst_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, p->dst); + uint8_t *ref_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + g2l_tensor_copy_vlc_decompressed_ref(ref_data, size, bsbuf); + for (uint64_t i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "vlc decompress comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + ret = -1; + break; + } + } + free(bsbuf); + free(dst_data); + free(ref_data); + return ret; +} + +static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + free_cmpr_tensor_dev_mem(rt_handle, p->src); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->dst); +} + +static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + cvk_fmt_t fmts[] = { CVK_FMT_I8, CVK_FMT_U8 }; + int ret = 0; + + for (int dst_align = 0; dst_align < 2; dst_align++) { + for (int mode = 0; mode < VLC_CMP_MODE_MAX; mode++) { + for (uint8_t fmt_i = 0; fmt_i < 2; fmt_i++) { + cvk_fmt_t fmt = fmts[fmt_i]; + param_t p; + memset(&p, 0, sizeof(p)); + p.dst = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->lmem_shape, fmt, dst_align); + if (!p.dst) { + fprintf(stderr, "allocate tl shape(%d, %d, %d, %d) failed\n", + c->lmem_shape.n, c->lmem_shape.c, c->lmem_shape.h, + c->lmem_shape.w); + continue; + } + + uint64_t size = tl_shape_size(&p.dst->shape, p.dst->fmt); + uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + test_vlc_init_testdata(src_data, size, fmt == CVK_FMT_I8, fmt == CVK_FMT_BF16); + + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + int is_signed = (fmt == CVK_FMT_I8); + uint8_t data_type = (fmt == CVK_FMT_BF16) ? 1 : 0; + + cmd_info.signedness = is_signed; + + if (mode == VLC_CMP_MODE_COMPILER) { + cvk_vlc_est_weight_bias(src_data, size, (bool)is_signed, (bool)data_type, &cmd_info); + } + + cvk_tg_shape_t tg_shape = + tg_shape_t4(c->lmem_shape.n, c->lmem_shape.c, c->lmem_shape.h, c->lmem_shape.w); + p.src = alloc_cmpr_tensor_dev_mem(rt_handle, cvk_ctx, tg_shape, fmt, &cmd_info); + + ret |= test_param_g2l(rt_handle, cvk_ctx, &p, src_data, &cmd_info); + + free(src_data); + destroy_param_g2l(rt_handle, cvk_ctx, &p); + } + } + } + return ret; +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + int ret = 0; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + printf("tdma g2l tensor vlc test %s\n", ret ? 
"fail" : "pass"); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return 0; +} diff --git a/cviruntime/test/180x/test_180x_tdma_l2g_bf16_matrix_vlc_copy_compressed.c b/cviruntime/test/180x/test_180x_tdma_l2g_bf16_matrix_vlc_copy_compressed.c new file mode 100644 index 000000000..e9a00907d --- /dev/null +++ b/cviruntime/test/180x/test_180x_tdma_l2g_bf16_matrix_vlc_copy_compressed.c @@ -0,0 +1,186 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_l2g_matrix_copy_compressed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.w, p->src->shape.col, + p->dst->m.shape.row, p->dst->m.shape.col); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_ml_shape_t src_shape; + cvk_mg_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 2, 4, 7 }, + { 0, 7 }, + }, + { + { 0, 1, 2, 2 }, + { 1, 2 }, + }, + { + { 0, 3, 7, 17 }, + { 0, 17 }, + }, + { + { 0, 17, 1, 17 }, + { 0, 17 }, + }, + { + { 0, 60, 1, 60 }, + { 0, 60 }, + }, +#ifndef ENABEL_SIMPLE_VLC_TEST + { + { 0, 1, 1, 1 }, + { 0, 1 }, + }, + { + { 0, 2, 1, 2 }, + { 1, 2 }, + }, + { + { 0, 1, 7, 7 }, + { 0, 7 }, + }, + { + { 0, 7, 1, 7 }, + { 0, 7 }, + }, + { + { 0, 1, 17, 17 }, + { 0, 17 }, + }, + { + { 0, 1, 60, 60 }, + { 0, 60 }, + }, + { + { 0, 30, 2, 60 }, + { 0, 60 }, + }, +#endif /* ifndef ENABEL_SIMPLE_VLC_TEST*/ +}; + +static void test_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p, uint16_t* src_data, CommandInfo * cmd_info) +{ + print_param(stderr, p); + uint64_t size = ml_shape_size(&p->src->shape, CVK_FMT_I8); + uint64_t bytesize = size * fmt_size(p->src->fmt); + + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, p->src, (uint8_t*)src_data); + cvk_ctx->ops->tdma_l2g_matrix_copy_compressed(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + + int is_signed = (p->src->fmt == CVK_FMT_I8); + int data_type = (p->src->fmt == CVK_FMT_BF16) ? 1 : 0; + size_t bs_size; + + uint16_t *ref_data = (uint16_t* )test_vlc_compress((uint8_t* )src_data, bytesize, is_signed, data_type, &bs_size, cmd_info, NULL); + uint16_t *dst_data = (uint16_t* )cmpr_matrix_copy_d2s(rt_handle, p->dst); + + // ops->lmem_free_matrix(cvk_ctx, p->src); + free_cmpr_matrix_dev_mem(rt_handle, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + cvk_fmt_t fmts[] = { CVK_FMT_BF16 }; + uint8_t fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + + for (uint32_t row = 1; row < 13; row += 2) { + c->src_shape.n = row; + c->dst_shape.row = row; + for (int src_align = 0; src_align < 2; src_align++) { + for (uint8_t fmt_i = 0; fmt_i < fmts_sz; fmt_i++) { + cvk_fmt_t fmt = fmts[fmt_i]; + param_t p; + memset(&p, 0, sizeof(p)); + p.src = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, c->src_shape, fmt, src_align); + + uint64_t size = ml_shape_size(&p.src->shape, CVK_FMT_I8); + uint16_t *src_data = (uint16_t *)malloc(sizeof(uint16_t) * size); + test_vlc_init_testdata((uint8_t *)src_data, size, fmt == CVK_FMT_I8, fmt == CVK_FMT_BF16); + + //size_t bs_size; + CommandInfo cmd_info; + int is_signed = (p.src->fmt == CVK_FMT_I8); + int data_type = (p.src->fmt == CVK_FMT_BF16) ? 
1 : 0; + + memset(&cmd_info, 0, sizeof(CommandInfo)); + cmd_info.signedness = is_signed; + cmd_info.is_bfloat16 = data_type; + cmd_info.bias0 = 127; + + // fmt, &bs_size, &cmd_info); + + // dst_shape, p.src->fmt, &cmd_info); + + test_param_l2g(rt_handle, cvk_ctx, &p, src_data, &cmd_info); + destroy_param_l2g(rt_handle, cvk_ctx, &p); + free(src_data); + } + } + } +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return 0; +} diff --git a/cviruntime/test/180x/test_180x_tdma_l2g_bf16_tensor_copy_nc_transposed.c b/cviruntime/test/180x/test_180x_tdma_l2g_bf16_tensor_copy_nc_transposed.c new file mode 100644 index 000000000..f871c2292 --- /dev/null +++ b/cviruntime/test/180x/test_180x_tdma_l2g_bf16_tensor_copy_nc_transposed.c @@ -0,0 +1,315 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + + +typedef cvk_tdma_l2g_tensor_copy_nc_transposed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_fmt_t src_fmt; + cvk_fmt_t dst_fmt; +} fmt_type_t; + +static fmt_type_t input_fmt[] = { + {CVK_FMT_BF16, CVK_FMT_BF16}, + {CVK_FMT_BF16, CVK_FMT_I8}, + {CVK_FMT_BF16, CVK_FMT_U8}, +}; + +typedef struct { + cvk_tl_shape_t src_shape; + cvk_tg_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 1, 2 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 2, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 7, 2 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 2, 7 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 17, 13 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 13, 17 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 10, 60 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 2, 300 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 3, 200 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 4, 150 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 5, 120 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 60, 10 }, + }, { + { 1, 1, 120, 5 }, + { 1, 1, 120, 5 }, + }, { + { 1, 2, 1, 1 }, + { 2, 1, 1, 1 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 1, 4 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 2, 2 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 4, 1 }, + }, { + { 17, 2, 2, 2 }, + { 2, 17, 2, 2 }, + }, { + { 17, 2, 4, 1 }, + { 2, 17, 4, 1 }, + }, { + { 3, 16, 1, 1 }, + { 16, 3, 1, 1 }, +#if 0 // No enough local memory for 180x + }, { + { 3, 39, 23, 17 }, + { 39, 3, 23, 17 }, + }, { + { 3, 39, 17, 23 }, + { 39, 3, 17, 23 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 16, 20 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 2, 160 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 4, 80 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 8, 40 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 20, 16 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 32, 10 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 64, 5 }, + }, { + { 5, 39, 17, 23 }, + { 
39, 5, 17, 23 }, + }, { + { 20, 35, 2, 2 }, + { 35, 20, 2, 2 }, + }, { + { 35, 20, 2, 2 }, + { 20, 35, 2, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 160, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 2, 160 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 4, 80 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 8, 40 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 10, 32 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 20, 16 }, + }, { + { 39, 5, 23, 17 }, + { 5, 39, 23, 17 }, +#endif + } +}; + +static void l2g_tensor_copy_nc_transposed_ref( + param_t *p, uint16_t ref_data[], uint16_t src_data[]) +{ + cvk_tl_shape_t s = p->src->shape; + uint32_t n = s.n; + uint32_t c = s.c; + uint32_t hw = s.h * s.w; + + for (uint32_t ni = 0; ni < n; ni++) { + for (uint32_t ci = 0; ci < c; ci++) { + for (uint32_t hwi = 0; hwi < hw; hwi++) { + uint32_t src_i = ni * c * hw + ci * hw + hwi; + uint32_t dst_i = ci * n * hw + ni * hw + hwi; + if(p->src->fmt == CVK_FMT_BF16 && p->dst->fmt == CVK_FMT_BF16) + ref_data[dst_i] = src_data[src_i]; + else if (p->src->fmt == CVK_FMT_BF16 && (p->dst->fmt == CVK_FMT_I8 || p->dst->fmt == CVK_FMT_U8)) { + uint8_t sign = p->dst->fmt == CVK_FMT_I8 ? 1 : 0; + uint8_t val = sign ? (uint8_t) cvk_convert_bf16_s8(src_data[src_i]) : (uint8_t)cvk_convert_bf16_u8(src_data[src_i]); + ref_data[dst_i] = val; + } else if(p->dst->fmt == p->src->fmt){ //i8->i8 + ref_data[dst_i] = src_data[src_i]; + } else { + fprintf(stderr, "Error src_fmt_type (%d) or dst_fmttype (%d)", p->src->fmt, p->dst->fmt); + exit(-1); + } + } + } + } +} + +static int test_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + print_param(stderr, p); + int ret = 0; + uint64_t size = tl_shape_size(&p->src->shape, CVK_FMT_I8); + + uint16_t *src_data = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!src_data) + return -1; + + float val = -100; + for (uint64_t i = 0; i < size; i++) { + src_data[i] = test_generate_bf16_corner_val(val); + val += 0.1; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, p->src, (uint8_t *)src_data); + cvk_ctx->ops->tdma_l2g_bf16_tensor_copy_nc_transposed(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + + uint16_t *dst_data = (uint16_t*) tensor_copy_d2s(rt_handle, p->dst); + uint16_t *ref_data = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!dst_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + l2g_tensor_copy_nc_transposed_ref(p, ref_data, src_data); + + if(p->dst->fmt == CVK_FMT_BF16 && p->src->fmt == CVK_FMT_BF16) { + for (uint64_t i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + ret = -1; + break; + } + } + } else if(p->dst->fmt == CVK_FMT_U8 || p->dst->fmt == CVK_FMT_I8) { + for (uint64_t i = 0; i < size; i++) { + uint32_t shift = (i%2)*8; + if ((uint8_t)(dst_data[i/2] >> shift) != (uint8_t)ref_data[i]) { + fprintf(stderr, "comparing (bf16->i8/uint8_t) failed at dst[%" PRIu64 "], got %x, exp %x\n", + i,(uint8_t) (dst_data[i/2] >> shift) , ref_data[i]); + ret = -1; + } + } + } else { + fprintf(stderr, "Error compreing type src_fmt_type (%d) or dst_fmttype (%d)", p->src->fmt, p->dst->fmt); + ret = -1; + } + +fail_exit: + free(src_data); + free(dst_data); + free(ref_data); + + return ret; +} + +static void destroy_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + free_tensor_dev_mem(rt_handle, p->dst); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->src); +} + +static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + 
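+  /*
+   * Context for the comparison loops above: when dst is I8/U8 the
+   * readback buffer is still addressed as uint16_t, so consecutive
+   * result bytes sit two-per-halfword (little endian) and are unpacked
+   * with
+   *
+   *   uint8_t b = (uint8_t)(dst_data[i / 2] >> ((i % 2) * 8));
+   *
+   * before being checked against ref_data[i].
+   */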
int ret = 0;
+  uint32_t nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]);
+  for (uint32_t i = 0; i < nr_fmt; i++) {
+    for (int src_align = 0; src_align < 2; src_align++) {
+      param_t p;
+      memset(&p, 0, sizeof(p));
+
+      p.src = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->src_shape, input_fmt[i].src_fmt, src_align);
+      p.dst = alloc_tensor_dev_mem(rt_handle, cvk_ctx, c->dst_shape, input_fmt[i].dst_fmt);
+      if (p.src && p.dst)
+        ret |= test_param_l2g(rt_handle, cvk_ctx, &p);
+      else if (!p.src)
+        fprintf(stderr, "allocate tl shape(%d, %d, %d, %d) failed\n",
+                c->src_shape.n, c->src_shape.c, c->src_shape.h,
+                c->src_shape.w);
+      else if (!p.dst)
+        fprintf(stderr, "allocate tg failed\n");
+      destroy_param_l2g(rt_handle, cvk_ctx, &p);
+    }
+  }
+  return ret;
+}
+
+int main(int argc, char **argv)
+{
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+  int ret = 0;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]);
+  for (uint32_t i = 0; i < nr_cases; i++)
+    ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]);
+
+  printf("tdma l2g bf16 tensor copy nc tp test %s\n", ret ? "fail" : "pass");
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/180x/test_180x_tdma_l2g_bf16_tensor_fill_constant.c b/cviruntime/test/180x/test_180x_tdma_l2g_bf16_tensor_fill_constant.c
new file mode 100644
index 000000000..4825d0ede
--- /dev/null
+++ b/cviruntime/test/180x/test_180x_tdma_l2g_bf16_tensor_fill_constant.c
@@ -0,0 +1,166 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+
+#include "test_cvikernel_util.h"
+
+typedef cvk_tdma_l2g_tensor_fill_constant_param_t param_t;
+
+static void __print_param(const char *tag, FILE *f, param_t *p)
+{
+  fprintf(
+      f, "%s: %u => (%u, %u, %u, %u)\n",
+      tag, p->constant,
+      p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w);
+}
+
+#define print_param(f, p) __print_param(__func__, f, p)
+
+typedef struct {
+  float constant;
+  cvk_tg_shape_t dst_shape;
+} case_t;
+
+typedef struct {
+  cvk_fmt_t src_fmt;
+  cvk_fmt_t dst_fmt;
+} fmt_type_t;
+
+static fmt_type_t input_fmt[] = {
+  {CVK_FMT_BF16, CVK_FMT_BF16},
+};
+
+static case_t g_cases[] = {
+  {
+    37, { 1, 1, 1, 1 }
+  }, {
+    39, { 1, 1, 1, 2 }
+  }, {
+    23, { 1, 1, 2, 1 }
+  }, {
+    19, { 1, 1, 7, 2 }
+  }, {
+    17, { 1, 1, 2, 7 }
+  }, {
+    13, { 1, 1, 17, 13 }
+  }, {
+    11, { 1, 1, 13, 17 }
+  }, {
+    7, { 1, 1, 10, 60 }
+  }, {
+    9, { 1, 1, 120, 5 }
+  }, {
+    2, { 1, 2, 1, 1 }
+  }, {
+    3, { 1, 1, 1, 2 }
+  }, {
+    5, { 2, 17, 1, 4 }
+  }, {
+    41, { 2, 1, 4, 17 }
+  }, {
+    5, { 2, 17, 1, 4 }
+  }, {
+    9, { 2, 1, 17, 4 }
+  }, {
+    17, { 3, 16, 1, 1 }
+  }, {
+    26, { 3, 1, 2, 8 }
+  }, {
+    103, { 3, 39, 17, 23 }
+  }, {
+    255, { 3, 17, 39, 23 }
+  }, {
+    254, { 3, 36, 16, 20 }
+  }, {
+    127, { 3, 18, 1, 640 }
+  }, {
+    128, { 5, 39, 17, 23 }
+  }, {
+    129, { 5, 17, 39, 23 }
+  }, {
+    55, { 20, 35, 2, 2 }
+  }, {
+    1, { 20, 7, 10, 2 }
+  }
+};
+
+static void l2g_tensor_fill_constant_ref(param_t *p, uint16_t ref_data[])
+{
+  uint64_t size = tg_shape_size(&p->dst->shape, p->dst->fmt);
+  printf("constant = 0x%x\n", p->constant);
+  for (uint64_t i = 0; i < size / fmt_size(p->dst->fmt); i++)
+    ref_data[i] = p->constant;
+}
+
+static void test_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p)
+{
+  print_param(stderr, p);
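+  /*
+   * p->constant already holds the bf16 bit pattern produced by
+   * test_generate_bf16_corner_val(). A bf16 value is the high half of
+   * the float32 encoding, roughly:
+   *
+   *   uint32_t u; memcpy(&u, &f, 4);
+   *   uint16_t bf16 = (uint16_t)(u >> 16);  // truncating sketch
+   *
+   * (truncation shown for illustration; the helper may round instead).
+   */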
uint64_t size = tg_shape_size(&p->dst->shape, p->dst->fmt); + + cvk_ctx->ops->tdma_l2g_tensor_fill_constant(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + uint16_t *dst_data = (uint16_t*)tensor_copy_d2s(rt_handle, p->dst); + + uint16_t *ref_data = (uint16_t *)malloc(size); + l2g_tensor_fill_constant_ref(p, ref_data); + + for (uint64_t i = 0; i < size/sizeof(uint16_t); i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(dst_data); + free(ref_data); +} + +static void destroy_param_l2g(CVI_RT_HANDLE rt_handle, param_t *p) +{ + free_tensor_dev_mem(rt_handle, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + uint32_t nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (uint32_t i = 0; i < nr_fmt; i++) { + param_t p; + memset(&p, 0, sizeof(p)); + p.constant = test_generate_bf16_corner_val(c->constant); + p.dst = alloc_tensor_dev_mem(rt_handle, cvk_ctx, c->dst_shape, input_fmt[i].src_fmt); + test_param_l2g(rt_handle, cvk_ctx, &p); + destroy_param_l2g(rt_handle, &p); + } +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return 0; +} \ No newline at end of file diff --git a/cviruntime/test/180x/test_180x_tdma_l2g_bf16_tensor_vlc_copy_compressed.c b/cviruntime/test/180x/test_180x_tdma_l2g_bf16_tensor_vlc_copy_compressed.c new file mode 100644 index 000000000..8d4e15619 --- /dev/null +++ b/cviruntime/test/180x/test_180x_tdma_l2g_bf16_tensor_vlc_copy_compressed.c @@ -0,0 +1,189 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_l2g_tensor_copy_compressed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => %d-bit %s\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->bit_length, + (p->src->fmt == CVK_FMT_I8)? "signed": "unsigned"); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_tl_shape_t lmem_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 17, 13 } + }, +#if 0 // No enough local memory for 180x + { + { 3, 39, 17, 23 } + }, + { + { 5, 39, 17, 23 } + }, + { + { 20, 35, 2, 2 } + }, +#endif +#ifndef ENABEL_SIMPLE_VLC_TEST + { + { 1, 1, 1, 1 } + }, + { + { 1, 1, 1, 2 } + }, + { + { 1, 1, 7, 2 } + }, + { + { 1, 1, 10, 60 } + }, + { + { 1, 2, 1, 1 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 3, 16, 1, 1 } + }, +#if 0 // No enough local memory for 180x + { + { 3, 36, 16, 20 } + }, +#endif +#endif /* ifndef ENABEL_SIMPLE_VLC_TEST*/ +}; + +static int test_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p, CommandInfo* cmd_info, uint16_t *src_data) +{ + print_param(stderr, p); + uint64_t bytesize = tl_shape_size(&p->src->shape, p->src->fmt); + int is_signed = (p->src->fmt == CVK_FMT_I8); + uint8_t data_type = (p->src->fmt == CVK_FMT_BF16) ? 
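+  /*
+   * Unlike the decompress tests, this test compares in the compressed
+   * domain: the bitstream the TDMA engine wrote, read back via
+   * cmpr_tensor_copy_d2s(), must match the host-side bitstream from
+   * test_vlc_compress() halfword-for-halfword over bs_size bytes
+   * (hence the bs_size / 2 loop bound below). This only holds when both
+   * sides code with identical CommandInfo settings.
+   */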
1 : 0;
+  size_t bs_size = 0;
+  int ret = 0;
+
+  tensor_copy_s2d_g2l(rt_handle, cvk_ctx, p->src, (uint8_t *)src_data);
+  cvk_ctx->ops->tdma_l2g_tensor_copy_compressed(cvk_ctx, p);
+  CVI_RT_Submit(cvk_ctx);
+
+  uint16_t *dst_data = (uint16_t *)cmpr_tensor_copy_d2s(rt_handle, p->dst);
+  uint16_t *ref_data = (uint16_t *)test_vlc_compress((uint8_t *)src_data, bytesize, is_signed, data_type, &bs_size, cmd_info, NULL);
+
+  for (uint64_t i = 0; i < bs_size / 2; i++) {
+    if (dst_data[i] != ref_data[i]) {
+      fprintf(stderr, "vlc compress comparing failed at dst[%" PRIu64 "], got %d, exp %d\n",
+              i, dst_data[i], ref_data[i]);
+      ret = -1;
+      break;
+    }
+  }
+
+  free(dst_data);
+  free(ref_data);
+  return ret;
+}
+
+static void destroy_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p)
+{
+  free_cmpr_tensor_dev_mem(rt_handle, p->dst);
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->src);
+}
+
+static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c)
+{
+  cvk_fmt_t fmts[] = { CVK_FMT_BF16 };
+  uint8_t fmts_sz = sizeof(fmts) / sizeof(fmts[0]);
+  int ret = 0;
+
+  for (int src_align = 0; src_align < 2; src_align++) {
+    for (uint8_t fmt_i = 0; fmt_i < fmts_sz; fmt_i++) {
+      cvk_fmt_t fmt = fmts[fmt_i];
+      uint8_t data_type = (fmt == CVK_FMT_BF16) ? 1 : 0;
+      param_t p;
+      memset(&p, 0, sizeof(p));
+
+      p.src = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->lmem_shape, fmt, src_align);
+      if (!p.src) {
+        fprintf(stderr, "allocate tl shape(%d, %d, %d, %d) failed\n",
+                c->lmem_shape.n, c->lmem_shape.c, c->lmem_shape.h,
+                c->lmem_shape.w);
+        continue;
+      }
+
+      CommandInfo cmd_info;
+      memset(&cmd_info, 0, sizeof(CommandInfo));
+      uint64_t in_size = tl_shape_size(&p.src->shape, CVK_FMT_I8);
+
+      uint16_t *src_data = (uint16_t *)malloc(sizeof(uint16_t) * in_size);
+      test_vlc_init_testdata((uint8_t *)src_data, in_size, fmt == CVK_FMT_I8, fmt == CVK_FMT_BF16);
+
+      int is_signed = (p.src->fmt == CVK_FMT_I8);
+      cmd_info.signedness = is_signed;
+      cmd_info.is_bfloat16 = data_type;
+      cmd_info.bias0 = 127;
+
+      /* reconstructed from the elided line; tg_shape_t4() is used the same way in the g2l variant */
+      cvk_tg_shape_t tg_shape =
+          tg_shape_t4(c->lmem_shape.n, c->lmem_shape.c, c->lmem_shape.h, c->lmem_shape.w);
+      p.dst = alloc_cmpr_tensor_dev_mem(rt_handle, cvk_ctx, tg_shape, fmt, &cmd_info);
+      ret |= test_param_l2g(rt_handle, cvk_ctx, &p, &cmd_info, src_data);
+      destroy_param_l2g(rt_handle, cvk_ctx, &p);
+
+      free(src_data);
+    }
+  }
+  return ret;
+}
+
+int main(int argc, char **argv)
+{
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+  int ret = 0;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]);
+  for (uint32_t i = 0; i < nr_cases; i++)
+    ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]);
+
+  printf("tdma l2g bf16 tensor vlc test %s\n", ret ?
"fail" : "pass"); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return 0; +} diff --git a/cviruntime/test/180x/test_180x_tdma_l2g_matrix_vlc_copy_compressed.c b/cviruntime/test/180x/test_180x_tdma_l2g_matrix_vlc_copy_compressed.c new file mode 100644 index 000000000..b9a1329a7 --- /dev/null +++ b/cviruntime/test/180x/test_180x_tdma_l2g_matrix_vlc_copy_compressed.c @@ -0,0 +1,182 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_l2g_matrix_copy_compressed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.w, p->src->shape.col, + p->dst->m.shape.row, p->dst->m.shape.col); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_ml_shape_t src_shape; + cvk_mg_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 2, 4, 7 }, + { 0, 7 }, + }, + { + { 0, 1, 2, 2 }, + { 1, 2 }, + }, + { + { 0, 3, 7, 17 }, + { 0, 17 }, + }, + { + { 0, 17, 1, 17 }, + { 0, 17 }, + }, + { + { 0, 60, 1, 60 }, + { 0, 60 }, + }, +#ifndef ENABEL_SIMPLE_VLC_TEST + { + { 0, 1, 1, 1 }, + { 0, 1 }, + }, + { + { 0, 2, 1, 2 }, + { 1, 2 }, + }, + { + { 0, 1, 7, 7 }, + { 0, 7 }, + }, + { + { 0, 7, 1, 7 }, + { 0, 7 }, + }, + { + { 0, 1, 17, 17 }, + { 0, 17 }, + }, + { + { 0, 1, 60, 60 }, + { 0, 60 }, + }, + { + { 0, 30, 2, 60 }, + { 0, 60 }, + }, +#endif /* ifndef ENABEL_SIMPLE_VLC_TEST*/ +}; + +static void test_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p, uint8_t* src_data, CommandInfo * cmd_info) +{ + print_param(stderr, p); + uint64_t size = ml_shape_size(&p->src->shape, p->src->fmt); + + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, p->src, src_data); + cvk_ctx->ops->tdma_l2g_matrix_copy_compressed(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + + int is_signed = (p->src->fmt == CVK_FMT_I8); + int data_type = (p->src->fmt == CVK_FMT_BF16) ? 
1 : 0; + size_t bs_size; + + uint8_t *ref_data = test_vlc_compress(src_data, size, is_signed, data_type, &bs_size, cmd_info, NULL); + uint8_t *dst_data = cmpr_matrix_copy_d2s(rt_handle, p->dst); + + for (uint64_t i = 0; i < bs_size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(dst_data); + free(ref_data); +} + + +static void destroy_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->src); + free_cmpr_matrix_dev_mem(rt_handle, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + cvk_fmt_t fmts[] = { CVK_FMT_I8, CVK_FMT_U8 }; + uint8_t fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + + for (uint32_t row = 1; row < 13; row += 2) { + c->src_shape.n = row; + c->dst_shape.row = row; + for (int src_align = 0; src_align < 2; src_align++) { + for (uint8_t fmt_i = 0; fmt_i < fmts_sz; fmt_i++) { + cvk_fmt_t fmt = fmts[fmt_i]; + param_t p; + memset(&p, 0, sizeof(p)); + p.src = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, c->src_shape, fmt, src_align); + + uint64_t size = ml_shape_size(&p.src->shape, p.src->fmt); + uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + test_vlc_init_testdata(src_data, size, fmt == CVK_FMT_I8, fmt == CVK_FMT_BF16); + + //size_t bs_size; + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + + // fmt, &bs_size, &cmd_info); + + int is_signed = (p.src->fmt == CVK_FMT_I8); + cmd_info.signedness = is_signed; + + // dst_shape, p.src->fmt, &cmd_info); + + //printf ("row %u is_align %d fmt %d\n", row, src_align, fmt); + test_param_l2g(rt_handle, cvk_ctx, &p, src_data, &cmd_info); + destroy_param_l2g(rt_handle, cvk_ctx, &p); + free(src_data); + } + } + } +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return 0; +} diff --git a/cviruntime/test/180x/test_180x_tdma_l2g_tensor_copy.c b/cviruntime/test/180x/test_180x_tdma_l2g_tensor_copy.c new file mode 100644 index 000000000..d06eb8eb2 --- /dev/null +++ b/cviruntime/test/180x/test_180x_tdma_l2g_tensor_copy.c @@ -0,0 +1,186 @@ +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_l2g_tensor_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_tl_shape_t src_shape; + cvk_tg_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 2, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 2, 7 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 13, 17 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 120, 5 }, + }, { + { 1, 2, 1, 1 }, + { 1, 1, 1, 2 }, + 
}, { + { 2, 17, 1, 4 }, + { 2, 1, 4, 17 }, + }, { + { 2, 17, 1, 4 }, + { 2, 1, 17, 4 }, + }, { + { 3, 16, 1, 1 }, + { 3, 1, 2, 8 }, + }, { + { 3, 39, 17, 23 }, + { 3, 17, 39, 23 }, + }, { + { 3, 36, 16, 20 }, + { 3, 18, 1, 640 }, + }, +#if 0 // for 180x + { + { 5, 39, 17, 23 }, + { 5, 17, 39, 23 }, + }, +#endif + { + { 20, 35, 2, 2 }, + { 20, 7, 10, 2 }, + } +}; + +static void l2g_tensor_copy_ref(param_t *p, uint8_t ref_data[], uint8_t src_data[]) +{ + uint64_t size = tl_shape_size(&p->src->shape, p->src->fmt); + + for (uint64_t i = 0; i < size; i++) + ref_data[i] = src_data[i]; +} + +static int test_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + print_param(stderr, p); + int ret = 0; + uint64_t size = tl_shape_size(&p->src->shape, p->src->fmt); + + uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + uint8_t *dst_data = NULL, *ref_data = NULL; + if (!src_data) { + ret = -1; + goto fail_exit; + } + + for (uint64_t i = 0; i < size; i++) + src_data[i] = 200 + i; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, p->src, src_data); + + cvk_ctx->ops->tdma_l2g_tensor_copy(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + + dst_data = tensor_copy_d2s(rt_handle, p->dst); + ref_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!dst_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + l2g_tensor_copy_ref(p, ref_data, src_data); + + for (uint64_t i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit: + free(src_data); + free(dst_data); + free(ref_data); + + return ret; +} + +static void destroy_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + free_tensor_dev_mem(rt_handle, p->dst); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->src); +} + +static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + int ret = 0; + + for (int src_align = 0; src_align < 2; src_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->src_shape, CVK_FMT_I8, src_align); + p.dst = alloc_tensor_dev_mem(rt_handle, cvk_ctx, c->dst_shape, CVK_FMT_I8); + if (p.src && p.dst) + ret |= test_param_l2g(rt_handle, cvk_ctx, &p); + else if (!p.src) + fprintf(stderr, "allocate tl shape(%d, %d, %d, %d) failed\n", + c->src_shape.n, c->src_shape.c, c->src_shape.h, + c->src_shape.w); + else if (!p.dst) + fprintf(stderr, "allocate tg failed\n"); + destroy_param_l2g(rt_handle, cvk_ctx, &p); + } + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + printf("tdma l2g tensor copy test %s\n", ret ? 
"fail" : "pass"); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_tdma_l2g_tensor_copy_cw_transposed.c b/cviruntime/test/180x/test_180x_tdma_l2g_tensor_copy_cw_transposed.c new file mode 100644 index 000000000..726dd8c1c --- /dev/null +++ b/cviruntime/test/180x/test_180x_tdma_l2g_tensor_copy_cw_transposed.c @@ -0,0 +1,197 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_l2g_tensor_copy_cw_transposed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_tl_shape_t src_shape; + cvk_tg_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 2, 1, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 2, 7, 1 }, + }, { + { 1, 1, 17, 13 }, + { 1, 13, 17, 1 }, + }, { + { 1, 1, 10, 60 }, + { 1, 60, 10, 1 }, + }, { + { 1, 2, 1, 1 }, + { 1, 1, 1, 2 }, + }, { + { 2, 17, 1, 4 }, + { 2, 4, 1, 17 }, + }, { + { 2, 17, 3, 4 }, + { 2, 4, 3, 17 }, + }, { + { 3, 16, 7, 1 }, + { 3, 1, 7, 16 }, + }, { + { 3, 39, 17, 23 }, + { 3, 23, 17, 39 }, + }, { + { 3, 36, 16, 20 }, + { 3, 20, 16, 36 }, + }, { + #if 0 // No enough local memory for 180x + { 5, 39, 17, 23 }, + { 5, 23, 17, 39 }, + }, { + #endif + { 20, 35, 2, 2 }, + { 20, 2, 2, 35 }, + }, { + { 20, 35, 3, 2 }, + { 20, 2, 3, 35 }, + } +}; + +static void l2g_tensor_copy_cw_transposed_ref( + param_t *p, uint8_t ref_data[], uint8_t src_data[]) +{ + cvk_tl_shape_t s = p->src->shape; + uint32_t n = s.n; + uint32_t c = s.c; + uint32_t h = s.h; + uint32_t w = s.w; + + for (uint32_t ni = 0; ni < n; ni++) { + for (uint32_t ci = 0; ci < c; ci++) { + for (uint32_t hi = 0; hi < h; hi++) { + for (uint32_t wi = 0; wi < w; wi++) { + uint32_t src_i = ni * c * h * w + ci * h * w + hi * w + wi; + uint32_t dst_i = ni * c * h * w + wi * h * c + hi * c + ci; + ref_data[dst_i] = src_data[src_i]; + } + } + } + } +} + +static int test_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + print_param(stderr, p); + uint64_t size = tl_shape_size(&p->src->shape, p->src->fmt); + int ret = 0; + + uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!src_data) + return -1; + + for (uint64_t i = 0; i < size; i++) + src_data[i] = 200 + i; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, p->src, src_data); + cvk_ctx->ops->tdma_l2g_tensor_copy_cw_transposed(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + uint8_t *dst_data = tensor_copy_d2s(rt_handle, p->dst); + + uint8_t *ref_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!dst_data || !ref_data) + goto fail_exit; + + l2g_tensor_copy_cw_transposed_ref(p, ref_data, src_data); + + for (uint64_t i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit: + free(src_data); + free(dst_data); + free(ref_data); + + return ret; +} + + +static void destroy_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + free_tensor_dev_mem(rt_handle, p->dst); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->src); +} + +static int 
test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c)
+{
+  int ret = 0;
+
+  for (int src_align = 0; src_align < 2; src_align++) {
+    param_t p;
+    memset(&p, 0, sizeof(p));
+
+    p.src = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->src_shape, CVK_FMT_I8, src_align);
+    p.dst = alloc_tensor_dev_mem(rt_handle, cvk_ctx, c->dst_shape, CVK_FMT_I8);
+    if (p.src && p.dst)
+      ret |= test_param_l2g(rt_handle, cvk_ctx, &p);
+    else if (!p.src)
+      fprintf(stderr, "allocate tl shape(%d, %d, %d, %d) failed\n",
+              c->src_shape.n, c->src_shape.c, c->src_shape.h,
+              c->src_shape.w);
+    else if (!p.dst)
+      fprintf(stderr, "allocate tg failed\n");
+    destroy_param_l2g(rt_handle, cvk_ctx, &p);
+  }
+  return ret;
+}
+
+int main(int argc, char **argv)
+{
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+  int ret = 0;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]);
+  for (uint32_t i = 0; i < nr_cases; i++)
+    ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]);
+
+  printf("tdma l2g tensor copy cw tp test %s\n", ret ? "fail" : "pass");
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/180x/test_180x_tdma_l2g_tensor_copy_nc_transposed.c b/cviruntime/test/180x/test_180x_tdma_l2g_tensor_copy_nc_transposed.c
new file mode 100644
index 000000000..0c6f37dda
--- /dev/null
+++ b/cviruntime/test/180x/test_180x_tdma_l2g_tensor_copy_nc_transposed.c
@@ -0,0 +1,274 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <time.h>
+#include <inttypes.h>
+
+#include "test_cvikernel_util.h"
+
+typedef cvk_tdma_l2g_tensor_copy_nc_transposed_param_t param_t;
+
+static void __print_param(const char *tag, FILE *f, param_t *p)
+{
+  fprintf(
+      f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n",
+      tag,
+      p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w,
+      p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w);
+}
+
+#define print_param(f, p) __print_param(__func__, f, p)
+
+typedef struct {
+  cvk_tl_shape_t src_shape;
+  cvk_tg_shape_t dst_shape;
+} case_t;
+
+static case_t g_cases[] = {
+  {
+    { 1, 1, 1, 1 },
+    { 1, 1, 1, 1 },
+  }, {
+    { 1, 1, 1, 2 },
+    { 1, 1, 1, 2 },
+  }, {
+    { 1, 1, 1, 2 },
+    { 1, 1, 2, 1 },
+  }, {
+    { 1, 1, 7, 2 },
+    { 1, 1, 7, 2 },
+  }, {
+    { 1, 1, 7, 2 },
+    { 1, 1, 2, 7 },
+  }, {
+    { 1, 1, 17, 13 },
+    { 1, 1, 17, 13 },
+  }, {
+    { 1, 1, 17, 13 },
+    { 1, 1, 13, 17 },
+  }, {
+    { 1, 1, 10, 60 },
+    { 1, 1, 10, 60 },
+  }, {
+    { 1, 1, 10, 60 },
+    { 1, 1, 2, 300 },
+  }, {
+    { 1, 1, 10, 60 },
+    { 1, 1, 3, 200 },
+  }, {
+    { 1, 1, 10, 60 },
+    { 1, 1, 4, 150 },
+  }, {
+    { 1, 1, 10, 60 },
+    { 1, 1, 5, 120 },
+  }, {
+    { 1, 1, 10, 60 },
+    { 1, 1, 60, 10 },
+  }, {
+    { 1, 1, 120, 5 },
+    { 1, 1, 120, 5 },
+  }, {
+    { 1, 2, 1, 1 },
+    { 2, 1, 1, 1 },
+  }, {
+    { 2, 17, 1, 4 },
+    { 17, 2, 1, 4 },
+  }, {
+    { 2, 17, 1, 4 },
+    { 17, 2, 2, 2 },
+  }, {
+    { 2, 17, 1, 4 },
+    { 17, 2, 4, 1 },
+  }, {
+    { 17, 2, 2, 2 },
+    { 2, 17, 2, 2 },
+  }, {
+    { 17, 2, 4, 1 },
+    { 2, 17, 4, 1 },
+  }, {
+    { 3, 16, 1, 1 },
+    { 16, 3, 1, 1 },
+  }, {
+    { 3, 39, 23, 17 },
+    { 39, 3, 23, 17 },
+  }, {
+    { 3, 39, 17, 23 },
+    { 39, 3, 17, 23 },
+  }, {
+    { 3, 36, 16, 20 },
+    { 36, 3, 16, 20 },
+  }, {
+    { 3, 36, 16, 20 },
+    { 36, 3, 2, 160 },
+  }, {
+    { 3, 36, 16, 20 },
+    { 36, 3, 4, 80 },
+  }, {
+    { 3, 36, 16, 20 },
+    { 36, 3, 8, 40 },
+  }, {
{ 3, 36, 16, 20 }, + { 36, 3, 20, 16 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 32, 10 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 64, 5 }, + }, { + { 20, 35, 2, 2 }, + { 35, 20, 2, 2 }, + }, { + { 35, 20, 2, 2 }, + { 20, 35, 2, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 160, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 2, 160 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 4, 80 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 8, 40 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 10, 32 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 20, 16 }, + #if 0 // No enough local memory for 180x + }, { + { 5, 39, 17, 23 }, + { 39, 5, 17, 23 }, + }, { + { 39, 5, 23, 17 }, + { 5, 39, 23, 17 }, + #endif + } +}; + +static void l2g_tensor_copy_nc_transposed_ref( + param_t *p, uint8_t ref_data[], uint8_t src_data[]) +{ + cvk_tl_shape_t s = p->src->shape; + uint32_t n = s.n; + uint32_t c = s.c; + uint32_t hw = s.h * s.w; + + for (uint32_t ni = 0; ni < n; ni++) { + for (uint32_t ci = 0; ci < c; ci++) { + for (uint32_t hwi = 0; hwi < hw; hwi++) { + uint32_t src_i = ni * c * hw + ci * hw + hwi; + uint32_t dst_i = ci * n * hw + ni * hw + hwi; + ref_data[dst_i] = src_data[src_i]; + } + } + } +} + +static int test_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + print_param(stderr, p); + int ret = 0; + uint64_t size = tl_shape_size(&p->src->shape, p->src->fmt); + + uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!src_data) + return -1; + + for (uint64_t i = 0; i < size; i++) + src_data[i] = 200 + i; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, p->src, src_data); + cvk_ctx->ops->tdma_l2g_tensor_copy_nc_transposed(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + uint8_t *dst_data = tensor_copy_d2s(rt_handle, p->dst); + + uint8_t *ref_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!dst_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + l2g_tensor_copy_nc_transposed_ref(p, ref_data, src_data); + + for (uint64_t i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit: + free(src_data); + free(dst_data); + free(ref_data); + + return ret; +} + +static void destroy_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + free_tensor_dev_mem(rt_handle, p->dst); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->src); +} + +static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + int ret = 0; + + for (int src_align = 0; src_align < 2; src_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->src_shape, CVK_FMT_I8, src_align); + p.dst = alloc_tensor_dev_mem(rt_handle, cvk_ctx, c->dst_shape, CVK_FMT_I8); + if (p.src && p.dst) + ret |= test_param_l2g(rt_handle, cvk_ctx, &p); + else if (!p.src) + fprintf(stderr, "allocate tl shape(%d, %d, %d, %d) failed\n", + c->src_shape.n, c->src_shape.c, c->src_shape.h, + c->src_shape.w); + else if (!p.dst) + fprintf(stderr, "allocate tg failed\n"); + destroy_param_l2g(rt_handle, cvk_ctx, &p); + + } + return ret; +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + int ret = 0; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / 
sizeof(g_cases[0]);
+  for (uint32_t i = 0; i < nr_cases; i++)
+    ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]);
+
+  printf("tdma l2g tensor copy nc tp test %s\n", ret ? "fail" : "pass");
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/180x/test_180x_tdma_l2g_tensor_vlc_copy_compressed.c b/cviruntime/test/180x/test_180x_tdma_l2g_tensor_vlc_copy_compressed.c
new file mode 100644
index 000000000..14747ff50
--- /dev/null
+++ b/cviruntime/test/180x/test_180x_tdma_l2g_tensor_vlc_copy_compressed.c
@@ -0,0 +1,197 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <time.h>
+#include <inttypes.h>
+
+#include "test_cvikernel_util.h"
+
+typedef cvk_tdma_l2g_tensor_copy_compressed_param_t param_t;
+
+static void __print_param(const char *tag, FILE *f, param_t *p)
+{
+  fprintf(
+      f, "%s: (%u, %u, %u, %u) => %d-bit %s\n",
+      tag,
+      p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w,
+      p->dst->bit_length,
+      (p->src->fmt == CVK_FMT_I8)? "signed": "unsigned");
+}
+
+#define print_param(f, p) __print_param(__func__, f, p)
+
+typedef struct {
+  cvk_tl_shape_t lmem_shape;
+} case_t;
+
+static case_t g_cases[] = {
+  {
+    { 1, 1, 17, 13 }
+  },
+  {
+    { 3, 39, 17, 23 }
+  },
+  #if 0 // Not enough local memory for 180x
+  {
+    { 5, 39, 17, 23 }
+  },
+  #endif
+  {
+    { 20, 35, 2, 2 }
+  },
+#ifndef ENABEL_SIMPLE_VLC_TEST
+  {
+    { 1, 1, 1, 1 }
+  },
+  {
+    { 1, 1, 1, 2 }
+  },
+  {
+    { 1, 1, 7, 2 }
+  },
+  {
+    { 1, 1, 10, 60 }
+  },
+  {
+    { 1, 2, 1, 1 }
+  },
+  {
+    { 2, 17, 1, 4 }
+  },
+  {
+    { 2, 17, 1, 4 }
+  },
+  {
+    { 3, 16, 1, 1 }
+  },
+  {
+    { 3, 36, 16, 20 }
+  },
+#endif /* ifndef ENABEL_SIMPLE_VLC_TEST*/
+};
+
+static uint64_t l2g_tensor_copy_vlc_compressed_ref(
+    param_t *p, uint8_t ref_data[], uint8_t src_data[], CommandInfo *cmd_info)
+{
+  uint64_t in_size = tl_shape_size(&p->src->shape, p->src->fmt);
+  size_t bs_size = 0;
+
+  cvk_vlc_enc_int8(src_data, in_size, ref_data, &bs_size, cmd_info);
+  return bs_size;
+}
+
+static int test_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p, CommandInfo* cmd_info_est, uint8_t *src_data)
+{
+  print_param(stderr, p);
+  int ret = 0;
+
+  tensor_copy_s2d_g2l(rt_handle, cvk_ctx, p->src, src_data);
+  cvk_ctx->ops->tdma_l2g_tensor_copy_compressed(cvk_ctx, p);
+  CVI_RT_Submit(cvk_ctx);
+
+  uint8_t *dst_data = cmpr_tensor_copy_d2s(rt_handle, p->dst);
+  uint8_t *ref_data = (uint8_t *)malloc(sizeof(uint8_t) * p->dst->reserved_size);
+  if (!dst_data || !ref_data) {
+    ret = -1;
+    goto fail_exit;
+  }
+
+  uint64_t bs_size = l2g_tensor_copy_vlc_compressed_ref(p, ref_data, src_data, cmd_info_est);
+
+  for (uint64_t i = 0; i < bs_size; i++) {
+    if (dst_data[i] != ref_data[i]) {
+      fprintf(stderr, "vlc compress comparing failed at dst[%" PRIu64 "], got %d, exp %d\n",
+              i, dst_data[i], ref_data[i]);
+      ret = -1;
+      break;
+    }
+  }
+
+fail_exit:
+  free(dst_data);
+  free(ref_data);
+
+  return ret;
+}
+
+static void destroy_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p)
+{
+  free_cmpr_tensor_dev_mem(rt_handle, p->dst);
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->src);
+}
+
+static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c)
+{
+  int ret = 0;
+  cvk_fmt_t fmts[] = { CVK_FMT_I8, CVK_FMT_U8 };
+
+  for (int src_align = 0; src_align < 2; src_align++) {
+    for (uint8_t fmt_i = 0; fmt_i < 2; fmt_i++) {
+      cvk_fmt_t fmt = fmts[fmt_i];
+      param_t p;
+      memset(&p, 0, sizeof(p));
+
+      p.src = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->lmem_shape, fmt, src_align);
+      if (!p.src) {
+        fprintf(stderr, "allocate tl shape(%d, %d, %d, %d) failed\n",
+                c->lmem_shape.n, c->lmem_shape.c, c->lmem_shape.h,
+                c->lmem_shape.w);
+        continue;
+      }
+
+      CommandInfo cmd_info;
+      memset(&cmd_info, 0, sizeof(CommandInfo));
+      uint64_t in_size = tl_shape_size(&p.src->shape, p.src->fmt);
+
+      uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * in_size);
+      if (!src_data) {
+        cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p.src);
+        return -1;
+      }
+
+      test_vlc_init_testdata(src_data, in_size, fmt == CVK_FMT_I8, fmt == CVK_FMT_BF16);
+
+      int is_signed = (p.src->fmt == CVK_FMT_I8);
+      cmd_info.signedness = is_signed;
+
+      cvk_tg_shape_t tg_shape =
+          tg_shape_t4(c->lmem_shape.n, c->lmem_shape.c, c->lmem_shape.h, c->lmem_shape.w);
+      p.dst = alloc_cmpr_tensor_dev_mem(rt_handle, cvk_ctx, tg_shape, fmt, &cmd_info);
+      ret |= test_param_l2g(rt_handle, cvk_ctx, &p, &cmd_info, src_data);
+      destroy_param_l2g(rt_handle, cvk_ctx, &p);
+
+      free(src_data);
+    }
+  }
+
+  return ret;
+}
+
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]);
+  for (uint32_t i = 0; i < nr_cases; i++)
+    ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]);
+
+  printf("tdma l2g tensor copy vlc test %s\n", ret ? "fail" : "pass");
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/180x/test_180x_tdma_l2l_tensor_copy.c b/cviruntime/test/180x/test_180x_tdma_l2l_tensor_copy.c
new file mode 100644
index 000000000..dadb286ee
--- /dev/null
+++ b/cviruntime/test/180x/test_180x_tdma_l2l_tensor_copy.c
@@ -0,0 +1,181 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <inttypes.h>
+
+#include "test_cvikernel_util.h"
+
+typedef cvk_tdma_l2l_tensor_copy_param_t param_t;
+
+static void __print_param(const char *tag, FILE *f, param_t *p)
+{
+  fprintf(
+      f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n",
+      tag,
+      p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w,
+      p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w);
+}
+
+#define print_param(f, p) __print_param(__func__, f, p)
+
+typedef struct {
+  cvk_tl_shape_t src_shape;
+  cvk_tl_shape_t dst_shape;
+} case_t;
+
+static case_t g_cases[] = {
+  {
+    { 1, 1, 1, 1 },
+    { 1, 1, 1, 1 },
+  }, {
+    { 1, 1, 1, 2 },
+    { 1, 1, 2, 1 },
+  }, {
+    { 1, 1, 7, 2 },
+    { 1, 1, 2, 7 },
+  }, {
+    { 1, 1, 17, 13 },
+    { 1, 1, 13, 17 },
+  }, {
+    { 1, 1, 10, 60 },
+    { 1, 1, 120, 5 },
+  }, {
+    { 1, 2, 1, 1 },
+    { 1, 1, 1, 2 },
+  }, {
+    { 2, 17, 1, 4 },
+    { 2, 1, 4, 17 },
+  }, {
+    { 2, 17, 1, 4 },
+    { 2, 1, 17, 4 },
+  }, {
+    { 3, 16, 1, 1 },
+    { 3, 1, 2, 8 },
+  }, {
+  #if 0 // Not enough local memory for 180x
+    { 3, 39, 17, 23 },
+    { 3, 17, 39, 23 },
+  }, {
+    { 3, 36, 16, 20 },
+    { 3, 18, 1, 640 },
+  }, {
+    { 5, 39, 17, 23 },
+    { 5, 17, 39, 23 },
+  }, {
+  #endif
+    { 20, 35, 2, 2 },
+    { 20, 7, 10, 2 },
+  }
+};
+
+static void destroy_param(cvk_context_t *cvk_ctx, param_t *p)
+{
+  if (p->dst)
+    cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->dst);
+  if (p->src)
+    cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->src);
+}
+
+static void l2l_tensor_copy_ref(param_t *p, uint8_t ref_data[], uint8_t src_data[])
+{
+  uint64_t size = tl_shape_size(&p->src->shape, p->src->fmt);
+
+  for (uint64_t i = 0; i < size; i++)
+    ref_data[i] = src_data[i];
+}
+
+static int test_param(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p)
+{
+  print_param(stderr, p);
+  int ret = 0;
+  uint64_t size = tl_shape_size(&p->src->shape, p->src->fmt);
+
+  uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * size);
+  if (!src_data)
+    return -1;
+
+  for (uint64_t i = 0; i < size; i++)
+    src_data[i] = 200 + i;
+
+  tensor_copy_s2d_g2l(rt_handle, cvk_ctx, p->src, src_data);
+
+  cvk_ctx->ops->tdma_l2l_tensor_copy(cvk_ctx, p);
+
+  uint8_t *dst_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, p->dst);
+  uint8_t *ref_data = (uint8_t *)malloc(sizeof(uint8_t) * size);
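+  /* readback: the L2L result is copied local -> global -> host and compared byte-for-byte with the identity reference */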
+  if (!dst_data || !ref_data) {
+    ret = -1;
+    goto fail_exit;
+  }
+
+  l2l_tensor_copy_ref(p, ref_data, src_data);
+
+  for (uint64_t i = 0; i < size; i++) {
+    if (dst_data[i] != ref_data[i]) {
+      fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n",
+              i, dst_data[i], ref_data[i]);
+      ret = -1;
+      break;
+    }
+  }
+
+fail_exit:
+  free(src_data);
+  free(dst_data);
+  free(ref_data);
+
+  return ret;
+}
+
+static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c)
+{
+  int ret = 0;
+
+  for (int src_align = 0; src_align < 2; src_align++) {
+    for (int dst_align = 0; dst_align < 2; dst_align++) {
+      param_t p;
+      memset(&p, 0, sizeof(p));
+      p.src = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->src_shape, CVK_FMT_I8, src_align);
+      p.dst = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->dst_shape, CVK_FMT_I8, dst_align);
+      if (p.src && p.dst)
+        ret |= test_param(rt_handle, cvk_ctx, &p);
+      else if (!p.src)
+        fprintf(stderr, "fail to alloc src (%d, %d, %d, %d)\n",
+                c->src_shape.n, c->src_shape.c, c->src_shape.h, c->src_shape.w);
+      else if (!p.dst)
+        fprintf(stderr, "fail to alloc dst (%d, %d, %d, %d)\n",
+                c->dst_shape.n, c->dst_shape.c, c->dst_shape.h, c->dst_shape.w);
+      destroy_param(cvk_ctx, &p);
+    }
+  }
+
+  return ret;
+}
+
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]);
+  for (uint32_t i = 0; i < nr_cases; i++)
+    ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]);
+
+  printf("tdma l2l tensor copy test %s\n", ret ? "fail" : "pass");
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/180x/test_180x_tdma_matrix_vlc_decompress_compress.c b/cviruntime/test/180x/test_180x_tdma_matrix_vlc_decompress_compress.c
new file mode 100644
index 000000000..8a46ead0a
--- /dev/null
+++ b/cviruntime/test/180x/test_180x_tdma_matrix_vlc_decompress_compress.c
@@ -0,0 +1,208 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#include <time.h>
+#include <inttypes.h>
+
+#include "test_cvikernel_util.h"
+
+typedef cvk_tdma_g2l_matrix_copy_decompressed_param_t decompress_param_t;
+typedef cvk_tdma_l2g_matrix_copy_compressed_param_t compress_param_t;
+
+typedef struct{
+  decompress_param_t dec_p;
+  compress_param_t com_p;
+} param_t;
+
+static void __print_param(const char *tag, FILE *f, param_t *p)
+{
+  fprintf(
+      f, "%s: (%u, %u, %u, %u) => %s\n",
+      tag,
+      p->dec_p.dst->shape.n, p->dec_p.dst->shape.c, p->dec_p.dst->shape.w, p->dec_p.dst->shape.col,
+      (p->dec_p.dst->fmt == CVK_FMT_I8)?
"signed": "unsigned"); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_mg_shape_t src_shape; + cvk_ml_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 1 }, + { 0, 1, 1, 1 }, + }, + { + { 0, 2 }, + { 0, 1, 2, 2 }, + }, + { + { 0, 2 }, + { 0, 2, 1, 2 }, + }, + { + { 0, 7 }, + { 0, 1, 7, 7 }, + }, +#ifndef ENABEL_SIMPLE_VLC_TEST + { + { 0, 7 }, + { 0, 2, 4, 7 }, + }, + { + { 0, 7 }, + { 0, 7, 1, 7 }, + }, + { + { 0, 17 }, + { 0, 1, 17, 17 }, + }, + { + { 0, 17 }, + { 0, 3, 7, 17 }, + }, + { + { 0, 17 }, + { 0, 17, 1, 17 }, + }, + { + { 0, 60 }, + { 0, 1, 60, 60 }, + }, + { + { 0, 60 }, + { 0, 30, 2, 60 }, + }, + { + { 0, 60 }, + { 0, 60, 1, 60 }, + } +#endif /* ifndef ENABEL_SIMPLE_VLC_TEST*/ +}; + +static void test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p, uint8_t *src_data, + CommandInfo* cmd_info) +{ + print_param(stderr, p); + uint64_t size = ml_shape_size(&p->dec_p.dst->shape, p->dec_p.dst->fmt); + int is_signed = (p->dec_p.dst->fmt == CVK_FMT_I8); + + uint8_t *gmem_data; + size_t bs_size; + size_t data_type = (p->dec_p.dst->fmt == CVK_FMT_BF16) ? 1 : 0; + + // command info + gmem_data = test_vlc_compress(src_data, size, is_signed, data_type, &bs_size, cmd_info, NULL); + + //1. send compressed one to gaddr and decompress from gaddr to local + cmpr_matrix_copy_s2d(rt_handle, p->dec_p.src, gmem_data); + cvk_ctx->ops->tdma_g2l_matrix_copy_decompressed(cvk_ctx, &p->dec_p); + CVI_RT_Submit(cvk_ctx); + + //2. decompress from sram + cvk_ctx->ops->tdma_l2g_matrix_copy_compressed(cvk_ctx, &p->com_p); + CVI_RT_Submit(cvk_ctx); + + //3. get final data + uint8_t *dst_data = cmpr_matrix_copy_d2s(rt_handle, p->com_p.dst); + + for (uint64_t i = 0; i < bs_size ; i++) { + if (dst_data[i] != gmem_data[i]) { + fprintf(stderr, "vlc compress comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], gmem_data[i]); + exit(-1); + } + } + + free(dst_data); + free(gmem_data); +} + +static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + free_cmpr_matrix_dev_mem(rt_handle, p->dec_p.src); + free_cmpr_matrix_dev_mem(rt_handle, p->com_p.dst); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->dec_p.dst); +} + +static void test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + cvk_fmt_t fmts[] = { CVK_FMT_I8, CVK_FMT_U8 }; + uint8_t fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + + for (uint32_t row = 1; row < 13; row += 2) { + c->src_shape.row = row; + c->dst_shape.n = row; + for (int dst_align = 0; dst_align < 2; dst_align++) { + for (uint8_t fmt_i = 0; fmt_i < fmts_sz; fmt_i++) { + cvk_fmt_t fmt = fmts[fmt_i]; + param_t p; + memset(&p, 0, sizeof(p)); + //put compressed data to gaddr ->decompress to local -> compress to gaddr + + int is_signed = (fmt == CVK_FMT_I8); + + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + cmd_info.signedness = is_signed; + + // src_shape, fmt, &cmd_info); + p.dec_p.dst = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, c->dst_shape, fmt, dst_align); + + uint64_t size = ml_shape_size(&p.dec_p.dst->shape, p.dec_p.dst->fmt); + uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + test_vlc_init_testdata(src_data, size, fmt == CVK_FMT_I8, fmt == CVK_FMT_BF16); + + assert(p.dec_p.dst); + + //2. alloc compress + p.com_p.src = p.dec_p.dst; //cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->lmem_shape, fmt, align); + p.com_p.dst = alloc_cmpr_matrix_dev_mem(rt_handle, c->src_shape, fmt, &cmd_info); + + //3. 
test: the sequence like below: + //3.1 put compressed data to gaddr + //3.2 decompress to local + //3.3 compress to gaddr + //printf ("row %u is_align %d fmt %d\n", row, dst_align, fmt); + test_param_g2l(rt_handle, cvk_ctx, &p, src_data, &cmd_info); + destroy_param_g2l(rt_handle, cvk_ctx, &p); + free(src_data); + } + } + } +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return 0; +} diff --git a/cviruntime/test/180x/test_180x_tdma_tensor_vlc_decompress_compress.c b/cviruntime/test/180x/test_180x_tdma_tensor_vlc_decompress_compress.c new file mode 100644 index 000000000..65e64fb4c --- /dev/null +++ b/cviruntime/test/180x/test_180x_tdma_tensor_vlc_decompress_compress.c @@ -0,0 +1,207 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_g2l_tensor_copy_decompressed_param_t decompress_param_t; +typedef cvk_tdma_l2g_tensor_copy_compressed_param_t compress_param_t; + +typedef struct{ + decompress_param_t dec_p; + compress_param_t com_p; +} param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => %d-bit %s\n", + tag, + p->dec_p.dst->shape.n, p->dec_p.dst->shape.c, p->dec_p.dst->shape.h, p->dec_p.dst->shape.w, + p->dec_p.src->bit_length, + (p->dec_p.dst->fmt == CVK_FMT_I8)? "signed": "unsigned"); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_tl_shape_t lmem_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 17, 13 } + }, + { + { 3, 39, 17, 23 } + }, + { + { 5, 39, 17, 23 } + }, + { + { 20, 35, 2, 2 } + }, +#ifndef ENABEL_SIMPLE_VLC_TEST + { + { 1, 1, 1, 1 } + }, + { + { 1, 1, 1, 2 } + }, + { + { 1, 1, 7, 2 } + }, + { + { 1, 1, 10, 60 } + }, + { + { 1, 2, 1, 1 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 3, 16, 1, 1 } + }, + { + { 3, 36, 16, 20 } + }, +#endif /* ifndef ENABEL_SIMPLE_VLC_TEST*/ +}; + +static int test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p, cvk_cmpr_tg_t* dst) +{ + print_param(stderr, p); + int ret = 0; + uint64_t size = tl_shape_size(&p->dec_p.dst->shape, p->dec_p.dst->fmt); + int is_signed = (p->dec_p.dst->fmt == CVK_FMT_I8); + uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + uint8_t *gmem_data = NULL, *dst_data = NULL; + if (!src_data) { + ret = -1; + goto fail_exit; + } + + test_vlc_init_testdata(src_data, size, p->dec_p.dst->fmt == CVK_FMT_I8, p->dec_p.dst->fmt == CVK_FMT_BF16); + + size_t total_size; + size_t data_type = (p->dec_p.dst->fmt == CVK_FMT_BF16) ? 
1 : 0; + size_t in_size = size; + size_t bs_buf_size = get_out_bs_buf_size(size, data_type); + gmem_data = (uint8_t *) malloc(bs_buf_size * sizeof(uint8_t)); + if (!gmem_data) { + ret = -1; + goto fail_exit; + } + + // command info + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + cmd_info.signedness = is_signed; + + // dec_p.src, gmem_data); + cvk_ctx->ops->tdma_g2l_tensor_copy_decompressed(cvk_ctx, &p->dec_p); + CVI_RT_Submit(cvk_ctx); + + dst->zero_guard_en = cmd_info.zero_guard_en; + dst->bias0 = cmd_info.bias0; + dst->bias1 = cmd_info.bias1; + p->com_p.dst = dst; + cvk_ctx->ops->tdma_l2g_tensor_copy_compressed(cvk_ctx, &p->com_p); + CVI_RT_Submit(cvk_ctx); + + dst_data = cmpr_tensor_copy_d2s(rt_handle, p->com_p.dst); + + for (uint64_t i = 0; i < total_size ; i++) { + if (dst_data[i] != gmem_data[i]) { + fprintf(stderr, "vlc compress comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], gmem_data[i]); + ret = -1; + break; + } + } + +fail_exit: + free(src_data); + free(dst_data); + free(gmem_data); + + return ret; +} + +static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + if (p->dec_p.src) + free_cmpr_tensor_dev_mem(rt_handle, p->dec_p.src); + if (p->com_p.dst) + free_cmpr_tensor_dev_mem(rt_handle, p->com_p.dst); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->dec_p.dst); +} + +static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + cvk_fmt_t fmts[2] = { CVK_FMT_I8, CVK_FMT_U8 }; + int ret = 0; + + for (int align = 0; align < 2; align++) { + for (uint8_t fmt_i = 0; fmt_i < 2; fmt_i++) { + cvk_fmt_t fmt = fmts[fmt_i]; + + param_t p; + memset(&p, 0, sizeof(p)); + cvk_tg_shape_t tg_shape = + tg_shape_t4(c->lmem_shape.n, c->lmem_shape.c, c->lmem_shape.h, c->lmem_shape.w); + p.dec_p.src = alloc_cmpr_tensor_dev_mem(rt_handle, cvk_ctx, tg_shape, fmt, NULL); + p.dec_p.dst = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->lmem_shape, fmt, align); + if (p.dec_p.src && p.dec_p.dst) { + p.com_p.src = p.dec_p.dst; //cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->lmem_shape, fmt, align); + cvk_cmpr_tg_t* dst = alloc_cmpr_tensor_dev_mem(rt_handle, cvk_ctx, tg_shape, fmt, NULL); + if (dst) + ret |= test_param_g2l(rt_handle, cvk_ctx, &p, dst); + } + + destroy_param_g2l(rt_handle, cvk_ctx, &p); + } + } + + return ret; +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + int ret = 0; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + printf("tdma tensor copy vlc test %s\n", ret ? 
"fail" : "pass"); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_tensor_add.c b/cviruntime/test/180x/test_180x_tensor_add.c new file mode 100644 index 000000000..47711022f --- /dev/null +++ b/cviruntime/test/180x/test_180x_tensor_add.c @@ -0,0 +1,184 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_add_ref( + uint8_t *ref_high, uint8_t *ref_low, + uint8_t *a_high, uint8_t *a_low, + uint8_t *b_high, uint8_t *b_low, + int rshift_bits, + uint64_t size, int relu_enable) +{ + for (uint64_t i = 0; i < size; i++) { + int32_t ta = ((int8_t)a_high[i] << 8) + a_low[i]; + int32_t tb = ((int8_t)b_high[i] << 8) + b_low[i]; + int32_t res = ta + tb; + + res += 1 << (rshift_bits - 1); + res >>= rshift_bits; + + if(relu_enable) + { + if (res > 127) + res = 127; + else if (res < -128) + res = -128; + + if(relu_enable) + if(res<0) + res=0; + ref_high[i] = 0; + ref_low[i] = res & 0xff; + + }else{ + if (res > 32767) + res = 32767; + else if (res < -32768) + res = -32768; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } + } +} + +static int test_tl_add(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 2; // 3 -> 2 for 180x + int c = 9; // 39 -> 3 for 180x + int h = 7; + int w = 37; + int rshift_bits; + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + uint8_t *a_high_data = (uint8_t *)malloc(size); + uint8_t *a_low_data = (uint8_t *)malloc(size); + uint8_t *b_high_data = (uint8_t *)malloc(size); + uint8_t *b_low_data = (uint8_t *)malloc(size); + uint8_t *ref_high_data = (uint8_t *)malloc(size); + uint8_t *ref_low_data = (uint8_t *)malloc(size); + if (!a_high_data || !a_low_data || !b_high_data || !b_low_data || !ref_high_data || !ref_low_data) { + ret = -1; + goto fail_exit; + } + + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + for (uint32_t i = 0; i < size; i++) { + a_high_data[i] = rand() % 64+ i ; + a_low_data[i] = i; + b_high_data[i] = (i + 250) / 20; + b_low_data[i] = 100 - i; + } + if(relu_enable) + rshift_bits = 7; + else + rshift_bits = 1; + + tl_add_ref(ref_high_data, ref_low_data, + a_high_data, a_low_data, + b_high_data, b_low_data, + rshift_bits, + size, relu_enable); + + cvk_tl_t *tl_a_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_a_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + uint8_t *res_high_data = NULL, *res_low_data = NULL; + if (!tl_a_low || !tl_a_high || !tl_b_low || !tl_b_high || !tl_res_low || !tl_res_high) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_low, a_low_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_high, a_high_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b_low, b_low_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b_high, b_high_data); + cvk_tiu_add_param_t p4; + p4.res_high = relu_enable ? 
0 : tl_res_high; + p4.res_low = tl_res_low; + p4.a_high = tl_a_high; + p4.a_low = tl_a_low; + p4.b_is_const = 0; + p4.b.high = tl_b_high; + p4.b.low = tl_b_low; + p4.rshift_bits = rshift_bits; + p4.relu_enable = relu_enable; + cvk_ctx->ops->tiu_add(cvk_ctx, &p4); + res_high_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_high); + res_low_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_low); + for (uint64_t i = 0; i < size; i++) { + if(!relu_enable) + if (res_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at res_high_data[%" PRIu64 "], got %d, exp %d\n", + i, res_high_data[i], ref_high_data[i]); + ret = -1; + } + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + ret = -1; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_low); + + free(res_high_data); + free(res_low_data); + } + +fail_exit: + free(a_high_data); + free(a_low_data); + free(b_high_data); + free(b_low_data); + free(ref_high_data); + free(ref_low_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_add(rt_handle, cvk_ctx, 0); + ret |= test_tl_add(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_tensor_add_const.c b/cviruntime/test/180x/test_180x_tensor_add_const.c new file mode 100644 index 000000000..3b7ec75aa --- /dev/null +++ b/cviruntime/test/180x/test_180x_tensor_add_const.c @@ -0,0 +1,178 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_add_const_ref( + uint8_t *ref_high, uint8_t *ref_low, + uint8_t *a_high, uint8_t *a_low, + int16_t b, int b_is_signed, + int rshift_bits, + uint64_t size, int relu_enable) +{ + for (uint64_t i = 0; i < size; i++) { + int32_t ta = ((int8_t)a_high[i] << 8) + a_low[i]; + int32_t tb = b_is_signed? 
b: (uint16_t)b; + int32_t res = ta + tb; + res += 1 << (rshift_bits - 1); + res >>= rshift_bits; + + if(relu_enable) + { + if (res > 127) + res = 127; + else if (res < -128) + res = -128; + + if(relu_enable) + if(res<0) + res=0; + ref_high[i] = 0; + ref_low[i] = res & 0xff; + }else{ + if (res > 32767) + res = 32767; + else if (res < -32768) + res = -32768; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } + } +} + +static int test_tl_add_const(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 3; + int c = 9; // 39 -> 9 for 180x + int h = 7; + int w = 37; + int rshift_bits; + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + uint8_t *a_high_data = (uint8_t *)malloc(size); + uint8_t *a_low_data = (uint8_t *)malloc(size); + uint8_t *ref_high_data = (uint8_t *)malloc(size); + uint8_t *ref_low_data = (uint8_t *)malloc(size); + if (!a_high_data || !a_low_data || !ref_high_data || !ref_low_data) { + ret = -1; + goto fail_exit; + } + + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + int16_t b; + int b_is_signed = 1; + for (uint32_t i = 0; i < size; i++) { + a_high_data[i] = rand() % 64+ i; + a_low_data[i] = i; + } + + if(relu_enable) + { + b=-64; + rshift_bits = 7; + } + else + { + b=-278; + rshift_bits = 1; + } + + tl_add_const_ref(ref_high_data, ref_low_data, + a_high_data, a_low_data, + b, b_is_signed, rshift_bits, size,relu_enable); + + cvk_tl_t *tl_a_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_a_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + uint8_t *res_high_data = NULL, *res_low_data = NULL; + if (!tl_a_low || !tl_a_high || !tl_res_low || !tl_res_high) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_low, a_low_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_high, a_high_data); + + cvk_tiu_add_param_t p4; + p4.res_high = relu_enable ? 
0 : tl_res_high; + p4.res_low = tl_res_low; + p4.a_high = tl_a_high; + p4.a_low = tl_a_low; + p4.b_is_const = 1; + p4.b_const.val = b; + p4.b_const.is_signed = b_is_signed; + p4.rshift_bits = rshift_bits; + p4.relu_enable = relu_enable; + cvk_ctx->ops->tiu_add(cvk_ctx, &p4); + + res_high_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_high); + res_low_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_low); + for (uint64_t i = 0; i < size; i++) { + if(!relu_enable) + if (res_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at res_high_data[%" PRIu64 "], got %d, exp %d\n", + i, res_high_data[i], ref_high_data[i]); + ret = -1; + } + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + ret = -1; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_low); + free(res_high_data); + free(res_low_data); + } + +fail_exit: + free(a_high_data); + free(a_low_data); + free(ref_high_data); + free(ref_low_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_add_const(rt_handle, cvk_ctx, 0); + ret |= test_tl_add_const(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_tensor_and.c b/cviruntime/test/180x/test_180x_tensor_and.c new file mode 100644 index 000000000..89135a339 --- /dev/null +++ b/cviruntime/test/180x/test_180x_tensor_and.c @@ -0,0 +1,240 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_and_int8_ref(int8_t *a, int8_t *b, int8_t *res, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) + res[i] = a[i] & b[i]; +} + +static void tl_and_int16_ref( + uint8_t *ref_high, uint8_t *ref_low, + uint8_t *a_high, uint8_t *a_low, + uint8_t *b_high, uint8_t *b_low, + uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) { + int32_t ta = ((int8_t)a_high[i] << 8) + a_low[i]; + int32_t tb = ((int8_t)b_high[i] << 8) + b_low[i]; + int32_t res = ta & tb; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } +} + +static int test_tl_and_int8(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 3; + int c = 9; // 39 -> 9 for 180x + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + int8_t *a_data = (int8_t *)malloc(size); + int8_t *b_data = (int8_t *)malloc(size); + int8_t *ref_data = (int8_t *)malloc(size); + if (!a_data || !b_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) + a_data[i] = (int8_t)(i % 256); + + for (uint32_t i = 0; i < size; i++) + b_data[i] = (int8_t)(100 - i % 256); + + tl_and_int8_ref(a_data, b_data, ref_data, size); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b = 
cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + if (!tl_a || !tl_b || !tl_res) { + printf(" %s: fail to alloc tl\n", __FUNCTION__); + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b, (uint8_t *)b_data); + cvk_tiu_and_int8_param_t p9; + p9.res = tl_res; + p9.a = tl_a; + p9.b = tl_b; + cvk_ctx->ops->tiu_and_int8(cvk_ctx, &p9); + uint8_t *res_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res); + + for (uint32_t i = 0; i < size; i++) { + if ((int8_t)res_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%u], got %x, exp %x\n", + i, res_data[i], ref_data[i]); + ret = -1; + break; + } + } + + free(res_data); + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + +fail_exit: + free(a_data); + free(b_data); + free(ref_data); + + printf(" %s(eu_align=%d) %s\n", __FUNCTION__, eu_align, ret ? "fail" : "pass"); + + return ret; +} + +static int test_tl_and_int16(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 3; + int c = 9; // 39 -> 9 for 180x + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + uint8_t *a_high_data = (uint8_t *)malloc(size); + uint8_t *a_low_data = (uint8_t *)malloc(size); + uint8_t *b_high_data = (uint8_t *)malloc(size); + uint8_t *b_low_data = (uint8_t *)malloc(size); + uint8_t *ref_high_data = (uint8_t *)malloc(size); + uint8_t *ref_low_data = (uint8_t *)malloc(size); + if (!a_high_data || !a_low_data || !b_high_data || !b_low_data || !ref_high_data || !ref_low_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) { + a_high_data[i] = i / 10; + a_low_data[i] = i; + b_high_data[i] = (i + 250) / 20; + b_low_data[i] = 100 - i; + } + + tl_and_int16_ref( + ref_high_data, ref_low_data, + a_high_data, a_low_data, + b_high_data, b_low_data, + size); + + cvk_tl_t *tl_a_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_a_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + if (!tl_a_low || !tl_a_high || !tl_b_low || !tl_b_high || !tl_res_low || !tl_res_high){ + printf(" %s: fail to alloc tl\n", __FUNCTION__); + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_low, a_low_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_high, a_high_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b_low, b_low_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b_high, b_high_data); + cvk_tiu_and_int16_param_t p8; + p8.res_high = tl_res_high; + p8.res_low = tl_res_low; + p8.a_high = tl_a_high; + p8.a_low = tl_a_low; + p8.b_high = tl_b_high; + p8.b_low = tl_b_low; + cvk_ctx->ops->tiu_and_int16(cvk_ctx, &p8); + uint8_t *res_high_data = tensor_copy_l2g_d2s(rt_handle, 
cvk_ctx, tl_res_high); + uint8_t *res_low_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_low); + + for (uint64_t i = 0; i < size; i++) { + if (res_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at res_high_data[%" PRIu64 "], got %d, exp %d\n", + i, res_high_data[i], ref_high_data[i]); + ret = 1; + break; + } + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + ret = -1; + break; + } + } + + free(res_high_data); + free(res_low_data); + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_low); + +fail_exit: + free(a_high_data); + free(a_low_data); + free(b_high_data); + free(b_low_data); + free(ref_high_data); + free(ref_low_data); + + printf(" %s(eu_align=%d) %s\n", __FUNCTION__, eu_align, ret ? "fail" : "pass"); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_and_int8(rt_handle, cvk_ctx, 0); + ret |= test_tl_and_int8(rt_handle, cvk_ctx, 1); + ret |= test_tl_and_int16(rt_handle, cvk_ctx, 0); + ret |= test_tl_and_int16(rt_handle, cvk_ctx, 1); + + printf("tensor and test %s\n", ret ? "fail" : "pass"); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_tensor_arith_shift.c b/cviruntime/test/180x/test_180x_tensor_arith_shift.c new file mode 100644 index 000000000..a6fe77acf --- /dev/null +++ b/cviruntime/test/180x/test_180x_tensor_arith_shift.c @@ -0,0 +1,154 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_arith_shift_ref( + uint8_t *ref_high, uint8_t *ref_low, + uint8_t *a_high, uint8_t *a_low, + uint8_t *bits, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) { + int32_t ta = ((int8_t)a_high[i] << 8) + a_low[i]; + int32_t tbits = (int8_t)bits[i]; + + /* + * Yes, a @tbits bigger than zero means shifting LEFT, + * no matter whether the shift type is arithmetic + * RIGHT shift or logic RIGHT shift. 
+ */ + int32_t res; + if (tbits >= 0) + res = ta << tbits; + else + res = ta >> -tbits; + + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } +} + +static int test_tl_arith_shift(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 2; // 3 -> 2 for 1810 + int c = 9; // 39 -> 9 for 180x + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + uint8_t *a_high_data = (uint8_t *)malloc(size); + uint8_t *a_low_data = (uint8_t *)malloc(size); + uint8_t *bits_data = (uint8_t *)malloc(size); + uint8_t *ref_high_data = (uint8_t *)malloc(size); + uint8_t *ref_low_data = (uint8_t *)malloc(size); + if (!a_high_data || !a_low_data || !bits_data || !ref_high_data || !ref_low_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) { + a_high_data[i] = 240 + i; + a_low_data[i] = 200 + i; + bits_data[i] = (i % 33) - 16; + } + + tl_arith_shift_ref( + ref_high_data, ref_low_data, + a_high_data, a_low_data, + bits_data, size); + + cvk_tl_t *tl_a_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_a_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_bits = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + if (!tl_a_low || !tl_a_high || !tl_bits || !tl_res_low || !tl_res_high) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_low, a_low_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_high, a_high_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_bits, bits_data); + cvk_tiu_arith_shift_param_t p8; + p8.res_high = tl_res_high; + p8.res_low = tl_res_low; + p8.a_high = tl_a_high; + p8.a_low = tl_a_low; + p8.bits = tl_bits; + cvk_ctx->ops->tiu_arith_shift(cvk_ctx, &p8); + uint8_t *res_high_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_high); + uint8_t *res_low_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_low); + + for (uint32_t i = 0; i < size; i++) { + if (res_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at res_high_data[%u], got %d, exp %d\n", + i, res_high_data[i], ref_high_data[i]); + ret = -1; + break; + } + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%u], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + ret = -1; + break; + } + } + + free(res_high_data); + free(res_low_data); + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_bits); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_low); + +fail_exit: + free(a_high_data); + free(a_low_data); + free(bits_data); + free(ref_high_data); + free(ref_low_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= 
test_tl_arith_shift(rt_handle, cvk_ctx, 0); + ret |= test_tl_arith_shift(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_tensor_ge.c b/cviruntime/test/180x/test_180x_tensor_ge.c new file mode 100644 index 000000000..8406faac1 --- /dev/null +++ b/cviruntime/test/180x/test_180x_tensor_ge.c @@ -0,0 +1,122 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_ge_ref(int8_t *a, int8_t *b, int8_t *result, uint64_t size, cvk_fmt_t fmt) +{ + for (uint64_t i = 0; i < size; i++) { + int32_t a32 = (fmt == CVK_FMT_I8) ? (int8_t)a[i] : (uint8_t)a[i]; + int32_t b32 = (fmt == CVK_FMT_I8) ? (int8_t)b[i] : (uint8_t)b[i]; + if (a32 >= b32) + result[i] = 1; + else + result[i] = 0; + } +} + +static int test_tl_ge(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 3; + int c = 9; // 39 -> 9 for 180x + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + int8_t *a_data = (int8_t *)malloc(size); + int8_t *b_data = (int8_t *)malloc(size); + int8_t *ref_data = (int8_t *)malloc(size); + if (!a_data || !b_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (int i = 0; i < 2; i++) { + for (uint32_t i = 0; i < size; i++) + a_data[i] = (int8_t)(i % 256); + + for (uint32_t i = 0; i < size; i++) + b_data[i] = (int8_t)(100 - i % 256); + + cvk_fmt_t fmt = (i == 0) ? CVK_FMT_I8 : CVK_FMT_U8; + tl_ge_ref(a_data, b_data, ref_data, size, fmt); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt, eu_align); + cvk_tl_t *tl_b = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt, eu_align); + cvk_tl_t *tl_ge = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt, eu_align); + if (!tl_a || !tl_b || !tl_ge) { + ret = -1; + goto fail_exit; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b, (uint8_t *)b_data); + + cvk_tiu_ge_param_t p; + memset(&p, 0, sizeof(p)); + p.a = tl_a; + p.b_is_const = 0; + p.b = tl_b; + p.ge = tl_ge; + cvk_ctx->ops->tiu_ge(cvk_ctx, &p); + uint8_t *ge_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_ge); + + for (uint64_t i = 0; i < size; i++) { + if ((int8_t)ge_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, ge_data[i], ref_data[i]); + ret = -1; + } + } + + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_ge); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + free(ge_data); + } + +fail_exit: + free(a_data); + free(b_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_ge(rt_handle, cvk_ctx, 0); + ret |= test_tl_ge(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_tensor_ge_const.c b/cviruntime/test/180x/test_180x_tensor_ge_const.c new file mode 100644 index 000000000..f2392a952 --- /dev/null +++ 
b/cviruntime/test/180x/test_180x_tensor_ge_const.c @@ -0,0 +1,119 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_ge_const_ref(int8_t *a, int8_t b, int8_t *result, uint64_t size, cvk_fmt_t fmt) +{ + for (uint64_t i = 0; i < size; i++) { + int32_t a32 = (fmt == CVK_FMT_I8) ? (int8_t)a[i] : (uint8_t)a[i]; + int32_t b32 = (fmt == CVK_FMT_I8) ? (int8_t)b : (uint8_t)b; + if (a32 >= b32) + result[i] = 1; + else + result[i] = 0; + } +} + +static int test_tl_ge_const(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + int8_t *a_data = (int8_t *)malloc(size); + int8_t *ref_data = (int8_t *)malloc(size); + if (!a_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (int i = 0; i < 2; i++) { + for (uint32_t i = 0; i < size; i++) + a_data[i] = (int8_t)(i % 256); + + int8_t b = 47; + + cvk_fmt_t fmt = (i == 1) ? CVK_FMT_I8 : CVK_FMT_U8; + tl_ge_const_ref(a_data, b, ref_data, size, fmt); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt, eu_align); + cvk_tl_t *tl_ge = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt, eu_align); + uint8_t *ge_data = NULL; + if (!tl_a || !tl_ge) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + cvk_tiu_ge_param_t p; + memset(&p, 0, sizeof(p)); + p.ge = tl_ge; + p.a = tl_a; + p.b_is_const = 1; + p.b_const.val = b; + p.b_const.is_signed = i; + cvk_ctx->ops->tiu_ge(cvk_ctx, &p); + ge_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_ge); + + for (uint64_t i = 0; i < size; i++) { + if ((int8_t)ge_data[i] != (int8_t)ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, ge_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_ge); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + free(ge_data); + } + +fail_exit: + free(a_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_ge_const(rt_handle, cvk_ctx, 0); + ret |= test_tl_ge_const(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_tensor_mac.c b/cviruntime/test/180x/test_180x_tensor_mac.c new file mode 100644 index 000000000..02c368b26 --- /dev/null +++ b/cviruntime/test/180x/test_180x_tensor_mac.c @@ -0,0 +1,184 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_mac_ref( + uint8_t *ref_high, uint8_t *ref_low, + uint8_t *a, uint8_t *b, uint8_t *c_high, uint8_t *c_low, + int lshift_bits, int rshift_bits, uint64_t size, int relu_enable) +{ + for (uint64_t i = 0; i < size; i++) { + int32_t ta = (int8_t)a[i]; + int32_t tb = (int8_t)b[i]; + int32_t tc = ((int8_t)c_high[i] << 8) + c_low[i]; + tc <<= lshift_bits; + int32_t res = ta * tb + tc; + + res += 1 << (rshift_bits - 1); + res >>= rshift_bits; + if(relu_enable) + { + if 
(res > 127) + res = 127; + else if (res < -128) + res = -128; + + if(relu_enable) + if(res<0) + res=0; + ref_high[i] = 0; + ref_low[i] = res & 0xff; + + }else{ + if (res > 32767) + res = 32767; + else if (res < -32768) + res = -32768; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } + } +} + +static int test_tl_mac(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int lshift_bits; + int rshift_bits; + int n = 3; + int c = 9; // 39 -> 9 for 180x + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + uint8_t *a_data = (uint8_t *)malloc(size); + uint8_t *b_data = (uint8_t *)malloc(size); + uint8_t *c_high_data = (uint8_t *)malloc(size); + uint8_t *c_low_data = (uint8_t *)malloc(size); + uint8_t *ref_high_data = (uint8_t *)malloc(size); + uint8_t *ref_low_data = (uint8_t *)malloc(size); + if (!a_data || !b_data || !c_high_data || !c_low_data || !ref_high_data || !ref_low_data) { + ret = -1; + goto fail_exit; + } + + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + for (uint32_t i = 0; i < size; i++) { + a_data[i] = rand() % 128; + b_data[i] = 100 - i; + c_high_data[i] = rand() % 64; + c_low_data[i] = 200 + 2 * i; + } + + if(relu_enable) { + lshift_bits= 1; + rshift_bits = 7; + }else { + lshift_bits = 1; + rshift_bits = 3; + } + + tl_mac_ref(ref_high_data, ref_low_data, + a_data, b_data, c_high_data, c_low_data, + lshift_bits, rshift_bits, size, relu_enable); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_c_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_c_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + uint8_t *mac_high_data = NULL, *mac_low_data = NULL; + if (!tl_a || !tl_b || !tl_c_low || !tl_c_high) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, a_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b, b_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_c_low, c_low_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_c_high, c_high_data); + cvk_tiu_mac_param_t p2; + p2.res_high = tl_c_high; + p2.res_low = tl_c_low; + p2.res_is_int8 = relu_enable; + p2.a = tl_a; + p2.b_is_const = 0; + p2.b = tl_b; + p2.lshift_bits = lshift_bits; + p2.rshift_bits = rshift_bits; + p2.relu_enable = relu_enable; + cvk_ctx->ops->tiu_mac(cvk_ctx, &p2); + mac_high_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_c_high); + mac_low_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_c_low); + + for (uint32_t i = 0; i < size; i++) { + if(!relu_enable) + if (mac_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at mac_high_data[%u], got %d, exp %d\n", + i, mac_high_data[i], ref_high_data[i]); + ret = -1; + } + if (mac_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at mac_low_data[%u], got %d, exp %d\n", + i, mac_low_data[i], ref_low_data[i]); + ret = -1; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_c_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_c_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + + free(mac_high_data); + free(mac_low_data); + } + +fail_exit: + free(a_data); + free(b_data); + free(c_high_data); + free(c_low_data); + 
free(ref_high_data); + free(ref_low_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_mac(rt_handle, cvk_ctx, 0); + ret |= test_tl_mac(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_tensor_mac_const.c b/cviruntime/test/180x/test_180x_tensor_mac_const.c new file mode 100644 index 000000000..4f3862d27 --- /dev/null +++ b/cviruntime/test/180x/test_180x_tensor_mac_const.c @@ -0,0 +1,184 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_mac_const_ref( + uint8_t *ref_high, uint8_t *ref_low, + uint8_t *a, uint8_t b_const, int b_is_signed, + uint8_t *c_high, uint8_t *c_low, + int lshift_bits, int rshift_bits, uint64_t size, int relu_enable) +{ + for (uint64_t i = 0; i < size; i++) { + int32_t ta = (int8_t)a[i]; + int32_t tb = b_is_signed? (int8_t)b_const: (uint8_t)b_const; + int32_t tc = ((int8_t)c_high[i] << 8) + c_low[i]; + tc <<= lshift_bits; + int32_t res = ta * tb + tc; + + res += 1 << (rshift_bits - 1); + res >>= rshift_bits; + + if(relu_enable) + { + if (res > 127) + res = 127; + else if (res < -128) + res = -128; + + if(relu_enable) + if(res<0) + res=0; + ref_high[i] = 0; + ref_low[i] = res & 0xff; + + }else{ + if (res > 32767) + res = 32767; + else if (res < -32768) + res = -32768; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } + } +} + +static int test_tl_mac_const(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int lshift_bits; + int rshift_bits; + int n = 3; + int c = 9; // 39 -> 9 for 180x + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + uint8_t *a_data = (uint8_t *)malloc(size); + uint8_t *c_high_data = (uint8_t *)malloc(size); + uint8_t *c_low_data = (uint8_t *)malloc(size); + uint8_t *ref_high_data = (uint8_t *)malloc(size); + uint8_t *ref_low_data = (uint8_t *)malloc(size); + if (!a_data || !c_high_data || !c_low_data || !ref_high_data || !ref_low_data) { + ret = -1; + goto fail_exit; + } + + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + + for (uint64_t i = 0; i < size; i++) { + a_data[i] = rand() % 256; + c_high_data[i] = rand() % 64; + c_low_data[i] = 200 + 2 * i; + } + + uint8_t b_const = 37; + int b_is_signed = 1; + if(relu_enable) { + lshift_bits = 1; + rshift_bits = 8; + }else { + lshift_bits = 1; + rshift_bits = 3; + } + + tl_mac_const_ref(ref_high_data, ref_low_data, + a_data, b_const, b_is_signed, c_high_data, c_low_data, + lshift_bits, rshift_bits, size, relu_enable); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_c_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_c_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + uint8_t *mac_high_data = NULL, *mac_low_data = NULL; + if (!tl_a || !tl_c_low || !tl_c_high) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, a_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, 
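+ /* tensor_copy_s2d_g2l: presumably "system to device" followed by
+  * "global to local" — the helper stages host data into device DRAM
+  * and then DMAs it into TPU local memory; tensor_copy_l2g_d2s used
+  * after each op is the reverse path. */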
tl_c_low, c_low_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_c_high, c_high_data); + cvk_tiu_mac_param_t p3; + p3.res_high = tl_c_high; + p3.res_low = tl_c_low; + p3.res_is_int8 = relu_enable; + p3.a = tl_a; + p3.b_is_const = 1; + p3.b_const.val = b_const; + p3.b_const.is_signed = b_is_signed; + p3.lshift_bits = lshift_bits; + p3.rshift_bits = rshift_bits; + p3.relu_enable = relu_enable; + cvk_ctx->ops->tiu_mac(cvk_ctx, &p3); + mac_high_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_c_high); + mac_low_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_c_low); + for (uint64_t i = 0; i < size; i++) { + if(!relu_enable) + if (mac_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at mac_high_data[%" PRIu64 "], got %d, exp %d\n", + i, mac_high_data[i], ref_high_data[i]); + ret = -1; + break; + } + if (mac_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at mac_low_data[%" PRIu64 "], got %d, exp %d\n", + i, mac_low_data[i], ref_low_data[i]); + ret = -1; + break; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_c_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_c_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + free(mac_high_data); + free(mac_low_data); + } + +fail_exit: + free(a_data); + free(c_high_data); + free(c_low_data); + free(ref_high_data); + free(ref_low_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_mac_const(rt_handle, cvk_ctx, 0); + ret |= test_tl_mac_const(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_tensor_max.c b/cviruntime/test/180x/test_180x_tensor_max.c new file mode 100644 index 000000000..42ffd9f1c --- /dev/null +++ b/cviruntime/test/180x/test_180x_tensor_max.c @@ -0,0 +1,115 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_max_ref(int8_t *a, int8_t *b, int8_t *max, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) { + if (a[i] > b[i]) + max[i] = a[i]; + else + max[i] = b[i]; + } +} + +static int test_tl_max(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int n = 3; + int c = 9; // 39 -> 9 for 180x + int h = 7; + int w = 37; + int ret = 0; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + int8_t *a_data = (int8_t *)malloc(size); + int8_t *b_data = (int8_t *)malloc(size); + int8_t *ref_data = (int8_t *)malloc(size); + uint8_t *max_data = NULL; + if (!a_data || !b_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) + a_data[i] = (int8_t)(i % 256); + + for (uint32_t i = 0; i < size; i++) + b_data[i] = (int8_t)(100 - i % 256); + + tl_max_ref(a_data, b_data, ref_data, size); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_max = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + 
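+ /* tiu_max accepts either a tensor or an immediate second operand;
+  * b_is_const = 0 below selects the tensor-tensor form, while the
+  * companion max_const test exercises b_is_const = 1. */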
tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b, (uint8_t *)b_data); + + cvk_tiu_max_param_t p; + memset(&p, 0, sizeof(p)); + p.max = tl_max; + p.a = tl_a; + p.b_is_const = 0; + p.b = tl_b; + cvk_ctx->ops->tiu_max(cvk_ctx, &p); + max_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_max); + + for (uint64_t i = 0; i < size; i++) { + if ((int8_t)max_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, max_data[i], ref_data[i]); + ret = -1; + goto fail_exit; + } + } + + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_max); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + +fail_exit: + free(a_data); + free(b_data); + free(ref_data); + free(max_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_max(rt_handle, cvk_ctx, 0); + ret |= test_tl_max(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_tensor_max_const.c b/cviruntime/test/180x/test_180x_tensor_max_const.c new file mode 100644 index 000000000..8287a7d9f --- /dev/null +++ b/cviruntime/test/180x/test_180x_tensor_max_const.c @@ -0,0 +1,109 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_max_const_ref(int8_t *a, int8_t b, int8_t *max, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) { + if (a[i] > b) + max[i] = a[i]; + else + max[i] = b; + } +} + +static int test_tl_max_const(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + int8_t *a_data = (int8_t *)malloc(size); + int8_t *ref_data = (int8_t *)malloc(size); + uint8_t *max_data = NULL; + if (!a_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) + a_data[i] = (int8_t)(i % 256); + + int8_t b = 47; + + tl_max_const_ref(a_data, b, ref_data, size); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_max = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + cvk_tiu_max_param_t p; + memset(&p, 0, sizeof(p)); + p.max = tl_max; + p.a = tl_a; + p.b_is_const = 1; + p.b_const.val = b; + p.b_const.is_signed = 1; + cvk_ctx->ops->tiu_max(cvk_ctx, &p); + max_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_max); + + for (uint32_t i = 0; i < size; i++) { + if ((int8_t)max_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%u], got %x, exp %x\n", + i, max_data[i], ref_data[i]); + ret = -1; + goto fail_exit; + } + } + + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_max); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + +fail_exit: + free(a_data); + free(ref_data); + free(max_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + 
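+ /* The argc/argv guards here cannot realistically fail; they appear
+  * to exist only so the otherwise-unused parameters survive the
+  * -Werror -Wall -Wextra build flags (an inference, not stated
+  * anywhere in this patch). */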
return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_max_const(rt_handle, cvk_ctx, 0); + ret |= test_tl_max_const(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_tensor_min.c b/cviruntime/test/180x/test_180x_tensor_min.c new file mode 100644 index 000000000..ec585c94f --- /dev/null +++ b/cviruntime/test/180x/test_180x_tensor_min.c @@ -0,0 +1,117 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_min_ref(int8_t *a, int8_t *b, int8_t *max, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) { + if (a[i] > b[i]) + max[i] = b[i]; + else + max[i] = a[i]; + } +} + +static int test_tl_min(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int n = 3; + int c = 9; // 39 -> 9 for 180x + int h = 7; + int w = 37; + int ret = 0; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + int8_t *a_data = (int8_t *)malloc(size); + int8_t *b_data = (int8_t *)malloc(size); + int8_t *ref_data = (int8_t *)malloc(size); + if (!a_data || !b_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) + a_data[i] = (int8_t)(i % 256); + + for (uint32_t i = 0; i < size; i++) + b_data[i] = (int8_t)(100 - i % 256); + + tl_min_ref(a_data, b_data, ref_data, size); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_min = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + if (!tl_a || !tl_b || !tl_min) { + ret = -1; + goto fail_exit; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b, (uint8_t *)b_data); + cvk_tiu_min_param_t p6; + p6.min = tl_min; + p6.a = tl_a; + p6.b_is_const = 0; + p6.b = tl_b; + cvk_ctx->ops->tiu_min(cvk_ctx, &p6); + uint8_t *min_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_min); + + for (uint32_t i = 0; i < size; i++) { + if ((int8_t)min_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%u], got %x, exp %x\n", + i, min_data[i], ref_data[i]); + ret = -1; + break; + } + } + + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_min); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + + free(min_data); + +fail_exit: + free(a_data); + free(b_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_min(rt_handle, cvk_ctx, 0); + ret |= test_tl_min(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_tensor_min_const.c b/cviruntime/test/180x/test_180x_tensor_min_const.c new file mode 100644 index 000000000..6010c3af2 --- /dev/null +++ b/cviruntime/test/180x/test_180x_tensor_min_const.c @@ -0,0 +1,113
@@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_min_const_ref(int8_t *a, int8_t b, int8_t *max, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) { + if (a[i] > b) + max[i] = b; + else + max[i] = a[i]; + } +} + +static int test_tl_min_const(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + int8_t *a_data = (int8_t *)malloc(size); + int8_t *ref_data = (int8_t *)malloc(size); + if (!a_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) + a_data[i] = (int8_t)(i % 256); + + int8_t b = 47; + + tl_min_const_ref(a_data, b, ref_data, size); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_min = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + uint8_t *min_data = NULL; + if (!tl_a || !tl_min) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + cvk_tiu_min_param_t p7; + p7.min = tl_min; + p7.a = tl_a; + p7.b_is_const = 1; + p7.b_const.val = b; + p7.b_const.is_signed = 1; + cvk_ctx->ops->tiu_min(cvk_ctx, &p7); + min_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_min); + + for (uint32_t i = 0; i < size; i++) { + if ((int8_t)min_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%u], got %x, exp %x\n", + i, min_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_min); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + free(min_data); + +fail_exit: + free(a_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_min_const(rt_handle, cvk_ctx, 0); + ret |= test_tl_min_const(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_tensor_mul.c b/cviruntime/test/180x/test_180x_tensor_mul.c new file mode 100644 index 000000000..903610639 --- /dev/null +++ b/cviruntime/test/180x/test_180x_tensor_mul.c @@ -0,0 +1,135 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_mul_ref(int8_t *ofmap, int8_t *a, int8_t *b, uint64_t size, int shift_bits, int relu_enable) +{ + for (uint64_t i = 0; i < size; i++) { + int32_t tmp = a[i] * b[i]; + tmp += 1 << (shift_bits - 1); + tmp >>= shift_bits; + if (tmp > 127) + tmp = 127; + else if (tmp < -128) + tmp = -128; + if(relu_enable) + if(tmp<0) + tmp=0; + ofmap[i] = tmp; + + } +} + +static int test_tl_mul(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int n = 3; + int c = 9; // 39 -> 9 for 180x + int h = 7; + int w = 37; + int ret = 0; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + int shift_bits = 1; + + int8_t *a_data = (int8_t *)malloc(size); + int8_t *b_data = (int8_t *)malloc(size); + int8_t 
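+ /* tl_mul_ref above models the TIU multiply. Illustrative case with
+  * shift_bits = 1: a = 7, b = 9 gives tmp = 63, then 63 + (1 << 0)
+  * = 64 and 64 >> 1 = 32; the result is saturated to int8 and, when
+  * relu_enable is set, negatives are clamped to zero. */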
*ref_data = (int8_t *)malloc(size); + if (!a_data || !b_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t relu_enable = 0; relu_enable < 2; relu_enable++) + { + for (uint32_t i = 0; i < size; i++) { + a_data[i] = random()%0x10; + b_data[i] = 128 - i; + } + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + uint8_t *res_low_data = NULL; + if (!tl_a || !tl_b || !tl_res_low) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b, (uint8_t *)b_data); + + cvk_tiu_mul_param_t p1; + p1.res_high = NULL; + p1.res_low = tl_res_low; + p1.a = tl_a; + p1.b_is_const = 0; + p1.b = tl_b; + p1.rshift_bits = shift_bits; + p1.relu_enable = relu_enable; + cvk_ctx->ops->tiu_mul(cvk_ctx, &p1); + + res_low_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_low); + + tl_mul_ref(ref_data, a_data, b_data, size, shift_bits, relu_enable); + + for (uint32_t i = 0; i < size; i++) { + if ((int8_t)res_low_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%u], got %x, exp %x\n", + i, res_low_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + free(res_low_data); + } + +fail_exit: + free(a_data); + free(b_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_mul(rt_handle, cvk_ctx, 0); + ret |= test_tl_mul(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_tensor_mul_const.c b/cviruntime/test/180x/test_180x_tensor_mul_const.c new file mode 100644 index 000000000..c704963d6 --- /dev/null +++ b/cviruntime/test/180x/test_180x_tensor_mul_const.c @@ -0,0 +1,134 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_mul_const_ref( + int8_t *ofmap, int8_t *ifmap, uint64_t size, int8_t mul_const, int shift_bits, int relu_enable) +{ + for (uint64_t i = 0; i < size; i++) { + int32_t tmp = ifmap[i] * mul_const; + tmp += 1 << (shift_bits - 1); + tmp >>= shift_bits; + if (tmp > 127) + tmp = 127; + else if (tmp < -128) + tmp = -128; + if(relu_enable) + if(tmp<0) + tmp=0; + + ofmap[i] = tmp; + } +} + +static int test_tl_mul_const(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + + int8_t *ifmap_data = (int8_t *)malloc(size); + int8_t *ref_data = (int8_t *)malloc(size); + if (!ifmap_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t relu_enable = 0; relu_enable < 2; relu_enable++) + { + for (uint32_t i = 0; i < size; i++) + ifmap_data[i] 
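+ /* The byte pattern below wraps into [-128, 127] when read back
+  * through int8_t, so roughly half of the inputs are negative. */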
= (uint8_t)(random() % 256); + + int8_t mul_const = 20; + int shift_bits = 1; + + tl_mul_const_ref(ref_data, ifmap_data, size, mul_const, shift_bits, relu_enable); + + cvk_tl_t *tl_ifmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_ofmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + uint8_t *ofmap_data = NULL; + if (!tl_ifmap || !tl_ofmap) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_ifmap, (uint8_t *)ifmap_data); + + cvk_tiu_mul_param_t p; + memset(&p, 0, sizeof(p)); + p.res_high = NULL; + p.res_low = tl_ofmap; + p.a = tl_ifmap; + p.b_is_const = 1; + p.b_const.val = mul_const; + p.b_const.is_signed = 1; + p.rshift_bits = shift_bits; + p.relu_enable = relu_enable; + + cvk_ctx->ops->tiu_mul(cvk_ctx, &p); + + ofmap_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_ofmap); + + for (uint32_t i = 0; i < size; i++) { + if ((int8_t)ofmap_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%u], got %x, exp %x\n", + i, ofmap_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_ofmap); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_ifmap); + free(ofmap_data); + } + +fail_exit: + free(ifmap_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_mul_const(rt_handle, cvk_ctx, 0); + ret |= test_tl_mul_const(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_tensor_mul_qm.c b/cviruntime/test/180x/test_180x_tensor_mul_qm.c new file mode 100644 index 000000000..711dc5781 --- /dev/null +++ b/cviruntime/test/180x/test_180x_tensor_mul_qm.c @@ -0,0 +1,589 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" +#include "test_tf_quant_util.h" + +// #define ENABLE_DEBUG_MSG + +#define MIN_EXEC_TESTS 20 + +typedef struct { + int input_n; + int input_c; + int input_h; + int input_w; + int relu_enable; + int8_t *input1_data; + int8_t *input2_data; + int8_t *output_data; + uint32_t multiplier; + int8_t right_shift; + float float_multiplier; + int retry_cnt; +} elt_mul_test_param_t; + +void elt_mul_ref(elt_mul_test_param_t *p_param) +{ + int input_n = p_param->input_n; + int input_c = p_param->input_c; + int input_h = p_param->input_h; + int input_w = p_param->input_w; + int32_t output_multiplier = p_param->multiplier; + int8_t output_rshift = p_param->right_shift; + int8_t *input1_data = p_param->input1_data; + int8_t *input2_data = p_param->input2_data; + int8_t *output_data = p_param->output_data; + + int32_t quantized_activation_min = -128; + int32_t quantized_activation_max = 127; + + int size = input_n * input_c * input_h * input_w; +#ifdef ENABLE_DEBUG_MSG + printf("elt_mul_ref:\n"); + printf(" shape (%d, %d, %d, %d)\n", input_n, input_c, input_h, input_w); +#endif + for (int i = 0; i < size; ++i) { + const int32_t input1_val = input1_data[i]; + const int32_t input2_val = input2_data[i]; + const int32_t unclamped_result = MultiplyByQuantizedMultiplier( + input1_val * input2_val, output_multiplier, 
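+ /* MultiplyByQuantizedMultiplier is the TFLite/gemmlowp-style
+  * requantization step: the int32 product is scaled by a Q31
+  * fixed-point multiplier and then rounding-right-shifted, i.e.
+  * roughly round(x * multiplier * 2^-31 * 2^-rshift). */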
output_rshift); + const int32_t clamped_output = + MIN(quantized_activation_max, + MAX(quantized_activation_min, unclamped_result)); + +#ifdef ENABLE_DEBUG_MSG + printf(" [%d] unclamped_result %d, clamped_output %d\n", i, + unclamped_result, clamped_output); +#endif + + output_data[i] = clamped_output; + } +} + +void calc_elt_mul_float_multiplier(elt_mul_test_param_t *p_param) +{ + int input_n = p_param->input_n; + int input_c = p_param->input_c; + int input_h = p_param->input_h; + int input_w = p_param->input_w; + int8_t *input1_data = p_param->input1_data; + int8_t *input2_data = p_param->input2_data; + + int output_min = INT_MAX; + int output_max = INT_MIN; + +#ifdef ENABLE_DEBUG_MSG + printf("calc_elt_mul_float_multiplier =>\n"); +#endif + + int size = input_n * input_c * input_h * input_w; + for (int i = 0; i < size; ++i) { + const int32_t input1_val = input1_data[i]; + const int32_t input2_val = input2_data[i]; + + const int32_t val = input1_val * input2_val; + + output_max = MAX(val, output_max); + output_min = MIN(val, output_min); + } + + // Since int8 ranges from -128 to 127, we need to squeeze the accumulator + // MIN/MAX fit in those ranges correspondingly as much as possible. + if (abs(output_max) > abs(output_min)) { + p_param->float_multiplier = 127.0f / abs(output_max); + } else { + p_param->float_multiplier = 128.0f / abs(output_min); + } + +#ifdef ENABLE_DEBUG_MSG + printf(" output_accu_min %d, output_accu_max %d, output_multiplier %f\n", + output_min, output_max, p_param->float_multiplier); +#endif + +#ifdef ENABLE_DEBUG_MSG + printf("<= calc_elt_mul_float_multiplier\n"); +#endif +} + +int simple_test(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + + // TFL: QuantizedMulOpTest.NoActivationInt8 + int size = 4; + int8_t input1_data[4] = {-102, 25, 115, 89}; + int8_t input2_data[4] = {77, 51, 115, 102}; + int8_t ref_output_data[4] = {-62, 10, 104, 71}; + int8_t output_data[4]; + uint32_t output_multiplier = 1077952640; + int8_t output_rshift = 6; // change to right shift + + elt_mul_test_param_t test_param; + memset(&test_param, 0, sizeof(test_param)); + + test_param.input_n = 1; + test_param.input_c = 1; + test_param.input_h = 1; + test_param.input_w = 4; + test_param.input1_data = input1_data; + test_param.input2_data = input2_data; + test_param.output_data = output_data; + test_param.multiplier = output_multiplier; + test_param.right_shift = output_rshift; + elt_mul_ref(&test_param); + + for (int i = 0; i < size; ++i) { + if (output_data[i] != ref_output_data[i]) { + printf(" Error ! 
output_data[%d] = %d != %d\n", i, output_data[i], + ref_output_data[i]); + ret = -1; + } + } + + cvk_tl_shape_t tl_shape = {1, 1, 1, size}; + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, /*align=*/1); + cvk_tl_t *tl_b = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, /*align=*/1); + cvk_tl_t *tl_res = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, /*align=*/1); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)input1_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b, (uint8_t *)input2_data); + + { + cvk_tiu_mul_qm_param_t p1; + p1.res_high = NULL; + p1.res_low = tl_res; + p1.a = tl_a; + p1.b_is_const = 0; + p1.b = tl_b; + p1.rshift_bits = output_rshift; + p1.relu_enable = 0; + p1.multiplier = output_multiplier; + cvk_ctx->ops->tiu_mul_qm(cvk_ctx, &p1); + } + + int8_t *res_tiu_data = + (int8_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res); + for (int i = 0; i < size; ++i) { + if (res_tiu_data[i] != ref_output_data[i]) { + printf(" Error ! result[%d] %d != %d\n", i, res_tiu_data[i], + ref_output_data[i]); + ret = -1; + } + } + + free(res_tiu_data); + + // Reserver order + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + + return ret; +} + +int choose_from_range(int table[], int size, int index) +{ + if (index >= size) { + return 0; + } + + int val = table[index]; + if (index < (size - 1)) { + int range = MAX(table[index + 1] - table[index] - 1, 1); + val += rand() % range; + } + + return val; +} + +bool check_valid_test_param(cvk_context_t *cvk_ctx, elt_mul_test_param_t *p_param) +{ + uint32_t input_n = p_param->input_n; + uint32_t input_c = p_param->input_c; + uint32_t input_h = p_param->input_h; + uint32_t input_w = p_param->input_w; + + // input1, input2, output + uint32_t total_needed_size = 3 * input_n * input_c * input_h * input_w; + + uint32_t lmem_size_per_lane = cvk_ctx->info.lmem_size; + uint32_t total_lmem_size = cvk_ctx->info.lmem_size * cvk_ctx->info.npu_num; + + if (total_needed_size > total_lmem_size) { + return false; + } + + cvk_tl_shape_t input_shape = {input_n, input_c, input_h, input_w}; + + uint32_t needed_size = + 3 * cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, input_shape, CVK_FMT_I8, /*eu_align=*/1); + + // Skip invalid shape + if (needed_size > lmem_size_per_lane) { + return false; + } + + return true; +} + +void fill_random_data_s8(int8_t *input_data, int size) +{ + for (int i = 0; i < size; ++i) { + int is_satured = ((rand() % 1000) == 1) ? 1 : 0; + int is_sign = rand() % 2 ? 
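+ /* Roughly one sample in a thousand is forced to a saturated value
+  * (-128 or 127) so the clamping paths of the requantization get
+  * exercised, not only the common mid-range case. */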
1 : -1; + + if (is_satured && is_sign) { + input_data[i] = -128; + } else if (is_satured) { + input_data[i] = 127; + } else { + input_data[i] = is_sign * rand() % 128; + } + } +} + +void dump_test_param(elt_mul_test_param_t *p_param, bool dump_content) +{ + printf("Dump test parameter:\n"); + printf(" input_n %d\n", p_param->input_n); + printf(" input_c %d\n", p_param->input_c); + printf(" input_h %d\n", p_param->input_h); + printf(" input_w %d\n", p_param->input_w); + printf(" multiplier %d\n", p_param->multiplier); + printf(" right_shift %d\n", p_param->right_shift); + + if (dump_content) { + printf("input1_data(%d, %d, %d, %d) :\n", p_param->input_n, + p_param->input_c, p_param->input_h, p_param->input_w); + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + for (int i = 0; i < in; ++i) { + for (int j = 0; j < ic; ++j) { + for (int k = 0; k < ih; ++k) { + for (int l = 0; l < iw; ++l) { + int offset = i * (ic * ih * iw) + j * (ih * iw) + k * iw + l; + printf("%d, ", p_param->input1_data[offset]); + } + printf("\n"); + } + } + } + printf("\n\n"); + + printf("input2_data(%d, %d, %d, %d) :\n", p_param->input_n, + p_param->input_c, p_param->input_h, p_param->input_w); + for (int i = 0; i < in; ++i) { + for (int j = 0; j < ic; ++j) { + for (int k = 0; k < ih; ++k) { + for (int l = 0; l < iw; ++l) { + int offset = i * (ic * ih * iw) + j * (ih * iw) + k * iw + l; + printf("%d, ", p_param->input2_data[offset]); + } + printf("\n"); + } + } + } + printf("\n\n"); + } +} + +int run_compare_elt_mul(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, + elt_mul_test_param_t *p_param) +{ + int ret = 0; + + int input_n = p_param->input_n; + int input_c = p_param->input_c; + int input_h = p_param->input_h; + int input_w = p_param->input_w; + + int input_size = input_n * input_c * input_h * input_w; + int8_t *input1_data = (int8_t *)malloc(input_size); + int8_t *input2_data = (int8_t *)malloc(input_size); + int8_t *output_data = (int8_t *)malloc(input_size); + + p_param->input1_data = input1_data; + p_param->input2_data = input2_data; + p_param->output_data = output_data; + +#ifdef ENABLE_DEBUG_MSG + printf(" run_compare_elt_mul => \n"); +#endif + + int retry_cnt = p_param->retry_cnt; + do { + fill_random_data_s8(input1_data, input_size); + fill_random_data_s8(input2_data, input_size); + + p_param->float_multiplier = 100.0; // should be < 1.0 + calc_elt_mul_float_multiplier(p_param); + + if (p_param->float_multiplier > 0.f && p_param->float_multiplier < 1.0) { + break; + } + + } while (--retry_cnt); + + if (p_param->float_multiplier >= 1.0) { + printf(" run_compare_elt_mul: unable to find valid multiplier\n"); + free(input1_data); + free(input2_data); + free(output_data); + return -1; + } + + uint32_t base_multiplier = 0; + int base_shift = 0; + QuantizeMultiplierSmallerThanOne(p_param->float_multiplier, &base_multiplier, + &base_shift); + + // multipliers typically range in [2^30 ; 2^31 - 1]. + // Values in [0, 2^30 - 1] are normally unused, but harmless. + // Thus a good way to randomize multipliers is to subtract from them + // a random value smaller than 2^30 but still significant compared to it. 
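+ /* QuantizeMultiplierSmallerThanOne presumably decomposes the float
+  * scale as m * 2^-s with m a Q31 integer in [2^30, 2^31), which is
+  * why subtracting a random value below 2^26 still leaves a valid
+  * multiplier. */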
+ uint32_t output_multiplier = base_multiplier - (rand() % (1 << 26)); + + int right_shift = base_shift - 1 + (rand() % 4); + int8_t output_right_shift = truncate_rshift((int8_t)right_shift, /*allow_lshift*/1); + +#ifdef ENABLE_DEBUG_MSG + printf(" multiplier_data %d, shift_data %d\n", output_multiplier, + output_right_shift); +#endif + + p_param->multiplier = output_multiplier; + p_param->right_shift = output_right_shift; + + elt_mul_ref(p_param); + + cvk_tl_shape_t input_shape = {input_n, input_c, input_h, input_w}; + + cvk_tl_t *tl_input1 = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, input_shape, CVK_FMT_I8, /*eu_aign=*/1); + + cvk_tl_t *tl_input2 = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, input_shape, CVK_FMT_I8, /*eu_aign=*/1); + + cvk_tl_t *tl_output = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, input_shape, CVK_FMT_I8, /*eu_aign=*/1); + + if (tl_input1 == NULL) { + printf(" fail to alloc tl_input1 (%d, %d, %d, %d)\n", input_n, input_c, + input_h, input_w); + return -1; + } + if (tl_input2 == NULL) { + printf(" fail to alloc tl_input2 (%d, %d, %d, %d)\n", input_n, input_c, + input_h, input_w); + return -1; + } + if (tl_output == NULL) { + printf(" fail to alloc tl_output (%d, %d, %d, %d)\n", input_n, input_c, + input_h, input_w); + return -1; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_input1, (uint8_t *)input1_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_input2, (uint8_t *)input2_data); + + { + cvk_tiu_mul_qm_param_t p1; + p1.res_high = NULL; + p1.res_low = tl_output; + p1.a = tl_input1; + p1.b_is_const = 0; + p1.b = tl_input2; + p1.rshift_bits = (uint8_t)output_right_shift; + p1.relu_enable = 0; + p1.multiplier = output_multiplier; + cvk_ctx->ops->tiu_mul_qm(cvk_ctx, &p1); + } + + + CVI_RT_Submit(cvk_ctx); + +#ifdef ENABLE_DEBUG_MSG + printf(" compare result:\n"); +#endif + int8_t *tiu_output_data = + (int8_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_output); + for (int i = 0; i < input_n; ++i) { + for (int j = 0; j < input_c; ++j) { + for (int k = 0; k < input_h; ++k) { + for (int l = 0; l < input_w; ++l) { + int offset = i * (input_c * input_h * input_w) + + j * (input_h * input_w) + k * input_w + l; + if (tiu_output_data[offset] != output_data[offset]) { + printf(" [ni=%d][oci=%d][ohi=%d][owi=%d] output %d(tiu) != " + "%d(ref)\n", + i, j, k, l, tiu_output_data[offset], output_data[offset]); + ret = -1; + break; + } + } + } + } + } + + free(tiu_output_data); + + if (ret) { + dump_test_param(p_param, /*dump_content=*/true); + } + + // Reverse order + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input2); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input1); + + free(input1_data); + free(input2_data); + free(output_data); + +#ifdef ENABLE_DEBUG_MSG + printf(" <= run_compare_elt_mul, ret %d\n", ret); +#endif + + return ret; +} + +int random_test(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + +#if 0 + int input_n_range[] = {1}; + int input_c_range[] = {1}; + int input_h_range[] = {1}; + int input_w_range[] = {1}; +#else + int input_n_range[] = {1, 2, 4095 - 32}; + int input_c_range[] = {1, 512, 4095 - 32}; + int input_h_range[] = {1, 512, 4095 - 32}; + int input_w_range[] = {1, 512, 4095 - 32}; +#endif + + const int input_n_range_size = + sizeof(input_n_range) / sizeof(input_n_range[0]); + const int input_c_range_size = + sizeof(input_c_range) / sizeof(input_c_range[0]); + const int input_h_range_size = + sizeof(input_h_range) / sizeof(input_h_range[0]); + const int input_w_range_size = + 
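+ /* choose_from_range (defined earlier) returns table[index] plus a
+  * random offset short of the next entry, so each loop level samples
+  * one concrete size from every bracket of its range table. */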
sizeof(input_w_range) / sizeof(input_w_range[0]); + + int random_seed = clock(); + srand(random_seed); + + const int retry_test_count = 100; + bool stop_at_first_error = true; + + int executed_tests = 0; + int failed_tests = 0; + + printf("1822-mul-qm: random Test =>\n"); + for (int m = 0; m < retry_test_count; ++m) { + for (int i = 0; i < input_n_range_size; ++i) { + int input_n = choose_from_range(input_n_range, input_n_range_size, i); + + for (int j = 0; j < input_c_range_size; ++j) { + int input_c = choose_from_range(input_c_range, input_c_range_size, j); + + for (int k = 0; k < input_h_range_size; ++k) { + int input_h = choose_from_range(input_h_range, input_h_range_size, k); + + for (int l = 0; l < input_w_range_size; ++l) { + int input_w = + choose_from_range(input_w_range, input_w_range_size, l); + + elt_mul_test_param_t test_param; + memset(&test_param, 0, sizeof(test_param)); + test_param.input_n = input_n; + test_param.input_c = input_c; + test_param.input_h = input_h; + test_param.input_w = input_w; + test_param.retry_cnt = 5; + + bool is_valid_param = check_valid_test_param(cvk_ctx, &test_param); + if (is_valid_param == false) + continue; + + int ret2 = run_compare_elt_mul(rt_handle, cvk_ctx, &test_param); + failed_tests = ret2 ? failed_tests + 1 : failed_tests; + ret |= ret2; + executed_tests++; + +#ifdef ENABLE_DEBUG_MSG + printf(" [%d] random test: input shape (%d, %d, %d, %d), ret %d\n", + executed_tests, current_testinput_n, input_c, input_h, input_w, ret2); +#endif + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + + if (executed_tests >= MIN_EXEC_TESTS) { + break; + } + } + + printf("<= 1822-mul-qm: random test, total %d, failed %d, ret %d\n", + executed_tests, failed_tests, ret); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret = simple_test(rt_handle, cvk_ctx); + ret |= random_test(rt_handle, cvk_ctx); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_tensor_or.c b/cviruntime/test/180x/test_180x_tensor_or.c new file mode 100644 index 000000000..802e35bfa --- /dev/null +++ b/cviruntime/test/180x/test_180x_tensor_or.c @@ -0,0 +1,244 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_or_int8_ref(int8_t *a, int8_t *b, int8_t *res, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) + res[i] = a[i] | b[i]; +} + +static void tl_or_int16_ref( + uint8_t *ref_high, uint8_t *ref_low, + uint8_t *a_high, uint8_t *a_low, + uint8_t *b_high, uint8_t *b_low, + uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) { + int32_t ta = ((int8_t)a_high[i] << 8) + a_low[i]; + int32_t tb = ((int8_t)b_high[i] << 8) + b_low[i]; + int32_t res = ta | tb; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } +} + +static int test_tl_or_int8(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int n = 3; + int c = 9; // 39 -> 9 for 180x + 
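+ /* As in the other 180x ports, the channel count is cut down from
+  * the original 39 (see the note above), presumably to fit the
+  * smaller per-lane local memory of the 180x parts. */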
int h = 7; + int w = 37; + int ret = 0; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + int8_t *a_data = (int8_t *)malloc(size); + int8_t *b_data = (int8_t *)malloc(size); + int8_t *ref_data = (int8_t *)malloc(size); + if (!a_data || !b_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) + a_data[i] = (int8_t)(i % 256); + + for (uint32_t i = 0; i < size; i++) + b_data[i] = (int8_t)(100 - i % 256); + + tl_or_int8_ref(a_data, b_data, ref_data, size); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + uint8_t *res_data = NULL; + if (!tl_a || !tl_b || !tl_res) { + printf(" %s: fail to alloc tl\n", __FUNCTION__); + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b, (uint8_t *)b_data); + + cvk_tiu_or_int8_param_t p9; + p9.res = tl_res; + p9.a = tl_a; + p9.b = tl_b; + cvk_ctx->ops->tiu_or_int8(cvk_ctx, &p9); + res_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res); + + for (uint32_t i = 0; i < size; i++) { + if ((int8_t)res_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%u], got %x, exp %x\n", + i, res_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + free(res_data); + +fail_exit: + free(a_data); + free(b_data); + free(ref_data); + + printf(" %s(eu_align=%d) %s\n", __FUNCTION__, eu_align, ret ? 
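+ /* eu_align toggles between execution-unit-aligned and compact
+  * local-memory layouts; main() runs every test body once with each
+  * setting. */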
"fail" : "pass"); + + return ret; +} + +static int test_tl_or_int16(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int n = 3; + int c = 9; // 39 -> 9 for 180x + int h = 7; + int w = 37; + int ret = 0; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + uint8_t *a_high_data = (uint8_t *)malloc(size); + uint8_t *a_low_data = (uint8_t *)malloc(size); + uint8_t *b_high_data = (uint8_t *)malloc(size); + uint8_t *b_low_data = (uint8_t *)malloc(size); + uint8_t *ref_high_data = (uint8_t *)malloc(size); + uint8_t *ref_low_data = (uint8_t *)malloc(size); + if (!a_high_data || !a_low_data || !b_high_data || + !b_low_data || !ref_high_data || !ref_low_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) { + a_high_data[i] = i / 10; + a_low_data[i] = i; + b_high_data[i] = (i + 250) / 20; + b_low_data[i] = 100 - i; + } + + + tl_or_int16_ref( + ref_high_data, ref_low_data, + a_high_data, a_low_data, + b_high_data, b_low_data, + size); + + cvk_tl_t *tl_a_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_a_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + uint8_t *res_high_data = NULL, *res_low_data = NULL; + if (!tl_a_low || !tl_a_high || !tl_b_low || !tl_b_high || !tl_res_low || !tl_res_high) { + printf(" %s: fail to alloc tl\n", __FUNCTION__); + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_low, a_low_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_high, a_high_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b_low, b_low_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b_high, b_high_data); + cvk_tiu_or_int16_param_t p9; + p9.res_high = tl_res_high; + p9.res_low = tl_res_low; + p9.a_high = tl_a_high; + p9.a_low = tl_a_low; + p9.b_high = tl_b_high; + p9.b_low = tl_b_low; + cvk_ctx->ops->tiu_or_int16(cvk_ctx, &p9); + res_high_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_high); + res_low_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_low); + + for (uint32_t i = 0; i < size; i++) { + if (res_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at res_high_data[%u], got %d, exp %d\n", + i, res_high_data[i], ref_high_data[i]); + ret = -1; + break; + } + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%u], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + ret = -1; + break; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_low); + + free(res_high_data); + free(res_low_data); + +fail_exit: + free(a_high_data); + free(a_low_data); + free(b_high_data); + free(b_low_data); + free(ref_high_data); + free(ref_low_data); + + printf(" %s(eu_align=%d) %s\n", __FUNCTION__, eu_align, ret ? 
"fail" : "pass"); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_or_int8(rt_handle, cvk_ctx, 0); + ret |= test_tl_or_int8(rt_handle, cvk_ctx, 1); + ret |= test_tl_or_int16(rt_handle, cvk_ctx, 0); + ret |= test_tl_or_int16(rt_handle, cvk_ctx, 1); + + printf("tensor or test %s\n", ret ? "fail" : "pass"); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_tensor_sub.c b/cviruntime/test/180x/test_180x_tensor_sub.c new file mode 100644 index 000000000..dd0957ed3 --- /dev/null +++ b/cviruntime/test/180x/test_180x_tensor_sub.c @@ -0,0 +1,158 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_sub_ref( + uint8_t *ref_high, uint8_t *ref_low, + uint8_t *a_high, uint8_t *a_low, + uint8_t *b_high, uint8_t *b_low, + uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) { + int32_t ta = ((int8_t)a_high[i] << 8) + a_low[i]; + int32_t tb = ((int8_t)b_high[i] << 8) + b_low[i]; + int32_t res = ta - tb; + if (res > 32767) + res = 32767; + else if (res < -32768) + res = -32768; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } +} + +static int test_tl_sub(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 2; // 3 -> 2 for 1810 + int c = 9; // 39 -> 9 for 180x + int h = 7; + int w = 37; + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + uint8_t *a_high_data = (uint8_t *)malloc(size); + uint8_t *a_low_data = (uint8_t *)malloc(size); + uint8_t *b_high_data = (uint8_t *)malloc(size); + uint8_t *b_low_data = (uint8_t *)malloc(size); + if (!a_high_data || !a_low_data || !b_high_data || !b_low_data) + return -1; + + for (uint32_t i = 0; i < size; i++) { + a_high_data[i] = i / 10; + a_low_data[i] = i; + b_high_data[i] = (i + 250) / 20; + b_low_data[i] = 100 - i; + } + + uint8_t *ref_high_data = (uint8_t *)malloc(size); + uint8_t *ref_low_data = (uint8_t *)malloc(size); + if (!ref_high_data || !ref_low_data) { + ret = -1; + goto fail_exit; + } + + tl_sub_ref(ref_high_data, ref_low_data, + a_high_data, a_low_data, + b_high_data, b_low_data, + size); + + cvk_tl_t *tl_a_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_a_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + if (!tl_a_low || !tl_a_high || !tl_b_low || !tl_b_high || !tl_res_low || !tl_res_high) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_low, a_low_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_high, a_high_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b_low, b_low_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b_high, 
b_high_data); + cvk_tiu_sub_param_t p5; + p5.res_high = tl_res_high; + p5.res_low = tl_res_low; + p5.a_high = tl_a_high; + p5.a_low = tl_a_low; + p5.b_high = tl_b_high; + p5.b_low = tl_b_low; + p5.rshift_bits = 0; + cvk_ctx->ops->tiu_sub(cvk_ctx, &p5); + uint8_t *res_high_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_high); + uint8_t *res_low_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_low); + + for (uint32_t i = 0; i < size; i++) { + if (res_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at res_high_data[%u], got %d, exp %d\n", + i, res_high_data[i], ref_high_data[i]); + ret = -1; + break;; + } + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%u], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + ret = -1; + break; + } + } + + free(res_high_data); + free(res_low_data); + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_low); + +fail_exit: + free(a_high_data); + free(a_low_data); + free(b_high_data); + free(b_low_data); + free(ref_high_data); + free(ref_low_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_sub(rt_handle, cvk_ctx, 0); + ret |= test_tl_sub(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_tensor_transfer.c b/cviruntime/test/180x/test_180x_tensor_transfer.c new file mode 100644 index 000000000..cb913b192 --- /dev/null +++ b/cviruntime/test/180x/test_180x_tensor_transfer.c @@ -0,0 +1,119 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" +#include "test_native_ref.h" + +static int test_put_and_get_tensor_l2g( + CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int n = 2; + int c = 66; + int h = 3; + int w = 15; + int size = n * c * h * w; + uint8_t *data_x = (uint8_t *)malloc(size); + uint8_t *data_y = (uint8_t *)malloc(size); + if (!data_x || !data_y) + return -1; + + for (int i = 0; i < size; i++) + data_x[i] = i - 100; + + for (int i = 0; i < size; i++) + data_y[i] = -i; + + /* + * Interleave two tensors in case the same devmem is reused between + * tensor_copy_s2d_g2l() and tensor_copy_l2g_d2s(), in which case the content of + * devmem is already what is expected before tdma_store(cvk_ctx, ). 
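+ * Put differently: if one scratch buffer were stored and then
+ * immediately loaded back, a broken no-op copy could still "pass".
+ * Writing X and Y first and only then reading both back catches
+ * that failure mode.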
+ */ + + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + cvk_tl_t *tl_x = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, 1); + cvk_tl_t *tl_y = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, 1); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_x, data_x); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_y, data_y); + + uint8_t *result_x = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_x); + uint8_t *result_y = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_y); + + for (int i = 0; i < size; i++) { + if (result_x[i] != data_x[i]) { + printf("compare failed at result_x[%d]\n", i); + return -1; + } + if (result_y[i] != data_y[i]) { + printf("compare failed at result_y[%d]\n", i); + return -1; + } + } + free(result_x); + free(result_y); + + /* + * Get result_y before result_x. + */ + + + result_y = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_y); + result_x = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_x); + for (int i = 0; i < size; i++) { + if (result_x[i] != data_x[i]) { + printf("compare failed at result_x[%d]\n", i); + return -1; + } + if (result_y[i] != data_y[i]) { + printf("compare failed at result_y[%d]\n", i); + return -1; + } + } + free(result_x); + free(result_y); + + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_y); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_x); + free(data_x); + free(data_y); + + return 0; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_put_and_get_tensor_l2g(rt_handle, cvk_ctx); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/180x/test_180x_tensor_xor.c b/cviruntime/test/180x/test_180x_tensor_xor.c new file mode 100644 index 000000000..f398449d4 --- /dev/null +++ b/cviruntime/test/180x/test_180x_tensor_xor.c @@ -0,0 +1,246 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_xor_int8_ref(int8_t *a, int8_t *b, int8_t *res, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) + res[i] = a[i] ^ b[i]; +} + +static void tl_xor_int16_ref( + uint8_t *ref_high, uint8_t *ref_low, + uint8_t *a_high, uint8_t *a_low, + uint8_t *b_high, uint8_t *b_low, + uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) { + int32_t ta = ((int8_t)a_high[i] << 8) + a_low[i]; + int32_t tb = ((int8_t)b_high[i] << 8) + b_low[i]; + int32_t res = ta ^ tb; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } +} + +static int test_tl_xor_int8(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 3; + int c = 9; // 39 -> 9 for 180x + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + int8_t *a_data = (int8_t *)malloc(size); + int8_t *b_data = (int8_t *)malloc(size); + int8_t *ref_data = (int8_t *)malloc(size); + if (!a_data || !b_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) + a_data[i] = (int8_t)(i % 256); + + for (uint32_t i = 0; i < size; i++) + b_data[i] = (int8_t)(100 - i % 256); + + tl_xor_int8_ref(a_data, b_data, ref_data, size); + + cvk_tl_t *tl_a = 
cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + uint8_t *res_data = NULL; + if (!tl_a || !tl_b || !tl_res) { + printf(" %s: fail to alloc tl\n", __FUNCTION__); + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b, (uint8_t *)b_data); + + cvk_tiu_xor_int8_param_t p; + memset(&p, 0, sizeof(p)); + p.res = tl_res; + p.a = tl_a; + p.b = tl_b; + cvk_ctx->ops->tiu_xor_int8(cvk_ctx, &p); + res_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res); + + for (uint32_t i = 0; i < size; i++) { + if ((int8_t)res_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%u], got %x, exp %x\n", + i, res_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + + free(res_data); + +fail_exit: + free(a_data); + free(b_data); + free(ref_data); + + printf(" %s(eu_align=%d) %s\n", __FUNCTION__, eu_align, ret ? "fail" : "pass"); + + return ret; +} + +static int test_tl_xor_int16(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int n = 3; + int c = 9; // 39 -> 9 for 180x + int h = 7; + int w = 37; + int ret = 0; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + uint8_t *a_high_data = (uint8_t *)malloc(size); + uint8_t *a_low_data = (uint8_t *)malloc(size); + uint8_t *b_high_data = (uint8_t *)malloc(size); + uint8_t *b_low_data = (uint8_t *)malloc(size); + uint8_t *ref_high_data = (uint8_t *)malloc(size); + uint8_t *ref_low_data = (uint8_t *)malloc(size); + uint8_t *res_high_data = NULL; + uint8_t *res_low_data = NULL; + if (!a_high_data || !a_low_data || !b_high_data || + !b_low_data || !ref_high_data || !ref_low_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) { + a_high_data[i] = i / 10; + a_low_data[i] = i; + b_high_data[i] = (i + 250) / 20; + b_low_data[i] = 100 - i; + } + + + tl_xor_int16_ref( + ref_high_data, ref_low_data, + a_high_data, a_low_data, + b_high_data, b_low_data, + size); + + cvk_tl_t *tl_a_low = cvk_ctx->ops->lmem_alloc_tensor( cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_a_high = cvk_ctx->ops->lmem_alloc_tensor( cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b_low = cvk_ctx->ops->lmem_alloc_tensor( cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b_high = cvk_ctx->ops->lmem_alloc_tensor( cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_low = cvk_ctx->ops->lmem_alloc_tensor( cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_high = cvk_ctx->ops->lmem_alloc_tensor( cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + if (!tl_a_low || !tl_a_high || !tl_b_low || !tl_b_high || !tl_res_low || !tl_res_high){ + printf(" %s: fail to alloc tl\n", __FUNCTION__); + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_low, a_low_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_high, a_high_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b_low, b_low_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b_high, b_high_data); + + cvk_tiu_xor_int16_param_t p; + memset(&p, 0, sizeof(p)); + p.res_high = 
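+ /* Unlike some of the older tests in this suite, the xor tests
+  * zero the parameter struct with memset before filling it, so any
+  * field later added to cvk_tiu_xor_int16_param_t defaults to 0. */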
tl_res_high; + p.res_low = tl_res_low; + p.a_high = tl_a_high; + p.a_low = tl_a_low; + p.b_high = tl_b_high; + p.b_low = tl_b_low; + cvk_ctx->ops->tiu_xor_int16(cvk_ctx, &p); + res_high_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_high); + res_low_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_low); + + for (uint64_t i = 0; i < size; i++) { + if (res_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at res_high_data[%" PRIu64 "], got %d, exp %d\n", + i, res_high_data[i], ref_high_data[i]); + return -1; + } + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + return -1; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_low); + +fail_exit: + free(a_high_data); + free(a_low_data); + free(b_high_data); + free(b_low_data); + free(ref_high_data); + free(ref_low_data); + free(res_high_data); + free(res_low_data); + + printf(" %s(eu_align=%d) %s\n", __FUNCTION__, eu_align, ret ? "fail" : "pass"); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_xor_int8(rt_handle, cvk_ctx, 0); + ret |= test_tl_xor_int8(rt_handle, cvk_ctx, 1); + ret |= test_tl_xor_int16(rt_handle, cvk_ctx, 0); + ret |= test_tl_xor_int16(rt_handle, cvk_ctx, 1); + + printf("tensor xor test %s\n", ret ? 
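/* ret accumulates all four subtests via |=, so one failing variant is enough to flip the summary */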
"fail" : "pass"); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_avg_pooling.c b/cviruntime/test/181x/test_181x_avg_pooling.c new file mode 100644 index 000000000..78817d577 --- /dev/null +++ b/cviruntime/test/181x/test_181x_avg_pooling.c @@ -0,0 +1,272 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" +#include "test_native_ref.h" + +#define INVALIDE_STRIDE (-1) + +static void print_pooling_param(const cvk_tiu_average_pooling_param_t *p) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + + printf(" Pooling parameters:\n"); + printf(" ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw); + printf(" opd0_sign = %d\n", p->ifmap->fmt == CVK_FMT_I8); + printf(" weight = (%d, %d)\n", p->kh, p->kw); + printf(" padding = (%d, %d, %d, %d)\n", + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right); + printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w); + printf(" ins0 = (%d, %d, %d, %d)\n", + p->ins_h, p->ins_last_h, p->ins_w, p->ins_last_w); + printf(" avg_pooling_const = %d\n", p->avg_pooling_const); + printf(" rshift_bits = %d\n", p->rshift_bits); +} + +static int8_t *alloc_input(cvk_tiu_average_pooling_param_t *p) +{ + uint64_t size = tl_shape_size(&p->ifmap->shape, p->ifmap->fmt); + int8_t *data = (int8_t *)malloc(size); + if (!data) + return NULL; + + for (uint64_t i = 0; i < size; i++) + data[i] = rand() % 256 - 128; + return data; +} + +static int8_t *alloc_output(cvk_tiu_average_pooling_param_t *p) +{ + uint64_t size = tl_shape_size(&p->ofmap->shape, p->ofmap->fmt); + return (int8_t *)malloc(size); +} + +static int pooling_ih_ext(cvk_tiu_average_pooling_param_t *p, int ih) +{ + int ins = p->ins_h; + int ins_last = p->ins_last_h; + int pad = p->pad_top + p->pad_bottom; + return (ih - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_iw_ext(cvk_tiu_average_pooling_param_t *p, int iw) +{ + int ins = p->ins_w; + int ins_last = p->ins_last_w; + int pad = p->pad_left + p->pad_right; + return (iw - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_oh(cvk_tiu_average_pooling_param_t *p, int ih) +{ + int ih_ext = pooling_ih_ext(p, ih); + return (ih_ext - p->kh) / p->stride_h + 1; +} + +static int pooling_ow(cvk_tiu_average_pooling_param_t *p, int iw) +{ + int iw_ext = pooling_iw_ext(p, iw); + return (iw_ext - p->kw) / p->stride_w + 1; +} + +static void free_pooling_param( + cvk_context_t *cvk_ctx, + cvk_tiu_average_pooling_param_t *p) +{ + if (p->ifmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->ifmap); + if (p->ofmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->ofmap); +} + +static cvk_tiu_average_pooling_param_t random_pooling_param(cvk_context_t *cvk_ctx, int stride_w, int stride_h) +{ + int retry_cnt = 100; + srand(clock()); + cvk_tiu_average_pooling_param_t p; + + for (int i = 0; i < retry_cnt; i++) { + int in = rand() % 5 + 1; + int ic = rand() % (3 * cvk_ctx->info.npu_num) + 1; + int ih = rand() % 30 + 3 + (INVALIDE_STRIDE == stride_h ? 0 : stride_h); + int iw = rand() % 30 + 6 + (INVALIDE_STRIDE == stride_w ? 0 : stride_w); + int opd0_sign = rand() % 2; + + memset(&p, 0, sizeof(p)); + p.kh = rand() % 7 + 1; + p.kw = rand() % 7 + 1; + p.stride_h = INVALIDE_STRIDE == stride_h ? rand() % (p.kh) + 1 : stride_h; + p.stride_w = INVALIDE_STRIDE == stride_w ? 
rand() % (p.kh) + 1 : stride_w; + p.ins_h = rand() % p.kh; + p.ins_w = rand() % p.kw; + p.ins_last_h = rand() % p.kh; + p.ins_last_w = rand() % p.kw; + p.pad_top = rand() % p.kh; + p.pad_bottom = rand() % p.kh; + p.pad_left = rand() % p.kw; + p.pad_right= rand() % p.kw; + p.avg_pooling_const = rand() % 256; + p.rshift_bits = rand() % 32; + + cvk_tl_shape_t ifmap_shape; + ifmap_shape.n = in; + ifmap_shape.c = ic; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + + int on = in; + int oc = ic; + int oh = pooling_oh(&p, ih); + int ow = pooling_ow(&p, iw); + cvk_tl_shape_t ofmap_shape; + ofmap_shape.n = on; + ofmap_shape.c = oc; + ofmap_shape.h = oh; + ofmap_shape.w = ow; + + cvk_fmt_t fmt = opd0_sign? CVK_FMT_I8: CVK_FMT_U8; + p.ofmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ofmap_shape, CVK_FMT_I8, 1); + p.ifmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ifmap_shape, fmt, 1); + + if ((p.kh > pooling_ih_ext(&p, ih)) + || (p.kw > pooling_iw_ext(&p, iw)) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || !p.ofmap + || !p.ifmap) { + printf("retry init_pooling_param\n"); + free_pooling_param(cvk_ctx, &p); + } else + break; + } + + return p; +} + +static int compare_results( + cvk_tiu_average_pooling_param_t *p, + int8_t input[], + int8_t output[]) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int opd0_sign = (p->ifmap->fmt == CVK_FMT_I8); + + int8_t *output_ref = alloc_output(p); + int ret = native_pooling_ave_int8( + input, &p->avg_pooling_const, NULL, output_ref, + in, ic, ih, iw, p->kh, p->kw, + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right, + p->stride_h, p->stride_w, + p->ins_h, p->ins_w, p->ins_last_h, p->ins_last_w, + opd0_sign, opd0_sign, p->rshift_bits, 1); + if (ret) + return ret; + + ret = array_cmp_int8( + "Comparing results ...\n", output_ref, output, + tl_shape_size(&p->ofmap->shape, p->ofmap->fmt)); + + if (ret != 0) { + printf("Comparison FAILED!!!\n"); + print_pooling_param(p); + } + + free(output_ref); + + return ret; +} + +static int _test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int stride_w, int stride_h) +{ + int ret; + cvk_tiu_average_pooling_param_t p = random_pooling_param(cvk_ctx, stride_w, stride_h); + int8_t *input = alloc_input(&p); + if (!input) + return -1; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, p.ifmap, (uint8_t *)input); + + cvk_ctx->ops->tiu_average_pooling(cvk_ctx, &p); + CVI_RT_Submit(cvk_ctx); + + int8_t *output = (int8_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, p.ofmap); + if (!output) + return -1; + + ret = compare_results(&p, input, output); + + free_pooling_param(cvk_ctx, &p); + free(output); + free(input); + + return ret; +} + +static int test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) { + return _test_pooling(rt_handle, cvk_ctx, INVALIDE_STRIDE, INVALIDE_STRIDE); +} + +static int test_avg_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + for (uint64_t i = 0; i < 16; i++) + ret |= test_pooling(rt_handle, cvk_ctx); + + // test stride extend (0, 31] + int stride_list[] = {15, 16, 31}; + int stride_list_len = sizeof(stride_list) / sizeof(stride_list[0]); + + for (int stride_w_idx = 0; stride_w_idx < stride_list_len; stride_w_idx++) { + for (int stride_h_idx = 0; stride_h_idx < stride_list_len; stride_h_idx++) { + int stride_w = stride_list[stride_w_idx]; + int stride_h = stride_list[stride_h_idx]; + + ret |= _test_pooling(rt_handle, 
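/* strides 15, 16 and 31 presumably straddle the boundary between the basic and extended stride encodings */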
cvk_ctx, stride_w, stride_h); + } + } + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret = test_avg_pooling(rt_handle, cvk_ctx); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + if (!ret) + printf("%s pass\n", __FILENAME__); + else + printf("%s fail\n", __FILENAME__); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_bf16_avg_pooling.c b/cviruntime/test/181x/test_181x_bf16_avg_pooling.c new file mode 100644 index 000000000..7f3d29c6d --- /dev/null +++ b/cviruntime/test/181x/test_181x_bf16_avg_pooling.c @@ -0,0 +1,353 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" +#include "test_native_ref.h" + +#define INVALIDE_STRIDE (-1) +typedef cvk_tiu_average_pooling_param_t param_t; +int random_seed; +static void print_pooling_param(const param_t *p) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + + printf(" Pooling parameters:\n"); + printf(" random_seed : %d \n", random_seed); + printf(" ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw); + printf(" opd0_sign = %d\n", p->ifmap->fmt == CVK_FMT_I8); + printf(" weight = (%d, %d)\n", p->kh, p->kw); + printf(" padding = (%d, %d, %d, %d)\n", + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right); + printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w); + printf(" ins0 = (%d, %d, %d, %d)\n", + p->ins_h, p->ins_last_h, p->ins_w, p->ins_last_w); + printf(" avg_pooling_const = %d\n", p->avg_pooling_const); + printf(" rshift_bits = %d\n", p->rshift_bits); +} +static int index_get(int h, int w1, int w2) +{ + return h * w1 + w2; +} + +int native_pooling_avg_bf16( + const uint16_t* i_fmap, + const void* weight, + const uint32_t *bias, + uint16_t * o_fmap, + int input_n, int input_c, int input_h, int input_w, + int kh, int kw, + int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, + int stride_h, int stride_w, + int ins_h, int ins_w, + int ins_h_last, int ins_w_last, + int const_weight) +{ + if (kh * kw <= 0) + return -1; + + float *avg_pooling_mac_a = (float *)malloc(kh * kw * sizeof(float)); + float *avg_pooling_mac_b = (float *)malloc(kh * kw * sizeof(float)); + + uint16_t avg_const_weight = *(uint16_t *)weight; + const uint16_t *weight_arr = (uint16_t*)weight; + + int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); + int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); + + int output_h = calc_output_hw(h_after, kh, stride_h); + int output_w = calc_output_hw(w_after, kw, stride_w); + uint16_t *i_fmap_pad = NULL; + for (int n = 0; n < input_n; n++) { + if (const_weight == 0) + weight_arr = (uint16_t*)weight; + + for (int c = 0; c < input_c; ++c) { + fill_pad_fmap_bf16(i_fmap, &i_fmap_pad, cvk_convert_fp32_bf16(0), + pad_w_l, pad_w_r, pad_h_t, pad_h_b, + ins_h, ins_w, ins_h_last, ins_w_last, + input_h, input_w); + + for (int ph = 0; ph < output_h; ++ph) { + for (int pw = 0; pw < output_w; ++pw) { + int hstart = ph * stride_h; + int wstart = pw * stride_w; + int pool_index = index_get(ph, output_w, pw); + int mac_index = 0; + float avg_pool_result=0; + for (int h = 0; h < kh; h++) { + for (int w = 0; w < kw; w++) { + int index = 
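/* index into the padded/diluted input map, whose row pitch is w_after rather than input_w */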
index_get((hstart+h), w_after, (w+wstart)); + mac_index = index_get(h, kw, w); + float a = cvk_convert_bf16_fp32(i_fmap_pad[index]); + float b = const_weight ? + cvk_convert_bf16_fp32(avg_const_weight) : cvk_convert_bf16_fp32(weight_arr[mac_index]); + + avg_pool_result += a*b; + } + } + + if(bias) { + avg_pool_result += cvk_convert_hex_fp32(bias[c]); + } + *(o_fmap+pool_index) = cvk_convert_fp32_bf16(avg_pool_result); + } + } + i_fmap += input_w * input_h; + if (const_weight == 0) + weight_arr += kh * kw; + + o_fmap += output_w * output_h; + } + } + free(i_fmap_pad); + + free(avg_pooling_mac_a); + free(avg_pooling_mac_b); + + return 0; +} + +static uint16_t *alloc_input(param_t *p) +{ + uint64_t size = tl_shape_size(&p->ifmap->shape, p->ifmap->fmt); + uint16_t *data = (uint16_t *)malloc(size); + if (!data) + return NULL; + + for (uint64_t i = 0; i < size / sizeof(uint16_t); i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX/2; //100 ~ -100 + val = (float)(rand()-RAND_MAX2)*1000 / (float)RAND_MAX; + data[i] = cvk_convert_fp32_bf16(val);//rand() % 256 - 128; + } + return data; +} + +static uint16_t *alloc_output(param_t *p) +{ + uint64_t size = tl_shape_size(&p->ofmap->shape, p->ofmap->fmt); + return (uint16_t *)malloc(size); +} + +static int pooling_ih_ext(param_t *p, int ih) +{ + int ins = p->ins_h; + int ins_last = p->ins_last_h; + int pad = p->pad_top + p->pad_bottom; + return (ih - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_iw_ext(param_t *p, int iw) +{ + int ins = p->ins_w; + int ins_last = p->ins_last_w; + int pad = p->pad_left + p->pad_right; + return (iw - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_oh(param_t *p, int ih) +{ + int ih_ext = pooling_ih_ext(p, ih); + return (ih_ext - p->kh) / p->stride_h + 1; +} + +static int pooling_ow(param_t *p, int iw) +{ + int iw_ext = pooling_iw_ext(p, iw); + return (iw_ext - p->kw) / p->stride_w + 1; +} + +static void free_pooling_param( + cvk_context_t *cvk_ctx, + param_t *p) +{ + if (p->ifmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->ifmap); + if (p->ofmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->ofmap); +} + +static param_t random_pooling_param(cvk_context_t *cvk_ctx, int stride_w, int stride_h) +{ + param_t p; + +retry: + random_seed = clock(); + srand(random_seed); + + int in = rand() % 5 + 1; + int ic = rand() % (3 * cvk_ctx->info.npu_num) + 1; + int ih = rand() % 30 + 3 + (INVALIDE_STRIDE == stride_h ? 0 : stride_h); + int iw = rand() % 30 + 6 + (INVALIDE_STRIDE == stride_w ? 0 : stride_w); + + memset(&p, 0, sizeof(p)); + p.kh = rand() % 7 + 1; + p.kw = rand() % 7 + 1; + p.stride_h = INVALIDE_STRIDE == stride_h ? rand() % (p.kh) + 1 : stride_h; + p.stride_w = INVALIDE_STRIDE == stride_w ? 
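/* same p.kh-for-p.kw slip as in the int8 avg-pooling test; harmless for the same reason */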
rand() % (p.kh) + 1 : stride_w; + p.ins_h = rand() % p.kh; + p.ins_w = rand() % p.kw; + p.ins_last_h = rand() % p.kh; + p.ins_last_w = rand() % p.kw; + p.pad_top = rand() % p.kh; + p.pad_bottom = rand() % p.kh; + p.pad_left = rand() % p.kw; + p.pad_right= rand() % p.kw; + p.rshift_bits = rand() % 32; + p.avg_pooling_const = cvk_convert_fp32_bf16(rand()%0x1000);//rand() % 256; + cvk_tl_shape_t ifmap_shape; + ifmap_shape.n = in; + ifmap_shape.c = ic; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + + int on = in; + int oc = ic; + int oh = pooling_oh(&p, ih); + int ow = pooling_ow(&p, iw); + cvk_tl_shape_t ofmap_shape; + ofmap_shape.n = on; + ofmap_shape.c = oc; + ofmap_shape.h = oh; + ofmap_shape.w = ow; + + p.ofmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ofmap_shape, CVK_FMT_BF16, 1); + p.ifmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ifmap_shape, CVK_FMT_BF16, 1); + + if ((p.kh > pooling_ih_ext(&p, ih)) + || (p.kw > pooling_iw_ext(&p, iw)) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || !p.ofmap + || !p.ifmap) { + printf("retry init_pooling_param\n"); + free_pooling_param(cvk_ctx, &p); + goto retry; + } + + return p; +} + +static int compare_results( + param_t *p, + uint16_t input[], + uint16_t output[]) +{ + int ret = 0; + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + + uint16_t *output_ref = alloc_output(p); + p->avg_pooling_const = cvk_convert_fp32_bf16(cvk_convert_bf16_fp32(p->avg_pooling_const)/(p->kh * p->kw)); + ret = native_pooling_avg_bf16( + input, &p->avg_pooling_const, NULL, output_ref, + in, ic, ih, iw, p->kh, p->kw, + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right, + p->stride_h, p->stride_w, + p->ins_h, p->ins_w, p->ins_last_h, p->ins_last_w,1 + ); + if (ret) { + free(output_ref); + return ret; + } + + ret = array_cmp_int8( + "Comparing results ...\n", (int8_t*)output_ref, (int8_t*) output, + tl_shape_size(&p->ofmap->shape, p->ofmap->fmt)); + + if (ret) { + printf("Comparison FAILED!!!\n"); + print_pooling_param(p); + ret = -1; + } + + free(output_ref); + + return ret; +} + +static int _test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int stride_w, int stride_h) +{ + param_t p = random_pooling_param(cvk_ctx, stride_w, stride_h); +// print_pooling_param(&p); + + uint16_t *input = alloc_input(&p); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, p.ifmap, (uint8_t *)input); + cvk_ctx->ops->tiu_average_pooling(cvk_ctx, &p); + uint16_t *output = (uint16_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, p.ofmap); + + int ret = compare_results(&p, input, output); + + free_pooling_param(cvk_ctx, &p); + free(output); + free(input); + + return ret; +} + + +static int test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) { + return _test_pooling(rt_handle, cvk_ctx, INVALIDE_STRIDE, INVALIDE_STRIDE); +} + +static int test_avg_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + for (uint64_t i = 0; i < 20; i++) + ret |= test_pooling(rt_handle, cvk_ctx); + + // test stride extend (0, 31] + int stride_list[] = {15, 16, 31}; + int stride_list_len = sizeof(stride_list) / sizeof(stride_list[0]); + + for (int stride_w_idx = 0; stride_w_idx < stride_list_len; stride_w_idx++) { + for (int stride_h_idx = 0; stride_h_idx < stride_list_len; stride_h_idx++) { + int stride_w = stride_list[stride_w_idx]; + int stride_h = stride_list[stride_h_idx]; + + ret |= _test_pooling(rt_handle, cvk_ctx, stride_w, 
stride_h); + } + } + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + int round_mode; + round_mode = cvk_set_store_feround(); + ret = test_avg_pooling(rt_handle, cvk_ctx); + cvk_restore_feround(round_mode); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_bf16_conv.c b/cviruntime/test/181x/test_181x_bf16_conv.c new file mode 100644 index 000000000..acefb4130 --- /dev/null +++ b/cviruntime/test/181x/test_181x_bf16_conv.c @@ -0,0 +1,780 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" +#include "test_native_ref.h" + +#define INVALIDE_STRIDE (-1) +typedef struct { + int random_seed; + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int stride_w; + int output_c; + int using_bias; + int bReLU_EN; + int r_shift_m; + int opd0_sign; + int opd1_sign; + int opd2_sign; + int bf16_enable; +} conv_param_t; + +static void print_conv_param(const conv_param_t *p); + +static inline void bf16_relu(float *buf, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) + if (buf[i] < 0) + buf[i] = 0; +} + +static int conv_ref( + const conv_param_t *p_param, + const uint16_t *ifmap, + const uint16_t *weight, + const uint32_t *bias, + uint16_t *ofmap) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int do_relu = p_param->bReLU_EN; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + float *result = (float *)malloc(sizeof(float) * in * oc * oh * ow); + if (!result) + return -1; + + memset(result, 0, sizeof(float) * in * oc * oh * ow); + int ret = 0; + + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + uint16_t *i_fmap_pad[ic]; + uint16_t *kernel_pad[ic]; + + for (int iic = 0; iic < ic; ++iic) { + i_fmap_pad[iic] = NULL; + kernel_pad[iic] = NULL; + fill_pad_fmap_bf16( + (ifmap + n*ic*ih*iw + iic*ih*iw), &i_fmap_pad[iic], cvk_convert_fp32_bf16(0), + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + //kernel_dilation( + fill_pad_fmap_bf16( + (weight + c*ic*kh*kw + iic*kh*kw), &kernel_pad[iic], cvk_convert_fp32_bf16(0), + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + } + 
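/* + * Accumulate in fp32 over the dilated/padded maps built above + * (iw_ext is the padded row pitch, kh_ext/kw_ext the dilated kernel); + * only the final store into ofmap rounds back to bf16. + */ +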
for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + float result_val = result[n*oc*oh*ow + c*oh*ow + ph*ow + pw]; + for (int idxh = 0; idxh < kh_ext; ++idxh) { + for (int idxw = 0; idxw < kw_ext; ++idxw) { + for (int iic = 0; iic < ic; ++iic){ + float ifv = cvk_convert_bf16_fp32(i_fmap_pad[iic][(idxh+ph*stride_h) * iw_ext + idxw + pw*stride_w]); + float ikv = cvk_convert_bf16_fp32(kernel_pad[iic][idxh* kw_ext + idxw]); + result_val += ifv*ikv; + } + } + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] = result_val; + } + } + + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += cvk_convert_hex_fp32(bias[c]); //bias+c ; + } + } + } + + if (do_relu) + bf16_relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + for (int i = 0; i < ic; i++) { + if (i_fmap_pad[i]) { + free(i_fmap_pad[i]); + i_fmap_pad[i] = NULL; + } + if (kernel_pad[i]) { + free(kernel_pad[i]); + kernel_pad[i] = NULL; + } + } + } + } + + for (int i = 0; i < in * oc * oh * ow; i++) { + ofmap[i] = cvk_convert_fp32_bf16(result[i]); + } + free(result); + + return ret; +} + +static uint16_t * transform_weight(const cvk_tl_shape_t *s, uint16_t before[]) +{ + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint32_t size = ic * oc * kh * kw; + uint16_t *after = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!after) + return NULL; + + /* + * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) + */ + for (uint32_t oci = 0; oci < oc; oci++) { + for (uint32_t ici = 0; ici < ic; ici++) { + for (uint32_t khi = 0; khi < kh; khi++) { + for (uint32_t kwi = 0; kwi < kw; kwi++) { + uint32_t src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + uint32_t dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + uint16_t *data) +{ +#if 0 + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw*2); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + uint16_t *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. bm_memcpy_s2d regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place.
+ */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //bmmem_device_t ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = bm_memcpy_s2d(*ctx, ab_dev_mem, (u8*)transformed_data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (u8*)transformed_data); + + assert(ret == BM_SUCCESS); + cvk_tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_BF16; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1822_tensor_tgmem_default_stride(tdma_tg.shape, FMT_BF16); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1822_tensor_lmem_default_stride(bk_ctx, tdma_shape, FMT_BF16, 0); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1822_tdma_g2l_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + bmmem_device_free(*ctx, dev_mem); + //delete[] transformed_data; + return transformed_data; +#endif + + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint16_t *transformed_data = transform_weight(s, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(1, oc, kh * kw, ic); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, (uint8_t *)transformed_data); + + free(transformed_data); +} + + +static uint16_t * transform_bias(int oc, uint32_t before[]) +{ + uint16_t *after = (uint16_t *)malloc(2 * sizeof(uint16_t) * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = (before[i] >> 16) & 0xffff; + after[i + oc] = before[i] & 0xffff; + } + return after; +} + +static void put_conv_bias( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + uint32_t *data) +{ +#if 0 + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2 * sizeof(short), oc, 1, 1); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + uint16_t *transformed_data = transform_bias(oc, data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (u8 *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_BF16; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1822_tensor_tgmem_default_stride(tg.shape, FMT_BF16); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, dev_mem); +#endif + + int oc = tl->shape.c; + uint16_t *transformed_data = transform_bias(oc, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(2, oc, 1, 1); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, (uint8_t *)transformed_data); + + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const 
conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static uint16_t * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + uint16_t *buf = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX/2; //5 ~ -5 + val = (float)(rand()-RAND_MAX2)*5 / (float)RAND_MAX; + buf[i] = cvk_convert_fp32_bf16(val); + } + return buf; +} + +static uint16_t * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + uint16_t *buf = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX/2; // 5 ~ -5 + val = (float)(rand()-RAND_MAX2)*5 / (float)RAND_MAX; + buf[i] = cvk_convert_fp32_bf16(val); + } + + return buf; +} + +static uint32_t * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + uint32_t *bias = (uint32_t *)malloc(sizeof(uint32_t) * oc); + if (!bias) + return NULL; + + for (int i = 0; i < oc; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX/2; // 5 ~ -5 + val = (float)(rand()-RAND_MAX2)*5 / (float)RAND_MAX; + bias[i] = cvk_convert_fp32_hex(val); + } + + return bias; +} + +static cvk_tl_t * conv_ifmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = CVK_FMT_BF16;//p->opd0_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 1); +} + +static cvk_tl_t * conv_weight_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = CVK_FMT_BF16;//p->opd1_sign? 
CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static cvk_tl_t * conv_ofmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_tl_shape_t s; + cvk_fmt_t fmt = CVK_FMT_BF16; + s.n = p->input_n; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 1); +} + +static cvk_tl_t * conv_bias_tensor( + cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = CVK_FMT_BF16; + cvk_tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const cvk_tiu_pt_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + + dst->ifmap = conv_ifmap_tensor(cvk_ctx, p); + dst->weight = conv_weight_tensor(cvk_ctx, p); + dst->ofmap = conv_ofmap_tensor(cvk_ctx, p); + dst->bias = NULL; + dst->ps32_mode = 0; + if (p->using_bias) + dst->bias = conv_bias_tensor(cvk_ctx, p); + dst->w_is_const = 0; +} + +static void free_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *r, + const conv_param_t *p) +{ + if (p->using_bias && r->bias) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->bias); + if (r->ofmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ofmap); + if (r->weight) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->weight); + if (r->ifmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ifmap); +} + +static void _init_conv_param(conv_param_t *p, int stride_w, int stride_h) +{ + printf("init_conv_param\n"); + memset(p, 0, sizeof(*p)); + p->random_seed = clock(); + srand(p->random_seed); + +retry: + + p->input_n = rand() % 5 + 1; + p->input_c = rand() % (5 * 32) + 1; + p->kh = rand() % 7 + 1; + p->kw = rand() % 7 + 1; + p->input_h = rand() % 40 + p->kh + (INVALIDE_STRIDE == stride_h ? 0 : stride_h); + p->input_w = rand() % 40 + p->kw + (INVALIDE_STRIDE == stride_w ? 0 : stride_w); + p->output_c = rand() % 10 + 3; + p->stride_h = INVALIDE_STRIDE == stride_h ? rand() % (p->kh) + 1 : stride_h; + p->stride_w = INVALIDE_STRIDE == stride_w ? 
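/* p->kh once more where p->kw reads more natural; the resulting stride is still in range */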
rand() % (p->kh) + 1 : stride_w; + p->ins_h = rand() % p->kh; + p->ins_w = rand() % p->kw; + p->ins_h_last = rand() % p->kh;; + p->ins_w_last = rand() % p->kw;; + p->dh = rand() % 3 + 1; + p->dw = rand() % 3 + 1; + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + p->pad_top = rand() % kh_ext; + p->pad_bot = rand() % kh_ext; + p->pad_left = rand() % kw_ext; + p->pad_right = rand() % kw_ext; + + if (!conv_param_is_ok(p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p->using_bias = rand() % 2; + p->bReLU_EN = rand() % 2; + + p->opd0_sign = rand() % 2; + p->opd1_sign = 1; + p->opd2_sign = 1; + + assert(p->opd1_sign == 1 && p->opd2_sign == 1); + + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); +} + +static void init_conv_param(conv_param_t *p) { + _init_conv_param(p, INVALIDE_STRIDE, INVALIDE_STRIDE); +} + +#if 1 +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", "p->input_w = ", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} +#endif + +static int test_conv( + conv_param_t *p_param, CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + uint16_t *input = alloc_input(p_param); + uint16_t *weight = alloc_weight(p_param); + uint32_t *bias = alloc_bias(p_param); + uint16_t *output_ref = (uint16_t *)malloc(sizeof(uint16_t) * conv_output_size(p_param)); + if (!input || !weight || !bias || !output_ref) { + ret = -1; + goto fail_exit; + } + + //print_conv_param(p_param); + + ret = conv_ref(p_param, input, weight, bias, output_ref); + if (ret) + goto fail_exit; + + cvk_tiu_pt_convolution_param_t conv_param; + make_bmk_conv_param(cvk_ctx, &conv_param, p_param); + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param); + if (tl_alloc_success) { + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, conv_param.ifmap, (uint8_t *)input); + put_conv_weight(rt_handle, cvk_ctx, conv_param.weight, weight); + if 
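/* bias entries are raw fp32 bit patterns (alloc_bias uses cvk_convert_fp32_hex); put_conv_bias splits each one into high/low 16-bit rows */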
(p_param->using_bias) + put_conv_bias(rt_handle, cvk_ctx, conv_param.bias, bias); + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &conv_param); + uint16_t *output = (uint16_t *) tensor_copy_l2g_d2s(rt_handle, cvk_ctx, conv_param.ofmap); + + ret = array_cmp_int8( + "Comparing results ...\n", + (int8_t*)output_ref, (int8_t*)output, conv_output_size(p_param)*2); + + if (ret) { + print_conv_param(p_param); + printf("Comparison FAILED\n"); + ret = -1; + } + free(output); + } + + free_bmk_conv_param(cvk_ctx, &conv_param, p_param); + +fail_exit: + free(input); + free(weight); + free(output_ref); + free(bias); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + int round_mode; + round_mode = cvk_set_store_feround(); + + // 20 -> 5 for 1810 + for (int i = 0; i < 5; i++) { + printf("random_test_conv iteration: %d\n", i); + conv_param_t test_conv_param; + init_conv_param(&test_conv_param); + + ret |= test_conv(&test_conv_param, rt_handle, cvk_ctx); + + if (!test_conv_param.using_bias) + test_conv_param.using_bias = 1; + + if (test_conv_param.output_c <= 32) + { + test_conv_param.output_c += 32; + } + ret |= test_conv(&test_conv_param, rt_handle, cvk_ctx); + } + + // test stride extend (0, 31] + int stride_list[] = {15, 16, 31}; + int stride_list_len = sizeof(stride_list) / sizeof(stride_list[0]); + + for (int stride_w_idx = 0; stride_w_idx < stride_list_len; stride_w_idx++) { + for (int stride_h_idx = 0; stride_h_idx < stride_list_len; stride_h_idx++) { + int stride_w = stride_list[stride_w_idx]; + int stride_h = stride_list[stride_h_idx]; + conv_param_t test_conv_param; + _init_conv_param(&test_conv_param, stride_w, stride_h); + + ret |= test_conv(&test_conv_param, rt_handle, cvk_ctx); + } + } + + cvk_restore_feround(round_mode); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_bf16_conv_ps32.c b/cviruntime/test/181x/test_181x_bf16_conv_ps32.c new file mode 100644 index 000000000..5061e5f98 --- /dev/null +++ b/cviruntime/test/181x/test_181x_bf16_conv_ps32.c @@ -0,0 +1,1193 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" +#include "test_native_ref.h" + +typedef struct { + int random_seed; + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int stride_w; + int output_c; + int using_bias; + int bReLU_EN; + int r_shift_m; + int opd0_sign; + int opd1_sign; + int opd2_sign; + int bf16_enable; +} conv_param_t; + +static inline void bf16_relu(float *buf, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) + if (buf[i] < 0) + buf[i] = 0; +} + +static int ps32_conv_ref( + const conv_param_t *p_param, + const uint16_t *ifmap, + const uint16_t *weight, + const uint32_t *bias, + uint16_t *ofmap, int ps32_mode) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; 
+ int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + float *result = (float *)malloc(sizeof(float) * in * oc * oh * ow); + if (!result) + return -1; + + uint32_t bstride = in * oc * oh * ow; + int ret = 0; + + if (ps32_mode == 2 || ps32_mode == 0) + memset(result, 0, sizeof(float) * in * oc * oh * ow); + else { + for (int i = 0; i < in * oc * oh * ow; i++) { + result[i] = cvk_convert_hex_fp32((ofmap[i + bstride * 0] << 16) | ofmap[i + bstride * 1]); + } + } + + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + uint16_t *i_fmap_pad[ic]; + uint16_t *kernel_pad[ic]; + + for (int iic = 0; iic < ic; ++iic) { + i_fmap_pad[iic] = NULL; + kernel_pad[iic] = NULL; + fill_pad_fmap_bf16( + (ifmap + n*ic*ih*iw + iic*ih*iw), &i_fmap_pad[iic], cvk_convert_fp32_bf16(0), + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + //kernel_dilation( + fill_pad_fmap_bf16( + (weight + c*ic*kh*kw + iic*kh*kw), &kernel_pad[iic], cvk_convert_fp32_bf16(0), + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + } + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + float result_val= result[n*oc*oh*ow + c*oh*ow + ph*ow + pw]; + for (int idxh = 0; idxh < kh_ext; ++idxh) { + for (int idxw = 0; idxw < kw_ext; ++idxw) { + for (int iic = 0; iic < ic; ++iic){ + float ifv = cvk_convert_bf16_fp32(i_fmap_pad[iic][(idxh+ph*stride_h) * iw_ext + idxw + pw*stride_w]); + float ikv = cvk_convert_bf16_fp32(kernel_pad[iic][idxh* kw_ext + idxw]); + result_val += ifv*ikv; + } + } + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] = result_val; + } + } + for(int i = 0; i < ic; i++) { + if (i_fmap_pad[i]) { + free(i_fmap_pad[i]); + i_fmap_pad[i] = NULL; + } + if (kernel_pad[i]) { + free(kernel_pad[i]); + kernel_pad[i] = NULL; + } + } + } //end for (int c = 0; c < oc; ++c) + } + + if( ps32_mode & 0x2) { + for (int i = 0; i < in * oc * oh * ow; i ++) { + ofmap[i] = cvk_convert_fp32_hex(result[i]) >> 16; + ofmap[bstride + i] = cvk_convert_fp32_hex(result[i]) & 0xFFFF; + } + } else { + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += cvk_convert_hex_fp32(bias[c]); //bias+c ; + } + } + } + if (p_param->bReLU_EN) + bf16_relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + } + } + for (int i = 0; i < in * oc * oh * ow; i++) { + ofmap[i] = cvk_convert_fp32_bf16(result[i]); + } + } + free(result); + return ret; +} + +static uint16_t * transform_weight(const cvk_tl_shape_t *s, uint16_t before[]) +{ + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint32_t size = ic * oc * kh * kw; + uint16_t *after = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!after) + return NULL; + + /* + * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) + */ + for (uint32_t oci = 0; oci 
< oc; oci++) { + for (uint32_t ici = 0; ici < ic; ici++) { + for (uint32_t khi = 0; khi < kh; khi++) { + for (uint32_t kwi = 0; kwi < kw; kwi++) { + uint32_t src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + uint32_t dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + uint16_t *data) +{ +#if 0 + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw*2); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + uint16_t *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. bm_memcpy_s2d regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. + */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //bmmem_device_t ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = bm_memcpy_s2d(*ctx, ab_dev_mem, (u8*)transformed_data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (u8*)transformed_data); + + assert(ret == BM_SUCCESS); + cvk_tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_BF16; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1822_tensor_tgmem_default_stride(tdma_tg.shape, FMT_BF16); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1822_tensor_lmem_default_stride(bk_ctx, tdma_shape, FMT_BF16, 0); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1822_tdma_g2l_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + bmmem_device_free(*ctx, dev_mem); + //delete[] transformed_data; + return transformed_data; +#endif + + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint16_t *transformed_data = transform_weight(s, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(1, oc, kh * kw, ic); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, (uint8_t *)transformed_data); + + free(transformed_data); +} + + +static uint16_t * transform_bias(int oc, uint32_t before[]) +{ + uint16_t *after = (uint16_t *)malloc(sizeof(uint16_t) * 2 * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = (before[i] >> 16) & 0xffff; + after[i + oc] = before[i] & 0xffff; + } + return after; +} + +static void put_conv_bias( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + uint32_t *data) +{ +#if 0 + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2 * sizeof(short), oc, 1, 1); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + uint16_t *transformed_data = transform_bias(oc, data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (u8 *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; 
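+ /* the (2, oc, 1, 1) shape below: row 0 carries the high 16 bits of each fp32 bias, row 1 the low 16 bits, exactly the layout transform_bias produces */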
+ tg.start_address = gaddr; + tg.fmt = FMT_BF16; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1822_tensor_tgmem_default_stride(tg.shape, FMT_BF16); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, dev_mem); +#endif + + int oc = tl->shape.c; + uint16_t *transformed_data = transform_bias(oc, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(2, oc, 1, 1); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, (uint8_t *)transformed_data); + + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static uint16_t * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + uint16_t *buf = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) { + buf[i] = cvk_convert_fp32_bf16(i); + } + + return buf; +} + +static uint16_t * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + uint16_t *buf = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) { + buf[i] = cvk_convert_fp32_bf16(i); + } + + return buf; +} + +static uint32_t * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + uint32_t *bias = (uint32_t *)malloc(sizeof(uint32_t) * oc); + if (!bias) + return NULL; + + float val = 100; + for (int i = 0; i < oc; i++) { + bias[i] = cvk_convert_fp32_hex(val); + val += 1; + } + return bias; +} + +static cvk_tl_t * conv_ifmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = CVK_FMT_BF16; + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 1); +} + +static uint32_t conv_ifmap_tensor_size(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = CVK_FMT_BF16; + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, s, fmt, 1); +} + +static cvk_tl_t * conv_weight_tensor(cvk_context_t 
*cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = CVK_FMT_BF16; //p->opd1_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static uint32_t conv_weight_tensor_to_size(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = CVK_FMT_BF16; //p->opd1_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, s, fmt, 0); +} + +static cvk_tl_t * conv_ofmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_tl_shape_t s; + s.n = p->input_n * 4; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + cvk_tl_t *tl = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, CVK_FMT_BF16, 1); + if (tl) + tl->shape.n = p->input_n; + return tl; +} + +static uint32_t conv_ofmap_tensor_to_size(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_tl_shape_t s; + s.n = p->input_n * sizeof(uint32_t) / sizeof(uint8_t); + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, s, CVK_FMT_BF16, 1); +} + +static cvk_tl_t * conv_bias_tensor( + cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = CVK_FMT_BF16;//p->opd2_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static uint32_t conv_bias_tensor_size( + cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = CVK_FMT_BF16;//p->opd2_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const cvk_tiu_pt_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + if(p->ps32_mode==1) + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param_ps32( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *dst, + const conv_param_t *p, uint32_t ps32_mode) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + if(ps32_mode==2) + { + uint32_t ifmap_size = conv_ifmap_tensor_size(cvk_ctx, p); + uint32_t weight_size = conv_weight_tensor_to_size(cvk_ctx, p); + uint32_t ofmap_size = conv_ofmap_tensor_to_size(cvk_ctx, p); + uint32_t bias_size = p->using_bias ? conv_bias_tensor_size(cvk_ctx, p) : 0; + uint32_t total_size = ifmap_size + weight_size + ofmap_size + bias_size; + + // Allocation if size fit. 
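+ // ps32 needs headroom: conv_ofmap_tensor reserves input_n * 4 rows of bf16, presumably so the TIU can keep 32-bit running sums; the tests later read back only the high/low halves (n * 2).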
+ if (total_size <= cvk_ctx->info.lmem_size) { + dst->ifmap = conv_ifmap_tensor(cvk_ctx, p); + dst->weight = conv_weight_tensor(cvk_ctx, p); + dst->ofmap = conv_ofmap_tensor(cvk_ctx, p); + } else { + dst->ifmap = NULL; + dst->weight = NULL; + dst->ofmap = NULL; + } + } + + dst->ps32_mode = ps32_mode; + + dst->bias = NULL; + dst->relu_enable = 0; + dst->rshift_bits = 0; + if(ps32_mode==1) + { + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + // only mode=1 can use bias + if (p->using_bias) + dst->bias = conv_bias_tensor(cvk_ctx, p); + } +} + +static void make_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(*dst)); + + uint32_t ifmap_size = conv_ifmap_tensor_size(cvk_ctx, p); + uint32_t weight_size = conv_weight_tensor_to_size(cvk_ctx, p); + uint32_t ofmap_size = conv_ofmap_tensor_to_size(cvk_ctx, p); + uint32_t bias_size = p->using_bias ? conv_bias_tensor_size(cvk_ctx, p) : 0; + uint32_t total_size = ifmap_size + weight_size + ofmap_size + bias_size; + + // Allocation if size fit. + if (total_size <= cvk_ctx->info.lmem_size) { + dst->ifmap = conv_ifmap_tensor(cvk_ctx, p); + dst->weight = conv_weight_tensor(cvk_ctx, p); + dst->ofmap = conv_ofmap_tensor(cvk_ctx, p); + + if (p->using_bias) + dst->bias = conv_bias_tensor(cvk_ctx, p); + } else { + dst->ifmap = NULL; + dst->weight = NULL; + dst->ofmap = NULL; + } + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + // dst->ifmap = conv_ifmap_tensor(cvk_ctx, p); + // dst->weight = conv_weight_tensor(cvk_ctx, p); + // dst->ofmap = conv_ofmap_tensor(cvk_ctx, p); + // dst->bias = NULL; + dst->ps32_mode = 0; + // if (p->using_bias) + // dst->bias = conv_bias_tensor(cvk_ctx, p); +} + +static void free_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *r, + const conv_param_t *p) +{ + if (p->using_bias && r->bias) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->bias); + + if (r->ofmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ofmap); + + if (r->weight) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->weight); + + if (r->ifmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ifmap); +} + +static void init_conv_param(conv_param_t *p) +{ + printf("init_conv_param\n"); + p->random_seed = clock(); + srand(p->random_seed); + +retry: + + memset(p, 0, sizeof(*p)); + p->input_n = 1; + p->input_c = rand() % (10) + 2; + p->kh = rand() % 6 + 1; + p->kw = rand() % 6 + 1; + p->input_h = rand() % 10 + p->kh; + p->input_w = rand() % 10 + p->kw; + p->output_c = rand() % 10 + 3; + p->stride_h = rand() % (p->kh) + 1; + p->stride_w = rand() % (p->kw) + 1; + p->ins_h = rand() % p->kh; + p->ins_w = rand() % p->kw; + p->ins_h_last = rand() % p->kh;; + p->ins_w_last = rand() % p->kw;; + p->dh = rand() % 3 + 1; + p->dw = rand() % 3 + 1; + + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + p->pad_top = rand() % kh_ext; + p->pad_bot = rand() % kh_ext; + p->pad_left = rand() % kw_ext; + p->pad_right = rand() % kw_ext; + + if (!conv_param_is_ok(p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p->using_bias = rand() % 2; + p->r_shift_m = rand() % 8; + 
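/* bias, relu and rshift only take effect in ps32 end mode (ps32_mode == 1); make_bmk_conv_param_ps32 leaves them disabled for modes 2 and 3 */ +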
p->bReLU_EN = rand() % 2; + p->opd0_sign = rand() % 2; + p->opd1_sign = 1; + p->opd2_sign = 1; + + assert(p->opd1_sign == 1 && p->opd2_sign == 1); + + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); +} + +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", "p->input_w = ", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} + +static int test_ps32_ut( + conv_param_t *p_param, CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + printf("test_ps32_ut\n"); + int ret = 0; + uint16_t *input = alloc_input(p_param); + uint16_t *weight = alloc_weight(p_param); + uint32_t *bias = alloc_bias(p_param); + uint16_t *output_ref = (uint16_t *)malloc(sizeof(uint32_t) * conv_output_size(p_param)); + if (!input || !weight || !bias || !output_ref) { + ret = -1; + goto fail_exit; + } + + ret = ps32_conv_ref(p_param, input, weight, bias, output_ref, 2); + if (ret) + goto fail_exit; + + cvk_tiu_pt_convolution_param_t conv_param; + make_bmk_conv_param_ps32(cvk_ctx, &conv_param, p_param, 2); + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param); + if (tl_alloc_success) { + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, conv_param.ifmap, (uint8_t *)input); + put_conv_weight(rt_handle, cvk_ctx, conv_param.weight, weight); + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &conv_param); + cvk_tl_t ps32_ofmap; + ps32_ofmap = *conv_param.ofmap; + ps32_ofmap.shape.n = ps32_ofmap.shape.n * sizeof(short); + uint16_t *output = (uint16_t*) tensor_copy_l2g_d2s(rt_handle, cvk_ctx, &ps32_ofmap); + + ret = array_cmp_int8( + "Comparing M2 begin_mode results ...\n", + (int8_t*)output_ref, (int8_t *)output, conv_output_size(p_param) * sizeof(int)); + + if (ret) { + print_conv_param(p_param); + printf("Comparison M2 FAILED\n"); + exit(-1); + } else + printf("Comparison M2 PASS\n"); + + free(output); + } + free_bmk_conv_param(cvk_ctx, &conv_param, p_param); + + 
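+  // Phase 2 of 3: replay the convolution in ps32 intermediate mode (mode 3),
+  // reseeding input/weight with a bf16 ramp so the accumulated 32-bit
+  // partial sums can be compared against ps32_conv_ref.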
printf("test_ps32_intermediate_mode\n"); + for (int i=0; i < conv_input_size(p_param); i++) + input[i] = cvk_convert_fp32_bf16(i); + + for (int i=0; i < conv_weight_size(p_param); i++) + weight[i] = cvk_convert_fp32_bf16(i); + + ret = ps32_conv_ref(p_param, input, weight, bias, output_ref, 3); + if (ret) + return ret; + + make_bmk_conv_param_ps32(cvk_ctx, &conv_param, p_param, 3); + tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param); + + if (tl_alloc_success) { + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, conv_param.ifmap, (uint8_t *)input); + put_conv_weight(rt_handle, cvk_ctx, conv_param.weight, weight); + + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &conv_param); + + cvk_tl_t ps32_ofmap; + ps32_ofmap = *conv_param.ofmap; + ps32_ofmap.shape.n = ps32_ofmap.shape.n * sizeof(short); + + uint16_t *output = (uint16_t*) tensor_copy_l2g_d2s(rt_handle, cvk_ctx, &ps32_ofmap); + + ret = array_cmp_int8( + "Comparing M3 intermediate results ...\n", + (int8_t*)output_ref, (int8_t *)output, conv_output_size(p_param) * sizeof(int)); + + if (ret) { + print_conv_param(p_param); + printf("Comparison M3 FAILED\n"); + exit(-1); + } else + printf("Comparison M3 PASS\n"); + + free(output); + } + free_bmk_conv_param(cvk_ctx, &conv_param, p_param); + + printf("test_ps32_end_mode\n"); + for (int i=0; i < conv_input_size(p_param); i++) + input[i] = cvk_convert_fp32_bf16(i); + + for (int i=0; i < conv_weight_size(p_param); i++) + weight[i] = cvk_convert_fp32_bf16(i); + + ret = ps32_conv_ref(p_param, input, weight, bias, output_ref, 1); + if (ret) + return ret; + + make_bmk_conv_param_ps32(cvk_ctx, &conv_param, p_param, 1); + + tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param); + + if (tl_alloc_success) { + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, conv_param.ifmap, (uint8_t *)input); + put_conv_weight(rt_handle, cvk_ctx, conv_param.weight, weight); + if (p_param->using_bias) { + put_conv_bias(rt_handle, cvk_ctx, conv_param.bias, bias); + } + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &conv_param); + uint16_t *output = (uint16_t*) tensor_copy_l2g_d2s(rt_handle, cvk_ctx, conv_param.ofmap); + + ret = array_cmp_int8( + "Comparing M1 end results ...\n", + (int8_t*)output_ref, (int8_t *)output, conv_output_size(p_param) * 2); + + if (ret) { + print_conv_param(p_param); + printf("Comparison M1 FAILED\n"); + exit(-1); + } else + printf("Comparison M1 PASS\n"); + + free(output); + } + free_bmk_conv_param(cvk_ctx, &conv_param, p_param); + +fail_exit: + free(input); + free(weight); + free(bias); + free(output_ref); + + return ret; +} + +static int test_ic_tiling_conv( + conv_param_t *p_param, CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + printf("test tiled ps32 conv\n"); + int ret = 0; + uint32_t output_size = sizeof(uint16_t) * conv_output_size(p_param); + uint16_t *input = alloc_input(p_param); + uint16_t *weight = alloc_weight(p_param); + uint32_t *bias = alloc_bias(p_param); + uint16_t *output_ref = (uint16_t *)malloc(output_size); + if (!input || !weight || !bias || !output_ref) { + ret = -1; + goto fail_exit; + } + + p_param->r_shift_m = 0; + memset((uint8_t*)output_ref, 0, conv_output_size(p_param)*2); + ret = ps32_conv_ref(p_param, input, weight, bias, output_ref, 0); + if (ret) + goto fail_exit; + + cvk_tiu_pt_convolution_param_t conv_tmp_param; + cvk_tiu_pt_convolution_param_t conv_param; + make_bmk_conv_param(cvk_ctx, &conv_param, p_param); + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param); + if (tl_alloc_success) { + if (p_param->using_bias) { + 
conv_tmp_param.bias = conv_param.bias; + put_conv_bias(rt_handle, cvk_ctx, conv_param.bias, bias); + } + // for ps32_md[1] = 1, relu_enable & rshift_bits need to set to 0 + // so we store those parameters to conv_tmp_para + conv_tmp_param.relu_enable = conv_param.relu_enable; + conv_tmp_param.rshift_bits = conv_param.rshift_bits; + conv_tmp_param.bias = conv_param.bias; + + uint32_t ic_step = 1; + uint32_t n_step = 1; + cvk_tl_t ifmap = *conv_param.ifmap; + cvk_tl_t ofmap = *conv_param.ofmap; + cvk_tg_shape_t s; + s.n = conv_param.ifmap->shape.n; + s.c = conv_param.ifmap->shape.c; + s.h = conv_param.ifmap->shape.h; + s.w = conv_param.ifmap->shape.w; + cvk_tg_t *tg_ifmap = alloc_tensor_dev_mem(rt_handle, cvk_ctx, s, CVK_FMT_BF16); + tensor_copy_s2d(rt_handle, tg_ifmap, (uint8_t *)input); + + s.n = conv_param.weight->shape.n; + s.c = conv_param.weight->shape.c; + s.h = conv_param.weight->shape.h; + s.w = conv_param.weight->shape.w; + uint16_t *transformed_weight = + transform_weight(&conv_param.weight->shape, (uint16_t *)weight); + cvk_tg_t *tg_weight = alloc_tensor_dev_mem(rt_handle, cvk_ctx, s, CVK_FMT_BF16); + tensor_copy_s2d(rt_handle, tg_weight, (uint8_t *)transformed_weight); + free(transformed_weight); + + cvk_tl_shape_t cur_tl_ifmap_shape = { + n_step, + ic_step, + ifmap.shape.h, + ifmap.shape.w + }; + + cvk_tg_shape_t cur_tg_ifmap_shape = { + cur_tl_ifmap_shape.n, + cur_tl_ifmap_shape.c, + cur_tl_ifmap_shape.h, + cur_tl_ifmap_shape.w + }; + + cvk_tg_stride_t cur_tg_ifmap_stride = { + tg_ifmap->stride.n, + tg_ifmap->stride.c, + tg_ifmap->stride.h, + fmt_size(CVK_FMT_BF16) + }; + + cvk_tg_t cur_tg_ifmap; + cur_tg_ifmap.base_reg_index = 0; + cur_tg_ifmap.start_address = tg_ifmap->start_address; + cur_tg_ifmap.shape = cur_tg_ifmap_shape; + cur_tg_ifmap.stride = cur_tg_ifmap_stride; + cur_tg_ifmap.fmt = CVK_FMT_BF16; + + cvk_tl_t cur_tl_ifmap; + cur_tl_ifmap.shape = cur_tl_ifmap_shape; + cur_tl_ifmap.stride = + cvk_ctx->ops->tl_default_stride(cvk_ctx, cur_tl_ifmap_shape, CVK_FMT_BF16, 1); + cur_tl_ifmap.start_address = ifmap.start_address; + cur_tl_ifmap.fmt = ifmap.fmt; + + cvk_tl_t cur_tl_ofmap; + cur_tl_ofmap.start_address = ofmap.start_address; + cur_tl_ofmap.shape = ofmap.shape; + cur_tl_ofmap.shape.n = n_step; + cur_tl_ofmap.stride = + cvk_ctx->ops->tl_default_stride(cvk_ctx, cur_tl_ofmap.shape, CVK_FMT_BF16, 1); + cur_tl_ofmap.fmt = ofmap.fmt; + + cvk_tl_t cur_tl_weight; + cur_tl_weight.start_address = conv_param.weight->start_address; + cur_tl_weight.shape = conv_param.weight->shape; + cur_tl_weight.shape.n = ic_step; + cur_tl_weight.stride.n = 2; + cur_tl_weight.stride.c = cur_tl_weight.shape.n * cur_tl_weight.shape.h * cur_tl_weight.shape.w * 2; + cur_tl_weight.stride.h = cur_tl_weight.shape.n * cur_tl_weight.shape.w * 2; + cur_tl_weight.stride.w = cur_tl_weight.shape.n * 2; + cur_tl_weight.fmt = conv_param.weight->fmt; + + const cvk_tl_t *saved_tl_weight = conv_param.weight; + const cvk_tl_t *saved_tl_ifmap = conv_param.ifmap; + for (uint32_t ci = 0; ci < ifmap.shape.c; ci += ic_step) { + { + uint32_t ic = tg_weight->shape.n; + uint32_t oc = tg_weight->shape.c; + uint32_t kh = tg_weight->shape.h; + uint32_t kw = tg_weight->shape.w; + + cvk_tg_t cur_tdma_tg_weight; + cur_tdma_tg_weight.base_reg_index = tg_weight->base_reg_index; + cur_tdma_tg_weight.start_address = tg_weight->start_address + ci * (tg_weight->fmt == CVK_FMT_BF16 ? 
2 : 1); + cur_tdma_tg_weight.fmt = tg_weight->fmt; + cur_tdma_tg_weight.shape = tg_shape_t4(1, oc, kh * kw, ic); + cur_tdma_tg_weight.stride = + cvk_ctx->ops->tg_default_stride(cvk_ctx, cur_tdma_tg_weight.shape, CVK_FMT_BF16); + cur_tdma_tg_weight.shape = tg_shape_t4(1, oc, kh * kw, ic_step); + + cvk_tl_t cur_tdma_tl_weight; + cur_tdma_tl_weight = cur_tl_weight; + cur_tdma_tl_weight.shape.n = cur_tdma_tg_weight.shape.n; + cur_tdma_tl_weight.shape.c = cur_tdma_tg_weight.shape.c; + cur_tdma_tl_weight.shape.h = cur_tdma_tg_weight.shape.h; + cur_tdma_tl_weight.shape.w = cur_tdma_tg_weight.shape.w; + cur_tdma_tl_weight.stride = cvk_ctx->ops->tl_default_stride( + cvk_ctx, cur_tdma_tl_weight.shape, cur_tdma_tl_weight.fmt, 0); + + cvk_tdma_g2l_tensor_copy_param_t p1; + memset(&p1, 0, sizeof(p1)); + p1.src = &cur_tdma_tg_weight; + p1.dst = &cur_tdma_tl_weight; + cvk_ctx->ops->tdma_g2l_bf16_tensor_copy(cvk_ctx, &p1); + CVI_RT_Submit(cvk_ctx); + } + { + cvk_tdma_g2l_tensor_copy_param_t p2; + memset(&p2, 0, sizeof(p2)); + cur_tg_ifmap.start_address = + tg_ifmap->start_address + ci * tg_ifmap->stride.c; + p2.src = &cur_tg_ifmap; + p2.dst = &cur_tl_ifmap; + cvk_ctx->ops->tdma_g2l_bf16_tensor_copy(cvk_ctx, &p2); + CVI_RT_Submit(cvk_ctx); + } + + conv_param.ifmap = &cur_tl_ifmap; + conv_param.weight = &cur_tl_weight; + + // for ps32_md[1] = 1, relu_enable & rshift_bits need to set to 0 + // so we store those parameters to conv_tmp_para + if (ci == (ifmap.shape.c - 1)) + { + conv_param.relu_enable = conv_tmp_param.relu_enable; + conv_param.rshift_bits = conv_tmp_param.rshift_bits; + conv_param.bias = conv_tmp_param.bias; + conv_param.ps32_mode = 1; + } + else if (ci == 0) + { + conv_param.relu_enable = 0; + conv_param.rshift_bits = 0; + conv_param.bias = 0;; + conv_param.ps32_mode = 2; + } + else + { + conv_param.relu_enable = 0; + conv_param.rshift_bits = 0; + conv_param.bias = 0;; + conv_param.ps32_mode = 3; + } + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &conv_param); + conv_param.weight = saved_tl_weight; + conv_param.ifmap = saved_tl_ifmap; + } + + uint16_t *output = (uint16_t*) tensor_copy_l2g_d2s(rt_handle, cvk_ctx, conv_param.ofmap); + + free_tensor_dev_mem(rt_handle, tg_ifmap); + free_tensor_dev_mem(rt_handle, tg_weight); + + ret = array_cmp_int8( + "Comparing results ...\n", + (int8_t*) output_ref, (int8_t *)output, conv_output_size(p_param)*2); + if (ret) { + print_conv_param(p_param); + printf("Comparison FAILED\n"); + ret = -1; + } + free(output); + } + free_bmk_conv_param(cvk_ctx, &conv_param, p_param); + +fail_exit: + free(input); + free(weight); + free(output_ref); + free(bias); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + int round_mode; + round_mode = cvk_set_store_feround(); + + for (int i = 0; i < 15; i++) { + printf("random_test_conv iteration: %d\n", i); + conv_param_t test_conv_param; + init_conv_param(&test_conv_param); + //print_conv_param(&test_conv_param); + ret |= test_ic_tiling_conv(&test_conv_param, rt_handle, cvk_ctx); + if (ret) + break; + ret |= test_ps32_ut(&test_conv_param, rt_handle, cvk_ctx); + if (ret) + break; + + if (!test_conv_param.using_bias) + test_conv_param.using_bias = 1; + if (test_conv_param.output_c <= 9) + test_conv_param.output_c += 3; + 
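+    // Second pass over the same random seed: force the bias path on and (for
+    // small cases) widen output_c so a different tiling is exercised.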
//print_conv_param(&test_conv_param); + ret |= test_ic_tiling_conv(&test_conv_param, rt_handle, cvk_ctx); + if (ret) + break; + + ret |= test_ps32_ut(&test_conv_param, rt_handle, cvk_ctx); + if (ret) + break; + } + cvk_restore_feround(round_mode); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_bf16_conv_zero_ratio.c b/cviruntime/test/181x/test_181x_bf16_conv_zero_ratio.c new file mode 100644 index 000000000..04f4510bc --- /dev/null +++ b/cviruntime/test/181x/test_181x_bf16_conv_zero_ratio.c @@ -0,0 +1,810 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" +#include "test_native_ref.h" + +typedef struct{ + uint16_t *conv_input; + uint16_t *conv_weight; + uint32_t *conv_bias; + uint16_t *conv_output; + uint16_t *conv_output_ref; +}u_test_data; + +typedef struct { + int random_seed; + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int stride_w; + int output_c; + int using_bias; + int bReLU_EN; + int r_shift_m; + int opd0_sign; + int opd1_sign; + int opd2_sign; + int izratio; + int kzratio; +} conv_param_t; + +conv_param_t conv_param; +u_test_data u16_test_data; +cvk_tiu_pt_convolution_param_t bmk_conv_param; + +cvk_tl_t *skip_tensor_lmem[10]; +uint32_t skip_tensor_num=0; + +/* need to make sure the free order of test_alloc_tl for skip_tensor_lmem*/ +void skip_tensor_lmem_size(cvk_context_t *cvk_ctx, const cvk_tl_t *p) +{ + uint32_t needed = align_up(p->shape.n * p->stride.n, cvk_ctx->info.eu_num); + uint32_t start_addr = p->start_address + needed; //src_shape.n*src_shape.c*src_shape.h*src_shape.w/32; + uint32_t remain_size = start_addr % cvk_ctx->info.lmem_bank_size ? 
(cvk_ctx->info.lmem_bank_size - start_addr % cvk_ctx->info.lmem_bank_size) : 0; // remaining size in the current bank, per lane
+  if (remain_size)
+  {
+    cvk_tl_shape_t src_shape2 = {1, cvk_ctx->info.npu_num, 1, remain_size};
+    skip_tensor_lmem[skip_tensor_num] = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, src_shape2, CVK_FMT_BF16, 1); // pad the allocation so the next tl is aligned to a bank boundary
+  }
+  skip_tensor_num++;
+}
+
+void free_skip_tensor_lmem(cvk_context_t *cvk_ctx)
+{
+  if (skip_tensor_lmem[--skip_tensor_num] != NULL)
+    cvk_ctx->ops->lmem_free_tensor(cvk_ctx, skip_tensor_lmem[skip_tensor_num]);
+}
+
+static inline void bf16_relu(float *buf, uint64_t size)
+{
+  for (uint64_t i = 0; i < size; i++)
+    if (buf[i] < 0)
+      buf[i] = 0;
+}
+
+static int conv_ref(
+    const conv_param_t *p_param,
+    const uint16_t *ifmap,
+    const uint16_t *weight,
+    const uint32_t *bias,
+    uint16_t *ofmap)
+{
+  int in = p_param->input_n;
+  int ic = p_param->input_c;
+  int ih = p_param->input_h;
+  int iw = p_param->input_w;
+  int oc = p_param->output_c;
+  int kh = p_param->kh;
+  int kw = p_param->kw;
+  int dh = p_param->dh;
+  int dw = p_param->dw;
+  int stride_h = p_param->stride_h;
+  int stride_w = p_param->stride_w;
+  int pad_top = p_param->pad_top;
+  int pad_bot = p_param->pad_bot;
+  int pad_left = p_param->pad_left;
+  int pad_right = p_param->pad_right;
+  int ins_h = p_param->ins_h;
+  int ins_h_last = p_param->ins_h_last;
+  int ins_w = p_param->ins_w;
+  int ins_w_last = p_param->ins_w_last;
+  int do_relu = p_param->bReLU_EN;
+
+  int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot);
+  int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right);
+  int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0);
+  int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0);
+  int oh = calc_output_hw(ih_ext, kh_ext, stride_h);
+  int ow = calc_output_hw(iw_ext, kw_ext, stride_w);
+  float *result = (float *)malloc(sizeof(float) * in * oc * oh * ow);
+  if (!result)
+    return -1;
+
+  memset(result, 0, sizeof(float) * in * oc * oh * ow);
+  int ret = 0;
+
+  for (int n = 0; n < in; ++n) {
+    for (int c = 0; c < oc; ++c) {
+      uint16_t *i_fmap_pad[ic];
+      uint16_t *kernel_pad[ic];
+
+      for (int iic = 0; iic < ic; ++iic) {
+        i_fmap_pad[iic] = NULL;
+        kernel_pad[iic] = NULL;
+        fill_pad_fmap_bf16(
+            (ifmap + n*ic*ih*iw + iic*ih*iw), &i_fmap_pad[iic], cvk_convert_fp32_bf16(0),
+            pad_left, pad_right, pad_top, pad_bot,
+            ins_h, ins_w, ins_h_last, ins_w_last, ih, iw);
+        // kernel dilation
+        fill_pad_fmap_bf16(
+            (weight + c*ic*kh*kw + iic*kh*kw), &kernel_pad[iic], cvk_convert_fp32_bf16(0),
+            0, 0, 0, 0, // no padding
+            dh - 1, dw - 1, 0, 0,
+            kh, kw);
+      }
+      for (int ph = 0; ph < oh; ++ph) {
+        for (int pw = 0; pw < ow; ++pw) {
+          float result_val = result[n*oc*oh*ow + c*oh*ow + ph*ow + pw];
+          for (int idxh = 0; idxh < kh_ext; idxh += dh) {
+            for (int idxw = 0; idxw < kw_ext; idxw += dw) {
+              for (int iic = 0; iic < ic; ++iic) {
+                float ifv = cvk_convert_bf16_fp32(i_fmap_pad[iic][(idxh+ph*stride_h) * iw_ext + idxw + pw*stride_w]);
+                float ikv = cvk_convert_bf16_fp32(kernel_pad[iic][idxh* kw_ext + idxw]);
+                result_val += ifv*ikv;
+              }
+            }
+          }
+          result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] = result_val;
+        }
+      }
+      if (p_param->using_bias) {
+        for (int ph = 0; ph < oh; ++ph) {
+          for (int pw = 0; pw < ow; ++pw) {
+            result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += cvk_convert_hex_fp32(bias[c]);
+          }
+        }
+      }
+      if (do_relu)
+        bf16_relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow);
+      // convert this channel's accumulated fp32 results back to bf16 and
+      // release the per-channel padded buffers
+      for (int i = 0; i < oh * ow; i++)
+        ofmap[n*oc*oh*ow + c*oh*ow + i] = cvk_convert_fp32_bf16(result[n*oc*oh*ow + c*oh*ow + i]);
+      for (int iic = 0; iic < ic; ++iic) {
+        free(i_fmap_pad[iic]);
+        free(kernel_pad[iic]);
+      }
+    }
+  }
+  free(result);
+
+  return ret;
+}
+
+static uint16_t * transform_weight(const cvk_tl_shape_t *s, uint16_t before[])
+{
+  uint32_t ic = s->n;
+  uint32_t oc = s->c;
+  uint32_t kh = s->h;
+  uint32_t kw = s->w;
+
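+  // The TIU consumes conv weights as (1, oc, kh*kw, ic): oc maps to the lane
+  // (channel) dimension and ic becomes the innermost dimension, so the
+  // host-side reorder below keeps each output channel's taps contiguous.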
uint32_t size = ic * oc * kh * kw; + uint16_t *after = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!after) + return NULL; + + /* + * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) + */ + for (uint32_t oci = 0; oci < oc; oci++) { + for (uint32_t ici = 0; ici < ic; ici++) { + for (uint32_t khi = 0; khi < kh; khi++) { + for (uint32_t kwi = 0; kwi < kw; kwi++) { + uint32_t src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + uint32_t dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + uint16_t *data) +{ +#if 0 + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw*2); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + uint16_t *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. bm_memcpy_s2d regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. + */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //bmmem_device_t ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = bm_memcpy_s2d(*ctx, ab_dev_mem, (u8*)transformed_data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (u8*)transformed_data); + + assert(ret == BM_SUCCESS); + cvk_tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_BF16; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1822_tensor_tgmem_default_stride(tdma_tg.shape, FMT_BF16); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1822_tensor_lmem_default_stride(bk_ctx, tdma_shape, FMT_BF16, 0); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1822_tdma_g2l_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + bmmem_device_free(*ctx, dev_mem); + //delete[] transformed_data; + return transformed_data; +#endif + + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint16_t *transformed_data = transform_weight(s, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(1, oc, kh * kw, ic); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, (uint8_t *)transformed_data); + + free(transformed_data); +} + + +static uint16_t * transform_bias(int oc, uint32_t before[]) +{ + uint16_t *after = (uint16_t *)malloc(2 * sizeof(uint16_t) * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = (before[i] >> 16) & 0xffff; + after[i + oc] = before[i] & 0xffff; + } + return after; +} + +static void put_conv_bias( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + uint32_t *data) +{ +#if 0 + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2 * sizeof(short), oc, 1, 1); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = 
bmmem_device_addr(dev_mem); + uint16_t *transformed_data = transform_bias(oc, data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (u8 *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_BF16; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1822_tensor_tgmem_default_stride(tg.shape, FMT_BF16); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, dev_mem); +#endif + + int oc = tl->shape.c; + uint16_t *transformed_data = transform_bias(oc, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(2, oc, 1, 1); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, (uint8_t *)transformed_data); + + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static uint16_t * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + uint16_t *buf = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) { + if (p->izratio == 0) //almost 100% not zero + buf[i] = cvk_convert_fp32_bf16(rand() % 256 - 128); + else if (p->izratio == 1) + buf[i] = cvk_convert_fp32_bf16(rand() % 2 ? rand() % 256 - 128 : 0); + else + buf[i] = 0; + } + return buf; +} + +static uint16_t * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + uint16_t *buf = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) { + if (p->kzratio == 0) //almost 100% not zero + buf[i] = cvk_convert_fp32_bf16(rand() % 256 - 128); + else if (p->kzratio == 1) + buf[i] = cvk_convert_fp32_bf16(rand() % 2 ? 
rand() % 256 - 128 : 0); + else + buf[i] = 0; + } + return buf; +} + +static uint32_t * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + uint32_t *bias = (uint32_t *)malloc(sizeof(uint32_t) * oc); + if (!bias) + return NULL; + + for (int i = 0; i < oc; i++) + bias[i] = cvk_convert_fp32_hex(rand() % 65536 - 32768); + + return bias; +} + +static cvk_tl_t * conv_ifmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + //cvk_fmt_t fmt = p->opd0_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_fmt_t fmt = CVK_FMT_BF16; + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 1); +} + +static cvk_tl_t * conv_weight_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + //cvk_fmt_t fmt = p->opd1_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_fmt_t fmt = CVK_FMT_BF16; + cvk_tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static cvk_tl_t * conv_ofmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, CVK_FMT_BF16, 1); +} + +static cvk_tl_t * conv_bias_tensor( + cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + //cvk_fmt_t fmt = p->opd2_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_fmt_t fmt = CVK_FMT_BF16; + cvk_tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const cvk_tiu_pt_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + + dst->ifmap = conv_ifmap_tensor(cvk_ctx, p); + skip_tensor_lmem_size(cvk_ctx, dst->ifmap); + dst->weight = conv_weight_tensor(cvk_ctx, p); + skip_tensor_lmem_size(cvk_ctx, dst->weight); + dst->ofmap = conv_ofmap_tensor(cvk_ctx, p); + skip_tensor_lmem_size(cvk_ctx, dst->ofmap); + dst->bias = NULL; + dst->ps32_mode = 0; + if (p->using_bias) + { + dst->bias = conv_bias_tensor(cvk_ctx, p); + skip_tensor_lmem_size(cvk_ctx, dst->bias); + } + dst->w_is_const = 0; +} + +static void free_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *r, + const conv_param_t *p) +{ + if 
(p->using_bias && r->bias) + { + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->bias); + } + if (r->ofmap) + { + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ofmap); + } + if (r->weight) + { + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->weight); + } + if (r->ifmap) + { + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ifmap); + } +} + +static void init_conv_param(conv_param_t *p) +{ +retry: + p->input_n = 1; + p->input_c = 16; + p->input_h = 2; + p->input_w = 600; + + p->kh = 2; + p->kw = 16; + p->output_c = 16; + + p->stride_h = 1; + p->stride_w = 15; + p->ins_h = 0; + p->ins_w = 0; + p->ins_h_last = 0;; + p->ins_w_last = 0;; + p->dh = 1; + p->dw = 1; + + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + p->pad_top = 1; + p->pad_bot = 0; + p->pad_left = 0; + p->pad_right = 0; + + if (!conv_param_is_ok(p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p->using_bias = 0; + p->r_shift_m = 7; + p->bReLU_EN = 1; + + p->opd0_sign = 0; + p->opd1_sign = 1; + p->opd2_sign = 1; + + assert(p->opd1_sign == 1 && p->opd2_sign == 1); + + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); + +} + +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", "p->input_w = ", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} + +static int setup_conv( + conv_param_t *p_param, CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + u16_test_data.conv_input = alloc_input(p_param); + u16_test_data.conv_weight = alloc_weight(p_param); + u16_test_data.conv_bias = alloc_bias(p_param); + //p_param->r_shift_m = calc_rshift_m(p_param, s8_test_data.conv_weight); + u16_test_data.conv_output_ref = (uint16_t *)malloc(sizeof(uint16_t) * conv_output_size(p_param)); + if (!u16_test_data.conv_output_ref) + return -1; + + int ret = 
conv_ref(p_param, u16_test_data.conv_input, u16_test_data.conv_weight, u16_test_data.conv_bias, u16_test_data.conv_output_ref); + if (ret) + return ret; + + make_bmk_conv_param(cvk_ctx, &bmk_conv_param , p_param); + + bmk_conv_param_alloc_ok(&bmk_conv_param, p_param); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, bmk_conv_param.ifmap, (uint8_t *)u16_test_data.conv_input); + put_conv_weight(rt_handle, cvk_ctx, bmk_conv_param.weight, u16_test_data.conv_weight); + if (p_param->using_bias) + put_conv_bias(rt_handle, cvk_ctx, bmk_conv_param.bias, u16_test_data.conv_bias); + + return 0; +} + +void get_result(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + u16_test_data.conv_output = (uint16_t*) tensor_copy_l2g_d2s(rt_handle, cvk_ctx, bmk_conv_param.ofmap); +} + +void check_result() +{ + int has_error = array_cmp_int8( + "conv Comparing results ...\n", + (int8_t*)u16_test_data.conv_output_ref, (int8_t *)u16_test_data.conv_output, conv_output_size(&conv_param)*2); + + if (has_error) { + print_conv_param(&conv_param); + printf("Comparison FAILED\n"); + exit(-1); + } + +} + +void trigger_max_power(cvk_context_t *cvk_ctx) +{ + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &bmk_conv_param); + CVI_RT_Submit(cvk_ctx); +} + +void free_s8_data() +{ + free(u16_test_data.conv_input); + free(u16_test_data.conv_weight); + free(u16_test_data.conv_bias); + free(u16_test_data.conv_output); + free(u16_test_data.conv_output_ref); +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + for (int i = 0; i < 3; i++) { + for (int k = 0; k < 3; k++) { + printf("bf16 conv zero ratio test: ( %d ) ( %d )\n",i,k); + init_conv_param(&conv_param); + conv_param.izratio = i; + conv_param.kzratio = k; + ret |= setup_conv(&conv_param, rt_handle, cvk_ctx); + + trigger_max_power(cvk_ctx); + get_result(rt_handle, cvk_ctx); + check_result(); + + free_bmk_conv_param(cvk_ctx, &bmk_conv_param, &conv_param); + free_s8_data(); + } + } + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_bf16_depthwise.c b/cviruntime/test/181x/test_181x_bf16_depthwise.c new file mode 100644 index 000000000..e3339aab7 --- /dev/null +++ b/cviruntime/test/181x/test_181x_bf16_depthwise.c @@ -0,0 +1,472 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" +#include "test_native_ref.h" + +#define INVALIDE_STRIDE (-1) +typedef cvk_tiu_depthwise_pt_convolution_param_t param_t; +int random_seed; +static void print_pooling_param(param_t *p) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int kh = p->weight->shape.h; + int kw = p->weight->shape.w; + + printf(" Pooling parameters:\n"); + printf(" random_seed : %d \n", random_seed); + printf(" ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw); + printf(" opd0_sign = %d\n", p->ifmap->fmt == CVK_FMT_I8); + printf(" weight = (%d, %d)\n", kh, kw); + printf(" padding = (%d, %d, %d, %d)\n", + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right); + printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w); + printf(" ins0 = (%d, %d, %d, %d)\n", + p->ins_h, p->ins_last_h, p->ins_w, p->ins_last_w); + printf(" dilation = (%d, 
%d)\n",p->dilation_h, p->dilation_w); + printf(" rshift_bits = %d\n", p->rshift_bits); + printf(" relu_enable = %d\n", p->relu_enable); + printf(" res0_sign = %d\n", p->ofmap->fmt == CVK_FMT_I8); +} + +static uint16_t *alloc_input(param_t *p) +{ + uint64_t size = tl_shape_size(&p->ifmap->shape, p->ifmap->fmt); + uint16_t *data = (uint16_t *)malloc(size); + if (!data) + return NULL; + + for (uint64_t i = 0; i < size / sizeof(uint16_t); i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX/2; //5 ~ -5 + val = (float)(rand()-RAND_MAX2)*5 / (float)RAND_MAX; + data[i] = cvk_convert_fp32_bf16(val); + } + return data; +} + +static uint16_t *alloc_weight(param_t *p) +{ + uint64_t size = tl_shape_size(&p->weight->shape, p->weight->fmt); + uint16_t *data = (uint16_t *)malloc(size); + if (!data) + return NULL; + + for (uint64_t i = 0; i < size / sizeof(uint16_t); i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX/2; //5 ~ -5 + val = (float)(rand()-RAND_MAX2)*5 / (float)RAND_MAX; + data[i] = cvk_convert_fp32_bf16(val); + } + return data; +} + +static uint32_t *alloc_bias(param_t *p) +{ + int c = p->bias->shape.c; + uint32_t *bias = (uint32_t *)malloc(sizeof(uint32_t) * c); + if (!bias) + return NULL; + + for (int i = 0; i < c; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX/2; //2 ~ -2 + val = (float)(rand()-RAND_MAX2)*2 / (float)RAND_MAX; + bias[i] = cvk_convert_fp32_hex(val); + } + return bias; +} + +static uint16_t *alloc_output(param_t *p) +{ + uint64_t size = tl_shape_size(&p->ofmap->shape, p->ofmap->fmt); + return (uint16_t *)malloc(size * 2); +} + +static inline void bf16_relu(uint16_t *buf, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) + if (cvk_convert_bf16_fp32(buf[i]) < 0) + buf[i] = cvk_convert_fp32_bf16(0); +} + +static int index_get(int h, int w1, int w2) +{ + return h * w1 + w2; +} + +int native_pooling_avg_bf16( + const uint16_t* i_fmap, + const void* weight, + const uint32_t *bias, + uint16_t * o_fmap, + int input_n, int input_c, int input_h, int input_w, + int kh, int kw, + int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, + int stride_h, int stride_w, + int ins_h, int ins_w, + int ins_h_last, int ins_w_last, + int dh, int dw, + int const_weight) +{ + if (kh * kw <= 0) + return -1; + + uint16_t avg_const_weight = *(uint16_t *)weight; + uint16_t *weight_arr = (uint16_t*)weight; + int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); + int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); + int d_kh = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int d_kw = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int output_h = calc_output_hw(h_after, d_kh, stride_h); + int output_w = calc_output_hw(w_after, d_kw, stride_w); + float *avg_pooling_mac_a = (float *)malloc(d_kh * d_kw * sizeof(float)); + float *avg_pooling_mac_b = (float *)malloc(d_kh * d_kw * sizeof(float)); + + uint16_t *i_fmap_pad = NULL; + uint16_t *i_kmap_pad = NULL; + for (int n = 0; n < input_n; n++) { + if (const_weight == 0) + weight_arr = (uint16_t*)weight; + + for (int c = 0; c < input_c; ++c) { + fill_pad_fmap_bf16(i_fmap, &i_fmap_pad, 0, + pad_w_l, pad_w_r, pad_h_t, pad_h_b, + ins_h, ins_w, ins_h_last, ins_w_last, + input_h, input_w); + + //kernel_dilation( + if (const_weight == 0) + fill_pad_fmap_bf16( + (weight_arr ), &i_kmap_pad, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + float avg_pool_result; + for (int ph = 0; ph < output_h; ++ph) { + for (int pw = 0; pw < output_w; ++pw) { + int hstart = ph * stride_h; + int wstart = pw * stride_w; + int 
pool_index = index_get(ph, output_w, pw); + int mac_index = 0; + + for (int h = 0; h < d_kh; h++) { + for (int w = 0; w < d_kw; w++) { + int index = index_get((hstart+h), w_after, (w+wstart)); + mac_index = h*d_kw + w; + + avg_pooling_mac_a[mac_index] = cvk_convert_bf16_fp32(i_fmap_pad[index]); + + avg_pooling_mac_b[h*d_kw+w] = const_weight ? + cvk_convert_bf16_fp32(avg_const_weight) : cvk_convert_bf16_fp32(i_kmap_pad[mac_index]); + } + } + inner_float_product(avg_pooling_mac_a, avg_pooling_mac_b, d_kh * d_kw, + &avg_pool_result); + + if(bias) { + avg_pool_result += cvk_convert_hex_fp32(bias[c]); + } + *(o_fmap+pool_index) = cvk_convert_fp32_bf16(avg_pool_result); + } + } + weight_arr += kh * kw; + i_fmap += input_w * input_h; + o_fmap += output_w * output_h; + } + } + free(i_fmap_pad); + free(i_kmap_pad); + free(avg_pooling_mac_a); + free(avg_pooling_mac_b); + + return 0; +} + +static int compare_results( + param_t *p, + uint16_t input[], + uint16_t weight[], + uint32_t bias[], + uint16_t output[]) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int kh = p->weight->shape.h; + int kw = p->weight->shape.w; + + uint16_t *output_ref = alloc_output(p); + int ret = native_pooling_avg_bf16( + input, weight, p->bias ? bias : NULL, output_ref, + in, ic, ih, iw, kh, kw, + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right, + p->stride_h, p->stride_w, + p->ins_h, p->ins_w, p->ins_last_h, p->ins_last_w, + p->dilation_h, p->dilation_w, 0 + ); + if (ret) + goto fail_exit; + + if(p->relu_enable ) + bf16_relu(output_ref, tl_shape_size(&p->ofmap->shape, p->ofmap->fmt)); + + ret = array_cmp_int8( + "Comparing results ...\n", (int8_t*) output_ref, (int8_t*) output, + tl_shape_size(&p->ofmap->shape, p->ofmap->fmt)); + + if (ret) { + printf("Comparison FAILED!!!\n"); + print_pooling_param(p); + ret = -1; + } + +fail_exit: + free(output_ref); + + return ret; +} + +static int pooling_ih_ext(param_t *p, int ih) +{ + int ins = p->ins_h; + int ins_last = p->ins_last_h; + int pad = p->pad_top + p->pad_bottom; + return (ih - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_iw_ext(param_t *p, int iw) +{ + int ins = p->ins_w; + int ins_last = p->ins_last_w; + int pad = p->pad_left + p->pad_right; + return (iw - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_oh(param_t *p, int ih, int kh, int dh) +{ + int ih_ext = pooling_ih_ext(p, ih); + int d_h = (kh -1) * dh + 1; + return (ih_ext - d_h) / p->stride_h + 1; +} + +static int pooling_ow(param_t *p, int iw, int kw, int dw) +{ + int iw_ext = pooling_iw_ext(p, iw); + int d_w = (kw -1) * dw +1; + return (iw_ext - d_w) / p->stride_w + 1; +} + +static void free_depthwise_param( + cvk_context_t *cvk_ctx, + param_t *p) +{ + if (p->bias) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->bias); + + if (p->weight) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->weight); + + if (p->ifmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->ifmap); + + if (p->ofmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->ofmap); +} + +static param_t random_depthwise_param(cvk_context_t *cvk_ctx, int stride_w, int stride_h) +{ + param_t p; + + memset(&p, 0, sizeof(p)); + +retry: + random_seed = clock(); + srand(random_seed); + int using_bias = rand() % 2; + int n = rand() % 5 + 1; + int c = rand() % (3 * cvk_ctx->info.npu_num) + 1; + int ih = rand() % 30 + 3 + (INVALIDE_STRIDE == stride_h ? 0 : stride_h); + int iw = rand() % 30 + 6 + (INVALIDE_STRIDE == stride_w ? 
0 : stride_w); + int kh = rand() % 7 + 1; + int kw = rand() % 7 + 1; + + p.ins_h = rand() % kh; + p.ins_w = rand() % kw; + p.ins_last_h = rand() % kh; + p.ins_last_w = rand() % kw; + p.stride_h = INVALIDE_STRIDE == stride_h ? rand() % (kh) + 1 : stride_h; + p.stride_w = INVALIDE_STRIDE == stride_w ? rand() % (kh) + 1 : stride_w; + p.pad_top = rand() % kh; + p.pad_bottom = rand() % kh; + p.pad_left = rand() % kw; + p.pad_right = rand() % kw; + p.rshift_bits = rand() % 32; + p.dilation_h = rand()%4 + 1; + p.dilation_w = rand()%4 + 1; + + int oh = pooling_oh(&p, ih, kh, p.dilation_h); + int ow = pooling_ow(&p, iw, kw, p.dilation_w); + int d_kh = calc_dilute_hw(kh, p.dilation_h - 1, 0, 0, 0); + int d_kw = calc_dilute_hw(kw, p.dilation_w - 1, 0, 0, 0); + + cvk_tl_shape_t ofmap_shape; + ofmap_shape.n = n; + ofmap_shape.c = c; + ofmap_shape.h = oh; + ofmap_shape.w = ow; + cvk_tl_shape_t ifmap_shape; + ifmap_shape.n = n; + ifmap_shape.c = c; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + cvk_tl_shape_t weight_shape; + weight_shape.n = 1; + weight_shape.c = c; + weight_shape.h = kh; + weight_shape.w = kw; + cvk_tl_shape_t bias_shape; + bias_shape.n = 2; + bias_shape.c = c; + bias_shape.h = 1; + bias_shape.w = 1; + p.relu_enable = rand()%2; + + cvk_fmt_t ifmt = CVK_FMT_BF16; + p.ofmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ofmap_shape, CVK_FMT_BF16, 1); + p.ifmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ifmap_shape, ifmt, 1); + p.weight = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, weight_shape, CVK_FMT_BF16, 1); + p.bias = NULL; + if (using_bias) + p.bias = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, bias_shape, CVK_FMT_BF16, 0); + + if ((kh > pooling_ih_ext(&p, ih)) + || (kw > pooling_iw_ext(&p, iw)) + || (oh < d_kh) + || (ow < d_kw) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || !p.ofmap + || !p.ifmap + || !p.weight + || (using_bias && !p.bias)) { + printf("retry init_pooling_param\n"); + free_depthwise_param(cvk_ctx, &p); + goto retry; + } + return p; +} + +static void put_bias_tensor( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + uint32_t data[]) +{ + int c = tl->shape.c; + + uint16_t *hi_lo = (uint16_t *)malloc(2 * c * 2); + if (!hi_lo) + return; + + for (int i = 0; i < c; i++) { + hi_lo[i] = (data[i] >> 16) & 0xffff; + hi_lo[i + c] = (data[i] & 0xffff); + } + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl, (uint8_t *)hi_lo); + + free(hi_lo); +} + +static int _test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int stride_w, int stride_h) +{ + param_t param = random_depthwise_param(cvk_ctx, stride_w, stride_h); + //print_pooling_param(¶m); + uint16_t *input = alloc_input(¶m); + uint16_t *weight = alloc_weight(¶m); + uint32_t *bias = NULL; + if (param.bias) + bias = alloc_bias(¶m); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, param.ifmap, (uint8_t *)input); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, param.weight, (uint8_t *)weight); + if (param.bias) + put_bias_tensor(rt_handle, cvk_ctx, param.bias, bias); + + cvk_ctx->ops->tiu_pt_depthwise_convolution(cvk_ctx, ¶m); + uint16_t *output = (uint16_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, param.ofmap); + int ret = compare_results(¶m, input, weight, bias, output); + + free_depthwise_param(cvk_ctx, ¶m); + free(input); + free(weight); + free(bias); + free(output); + + return ret; +} + +static int test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) { + return _test_pooling(rt_handle, cvk_ctx, INVALIDE_STRIDE, INVALIDE_STRIDE); 
+} + +static int test_depthwise_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + for (uint64_t i = 0; i < 20; i++) + ret |= test_pooling(rt_handle, cvk_ctx); + + // test stride extend (0, 31] + int stride_list[] = {15, 16, 31}; + int stride_list_len = sizeof(stride_list) / sizeof(stride_list[0]); + + for (int stride_w_idx = 0; stride_w_idx < stride_list_len; stride_w_idx++) { + for (int stride_h_idx = 0; stride_h_idx < stride_list_len; stride_h_idx++) { + int stride_w = stride_list[stride_w_idx]; + int stride_h = stride_list[stride_h_idx]; + ret |= _test_pooling(rt_handle, cvk_ctx, stride_w, stride_h); + } + } + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + int round_mode; + round_mode = cvk_set_store_feround(); + ret |= test_depthwise_pooling(rt_handle, cvk_ctx); + cvk_restore_feround(round_mode); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_bf16_lut.c b/cviruntime/test/181x/test_181x_bf16_lut.c new file mode 100644 index 000000000..e04cfc9be --- /dev/null +++ b/cviruntime/test/181x/test_181x_bf16_lut.c @@ -0,0 +1,149 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static uint32_t channel = -1; //ops->lmem_alloc_tensor(cvk_ctx,ifmap_shape, fmt, 1); + cvk_tl_t *tl_table = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, table_shape, fmt, /*align*/1); + cvk_tl_t *tl_ofmap = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx,ofmap_shape, fmt, /*align*/1); + uint16_t *ofmap_data = NULL; + if (!tl_ifmap || !tl_table || !tl_ofmap) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_ifmap, (uint8_t *)ifmap_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_table, (uint8_t *)table_data); + + cvk_tiu_lookup_table_param_t p12; + p12.ofmap = tl_ofmap; + p12.ifmap = tl_ifmap; + p12.table = tl_table; + cvk_ctx->ops->tiu_lookup_table(cvk_ctx, &p12); + CVI_RT_Submit(cvk_ctx); + + ofmap_data = (uint16_t*)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_ofmap); + for (uint64_t i = 0; i < ofmap_size; i++) { + if (ofmap_data[i] != ref_data[i]) { + fprintf(stderr, + "comparing failed at ofmap_data[%" PRIu64 "], got %d, exp %d\n", + i, ofmap_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_ofmap); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_table); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_ifmap); + free(ofmap_data); + +fail_exit: + free(ifmap_data); + free(table_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + channel = cvk_ctx->info.npu_num; + + ret |= test_tl_lut(rt_handle, cvk_ctx); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_bf16_matrix_mac.c b/cviruntime/test/181x/test_181x_bf16_matrix_mac.c new file mode 100644 index 
000000000..21ee79017 --- /dev/null +++ b/cviruntime/test/181x/test_181x_bf16_matrix_mac.c @@ -0,0 +1,372 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tiu_matrix_multiplication_param_t param_t; +int random_seed; + +static uint64_t matrix_size(const cvk_ml_t *ml) +{ + + uint64_t row = ml->shape.n; + uint64_t col = ml->shape.col; + return row * col; +} + +static uint64_t res_size(param_t *p) +{ + return matrix_size(p->res); +} + +static uint16_t * alloc_left(param_t *p) +{ + uint64_t size = matrix_size(p->left); + uint16_t *buf = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!buf) + return NULL; + + for (uint64_t i = 0; i < size; i++) { + buf[i] = cvk_convert_fp32_bf16(i); + } + + return buf; +} + +static uint16_t * alloc_right(param_t *p) +{ + uint64_t size = matrix_size(p->right); + uint16_t *buf = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!buf) + return NULL; + + for (uint64_t i = 0; i < size; i++) { + float val = 0.01; + buf[i] = cvk_convert_fp32_bf16(i); + val += 0.01; + } + return buf; +} + +static uint32_t * alloc_bias(param_t *p) +{ + if (!p->bias) + return NULL; + + uint64_t size = matrix_size(p->bias); + uint32_t *buf = (uint32_t *)malloc(sizeof(uint32_t) * size); + if (!buf) + return NULL; + + for (uint64_t i = 0; i < size; i++) { + buf[i] = cvk_convert_fp32_hex(i); + } + return buf; +} + +static uint32_t * alloc_res(param_t *p) +{ + uint64_t size = res_size(p); + uint32_t *buf = (uint32_t *)malloc(sizeof(uint32_t) * size); + if (!buf) + return NULL; + + for (uint64_t i = 0; i < size; i++) { + buf[i] = cvk_convert_fp32_bf16(i); + } + return buf; +} + +static inline void bf16_relu(float *buf, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) + if (buf[i] < 0) + buf[i] = 0; +} + +static void matrix_mac_ref( + param_t *p, uint16_t left[], uint16_t right[], uint32_t bias[], uint32_t res[]) +{ + uint64_t size = res_size(p); + uint32_t left_col = p->left->shape.col; + uint32_t right_col = p->right->shape.col; + uint32_t res_row = p->left->shape.n; + uint32_t res_col = p->res->shape.col; + uint32_t left_c = p->left->shape.c; + uint32_t left_w = p->left->shape.w; + + float *tmp_res = (float *)malloc(sizeof(float) * size); + if (!tmp_res) + return; + + if (p->add_result) { + for (uint32_t i = 0; i < res_row * res_col; i++) + tmp_res[i] = cvk_convert_bf16_fp32(res[i]); + } else { + for (uint32_t i = 0; i < res_row * res_col; i++) + tmp_res[i] = 0; + } + for (uint32_t row = 0; row < res_row; row++) { + for (uint32_t col = 0; col < res_col; col++) { + for (uint32_t wi = 0; wi < left_w; wi++) { + for (uint32_t ci = 0; ci < left_c; ci++) { + if ((wi + (ci*left_w)) >= left_col) + continue; + uint32_t li = row * left_col + left_w * ci + wi; + uint32_t ri = (ci* left_w + wi )* right_col + col; + + float l = cvk_convert_bf16_fp32(left[li]); + float r = cvk_convert_bf16_fp32(right[ri]); + tmp_res[row * res_col + col] += l * r; + } + } + } + } + + if (p->bias && bias) { + for (uint32_t row = 0; row < res_row; row++) { + for (uint32_t col = 0; col < res_col; col++) { + float b = cvk_convert_hex_fp32(bias[col]); + tmp_res[row * res_col + col] += b; + } + } + } + + if (p->relu_enable) + bf16_relu(tmp_res, size); + + for (uint64_t i = 0; i < size; i++) { + res[i] = cvk_convert_fp32_bf16(tmp_res[i]); + } + free(tmp_res); +} + +static void put_bias( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_ml_t *ml, + uint32_t data[]) +{ + uint64_t size = ml->shape.col; + + uint16_t *tmp = (uint16_t 
*)malloc(sizeof(uint16_t) * size * 2); + if (!tmp) + return; + + for (uint64_t i = 0; i < size; i++) { + tmp[i] = (data[i] >> 16) & 0xFFFF; + tmp[i + size] = (data[i] & 0xFFFF); + } + + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, ml, (uint8_t*)tmp); + + free(tmp); +} + +static void put_res( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_ml_t *ml, + uint32_t data[]) +{ + uint64_t size = ml->shape.n * ml->shape.col; + + uint16_t *tmp = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!tmp) + return; + + for (uint64_t i = 0; i < size; i++) { + tmp[i] = (data[i] & 0xFFFF); + } + + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, ml, (uint8_t*)tmp); + + free(tmp); +} + +static uint32_t * get_res( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + param_t *p) +{ + uint64_t size = res_size(p); + uint32_t *res = (uint32_t *)malloc(sizeof(uint32_t) * size); + if (!res) + return NULL; + + uint16_t *tmp = (uint16_t *)matrix_copy_l2g_d2s(rt_handle, cvk_ctx, p->res); + if (tmp) { + for (uint64_t i = 0; i < size; i++) + res[i] = tmp[i]; + + free(tmp); + } else { + free(res); + res = NULL; + } + + return res; +} + +static void test_param(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + uint16_t *left = alloc_left(p); + uint16_t *right = alloc_right(p); + uint32_t *bias = alloc_bias(p); + uint32_t *ref = alloc_res(p); + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, p->left, (uint8_t*)left); + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, p->right, (uint8_t*)right); + if (bias) + put_bias(rt_handle, cvk_ctx, p->bias, bias); + if (p->add_result) + put_res(rt_handle, cvk_ctx, p->res, ref); + + cvk_ctx->ops->tiu_matrix_multiplication(cvk_ctx, p); + uint32_t *res = get_res(rt_handle, cvk_ctx, p); + matrix_mac_ref(p, left, right, bias, ref); + uint64_t size = res_size(p); + for (uint64_t i = 0; i < size; i++) { + if (res[i] != ref[i]) { + fprintf(stderr, "comparing failed at out[%" PRIu64 "], got %x, exp %x\n", + i, res[i], ref[i]); + fprintf(stderr, "random_seed=%d\n", random_seed); + exit(-1); + } + } + free(left); + free(right); + free(bias); + free(ref); + free(res); +} + +static void destroy_param(cvk_context_t *cvk_ctx, param_t *p) +{ + if (p->bias) + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->bias); + if (p->res) + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->res); + if (p->right) + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->right); + if (p->left) + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->left); +} + +static cvk_ml_t *alloc_param_res( + cvk_context_t *cvk_ctx, param_t *p) +{ + cvk_ml_shape_t s; + + s.n = p->left->shape.n; + s.c = p->right->shape.c; + s.w = p->right->shape.w; + s.col = p->right->shape.col; + cvk_fmt_t fmt = CVK_FMT_BF16; + return cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, s, fmt, 1); +} + +static param_t param_0(cvk_context_t *cvk_ctx) +{ + +retry: + random_seed = clock(); + srand(random_seed); + + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = rand()%2; + p.add_result = 0; /*bf16 HW does not support add_result*/ + p.ps32_mode = 0; + + uint32_t left_row = rand() % 100 +1; + uint32_t left_col = rand() % 100 + 1; + uint32_t left_w = rand() % (left_col/5+1) + 1; // c is generate by w, and make c is larger + uint32_t left_c = left_col / left_w + (left_col % left_w ? 1: 0); + + uint32_t right_row = left_col; + uint32_t right_col = rand() % 100 + 1; + uint32_t right_w = (rand() % (right_col/5+1) + 1); // make c is larger + uint32_t right_c = right_col / right_w + (right_col % right_w ? 
1: 0) ; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + uint32_t bias = rand()%2; + p.bias = NULL; + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_BF16, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_BF16, 1); + if (!p.left || !p.right) { + printf("retry init_matrix_param\n"); + destroy_param(cvk_ctx, &p); + goto retry; + } + + p.res = alloc_param_res(cvk_ctx, &p); + if (bias) { + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_BF16, 1); + } + + if (!p.res || (bias && !p.bias)) { + printf("retry init_matrix_param\n"); + destroy_param(cvk_ctx, &p); + goto retry; + } + + return p; +} + + +#define test_one_param(n) \ + do { \ + param_t p = param_##n(cvk_ctx); \ + test_param(rt_handle, cvk_ctx, &p); \ + destroy_param(cvk_ctx, &p); \ + } while (0) + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + int round_mode; + round_mode = cvk_set_store_feround(); + + for (int i = 0 ; i < 30 ; i++) + test_one_param(0); + + cvk_restore_feround(round_mode); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return 0; +} diff --git a/cviruntime/test/181x/test_181x_bf16_matrix_mac_ps32.c b/cviruntime/test/181x/test_181x_bf16_matrix_mac_ps32.c new file mode 100644 index 000000000..1a0957cfb --- /dev/null +++ b/cviruntime/test/181x/test_181x_bf16_matrix_mac_ps32.c @@ -0,0 +1,571 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" +#include "test_native_ref.h" + +typedef cvk_tiu_matrix_multiplication_param_t param_t; + +typedef struct{ + uint32_t left_sign; + uint32_t left_row ; + uint32_t left_col ; + uint32_t left_c ; + uint32_t left_w ; + uint32_t right_sign; + uint32_t right_row ; + uint32_t right_col ; + uint32_t right_c ; + uint32_t right_w ; + uint32_t lshift_bits ; + uint32_t rshift_bits ; + uint32_t relu_enable ; + uint32_t using_bias; + uint32_t bias_sign; +} matrix_init_para_t; + +uint32_t random_seed; +matrix_init_para_t matrix_para_t; + +static void make_bmk_matrix_param_ps32(cvk_context_t *cvk_ctx, param_t *p, int ps32_mode); +static param_t param_init(); + +void print_param(param_t *p) +{ + printf("random_seed =%d\n", random_seed); + printf("ps32_mode =%d\n",p->ps32_mode); + printf("left_shape.n =%d\n",p->left->shape.n); + printf("left_shape.col =%d\n",p->left->shape.col); + printf("left_shape.c =%d\n",p->left->shape.c); + printf("left_shape.w =%d\n",p->left->shape.w); + printf("left_fmt =%d\n",p->left->fmt); + printf("right_shape.n =%d\n",p->right->shape.n); + printf("right_shape.col =%d\n",p->right->shape.col); + printf("right_shape.c =%d\n",p->right->shape.c); + printf("right_shape.w =%d\n",p->right->shape.w); + printf("right_fmt =%d\n",p->right->fmt); + if(p->bias) + { + printf("bias_shape.n =%d\n",p->bias->shape.n); + printf("bias_shape.col =%d\n",p->bias->shape.col); + printf("bias_shape.c =%d\n",p->bias->shape.c); + printf("bias_shape.w =%d\n",p->bias->shape.w); + 
printf("bias_fmt =%d\n",p->bias->fmt); + } + printf("result_shape.n =%d\n",p->res->shape.n); + printf("result_shape.col =%d\n",p->res->shape.col); + printf("result_shape.c =%d\n",p->res->shape.c); + printf("result_shape.w =%d\n",p->res->shape.w); + printf("result_fmt =%d\n",p->res->fmt); + printf("relu_enable=%d\n",p->relu_enable); + printf("rshift_bits=%d\n",p->rshift_bits); +} + + +static uint64_t matrix_size(const cvk_ml_t *ml) +{ + uint64_t row = ml->shape.n; + uint64_t col = ml->shape.col; + return row * col; +} + +static uint64_t res_ps32_size(param_t *p) +{ + return matrix_size(p->res); +} + +static uint64_t res_size(param_t *p) +{ + return matrix_size(p->res); +} + +static uint16_t * alloc_left(param_t *p) +{ + uint64_t size = matrix_size(p->left); + uint16_t *buf = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!buf) + return NULL; + + for (uint64_t i = 0; i < size; i++) + buf[i] = cvk_convert_fp32_bf16(i); + + return buf; +} + +static uint16_t * alloc_right(param_t *p) +{ + uint64_t size = matrix_size(p->right); + + uint16_t *buf = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!buf) + return NULL; + + for (uint64_t i = 0; i < size; i++) + buf[i] = cvk_convert_fp32_bf16(i); + + return buf; +} +static uint32_t * alloc_bias(param_t *p) +{ + if (!p->bias) + return NULL; + + uint64_t size = matrix_size(p->bias) / 2; + + uint32_t *buf = (uint32_t *)malloc(sizeof(uint32_t) * size); + if (!buf) + return NULL; + + for (uint64_t i = 0; i < size; i++) + buf[i] = cvk_convert_fp32_hex(i); + + return buf; +} + +static uint16_t * alloc_ps32_res(param_t *p) +{ + uint64_t size = res_ps32_size(p)*2; + uint16_t *buf = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!buf) + return NULL; + + for (uint64_t i = 0; i < size; i++) + buf[i] = cvk_convert_fp32_bf16(i); + + return buf; +} + +static inline void bf16_relu(float *buf, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) + if (buf[i] < 0) + buf[i] = 0; +} + +static int ps32_m2_matrix_mac_ref( + param_t *p, + uint16_t *left, + uint16_t *right, + uint16_t *res) +{ + uint64_t size = res_ps32_size(p); + uint32_t left_col = p->left->shape.col; + uint32_t right_col = p->right->shape.col; + uint32_t res_row = p->left->shape.n; + uint32_t res_col = p->res->shape.col; + uint32_t left_c = p->left->shape.c; + uint32_t left_w = p->left->shape.w; + + int ret = 0; + int bstride = res_row * res_col; + + float *tmp_res = (float *)malloc(sizeof(float) * size); + if (!tmp_res) + return -1; + + for (uint32_t i = 0; i < res_row * res_col; i++) + tmp_res[i] = 0; + for (uint32_t row = 0; row < res_row; row++) { + for (uint32_t col = 0; col < res_col; col++) { + for (uint32_t wi = 0; wi < left_w; wi++) { + for (uint32_t ci = 0; ci < left_c; ci++) { + if ((wi + (ci*left_w)) >= left_col) + continue; + uint32_t li = row * left_col + left_w * ci + wi; + uint32_t ri = (ci* left_w + wi )* right_col + col; + + float l = cvk_convert_bf16_fp32(left[li]); + float r = cvk_convert_bf16_fp32(right[ri]); + tmp_res[row * res_col + col] += l * r; + } + } + } + } + + for (uint64_t i = 0; i < size; i++) + res[i + bstride*0] = (cvk_convert_fp32_hex(tmp_res[i]) >> 16) & 0xFFFF; + for (uint64_t i = 0; i < size; i++) + res[i + bstride*1] = (cvk_convert_fp32_hex(tmp_res[i]) >> 0) & 0xFFFF; + + free(tmp_res); + + return ret; +} + +static int ps32_m3_matrix_mac_ref( + param_t *p, + uint16_t *left, + uint16_t *right, + uint16_t *res) +{ + uint64_t size = res_ps32_size(p); + uint32_t left_col = p->left->shape.col; + uint32_t right_col = p->right->shape.col; + uint32_t res_row = 
p->left->shape.n; + uint32_t res_col = p->res->shape.col; + uint32_t left_c = p->left->shape.c; + uint32_t left_w = p->left->shape.w; + + int ret = 0; + int bstride = res_row * res_col; + + float *tmp_res = (float *)malloc(sizeof(float) * size); + if (!tmp_res) + return -1; + + for (uint64_t i = 0; i < size; i++) + tmp_res[i] = cvk_convert_hex_fp32((res[i + bstride*0] << 16) | res[i + bstride*1]); + + for (uint32_t row = 0; row < res_row; row++) { + for (uint32_t col = 0; col < res_col; col++) { + for (uint32_t wi = 0; wi < left_w; wi++) { + for (uint32_t ci = 0; ci < left_c; ci++) { + if ((wi + (ci*left_w)) >= left_col) + continue; + uint32_t li = row * left_col + left_w * ci + wi; + uint32_t ri = (ci* left_w + wi )* right_col + col; + + float l = cvk_convert_bf16_fp32(left[li]); + float r = cvk_convert_bf16_fp32(right[ri]); + tmp_res[row * res_col + col] += l * r; + } + } + } + } + + for (uint64_t i = 0; i < size; i++) + res[i + bstride*0] = (cvk_convert_fp32_hex(tmp_res[i]) >> 16) & 0xFFFF; + for (uint64_t i = 0; i < size; i++) + res[i + bstride*1] = (cvk_convert_fp32_hex(tmp_res[i]) >> 0) & 0xFFFF; + + free(tmp_res); + + return ret; +} + +static int ps32_m1_matrix_mac_ref( + param_t *p, + uint16_t *left, + uint16_t *right, + uint32_t * bias, + uint16_t *res) +{ + uint64_t size = res_ps32_size(p); + uint32_t left_col = p->left->shape.col; + uint32_t right_col = p->right->shape.col; + uint32_t res_row = p->left->shape.n; + uint32_t res_col = p->res->shape.col; + uint32_t left_c = p->left->shape.c; + uint32_t left_w = p->left->shape.w; + + int ret = 0; + int bstride = res_row * res_col; + + float *tmp_res = (float *)malloc(sizeof(float) * size); + if (!tmp_res) + return -1; + + for (uint64_t i = 0; i < size; i++) { + tmp_res[i] = cvk_convert_hex_fp32((res[i + bstride*0] << 16) | res[i + bstride*1]); + } + + for (uint32_t row = 0; row < res_row; row++) { + for (uint32_t col = 0; col < res_col; col++) { + for (uint32_t wi = 0; wi < left_w; wi++) { + for (uint32_t ci = 0; ci < left_c; ci++) { + if ((wi + (ci*left_w)) >= left_col) + continue; + uint32_t li = row * left_col + left_w * ci + wi; + uint32_t ri = (ci* left_w + wi )* right_col + col; + + float l = cvk_convert_bf16_fp32(left[li]); + float r = cvk_convert_bf16_fp32(right[ri]); + tmp_res[row * res_col + col] += l * r; + } + } + } + } + + if (p->bias && bias) { + for (uint32_t row = 0; row < res_row; row++) { + for (uint32_t col = 0; col < res_col; col++) { + float b = cvk_convert_hex_fp32(bias[col]); + tmp_res[row * res_col + col] += b; + } + } + } + + if (p->relu_enable) + bf16_relu(tmp_res, size); + + for (uint64_t i = 0; i < size; i++) + res[i] = cvk_convert_fp32_bf16(tmp_res[i]); + + free(tmp_res); + + return ret; +} + +static void put_bias( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_ml_t *ml, + uint32_t data[]) +{ + uint64_t size = ml->shape.col; + + uint16_t *tmp = (uint16_t *)malloc(sizeof(uint16_t) * size * 2); + if (!tmp) + return; + + for (uint64_t i = 0; i < size; i++) { + tmp[i] = data[i] >> 16; + tmp[i + size] = data[i] & 0xFFFF; + } + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, ml, (uint8_t*) tmp); + + free(tmp); +} + + +static int test_matrix_ps32_ut(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + int ret = 0; + make_bmk_matrix_param_ps32(cvk_ctx, p, 2); + uint16_t *left = alloc_left(p); + uint16_t *right = alloc_right(p); + uint16_t *ref = alloc_ps32_res(p); + if (!left || !right || !ref) { + ret = -1; + goto fail_exit; + } + + { + ret = ps32_m2_matrix_mac_ref(p, left, right, ref); + 
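+      /* Stage 1 of 3: ps32_mode == 2 ("begin") leaves fp32 partial sums in
+         local memory, stored as separate high/low 16-bit halves;
+         ps32_m2_matrix_mac_ref above mirrors that split via its bstride
+         offsets, and the read-back below doubles shape.n to fetch both
+         halves. */
+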
if (ret) + goto fail_exit; + + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, p->left, (uint8_t*) left); + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, p->right, (uint8_t*) right); + cvk_ctx->ops->tiu_matrix_multiplication(cvk_ctx, p); + cvk_ml_t ps32_res; + ps32_res = *p->res; + ps32_res.shape.n *= sizeof(short); + uint16_t *res = (uint16_t*) matrix_copy_l2g_d2s(rt_handle, cvk_ctx, &ps32_res); + + ret = array_cmp_int8( + "Comparing begin_mode results ...\n", + (int8_t *)ref, (int8_t *)res ,(int)res_ps32_size(p)*sizeof(int)); + if (ret) { + printf("Comparison M2 FAILED\n"); + print_param(p); + ret = -1; + }else + printf("Comparison M2 PASS\n"); + free(res); + } + + { + make_bmk_matrix_param_ps32(cvk_ctx, p, 3); + + ret = ps32_m3_matrix_mac_ref(p, left, right, ref); + if (ret) + goto fail_exit; + + cvk_ctx->ops->tiu_matrix_multiplication(cvk_ctx, p); + cvk_ml_t ps32_res; + ps32_res = *p->res; + ps32_res.shape.n *= sizeof(short); + uint16_t *res = (uint16_t *) matrix_copy_l2g_d2s(rt_handle, cvk_ctx, &ps32_res); + + ret = array_cmp_int8( + "Comparing m3 results ...\n", + (int8_t *)ref, (int8_t *)res ,(int)res_ps32_size(p)*sizeof(int)); + if (ret) { + printf("Comparison M3 FAILED\n"); + print_param(p); + ret = -1; + }else + printf("Comparison M3 PASS\n"); + + free(res); + } + { + make_bmk_matrix_param_ps32(cvk_ctx, p, 1); + uint32_t *bias = alloc_bias(p); + + ret = ps32_m1_matrix_mac_ref(p, left, right, bias, ref); + if (ret) + goto fail_exit; + + if(p->bias) + put_bias(rt_handle, cvk_ctx, p->bias, bias); + + cvk_ctx->ops->tiu_matrix_multiplication(cvk_ctx, p); + cvk_ml_t ps32_res; + ps32_res = *p->res; + ps32_res.shape.n *= 2; + + uint16_t *res = (uint16_t *)matrix_copy_l2g_d2s(rt_handle, cvk_ctx, &ps32_res); + + ret = array_cmp_int8( + "Comparing m1 results ...\n", + (int8_t *)ref, (int8_t *)res ,(int)res_size(p)*2); + if (ret) { + printf("Comparison M1 FAILED\n"); + print_param(p); + ret = -1; + }else + printf("Comparison M1 PASS\n"); + + free(res); + free(bias); + } + +fail_exit: + free(left); + free(right); + free(ref); + + return ret; +} + +static void destroy_param(cvk_context_t *cvk_ctx, param_t *p) +{ + if (p->bias) + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->bias); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->res); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->right); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->left); +} + +static cvk_ml_t *alloc_param_res( + cvk_context_t *cvk_ctx, param_t *p) +{ + cvk_ml_shape_t s; + s.n = p->left->shape.n; + s.c = p->right->shape.c; + s.w = p->right->shape.w; + s.col = p->right->shape.col; + + cvk_fmt_t fmt = CVK_FMT_BF16; + return cvk_ctx->ops->lmem_alloc_ps32_matrix(cvk_ctx, s, fmt, 1); +} + + +static void make_bmk_matrix_param_ps32(cvk_context_t *cvk_ctx, param_t *p, int ps32_mode) +{ + + cvk_ml_shape_t left_shape; + cvk_ml_shape_t right_shape; + + p->ps32_mode = ps32_mode; + p->relu_enable = 0; + p->lshift_bits = 0; + p->rshift_bits = 0; + if(ps32_mode==2) + { + left_shape.n = matrix_para_t.left_row; + left_shape.c = matrix_para_t.left_c; + left_shape.w = matrix_para_t.left_w; + left_shape.col = matrix_para_t.left_col; + + right_shape.n = matrix_para_t.right_row; + right_shape.c = matrix_para_t.right_c; + right_shape.w = matrix_para_t.right_w; + right_shape.col = matrix_para_t.right_col; + p->left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_BF16, 1); + p->right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_BF16, 1); + p->bias = NULL; + p->res = alloc_param_res(cvk_ctx, p); + }else if(ps32_mode==3) + { + + }else 
if(ps32_mode==1)
+  {
+    p->relu_enable = matrix_para_t.relu_enable;
+    p->rshift_bits = matrix_para_t.rshift_bits;
+    if(matrix_para_t.using_bias)
+    {
+      right_shape.n = matrix_para_t.right_row;
+      right_shape.c = matrix_para_t.right_c;
+      right_shape.w = matrix_para_t.right_w;
+      right_shape.col = matrix_para_t.right_col;
+
+      cvk_ml_shape_t bias_shape = right_shape;
+      bias_shape.n = 2;
+      p->bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_BF16, 1);
+      assert(p->bias);
+    }
+  }
+  //print_param(p);
+}
+
+static param_t param_init(void)
+{
+  param_t p;
+
+  random_seed = clock();
+  srand(random_seed);
+
+  memset(&p, 0, sizeof(param_t));
+  memset(&matrix_para_t, 0, sizeof(matrix_init_para_t));
+
+  matrix_para_t.using_bias = rand()%2;
+  matrix_para_t.relu_enable = rand()%2;
+
+  matrix_para_t.left_row = rand()%60+1;
+  matrix_para_t.left_col = rand()%40+1;
+  matrix_para_t.left_w = matrix_para_t.left_col/0x10 ? (uint32_t)rand()%8+8 : matrix_para_t.left_col;
+  matrix_para_t.left_c =
+      matrix_para_t.left_col%matrix_para_t.left_w ?
+      matrix_para_t.left_col/matrix_para_t.left_w+1 : matrix_para_t.left_col/matrix_para_t.left_w;
+
+  matrix_para_t.right_row = matrix_para_t.left_col;
+  matrix_para_t.right_col = rand()%50+1;
+  matrix_para_t.right_w = rand()%16+1;
+  matrix_para_t.right_c =
+      matrix_para_t.right_col%matrix_para_t.right_w ?
+      matrix_para_t.right_col/matrix_para_t.right_w+1 : matrix_para_t.right_col/matrix_para_t.right_w;
+  return p;
+}
+
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+  int round_mode;
+  round_mode = cvk_set_store_feround();
+
+  for (int i = 0; i < 30; i++) {
+    printf("random_test_matrix_ps32 iteration: %d\n", i);
+    param_t p = param_init();
+
+    ret |= test_matrix_ps32_ut(rt_handle, cvk_ctx, &p);
+    destroy_param(cvk_ctx, &p);
+  }
+
+  cvk_restore_feround(round_mode);
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/181x/test_181x_bf16_max_pooling.c b/cviruntime/test/181x/test_181x_bf16_max_pooling.c
new file mode 100644
index 000000000..553499ce8
--- /dev/null
+++ b/cviruntime/test/181x/test_181x_bf16_max_pooling.c
@@ -0,0 +1,345 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <inttypes.h>
+#include <float.h>
+#include <assert.h>
+
+#include "test_cvikernel_util.h"
+#include "test_native_ref.h"
+
+#define INVALIDE_STRIDE (-1)
+typedef cvk_tiu_max_pooling_param_t param_t;
+int random_seed;
+static void print_pooling_param(param_t *p)
+{
+  int in = p->ifmap->shape.n;
+  int ic = p->ifmap->shape.c;
+  int ih = p->ifmap->shape.h;
+  int iw = p->ifmap->shape.w;
+  int on = p->ofmap->shape.n;
+  int oc = p->ofmap->shape.c;
+  int oh = p->ofmap->shape.h;
+  int ow = p->ofmap->shape.w;
+
+  printf("  Pooling parameters:\n");
+  printf("    random_seed : %d \n", random_seed);
+  printf("    ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw);
+  printf("    opd0_sign = %d\n", p->ifmap->fmt == CVK_FMT_I8);
+  printf("    weight = (%d, %d)\n", p->kh, p->kw);
+  printf("    padding = (%d, %d, %d, %d)\n",
+         p->pad_top, p->pad_bottom, p->pad_left, p->pad_right);
+  printf("    stride = (%d, %d)\n", p->stride_h, p->stride_w);
+  printf("    ofmap = (%d, %d, %d, %d)\n", on, oc, oh, ow);
+}
+
+static int pooling_ih_ext(param_t *p, int ih)
+{
+  int pad = p->pad_top + p->pad_bottom;
+  return ih + pad;
+}
+
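+/* The output extents computed by pooling_oh/pooling_ow below follow the
+ * usual pooling relation:
+ *   oh = (ih + pad_top + pad_bottom - kh) / stride_h + 1
+ *   ow = (iw + pad_left + pad_right - kw) / stride_w + 1
+ * e.g. ih = 6 with pads 3+3, kh = 3, stride_h = 3 gives oh = (12 - 3) / 3 + 1 = 4.
+ */
+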
+static int pooling_iw_ext(param_t *p, int iw) +{ + int pad = p->pad_left + p->pad_right; + return iw + pad; +} + +static int pooling_oh(param_t *p, int ih) +{ + int ih_ext = pooling_ih_ext(p, ih); + return (ih_ext - p->kh) / p->stride_h + 1; +} + +static int pooling_ow(param_t *p, int iw) +{ + int iw_ext = pooling_iw_ext(p, iw); + return (iw_ext - p->kw) / p->stride_w + 1; +} + +static uint16_t *alloc_input(param_t *p) +{ + uint64_t size = tl_shape_size(&p->ifmap->shape, p->ifmap->fmt); + uint16_t *data = (uint16_t *)malloc(size); + if (!data) + return NULL; + + for (uint64_t i = 0; i < size/sizeof(uint16_t); i++) { + float val; + int RAND_MAX2 = RAND_MAX/2; //100 ~ -100 + val = (float)(rand()-RAND_MAX2)*100 / (float)RAND_MAX; + data[i] = cvk_convert_fp32_bf16(val); + } + return data; +} + +static uint16_t *alloc_output(param_t *p) +{ + uint64_t size = tl_shape_size(&p->ofmap->shape, p->ofmap->fmt); + return (uint16_t *)malloc(size); +} + +static void free_pooling_param( + cvk_context_t *cvk_ctx, + param_t *r) +{ + if (r->ifmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ifmap); + if (r->ofmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ofmap); +} + +static param_t random_pooling_param(cvk_context_t *cvk_ctx, int stride_w, int stride_h) +{ + + param_t p; + + memset(&p, 0, sizeof(p)); + +retry: + random_seed = clock(); +// random_seed = 3058538; + srand(random_seed); + +#if 0 + int in = 1; + int ic = 1; + int ih = 6; + int iw = 6; + //int opd0_sign = rand() % 2; + + p.kh = 3; + p.kw = 3; + p.stride_h = p.kh; + p.stride_w = p.kw; + p.pad_top = 3;//rand() % p.kh; + p.pad_bottom = 3;//rand() % p.kh; + p.pad_left = 3;//rand() % p.kw; + p.pad_right = 3;//rand() % p.kw; + + cvk_tl_shape_t ifmap_shape; + ifmap_shape.n = in; + ifmap_shape.c = ic; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + cvk_tl_shape_t ofmap_shape; + ofmap_shape.n = in; + ofmap_shape.c = ic; + ofmap_shape.h = pooling_oh(&p, ih); + ofmap_shape.w = pooling_ow(&p, iw); + +#else + int in = rand() % 5 + 1; + int ic = rand() % (3 * cvk_ctx->info.npu_num) + 1; + int ih = rand() % 30 + 3 + (INVALIDE_STRIDE == stride_h ? 0 : stride_h); + int iw = rand() % 30 + 6 + (INVALIDE_STRIDE == stride_w ? 0 : stride_w); + //int opd0_sign = rand() % 2; + + p.kh = rand() % 5 + 1; + p.kw = rand() % 5 + 1; + p.stride_h = INVALIDE_STRIDE == stride_h ? rand() % (p.kh) + 1 : stride_h; + p.stride_w = INVALIDE_STRIDE == stride_w ? rand() % (p.kh) + 1 : stride_w; + p.pad_top = rand() % p.kh; + p.pad_bottom = rand() % p.kh; + p.pad_left = rand() % p.kw; + p.pad_right = rand() % p.kw; + + cvk_tl_shape_t ifmap_shape; + ifmap_shape.n = in; + ifmap_shape.c = ic; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + cvk_tl_shape_t ofmap_shape; + ofmap_shape.n = in; + ofmap_shape.c = ic; + ofmap_shape.h = pooling_oh(&p, ih); + ofmap_shape.w = pooling_ow(&p, iw); +#endif +// cvk_fmt_t fmt = opd0_sign? 
CVK_FMT_I8: CVK_FMT_U8; + p.ofmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ofmap_shape, CVK_FMT_BF16, 1); + p.ifmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ifmap_shape, CVK_FMT_BF16, 1); + + int RAND_MAX2 = RAND_MAX/2; //20 ~ -20 + float ins_val = (float)(rand()-RAND_MAX2)*20 / (float)RAND_MAX; + p.ins_fp = cvk_convert_fp32_bf16(ins_val); + + if ((p.kh > pooling_ih_ext(&p, ih)) + || (p.kw > pooling_iw_ext(&p, iw)) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || (p.kh * p.kw == 1) + || !p.ofmap || !p.ifmap) { + printf("retry init_pooling_param\n"); + free_pooling_param(cvk_ctx, &p); + goto retry; + } + + return p; +} +static int index_get(int h, int w1, int w2) +{ + return h * w1 + w2; +} + +int native_pooling_max_bf16( + const uint16_t* i_fmap, + uint16_t* o_fmap, + int input_n, int input_c, int input_h, int input_w, + int kh, int kw, + int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, + int stride_h, int stride_w, + int ins_h, int ins_w, + int ins_h_last, int ins_w_last, + uint16_t ins_fp + ) +{ + if (ins_h != 0 || ins_w != 0 || ins_h_last != 0 || ins_w_last !=0) + return -1; + + int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); + int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); + + int output_h = calc_output_hw(h_after, kh, stride_h); + int output_w = calc_output_hw(w_after, kw, stride_w); + + const float max_init = -FLT_MAX;//cvk_convert_bf16_fp32(ins_fp); + uint16_t *i_fmap_pad = NULL; + for (int nc = 0; nc < input_n * input_c; nc++) { + fill_pad_fmap_bf16(i_fmap, &i_fmap_pad, ins_fp, + pad_w_l, pad_w_r, pad_h_t, pad_h_b, + 0, 0, 0, 0, input_h, input_w); + + for (int ph = 0; ph < output_h; ++ph) { + for (int pw = 0; pw < output_w; ++pw) { + int hstart = ph * stride_h; + int wstart = pw * stride_w; + int pool_index = index_get(ph, output_w, pw); + float max = max_init; + for (int h = 0; h < kh; h++) { + for (int w = 0; w < kw; w++) { + int index = index_get((hstart + h), (input_w + pad_w_l + pad_w_r), + (w + wstart)); + float val = cvk_convert_bf16_fp32(i_fmap_pad[index]); + max = (val > max)? 
val: max;
+          }
+        }
+        o_fmap[pool_index] = cvk_convert_fp32_bf16(max);
+      }
+    }
+    i_fmap += input_w * input_h;
+    o_fmap += output_w * output_h;
+  }
+  free(i_fmap_pad);
+
+  return 0;
+}
+
+
+static int compare_results(
+    param_t *p,
+    uint16_t input[],
+    uint16_t output[])
+{
+  int ret = 0;
+  int in = p->ifmap->shape.n;
+  int ic = p->ifmap->shape.c;
+  int ih = p->ifmap->shape.h;
+  int iw = p->ifmap->shape.w;
+
+  uint16_t *output_ref = alloc_output(p);
+  ret = native_pooling_max_bf16(
+      input, output_ref, in, ic, ih, iw, p->kh, p->kw,
+      p->pad_top, p->pad_bottom, p->pad_left, p->pad_right,
+      p->stride_h, p->stride_w, 0, 0, 0, 0, p->ins_fp);
+  if (ret)
+    goto fail_exit;
+
+  ret = array_cmp_int8(
+      "Comparing results ...\n", (int8_t *)output_ref, (int8_t *)output,
+      tl_shape_size(&p->ofmap->shape, p->ofmap->fmt));
+  if (ret) {
+    printf("Comparison FAILED!!!\n");
+    print_pooling_param(p);
+    ret = -1;
+  }
+
+fail_exit:
+  free(output_ref);
+
+  return ret;
+}
+
+static int _test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int stride_w, int stride_h)
+{
+  param_t param = random_pooling_param(cvk_ctx, stride_w, stride_h);
+  //print_pooling_param(&param);
+  uint16_t *input = alloc_input(&param);
+  tensor_copy_s2d_g2l(rt_handle, cvk_ctx, param.ifmap, (uint8_t *)input);
+  cvk_ctx->ops->tiu_max_pooling(cvk_ctx, &param);
+
+  uint16_t *output = (uint16_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, param.ofmap);
+
+  int ret = compare_results(&param, input, output);
+
+  free_pooling_param(cvk_ctx, &param);
+  free(output);
+  free(input);
+
+  return ret;
+}
+
+
+static int test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) {
+  return _test_pooling(rt_handle, cvk_ctx, INVALIDE_STRIDE, INVALIDE_STRIDE);
+}
+
+static int test_max_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx)
+{
+  int ret = 0;
+  for (uint64_t i = 0; i < 20; i++)
+    ret |= test_pooling(rt_handle, cvk_ctx);
+
+  // test stride extend (0, 31]
+  int stride_list[] = {15, 16, 31};
+  int stride_list_len = sizeof(stride_list) / sizeof(stride_list[0]);
+
+  for (int stride_w_idx = 0; stride_w_idx < stride_list_len; stride_w_idx++) {
+    for (int stride_h_idx = 0; stride_h_idx < stride_list_len; stride_h_idx++) {
+      int stride_w = stride_list[stride_w_idx];
+      int stride_h = stride_list[stride_h_idx];
+
+      ret |= _test_pooling(rt_handle, cvk_ctx, stride_w, stride_h);
+    }
+  }
+
+  return ret;
+}
+
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  ret = test_max_pooling(rt_handle, cvk_ctx);
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/181x/test_181x_bf16_min_pooling.c b/cviruntime/test/181x/test_181x_bf16_min_pooling.c
new file mode 100644
index 000000000..71500317b
--- /dev/null
+++ b/cviruntime/test/181x/test_181x_bf16_min_pooling.c
@@ -0,0 +1,325 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <inttypes.h>
+#include <float.h>
+#include <assert.h>
+
+#include "test_cvikernel_util.h"
+#include "test_native_ref.h"
+
+typedef cvk_tiu_min_pooling_param_t param_t;
+int random_seed;
+static void print_pooling_param(param_t *p)
+{
+  int in = p->ifmap->shape.n;
+  int ic = p->ifmap->shape.c;
+  int ih = p->ifmap->shape.h;
+  int iw = p->ifmap->shape.w;
+  int on = p->ofmap->shape.n;
+  int oc = p->ofmap->shape.c;
+  int oh = 
p->ofmap->shape.h; + int ow = p->ofmap->shape.w; + + printf(" Pooling parameters:\n"); + printf(" random_seed : %d \n", random_seed); + printf(" ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw); + printf(" opd0_sign = %d\n", p->ifmap->fmt == CVK_FMT_I8); + printf(" weight = (%d, %d)\n", p->kh, p->kw); + printf(" padding = (%d, %d, %d, %d)\n", + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right); + printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w); + printf(" ofmap = (%d, %d, %d, %d)\n", on, oc, oh, ow); +} + +static int pooling_ih_ext(param_t *p, int ih) +{ + int pad = p->pad_top + p->pad_bottom; + return ih + pad; +} + +static int pooling_iw_ext(param_t *p, int iw) +{ + int pad = p->pad_left + p->pad_right; + return iw + pad; +} + +static int pooling_oh(param_t *p, int ih) +{ + int ih_ext = pooling_ih_ext(p, ih); + return (ih_ext - p->kh) / p->stride_h + 1; +} + +static int pooling_ow(param_t *p, int iw) +{ + int iw_ext = pooling_iw_ext(p, iw); + return (iw_ext - p->kw) / p->stride_w + 1; +} + +static uint16_t *alloc_input(param_t *p) +{ + uint64_t size = tl_shape_size(&p->ifmap->shape, p->ifmap->fmt); + uint16_t *data = (uint16_t *)malloc(size); + if (!data) + return NULL; + + for (uint64_t i = 0; i < size/sizeof(uint16_t); i++) { + float val; + int RAND_MAX2 = RAND_MAX/2; //100 ~ -100 + val = (float)(rand()-RAND_MAX2)*100 / (float)RAND_MAX; + data[i] = cvk_convert_fp32_bf16(val); + } + return data; +} + +static uint16_t *alloc_output(param_t *p) +{ + uint64_t size = tl_shape_size(&p->ofmap->shape, p->ofmap->fmt); + return (uint16_t *)malloc(size); +} + +static void free_pooling_param( + cvk_context_t *cvk_ctx, + param_t *r) +{ + if (r->ifmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ifmap); + if (r->ofmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ofmap); +} + +static param_t random_pooling_param(cvk_context_t *cvk_ctx) +{ + + param_t p; + + memset(&p, 0, sizeof(p)); + +retry: + random_seed = clock(); +// random_seed = 3058538; + srand(random_seed); + +#if 0 + int in = 1; + int ic = 1; + int ih = 6; + int iw = 6; + //int opd0_sign = rand() % 2; + + p.kh = 3; + p.kw = 3; + p.stride_h = p.kh; + p.stride_w = p.kw; + p.pad_top = 3;//rand() % p.kh; + p.pad_bottom = 3;//rand() % p.kh; + p.pad_left = 3;//rand() % p.kw; + p.pad_right = 3;//rand() % p.kw; + + cvk_tl_shape_t ifmap_shape; + ifmap_shape.n = in; + ifmap_shape.c = ic; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + cvk_tl_shape_t ofmap_shape; + ofmap_shape.n = in; + ofmap_shape.c = ic; + ofmap_shape.h = pooling_oh(&p, ih); + ofmap_shape.w = pooling_ow(&p, iw); + +#else + int in = rand() % 5 + 1; + int ic = rand() % (3 * cvk_ctx->info.npu_num) + 1; + int ih = rand() % 30 + 3; + int iw = rand() % 30 + 6; + //int opd0_sign = rand() % 2; + + p.kh = rand() % 5 + 1; + p.kw = rand() % 5 + 1; + p.stride_h = rand() % (p.kh) + 1; + p.stride_w = rand() % (p.kw) + 1; + p.pad_top = rand() % p.kh; + p.pad_bottom = rand() % p.kh; + p.pad_left = rand() % p.kw; + p.pad_right = rand() % p.kw; + + cvk_tl_shape_t ifmap_shape; + ifmap_shape.n = in; + ifmap_shape.c = ic; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + cvk_tl_shape_t ofmap_shape; + ofmap_shape.n = in; + ofmap_shape.c = ic; + ofmap_shape.h = pooling_oh(&p, ih); + ofmap_shape.w = pooling_ow(&p, iw); +#endif +// cvk_fmt_t fmt = opd0_sign? 
CVK_FMT_I8: CVK_FMT_U8; + p.ofmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ofmap_shape, CVK_FMT_BF16, 1); + p.ifmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ifmap_shape, CVK_FMT_BF16, 1); + + int RAND_MAX2 = RAND_MAX/2; //20 ~ -20 + float ins_val = (float)(rand()-RAND_MAX2)*20 / (float)RAND_MAX; + p.ins_fp = cvk_convert_fp32_bf16(ins_val); + + if ((p.kh > pooling_ih_ext(&p, ih)) + || (p.kw > pooling_iw_ext(&p, iw)) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || (p.kh * p.kw == 1) + || !p.ofmap || !p.ifmap) { + printf("retry init_pooling_param\n"); + free_pooling_param(cvk_ctx, &p); + goto retry; + } + + return p; +} +static int index_get(int h, int w1, int w2) +{ + return h * w1 + w2; +} + +int native_pooling_min_bf16( + const uint16_t* i_fmap, + uint16_t* o_fmap, + int input_n, int input_c, int input_h, int input_w, + int kh, int kw, + int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, + int stride_h, int stride_w, + int ins_h, int ins_w, + int ins_h_last, int ins_w_last, + uint16_t ins_fp + ) +{ + if (ins_h != 0 || ins_w != 0 || ins_h_last != 0 || ins_w_last !=0) + return -1; + + int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); + int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); + + int output_h = calc_output_hw(h_after, kh, stride_h); + int output_w = calc_output_hw(w_after, kw, stride_w); + + const float min_init = FLT_MAX;//cvk_convert_bf16_fp32(ins_fp); + uint16_t *i_fmap_pad = NULL; + for (int nc = 0; nc < input_n * input_c; nc++) { + fill_pad_fmap_bf16(i_fmap, &i_fmap_pad, ins_fp, + pad_w_l, pad_w_r, pad_h_t, pad_h_b, + 0, 0, 0, 0, input_h, input_w); + + for (int ph = 0; ph < output_h; ++ph) { + for (int pw = 0; pw < output_w; ++pw) { + int hstart = ph * stride_h; + int wstart = pw * stride_w; + int pool_index = index_get(ph, output_w, pw); + float min = min_init; + for (int h = 0; h < kh; h++) { + for (int w = 0; w < kw; w++) { + int index = index_get((hstart + h), (input_w + pad_w_l + pad_w_r), + (w + wstart)); + float val = cvk_convert_bf16_fp32(i_fmap_pad[index]); + min = (val < min)? 
val: min;
+          }
+        }
+        o_fmap[pool_index] = cvk_convert_fp32_bf16(min);
+      }
+    }
+    i_fmap += input_w * input_h;
+    o_fmap += output_w * output_h;
+  }
+  free(i_fmap_pad);
+
+  return 0;
+}
+
+
+static int compare_results(
+    param_t *p,
+    uint16_t input[],
+    uint16_t output[])
+{
+  int in = p->ifmap->shape.n;
+  int ic = p->ifmap->shape.c;
+  int ih = p->ifmap->shape.h;
+  int iw = p->ifmap->shape.w;
+
+  uint16_t *output_ref = alloc_output(p);
+  int ret = native_pooling_min_bf16(
+      input, output_ref, in, ic, ih, iw, p->kh, p->kw,
+      p->pad_top, p->pad_bottom, p->pad_left, p->pad_right,
+      p->stride_h, p->stride_w, 0, 0, 0, 0, p->ins_fp);
+  if (ret)
+    goto fail_exit;
+
+  ret = array_cmp_int8(
+      "Comparing results ...\n", (int8_t *)output_ref, (int8_t *)output,
+      tl_shape_size(&p->ofmap->shape, p->ofmap->fmt));
+  if (ret != 0) {
+    printf("Comparison FAILED!!!\n");
+    print_pooling_param(p);
+    ret = -1;
+  }
+
+fail_exit:
+  free(output_ref);
+
+  return ret;
+}
+
+static int test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx)
+{
+  param_t param = random_pooling_param(cvk_ctx);
+  //print_pooling_param(&param);
+  uint16_t *input = alloc_input(&param);
+  tensor_copy_s2d_g2l(rt_handle, cvk_ctx, param.ifmap, (uint8_t *)input);
+  cvk_ctx->ops->tiu_min_pooling(cvk_ctx, &param);
+
+  uint16_t *output = (uint16_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, param.ofmap);
+
+  int ret = compare_results(&param, input, output);
+
+  free_pooling_param(cvk_ctx, &param);
+  free(output);
+  free(input);
+
+  return ret;
+}
+
+static int test_min_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx)
+{
+  int ret = 0;
+  for (uint64_t i = 0; i < 20; i++)
+    ret |= test_pooling(rt_handle, cvk_ctx);
+
+  return ret;
+}
+
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  ret = test_min_pooling(rt_handle, cvk_ctx);
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/181x/test_181x_bf16_tensor_add.c b/cviruntime/test/181x/test_181x_bf16_tensor_add.c
new file mode 100644
index 000000000..7aa473255
--- /dev/null
+++ b/cviruntime/test/181x/test_181x_bf16_tensor_add.c
@@ -0,0 +1,142 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <inttypes.h>
+#include <assert.h>
+
+#include "test_cvikernel_util.h"
+
+static void tl_add_ref(
+    uint16_t *ref_low,
+    uint16_t *a_low,
+    uint16_t *b_low,
+    uint64_t size, int relu_enable)
+{
+  for (uint64_t i = 0; i < size; i++) {
+    float ta = cvk_convert_bf16_fp32(a_low[i]);
+    float tb = cvk_convert_bf16_fp32(b_low[i]);
+    float res = ta + tb;
+    if (relu_enable && res < 0)
+      res = 0;
+    ref_low[i] = cvk_convert_fp32_bf16(res);
+  }
+}
+
+static int test_tl_add(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align)
+{
+  int ret = 0;
+  int n = 2; // 3 -> 2 for 1810
+  int c = 39;
+  int h = 7;
+  int w = 37;
+  int rshift_bits;
+  cvk_tl_shape_t tl_shape;
+  tl_shape.n = n;
+  tl_shape.c = c;
+  tl_shape.h = h;
+  tl_shape.w = w;
+
+  cvk_fmt_t fmt_type = CVK_FMT_BF16;
+  uint32_t size = n * c * h * w;
+  uint32_t data_size = size * (fmt_type == CVK_FMT_BF16 ? 
2 : 1); + uint16_t *a_low_data = (uint16_t *)malloc(data_size); + uint16_t *b_low_data = (uint16_t *)malloc(data_size); + uint16_t *ref_low_data = (uint16_t *)malloc(data_size); + if (!a_low_data || !b_low_data || !ref_low_data) { + ret = -1; + goto fail_exit; + } + + cvk_tl_t *tl_a_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_b_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_res_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + uint16_t *res_low_data = NULL; + if (!tl_a_low || !tl_b_low || !tl_res_low) { + ret = -1; + goto fail_exit_2; + } + + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + for (uint64_t i = 0; i < size; i++) { + a_low_data[i] = cvk_convert_fp32_bf16(i); + b_low_data[i] = cvk_convert_fp32_bf16(i); + } + rshift_bits = 0; + + tl_add_ref(ref_low_data, + a_low_data, + b_low_data, + size, relu_enable); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_low, (uint8_t *)a_low_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b_low, (uint8_t *)b_low_data); + cvk_tiu_add_param_t p4; + p4.res_high = 0; + p4.res_low = tl_res_low; + p4.a_high = 0; + p4.a_low = tl_a_low; + p4.b_is_const = 0; + p4.b.high = 0; + p4.b.low = tl_b_low; + p4.rshift_bits = rshift_bits; + p4.relu_enable = relu_enable; + cvk_ctx->ops->tiu_add(cvk_ctx, &p4); + res_low_data = (uint16_t*)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_low); + + for (uint32_t i = 0; i < size; i++) { + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%u], got %x, exp %x\n", + i, res_low_data[i], ref_low_data[i]); + ret = -1; + break; + } + } + + free(res_low_data); + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_low); + +fail_exit: + free(a_low_data); + free(b_low_data); + free(ref_low_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + int round_mode; + round_mode = cvk_set_store_feround(); + + ret |= test_tl_add(rt_handle, cvk_ctx, 0); + ret |= test_tl_add(rt_handle, cvk_ctx, 1); + + cvk_restore_feround(round_mode); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_bf16_tensor_add_const.c b/cviruntime/test/181x/test_181x_bf16_tensor_add_const.c new file mode 100644 index 000000000..44f01467d --- /dev/null +++ b/cviruntime/test/181x/test_181x_bf16_tensor_add_const.c @@ -0,0 +1,133 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_add_const_ref( + uint16_t *ref_low, + uint16_t *a_low, + uint16_t b, + uint64_t size, int relu_enable) +{ + for (uint64_t i = 0; i < size; i++) { + float ta = cvk_convert_bf16_fp32(a_low[i]); + float tb = cvk_convert_bf16_fp32(b); + float res = ta + tb; + if(relu_enable && res <0) + res = 0; + ref_low[i] = cvk_convert_fp32_bf16(res); + } +} + +static int test_tl_add_const(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 3; + int c = 39; + int h = 7; + int w = 37; + cvk_tl_shape_t tl_shape; + 
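+  /* Flow of this test: build a bf16 reference on the host with
+     tl_add_const_ref, run tiu_add with b_is_const = 1 on the same input,
+     then require bit-exact agreement between the two uint16 streams. */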
tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + cvk_fmt_t fmt_type = CVK_FMT_BF16; + uint32_t size = n * c * h * w; + uint32_t data_size = size * (fmt_type == CVK_FMT_BF16 ? 2 : 1); + uint16_t *a_low_data = (uint16_t *)malloc(data_size); + uint16_t *ref_low_data = (uint16_t *)malloc(data_size); + if (!a_low_data || !ref_low_data) { + ret = -1; + goto fail_exit; + } + + cvk_tl_t *tl_a_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_res_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + if (!tl_a_low || !tl_res_low) { + ret = -1; + goto fail_exit_2; + } + + uint16_t b = cvk_convert_fp32_bf16(-3); + + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + + for (uint64_t i = 0; i < size; i++) { + a_low_data[i] = cvk_convert_fp32_bf16(i); + } + + tl_add_const_ref(ref_low_data, + a_low_data, + b, size,relu_enable); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_low, (uint8_t*) a_low_data); + + cvk_tiu_add_param_t p4; + p4.res_high = 0; + p4.res_low = tl_res_low; + p4.a_high = 0; + p4.a_low = tl_a_low; + p4.b_is_const = 1; + p4.b_const.val = b; +// p4.b_const.is_signed = b_is_signed; +// p4.rshift_bits = rshift_bits; + p4.relu_enable = relu_enable; + cvk_ctx->ops->tiu_add(cvk_ctx, &p4); + +// uint8_t *res_high_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_high); + uint16_t *res_low_data = (uint16_t *) tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_low); + for (uint64_t i = 0; i < size; i++) { + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + ret = -1; + break; + } + } + + free(res_low_data); + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_low); + +fail_exit: + free(a_low_data); + free(ref_low_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + + ret |= test_tl_add_const(rt_handle, cvk_ctx, 0); + ret |= test_tl_add_const(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_bf16_tensor_ge.c b/cviruntime/test/181x/test_181x_bf16_tensor_ge.c new file mode 100644 index 000000000..0c81a2221 --- /dev/null +++ b/cviruntime/test/181x/test_181x_bf16_tensor_ge.c @@ -0,0 +1,127 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_ge_ref(uint16_t *a, uint16_t *b, uint16_t *result, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) { + float fa = cvk_convert_bf16_fp32(a[i]); + float fb = cvk_convert_bf16_fp32(b[i]); + float fge; + if (fa >= fb) + fge = 1; + else + fge = 0; + result[i] = cvk_convert_fp32_bf16(fge); + } +} + +static int test_tl_ge(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 2; // 3 -> 2 for 1810 + int c = 39; + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + cvk_fmt_t fmt_type = CVK_FMT_BF16; + uint32_t size = n * c * h * w; + uint32_t data_size = size * (fmt_type == 
CVK_FMT_BF16 ? 2 : 1); + uint16_t *a_data = (uint16_t *)malloc(data_size); + uint16_t *b_data = (uint16_t *)malloc(data_size); + uint16_t *ref_data = (uint16_t *)malloc(data_size); + if (!a_data || !b_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) + a_data[i] = cvk_convert_fp32_bf16((int8_t)(i % 256)); + + for (uint32_t i = 0; i < size; i++) + b_data[i] = cvk_convert_fp32_bf16((int8_t)(100 - i % 256)); + + tl_ge_ref(a_data, b_data, ref_data, size); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_b = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_ge = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + if (!tl_a || !tl_b || !tl_ge) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b, (uint8_t *)b_data); + + cvk_tiu_ge_param_t p; + memset(&p, 0, sizeof(p)); + p.ge = tl_ge; + p.a = tl_a; + p.b_is_const = 0; + p.b = tl_b; + cvk_ctx->ops->tiu_ge(cvk_ctx, &p); + uint16_t *ge_data = (uint16_t*)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_ge); + + for (uint32_t i = 0; i < size; i++) { + if (ge_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%u], got %x, exp %x\n", + i, ge_data[i], ref_data[i]); + ret = -1; + break; + } + } + + free(ge_data); + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_ge); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + +fail_exit: + free(a_data); + free(b_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_ge(rt_handle, cvk_ctx, 0); + ret |= test_tl_ge(rt_handle, cvk_ctx, 1); + + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_bf16_tensor_ge_const.c b/cviruntime/test/181x/test_181x_bf16_tensor_ge_const.c new file mode 100644 index 000000000..91e787b56 --- /dev/null +++ b/cviruntime/test/181x/test_181x_bf16_tensor_ge_const.c @@ -0,0 +1,124 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_ge_const_ref(uint16_t *a, uint16_t b, uint16_t *result, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) { + float fa = cvk_convert_bf16_fp32(a[i]); + float fb = cvk_convert_bf16_fp32(b); + float fge; + if (fa >= fb) + fge = 1; + else + fge = 0; + result[i] = cvk_convert_fp32_bf16(fge); + } +} + +static int test_tl_ge_const(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + cvk_fmt_t fmt_type = CVK_FMT_BF16; + uint32_t size = n * c * h * w; + uint32_t data_size = size * (fmt_type == CVK_FMT_BF16 ? 
2 : 1); + + uint16_t *a_data = (uint16_t *)malloc(data_size); + uint16_t *ref_data = (uint16_t *)malloc(data_size); + if (!a_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) + a_data[i] = cvk_convert_fp32_bf16(i); + //a_data[i] = cvk_convert_fp32_bf16(rand()%100 - 50); + + uint16_t b = cvk_convert_fp32_bf16(20); + + tl_ge_const_ref(a_data, b, ref_data, size); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_ge = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + if (!tl_a || !tl_ge) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + cvk_tiu_ge_param_t p; + memset(&p, 0, sizeof(p)); + p.ge = tl_ge; + p.a = tl_a; + p.b_is_const = 1; + p.b_const.val = b; + p.b_const.is_signed = 1; + + cvk_ctx->ops->tiu_ge(cvk_ctx, &p); + + uint16_t *ge_data = (uint16_t*) tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_ge); + + for (uint64_t i = 0; i < size; i++) { + if (ge_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, ge_data[i], ref_data[i]); + ret = -1; + break; + } + } + + free(ge_data); + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_ge); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + +fail_exit: + free(a_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_ge_const(rt_handle, cvk_ctx, 0); + ret |= test_tl_ge_const(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_bf16_tensor_mac.c b/cviruntime/test/181x/test_181x_bf16_tensor_mac.c new file mode 100644 index 000000000..9fcd08114 --- /dev/null +++ b/cviruntime/test/181x/test_181x_bf16_tensor_mac.c @@ -0,0 +1,147 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_mac_ref( + uint16_t *ref, + uint16_t *a, uint16_t *b, uint16_t *c, + uint64_t size, int relu_enable) +{ + for (uint64_t i = 0; i < size; i++) { + float ta = cvk_convert_bf16_fp32(a[i]); + float tb = cvk_convert_bf16_fp32(b[i]); + float tc = cvk_convert_bf16_fp32(c[i]); + float res = ta * tb + tc; + + if(relu_enable) + if(res<0) + res=0; + ref[i] = cvk_convert_fp32_bf16(res); + } +} + +static int test_tl_mac(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int lshift_bits = 1; + int rshift_bits = 3; + int n = 2; // 3 -> 2 for 1810 + int c = 39; + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + cvk_fmt_t fmt_type = CVK_FMT_BF16; + + uint32_t size = n * c * h * w; + uint32_t data_size = size * (fmt_type == CVK_FMT_BF16 ? 
2 : 1); + uint16_t *a_data = (uint16_t *)malloc(data_size); + uint16_t *b_data = (uint16_t *)malloc(data_size); + uint16_t *c_data = (uint16_t *)malloc(data_size); + uint16_t *ref_data = (uint16_t *)malloc(data_size); + if (!a_data || !b_data || !c_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_b = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_c = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + if (!tl_a || !tl_b || !tl_c) { + ret = -1; + goto fail_exit_2; + } + + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + for (uint32_t i = 0; i < size; i++) { + a_data[i] = cvk_convert_fp32_bf16(rand()); + b_data[i] = cvk_convert_fp32_bf16(rand()); + c_data[i] = cvk_convert_fp32_bf16(rand()); + } + + tl_mac_ref(ref_data, + a_data, b_data, c_data, + size, relu_enable); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b, (uint8_t *)b_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_c, (uint8_t *)c_data); + + cvk_tiu_mac_param_t p2; + p2.res_high = 0; + p2.res_low = tl_c; + p2.res_is_int8 = relu_enable; + p2.a = tl_a; + p2.b_is_const = 0; + p2.b = tl_b; + p2.lshift_bits = lshift_bits; + p2.rshift_bits = rshift_bits; + p2.relu_enable = relu_enable; + cvk_ctx->ops->tiu_mac(cvk_ctx, &p2); + uint16_t *mac_data = (uint16_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_c); + + for (uint32_t i = 0; i < size; i++) { + if (mac_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at mac_data[%u], got %d, exp %d\n", + i, mac_data[i], ref_data[i]); + ret = -1; + break; + } + } + + free(mac_data); + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_c); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + +fail_exit: + free(a_data); + free(b_data); + free(c_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + int round_mode; + round_mode = cvk_set_store_feround(); + ret |= test_tl_mac(rt_handle, cvk_ctx, 0); + ret |= test_tl_mac(rt_handle, cvk_ctx, 1); + cvk_restore_feround(round_mode); + + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_bf16_tensor_mac_const.c b/cviruntime/test/181x/test_181x_bf16_tensor_mac_const.c new file mode 100644 index 000000000..155ee0e2a --- /dev/null +++ b/cviruntime/test/181x/test_181x_bf16_tensor_mac_const.c @@ -0,0 +1,143 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_mac_const_ref( + uint16_t *ref_low, + uint16_t *a, uint16_t b_const, + uint16_t *c_low, + uint64_t size, int relu_enable) +{ + for (uint64_t i = 0; i < size; i++) { + float ta = cvk_convert_bf16_fp32(a[i]); + float tb = cvk_convert_bf16_fp32(b_const); + float tc = cvk_convert_bf16_fp32(c_low[i]); + float res = ta * tb + tc; + + if(relu_enable) + { + if(res<0) + res=0; + } + ref_low[i] = cvk_convert_fp32_bf16(res); + } +} + +static int test_tl_mac_const(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) 
+{
+  int ret = 0;
+  int n = 3;
+  int c = 39;
+  int h = 7;
+  int w = 37;
+
+  cvk_tl_shape_t tl_shape;
+  tl_shape.n = n;
+  tl_shape.c = c;
+  tl_shape.h = h;
+  tl_shape.w = w;
+
+  cvk_fmt_t fmt_type = CVK_FMT_BF16;
+  uint32_t size = n * c * h * w;
+  uint32_t data_size = size * (fmt_type == CVK_FMT_BF16 ? 2 : 1);
+  uint16_t *a_data = (uint16_t *)malloc(data_size);
+  uint16_t *c_low_data = (uint16_t *)malloc(data_size);
+  uint16_t *ref_low_data = (uint16_t *)malloc(data_size);
+  if (!a_data || !c_low_data || !ref_low_data) {
+    ret = -1;
+    goto fail_exit;
+  }
+
+  cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align);
+  cvk_tl_t *tl_c_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align);
+  if (!tl_a || !tl_c_low) {
+    ret = -1;
+    goto fail_exit_2;
+  }
+
+  uint16_t b_const = cvk_convert_fp32_bf16(37);
+
+  for (int relu_enable = 0; relu_enable < 2; relu_enable++) {
+    for (uint32_t i = 0; i < size; i++) {
+      a_data[i] = cvk_convert_fp32_bf16(rand() % 256);
+      c_low_data[i] = cvk_convert_fp32_bf16(i);
+    }
+
+    tl_mac_const_ref(ref_low_data,
+                     a_data, b_const, c_low_data,
+                     size, relu_enable);
+
+    tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data);
+    tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_c_low, (uint8_t *)c_low_data);
+    cvk_tiu_mac_param_t p3;
+    memset(&p3, 0, sizeof(p3));  /* zero lshift_bits/rshift_bits and any field not set below */
+    p3.res_high = 0;
+    p3.res_low = tl_c_low;
+    p3.res_is_int8 = 1;//relu_enable;
+    p3.a = tl_a;
+    p3.b_is_const = 1;
+    p3.b_const.val = b_const;
+    p3.relu_enable = relu_enable;
+
+    cvk_ctx->ops->tiu_mac(cvk_ctx, &p3);
+    uint16_t *mac_low_data = (uint16_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_c_low);
+    for (uint64_t i = 0; i < size; i++) {
+      if (mac_low_data[i] != ref_low_data[i]) {
+        fprintf(stderr, "comparing failed at mac_low_data[%" PRIu64 "], got %d, exp %d\n",
+                i, mac_low_data[i], ref_low_data[i]);
+        ret = -1;
+        break;
+      }
+    }
+
+    free(mac_low_data);
+  }
+
+fail_exit_2:
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_c_low);
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a);
+
+
+fail_exit:
+  free(a_data);
+  free(c_low_data);
+  free(ref_low_data);
+
+  return ret;
+}
+
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  int round_mode;
+  round_mode = cvk_set_store_feround();
+
+  ret |= test_tl_mac_const(rt_handle, cvk_ctx, 0);
+  ret |= test_tl_mac_const(rt_handle, cvk_ctx, 1);
+
+  cvk_restore_feround(round_mode);
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/181x/test_181x_bf16_tensor_max.c b/cviruntime/test/181x/test_181x_bf16_tensor_max.c
new file mode 100644
index 000000000..1dc599148
--- /dev/null
+++ b/cviruntime/test/181x/test_181x_bf16_tensor_max.c
@@ -0,0 +1,126 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <inttypes.h>
+#include <assert.h>
+
+#include "test_cvikernel_util.h"
+
+static void tl_max_ref(uint16_t *a, uint16_t *b, uint16_t *max, uint64_t size)
+{
+  for (uint64_t i = 0; i < size; i++) {
+    float fa = cvk_convert_bf16_fp32(a[i]);
+    float fb = cvk_convert_bf16_fp32(b[i]);
+    float fmax;
+    if (fa > fb)
+      fmax = fa;
+    else
+      fmax = fb;
+    max[i] = cvk_convert_fp32_bf16(fmax);
+  }
+}
+
+static int test_tl_max(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align)
+{
+  int ret = 0;
+  int n = 2; // 3 -> 2 for 1810
+  int c = 
39; + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + cvk_fmt_t fmt_type = CVK_FMT_BF16; + uint32_t size = n * c * h * w; + uint32_t data_size = size * (fmt_type == CVK_FMT_BF16 ? 2 : 1); + uint16_t *a_data = (uint16_t *)malloc(data_size); + uint16_t *b_data = (uint16_t *)malloc(data_size); + uint16_t *ref_data = (uint16_t *)malloc(data_size); + if (!a_data || !b_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) + a_data[i] = cvk_convert_fp32_bf16((int8_t)(i % 256)); + + for (uint32_t i = 0; i < size; i++) + b_data[i] = cvk_convert_fp32_bf16((int8_t)(100 - i % 256)); + + tl_max_ref(a_data, b_data, ref_data, size); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_b = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_max = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + if (!tl_a || !tl_b || !tl_max) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b, (uint8_t *)b_data); + + cvk_tiu_max_param_t p; + memset(&p, 0, sizeof(p)); + p.max = tl_max; + p.a = tl_a; + p.b_is_const = 0; + p.b = tl_b; + cvk_ctx->ops->tiu_max(cvk_ctx, &p); + uint16_t *max_data = (uint16_t*)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_max); + + for (uint32_t i = 0; i < size; i++) { + if (max_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%u], got %x, exp %x\n", + i, max_data[i], ref_data[i]); + ret = -1; + break; + } + } + + free(max_data); + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_max); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + +fail_exit: + free(a_data); + free(b_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_max(rt_handle, cvk_ctx, 0); + ret |= test_tl_max(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_bf16_tensor_max_const.c b/cviruntime/test/181x/test_181x_bf16_tensor_max_const.c new file mode 100644 index 000000000..b4d60040c --- /dev/null +++ b/cviruntime/test/181x/test_181x_bf16_tensor_max_const.c @@ -0,0 +1,120 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_max_const_ref(uint16_t *a, uint16_t b, uint16_t *max, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) { + if (cvk_convert_bf16_fp32(a[i]) > cvk_convert_bf16_fp32(b)) + max[i] = a[i]; + else + max[i] = b; + } +} + +static int test_tl_max_const(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + cvk_fmt_t fmt_type = CVK_FMT_BF16; + uint32_t size = n * c * h * w; + uint32_t data_size = size * (fmt_type == CVK_FMT_BF16 ? 
2 : 1); + + uint16_t *a_data = (uint16_t *)malloc(data_size); + uint16_t *ref_data = (uint16_t *)malloc(data_size); + if (!a_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) + a_data[i] = cvk_convert_fp32_bf16(i); + //a_data[i] = cvk_convert_fp32_bf16(rand()%100 - 50); + + uint16_t b = cvk_convert_fp32_bf16(20); + + tl_max_const_ref(a_data, b, ref_data, size); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_max = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + uint16_t *max_data = NULL; + if (!tl_a || !tl_max) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + cvk_tiu_max_param_t p; + memset(&p, 0, sizeof(p)); + p.max = tl_max; + p.a = tl_a; + p.b_is_const = 1; + p.b_const.val = b; + p.b_const.is_signed = 1; + + cvk_ctx->ops->tiu_max(cvk_ctx, &p); + + max_data = (uint16_t*) tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_max); + + for (uint32_t i = 0; i < size; i++) { + if (max_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%u], got %x, exp %x\n", + i, max_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_max); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + free(max_data); + +fail_exit: + free(a_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_max_const(rt_handle, cvk_ctx, 0); + ret |= test_tl_max_const(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_bf16_tensor_min.c b/cviruntime/test/181x/test_181x_bf16_tensor_min.c new file mode 100644 index 000000000..0fa0e996d --- /dev/null +++ b/cviruntime/test/181x/test_181x_bf16_tensor_min.c @@ -0,0 +1,124 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_min_ref(uint16_t *a, uint16_t *b, uint16_t *max, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) { + float fa = cvk_convert_bf16_fp32(a[i]); + float fb = cvk_convert_bf16_fp32(b[i]); + float fmax; + if (fa > fb) + fmax = fb; + else + fmax = fa; + max[i] = cvk_convert_fp32_bf16(fmax); + } +} + +static int test_tl_min(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 2; // 3 -> 2 for 1810 + int c = 39; + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + cvk_fmt_t fmt_type = CVK_FMT_BF16; + uint32_t size = n * c * h * w; + uint32_t data_size = size * (fmt_type == CVK_FMT_BF16 ? 
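+  /* Every bf16 value is exactly representable in fp32, so tl_min_ref's
+   * bf16 -> fp32 -> bf16 round trip changes no bits and the comparison
+   * below can demand exact equality. */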
2 : 1); + + uint16_t *a_data = (uint16_t *)malloc(data_size); + uint16_t *b_data = (uint16_t *)malloc(data_size); + uint16_t *ref_data = (uint16_t *)malloc(data_size); + if (!a_data || !b_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) + a_data[i] = cvk_convert_fp32_bf16(rand()); + + for (uint32_t i = 0; i < size; i++) + b_data[i] = cvk_convert_fp32_bf16(rand()/2); + + tl_min_ref(a_data, b_data, ref_data, size); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_b = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_min = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + uint16_t *min_data = NULL; + if (!tl_a || !tl_b || !tl_min) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b, (uint8_t *)b_data); + cvk_tiu_min_param_t p6; + p6.min = tl_min; + p6.a = tl_a; + p6.b_is_const = 0; + p6.b = tl_b; + cvk_ctx->ops->tiu_min(cvk_ctx, &p6); + min_data = (uint16_t*)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_min); + + for (uint32_t i = 0; i < size; i++) { + if (min_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%u], got %x, exp %x\n", + i, min_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_min); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + free(min_data); + +fail_exit: + free(a_data); + free(b_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_min(rt_handle, cvk_ctx, 0); + ret |= test_tl_min(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_bf16_tensor_min_const.c b/cviruntime/test/181x/test_181x_bf16_tensor_min_const.c new file mode 100644 index 000000000..9a756dcc4 --- /dev/null +++ b/cviruntime/test/181x/test_181x_bf16_tensor_min_const.c @@ -0,0 +1,116 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_min_const_ref(uint16_t *a, uint16_t b, uint16_t *max, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) { + if (cvk_convert_bf16_fp32(a[i]) > cvk_convert_bf16_fp32(b)) + max[i] = b; + else + max[i] = a[i]; + } +} + +static int test_tl_min_const(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + cvk_fmt_t fmt_type = CVK_FMT_BF16; + uint32_t size = n * c * h * w; + uint32_t data_size = size * (fmt_type == CVK_FMT_BF16 ? 
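+  /* The scalar operand is passed as raw bf16 bits here: b_const.val
+   * below carries cvk_convert_fp32_bf16(20) directly. */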
2 : 1); + + uint16_t *a_data = (uint16_t *)malloc(data_size); + uint16_t *ref_data = (uint16_t *)malloc(data_size); + if (!a_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) + a_data[i] = cvk_convert_fp32_bf16(rand() % 100 -50); + + uint16_t b = cvk_convert_fp32_bf16(20); + + tl_min_const_ref(a_data, b, ref_data, size); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_min = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + uint16_t *min_data = NULL; + if (!tl_a || !tl_min) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + cvk_tiu_min_param_t p7; + p7.min = tl_min; + p7.a = tl_a; + p7.b_is_const = 1; + p7.b_const.val = b; + p7.b_const.is_signed = 1; + cvk_ctx->ops->tiu_min(cvk_ctx, &p7); + min_data = (uint16_t*) tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_min); + + for (uint32_t i = 0; i < size; i++) { + if (min_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%u], got %x, exp %x\n", + i, min_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_min); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + free(min_data); + +fail_exit: + free(a_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_min_const(rt_handle, cvk_ctx, 0); + ret |= test_tl_min_const(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_bf16_tensor_mul.c b/cviruntime/test/181x/test_181x_bf16_tensor_mul.c new file mode 100644 index 000000000..b6d21cfc0 --- /dev/null +++ b/cviruntime/test/181x/test_181x_bf16_tensor_mul.c @@ -0,0 +1,151 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_mul_ref(uint16_t *ofmap, uint16_t *a, uint16_t *b, uint64_t size, int shift_bits, int relu_enable, cvk_fmt_t fmt_type) +{ + if(fmt_type == CVK_FMT_BF16) { + for (uint64_t i = 0; i < size; i++) { + float tmp = cvk_convert_bf16_fp32(a[i]) * cvk_convert_bf16_fp32(b[i]); + if(relu_enable) + if(tmp<0) + tmp=0; + ofmap[i] = cvk_convert_fp32_bf16(tmp); + } + } else { + for (uint64_t i = 0; i < size; i++) { + int32_t tmp = a[i] * b[i]; + tmp += 1 << (shift_bits - 1); + tmp >>= shift_bits; + if (tmp > 127) + tmp = 127; + else if (tmp < -128) + tmp = -128; + if(relu_enable) + if(tmp<0) + tmp=0; + ofmap[i] = tmp; + } + } +} + +static int test_tl_mul(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 2; // 3 -> 2 for 1810 + int c = 39; + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + cvk_fmt_t fmt_type = CVK_FMT_BF16; + uint32_t size = n * c * h * w; + uint64_t data_size = size * (fmt_type == CVK_FMT_BF16 ? 
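+  /* tl_mul_ref's bf16 path multiplies in fp32 and converts back, i.e.
+   * a single rounding to bf16 -- presumably the same rounding the TIU
+   * applies, since the comparison below demands bit-exact results. */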
2 : 1); + int shift_bits = 1; + + uint16_t *a_data = (uint16_t *)malloc(data_size); + uint16_t *b_data = (uint16_t *)malloc(data_size); + uint16_t *ref_data = (uint16_t *)malloc(data_size); + if (!a_data || !b_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t relu_enable = 0; relu_enable < 2; relu_enable++) + { + for (uint32_t i = 0; i < size; i++) { + a_data[i] = cvk_convert_fp32_bf16(random()%0x10); + b_data[i] = cvk_convert_fp32_bf16(random()); + } + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_b = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + cvk_tl_t *tl_res_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + uint16_t *res_low_data = NULL; + if (!tl_a || !tl_b || !tl_res_low) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b, (uint8_t *)b_data); + + cvk_tiu_mul_param_t p1; + p1.res_high = NULL; + p1.res_low = tl_res_low; + p1.a = tl_a; + p1.b_is_const = 0; + p1.b = tl_b; + p1.rshift_bits = shift_bits; + p1.relu_enable = relu_enable; + cvk_ctx->ops->tiu_mul(cvk_ctx, &p1); + + res_low_data = (uint16_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_low); + + tl_mul_ref(ref_data, a_data, b_data, size, shift_bits, relu_enable, fmt_type); + + for (uint64_t i = 0; i < size; i++) { + if (res_low_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %x, exp %x\n", + i, res_low_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + free(res_low_data); + } + +fail_exit: + free(a_data); + free(b_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + int round_mode; + round_mode = cvk_set_store_feround(); + + ret |= test_tl_mul(rt_handle, cvk_ctx, 0); + ret |= test_tl_mul(rt_handle, cvk_ctx, 1); + + cvk_restore_feround(round_mode); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_bf16_tensor_mul_const.c b/cviruntime/test/181x/test_181x_bf16_tensor_mul_const.c new file mode 100644 index 000000000..1a509af80 --- /dev/null +++ b/cviruntime/test/181x/test_181x_bf16_tensor_mul_const.c @@ -0,0 +1,147 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_mul_const_ref( + uint16_t *ofmap, uint16_t *ifmap, uint64_t size, uint16_t mul_const, int shift_bits, int relu_enable, cvk_fmt_t fmt_type) +{ + + if(fmt_type == CVK_FMT_BF16) { + for (uint64_t i = 0; i < size; i++) { + float tmp = cvk_convert_bf16_fp32(ifmap[i]) * cvk_convert_bf16_fp32(mul_const); + if(relu_enable) + if(tmp<0) + tmp=0; + ofmap[i] = cvk_convert_fp32_bf16(tmp); + } + } else { + for (uint64_t i = 0; i < size; i++) { + int32_t tmp = ifmap[i] * (int16_t) mul_const; + tmp += 1 << (shift_bits - 1); + tmp >>= shift_bits; + if (tmp > 127) + tmp = 127; + else if (tmp < -128) + tmp = -128; + if(relu_enable) + if(tmp<0) + 
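+        /* int8 path: adding 1 << (shift_bits - 1) rounds half up before
+         * the arithmetic right shift, the result saturates to
+         * [-128, 127], and ReLU zeroes what is still negative,
+         * e.g. shift_bits = 1: 5 * 3 = 15 -> (15 + 1) >> 1 = 8. */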
tmp=0; + ofmap[i] = tmp; + } + } +} + +static int test_tl_mul_const(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + cvk_fmt_t fmt_type = CVK_FMT_BF16; + uint32_t data_size = size * (fmt_type == CVK_FMT_BF16 ? 2 : 1); + int shift_bits = 1; + + uint16_t *ifmap_data = (uint16_t *)malloc(data_size); + uint16_t *ref_data = (uint16_t *)malloc(data_size); + cvk_tl_t *tl_ifmap = NULL, *tl_ofmap = NULL; + if (!ifmap_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + tl_ifmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + tl_ofmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt_type, eu_align); + if (!tl_ifmap || !tl_ofmap) { + ret = -1; + goto fail_exit_2; + } + + for (uint32_t relu_enable = 0; relu_enable < 2; relu_enable++) + { + for (uint32_t i = 0; i < size; i++) + ifmap_data[i] = cvk_convert_fp32_bf16(random() % 256); + + uint16_t mul_const = cvk_convert_fp32_bf16(20); + + tl_mul_const_ref(ref_data, ifmap_data, size, mul_const, shift_bits, relu_enable, fmt_type); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_ifmap, (uint8_t *)ifmap_data); + + cvk_tiu_mul_param_t p; + memset(&p, 0, sizeof(p)); + p.res_high = NULL; + p.res_low = tl_ofmap; + p.a = tl_ifmap; + p.b_is_const = 1; + p.b_const.val = mul_const; + p.relu_enable = relu_enable; + + cvk_ctx->ops->tiu_mul(cvk_ctx, &p); + + uint16_t *ofmap_data = (uint16_t*) tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_ofmap); + + for (uint32_t i = 0; i < size; i++) { + if (ofmap_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%u], got %x(%f), exp %x(%f)\n", + i, + ofmap_data[i], cvk_convert_bf16_fp32(ofmap_data[i]), + ref_data[i], cvk_convert_bf16_fp32(ref_data[i])); + ret = -1; + break; + } + } + + free(ofmap_data); + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_ofmap); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_ifmap); + +fail_exit: + free(ifmap_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_mul_const(rt_handle, cvk_ctx, 0); + ret |= test_tl_mul_const(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_conv.c b/cviruntime/test/181x/test_181x_conv.c new file mode 100644 index 000000000..620058b45 --- /dev/null +++ b/cviruntime/test/181x/test_181x_conv.c @@ -0,0 +1,828 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" +#include "test_native_ref.h" + +#define INVALIDE_STRIDE (-1) +typedef struct { + int random_seed; + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int stride_w; + int output_c; + int using_bias; + int bReLU_EN; + int r_shift_m; + int opd0_sign; + int opd1_sign; + int opd2_sign; +} conv_param_t; + +static int index_get(int h, int w1, int 
w2)
+{
+  return h * w1 + w2;
+}
+
+static int matrix_dot_mult(
+    int8_t *A, int8_t *B, int dim_n, int dim_m,
+    int opd0_sign)
+{
+  int sum = 0;
+  /* accumulate A * B over the dim_n x dim_m patch; A is read as signed
+   * or unsigned per opd0_sign */
+  for (int i = 0; i < dim_n; i++) {
+    for (int j = 0; j < dim_m; j++) {
+      int idx = index_get(i, dim_m, j);
+      int a = opd0_sign ? A[idx] : (uint8_t)A[idx];
+      sum += a * B[idx];
+    }
+  }
+  return sum;
+}
+
+static int conv_ref(
+    const conv_param_t *p_param,
+    int8_t *ifmap, int8_t *weight, int16_t *bias, int8_t *ofmap)
+{
+  int in = p_param->input_n;
+  int ic = p_param->input_c;
+  int ih = p_param->input_h;
+  int iw = p_param->input_w;
+  int oc = p_param->output_c;
+  int kh = p_param->kh;
+  int kw = p_param->kw;
+  int dh = p_param->dh;
+  int dw = p_param->dw;
+  int stride_h = p_param->stride_h;
+  int stride_w = p_param->stride_w;
+  int pad_top = p_param->pad_top;
+  int pad_bot = p_param->pad_bot;
+  int pad_left = p_param->pad_left;
+  int pad_right = p_param->pad_right;
+  int ins_h = p_param->ins_h;
+  int ins_h_last = p_param->ins_h_last;
+  int ins_w = p_param->ins_w;
+  int ins_w_last = p_param->ins_w_last;
+  int input_sign = p_param->opd0_sign;
+  int r_shift_bits = p_param->r_shift_m;
+  int do_relu = p_param->bReLU_EN;
+
+  int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot);
+  int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right);
+  int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0);
+  int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0);
+
+  int oh = calc_output_hw(ih_ext, kh_ext, stride_h);
+  int ow = calc_output_hw(iw_ext, kw_ext, stride_w);
+
+  int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow);
+  int8_t *i_fmap_pad_ker = (int8_t *)malloc(kh_ext * kw_ext);
+  if (!result || !i_fmap_pad_ker) {
+    free(result);
+    free(i_fmap_pad_ker);
+    return -1;
+  }
+
+  memset(result, 0, sizeof(int) * in * oc * oh * ow);
+
+  int ret = 0;
+
+  int8_t *i_fmap_pad = NULL;
+  int8_t *kernel_after = NULL;
+  for (int n = 0; n < in; ++n) {
+    for (int c = 0; c < oc; ++c) {
+      for (int cc = 0; cc < ic; ++cc) {
+        fill_pad_fmap_int8(
+            (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0,
+            pad_left, pad_right, pad_top, pad_bot,
+            ins_h, ins_w, ins_h_last, ins_w_last, ih, iw);
+
+        //kernel_dilation(
+        fill_pad_fmap_int8(
+            (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0,
+            0, 0, 0, 0, // no padding
+            dh - 1, dw - 1, 0, 0,
+            kh, kw);
+
+        for (int ph = 0; ph < oh; ++ph) {
+          for (int pw = 0; pw < ow; ++pw) {
+            for (int idxh = 0; idxh < kh_ext; ++idxh)
+              for (int idxw = 0; idxw < kw_ext; ++idxw) {
+                i_fmap_pad_ker[idxh * kw_ext + idxw] =
+                    i_fmap_pad[(idxh + ph*stride_h) * iw_ext +
+                               idxw + pw*stride_w];
+              }
+            result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] +=
+                matrix_dot_mult(i_fmap_pad_ker, kernel_after,
+                                kh_ext, kw_ext, input_sign);
+          }
+        }
+      }
+
+      if (p_param->using_bias) {
+        for (int ph = 0; ph < oh; ++ph) {
+          for (int pw = 0; pw < ow; ++pw) {
+            result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c];
+          }
+        }
+      }
+
+      if (do_relu)
+        relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow);
+
+      // ofmap is int8_t, signed
+      ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow,
+          &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1,
+          /*sign_unsign=*/1);
+
+      if (ret)
+        goto error_release;
+    } //end for (int c = 0; c < oc; ++c)
+  } //end for (int n = 0; n < in; n++)
+
+error_release:
+  free(i_fmap_pad);
+  free(kernel_after);
+  free(i_fmap_pad_ker);
+  free(result);
+
+  return ret;
+}
+
+static uint8_t * transform_weight(const cvk_tl_shape_t *s, uint8_t before[])
+{
+  uint32_t ic = s->n;
+  uint32_t oc = s->c;
+  uint32_t kh = s->h;
+  uint32_t kw = s->w;
+
+  uint32_t size = ic * oc * kh * kw;
+  uint8_t *after = (uint8_t *)malloc(size);
+
+  /*
+   * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic)
+   */
+  for (uint32_t oci = 0; oci < oc; oci++) {
+    for (uint32_t ici = 0; ici < ic; ici++) {
+      for (uint32_t khi = 0; khi < kh; khi++) {
+        for (uint32_t kwi = 0; kwi < kw;
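+          /* The dst index below (oci*kh*kw*ic + khi*kw*ic + kwi*ic + ici)
+           * realizes the (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) reshape
+           * noted above; presumably the TIU reads each lane's kernel with
+           * the input channels contiguous. */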
kwi++) { + uint32_t src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + uint32_t dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + uint8_t *data) +{ +#if 0 + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + uint8_t *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. bm_memcpy_s2d regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. + */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //bmmem_device_t ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = bm_memcpy_s2d(*ctx, ab_dev_mem, transformed_data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, transformed_data); + assert(ret == BM_SUCCESS); + + cvk_tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_I8; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1822_tensor_tgmem_default_stride(tdma_tg.shape, tdma_tg.fmt); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1822_tensor_lmem_default_stride(cvk_ctx, tdma_shape, FMT_I8, 0); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1822_tdma_g2l_tensor_copy(cvk_ctx, &p); + test_submit(ctx); + bmmem_device_free(*ctx, dev_mem); + delete[] transformed_data; +#endif + + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint8_t *transformed_data = transform_weight(s, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(1, oc, kh * kw, ic); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, transformed_data); + + free(transformed_data); +} + +static int8_t * transform_bias(int oc, int16_t before[]) +{ + int8_t *after = (int8_t *)malloc(2 * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = before[i] & 0xff; + after[i + oc] = (before[i] >> 8) & 0xff; + } + return after; +} + +static void put_conv_bias( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + int16_t *data) +{ +#if 0 + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2, oc, 1, 1); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + int8_t *transformed_data = transform_bias(oc, data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (uint8_t *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1822_tensor_tgmem_default_stride(tg.shape, tg.fmt); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + 
memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_tensor_copy(cvk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, dev_mem); + delete[] transformed_data; +#endif + + int oc = tl->shape.c; + int8_t *transformed_data = transform_bias(oc, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(2, oc, 1, 1); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, (uint8_t *)transformed_data); + + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static int8_t * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + + int8_t *buf = (int8_t *)malloc(sizeof(int8_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static int8_t * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + int8_t *buf = (int8_t *)malloc(sizeof(int8_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static int16_t * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + int16_t *bias = (int16_t *)malloc(sizeof(int16_t) * oc); + if (!bias) + return NULL; + + for (int i = 0; i < oc; i++) + bias[i] = rand() % 65536 - 32768; + + return bias; +} + +static cvk_tl_t * conv_ifmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd0_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 1); +} + +static cvk_tl_t * conv_weight_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd1_sign? 
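+  /* opd1_sign is forced to 1 in init_conv_param (and asserted there),
+   * so weights are always allocated as signed CVK_FMT_I8 in this test. */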
CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static cvk_tl_t * conv_ofmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, CVK_FMT_I8, 1); +} + +static cvk_tl_t * conv_bias_tensor( + cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd2_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const cvk_tiu_pt_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + + dst->ifmap = conv_ifmap_tensor(cvk_ctx, p); + dst->weight = conv_weight_tensor(cvk_ctx, p); + dst->ofmap = conv_ofmap_tensor(cvk_ctx, p); + dst->bias = NULL; + dst->ps32_mode = 0; + if (p->using_bias) + dst->bias = conv_bias_tensor(cvk_ctx, p); + + dst->w_is_const = 0; +} + +static void free_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *r, + const conv_param_t *p) +{ + if (p->using_bias && r->bias) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->bias); + if (r->ofmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ofmap); + if (r->weight) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->weight); + if (r->ifmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ifmap); +} + +static void _init_conv_param(conv_param_t *p, int stride_w, int stride_h) +{ + printf("init_conv_param\n"); + memset(p, 0, sizeof(*p)); + p->random_seed = clock(); + srand(p->random_seed); + +retry: + p->input_n = rand() % 5 + 1; + p->input_c = rand() % (5 * 32) + 1; + p->kh = rand() % 7 + 1; + p->kw = rand() % 7 + 1; + p->input_h = rand() % 40 + p->kh + (INVALIDE_STRIDE == stride_h ? 0 : stride_h); + p->input_w = rand() % 40 + p->kw + (INVALIDE_STRIDE == stride_w ? 0 : stride_w); + p->output_c = rand() % 10 + 3; + p->stride_h = INVALIDE_STRIDE == stride_h ? rand() % (p->kh) + 1 : stride_h; + p->stride_w = INVALIDE_STRIDE == stride_w ? 
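+      /* when no stride is forced, both strides are drawn from [1, kh]
+       * (the stride_w draw below also uses the kh bound); the whole
+       * parameter set is then re-checked by conv_param_is_ok. */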
rand() % (p->kh) + 1 : stride_w; + p->ins_h = rand() % p->kh; + p->ins_w = rand() % p->kw; + p->ins_h_last = rand() % p->kh;; + p->ins_w_last = rand() % p->kw;; + p->dh = rand() % 3 + 1; + p->dw = rand() % 3 + 1; + + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + p->pad_top = rand() % kh_ext; + p->pad_bot = rand() % kh_ext; + p->pad_left = rand() % kw_ext; + p->pad_right = rand() % kw_ext; + + if (!conv_param_is_ok(p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p->using_bias = rand() % 2; + p->r_shift_m = rand() % 8; + p->bReLU_EN = rand() % 2; + + p->opd0_sign = rand() % 2; + p->opd1_sign = 1; + p->opd2_sign = 1; + + assert(p->opd1_sign == 1 && p->opd2_sign == 1); + + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); +} + +static void init_conv_param(conv_param_t *p) { + _init_conv_param(p, INVALIDE_STRIDE, INVALIDE_STRIDE); +} + +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", "p->input_w = ", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} + +// Calculate the right shift value, m +// Steps: +// 1. Get the abs() of each weight; +// 2. Summary all the abs() in one kernel; +// 3. Get Log2 of each sum; +// 4. Downward rounding; +// After every r_shift value got, sort and find the middle one. +static int calc_rshift_m(const conv_param_t *p, int8_t* weight) +{ + int kernel_cnt = p->output_c * p->input_c; + int kernel_size = p->kh * p->kw; + int *kernel_shifts = (int *)malloc(sizeof(int) * kernel_cnt); + if (!kernel_shifts) + return 1; + + memset(kernel_shifts, 0, sizeof(int) * kernel_cnt); + + // Part 1: + // Get right shift value for each kernel + int sum = 0; + for (int i = 0; i < kernel_cnt; i++) { + // Step 1 & 2: Get the sum of abs() + for (int j = 0; j < kernel_size; j++) { + sum += (int)(*weight < 0 ? 
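+        /* worked example of the steps above: a kernel whose abs() sum is
+         * 100 should get floor(log2(100)) = 6; the code halves once
+         * (100 -> 50) and then counts shifts until zero:
+         * 50, 25, 12, 6, 3, 1, 0 -> 6 shifts. */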
-(*weight) : (*weight)); + weight++; + } + // Step 3 & 4: log2 and downward rounding + sum >>= 1; + while (sum) { + sum >>= 1; + kernel_shifts[i]++; + } + } + + // Part 2: + // Find the middle of all the values + int tag[32] = {0}; + for (int cnt = 0; cnt < kernel_cnt; cnt++) { + if (kernel_shifts[cnt] < 32) + tag[kernel_shifts[cnt]]++; + } + + int rshift_m = 0; + int mid = 0; + do { + mid += tag[rshift_m++]; + } while(mid < (kernel_cnt - 1) >> 1); + + free(kernel_shifts); + + return rshift_m - 1; +} + +static int test_conv( + conv_param_t *p_param, CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + int8_t *input = alloc_input(p_param); + int8_t *weight = alloc_weight(p_param); + int16_t *bias = alloc_bias(p_param); + int8_t *output_ref = (int8_t *)malloc(sizeof(int8_t) * conv_output_size(p_param)); + if (!input || !weight || !bias || !output_ref) { + ret = -1; + goto fail_exit; + } + + p_param->r_shift_m = calc_rshift_m(p_param, weight); + + ret = conv_ref(p_param, input, weight, bias, output_ref); + if (ret) + goto fail_exit; + + cvk_tiu_pt_convolution_param_t conv_param; + make_bmk_conv_param(cvk_ctx, &conv_param, p_param); + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param); + if (tl_alloc_success) { + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, conv_param.ifmap, (uint8_t *)input); + put_conv_weight(rt_handle, cvk_ctx, conv_param.weight, (uint8_t *)weight); + if (p_param->using_bias) + put_conv_bias(rt_handle, cvk_ctx, conv_param.bias, bias); + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &conv_param); + uint8_t *output = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, conv_param.ofmap); + + ret = array_cmp_int8( + "Comparing results ...\n", + output_ref, (int8_t *)output, conv_output_size(p_param)); + + if (ret) { + print_conv_param(p_param); + printf("Comparison FAILED\n"); + ret = -1; + } + free(output); + } + + free_bmk_conv_param(cvk_ctx, &conv_param, p_param); + +fail_exit: + free(input); + free(weight); + free(output_ref); + free(bias); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + for (int i = 0; i < 5; i++) { + printf("random_test_conv iteration: %d\n", i); + conv_param_t test_conv_param; + init_conv_param(&test_conv_param); + ret |= test_conv(&test_conv_param, rt_handle, cvk_ctx); + if (!test_conv_param.using_bias) + test_conv_param.using_bias = 1; + if (test_conv_param.output_c <= 32) + test_conv_param.output_c += 32; + ret |= test_conv(&test_conv_param, rt_handle, cvk_ctx); + } + + // test stride extend (0, 31] + int stride_list[] = {15, 16, 31}; + int stride_list_len = sizeof(stride_list) / sizeof(stride_list[0]); + + for (int stride_w_idx = 0; stride_w_idx < stride_list_len; stride_w_idx++) { + for (int stride_h_idx = 0; stride_h_idx < stride_list_len; stride_h_idx++) { + int stride_w = stride_list[stride_w_idx]; + int stride_h = stride_list[stride_h_idx]; + conv_param_t test_conv_param; + _init_conv_param(&test_conv_param, stride_w, stride_h); + + ret |= test_conv(&test_conv_param, rt_handle, cvk_ctx); + } + } + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_conv_max_power.c b/cviruntime/test/181x/test_181x_conv_max_power.c new file mode 100644 index 
000000000..32ae74a00 --- /dev/null +++ b/cviruntime/test/181x/test_181x_conv_max_power.c @@ -0,0 +1,1132 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" +#include "test_native_ref.h" + +typedef cvk_tdma_l2g_tensor_copy_cw_transposed_param_t l2g_cw_param_t; +typedef cvk_tdma_g2l_matrix_copy_row_col_transposed_param_t g2l_matrix_param_t; +typedef cvk_tdma_l2l_tensor_copy_param_t l2l_tensor_copy_param_t; + +typedef struct{ + int8_t *conv_input; + int8_t *conv_weight; + int16_t *conv_bias; + uint8_t *conv_output; + int8_t *conv_output_ref; + uint8_t *l2g_cw_src; + uint8_t *l2g_cw_output; + uint8_t *l2g_cw_output_ref; + uint8_t *g2l_matrix_src; + uint8_t *g2l_matrix_output; + uint8_t *g2l_matrix_output_ref; + uint8_t *l2l_tensor_src; + uint8_t *l2l_tensor_output; + uint8_t *l2l_tensor_output_ref; +}s_test_data; + +typedef struct { + int random_seed; + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int stride_w; + int output_c; + int using_bias; + int bReLU_EN; + int r_shift_m; + int opd0_sign; + int opd1_sign; + int opd2_sign; +} conv_param_t; + +conv_param_t conv_param; +l2g_cw_param_t l2g_cw_param; +g2l_matrix_param_t g2l_matrix_param; +l2l_tensor_copy_param_t l2l_tensor_copy_param; +s_test_data s8_test_data; +cvk_tiu_pt_convolution_param_t bmk_conv_param; + +cvk_tl_t *skip_tensor_lmem[10]; +uint32_t skip_tensor_num=0; + +/* need to make sure the free order of test_alloc_tl for skip_tensor_lmem*/ +void skip_tensor_lmem_size(cvk_context_t *cvk_ctx, const cvk_tl_t *p) +{ + uint32_t needed = align_up(p->shape.n * p->stride.n, cvk_ctx->info.eu_num); + uint32_t start_addr = p->start_address + needed; //src_shape.n*src_shape.c*src_shape.h*src_shape.w/32; + uint32_t remain_size = start_addr % cvk_ctx->info.lmem_bank_size ? (cvk_ctx->info.lmem_bank_size - start_addr % cvk_ctx->info.lmem_bank_size) : 0; // remain size for each lane + if(remain_size) + { +// cvk_tl_shape_t src_shape2 = {1, cvk_ctx->info.eu_num, 1, remain_size}; + cvk_tl_shape_t src_shape2 = {1, cvk_ctx->info.npu_num, 1, remain_size}; + skip_tensor_lmem[skip_tensor_num] = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, src_shape2, CVK_FMT_I8, 1); // skip the lmem size and next tl can alignment to bank size + } + skip_tensor_num++; +} + +void skip_matrix_lmem_size(cvk_context_t *cvk_ctx, const cvk_ml_t *p) +{ + uint32_t needed = align_up(p->shape.n * p->stride.n, cvk_ctx->info.eu_num); + uint32_t start_addr = p->start_address + needed; //src_shape.n*src_shape.c*src_shape.h*src_shape.w/32; + uint32_t remain_size = start_addr % cvk_ctx->info.lmem_bank_size ? 
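+  /* as in skip_tensor_lmem_size: remain_size is the per-lane gap from
+   * start_addr up to the next multiple of lmem_bank_size, so a dummy
+   * allocation of that size leaves the next allocation bank-aligned. */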
(cvk_ctx->info.lmem_bank_size - start_addr % cvk_ctx->info.lmem_bank_size) : 0; // remaining gap to the next bank boundary, per lane
+  if (remain_size)
+  {
+    cvk_tl_shape_t src_shape2 = {1, cvk_ctx->info.npu_num, 1, remain_size};
+    //cvk_tl_shape_t src_shape2 = {1, cvk_ctx->info.eu_num, 1, remain_size};
+    skip_tensor_lmem[skip_tensor_num] = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, src_shape2, CVK_FMT_I8, 1); // pad so the next tl allocation starts bank-aligned
+  }
+  skip_tensor_num++;
+}
+
+void free_skip_tensor_lmem(cvk_context_t *cvk_ctx)
+{
+  if (skip_tensor_lmem[--skip_tensor_num] != NULL)
+    cvk_ctx->ops->lmem_free_tensor(cvk_ctx, skip_tensor_lmem[skip_tensor_num]);
+}
+
+static int index_get(int h, int w1, int w2)
+{
+  return h * w1 + w2;
+}
+
+static int matrix_dot_mult(
+    int8_t *A, int8_t *B, int dim_n, int dim_m,
+    int opd0_sign)
+{
+  int sum = 0;
+  /* accumulate A * B over the dim_n x dim_m patch; A is read as signed
+   * or unsigned per opd0_sign */
+  for (int i = 0; i < dim_n; i++) {
+    for (int j = 0; j < dim_m; j++) {
+      int idx = index_get(i, dim_m, j);
+      int a = opd0_sign ? A[idx] : (uint8_t)A[idx];
+      sum += a * B[idx];
+    }
+  }
+  return sum;
+}
+
+static int conv_ref(
+    const conv_param_t *p_param,
+    int8_t *ifmap, int8_t *weight, int16_t *bias, int8_t *ofmap)
+{
+  int in = p_param->input_n;
+  int ic = p_param->input_c;
+  int ih = p_param->input_h;
+  int iw = p_param->input_w;
+  int oc = p_param->output_c;
+  int kh = p_param->kh;
+  int kw = p_param->kw;
+  int dh = p_param->dh;
+  int dw = p_param->dw;
+  int stride_h = p_param->stride_h;
+  int stride_w = p_param->stride_w;
+  int pad_top = p_param->pad_top;
+  int pad_bot = p_param->pad_bot;
+  int pad_left = p_param->pad_left;
+  int pad_right = p_param->pad_right;
+  int ins_h = p_param->ins_h;
+  int ins_h_last = p_param->ins_h_last;
+  int ins_w = p_param->ins_w;
+  int ins_w_last = p_param->ins_w_last;
+  int input_sign = p_param->opd0_sign;
+  int r_shift_bits = p_param->r_shift_m;
+  int do_relu = p_param->bReLU_EN;
+
+  int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot);
+  int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right);
+  int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0);
+  int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0);
+
+  int oh = calc_output_hw(ih_ext, kh_ext, stride_h);
+  int ow = calc_output_hw(iw_ext, kw_ext, stride_w);
+
+  int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow);
+  int8_t *i_fmap_pad_ker = (int8_t *)malloc(kh_ext * kw_ext);
+  if (!result || !i_fmap_pad_ker) {
+    free(result);
+    free(i_fmap_pad_ker);
+    return -1;
+  }
+
+  memset(result, 0, sizeof(int) * in * oc * oh * ow);
+
+  int ret = 0;
+
+  int8_t *i_fmap_pad = NULL;
+  int8_t *kernel_after = NULL;
+  for (int n = 0; n < in; ++n) {
+    for (int c = 0; c < oc; ++c) {
+      for (int cc = 0; cc < ic; ++cc) {
+        fill_pad_fmap_int8(
+            (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0,
+            pad_left, pad_right, pad_top, pad_bot,
+            ins_h, ins_w, ins_h_last, ins_w_last, ih, iw);
+
+        //kernel_dilation(
+        fill_pad_fmap_int8(
+            (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0,
+            0, 0, 0, 0, // no padding
+            dh - 1, dw - 1, 0, 0,
+            kh, kw);
+
+        for (int ph = 0; ph < oh; ++ph) {
+          for (int pw = 0; pw < ow; ++pw) {
+            for (int idxh = 0; idxh < kh_ext; ++idxh)
+              for (int idxw = 0; idxw < kw_ext; ++idxw) {
+                i_fmap_pad_ker[idxh * kw_ext + idxw] =
+                    i_fmap_pad[(idxh + ph*stride_h) * iw_ext +
+                               idxw + pw*stride_w];
+              }
+            result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] +=
+                matrix_dot_mult(i_fmap_pad_ker, kernel_after,
+                                kh_ext, kw_ext, input_sign);
+          }
+        }
+      }
+
+      if (p_param->using_bias) {
+        for (int ph = 0; ph < oh; ++ph) {
+          for (int pw = 0; pw < ow; ++pw) {
+            result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c];
+          }
+        }
+      }
+
+      if (do_relu)
+        relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow);
+
+      // ofmap is int8_t, signed
+      ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow,
+          &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1,
+          /*sign_unsign=*/1);
+
+      if
(ret) + goto error_release; + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + +error_release: + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + + +static uint8_t * transform_weight(const cvk_tl_shape_t *s, uint8_t before[]) +{ + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint32_t size = ic * oc * kh * kw; + uint8_t *after = (uint8_t *)malloc(size); + if (!after) + return NULL; + + /* + * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) + */ + for (uint32_t oci = 0; oci < oc; oci++) { + for (uint32_t ici = 0; ici < ic; ici++) { + for (uint32_t khi = 0; khi < kh; khi++) { + for (uint32_t kwi = 0; kwi < kw; kwi++) { + uint32_t src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + uint32_t dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + uint8_t *data) +{ +#if 0 + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + uint8_t *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. bm_memcpy_s2d regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. + */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //bmmem_device_t ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = bm_memcpy_s2d(*ctx, ab_dev_mem, transformed_data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, transformed_data); + + assert(ret == BM_SUCCESS); + + cvk_tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_I8; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1822_tensor_tgmem_default_stride(tdma_tg.shape, tdma_tg.fmt); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1822_tensor_lmem_default_stride(cvk_ctx, tdma_shape, FMT_I8, 0); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1822_tdma_g2l_tensor_copy(cvk_ctx, &p); + test_submit(ctx); + bmmem_device_free(*ctx, dev_mem); + delete[] transformed_data; +#endif + + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint8_t *transformed_data = transform_weight(s, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(1, oc, kh * kw, ic); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, transformed_data); + + free(transformed_data); +} + +static int8_t * transform_bias(int oc, int16_t before[]) +{ + int8_t *after = (int8_t *)malloc(2 * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = before[i] & 0xff; + after[i + oc] = (before[i] >> 8) & 0xff; + } + return after; +} + +static void put_conv_bias( + 
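+    /* transform_bias above splits each 16-bit bias into two int8 planes:
+     * row 0 holds the low bytes, row 1 the high bytes, matching the
+     * {2, oc, 1, 1} shape this helper stages into local memory. */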
CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + int16_t *data) +{ +#if 0 + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2, oc, 1, 1); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + int8_t *transformed_data = transform_bias(oc, data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (uint8_t *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1822_tensor_tgmem_default_stride(tg.shape, tg.fmt); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_tensor_copy(cvk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, dev_mem); + delete[] transformed_data; +#endif + + int oc = tl->shape.c; + int8_t *transformed_data = transform_bias(oc, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(2, oc, 1, 1); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, (uint8_t *)transformed_data); + + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static int8_t * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + + int8_t *buf = (int8_t *)malloc(sizeof(int8_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static int8_t * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + int8_t *buf = (int8_t *)malloc(sizeof(int8_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static int16_t * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + int16_t *bias = (int16_t *)malloc(sizeof(int16_t) * oc); + if (!bias) + return NULL; + + for (int i = 0; i < oc; i++) + bias[i] = rand() % 65536 - 32768; + + return bias; +} + +static cvk_tl_t * conv_ifmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd0_sign? 
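+  /* this file fixes opd0_sign = 0 in init_conv_param, so the ifmap is
+   * always treated as unsigned (CVK_FMT_U8) while weights stay signed. */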
CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 1); +} + +static cvk_tl_t * conv_weight_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd1_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static cvk_tl_t * conv_ofmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, CVK_FMT_I8, 1); +} + +static cvk_tl_t * conv_bias_tensor( + cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd2_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const cvk_tiu_pt_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + + dst->ifmap = conv_ifmap_tensor(cvk_ctx, p); + skip_tensor_lmem_size(cvk_ctx, dst->ifmap); + dst->weight = conv_weight_tensor(cvk_ctx, p); + skip_tensor_lmem_size(cvk_ctx, dst->weight); + dst->ofmap = conv_ofmap_tensor(cvk_ctx, p); + skip_tensor_lmem_size(cvk_ctx, dst->ofmap); + dst->bias = NULL; + dst->ps32_mode = 0; + if (p->using_bias) + { + dst->bias = conv_bias_tensor(cvk_ctx, p); + skip_tensor_lmem_size(cvk_ctx, dst->bias); + } + dst->w_is_const = 0; +} + +static void free_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *r, + const conv_param_t *p) +{ + if (p->using_bias && r->bias) + { + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->bias); + } + if (r->ofmap) + { + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ofmap); + } + if (r->weight) + { + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->weight); + } + if (r->ifmap) + { + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ifmap); + } +} + +static void init_conv_param(conv_param_t *p) +{ +retry: + p->input_n = 1; + p->input_c = 16; + p->input_h = 2; + 
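+  /* unlike test_181x_conv.c, this "max power" variant pins one
+   * deterministic configuration (16 channels, 2x16 kernel, stride 15,
+   * 600-column input), presumably to keep the TIU loaded while the TDMA
+   * transpose/copy cases defined below run alongside it. */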
p->input_w = 600; + + p->kh = 2; + p->kw = 16; + p->output_c = 16; + + p->stride_h = 1; + p->stride_w = 15; + p->ins_h = 0; + p->ins_w = 0; + p->ins_h_last = 0;; + p->ins_w_last = 0;; + p->dh = 1; + p->dw = 1; + + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + p->pad_top = 1; + p->pad_bot = 0; + p->pad_left = 0; + p->pad_right = 0; + + if (!conv_param_is_ok(p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p->using_bias = 0; + p->r_shift_m = 7; + p->bReLU_EN = 1; + + p->opd0_sign = 0; + p->opd1_sign = 1; + p->opd2_sign = 1; + + assert(p->opd1_sign == 1 && p->opd2_sign == 1); + + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); + +} + +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", "p->input_w = ", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} + +// Calculate the right shift value, m +// Steps: +// 1. Get the abs() of each weight; +// 2. Summary all the abs() in one kernel; +// 3. Get Log2 of each sum; +// 4. Downward rounding; +// After every r_shift value got, sort and find the middle one. +static int calc_rshift_m(const conv_param_t *p, int8_t* weight) +{ + int kernel_cnt = p->output_c * p->input_c; + int kernel_size = p->kh * p->kw; + int *kernel_shifts = (int *)malloc(sizeof(int) * kernel_cnt); + if (!kernel_shifts) + return 1; + + memset(kernel_shifts, 0, sizeof(int) * kernel_cnt); + + // Part 1: + // Get right shift value for each kernel + int sum = 0; + for (int i = 0; i < kernel_cnt; i++) { + // Step 1 & 2: Get the sum of abs() + for (int j = 0; j < kernel_size; j++) { + sum += (int)(*weight < 0 ? 
-(*weight) : (*weight)); + weight++; + } + // Step 3 & 4: log2 and downward rounding + sum >>= 1; + while (sum) { + sum >>= 1; + kernel_shifts[i]++; + } + } + + // Part 2: + // Find the middle of all the values + int tag[32] = {0}; + for (int cnt = 0; cnt < kernel_cnt; cnt++) { + tag[kernel_shifts[cnt]]++; + } + + int rshift_m = 0; + int mid = 0; + do { + mid += tag[rshift_m++]; + } while(mid < (kernel_cnt - 1) >> 1); + + free(kernel_shifts); + + return rshift_m - 1; +} + + +static void l2g_tensor_copy_cw_transposed_ref( + l2g_cw_param_t *p, uint8_t ref_data[], uint8_t src_data[]) +{ + cvk_tl_shape_t s = p->src->shape; + uint32_t n = s.n; + uint32_t c = s.c; + uint32_t h = s.h; + uint32_t w = s.w; + + for (uint32_t ni = 0; ni < n; ni++) { + for (uint32_t ci = 0; ci < c; ci++) { + for (uint32_t hi = 0; hi < h; hi++) { + for (uint32_t wi = 0; wi < w; wi++) { + uint32_t src_i = ni * c * h * w + ci * h * w + hi * w + wi; + uint32_t dst_i = ni * c * h * w + wi * h * c + hi * c + ci; + ref_data[dst_i] = src_data[src_i]; + } + } + } + } +} + +static void test_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, l2g_cw_param_t *p) +{ + uint64_t size = tl_shape_size(&p->src->shape, p->src->fmt); + + s8_test_data.l2g_cw_src = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!s8_test_data.l2g_cw_src) + return; + + for (uint64_t i = 0; i < size; i++) + s8_test_data.l2g_cw_src[i] = rand()%0x100; + + s8_test_data.l2g_cw_output_ref = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!s8_test_data.l2g_cw_output_ref) + return; + + l2g_tensor_copy_cw_transposed_ref(p, s8_test_data.l2g_cw_output_ref, s8_test_data.l2g_cw_src); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, p->src, s8_test_data.l2g_cw_src); +} + +static void destroy_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, l2g_cw_param_t *p) +{ + free_tensor_dev_mem(rt_handle, p->dst); + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->src); +} + +static void test_l2g_cw_transpose(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, l2g_cw_param_t *p) +{ + cvk_tl_shape_t src_shape = {1, 0x100, 1, 0x020}; + cvk_tg_shape_t dst_shape = {1, 0x020, 1, 0x100}; + +// cvk_tl_shape_t src_shape = {1, 0x100, 1, 0x080}; +// cvk_tg_shape_t dst_shape = {1, 0x080, 1, 0x100}; + + p->src = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, src_shape, CVK_FMT_I8, 1); + p->dst = alloc_tensor_dev_mem(rt_handle, cvk_ctx, dst_shape, CVK_FMT_I8); + skip_tensor_lmem_size(cvk_ctx, p->src); + test_param_l2g(rt_handle, cvk_ctx, p); +} + +static void g2l_matrix_copy_row_col_transposed_ref( + g2l_matrix_param_t *p, uint8_t ref_data[], uint8_t src_data[]) +{ + uint64_t row = p->src->shape.row; + uint64_t col = p->src->shape.col; + + for (uint64_t ri = 0; ri < row; ri++) { + for (uint64_t ci = 0; ci < col; ci++) { + uint64_t src_i = ri * col + ci; + uint64_t dst_i = ci * row + ri; + ref_data[dst_i] = src_data[src_i]; + } + } +} + +static void test_param_g2l(CVI_RT_HANDLE rt_handle, g2l_matrix_param_t *p) +{ + uint64_t size = ml_shape_size(&p->dst->shape, p->dst->fmt); + + s8_test_data.g2l_matrix_src = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!s8_test_data.g2l_matrix_src) + return; + + for (uint64_t i = 0; i < size; i++) + s8_test_data.g2l_matrix_src[i] = rand()%0x100; + + s8_test_data.g2l_matrix_output_ref = (uint8_t *)malloc(size); + if (!s8_test_data.g2l_matrix_output_ref) + return; + + g2l_matrix_copy_row_col_transposed_ref(p, s8_test_data.g2l_matrix_output_ref, s8_test_data.g2l_matrix_src); + + matrix_copy_s2d(rt_handle, p->src, 
s8_test_data.g2l_matrix_src); +} + +static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, g2l_matrix_param_t *p) +{ + free_matrix_dev_mem(rt_handle, p->src); + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->dst); +} + + +static void test_g2l_matrix_transpose(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, g2l_matrix_param_t *p) +{ + //g2l_matrix_param_t p; + /* + * Matrix transpose must be n/c stride alignment + * for TDMA limitation + */ + cvk_mg_shape_t src_shape={0x100, 0x20}; + cvk_ml_shape_t dst_shape={0x20, 0x10, 0x10, 0x100}; + +// cvk_mg_shape_t src_shape={0x100, 0x80}; +// cvk_ml_shape_t dst_shape={0x80, 0x10, 0x10, 0x100}; + + int dst_align = 1; + cvk_fmt_t fmt = CVK_FMT_I8; + + p->src = alloc_matrix_dev_mem(rt_handle, src_shape, fmt); + p->dst = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, dst_shape, fmt, dst_align); + skip_matrix_lmem_size(cvk_ctx, p->dst); + test_param_g2l(rt_handle, p); +} + +static void l2l_tensor_copy_ref(l2l_tensor_copy_param_t *p, uint8_t ref_data[], uint8_t src_data[]) +{ + uint64_t size = tl_shape_size(&p->src->shape, p->src->fmt); + + for (uint64_t i = 0; i < size; i++) + ref_data[i] = src_data[i]; +} + +static void test_l2l_param(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, l2l_tensor_copy_param_t *p) +{ + uint64_t size = tl_shape_size(&p->src->shape, p->src->fmt); + + s8_test_data.l2l_tensor_src = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!s8_test_data.l2l_tensor_src) + return; + + for (uint64_t i = 0; i < size; i++) + s8_test_data.l2l_tensor_src[i] = rand()%0x100; + + s8_test_data.l2l_tensor_output_ref = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!s8_test_data.l2l_tensor_output_ref) + return; + + l2l_tensor_copy_ref(p, s8_test_data.l2l_tensor_output_ref, s8_test_data.l2l_tensor_src); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, p->src, s8_test_data.l2l_tensor_src); +} + +static void destroy_param_l2l(cvk_context_t *cvk_ctx, l2l_tensor_copy_param_t *p) +{ + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->dst); + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->src); +} + +static void test_l2l_tensor_copy(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, l2l_tensor_copy_param_t *p) +{ + cvk_tl_shape_t src_shape = {1, 0x10, 0x1, 0x100}; + cvk_tl_shape_t dst_shape = {1, 0x10, 0x1, 0x100}; + +// cvk_tl_shape_t src_shape = {1, 0x10, 0x1, 0x400}; +// cvk_tl_shape_t dst_shape = {1, 0x10, 0x1, 0x400}; + + p->src = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, src_shape, CVK_FMT_I8, 1); + skip_tensor_lmem_size(cvk_ctx, p->src); + p->dst = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, dst_shape, CVK_FMT_I8, 1); + skip_tensor_lmem_size(cvk_ctx, p->dst); + test_l2l_param(rt_handle, cvk_ctx, p); +} + +static int setup_conv( + conv_param_t *p_param, CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + s8_test_data.conv_input = alloc_input(p_param); + s8_test_data.conv_weight = alloc_weight(p_param); + s8_test_data.conv_bias = alloc_bias(p_param); + p_param->r_shift_m = calc_rshift_m(p_param, s8_test_data.conv_weight); + s8_test_data.conv_output_ref = (int8_t *)malloc(sizeof(int8_t) * conv_output_size(p_param)); + if (!s8_test_data.conv_output_ref) + return -1; + + int ret = conv_ref(p_param, s8_test_data.conv_input, s8_test_data.conv_weight, s8_test_data.conv_bias, s8_test_data.conv_output_ref); + if (ret) + return ret; + + make_bmk_conv_param(cvk_ctx, &bmk_conv_param, p_param); + + bmk_conv_param_alloc_ok(&bmk_conv_param, p_param); + + 
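+  /*
+   * make_bmk_conv_param() above allocated ifmap/weight/ofmap (and bias, when
+   * used) in TPU local memory; bmk_conv_param_alloc_ok() merely checks those
+   * pointers. With the fixed shape from init_conv_param() the allocation is
+   * expected to succeed, so the result is not examined here.
+   */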
tensor_copy_s2d_g2l(rt_handle, cvk_ctx, bmk_conv_param.ifmap, (uint8_t *)s8_test_data.conv_input); + put_conv_weight(rt_handle, cvk_ctx, bmk_conv_param.weight, (uint8_t *)s8_test_data.conv_weight); + if (p_param->using_bias) + put_conv_bias(rt_handle, cvk_ctx, bmk_conv_param.bias, s8_test_data.conv_bias); + + return 0; +} + +void get_result(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + s8_test_data.conv_output = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, bmk_conv_param.ofmap); + s8_test_data.l2g_cw_output = tensor_copy_d2s(rt_handle, l2g_cw_param.dst); + s8_test_data.g2l_matrix_output = matrix_copy_l2g_d2s(rt_handle, cvk_ctx, g2l_matrix_param.dst); + s8_test_data.l2l_tensor_output = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, l2l_tensor_copy_param.dst); +} + +int check_result() +{ + int has_error = array_cmp_int8( + "conv Comparing results ...\n", + s8_test_data.conv_output_ref, (int8_t *)s8_test_data.conv_output, conv_output_size(&conv_param)); + + if (has_error) { + print_conv_param(&conv_param); + printf("Comparison FAILED\n"); + return -1; + } + + for (uint64_t i = 0; i < tl_shape_size(&l2g_cw_param.src->shape, l2g_cw_param.src->fmt); i++) { + if (s8_test_data.l2g_cw_output[i] != s8_test_data.l2g_cw_output_ref[i]) { + fprintf(stderr, "l2g_cw comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, s8_test_data.l2g_cw_output[i], s8_test_data.l2g_cw_output_ref[i]); + return -1; + } + } + for (uint64_t i = 0; i < ml_shape_size(&g2l_matrix_param.dst->shape, g2l_matrix_param.dst->fmt); i++) { + if (s8_test_data.g2l_matrix_output[i] != s8_test_data.g2l_matrix_output_ref[i]) { + fprintf(stderr, "g2l_matrix comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, s8_test_data.g2l_matrix_output[i], s8_test_data.g2l_matrix_output_ref[i]); + return -1; + } + } + + for (uint64_t i = 0; i < tl_shape_size(&l2l_tensor_copy_param.src->shape, l2l_tensor_copy_param.src->fmt); i++) { + if (s8_test_data.l2l_tensor_output[i] != s8_test_data.l2l_tensor_output_ref[i]) { + fprintf(stderr, "l2l_tensor comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, s8_test_data.l2l_tensor_output[i], s8_test_data.l2l_tensor_output_ref[i]); + return -1; + } + } + + return 0; +} + +void trigger_max_power(cvk_context_t *cvk_ctx) +{ + cvk_ctx->ops->parallel_enable(cvk_ctx); + cvk_ctx->ops->tdma_l2g_tensor_copy_cw_transposed(cvk_ctx, &l2g_cw_param); + cvk_ctx->ops->tdma_g2l_matrix_copy_row_col_transposed(cvk_ctx, &g2l_matrix_param); + cvk_ctx->ops->tdma_l2l_tensor_copy(cvk_ctx, &l2l_tensor_copy_param); + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &bmk_conv_param); + cvk_ctx->ops->parallel_disable(cvk_ctx); + cvk_ctx->ops->parallel_enable(cvk_ctx); + cvk_ctx->ops->tdma_l2g_tensor_copy_cw_transposed(cvk_ctx, &l2g_cw_param); + cvk_ctx->ops->tdma_g2l_matrix_copy_row_col_transposed(cvk_ctx, &g2l_matrix_param); + cvk_ctx->ops->tdma_l2l_tensor_copy(cvk_ctx, &l2l_tensor_copy_param); + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &bmk_conv_param); + cvk_ctx->ops->parallel_disable(cvk_ctx); + CVI_RT_Submit(cvk_ctx); +} + +void free_s8_data() +{ + free(s8_test_data.conv_input); + free(s8_test_data.conv_weight); + free(s8_test_data.conv_bias); + free(s8_test_data.conv_output); + free(s8_test_data.conv_output_ref); + free(s8_test_data.l2g_cw_src); + free(s8_test_data.l2g_cw_output); + free(s8_test_data.l2g_cw_output_ref); + free(s8_test_data.g2l_matrix_src); + free(s8_test_data.g2l_matrix_output); + free(s8_test_data.g2l_matrix_output_ref); + free(s8_test_data.l2l_tensor_src); + free(s8_test_data.l2l_tensor_output); + 
+  free(s8_test_data.l2l_tensor_output_ref);
+}
+
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  printf("conv max_power test\n");
+  init_conv_param(&conv_param);
+  ret |= setup_conv(&conv_param, rt_handle, cvk_ctx);
+
+  test_l2g_cw_transpose(rt_handle, cvk_ctx, &l2g_cw_param);
+  test_g2l_matrix_transpose(rt_handle, cvk_ctx, &g2l_matrix_param);
+  test_l2l_tensor_copy(rt_handle, cvk_ctx, &l2l_tensor_copy_param);
+
+  trigger_max_power(cvk_ctx);
+  get_result(rt_handle, cvk_ctx);
+  check_result();
+
+  destroy_param_l2l(cvk_ctx, &l2l_tensor_copy_param);
+  destroy_param_g2l(rt_handle, cvk_ctx, &g2l_matrix_param);
+  destroy_param_l2g(rt_handle, cvk_ctx, &l2g_cw_param);
+  free_bmk_conv_param(cvk_ctx, &bmk_conv_param, &conv_param);
+  free_s8_data();
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/181x/test_181x_conv_ps32.c b/cviruntime/test/181x/test_181x_conv_ps32.c
new file mode 100644
index 000000000..0cb7c80fa
--- /dev/null
+++ b/cviruntime/test/181x/test_181x_conv_ps32.c
@@ -0,0 +1,1559 @@
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "test_cvikernel_util.h"
+#include "test_native_ref.h"
+
+typedef struct {
+  int random_seed;
+  int input_n;
+  int input_c;
+  int input_h;
+  int input_w;
+  int kw;
+  int kh;
+  int dh;
+  int dw;
+  int pad_top;
+  int pad_bot;
+  int pad_left;
+  int pad_right;
+  int ins_h;
+  int ins_h_last;
+  int ins_w;
+  int ins_w_last;
+  int stride_h;
+  int stride_w;
+  int output_c;
+  int using_bias;
+  int bReLU_EN;
+  int r_shift_m;
+  int opd0_sign;
+  int opd1_sign;
+  int opd2_sign;
+} conv_param_t;
+
+static int index_get(int h, int w1, int w2)
+{
+  return h * w1 + w2;
+}
+
+static int matrix_dot_mult(
+    int8_t *A, int8_t *B, int dim_n, int dim_m,
+    int opd0_sign)
+{
+  int sum = 0;
+  /* Loop body reconstructed (the original span was lost to extraction
+   * damage): dot product over dim_n x dim_m elements, treating A as
+   * unsigned when opd0_sign == 0, matching how opd0_sign is passed by
+   * the reference convolutions below. */
+  for (int i = 0; i < dim_n * dim_m; i++) {
+    int a = opd0_sign ? (int)A[i] : (int)(uint8_t)A[i];
+    sum += a * (int)B[i];
+  }
+  return sum;
+}
+
+static int ps32_m2_conv_ref(
+    const conv_param_t *p_param,
+    const int8_t *ifmap,
+    const int8_t *weight,
+    int8_t *ofmap)
+{
+  int in = p_param->input_n;
+  int ic = p_param->input_c;
+  int ih = p_param->input_h;
+  int iw = p_param->input_w;
+  int oc = p_param->output_c;
+  int kh = p_param->kh;
+  int kw = p_param->kw;
+  int dh = p_param->dh;
+  int dw = p_param->dw;
+  int stride_h = p_param->stride_h;
+  int stride_w = p_param->stride_w;
+  int pad_top = p_param->pad_top;
+  int pad_bot = p_param->pad_bot;
+  int pad_left = p_param->pad_left;
+  int pad_right = p_param->pad_right;
+  int ins_h = p_param->ins_h;
+  int ins_h_last = p_param->ins_h_last;
+  int ins_w = p_param->ins_w;
+  int ins_w_last = p_param->ins_w_last;
+  int input_sign = p_param->opd0_sign;
+
+  int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot);
+  int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right);
+  int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0);
+  int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0);
+
+  int oh = calc_output_hw(ih_ext, kh_ext, stride_h);
+  int ow = calc_output_hw(iw_ext, kw_ext, stride_w);
+
+  int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow);
+  int8_t *i_fmap_pad_ker = (int8_t *)malloc(kh_ext * kw_ext);
+  if (!result || !i_fmap_pad_ker) {
+    free(result);
+    free(i_fmap_pad_ker);
+    return -1;
+  }
+
+  memset(result, 0, sizeof(int) * in * oc * oh * ow);
+
+  int ret = 0;
+
+  int8_t *i_fmap_pad = NULL;
+  int8_t *kernel_after = NULL;
+  uint32_t bstride = in * oc * oh * ow;
+
+  for (int n = 0; n < in; ++n) {
+    for (int c = 0; c <
oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[i] = result[i]; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[bstride + i] = result[i] >> 8; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[2 * bstride + i] = result[i] >> 16; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[3 * bstride + i] = result[i] >> 24; + + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static int ps32_m1_conv_ref( + const conv_param_t *p_param, + const int8_t *ifmap, + const int8_t *weight, + const int16_t *bias, + int8_t *ofmap) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + int r_shift_bits = p_param->r_shift_m; + int do_relu = p_param->bReLU_EN; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + int8_t *i_fmap_pad_ker = (int8_t *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return -1; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = 0; + + int8_t *i_fmap_pad = NULL; + int8_t *kernel_after = NULL; + + uint32_t bstride = in * oc * oh * ow; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] = (uint8_t)ofmap[i]; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] |= (uint8_t)ofmap[bstride + i] << 8; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] |= (uint8_t)ofmap[2 * bstride + i] << 16; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] |= (uint8_t)ofmap[3 * bstride + i] << 24; + + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + 
ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c]; //bias+c ; + } + } + } + + if (do_relu) + relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + + // ofmap is int8_t, signed + ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow, + &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1, + /*sign_unsign=*/1); + + if (ret) + goto error_release; + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + +error_release: + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + + +static int ps32_m3_conv_ref( + const conv_param_t *p_param, + const int8_t *ifmap, + const int8_t *weight, + int8_t *ofmap) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + int8_t *i_fmap_pad_ker = (int8_t *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return -1; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = 0; + + int8_t *i_fmap_pad = NULL; + int8_t *kernel_after = NULL; + + uint32_t bstride = in * oc * oh * ow; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] = (uint8_t)ofmap[i]; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] |= (uint8_t)ofmap[bstride + i] << 8; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] |= (uint8_t)ofmap[2 * bstride + i] << 16; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] |= (uint8_t)ofmap[3 * bstride + i] << 24; + + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), 
&kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[i] = result[i]; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[bstride + i] = result[i] >> 8; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[2 * bstride + i] = result[i] >> 16; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[3 * bstride + i] = result[i] >> 24; + + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static int conv_ref( + const conv_param_t *p_param, + const int8_t *ifmap, + const int8_t *weight, + const int16_t *bias, + int8_t *ofmap) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + int r_shift_bits = p_param->r_shift_m; + int do_relu = p_param->bReLU_EN; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + int8_t *i_fmap_pad_ker = (int8_t *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return -1; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = 0; + + int8_t *i_fmap_pad = NULL; + int8_t *kernel_after = NULL; + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + 
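+          /*
+           * Unlike the ps32 begin/intermediate references above, which leave
+           * raw 32-bit partial sums in the ofmap, this single-pass reference
+           * folds the per-channel bias in here, before ReLU and the rounding
+           * right shift.
+           */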
result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c]; //bias+c ; + } + } + } + + if (do_relu) + relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + + // ofmap is int8_t, signed + ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow, + &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1, + /*sign_unsign=*/1); + + if (ret) + goto error_release; + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + +error_release: + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static uint8_t * transform_weight(const cvk_tl_shape_t *s, uint8_t before[]) +{ + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint32_t size = ic * oc * kh * kw; + uint8_t *after = (uint8_t *)malloc(size); + + /* + * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) + */ + for (uint32_t oci = 0; oci < oc; oci++) { + for (uint32_t ici = 0; ici < ic; ici++) { + for (uint32_t khi = 0; khi < kh; khi++) { + for (uint32_t kwi = 0; kwi < kw; kwi++) { + uint32_t src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + uint32_t dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + uint8_t *data) +{ +#if 0 + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + uint8_t *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. bm_memcpy_s2d regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. 
+ */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //bmmem_device_t ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = bm_memcpy_s2d(*ctx, ab_dev_mem, transformed_data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, transformed_data); + assert(ret == BM_SUCCESS); + + cvk_tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_I8; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1822_tensor_tgmem_default_stride(tdma_tg.shape, tdma_tg.fmt); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1822_tensor_lmem_default_stride(cvk_ctx, tdma_shape, FMT_I8, 0); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1822_tdma_g2l_tensor_copy(cvk_ctx, &p); + test_submit(ctx); + bmmem_device_free(*ctx, dev_mem); + delete[] transformed_data; +#endif + + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint8_t *transformed_data = transform_weight(s, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(1, oc, kh * kw, ic); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, transformed_data); + + free(transformed_data); +} + +static int8_t * transform_bias(int oc, int16_t before[]) +{ + int8_t *after = (int8_t *)malloc(2 * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = before[i] & 0xff; + after[i + oc] = (before[i] >> 8) & 0xff; + } + return after; +} + +static void put_conv_bias( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + int16_t *data) +{ +#if 0 + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2, oc, 1, 1); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + int8_t *transformed_data = transform_bias(oc, data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (uint8_t *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1822_tensor_tgmem_default_stride(tg.shape, tg.fmt); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_tensor_copy(cvk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, dev_mem); + delete[] transformed_data; +#endif + + int oc = tl->shape.c; + int8_t *transformed_data = transform_bias(oc, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(2, oc, 1, 1); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, (uint8_t *)transformed_data); + + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + 
p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static int8_t * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + + int8_t *buf = (int8_t *)malloc(sizeof(int8_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static int8_t * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + int8_t *buf = (int8_t *)malloc(sizeof(int8_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static int16_t * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + int16_t *bias = (int16_t *)malloc(sizeof(int16_t) * oc); + if (!bias) + return NULL; + + for (int i = 0; i < oc; i++) + bias[i] = rand() % 65536 - 32768; + + return bias; +} + +static cvk_tl_t * conv_ifmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd0_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 1); +} + +static uint32_t conv_ifmap_tensor_size(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd0_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, s, fmt, 1); +} + +static cvk_tl_t * conv_weight_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd1_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static uint32_t conv_weight_tensor_size(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd1_sign? 
CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, s, fmt, 0); +} + +static cvk_tl_t * conv_ofmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_tl_shape_t s; + s.n = p->input_n * 4; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + cvk_tl_t *tl = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, CVK_FMT_I8, 1); + if (tl) + tl->shape.n = p->input_n; + return tl; +} + +static uint32_t conv_ofmap_tensor_size(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_tl_shape_t s; + s.n = p->input_n * sizeof(uint32_t) / sizeof(uint8_t); + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, s, CVK_FMT_I8, 1); +} + +static cvk_tl_t * conv_bias_tensor( + cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd2_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static uint32_t conv_bias_tensor_size( + cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd2_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const cvk_tiu_pt_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + if(p->ps32_mode==1) + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param_ps32( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *dst, + const conv_param_t *p, uint32_t ps32_mode) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + + if(ps32_mode==2) + { + uint32_t ifmap_size = conv_ifmap_tensor_size(cvk_ctx, p); + uint32_t weight_size = conv_weight_tensor_size(cvk_ctx, p); + uint32_t ofmap_size = conv_ofmap_tensor_size(cvk_ctx, p); + uint32_t bias_size = p->using_bias ? conv_bias_tensor_size(cvk_ctx, p) : 0; + uint32_t total_size = ifmap_size + weight_size + ofmap_size + bias_size; + + // Allocation if size fit. 
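+  // conv_ofmap_tensor() allocates n = input_n * 4 because ps32 keeps the raw
+  // 32-bit partial sums as four int8 planes along n.
+  //
+  // ps32_mode selects the phase of a partial-sum-32, channel-tiled conv:
+  //   2 = begin:        write raw 32-bit partial sums (no bias/relu/shift);
+  //   3 = intermediate: read the partial sums back and keep accumulating;
+  //   1 = end:          fold in bias, ReLU and the rounding right shift to
+  //                     emit the final int8 ofmap;
+  //   0 = ordinary single-pass convolution.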
+    if (total_size <= cvk_ctx->info.lmem_size) {
+      dst->ifmap = conv_ifmap_tensor(cvk_ctx, p);
+      dst->weight = conv_weight_tensor(cvk_ctx, p);
+      dst->ofmap = conv_ofmap_tensor(cvk_ctx, p);
+    } else {
+      dst->ifmap = NULL;
+      dst->weight = NULL;
+      dst->ofmap = NULL;
+    }
+  }
+
+  dst->ps32_mode = ps32_mode;
+  dst->bias = NULL;
+  dst->relu_enable = 0;
+  dst->rshift_bits = 0;
+  if (ps32_mode == 1)
+  {
+    dst->relu_enable = p->bReLU_EN;
+    dst->rshift_bits = p->r_shift_m;
+    // only mode=1 can use bias
+    if (p->using_bias)
+      dst->bias = conv_bias_tensor(cvk_ctx, p);
+  }
+
+  return;
+}
+
+static void make_bmk_conv_param(
+    cvk_context_t *cvk_ctx,
+    cvk_tiu_pt_convolution_param_t *dst,
+    const conv_param_t *p)
+{
+  memset(dst, 0, sizeof(*dst));
+
+  dst->ins_h = p->ins_h;
+  dst->ins_last_h = p->ins_h_last;
+  dst->ins_w = p->ins_w;
+  dst->ins_last_w = p->ins_w_last;
+  dst->pad_top = p->pad_top;
+  dst->pad_bottom = p->pad_bot;
+  dst->pad_left = p->pad_left;
+  dst->pad_right = p->pad_right;
+  dst->stride_h = p->stride_h;
+  dst->stride_w = p->stride_w;
+  dst->dilation_h = p->dh;
+  dst->dilation_w = p->dw;
+  dst->relu_enable = p->bReLU_EN;
+  dst->rshift_bits = p->r_shift_m;
+  dst->ifmap = conv_ifmap_tensor(cvk_ctx, p);
+  dst->weight = conv_weight_tensor(cvk_ctx, p);
+  dst->ofmap = conv_ofmap_tensor(cvk_ctx, p);
+  dst->bias = NULL;
+  dst->ps32_mode = 0;
+  if (p->using_bias)
+    dst->bias = conv_bias_tensor(cvk_ctx, p);
+}
+
+static void free_bmk_conv_param(
+    cvk_context_t *cvk_ctx,
+    cvk_tiu_pt_convolution_param_t *r,
+    const conv_param_t *p)
+{
+  if (p->using_bias && r->bias) {
+    cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->bias);
+    r->bias = NULL;
+  }
+
+  if (r->ofmap) {
+    cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ofmap);
+    r->ofmap = NULL;
+  }
+
+  if (r->weight) {
+    cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->weight);
+    r->weight = NULL;
+  }
+
+  if (r->ifmap) {
+    cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ifmap);
+    r->ifmap = NULL;
+  }
+}
+
+static void init_conv_param(conv_param_t *p)
+{
+  printf("init_conv_param\n");
+
+  memset(p, 0, sizeof(*p));
+
+  p->random_seed = clock();
+  srand(p->random_seed);
+
+retry:
+  p->input_n = 1;
+  p->input_c = rand() % (10) + 2;
+  p->kh = rand() % 7 + 1;
+  p->kw = rand() % 7 + 1;
+  p->input_h = rand() % 10 + p->kh;
+  p->input_w = rand() % 10 + p->kw;
+  p->output_c = rand() % 10 + 3;
+  p->stride_h = rand() % (p->kh) + 1;
+  p->stride_w = rand() % (p->kw) + 1;
+  p->ins_h = rand() % p->kh;
+  p->ins_w = rand() % p->kw;
+  p->ins_h_last = rand() % p->kh;
+  p->ins_w_last = rand() % p->kw;
+  p->dh = rand() % 3 + 1;
+  p->dw = rand() % 3 + 1;
+
+  int kh_ext = conv_kh_ext(p);
+  int kw_ext = conv_kw_ext(p);
+  p->pad_top = rand() % kh_ext;
+  p->pad_bot = rand() % kh_ext;
+  p->pad_left = rand() % kw_ext;
+  p->pad_right = rand() % kw_ext;
+
+  if (!conv_param_is_ok(p)) {
+    printf("retry init_conv_param\n");
+    goto retry;
+  }
+
+  p->using_bias = rand() % 2;
+  p->r_shift_m = rand() % 8;
+  p->bReLU_EN = rand() % 2;
+
+  p->opd0_sign = rand() % 2;
+  p->opd1_sign = 1;
+  p->opd2_sign = 1;
+
+  assert(p->opd1_sign == 1 && p->opd2_sign == 1);
+
+  int ih_ext = conv_ih_ext(p);
+  int iw_ext = conv_iw_ext(p);
+  assert(ih_ext >= kh_ext);
+  assert(iw_ext >= kw_ext);
+}
+
+static void print_conv_param(const conv_param_t *p)
+{
+  printf("%s\n", "Conv parameters:");
+  printf(" %s%d;\n", "p->random_seed = ", p->random_seed);
+
+  printf(" %s%d;\n", "p->input_n = ", p->input_n);
+  printf(" %s%d;\n", "p->input_c = ", p->input_c);
+  printf(" %s%d;\n", "p->input_h = ", p->input_h);
+  printf(" %s%d;\n", "p->input_w =
", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} + +/* Calculate the right shift value, m + * Steps: + * 1. Get the abs() of each weight; + * 2. Summary all the abs() in one kernel; + * 3. Get Log2 of each sum; + * 4. Downward rounding; + * After every r_shift value got, sort and find the middle one. + */ + +static int calc_rshift_m(const conv_param_t *p, int8_t* weight) +{ + int kernel_cnt = p->output_c * p->input_c; + int kernel_size = p->kh * p->kw; + int *kernel_shifts = (int *)malloc(sizeof(int) * kernel_cnt); + if (!kernel_shifts) + return 1; + + memset(kernel_shifts, 0, sizeof(int) * kernel_cnt); + + // Part 1: + // Get right shift value for each kernel + int sum = 0; + for (int i = 0; i < kernel_cnt; i++) { + // Step 1 & 2: Get the sum of abs() + for (int j = 0; j < kernel_size; j++) { + sum += (int)(*weight < 0 ? 
-(*weight) : (*weight)); + weight++; + } + // Step 3 & 4: log2 and downward rounding + sum >>= 1; + while (sum) { + sum >>= 1; + kernel_shifts[i]++; + } + } + + // Part 2: + // Find the middle of all the values + int tag[32] = {0}; + for (int cnt = 0; cnt < kernel_cnt; cnt++) { + tag[kernel_shifts[cnt]]++; + } + + int rshift_m = 0; + int mid = 0; + do { + mid += tag[rshift_m++]; + } while(mid < (kernel_cnt - 1) >> 1); + + free(kernel_shifts); + + return rshift_m - 1; +} + +static int test_ps32_ut( + conv_param_t *p_param, CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + printf(" test_ps32_ut\n"); + int ret = 0; + int8_t *input = alloc_input(p_param); + int8_t *weight = alloc_weight(p_param); + int16_t *bias = alloc_bias(p_param); + int8_t *output_ref = (int8_t *)malloc(sizeof(int) * conv_output_size(p_param)); + if (!input || !weight || !bias || !output_ref) { + ret = -1; + goto fail_exit; + } + + p_param->r_shift_m = calc_rshift_m(p_param, weight); + ret = ps32_m2_conv_ref(p_param, input, weight, output_ref); + if (ret) + goto fail_exit; + + cvk_tiu_pt_convolution_param_t conv_param; + make_bmk_conv_param_ps32(cvk_ctx, &conv_param, p_param, 2); + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param); + + if (tl_alloc_success) { + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, conv_param.ifmap, (uint8_t *)input); + put_conv_weight(rt_handle, cvk_ctx, conv_param.weight, (uint8_t *)weight); + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &conv_param); + + cvk_tl_t ps32_ofmap; + ps32_ofmap = *conv_param.ofmap; + ps32_ofmap.shape.n = ps32_ofmap.shape.n * sizeof(int); + uint8_t *output = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, &ps32_ofmap); + + ret = array_cmp_int8( + " Comparing begin_mode results ...\n", + output_ref, (int8_t *)output, conv_output_size(p_param) * sizeof(int)); + + if (ret) { + print_conv_param(p_param); + printf(" Comparison FAILED\n"); + } + free(output); + } + free_bmk_conv_param(cvk_ctx, &conv_param, p_param); + + printf(" test_ps32_intermediate_mode\n"); + for (int i=0; i < conv_input_size(p_param); i++) + input[i] = rand() % 256 - 128; + + for (int i=0; i < conv_weight_size(p_param); i++) + weight[i] = rand() % 256 - 128; + + ret = ps32_m3_conv_ref(p_param, input, weight, output_ref); + if (ret) + goto fail_exit; + + make_bmk_conv_param_ps32(cvk_ctx, &conv_param, p_param, 3); + tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param); + + if (tl_alloc_success) { + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, conv_param.ifmap, (uint8_t *)input); + put_conv_weight(rt_handle, cvk_ctx, conv_param.weight, (uint8_t *)weight); + + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &conv_param); + + cvk_tl_t ps32_ofmap; + ps32_ofmap = *conv_param.ofmap; + ps32_ofmap.shape.n = ps32_ofmap.shape.n * sizeof(int); + + uint8_t *output = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, &ps32_ofmap); + + ret = array_cmp_int8( + " Comparing intermediate results ...\n", + output_ref, (int8_t *)output, conv_output_size(p_param) * sizeof(int)); + + if (ret) { + print_conv_param(p_param); + printf(" Comparison FAILED\n"); + } + free(output); + } + free_bmk_conv_param(cvk_ctx, &conv_param, p_param); + + printf(" test_ps32_end_mode\n"); + for (int i=0; i < conv_input_size(p_param); i++) + input[i] = rand() % 256 - 128; + + for (int i=0; i < conv_weight_size(p_param); i++) + weight[i] = rand() % 256 - 128; + + ret = ps32_m1_conv_ref(p_param, input, weight, bias, output_ref); + if (ret) + goto fail_exit; + + make_bmk_conv_param_ps32(cvk_ctx, &conv_param, p_param, 1); + + tl_alloc_success = 
bmk_conv_param_alloc_ok(&conv_param, p_param); + + if (tl_alloc_success) { + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, conv_param.ifmap, (uint8_t *)input); + put_conv_weight(rt_handle, cvk_ctx, conv_param.weight, (uint8_t *)weight); + if (p_param->using_bias) { + put_conv_bias(rt_handle, cvk_ctx, conv_param.bias, bias); + } + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &conv_param); + uint8_t *output = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, conv_param.ofmap); + + ret = array_cmp_int8( + " Comparing end results ...\n", + output_ref, (int8_t *)output, conv_output_size(p_param)); + + if (ret) { + print_conv_param(p_param); + printf(" Comparison FAILED\n"); + } + free(output); + } + free_bmk_conv_param(cvk_ctx, &conv_param, p_param); + +fail_exit: + free(input); + free(weight); + free(bias); + free(output_ref); + + return ret; +} + +static int test_ic_tiling_conv( + conv_param_t *p_param, CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + printf(" test tiled ps32 conv\n"); + int ret = 0; + int8_t *input = alloc_input(p_param); + int8_t *weight = alloc_weight(p_param); + int16_t *bias = alloc_bias(p_param); + int8_t *output_ref = (int8_t *)malloc(sizeof(int8_t) * conv_output_size(p_param)); + if (!input || !weight || !bias || !output_ref) { + ret = -1; + goto fail_exit_2; + } + + p_param->r_shift_m = calc_rshift_m(p_param, weight); + ret = conv_ref(p_param, input, weight, bias, output_ref); + if (ret) + goto fail_exit_2; + + cvk_tiu_pt_convolution_param_t conv_tmp_param; + cvk_tiu_pt_convolution_param_t conv_param; + make_bmk_conv_param(cvk_ctx, &conv_param, p_param); + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param); + if (tl_alloc_success) { + if (p_param->using_bias) { + conv_tmp_param.bias = conv_param.bias; + put_conv_bias(rt_handle, cvk_ctx, conv_param.bias, bias); + } + // for ps32_md[1] = 1, relu_enable & rshift_bits need to set to 0 + // so we store those parameters to conv_tmp_para + conv_tmp_param.relu_enable = conv_param.relu_enable; + conv_tmp_param.rshift_bits = conv_param.rshift_bits; + conv_tmp_param.bias = conv_param.bias; + + uint32_t ic_step = 1; + uint32_t n_step = 1; + cvk_tl_t ifmap = *conv_param.ifmap; + cvk_tl_t ofmap = *conv_param.ofmap; + cvk_tg_shape_t s; + s.n = conv_param.ifmap->shape.n; + s.c = conv_param.ifmap->shape.c; + s.h = conv_param.ifmap->shape.h; + s.w = conv_param.ifmap->shape.w; + cvk_tg_t *tg_ifmap = alloc_tensor_dev_mem(rt_handle, cvk_ctx, s, CVK_FMT_I8); + tensor_copy_s2d(rt_handle, tg_ifmap, (uint8_t *)input); + + s.n = conv_param.weight->shape.n; + s.c = conv_param.weight->shape.c; + s.h = conv_param.weight->shape.h; + s.w = conv_param.weight->shape.w; + uint8_t *transformed_weight = + transform_weight(&conv_param.weight->shape, (uint8_t *)weight); + cvk_tg_t *tg_weight = alloc_tensor_dev_mem(rt_handle, cvk_ctx, s, CVK_FMT_I8); + tensor_copy_s2d(rt_handle, tg_weight, (uint8_t *)transformed_weight); + free(transformed_weight); + + cvk_tl_shape_t cur_tl_ifmap_shape = { + n_step, + ic_step, + ifmap.shape.h, + ifmap.shape.w + }; + + cvk_tg_shape_t cur_tg_ifmap_shape = { + cur_tl_ifmap_shape.n, + cur_tl_ifmap_shape.c, + cur_tl_ifmap_shape.h, + cur_tl_ifmap_shape.w + }; + + cvk_tg_stride_t cur_tg_ifmap_stride = { + tg_ifmap->stride.n, + tg_ifmap->stride.c, + tg_ifmap->stride.h, + 1 + }; + + cvk_tg_t cur_tg_ifmap; + cur_tg_ifmap.base_reg_index = 0; + cur_tg_ifmap.start_address = tg_ifmap->start_address; + cur_tg_ifmap.shape = cur_tg_ifmap_shape; + cur_tg_ifmap.stride = cur_tg_ifmap_stride; + cur_tg_ifmap.fmt = CVK_FMT_I8; + + 
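+    /*
+     * Tiling plan for the loop below: ci walks the input channels in steps
+     * of ic_step (1 here). Each step DMAs one ic slice of the ifmap and of
+     * the (1, oc, kh*kw, ic) weight into local memory, then issues the
+     * convolution with ps32_mode 2 / 3 / 1 for the first / middle / last
+     * slice, accumulating 32-bit partial sums in place in the shared ofmap.
+     */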
+    cvk_tl_t cur_tl_ifmap;
+    cur_tl_ifmap.shape = cur_tl_ifmap_shape;
+    cur_tl_ifmap.stride =
+        cvk_ctx->ops->tl_default_stride(cvk_ctx, cur_tl_ifmap_shape, CVK_FMT_I8, 1);
+    cur_tl_ifmap.start_address = ifmap.start_address;
+    cur_tl_ifmap.fmt = ifmap.fmt;
+
+    cvk_tl_t cur_tl_ofmap;
+    cur_tl_ofmap.start_address = ofmap.start_address;
+    cur_tl_ofmap.shape = ofmap.shape;
+    cur_tl_ofmap.shape.n = n_step;
+    cur_tl_ofmap.stride =
+        cvk_ctx->ops->tl_default_stride(cvk_ctx, cur_tl_ofmap.shape, CVK_FMT_I8, 1);
+    cur_tl_ofmap.fmt = ofmap.fmt;
+
+    cvk_tl_t cur_tl_weight;
+    cur_tl_weight.start_address = conv_param.weight->start_address;
+    cur_tl_weight.shape = conv_param.weight->shape;
+    cur_tl_weight.shape.n = ic_step;
+    cur_tl_weight.stride.n = 1;
+    cur_tl_weight.stride.c = cur_tl_weight.shape.n * cur_tl_weight.shape.h * cur_tl_weight.shape.w;
+    cur_tl_weight.stride.h = cur_tl_weight.shape.n * cur_tl_weight.shape.w;
+    cur_tl_weight.stride.w = cur_tl_weight.shape.n;
+    cur_tl_weight.fmt = conv_param.weight->fmt;
+
+    const cvk_tl_t *saved_tl_weight = conv_param.weight;
+    const cvk_tl_t *saved_tl_ifmap = conv_param.ifmap;
+    for (uint32_t ci = 0; ci < ifmap.shape.c; ci += ic_step) {
+      {
+        uint32_t ic = tg_weight->shape.n;
+        uint32_t oc = tg_weight->shape.c;
+        uint32_t kh = tg_weight->shape.h;
+        uint32_t kw = tg_weight->shape.w;
+
+        cvk_tg_t cur_tdma_tg_weight;
+        cur_tdma_tg_weight.base_reg_index = tg_weight->base_reg_index;
+        cur_tdma_tg_weight.start_address = tg_weight->start_address + ci;
+        cur_tdma_tg_weight.fmt = tg_weight->fmt;
+        cur_tdma_tg_weight.shape = tg_shape_t4(1, oc, kh * kw, ic);
+        cur_tdma_tg_weight.stride =
+            cvk_ctx->ops->tg_default_stride(cvk_ctx, cur_tdma_tg_weight.shape, cur_tdma_tg_weight.fmt);
+        cur_tdma_tg_weight.shape = tg_shape_t4(1, oc, kh * kw, ic_step);
+
+        cvk_tl_t cur_tdma_tl_weight;
+        cur_tdma_tl_weight = cur_tl_weight;
+        cur_tdma_tl_weight.shape.n = cur_tdma_tg_weight.shape.n;
+        cur_tdma_tl_weight.shape.c = cur_tdma_tg_weight.shape.c;
+        cur_tdma_tl_weight.shape.h = cur_tdma_tg_weight.shape.h;
+        cur_tdma_tl_weight.shape.w = cur_tdma_tg_weight.shape.w;
+        cur_tdma_tl_weight.stride = cvk_ctx->ops->tl_default_stride(
+            cvk_ctx, cur_tdma_tl_weight.shape, CVK_FMT_I8, 0);
+
+        cvk_tdma_g2l_tensor_copy_param_t p1;
+        memset(&p1, 0, sizeof(p1));
+        p1.src = &cur_tdma_tg_weight;
+        p1.dst = &cur_tdma_tl_weight;
+        cvk_ctx->ops->tdma_g2l_tensor_copy(cvk_ctx, &p1);
+        CVI_RT_Submit(cvk_ctx);
+      }
+      {
+        cvk_tdma_g2l_tensor_copy_param_t p2;
+        memset(&p2, 0, sizeof(p2));
+        cur_tg_ifmap.start_address =
+            tg_ifmap->start_address + ci * tg_ifmap->stride.c;
+        p2.src = &cur_tg_ifmap;
+        p2.dst = &cur_tl_ifmap;
+        cvk_ctx->ops->tdma_g2l_tensor_copy(cvk_ctx, &p2);
+        CVI_RT_Submit(cvk_ctx);
+      }
+
+      conv_param.ifmap = &cur_tl_ifmap;
+      conv_param.weight = &cur_tl_weight;
+
+      // For ps32 modes 2 and 3 (ps32_md[1] == 1), relu_enable & rshift_bits
+      // need to be set to 0, so the real values saved in conv_tmp_param are
+      // restored only for the final (ps32_mode == 1) slice.
+      if (ci == (ifmap.shape.c - 1))
+      {
+        conv_param.relu_enable = conv_tmp_param.relu_enable;
+        conv_param.rshift_bits = conv_tmp_param.rshift_bits;
+        conv_param.bias = conv_tmp_param.bias;
+        conv_param.ps32_mode = 1;
+      }
+      else if (ci == 0)
+      {
+        conv_param.relu_enable = 0;
+        conv_param.rshift_bits = 0;
+        conv_param.bias = NULL;
+        conv_param.ps32_mode = 2;
+      }
+      else
+      {
+        conv_param.relu_enable = 0;
+        conv_param.rshift_bits = 0;
+        conv_param.bias = NULL;
+        conv_param.ps32_mode = 3;
+      }
+      cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &conv_param);
+      conv_param.weight = saved_tl_weight;
+      conv_param.ifmap = saved_tl_ifmap;
+    }
+
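+    /*
+     * After the final ps32_mode == 1 slice the ofmap already holds the
+     * saturated int8 result, so a plain local-to-global copy is enough;
+     * no four-plane reassembly is needed here.
+     */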
+    uint8_t *output = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, conv_param.ofmap);
+
+    free_tensor_dev_mem(rt_handle, tg_ifmap);
+    free_tensor_dev_mem(rt_handle, tg_weight);
+    ret = array_cmp_int8(
+        " Comparing results ...\n",
+        output_ref, (int8_t *)output, conv_output_size(p_param));
+
+    if (ret) {
+      print_conv_param(p_param);
+      printf(" Comparison FAILED\n");
+    }
+    free(output);
+  }
+  free_bmk_conv_param(cvk_ctx, &conv_param, p_param);
+
+fail_exit_2:
+  free(input);
+  free(weight);
+  free(output_ref);
+  free(bias);
+
+  return ret;
+}
+
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  for (int i = 0; i < 5; i++) {
+    printf("random_test_conv iteration: %d\n", i);
+    conv_param_t test_conv_param;
+    init_conv_param(&test_conv_param);
+    ret |= test_ic_tiling_conv(&test_conv_param, rt_handle, cvk_ctx);
+    ret |= test_ps32_ut(&test_conv_param, rt_handle, cvk_ctx);
+    if (!test_conv_param.using_bias)
+      test_conv_param.using_bias = 1;
+    if (test_conv_param.output_c <= 9)
+      test_conv_param.output_c += 3;
+    ret |= test_ic_tiling_conv(&test_conv_param, rt_handle, cvk_ctx);
+    ret |= test_ps32_ut(&test_conv_param, rt_handle, cvk_ctx);
+  }
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/181x/test_181x_conv_qm.c b/cviruntime/test/181x/test_181x_conv_qm.c
new file mode 100644
index 000000000..9db8764bf
--- /dev/null
+++ b/cviruntime/test/181x/test_181x_conv_qm.c
@@ -0,0 +1,1568 @@
+#include <assert.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "test_cvikernel_util.h"
+#include "test_tf_quant_util.h"
+#include "test_native_ref.h"
+
+// #define ENABLE_DEBUG_MSG
+// #define ENABLE_FULL_REGRESSION
+
+#define MIN_EXEC_TESTS 20
+
+typedef struct {
+  int input_n;
+  int input_c;
+  int input_h;
+  int input_w;
+  int kw;
+  int kh;
+  int dh;
+  int dw;
+  int pad_top;
+  int pad_bot;
+  int pad_left;
+  int pad_right;
+  int ins_h;
+  int ins_h_last;
+  int ins_w;
+  int ins_w_last;
+  int stride_h;
+  int stride_w;
+  int output_c;
+  int output_h;
+  int output_w;
+  int has_bias;
+  int relu_enable;
+  int8_t *input_data;
+  int8_t *filter_data;
+  int8_t *output_data;
+  int32_t *bias_data;
+  uint32_t *multiplier_data;
+  int8_t *shift_data;
+  float float_multiplier;
+  int retry_cnt;
+} conv_test_param_t;
+
+static inline int Offset(cvk_tl_shape_t shape, int n, int c, int h, int w)
+{
+  return n * (shape.c * shape.h * shape.w) + c * (shape.h * shape.w) +
+         h * shape.w + w;
+}
+
+void conv_per_channel_ref(conv_test_param_t *p_param)
+{
+  const int stride_width = p_param->stride_w;
+  const int stride_height = p_param->stride_h;
+  const int dilation_width_factor = 1;
+  const int dilation_height_factor = 1;
+  const int pad_width = p_param->pad_left;
+  const int pad_height = p_param->pad_top;
+
+  const int32_t output_activation_min = -128;
+  const int32_t output_activation_max = 127;
+
+  const int batches = p_param->input_n;
+  const int input_depth = p_param->input_c;
+  const int output_depth = p_param->output_c;
+
+  const int input_height = p_param->input_h;
+  const int input_width = p_param->input_w;
+  const int filter_height = p_param->kh;
+  const int filter_width = p_param->kw;
+  const int output_height = p_param->output_h;
+  const int output_width =
p_param->output_w; + int8_t *input_data = p_param->input_data; + int8_t *filter_data = p_param->filter_data; + int8_t *output_data = p_param->output_data; + int32_t *bias_data = p_param->has_bias ? p_param->bias_data : NULL; + uint32_t *output_multiplier = p_param->multiplier_data; + int8_t *output_rshift = p_param->shift_data; + + cvk_tl_shape_t input_shape = { + batches, input_depth, input_height, input_width}; + cvk_tl_shape_t filter_shape = { + output_depth, filter_height, filter_width, input_depth}; + cvk_tl_shape_t output_shape = { + batches, output_depth, output_height, output_width}; + +#ifdef ENABLE_DEBUG_MSG + printf("conv_per_channel_ref: \n" + " input (n=%d, ic=%d, h=%d, w=%d)\n" + " kernel (oc=%d, kh=%d, kw=%d, ic=%d)\n", + batches, input_depth, input_height, input_width, output_depth, + filter_height, filter_width, input_depth); +#endif + + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + if (is_point_inside_image) { + int32_t input_val = input_data[Offset(input_shape, batch, + in_channel, in_y, in_x)]; + // int32_t filter_val = filter_data[Offset(filter_shape, + // out_channel, in_channel, + // filter_y, filter_x)]; + int32_t filter_val = + filter_data[Offset(filter_shape, out_channel, filter_y, + filter_x, in_channel)]; + acc += filter_val * input_val; + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d]" + "[filter_y=%d][filter_x=%d][in_channel=%d] acc(%d) += " + "%d * %d = %d\n", + batch, out_channel, out_y, out_x, filter_y, filter_x, + in_channel, acc - filter_val * input_val, filter_val, + input_val, acc); +#endif + } + } + } + } + + if (bias_data) { + acc += bias_data[out_channel]; + } + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d] acc = %d, " + "bias %d\n", + batch, out_channel, out_y, out_x, acc, + bias_data ? 
bias_data[out_channel] : 0); +#endif + + acc = MultiplyByQuantizedMultiplier( + acc, output_multiplier[out_channel], output_rshift[out_channel]); + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d] acc = %d, " + "multiplier %d, shift %d\n", + batch, out_channel, out_y, out_x, acc, + output_multiplier[out_channel], output_rshift[out_channel]); +#endif + + acc = MAX(acc, output_activation_min); + acc = MIN(acc, output_activation_max); + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d] acc = %d\n", + batch, out_channel, out_y, out_x, acc); +#endif + + output_data[Offset(output_shape, batch, out_channel, out_y, out_x)] = + (int8_t)acc; + } + } + } + } +} + +void calc_conv_float_multiplier(conv_test_param_t *p_param) +{ + const int stride_width = p_param->stride_w; + const int stride_height = p_param->stride_h; + const int dilation_width_factor = 1; + const int dilation_height_factor = 1; + const int pad_width = p_param->pad_left; + const int pad_height = p_param->pad_top; + + const int batches = p_param->input_n; + const int input_depth = p_param->input_c; + const int output_depth = p_param->output_c; + + const int input_height = p_param->input_h; + const int input_width = p_param->input_w; + const int filter_height = p_param->kh; + const int filter_width = p_param->kw; + const int output_height = p_param->output_h; + const int output_width = p_param->output_w; + int8_t *input_data = p_param->input_data; + int8_t *filter_data = p_param->filter_data; + int32_t *bias_data = p_param->has_bias ? p_param->bias_data : NULL; + + cvk_tl_shape_t input_shape = { + batches, input_depth, input_height, input_width}; + cvk_tl_shape_t filter_shape = { + output_depth, filter_height, filter_width, input_depth}; + + int output_accu_min = INT_MAX; + int output_accu_max = INT_MIN; + +#ifdef ENABLE_DEBUG_MSG + printf("calc_conv_float_multiplier =>\n"); +#endif + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + if (is_point_inside_image) { + int32_t input_val = input_data[Offset(input_shape, batch, + in_channel, in_y, in_x)]; + // int32_t filter_val = filter_data[Offset(filter_shape, + // out_channel, in_channel, + // filter_y, filter_x)]; + int32_t filter_val = + filter_data[Offset(filter_shape, out_channel, filter_y, + filter_x, in_channel)]; + acc += filter_val * input_val; + + // printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d]" + // "[filter_y=%d][filter_x=%d][in_channel=%d] acc(%d) + // += %d * %d = %d\n", batch, out_channel, out_y, + // out_x, filter_y, filter_x, in_channel, acc - + // filter_val * input_val, filter_val, input_val, acc); + } + } + } + } + + if (bias_data) { + acc += bias_data[out_channel]; + } + + output_accu_max = MAX(acc, output_accu_max); + 
output_accu_min = MIN(acc, output_accu_min); + } + } + } + } + + // Since int8 ranges from -128 to 127, choose a multiplier that squeezes + // the accumulator MIN/MAX into that range as tightly as possible. + if (abs(output_accu_max) > abs(output_accu_min)) { + p_param->float_multiplier = 127.0f / abs(output_accu_max); + } else { + p_param->float_multiplier = 128.0f / abs(output_accu_min); + } + +#ifdef ENABLE_DEBUG_MSG + printf(" output_accu_min %d, output_accu_max %d, output_multiplier %f\n", + output_accu_min, output_accu_max, p_param->float_multiplier); +#endif + +#ifdef ENABLE_DEBUG_MSG + printf("<= calc_conv_float_multiplier\n"); +#endif +}
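 + +/* + * Illustrative sketch (not part of the original test): the per-channel + * requantization step maps a 32-bit accumulator to int8 as, conceptually, + * out = clamp(round(acc * real_multiplier), -128, 127), where + * real_multiplier = (multiplier / 2^31) * 2^-rshift, assuming that + * QuantizeMultiplierSmallerThanOne() follows the usual TFLite convention + * and that rshift >= 0. A plain floating-point reference of that mapping: + */ +static inline int32_t requant_ref_float(int32_t acc, uint32_t multiplier, int8_t rshift) +{ + double m = ((double)multiplier / 2147483648.0) / (double)(1 << rshift); + double v = (double)acc * m; + v = (v >= 0.0) ? (v + 0.5) : (v - 0.5); /* round half away from zero */ + if (v > 127.0) v = 127.0; + if (v < -128.0) v = -128.0; + return (int32_t)v; +}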
 + +int simple_test(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + + const int batches = 1; + const int input_depth = 2; + const int input_height = 2; + const int input_width = 3; + cvk_tl_shape_t input_shape = {batches, input_depth, input_height, input_width}; + int8_t input_data[12] = { + 9, 1, -11, // ic = 0, h = 0 + 13, 5, -15, // ic = 0, h = 1 + 5, -7, -15, // ic = 1, h = 0 + 9, -11, -19 // ic = 1, h = 1 + }; + + const int output_depth = 2; + const int kernel_height = 2; + const int kernel_width = 2; + cvk_tl_shape_t filter_shape = {output_depth, input_depth, kernel_height, + kernel_width}; + + // TIU weight layout (1, oc, kh*kw, ic) + cvk_tl_shape_t filter_shape_for_dma = {1, output_depth, + kernel_height * kernel_width, input_depth}; + int8_t filter_data_for_dma[16] = { + 2, 4, 6, 8, 6, 8, 10, 12, // oc = 0 + 28, 32, 20, 24, 12, 16, 4, 8 // oc = 1 + }; + + int32_t bias_data[2] = {12, -16}; + + const int output_height = 1; + const int output_width = 2; + cvk_tl_shape_t output_shape = {1, output_depth, output_height, output_width}; + // zero_point = 0 + int8_t ref_output_data[4] = { + 17, -128, // oc = 0 + 60, -128, // oc = 1 + }; + + uint32_t output_multiplier[] = {1073741824, 1073741824}; + int8_t output_rshift[2] = {1, 2}; // changed to right shift + + int8_t output_data[4]; + + conv_test_param_t params; + memset(&params, 0, sizeof(params)); + + params.input_n = batches; + params.input_c = input_depth; + params.input_h = input_height; + params.input_w = input_width; + params.kh = kernel_height; + params.kw = kernel_width; + params.output_c = output_depth; + params.output_h = output_height; + params.output_w = output_width; + params.stride_w = 1; + params.stride_h = 1; + params.input_data = input_data; + params.filter_data = filter_data_for_dma; + params.output_data = output_data; + params.has_bias = 1; + params.bias_data = bias_data; + params.multiplier_data = output_multiplier; + params.shift_data = output_rshift; + conv_per_channel_ref(&params); + + printf("Compare ref and golden\n"); + for (int i = 0; i < 4; i++) { + if (output_data[i] != ref_output_data[i]) { + printf("Error! output[%d]=%d != ref_output_data[%d]=%d\n", i, + output_data[i], i, ref_output_data[i]); + ret = -1; + } + } + + // cvk_tl_shape_t per_channel_cal_shape = {1, /*oc=*/2, 1, 9}; + uint8_t per_channel_cal_data[18]; + pack_chl_quan_param(2, /*has_bias=*/true, bias_data, output_multiplier, + output_rshift, per_channel_cal_data); + + cvk_tl_shape_t quan_param_shape = {1, 2, 1, 9}; + cvk_tl_t *tl_per_channel_cal = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, quan_param_shape, CVK_FMT_U8, + /*eu_align*/ 0); + + cvk_tl_t *tl_input = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, input_shape, CVK_FMT_I8, /*eu_align=*/1); + + cvk_tl_t *tl_filter = cvk_ctx->ops->lmem_alloc_tensor( + cvk_ctx, filter_shape_for_dma, CVK_FMT_I8, /*eu_align=*/1); + + cvk_tl_t *tl_output = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, output_shape, CVK_FMT_I8, /*eu_align=*/1); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_per_channel_cal, per_channel_cal_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_input, (uint8_t *)input_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_filter, (uint8_t *)filter_data_for_dma); + + // Restore filter shape for tiu operation + tl_filter->shape = filter_shape; + tl_filter->stride = cvk_ctx->ops->tl_default_stride( + cvk_ctx, tl_filter->shape, CVK_FMT_I8, /*eu_align=*/1); + + { + // Reshape per channel quantization data + tl_per_channel_cal->shape = tl_shape_t4(1, 2, 1, 1); + tl_per_channel_cal->stride = cvk_ctx->ops->tl_default_stride( + cvk_ctx, tl_per_channel_cal->shape, CVK_FMT_I8, /*eu_align=*/0); + + cvk_tiu_convolution_param_t param; + memset(&param, 0, sizeof(param)); + param.ofmap = tl_output; + param.ifmap = tl_input; + param.weight = tl_filter; + param.chl_quan_param = tl_per_channel_cal; + param.stride_h = 1; + param.stride_w = 1; + param.dilation_h = 1; + param.dilation_w = 1; + param.has_bias = 1; + cvk_ctx->ops->tiu_convolution(cvk_ctx, &param); + } + + CVI_RT_Submit(cvk_ctx); + + printf("Compare tiu and golden\n"); + int8_t *conv_output_data = + (int8_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_output); + for (int i = 0; i < (int)sizeof(ref_output_data); i++) { + if (conv_output_data[i] != ref_output_data[i]) { + printf("output_data[%d] %d != %d\n", i, conv_output_data[i], + ref_output_data[i]); + ret = -1; + } + } + + free(conv_output_data); + + // Reverse order + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_filter); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_per_channel_cal); + + return ret; +} + +void fill_random_data_s8(int8_t *input_data, int size) +{ + for (int i = 0; i < size; ++i) { + int is_satured = ((rand() % 1000) == 1) ? 1 : 0; + int is_sign = rand() % 2 ? 1 : -1; + + if (is_satured && is_sign < 0) { + input_data[i] = -128; + } else if (is_satured) { + input_data[i] = 127; + } else { + input_data[i] = is_sign * (rand() % 128); + } + } +}
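 + +/* + * The fillers above and below force roughly 1 in 1000 values to a saturated + * extreme (-128/127 here, INT_MIN/INT_MAX below), with a separate coin for + * the sign, so requantization clamping paths are exercised while most of the + * data stays small. + */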
 +void fill_random_data_s32(int32_t *input_data, int size) +{ + for (int i = 0; i < size; ++i) { + int is_satured = ((rand() % 1000) == 1) ? 1 : 0; + int is_sign = rand() % 2 ? 1 : -1; + + if (is_satured && is_sign < 0) { + input_data[i] = INT_MIN; + } else if (is_satured) { + input_data[i] = INT_MAX; + } else { + input_data[i] = is_sign * (rand() % 128); + } + } +} + +bool check_valid_test_param(cvk_context_t *cvk_ctx, conv_test_param_t *p_param) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int oh = p_param->output_h; + int ow = p_param->output_w; + int kh = p_param->kh; + int kw = p_param->kw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int chl_quan_per_lane_data_size = + p_param->has_bias ? 9 : 5; // bias(4) + multiplier(4) + shift(1) + + // Skip invalid shape + if ((kh > ih) || (kw > iw) || (stride_h > ih) || (stride_w > iw)) { + return false; + } + + // products of randomly chosen dimensions may exceed int32_t, so use uint32_t + uint32_t input_size = in * ic * ih * iw; + uint32_t kernel_size = oc * ic * kh * kw; + uint32_t output_size = in * oc * oh * ow; + + uint32_t lmem_size_per_lane = cvk_ctx->info.lmem_size; + uint32_t total_lmem_size = cvk_ctx->info.lmem_size * cvk_ctx->info.npu_num; + + uint32_t total_needed_size = input_size + kernel_size + output_size + + chl_quan_per_lane_data_size * cvk_ctx->info.npu_num; + if (total_needed_size > total_lmem_size) { + return false; + } + + cvk_tl_shape_t input_shape = tl_shape_t4(in, ic, ih, iw); + cvk_tl_shape_t filter_shape = tl_shape_t4(1, oc, kh * kw, ic); + cvk_tl_shape_t output_shape = tl_shape_t4(in, oc, oh, ow); + cvk_tl_shape_t cal_shape = tl_shape_t4(1, oc, 1, chl_quan_per_lane_data_size); + + uint32_t needed_size = + cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, input_shape, CVK_FMT_I8, /*eu_align=*/1) + + cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, filter_shape, CVK_FMT_I8, /*eu_align=*/0) + + cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, output_shape, CVK_FMT_I8, /*eu_align=*/1) + + cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, cal_shape, CVK_FMT_I8, /*eu_align=*/0); + + // Skip invalid shape + if (needed_size > lmem_size_per_lane) { + return false; + } + + return true; +}
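 + +/* + * Illustrative sketch (an assumption, not taken from the original source): + * the packed per-channel quantization blob produced by pack_chl_quan_param() + * is taken here to hold, per output channel, the 9 bytes counted above + * (5 when has_bias == 0): + */ +struct chl_quan_entry { + int32_t bias; /* omitted when has_bias == 0 */ + uint32_t multiplier; /* fixed-point output multiplier */ + int8_t rshift; /* right-shift amount */ +} __attribute__((packed)); /* 4 + 4 + 1 = 9 bytes per lane */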
 +int choose_from_range(int table[], int size, int index) +{ + if (index >= size) { + return 0; + } + + int val = table[index]; + if (index < (size - 1)) { + int range = MAX(table[index + 1] - table[index] - 1, 1); + val += rand() % range; + } + + return val; +} + +void dump_test_param(conv_test_param_t *p_param, bool dump_content) +{ + printf("Dump test parameter:\n"); + printf(" input_n %d\n", p_param->input_n); + printf(" input_c %d\n", p_param->input_c); + printf(" input_h %d\n", p_param->input_h); + printf(" input_w %d\n", p_param->input_w); + printf(" kw %d\n", p_param->kw); + printf(" kh %d\n", p_param->kh); + printf(" dh %d\n", p_param->dh); + printf(" dw %d\n", p_param->dw); + printf(" pad_top %d\n", p_param->pad_top); + printf(" pad_bot %d\n", p_param->pad_bot); + printf(" pad_left %d\n", p_param->pad_left); + printf(" pad_right %d\n", p_param->pad_right); + printf(" ins_h %d\n", p_param->ins_h); + printf(" ins_h_last %d\n", p_param->ins_h_last); + printf(" ins_w %d\n", p_param->ins_w); + printf(" ins_w_last %d\n", p_param->ins_w_last); + printf(" stride_h %d\n", p_param->stride_h); + printf(" stride_w %d\n", p_param->stride_w); + printf(" output_c %d\n", p_param->output_c); + printf(" output_h %d\n", p_param->output_h); + printf(" output_w %d\n", p_param->output_w); + printf(" has_bias %d\n", p_param->has_bias); + printf(" relu_enable %d\n", p_param->relu_enable); + + if (dump_content) { + printf("input_data(%d, %d, %d, %d) :\n", p_param->input_n, p_param->input_c, + p_param->input_h, p_param->input_w); + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + for (int i = 0; i < in; ++i) { + for (int j = 0; j < ic; ++j) { + for (int k = 0; k < ih; ++k) { + for (int l = 0; l < iw; ++l) { + int offset = i * (ic * ih * iw) + j * (ih * iw) + k * iw + l; + printf("%d, ", p_param->input_data[offset]); + } + printf("\n"); + } + } + } + printf("\n\n"); + + printf("kernel_data (oc=%d, kh=%d, kw=%d, ic=%d)\n", p_param->output_c, + p_param->kh, p_param->kw, p_param->input_c); + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + for (int i = 0; i < oc; ++i) { + for (int j = 0; j < kh; ++j) { + for (int k = 0; k < kw; ++k) { + for (int l = 0; l < ic; ++l) { + int offset = i * (kh * kw * ic) + j * (kw * ic) + k * ic + l; + printf("%d, ", p_param->filter_data[offset]); + } + printf("\n"); + } + } + } + printf("\n\n"); + + if (p_param->has_bias) { + printf("bias_data:\n"); + for (int i = 0; i < oc; ++i) { + printf("%d, ", p_param->bias_data[i]); + } + printf("\n\n"); + } + + printf("multiplier_data:\n"); + for (int i = 0; i < oc; ++i) { + printf("%d, ", p_param->multiplier_data[i]); + } + printf("\n\n"); + + printf("shift_data:\n"); + for (int i = 0; i < oc; ++i) { + printf("%d, ", p_param->shift_data[i]); + } + printf("\n\n"); + } +} + + + +static conv_test_param_t keepFailParam; +static int8_t *keep_input_data = NULL; + +static int keep_kernel_size = 0; +static int8_t *keep_kernel_data = NULL; + +static int keep_output_size = 0; +static int8_t *keep_output_data = NULL; + +static int32_t *keep_bias_data = NULL; +static uint32_t *keep_multiplier_data = NULL; +static int8_t *keep_shift_data = NULL; + + +int keep_fail_param(conv_test_param_t *p_param) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int oh = p_param->output_h; + int ow = p_param->output_w; + int kh = p_param->kh; + int kw = p_param->kw; + //int dh = p_param->dh; + //int dw = p_param->dw; + //int pad_top = p_param->pad_top; + //int pad_bot = p_param->pad_bot; + //int pad_left = p_param->pad_left; + //int pad_right = p_param->pad_right; + //int ins_h = p_param->ins_h; + //int ins_last_h = p_param->ins_h_last; + //int ins_w = p_param->ins_w; + //int ins_last_w = p_param->ins_w_last; + //int stride_h = p_param->stride_h; + //int stride_w = p_param->stride_w; + int has_bias = p_param->has_bias; + //int relu_enable = p_param->relu_enable; + + + memcpy(&keepFailParam, p_param, sizeof(conv_test_param_t)); + + int input_size = in * ic * iw * ih; + keep_input_data = (int8_t *)malloc(input_size); + memcpy(keep_input_data, p_param->input_data, input_size); + + + keep_kernel_size = oc * ic * kh * kw; + keep_kernel_data = (int8_t *)malloc(keep_kernel_size); + memcpy(keep_kernel_data, p_param->filter_data, keep_kernel_size); + + keep_output_size = in * oc * oh * ow; + keep_output_data = (int8_t *)malloc(keep_output_size); + memcpy(keep_output_data, p_param->output_data, keep_output_size); + + keep_bias_data = (int32_t *) malloc(sizeof(int32_t) * oc); + memcpy(keep_bias_data, p_param->bias_data, sizeof(int32_t) * oc); + + keep_multiplier_data = (uint32_t *) malloc(sizeof(uint32_t) * oc); + memcpy(keep_multiplier_data, p_param->multiplier_data, sizeof(int32_t) * oc); + + keep_shift_data = (int8_t *)malloc(oc); + memcpy(keep_shift_data, p_param->shift_data, oc); + + + + 
keepFailParam.input_data = keep_input_data; + keepFailParam.filter_data = keep_kernel_data; + keepFailParam.output_data = keep_output_data; + keepFailParam.has_bias = has_bias; + keepFailParam.bias_data = keep_bias_data; + keepFailParam.multiplier_data = keep_multiplier_data; + keepFailParam.shift_data = keep_shift_data; + + return 0; +} + + +void dump2_test_param(conv_test_param_t *p_param) +{ + printf("dump2_test_param:\n"); + printf(" input_n %d\n", p_param->input_n); + printf(" input_c %d\n", p_param->input_c); + printf(" input_h %d\n", p_param->input_h); + printf(" input_w %d\n", p_param->input_w); + printf(" kw %d\n", p_param->kw); + printf(" kh %d\n", p_param->kh); + printf(" dh %d\n", p_param->dh); + printf(" dw %d\n", p_param->dw); + printf(" pad_top %d\n", p_param->pad_top); + printf(" pad_bot %d\n", p_param->pad_bot); + printf(" pad_left %d\n", p_param->pad_left); + printf(" pad_right %d\n", p_param->pad_right); + printf(" ins_h %d\n", p_param->ins_h); + printf(" ins_h_last %d\n", p_param->ins_h_last); + printf(" ins_w %d\n", p_param->ins_w); + printf(" ins_w_last %d\n", p_param->ins_w_last); + printf(" stride_h %d\n", p_param->stride_h); + printf(" stride_w %d\n", p_param->stride_w); + printf(" output_c %d\n", p_param->output_c); + printf(" output_h %d\n", p_param->output_h); + printf(" output_w %d\n", p_param->output_w); + printf(" has_bias %d\n", p_param->has_bias); + printf(" relu_enable %d\n", p_param->relu_enable); + + keep_fail_param(p_param); + printf("dump2_test_param\n\n"); + assert(0); +} + +int run_compare_conv(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, + conv_test_param_t *p_param) +{ + int ret = 0; + + if (rt_handle == NULL || cvk_ctx == NULL) { + return -1; + } + + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int oh = p_param->output_h; + int ow = p_param->output_w; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_last_h = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_last_w = p_param->ins_w_last; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int has_bias = p_param->has_bias; + int relu_enable = p_param->relu_enable; + + int input_size = in * ic * iw * ih; + int8_t *input_data = (int8_t *)malloc(input_size); + + int kernel_size = oc * ic * kh * kw; + int8_t *kernel_data = (int8_t *)malloc(kernel_size); + + int output_size = in * oc * oh * ow; + int8_t *output_data = (int8_t *)malloc(output_size); + memset(output_data, 0, output_size); + + int32_t *bias_data = (int32_t *) malloc(sizeof(int32_t) * oc); + uint32_t *multiplier_data = (uint32_t *) malloc(sizeof(uint32_t) * oc); + int8_t *shift_data = (int8_t *)malloc(oc); + + p_param->input_data = input_data; + p_param->filter_data = kernel_data; + p_param->output_data = output_data; + p_param->has_bias = has_bias; + p_param->bias_data = bias_data; + p_param->multiplier_data = multiplier_data; + p_param->shift_data = shift_data; + +#ifdef ENABLE_DEBUG_MSG + printf(" run_compare_conv =>\n"); + printf(" input (n=%d, ic=%d, h=%d, w=%d), kernel (oc=%d, ic=%d, h=%d, " + "w=%d), output (, c=%d, h=%d, w=%d), has_bias %d\n", + in, ic, ih, iw, oc, ic, kh, kw, oc, oh, ow, has_bias); +#endif + + int retry_cnt = p_param->retry_cnt; + do { + 
fill_random_data_s8(input_data, input_size); + fill_random_data_s8(kernel_data, kernel_size); + if (has_bias) { + fill_random_data_s32(bias_data, oc); + } + + p_param->float_multiplier = 100.0; // sentinel; a valid multiplier must end up < 1.0 + calc_conv_float_multiplier(p_param); + + if (p_param->float_multiplier > 0.f && p_param->float_multiplier < 1.0) { + break; + } + + } while (--retry_cnt); + + if (p_param->float_multiplier >= 1.0) { + printf(" run_compare_conv: unable to find valid multiplier\n"); + free(input_data); + free(kernel_data); + free(output_data); + free(bias_data); + free(multiplier_data); + free(shift_data); + + return -1; + } + + uint32_t base_multiplier = 0; + int base_shift = 0; + QuantizeMultiplierSmallerThanOne(p_param->float_multiplier, &base_multiplier, + &base_shift); + + for (int i = 0; i < oc; ++i) { + // multipliers typically range in [2^30 ; 2^31 - 1]. + // Values in [0, 2^30 - 1] are normally unused, but harmless. + // Thus a good way to randomize multipliers is to subtract from them + // a random value smaller than 2^30 but still significant compared to it. + p_param->multiplier_data[i] = base_multiplier - (rand() % (1 << 26)); + + int right_shift = base_shift - 1 + (rand() % 4); + p_param->shift_data[i] = + truncate_rshift((int8_t)right_shift, /*allow_lshift*/1); + +#ifdef ENABLE_DEBUG_MSG + printf(" [oc=%d] multiplier_data %d, shift_data %d\n", i, + p_param->multiplier_data[i], p_param->shift_data[i]); +#endif + } + + conv_per_channel_ref(p_param); + + const int chl_quan_per_lane_data_size = + p_param->has_bias ? 9 : 5; // bias(4) + multiplier(4) + shift(1) + const int cal_data_size = oc * chl_quan_per_lane_data_size; + uint8_t *chl_quan_data = (uint8_t *) malloc(cal_data_size); + pack_chl_quan_param(oc, p_param->has_bias, p_param->bias_data, + p_param->multiplier_data, p_param->shift_data, + chl_quan_data); + + cvk_tl_shape_t input_shape = tl_shape_t4(in, ic, ih, iw); + cvk_tl_shape_t filter_shape = tl_shape_t4(1, oc, kh * kw, ic); + cvk_tl_shape_t output_shape = tl_shape_t4(in, oc, oh, ow); + cvk_tl_shape_t cal_shape = tl_shape_t4(1, oc, 1, chl_quan_per_lane_data_size); + + cvk_tl_t *tl_input = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, input_shape, CVK_FMT_I8, /*eu_align=*/1); + + cvk_tl_t *tl_filter = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, filter_shape, CVK_FMT_I8, /*eu_align=*/0); + + cvk_tl_t *tl_output = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, output_shape, CVK_FMT_I8, /*eu_align=*/1); + + // Shape for TDMA load + cvk_tl_t *tl_cal_data = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, cal_shape, CVK_FMT_U8, /*eu_align*/ 0); + + if (!tl_input || !tl_filter || !tl_output || !tl_cal_data) { + if (tl_input == NULL) { + printf(" fail to alloc tl_input (%d, %d, %d, %d)\n", input_shape.n, + input_shape.c, input_shape.h, input_shape.w); + } + if (tl_filter == NULL) { + printf(" fail to alloc tl_filter (%d, %d, %d, %d)\n", filter_shape.n, + filter_shape.c, filter_shape.h, filter_shape.w); + } + if (tl_output == NULL) { + printf(" fail to alloc tl_output (%d, %d, %d, %d)\n", output_shape.n, + output_shape.c, output_shape.h, output_shape.w); + } + if (tl_cal_data == NULL) { + printf(" fail to alloc tl_cal_data (%d, %d ,%d, %d)\n", cal_shape.n, + cal_shape.c, cal_shape.h, cal_shape.w); + } + + // Reverse order + if (tl_cal_data) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_cal_data); + if (tl_output) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output); + if (tl_filter) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_filter); + if (tl_input) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, 
tl_input); + + return -1; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_cal_data, chl_quan_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_input, (uint8_t *)input_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_filter, (uint8_t *)kernel_data); + + { + // Reshape per channel quantization data for TIU + tl_cal_data->shape = tl_shape_t4(1, oc, 1, 1); + tl_cal_data->stride = cvk_ctx->ops->tl_default_stride( + cvk_ctx, tl_cal_data->shape, CVK_FMT_I8, /*eu_align=*/0); + + // Reshape weight for TIU + tl_filter->shape = tl_shape_t4(ic, oc, kh, kw); + + cvk_tiu_convolution_param_t param; + memset(&param, 0, sizeof(param)); + param.ofmap = tl_output; + param.ifmap = tl_input; + param.weight = tl_filter; + param.chl_quan_param = tl_cal_data; + param.ins_h = ins_h; + param.ins_last_h = ins_last_h; + param.ins_w = ins_w; + param.ins_last_w = ins_last_w; + param.stride_h = stride_h; + param.stride_w = stride_w; + param.dilation_h = dh; + param.dilation_w = dw; + param.pad_top = pad_top; + param.pad_bottom = pad_bot; + param.pad_left = pad_left; + param.pad_right = pad_right; + param.has_bias = has_bias; + param.relu_enable = relu_enable; + +#ifdef ENABLE_DEBUG_MSG + printf(" tiu_conv_qdm:\n"); + printf(" ifmap shape (%d, %d, %d, %d)\n", param.ifmap->shape.n, + param.ifmap->shape.c, param.ifmap->shape.h, param.ifmap->shape.w); + printf(" weight shape (%d, %d, %d, %d)\n", param.weight->shape.n, + param.weight->shape.c, param.weight->shape.h, param.weight->shape.w); + printf(" ofmap shape (%d, %d, %d, %d)\n", param.ofmap->shape.n, + param.ofmap->shape.c, param.ofmap->shape.h, param.ofmap->shape.w); +#endif + + cvk_ctx->ops->tiu_convolution(cvk_ctx, &param); + } + + CVI_RT_Submit(cvk_ctx); + +#ifdef ENABLE_DEBUG_MSG + printf(" compare result:\n"); +#endif + int8_t *conv_output_data = + (int8_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_output); + for (int i = 0; i < in; ++i) { + for (int j = 0; j < oc; ++j) { + for (int k = 0; k < oh; ++k) { + for (int l = 0; l < ow; ++l) { + int offset = i * (oc * oh * ow) + j * (oh * ow) + k * ow + l; + if (conv_output_data[offset] != output_data[offset]) { + printf(" [ni=%d][oci=%d][ohi=%d][owi=%d] output %d(tiu) != " + "%d(ref)\n", + i, j, k, l, conv_output_data[offset], output_data[offset]); + ret = -1; + break; + } + } + } + } + } + + if (ret) { + //dump_test_param(p_param, /*dump_content=*/true); + dump2_test_param(p_param); + } + + // Reverse order + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_cal_data); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_filter); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input); + + free(conv_output_data); + + free(input_data); + free(kernel_data); + free(output_data); + free(bias_data); + free(multiplier_data); + free(shift_data); + free(chl_quan_data); + +#ifdef ENABLE_DEBUG_MSG + printf(" <= run_compare_conv\n"); +#endif + + return ret; +} + + + + +int run2_compare_conv(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + + if (rt_handle == NULL || cvk_ctx == NULL) { + return -1; + } + + conv_test_param_t *p_param = &keepFailParam; + + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int oh = p_param->output_h; + int ow = p_param->output_w; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = 
p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_last_h = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_last_w = p_param->ins_w_last; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int has_bias = p_param->has_bias; + int relu_enable = p_param->relu_enable; + + int input_size = in * ic * iw * ih; + int8_t *input_data = (int8_t *)malloc(input_size); + + int kernel_size = oc * ic * kh * kw; + int8_t *kernel_data = (int8_t *)malloc(kernel_size); + + int output_size = in * oc * oh * ow; + int8_t *output_data = (int8_t *)malloc(output_size); + if (!input_data || !kernel_data || !output_data) { + free(input_data); + free(kernel_data); + free(output_data); + return -1; + } + + memset(output_data, 0, output_size); + + int32_t *bias_data = (int32_t *) malloc(sizeof(int32_t) * oc); + uint32_t *multiplier_data = (uint32_t *) malloc(sizeof(uint32_t) * oc); + int8_t *shift_data = (int8_t *)malloc(oc); + + //p_param->input_data = input_data; + //p_param->filter_data = kernel_data; + //p_param->output_data = output_data; + //p_param->has_bias = has_bias; + //p_param->bias_data = bias_data; + //p_param->multiplier_data = multiplier_data; + //p_param->shift_data = shift_data; + +#ifdef ENABLE_DEBUG_MSG + printf(" run2_compare_conv =>\n"); + printf(" input (n=%d, ic=%d, h=%d, w=%d), kernel (oc=%d, ic=%d, h=%d, " + "w=%d), output (, c=%d, h=%d, w=%d), has_bias %d\n", + in, ic, ih, iw, oc, ic, kh, kw, oc, oh, ow, has_bias); +#endif + + int retry_cnt = p_param->retry_cnt; + do { + fill_random_data_s8(input_data, input_size); + fill_random_data_s8(kernel_data, kernel_size); + if (has_bias) { + fill_random_data_s32(bias_data, oc); + } + + p_param->float_multiplier = 100.0; // sentinel; a valid multiplier must end up < 1.0 + calc_conv_float_multiplier(p_param); + + if (p_param->float_multiplier > 0.f && p_param->float_multiplier < 1.0) { + break; + } + + } while (--retry_cnt); + + if (p_param->float_multiplier >= 1.0) { + printf(" run2_compare_conv: unable to find valid multiplier\n"); + free(input_data); + free(kernel_data); + free(output_data); + free(bias_data); + free(multiplier_data); + free(shift_data); + + return -1; + } + + uint32_t base_multiplier = 0; + int base_shift = 0; + QuantizeMultiplierSmallerThanOne(p_param->float_multiplier, &base_multiplier, + &base_shift); + + for (int i = 0; i < oc; ++i) { + // multipliers typically range in [2^30 ; 2^31 - 1]. + // Values in [0, 2^30 - 1] are normally unused, but harmless. + // Thus a good way to randomize multipliers is to subtract from them + // a random value smaller than 2^30 but still significant compared to it. + p_param->multiplier_data[i] = base_multiplier - (rand() % (1 << 26)); + + // Our H/W only supports right shift + int right_shift = base_shift - 1 + (rand() % 4); + p_param->shift_data[i] = right_shift > 0 ? right_shift : 0; + +#ifdef ENABLE_DEBUG_MSG + printf(" [oc=%d] multiplier_data %d, shift_data %d\n", i, + p_param->multiplier_data[i], p_param->shift_data[i]); +#endif + } + + conv_per_channel_ref(p_param); + + const int chl_quan_per_lane_data_size = + p_param->has_bias ? 
9 : 5; // bias(4) + multiplier(4) + shift(1) + const int cal_data_size = oc * chl_quan_per_lane_data_size; + uint8_t *chl_quan_data = (uint8_t *) malloc(cal_data_size); + pack_chl_quan_param(oc, p_param->has_bias, p_param->bias_data, + p_param->multiplier_data, p_param->shift_data, + chl_quan_data); + + cvk_tl_shape_t input_shape = tl_shape_t4(in, ic, ih, iw); + cvk_tl_shape_t filter_shape = tl_shape_t4(1, oc, kh * kw, ic); + cvk_tl_shape_t output_shape = tl_shape_t4(in, oc, oh, ow); + cvk_tl_shape_t cal_shape = tl_shape_t4(1, oc, 1, chl_quan_per_lane_data_size); + + cvk_tl_t *tl_input = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, input_shape, CVK_FMT_I8, /*eu_align=*/1); + + cvk_tl_t *tl_filter = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, filter_shape, CVK_FMT_I8, /*eu_align=*/0); + + cvk_tl_t *tl_output = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, output_shape, CVK_FMT_I8, /*eu_align=*/1); + + // Shape for TDMA load + cvk_tl_t *tl_cal_data = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, cal_shape, CVK_FMT_U8, /*eu_align*/ 0); + + if (!tl_input || !tl_filter || !tl_output || !tl_cal_data) { + if (tl_input == NULL) { + printf(" fail to alloc tl_input (%d, %d, %d, %d)\n", input_shape.n, + input_shape.c, input_shape.h, input_shape.w); + } + if (tl_filter == NULL) { + printf(" fail to alloc tl_filter (%d, %d, %d, %d)\n", filter_shape.n, + filter_shape.c, filter_shape.h, filter_shape.w); + } + if (tl_output == NULL) { + printf(" fail to alloc tl_output (%d, %d, %d, %d)\n", output_shape.n, + output_shape.c, output_shape.h, output_shape.w); + } + if (tl_cal_data == NULL) { + printf(" fail to alloc tl_cal_data (%d, %d ,%d, %d)\n", cal_shape.n, + cal_shape.c, cal_shape.h, cal_shape.w); + } + + // Reverse order + if (tl_cal_data) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_cal_data); + if (tl_output) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output); + if (tl_filter) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_filter); + if (tl_input) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input); + + return -1; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_cal_data, chl_quan_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_input, (uint8_t *)input_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_filter, (uint8_t *)kernel_data); + + { + // Reshape per channel quantization data for TIU + tl_cal_data->shape = tl_shape_t4(1, oc, 1, 1); + tl_cal_data->stride = cvk_ctx->ops->tl_default_stride( + cvk_ctx, tl_cal_data->shape, CVK_FMT_I8, /*eu_align=*/0); + + // Reshape weight for TIU + tl_filter->shape = tl_shape_t4(ic, oc, kh, kw); + + cvk_tiu_convolution_param_t param; + memset(&param, 0, sizeof(param)); + param.ofmap = tl_output; + param.ifmap = tl_input; + param.weight = tl_filter; + param.chl_quan_param = tl_cal_data; + param.ins_h = ins_h; + param.ins_last_h = ins_last_h; + param.ins_w = ins_w; + param.ins_last_w = ins_last_w; + param.stride_h = stride_h; + param.stride_w = stride_w; + param.dilation_h = dh; + param.dilation_w = dw; + param.pad_top = pad_top; + param.pad_bottom = pad_bot; + param.pad_left = pad_left; + param.pad_right = pad_right; + param.has_bias = has_bias; + param.relu_enable = relu_enable; + +#ifdef ENABLE_DEBUG_MSG + printf(" tiu_conv_qdm:\n"); + printf(" ifmap shape (%d, %d, %d, %d)\n", param.ifmap->shape.n, + param.ifmap->shape.c, param.ifmap->shape.h, param.ifmap->shape.w); + printf(" weight shape (%d, %d, %d, %d)\n", param.weight->shape.n, + param.weight->shape.c, param.weight->shape.h, param.weight->shape.w); + printf(" ofmap shape (%d, %d, %d, %d)\n", param.ofmap->shape.n, + 
param.ofmap->shape.c, param.ofmap->shape.h, param.ofmap->shape.w); +#endif + + cvk_ctx->ops->tiu_convolution(cvk_ctx, &param); + } + + CVI_RT_Submit(cvk_ctx); + +#ifdef ENABLE_DEBUG_MSG + printf(" compare result:\n"); +#endif + int8_t *conv_output_data = + (int8_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_output); + for (int i = 0; i < in; ++i) { + for (int j = 0; j < oc; ++j) { + for (int k = 0; k < oh; ++k) { + for (int l = 0; l < ow; ++l) { + int offset = i * (oc * oh * ow) + j * (oh * ow) + k * ow + l; + if (conv_output_data[offset] != output_data[offset]) { + printf(" [ni=%d][oci=%d][ohi=%d][owi=%d] output %d(tiu) != " + "%d(ref)\n", + i, j, k, l, conv_output_data[offset], output_data[offset]); + ret = -1; + break; + } + } + } + } + } + + if (ret) { + //dump_test_param(p_param, /*dump_content=*/true); + dump2_test_param(p_param); + } + + // Reverse order + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_cal_data); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_filter); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input); + + free(conv_output_data); + + free(input_data); + free(kernel_data); + free(output_data); + free(bias_data); + free(multiplier_data); + free(shift_data); + free(chl_quan_data); + +#ifdef ENABLE_DEBUG_MSG + printf(" <= run2_compare_conv\n"); +#endif + + return ret; +} + +int random_test(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + +#ifndef ENABLE_FULL_REGRESSION + // TV_GEN pattern + // Random Test, total 19683, skipped 118066, executed 32, failed 0, ret 0 + + // n: 12b, c: 12b, h: 12b(4095-32), w: 12b(4095-32) + int batch_range[] = {1, 1, 32}; + int input_height_range[] = {1, 512, 4095 - 32}; + int input_width_range[] = {1, 512, 4095 - 32}; + int input_depth_range[] = {1, 16, 32, 64, 102, 4095}; + int output_depth_range[] = {1, 16, 32, 64, 1024, 4095}; + + // h: 12b, w: 12b + // stride_h: 4b, stride_w: 4b + int kernel_height_range[] = {1, 11, 2048, 4095}; + int kernel_width_range[] = {1, 11, 2048, 4095}; + int kernel_stride_height_range[] = {1, 5, 16, 31}; + int kernel_stride_width_range[] = {1, 5, 16, 31}; +#else + // n: 12b, c: 12b, h: 12b(4095-32), w: 12b(4095-32) + int batch_range[] = {1, 2, 4, 8, 16, 32, 64, 4095 - 32}; + int input_height_range[] = {1, 3, 11, 128, 512, 1024, 2048, 4095 - 32}; + int input_width_range[] = {1, 3, 11, 128, 512, 1024, 2048, 4095 - 32}; + int input_depth_range[] = {1, 3, 11, 32, 64, 1024, 2048, 4095}; + int output_depth_range[] = {1, 3, 11, 32, 64, 1024, 2048, 4095}; + + // h: 12b, w: 12b + // stride_h: 4b, stride_w: 4b + int kernel_height_range[] = {1, 3, 11, 511, 4095}; + int kernel_width_range[] = {1, 3, 11, 511, 4095}; + int kernel_stride_height_range[] = {1, 3, 5, 7, 15, 16, 31}; + int kernel_stride_width_range[] = {1, 3, 5, 7, 15, 16, 31}; +#endif /* ENABLE_FULL_REGRESSION */ + + const int batch_range_size = sizeof(batch_range) / sizeof(batch_range[0]); + const int input_height_range_size = + sizeof(input_height_range) / sizeof(input_height_range[0]); + const int input_width_range_size = + sizeof(input_width_range) / sizeof(input_width_range[0]); + const int input_depth_range_size = + sizeof(input_depth_range) / sizeof(input_depth_range[0]); + const int output_depth_range_size = + sizeof(output_depth_range) / sizeof(output_depth_range[0]); + + const int kernel_height_range_size = + sizeof(kernel_height_range) / sizeof(kernel_height_range[0]); + const int kernel_width_range_size = + sizeof(kernel_width_range) / sizeof(kernel_width_range[0]); + const int kernel_stride_height_range_size = + sizeof(kernel_stride_height_range) / + sizeof(kernel_stride_height_range[0]); + const int kernel_stride_width_range_size = + sizeof(kernel_stride_width_range) / sizeof(kernel_stride_width_range[0]);
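 + + /* + * Note (illustrative): choose_from_range(table, size, i) returns table[i] + * plus rand() % max(table[i+1] - table[i] - 1, 1) for all but the last + * entry, which is returned as-is. E.g. with batch_range[] = {1, 1, 32}, + * index 0 always yields 1, index 1 yields a value in [1, 30], and index 2 + * yields exactly 32. + */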
 + + int random_seed = clock(); + srand(random_seed); + + const int retry_test_count = 100; + + bool stop_at_first_error = true; + + int total_tests = batch_range_size * input_depth_range_size * + input_height_range_size * input_width_range_size * + output_depth_range_size * kernel_height_range_size * + kernel_width_range_size * kernel_stride_height_range_size * + kernel_stride_width_range_size; + int skipped_tests = 0; + int executed_tests = 0; + int failed_tests = 0; + int current_test = 0; + + printf("Random Test =>\n"); + for (int t = 0; t < retry_test_count; ++t) { + for (int i = 0; i < batch_range_size; ++i) { + // randomly chosen from [range[i], range[i+1]), see note above + int batch = choose_from_range(batch_range, batch_range_size, i); + + for (int j = 0; j < input_height_range_size; ++j) { + int input_height = + choose_from_range(input_height_range, input_height_range_size, j); + + for (int k = 0; k < input_width_range_size; ++k) { + int input_width = + choose_from_range(input_width_range, input_width_range_size, k); + + for (int l = 0; l < input_depth_range_size; ++l) { + int input_depth = + choose_from_range(input_depth_range, input_depth_range_size, l); + + for (int m = 0; m < kernel_height_range_size; ++m) { + int kernel_height = choose_from_range( + kernel_height_range, kernel_height_range_size, m); + + for (int n = 0; n < kernel_width_range_size; ++n) { + int kernel_width = choose_from_range( + kernel_width_range, kernel_width_range_size, n); + + for (int x = 0; x < kernel_stride_height_range_size; ++x) { + int kernel_stride_height = + choose_from_range(kernel_stride_height_range, + kernel_stride_height_range_size, x); + + for (int y = 0; y < kernel_stride_width_range_size; ++y) { + int kernel_stride_width = + choose_from_range(kernel_stride_width_range, + kernel_stride_width_range_size, y); + + for (int z = 0; z < output_depth_range_size; ++z) { + int output_depth = choose_from_range( + output_depth_range, output_depth_range_size, z); + + current_test++; + + int has_bias = rand() % 2; + int dh = 1; + int dw = 1; + int ins_h = 0; + int ins_h_last = 0; + int ins_w = 0; + int ins_w_last = 0; + int pad_top = 0; + int pad_bot = 0; + int pad_left = 0; + int pad_right = 0; + + int ih_ext = calc_dilute_hw(input_height, ins_h, + ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw( + input_width, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = + calc_dilute_hw(kernel_height, dh - 1, 0, 0, 0); + int kw_ext = + calc_dilute_hw(kernel_width, dw - 1, 0, 0, 0); + + int oh = + calc_output_hw(ih_ext, kh_ext, kernel_stride_height); + int ow = + calc_output_hw(iw_ext, kw_ext, kernel_stride_width); + + conv_test_param_t test_param; + memset(&test_param, 0, sizeof(test_param)); + test_param.input_n = batch; + test_param.input_c = input_depth; + test_param.input_h = input_height; + test_param.input_w = input_width; + test_param.kh = kernel_height; + test_param.kw = kernel_width; + test_param.dh = dh; + test_param.dw = dw; + test_param.pad_top = pad_top; + test_param.pad_bot = pad_bot; + test_param.pad_left = pad_left; + test_param.pad_right = pad_right; + test_param.ins_h = ins_h; + test_param.ins_h_last = ins_h_last; + test_param.ins_w = ins_w; + test_param.ins_w_last = ins_w_last; + test_param.stride_h = kernel_stride_height; + test_param.stride_w = kernel_stride_width;
 + test_param.output_c = output_depth; + test_param.output_h = oh; + test_param.output_w = ow; + test_param.has_bias = has_bias; + test_param.retry_cnt = 5; + + bool is_valid_param = + check_valid_test_param(cvk_ctx, &test_param); + if (is_valid_param == false) { + skipped_tests++; + continue; + } + + int ret2 = run_compare_conv(rt_handle, cvk_ctx, &test_param); + failed_tests = ret2 ? failed_tests + 1 : failed_tests; + ret |= ret2; + executed_tests++; + +#ifdef ENABLE_DEBUG_MSG + printf( + " [%d/%d] random test: input shape(%d, %d, %d, %d)", + current_test, total_tests, batch, input_depth, + input_height, input_width); + printf(", kernel shape (%d, %d, %d, %d), result %d\n", + output_depth, input_depth, kernel_height, + kernel_width, ret2); +#endif + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + + if (executed_tests >= MIN_EXEC_TESTS) { + break; + } + } + + printf( + "<= Random Test, total %d, skipped %d, executed %d, failed %d, ret %d\n", + total_tests, skipped_tests, executed_tests, failed_tests, ret); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= simple_test(rt_handle, cvk_ctx); + ret |= random_test(rt_handle, cvk_ctx); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_conv_wtiling.c b/cviruntime/test/181x/test_181x_conv_wtiling.c new file mode 100644 index 000000000..5e31a5673 --- /dev/null +++ b/cviruntime/test/181x/test_181x_conv_wtiling.c @@ -0,0 +1,917 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> +#include <time.h> + +#include "test_cvikernel_util.h" +#include "test_native_ref.h" + +typedef struct { + int random_seed; + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int stride_w; + int output_c; + int using_bias; + int bReLU_EN; + int r_shift_m; + int opd0_sign; + int opd1_sign; + int opd2_sign; +} conv_param_t; + +typedef struct { + uint32_t n; + uint32_t c; + uint32_t h; + uint32_t w; +}slice_t; + +static int index_get(int h, int w1, int w2) +{ + return h * w1 + w2; +} + +static int matrix_dot_mult( + int8_t *A, int8_t *B, int dim_n, int dim_m, + int opd0_sign) +{ + int sum = 0; + for (int i = 0; i < dim_n; i++) { + for (int j = 0; j < dim_m; j++) { + int a = opd0_sign ? A[index_get(i, dim_m, j)] : (uint8_t)A[index_get(i, dim_m, j)]; + int b = B[index_get(i, dim_m, j)]; + sum += a * b; + } + } + return sum; +} + +static int conv_ref( + const conv_param_t *p_param, + const int8_t *ifmap, + int8_t *weight, + int16_t *bias, + int8_t *ofmap) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = 
p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + int r_shift_bits = p_param->r_shift_m; + int do_relu = p_param->bReLU_EN; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + int8_t *i_fmap_pad_ker = (int8_t *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return -1; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = 0; + + int8_t *i_fmap_pad = NULL; + int8_t *kernel_after = NULL; + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c]; //bias+c ; + } + } + } + + if (do_relu) + relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + + // ofmap is int8_t, signed + ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow, + &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1, + /*sign_unsign=*/1); + + if (ret) + goto error_release; + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + +error_release: + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + + +static uint8_t * transform_weight(const cvk_tl_shape_t *s, uint8_t before[]) +{ + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint32_t size = ic * oc * kh * kw; + uint8_t *after = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!after) + return NULL; + + /* + * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) + */ + for (uint32_t oci = 0; oci < oc; oci++) { + for (uint32_t ici = 0; ici < ic; ici++) { + for (uint32_t khi = 0; khi < kh; khi++) { + for (uint32_t kwi = 0; kwi < kw; kwi++) { + uint32_t src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + uint32_t dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + CVI_RT_HANDLE 
rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + uint8_t *data) +{ +#if 0 + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + uint8_t *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. bm_memcpy_s2d regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. + */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //bmmem_device_t ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = bm_memcpy_s2d(*ctx, ab_dev_mem, transformed_data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, transformed_data); + assert(ret == BM_SUCCESS); + + cvk_tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_I8; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1822_tensor_tgmem_default_stride(tdma_tg.shape, tdma_tg.fmt); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1822_tensor_lmem_default_stride(cvk_ctx, tdma_shape, FMT_I8, 0); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1822_tdma_g2l_tensor_copy(cvk_ctx, &p); + test_submit(ctx); + + + bmmem_device_free(*ctx, dev_mem); + + free(transformed_data); +#endif + + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint8_t *transformed_data = transform_weight(s, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(1, oc, kh * kw, ic); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, transformed_data); + + free(transformed_data); +} + +static int8_t * transform_bias(int oc, int16_t before[]) +{ + int8_t *after = (int8_t *)malloc(2 * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = before[i] & 0xff; + after[i + oc] = (before[i] >> 8) & 0xff; + } + return after; +} + +static void put_conv_bias( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + int16_t *data) +{ +#if 0 + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2, oc, 1, 1); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + int8_t *transformed_data = transform_bias(oc, data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (uint8_t *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1822_tensor_tgmem_default_stride(tg.shape, tg.fmt); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_tensor_copy(cvk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, dev_mem); + delete[] transformed_data; +#endif + + int oc = tl->shape.c; + int8_t *transformed_data = transform_bias(oc, 
data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(2, oc, 1, 1); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, (uint8_t *)transformed_data); + + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static int8_t * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + + int8_t *buf = (int8_t *)malloc(sizeof(int8_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static int8_t * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + int8_t *buf = (int8_t *)malloc(sizeof(int8_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static int16_t * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + int16_t *bias = (int16_t *)malloc(sizeof(int16_t) * oc); + if (!bias) + return NULL; + + for (int i = 0; i < oc; i++) + bias[i] = rand() % 65536 - 32768; + + return bias; +} + +static cvk_tl_t * conv_ifmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd0_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 1); +} + +static cvk_tl_t * conv_weight_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd1_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static cvk_tl_t * conv_ofmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_tl_shape_t s; + s.n = p->input_n * 4; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + cvk_tl_t *tl = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, CVK_FMT_I8, 1); + tl->shape.n = p->input_n; + return tl; +} + +static cvk_tl_t * conv_bias_tensor( + cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd2_sign? 
CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const cvk_tiu_pt_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + dst->ifmap = conv_ifmap_tensor(cvk_ctx, p); + dst->weight = conv_weight_tensor(cvk_ctx, p); + dst->ofmap = conv_ofmap_tensor(cvk_ctx, p); + dst->bias = NULL; + dst->ps32_mode = 0; + if (p->using_bias) + dst->bias = conv_bias_tensor(cvk_ctx, p); + + dst->w_is_const = 0; +} + +static void free_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *r, + const conv_param_t *p) +{ + if (p->using_bias && r->bias) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->bias); + if (r->ofmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ofmap); + if (r->weight) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->weight); + if (r->ifmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ifmap); +} + +static void init_conv_param(conv_param_t *p) +{ + printf("init_conv_param\n"); + p->random_seed = clock(); + srand(p->random_seed); + +retry: + p->input_n = 1; + p->input_c = 1; + p->kh = 3; + p->kw = 3; + p->input_h = 4 + p->kh; + p->input_w = 4 + p->kw ; + p->output_c = 1; + p->stride_h = 1; + p->stride_w = 1; + p->ins_h = 0; + p->ins_w = 0; + p->ins_h_last = 0; + p->ins_w_last = 0; + p->dh = 1; + p->dw = 1; + + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + p->pad_top = 0; + p->pad_bot = 0; + p->pad_left = 0; + p->pad_right = 0; + + if (!conv_param_is_ok(p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p->using_bias = 1; + p->r_shift_m = rand() % 8; + p->bReLU_EN = rand() % 2; + p->opd0_sign = rand() % 2; + + p->opd1_sign = 1; + p->opd2_sign = 1; + + assert(p->opd1_sign == 1 && p->opd2_sign == 1); + + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); +} + +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", "p->input_w = ", p->input_w); + printf(" %s%d;\n", 
"p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} + +/* Calculate the right shift value, m + * Steps: + * 1. Get the abs() of each weight; + * 2. Summary all the abs() in one kernel; + * 3. Get Log2 of each sum; + * 4. Downward rounding; + * After every r_shift value got, sort and find the middle one. + */ + +static int calc_rshift_m(const conv_param_t *p, int8_t* weight) +{ + int kernel_cnt = p->output_c * p->input_c; + int kernel_size = p->kh * p->kw; + int *kernel_shifts = (int *)malloc(sizeof(int) * kernel_cnt); + if (!kernel_shifts) + return 1; + + memset(kernel_shifts, 0, sizeof(int) * kernel_cnt); + + // Part 1: + // Get right shift value for each kernel + int sum = 0; + for (int i = 0; i < kernel_cnt; i++) { + // Step 1 & 2: Get the sum of abs() + for (int j = 0; j < kernel_size; j++) { + sum += (int)(*weight < 0 ? 
-(*weight) : (*weight)); + weight++; + } + // Step 3 & 4: log2 and downward rounding + sum >>= 1; + while (sum) { + sum >>= 1; + kernel_shifts[i]++; + } + } + + // Part 2: + // Find the middle of all the values + int tag[32] = {0}; + for (int cnt = 0; cnt < kernel_cnt; cnt++) { + tag[kernel_shifts[cnt]]++; + } + + int rshift_m = 0; + int mid = 0; + do { + mid += tag[rshift_m++]; + } while(mid < (kernel_cnt - 1) >> 1); + + free(kernel_shifts); + + return rshift_m - 1; +} + +static int test_w_tiling_conv( + conv_param_t *p_param, CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + printf("test w tiled conv\n"); + int ret = 0; + int8_t *input = alloc_input(p_param); + int8_t *weight = alloc_weight(p_param); + int16_t *bias = alloc_bias(p_param); + int8_t *output_ref = (int8_t *)malloc(sizeof(int8_t) * conv_output_size(p_param)); + if (!input || !weight || !bias || !output_ref) { + ret = -1; + goto fail_exit; + } + + p_param->r_shift_m = calc_rshift_m(p_param, weight); + ret = conv_ref(p_param, input, weight, bias, output_ref); + if (ret) + goto fail_exit; + + cvk_tiu_pt_convolution_param_t conv_param; + make_bmk_conv_param(cvk_ctx, &conv_param, p_param); + + /*We tile the finest granule to test w tiling*/ + uint32_t ow_step = 1; + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param); + if (tl_alloc_success) { + put_conv_weight(rt_handle, cvk_ctx, conv_param.weight, (uint8_t *)weight); + if (p_param->using_bias) { + put_conv_bias(rt_handle, cvk_ctx, conv_param.bias, bias); + } + + cvk_tl_t tl_ifmap = *conv_param.ifmap; + cvk_tl_t tl_ofmap = *conv_param.ofmap; + + cvk_tg_shape_t s; + s.n = tl_ifmap.shape.n; + s.c = tl_ifmap.shape.c; + s.h = tl_ifmap.shape.h; + s.w = tl_ifmap.shape.w; + cvk_tg_t *ts_ifmap = alloc_tensor_dev_mem(rt_handle, cvk_ctx, s, CVK_FMT_I8); + tensor_copy_s2d(rt_handle, ts_ifmap, (uint8_t *)input); + + s.n = tl_ofmap.shape.n; + s.c = tl_ofmap.shape.c; + s.h = tl_ofmap.shape.h; + s.w = tl_ofmap.shape.w; + cvk_tg_t *ts_ofmap = alloc_tensor_dev_mem(rt_handle, cvk_ctx, s, CVK_FMT_I8); + + for (uint32_t ow_pos = 0; ow_pos < tl_ofmap.shape.w; ow_pos += ow_step) { + uint32_t cur_ow = math_min(tl_ofmap.shape.w - ow_pos, ow_step); + + cvk_tg_t ts_cur_ofmap; + ts_cur_ofmap.shape.n = ts_ofmap->shape.n; + ts_cur_ofmap.shape.c = ts_ofmap->shape.c; + ts_cur_ofmap.shape.h = ts_ofmap->shape.h; + ts_cur_ofmap.shape.w = cur_ow; + ts_cur_ofmap.stride = ts_ofmap->stride; + ts_cur_ofmap.start_address = ts_ofmap->start_address + ow_pos; + ts_cur_ofmap.fmt = ts_ofmap->fmt; + ts_cur_ofmap.base_reg_index = ts_ofmap->base_reg_index; + + cvk_tl_t tl_cur_ofmap; + tl_cur_ofmap.shape.n = tl_ofmap.shape.n; + tl_cur_ofmap.shape.c = tl_ofmap.shape.c; + tl_cur_ofmap.shape.h = tl_ofmap.shape.h; + tl_cur_ofmap.shape.w = cur_ow; + tl_cur_ofmap.stride = cvk_ctx->ops->tl_default_stride(cvk_ctx, tl_cur_ofmap.shape, CVK_FMT_I8, 1); + tl_cur_ofmap.fmt = tl_ofmap.fmt; + tl_cur_ofmap.start_address = tl_ofmap.start_address; + + cvk_tg_t ts_cur_ifmap; + ts_cur_ifmap.shape.n = ts_ifmap->shape.n; + ts_cur_ifmap.shape.c = ts_ifmap->shape.c; + ts_cur_ifmap.shape.h = ts_ifmap->shape.h; + ts_cur_ifmap.shape.w = (cur_ow - 1) * conv_param.stride_w + conv_kw_ext(p_param); + ts_cur_ifmap.stride = ts_ifmap->stride; + ts_cur_ifmap.start_address = ts_ifmap->start_address + ow_pos; + ts_cur_ifmap.fmt = ts_ifmap->fmt; + ts_cur_ifmap.base_reg_index = ts_ifmap->base_reg_index; + + cvk_tl_t tl_cur_ifmap; + tl_cur_ifmap.shape.n = tl_ifmap.shape.n; + tl_cur_ifmap.shape.c = tl_ifmap.shape.c; + tl_cur_ifmap.shape.h = 
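+/* Tiling arithmetic used in the loop below (sketch): for an output-column
+ * window of width cur_ow, the matching input window must cover
+ * (cur_ow - 1) * stride_w + kw_ext columns. With ow_step = 1 and
+ * stride_w = 1, as fixed by init_conv_param(), each step loads a
+ * kw_ext-wide input slice whose start advances by one column per output
+ * column, which is why both start addresses are offset by ow_pos. */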
tl_ifmap.shape.h;
+      tl_cur_ifmap.shape.w = (cur_ow - 1) * conv_param.stride_w + conv_kw_ext(p_param);
+      tl_cur_ifmap.stride = cvk_ctx->ops->tl_default_stride(cvk_ctx, tl_cur_ifmap.shape, CVK_FMT_I8, 1);
+      tl_cur_ifmap.fmt = tl_ifmap.fmt;
+      tl_cur_ifmap.start_address = tl_ifmap.start_address;
+
+      {
+        cvk_tdma_g2l_tensor_copy_param_t p;
+        memset(&p, 0, sizeof(p));
+        p.src = &ts_cur_ifmap;
+        p.dst = &tl_cur_ifmap;
+        cvk_ctx->ops->tdma_g2l_tensor_copy(cvk_ctx, &p);
+        CVI_RT_Submit(cvk_ctx);
+      }
+      {
+        cvk_tiu_pt_convolution_param_t p;
+        memset(&p, 0, sizeof(p));
+        p = conv_param;
+        p.ifmap = &tl_cur_ifmap;
+        p.ofmap = &tl_cur_ofmap;
+        if (p_param->ins_w_last == 1 && (ow_pos + ow_step) >= tl_ofmap.shape.w)
+          p.ins_last_w = 1;
+        else
+          p.ins_last_w = 0;
+
+        cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &p);
+      }
+      {
+        cvk_tdma_l2g_tensor_copy_param_t p;
+        memset(&p, 0, sizeof(p));
+        p.src = &tl_cur_ofmap;
+        p.dst = &ts_cur_ofmap;
+        cvk_ctx->ops->tdma_l2g_tensor_copy(cvk_ctx, &p);
+        CVI_RT_Submit(cvk_ctx);
+      }
+    }
+    uint8_t *output = tensor_copy_d2s(rt_handle, ts_ofmap);
+    free_tensor_dev_mem(rt_handle, ts_ifmap);
+    free_tensor_dev_mem(rt_handle, ts_ofmap);
+
+    ret = array_cmp_int8(
+        "Comparing results ...\n",
+        output_ref, (int8_t *)output, conv_output_size(p_param));
+
+    if (ret) {
+      print_conv_param(p_param);
+      printf("Comparison FAILED\n");
+    }
+    free(output);
+  }
+
+  free_bmk_conv_param(cvk_ctx, &conv_param, p_param);
+
+fail_exit:
+  free(input);
+  free(weight);
+  free(bias);
+  free(output_ref);
+
+  return ret;
+}
+
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  for (int i = 0; i < 1; i++) {
+    printf("random_test_conv iteration: %d\n", i);
+    conv_param_t test_conv_param;
+    init_conv_param(&test_conv_param);
+    ret |= test_w_tiling_conv(&test_conv_param, rt_handle, cvk_ctx);
+    if (ret)
+      return ret;
+
+    if (!test_conv_param.using_bias)
+      test_conv_param.using_bias = 1;
+    if (test_conv_param.output_c <= 9)
+      test_conv_param.output_c += 3;
+    ret |= test_w_tiling_conv(&test_conv_param, rt_handle, cvk_ctx);
+    if (ret)
+      return ret;
+  }
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/181x/test_181x_depthwise.c b/cviruntime/test/181x/test_181x_depthwise.c
new file mode 100644
index 000000000..8cafea92c
--- /dev/null
+++ b/cviruntime/test/181x/test_181x_depthwise.c
@@ -0,0 +1,362 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "test_cvikernel_util.h"
+#include "test_native_ref.h"
+
+#define INVALIDE_STRIDE (-1)
+typedef cvk_tiu_depthwise_pt_convolution_param_t param_t;
+
+static void print_pooling_param(param_t *p)
+{
+  int in = p->ifmap->shape.n;
+  int ic = p->ifmap->shape.c;
+  int ih = p->ifmap->shape.h;
+  int iw = p->ifmap->shape.w;
+  int kh = p->weight->shape.h;
+  int kw = p->weight->shape.w;
+
+  printf("  Pooling parameters:\n");
+  printf("    ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw);
+  printf("    opd0_sign = %d\n", p->ifmap->fmt == CVK_FMT_I8);
+  printf("    weight = (%d, %d)\n", kh, kw);
+  printf("    padding = (%d, %d, %d, %d)\n",
+         p->pad_top, p->pad_bottom, p->pad_left, p->pad_right);
+  printf("    stride = (%d, %d)\n", p->stride_h, p->stride_w);
+  printf("    ins0 = (%d, %d, %d, %d)\n",
+         p->ins_h, p->ins_last_h,
p->ins_w, p->ins_last_w); + printf(" rshift_bits = %d\n", p->rshift_bits); + printf(" relu_enable = %d\n", p->relu_enable); + printf(" res0_sign = %d\n", p->ofmap->fmt == CVK_FMT_I8); +} + +static int8_t *alloc_input(param_t *p) +{ + uint64_t size = tl_shape_size(&p->ifmap->shape, p->ifmap->fmt); + int8_t *data = (int8_t *)malloc(size); + if (!data) + return NULL; + + for (uint64_t i = 0; i < size; i++) + data[i] = rand() % 256 - 128; + return data; +} + +static int8_t *alloc_weight(param_t *p) +{ + int size = tl_shape_size(&p->weight->shape, p->weight->fmt); + int8_t *data = (int8_t *)malloc(size); + if (!data) + return NULL; + + for (int i = 0; i < size; i++) + data[i] = rand() % 256 - 128; + return data; +} + +static int16_t *alloc_bias(param_t *p) +{ + int c = p->bias->shape.c; + int16_t *bias = (int16_t *)malloc(sizeof(int16_t) * c); + if (!bias) + return NULL; + + for (int i = 0; i < c; i++) + bias[i] = rand() % 65536 - 32768; + return bias; +} + +static int8_t *alloc_output(param_t *p) +{ + uint64_t size = tl_shape_size(&p->ofmap->shape, p->ofmap->fmt); + return (int8_t *)malloc(size); +} + +static inline void relu8(int8_t *buf, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) + if (buf[i] < 0) + buf[i] = 0; +} + + +static int compare_results( + param_t *p, + int8_t input[], + int8_t weight[], + int16_t bias[], + int8_t output[]) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int kh = p->weight->shape.h; + int kw = p->weight->shape.w; + int opd0_sign = (p->ifmap->fmt == CVK_FMT_I8); + int res0_sign = (p->ofmap->fmt == CVK_FMT_I8); + int8_t *output_ref = alloc_output(p); + int ret = native_pooling_ave_int8( + input, weight, p->bias ? bias : NULL, output_ref, + in, ic, ih, iw, kh, kw, + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right, + p->stride_h, p->stride_w, + p->ins_h, p->ins_w, p->ins_last_h, p->ins_last_w, + opd0_sign, res0_sign, p->rshift_bits, 0); + if (ret) + return ret; + + if(p->relu_enable ) + relu8(output_ref, tl_shape_size(&p->ofmap->shape, p->ofmap->fmt)); + + ret = array_cmp_int8( + "Comparing results ...\n", output_ref, output, + tl_shape_size(&p->ofmap->shape, p->ofmap->fmt)); + + if (ret) { + printf("Comparison FAILED!!!\n"); + print_pooling_param(p); + } + + free(output_ref); + + return ret; +} + +static int pooling_ih_ext(param_t *p, int ih) +{ + int ins = p->ins_h; + int ins_last = p->ins_last_h; + int pad = p->pad_top + p->pad_bottom; + return (ih - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_iw_ext(param_t *p, int iw) +{ + int ins = p->ins_w; + int ins_last = p->ins_last_w; + int pad = p->pad_left + p->pad_right; + return (iw - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_oh(param_t *p, int ih, int kh) +{ + int ih_ext = pooling_ih_ext(p, ih); + return (ih_ext - kh) / p->stride_h + 1; +} + +static int pooling_ow(param_t *p, int iw, int kw) +{ + int iw_ext = pooling_iw_ext(p, iw); + return (iw_ext - kw) / p->stride_w + 1; +} + +static void free_depthwise_param( + cvk_context_t *cvk_ctx, + param_t *p) +{ + if (p->bias) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->bias); + + if (p->weight) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->weight); + + if (p->ifmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->ifmap); + + if (p->ofmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->ofmap); +} + +static param_t random_depthwise_param(cvk_context_t *cvk_ctx, int stride_w, int stride_h) +{ + srand(clock()); + param_t p; + int retry_cnt = 100; + + for 
(int i = 0; i < retry_cnt; i++) { + int using_bias = rand() % 2; + int n = rand() % 5 + 1; + int c = rand() % (3 * cvk_ctx->info.npu_num) + 1; + int ih = rand() % 30 + 3 + (INVALIDE_STRIDE == stride_h ? 0 : stride_h); + int iw = rand() % 30 + 6 + (INVALIDE_STRIDE == stride_w ? 0 : stride_w); + int kh = rand() % 7 + 1; + int kw = rand() % 7 + 1; + int opd0_sign = rand() % 2; + + memset(&p, 0, sizeof(p)); + p.ins_h = rand() % kh; + p.ins_w = rand() % kw; + p.ins_last_h = rand() % kh; + p.ins_last_w = rand() % kw; + p.stride_h = INVALIDE_STRIDE == stride_h ? rand() % (kh) + 1 : stride_h; + p.stride_w = INVALIDE_STRIDE == stride_w ? rand() % (kh) + 1 : stride_w; + p.pad_top = rand() % kh; + p.pad_bottom = rand() % kh; + p.pad_left = rand() % kw; + p.pad_right = rand() % kw; + p.rshift_bits = rand() % 32; + + int oh = pooling_oh(&p, ih, kh); + int ow = pooling_ow(&p, iw, kw); + cvk_tl_shape_t ofmap_shape; + ofmap_shape.n = n; + ofmap_shape.c = c; + ofmap_shape.h = oh; + ofmap_shape.w = ow; + cvk_tl_shape_t ifmap_shape; + ifmap_shape.n = n; + ifmap_shape.c = c; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + cvk_tl_shape_t weight_shape; + weight_shape.n = 1; + weight_shape.c = c; + weight_shape.h = kh; + weight_shape.w = kw; + cvk_tl_shape_t bias_shape; + bias_shape.n = 2; + bias_shape.c = c; + bias_shape.h = 1; + bias_shape.w = 1; + p.relu_enable = rand()%2; + /*test case ref does not support dilation !=1*/ + p.dilation_h = 1; + p.dilation_w = 1; + cvk_fmt_t ifmt = opd0_sign ? CVK_FMT_I8: CVK_FMT_U8; + + p.ofmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ofmap_shape, CVK_FMT_I8, 1); + p.ifmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ifmap_shape, ifmt, 1); + p.weight = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, weight_shape, CVK_FMT_I8, 1); + p.bias = NULL; + if (using_bias) + p.bias = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, bias_shape, CVK_FMT_I8, 0); + + if ((kh > pooling_ih_ext(&p, ih)) + || (kw > pooling_iw_ext(&p, iw)) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || !p.ofmap + || !p.ifmap + || !p.weight + || (using_bias && !p.bias)) { + printf("retry init_pooling_param\n"); + free_depthwise_param(cvk_ctx, &p); + } else + break; + } + + return p; +} + +static void put_bias_tensor( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + int16_t data[]) +{ + int c = tl->shape.c; + + uint8_t *lo_hi = (uint8_t *)malloc(2 * c); + if (!lo_hi) + return; + + for (int i = 0; i < c; i++) { + lo_hi[i] = data[i] & 0xff; + lo_hi[i + c] = (data[i] >> 8) & 0xff; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl, (uint8_t *)lo_hi); + + free(lo_hi); +} + +static int _test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int stride_w, int stride_h) +{ + param_t param = random_depthwise_param(cvk_ctx, stride_w, stride_h); + + int8_t *input = alloc_input(¶m); + int8_t *weight = alloc_weight(¶m); + int16_t *bias = NULL; + if (param.bias) + bias = alloc_bias(¶m); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, param.ifmap, (uint8_t *)input); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, param.weight, (uint8_t *)weight); + if (param.bias) + put_bias_tensor(rt_handle, cvk_ctx, param.bias, bias); + + cvk_ctx->ops->tiu_pt_depthwise_convolution(cvk_ctx, ¶m); + int8_t *output = (int8_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, param.ofmap); + + int ret = compare_results(¶m, input, weight, bias, output); + + free_depthwise_param(cvk_ctx, ¶m); + free(input); + free(weight); + free(bias); + free(output); + + return ret; +} 
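+/* Worked example for put_bias_tensor() above (illustrative): the 16-bit
+ * biases are split into two 8-bit planes matching the (n=2, c, 1, 1) bias
+ * tensor allocated in random_depthwise_param(), plane 0 holding the low
+ * bytes and plane 1 the high bytes. For c = 2 and bias = {0x1234, -2
+ * (0xFFFE)}, the staging buffer becomes {0x34, 0xFE, 0x12, 0xFF}. */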
+
+
+static int test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) {
+  return _test_pooling(rt_handle, cvk_ctx, INVALIDE_STRIDE, INVALIDE_STRIDE);
+}
+
+static int test_depthwise_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx)
+{
+  int ret = 0;
+  for (uint64_t i = 0; i < 16; i++)
+    ret |= test_pooling(rt_handle, cvk_ctx);
+
+  // test stride extend (0, 31]
+  int stride_list[] = {15, 16, 31};
+  int stride_list_len = sizeof(stride_list) / sizeof(stride_list[0]);
+
+  for (int stride_w_idx = 0; stride_w_idx < stride_list_len; stride_w_idx++) {
+    for (int stride_h_idx = 0; stride_h_idx < stride_list_len; stride_h_idx++) {
+      int stride_w = stride_list[stride_w_idx];
+      int stride_h = stride_list[stride_h_idx];
+
+      ret |= _test_pooling(rt_handle, cvk_ctx, stride_w, stride_h);
+      if (ret)
+        break;
+    }
+  }
+
+  return ret;
+}
+
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  ret = test_depthwise_pooling(rt_handle, cvk_ctx);
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/181x/test_181x_depthwise_conv_qm.c b/cviruntime/test/181x/test_181x_depthwise_conv_qm.c
new file mode 100644
index 000000000..fcd48dcc4
--- /dev/null
+++ b/cviruntime/test/181x/test_181x_depthwise_conv_qm.c
@@ -0,0 +1,1515 @@
+#include <limits.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "test_cvikernel_util.h"
+#include "test_tf_quant_util.h"
+#include "test_native_ref.h"
+
+// #define ENABLE_DEBUG_MSG
+// #define ENABLE_FULL_REGRESSION
+
+#define MIN_EXEC_TESTS 20
+
+typedef struct {
+  int input_n;
+  int input_c;
+  int input_h;
+  int input_w;
+  int kw;
+  int kh;
+  int dh;
+  int dw;
+  int pad_top;
+  int pad_bot;
+  int pad_left;
+  int pad_right;
+  int ins_h;
+  int ins_h_last;
+  int ins_w;
+  int ins_w_last;
+  int stride_h;
+  int stride_w;
+  int output_c;
+  int output_h;
+  int output_w;
+  int has_bias;
+  int relu_enable;
+  int8_t *input_data;
+  int8_t *filter_data;
+  int8_t *output_data;
+  int32_t *bias_data;
+  uint32_t *multiplier_data;
+  int8_t *shift_data;
+  float float_multiplier;
+  int retry_cnt;
+} dw_conv_test_param_t;
+
+static inline int Offset(cvk_tl_shape_t shape, int i0, int i1, int i2, int i3)
+{
+  // return n * (shape.c * shape.h * shape.w) + c * (shape.h * shape.w) + h *
+  // shape.w + w;
+  int dims_data[4] = {shape.n, shape.c, shape.h, shape.w};
+  return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3;
+}
+
+void fill_random_data_s8(int8_t *input_data, int size)
+{
+  for (int i = 0; i < size; ++i) {
+    int is_satured = ((rand() % 1000) == 1) ? 1 : 0;
+    int is_sign = rand() % 2 ? 1 : -1;
+
+    if (is_satured && is_sign) {
+      input_data[i] = -128;
+    } else if (is_satured) {
+      input_data[i] = 127;
+    } else {
+      input_data[i] = is_sign * rand() % 128;
+    }
+  }
+}
+
+void fill_random_data_s32(int32_t *input_data, int size)
+{
+  for (int i = 0; i < size; ++i) {
+    int is_satured = ((rand() % 1000) == 1) ? 1 : 0;
+    int is_sign = rand() % 2 ? 
1 : -1; + + if (is_satured && is_sign) { + input_data[i] = INT_MIN; + } else if (is_satured) { + input_data[i] = INT_MAX; + } else { + input_data[i] = is_sign * rand() % 128; + } + } +} + +void convert_nhwc_to_nchw(cvk_tl_shape_t tl_shape, int8_t *src, int8_t *dst) +{ + // NHWC + uint32_t src_shape_n = tl_shape.n; + uint32_t src_shape_h = tl_shape.c; + uint32_t src_shape_w = tl_shape.h; + uint32_t src_shape_c = tl_shape.w; + uint32_t src_stride_c = 1; + uint32_t src_stride_w = src_shape_c * src_stride_c; + uint32_t src_stride_h = src_shape_w * src_stride_w; + uint32_t src_stride_n = src_shape_h * src_stride_h; + + // NCHW + // uint32_t dst_shape_n = src_shape_n; + uint32_t dst_shape_c = src_shape_c; + uint32_t dst_shape_h = src_shape_h; + uint32_t dst_shape_w = src_shape_w; + uint32_t dst_stride_w = 1; + uint32_t dst_stride_h = dst_shape_w * dst_stride_w; + uint32_t dst_stride_c = dst_shape_h * dst_stride_h; + uint32_t dst_stride_n = dst_shape_c * dst_stride_c; + + printf("convert_nhwc_to_nchw:\n"); + printf(" src shape (%d, %d, %d, %d), stride (%d, %d, %d, %d)\n", src_shape_n, + src_shape_c, src_shape_h, src_shape_w, src_stride_n, src_stride_c, + src_stride_h, src_stride_w); + printf(" dst shape (%d, %d, %d, %d), stride (%d, %d, %d, %d)\n", src_shape_n, + dst_shape_c, dst_shape_h, dst_shape_w, dst_stride_n, dst_stride_c, + dst_stride_h, dst_stride_w); + + for (uint32_t i = 0; i < src_shape_n; ++i) { + for (uint32_t j = 0; j < src_shape_h; ++j) { + for (uint32_t k = 0; k < src_shape_w; ++k) { + for (uint32_t l = 0; l < src_shape_c; ++l) { + uint32_t src_offset = i * src_stride_n + j * src_stride_h + + k * src_stride_w + l * src_stride_c; + uint32_t dst_offset = i * dst_stride_n + j * dst_stride_h + + k * dst_stride_w + l * dst_stride_c; + dst[dst_offset] = src[src_offset]; + } + } + } + } +} + +int test_nhwc_to_nchw() +{ + int ret = 0; + + cvk_tl_shape_t shape = tl_shape_t4(2, 2, 2, 2); + int size = shape.n * shape.c * shape.h * shape.w; + + int8_t src[2 * 2 * 2 * 2] = {1, 5, 2, 6, 3, 7, 4, 8, + 11, 15, 12, 16, 13, 17, 14, 18}; + + int8_t dst[2 * 2 * 2 * 2] = {0}; + int8_t ref_dst[2 * 2 * 2 * 2] = {1, 2, 3, 4, 5, 6, 7, 8, + 11, 12, 13, 14, 15, 16, 17, 18}; + + convert_nhwc_to_nchw(shape, src, dst); + for (int i = 0; i < size; ++i) { + if (dst[i] != ref_dst[i]) { + printf("Error ! 
dst[%d] %d != %d(expected)\n", i, dst[i], ref_dst[i]); + ret = -1; + } + } + + cvk_tl_shape_t input_shape = {/*n=*/1, /*h=*/5, /*w=*/6, /*c=*/8}; + int input_size = + input_shape.n * input_shape.c * input_shape.h * input_shape.w; + int8_t nhwc_input_data[240] = { + 103, 85, -96, 120, 105, -72, 33, -50, -104, 12, -57, -80, + 12, 126, 117, 127, 119, 119, -88, 57, 120, 123, 117, -100, + -4, 76, 76, -52, -92, -127, -21, -100, 106, 35, 74, 96, + 117, 0, 39, 76, -119, -36, 89, -74, 111, 46, 45, -26, + 65, 61, 62, -7, -28, -20, 39, -84, -85, -51, 52, 76, + -120, -47, -58, 95, -117, -90, -104, 126, 82, 82, 49, -96, + -47, 67, 115, -3, -120, 41, -16, -96, -31, -75, 67, -115, + 75, -119, -81, -24, -3, -11, -14, -4, 37, 75, 53, 107, + 65, 78, -58, 52, 46, -128, 39, 53, -87, 36, -98, -12, + -1, 70, 117, 18, -41, 96, 21, 78, -71, -124, 64, 82, + -63, 82, 1, 112, 50, -23, 100, -20, 117, 20, 12, -88, + -93, 67, -90, -70, -63, 79, 87, 125, -63, -43, 80, -52, + -66, -125, 109, -73, -39, 104, -78, 89, -64, 116, 29, 71, + -7, 124, -38, -111, 84, 75, 21, 24, 12, 59, 106, 49, + -55, 46, 65, -28, 64, 15, -31, -75, 17, 7, -109, -25, + -115, -38, 7, 23, 71, -37, 111, 119, -95, -89, 17, -27, + -8, -29, -125, 58, -42, -29, -87, 109, 75, -17, -49, 92, + 7, 30, -86, -98, 26, -8, -61, -41, 39, 7, 48, 55, + 63, 125, -13, 56, -107, 105, -70, 1, 105, 14, -89, 0, + 83, -10, 9, 11, 127, -14, -108, 90, -15, 26, -101, -1}; + int8_t input_data[240]; + convert_nhwc_to_nchw(input_shape, nhwc_input_data, input_data); + printf("NCHW input_data[%d] = {\n", input_size); + for (int i = 0; i < input_size; ++i) { + printf("%d, ", input_data[i]); + if (i && ((i % 16) == 0)) { + printf("\n"); + } + } + printf("};\n\n"); + + cvk_tl_shape_t filter_shape = {1, 3, 3, 8}; + int filter_size = + filter_shape.n * filter_shape.c * filter_shape.h * filter_shape.w; + int8_t nhwc_filter_data[72] = { + 103, 85, -96, 120, 105, -72, 33, -50, -104, 12, -57, -80, + 12, 126, 117, 127, 119, 119, -88, 57, 120, 123, 117, -100, + -4, 76, 76, -52, -92, -127, -21, -100, 106, 35, 74, 96, + 117, 0, 39, 76, -119, -36, 89, -74, 111, 46, 45, -26, + 65, 61, 62, -7, -28, -20, 39, -84, -85, -51, 52, 76, + -120, -47, -58, 95, -117, -90, -104, 126, 82, 82, 49, -96}; + int8_t filter_data[72]; + convert_nhwc_to_nchw(filter_shape, nhwc_filter_data, filter_data); + printf("NCHW filter_data[%d] = {\n", filter_size); + for (int i = 0; i < filter_size; ++i) { + printf("%d, ", filter_data[i]); + if (i && ((i % 16) == 0)) { + printf("\n"); + } + } + printf("}\n\n"); + + cvk_tl_shape_t output_shape = {1, 3, 4, 8}; + int output_size = + output_shape.n * output_shape.c * output_shape.h * output_shape.w; + int8_t nhwc_output_data[96] = { + 127, 127, 69, 34, 36, 127, 127, 127, -101, -65, 39, 13, + 26, 6, 127, -67, 60, 123, 31, 17, 3, -128, -58, -64, + -128, 26, -128, -21, 72, 55, 127, 94, -46, -128, -37, 1, + -6, 109, 98, -14, -11, 48, -128, -3, -50, 37, -20, 79, + -94, -36, 127, 19, 3, -18, -40, -115, 24, 124, -128, -1, + -52, -123, -54, -1, -62, 95, 127, 24, 10, -74, 127, -128, + -2, 111, 106, 4, 3, -128, 127, 127, -30, 98, -21, -1, + -11, -12, 58, -72, -128, 127, 30, 32, -85, -11, -35, 34}; + int8_t output_data[96] = {0}; + convert_nhwc_to_nchw(output_shape, nhwc_output_data, output_data); + printf("NCHW output_data[%d] = {\n", output_size); + for (int i = 0; i < output_size; ++i) { + printf("%d, ", output_data[i]); + if (i && ((i % 16) == 0)) { + printf("\n"); + } + } + printf("};\n\n"); + + return ret; +} + +int simple_nhwc_dw_conv_test(CVI_RT_HANDLE rt_handle, cvk_context_t 
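+/* Index mapping used by convert_nhwc_to_nchw() above (worked example):
+ * element (n, h, w, c) at NHWC offset ((n * H + h) * W + w) * C + c moves to
+ * NCHW offset ((n * C + c) * H + h) * W + w. With H = 5, W = 6, C = 8, the
+ * NHWC element (0, 1, 2, 3) at offset (1 * 6 + 2) * 8 + 3 = 67 lands at
+ * NCHW offset (3 * 5 + 1) * 6 + 2 = 98. */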
*cvk_ctx) +{ + int ret = 0; + + const int stride_width = 1; + const int stride_height = 1; + const int dilation_width_factor = 1; + const int dilation_height_factor = 1; + const int pad_width = 0; + const int pad_height = 0; + const int depth_multiplier = 1; + const int input_offset = 0; // symmetric + const int output_offset = 0; // symmetric + const int output_activation_min = -128; + const int output_activation_max = 127; + + if (rt_handle == NULL) { + return -1; + } + if (cvk_ctx == NULL) { + return -1; + } + + cvk_tl_shape_t input_shape = {/*n=*/1, /*h=*/5, /*w=*/6, /*c=*/8}; + int8_t input_data[240] = { + 103, 85, -96, 120, 105, -72, 33, -50, -104, 12, -57, -80, + 12, 126, 117, 127, 119, 119, -88, 57, 120, 123, 117, -100, + -4, 76, 76, -52, -92, -127, -21, -100, 106, 35, 74, 96, + 117, 0, 39, 76, -119, -36, 89, -74, 111, 46, 45, -26, + 65, 61, 62, -7, -28, -20, 39, -84, -85, -51, 52, 76, + -120, -47, -58, 95, -117, -90, -104, 126, 82, 82, 49, -96, + -47, 67, 115, -3, -120, 41, -16, -96, -31, -75, 67, -115, + 75, -119, -81, -24, -3, -11, -14, -4, 37, 75, 53, 107, + 65, 78, -58, 52, 46, -128, 39, 53, -87, 36, -98, -12, + -1, 70, 117, 18, -41, 96, 21, 78, -71, -124, 64, 82, + -63, 82, 1, 112, 50, -23, 100, -20, 117, 20, 12, -88, + -93, 67, -90, -70, -63, 79, 87, 125, -63, -43, 80, -52, + -66, -125, 109, -73, -39, 104, -78, 89, -64, 116, 29, 71, + -7, 124, -38, -111, 84, 75, 21, 24, 12, 59, 106, 49, + -55, 46, 65, -28, 64, 15, -31, -75, 17, 7, -109, -25, + -115, -38, 7, 23, 71, -37, 111, 119, -95, -89, 17, -27, + -8, -29, -125, 58, -42, -29, -87, 109, 75, -17, -49, 92, + 7, 30, -86, -98, 26, -8, -61, -41, 39, 7, 48, 55, + 63, 125, -13, 56, -107, 105, -70, 1, 105, 14, -89, 0, + 83, -10, 9, 11, 127, -14, -108, 90, -15, 26, -101, -1}; + + cvk_tl_shape_t filter_shape = {1, 3, 3, 8}; + int8_t filter_data[72] = { + 103, 85, -96, 120, 105, -72, 33, -50, -104, 12, -57, -80, + 12, 126, 117, 127, 119, 119, -88, 57, 120, 123, 117, -100, + -4, 76, 76, -52, -92, -127, -21, -100, 106, 35, 74, 96, + 117, 0, 39, 76, -119, -36, 89, -74, 111, 46, 45, -26, + 65, 61, 62, -7, -28, -20, 39, -84, -85, -51, 52, 76, + -120, -47, -58, 95, -117, -90, -104, 126, 82, 82, 49, -96}; + + int32_t bias_data[8] = {812, 670, -746, 938, 827, -558, 265, -384}; + + uint32_t output_multiplier[8] = {1155460505, 1210948247, 1203328687, 1166122678, + 1155273687, 1196350022, 1169748238, 1183287581}; + + int8_t output_rshift[8] = {-7, -6, -6, -9, -8, -6, -6, -7}; + + cvk_tl_shape_t output_shape = {1, 3, 4, 8}; + int8_t output_data[96] = {0}; + int8_t ref_output_data[96] = { + 127, 127, 69, 34, 36, 127, 127, 127, -101, -65, 39, 13, + 26, 6, 127, -67, 60, 123, 31, 17, 3, -128, -58, -64, + -128, 26, -128, -21, 72, 55, 127, 94, -46, -128, -37, 1, + -6, 109, 98, -14, -11, 48, -128, -3, -50, 37, -20, 79, + -94, -36, 127, 19, 3, -18, -40, -115, 24, 124, -128, -1, + -52, -123, -54, -1, -62, 95, 127, 24, 10, -74, 127, -128, + -2, 111, 106, 4, 3, -128, 127, 127, -30, 98, -21, -1, + -11, -12, 58, -72, -128, 127, 30, 32, -85, -11, -35, 34}; + + const int batches = input_shape.n; + // const int output_depth = 8; + const int input_height = input_shape.c; + const int input_width = input_shape.h; + const int input_depth = input_shape.w; + const int filter_height = filter_shape.c; + const int filter_width = filter_shape.h; + const int output_height = output_shape.c; + const int output_width = output_shape.h; + + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < 
output_width; ++out_x) { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) { + for (int m = 0; m < depth_multiplier; ++m) { + const int output_channel = m + in_channel * depth_multiplier; + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; + // Zero padding by omitting the areas outside the image. + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + if (is_point_inside_image) { + int32_t input_val = input_data[Offset(input_shape, batch, in_y, + in_x, in_channel)]; + int32_t filter_val = filter_data[Offset( + filter_shape, 0, filter_y, filter_x, output_channel)]; + acc += filter_val * (input_val + input_offset); + + printf(" [batch=%d][out_y=%d][out_x=%d][in_channel=%d][m=%d]" + "[filter_y=%d][filter_x=%d] acc(%d) += %d * (%d + %d) " + "= %d\n", + batch, out_y, out_x, in_channel, m, filter_y, filter_x, + acc - filter_val * (input_val + input_offset), + filter_val, input_val, input_offset, acc); + } + } + } + if (1 /*bias_data*/) { + acc += bias_data[output_channel]; + } + + printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = " + "%d, bias %d\n", + batch, out_y, out_x, output_channel, acc, + bias_data[output_channel]); + + acc = MultiplyByQuantizedMultiplier( + acc, output_multiplier[output_channel], + output_rshift[output_channel]); + + printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = " + "%d, multiplier %d, shift %d\n", + batch, out_y, out_x, output_channel, acc, + output_multiplier[output_channel], + output_rshift[output_channel]); + + acc += output_offset; + acc = MAX(acc, output_activation_min); + acc = MIN(acc, output_activation_max); + + printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = " + "%d\n", + batch, out_y, out_x, output_channel, acc); + + { + int x = Offset(output_shape, batch, out_y, out_x, output_channel); + if (x >= 96) { + printf("Error ! 
shape=(%d, %d, %d, %d), batch %d, out_y %d, " + "out_x %d, output_channel %d, offset %d\n", + output_shape.n, output_shape.c, output_shape.h, + output_shape.w, batch, out_y, out_x, output_channel, x); + } + } + + output_data[Offset(output_shape, batch, out_y, out_x, + output_channel)] = acc; + } + } + } + } + } + + int output_size = + output_shape.n * output_shape.c * output_shape.h * output_shape.w; + for (int i = 0; i < output_size; ++i) { + if (output_data[i] != ref_output_data[i]) { + printf(" output_data[%d] = %d != %d\n", i, output_data[i], + ref_output_data[i]); + ret = -1; + } + } + + return ret; +} + +typedef struct { + int stride_width; + int stride_height; + int dilation_width_factor; + int dilation_height_factor; + int padding_width; + int padding_height; + int depth_multiplier; +} DwConvParams; + +void dw_conv_per_channel_ref(const dw_conv_test_param_t *p_param) +{ + const int input_offset = 0; // symmetric + const int output_offset = 0; // symmetric + const int output_activation_min = -128; + const int output_activation_max = 127; + + const int stride_width = p_param->stride_w; + const int stride_height = p_param->stride_h; + const int dilation_width_factor = 1; // params.dilation_width_factor; + const int dilation_height_factor = 1; // params.dilation_height_factor; + const int pad_width = p_param->pad_left; + const int pad_height = p_param->pad_top; + const int depth_multiplier = 1; // params.depth_multiplier; + + const int batches = p_param->input_n; + const int input_height = p_param->input_h; + const int input_width = p_param->input_w; + const int input_depth = p_param->input_c; + const int filter_height = p_param->kh; + const int filter_width = p_param->kw; + const int output_depth = p_param->output_c; + const int output_height = p_param->output_h; + const int output_width = p_param->output_w; + int8_t *input_data = p_param->input_data; + int8_t *filter_data = p_param->filter_data; + int8_t *output_data = p_param->output_data; + int32_t *bias_data = p_param->has_bias ? 
p_param->bias_data : NULL; + uint32_t *output_multiplier = p_param->multiplier_data; + int8_t *output_rshift = p_param->shift_data; + + cvk_tl_shape_t input_shape = { + batches, input_depth, input_height, input_width}; + cvk_tl_shape_t filter_shape = { + output_depth, input_depth, filter_height, filter_width}; + cvk_tl_shape_t output_shape = { + batches, output_depth, output_height, output_width}; + +#ifdef ENABLE_DEBUG_MSG + printf("dw_conv_per_channel_ref =>\n"); + printf(" input shape (n=%d, c=%d, h=%d, w=%d)\n", batches, input_depth, + input_height, input_width); + // printf(" filter shape (oc=%d, kh=%d, kw=%d\n", + // ); + printf(" output shape (n=%d, c=%d, h=%d, w=%d)\n", batches, output_depth, + output_height, output_width); + printf(" stride_h %d, stride_w %d\n", stride_height, stride_width); +#endif + + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) { + for (int m = 0; m < depth_multiplier; ++m) { + const int output_channel = m + in_channel * depth_multiplier; + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; + // Zero padding by omitting the areas outside the image. + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + if (is_point_inside_image) { + int32_t input_val = input_data[Offset(input_shape, batch, + in_channel, in_y, in_x)]; + int32_t filter_val = filter_data[Offset( + filter_shape, 0, output_channel, filter_y, filter_x)]; + acc += filter_val * (input_val + input_offset); + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_y=%d][out_x=%d][in_channel=%d][m=%d]" + "[filter_y=%d][filter_x=%d] acc(%d) += %d * (%d + %d) " + "= %d, in_x_origin %d, in_x %d\n", + batch, out_y, out_x, in_channel, m, filter_y, filter_x, + acc - filter_val * (input_val + input_offset), + filter_val, input_val, input_offset, acc, in_x_origin, + in_x); +#endif + } + } + } + if (bias_data) { + acc += bias_data[output_channel]; + } + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = " + "%d, bias %d\n", + batch, out_y, out_x, output_channel, acc, + bias_data ? 
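+/* Requantization step (sketch, assuming the TFLite-style rounding-doubling
+ * convention implemented by test_tf_quant_util.h):
+ * MultiplyByQuantizedMultiplier(acc, m, s) computes roughly
+ * acc * m / 2^31, rounded, then shifted right by s. E.g. acc = 1000 with
+ * m = 1155460505 (~0.538 * 2^31) and s = 7 gives about
+ * 1000 * 0.538 / 128 ~= 4. */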
bias_data[output_channel] : 0); +#endif + + acc = MultiplyByQuantizedMultiplier( + acc, output_multiplier[output_channel], + output_rshift[output_channel]); + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = " + "%d, multiplier %d, shift %d\n", + batch, out_y, out_x, output_channel, acc, + output_multiplier[output_channel], + output_rshift[output_channel]); +#endif + + acc += output_offset; + acc = MAX(acc, output_activation_min); + acc = MIN(acc, output_activation_max); + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = " + "%d\n", + batch, out_y, out_x, output_channel, acc); +#endif + + output_data[Offset(output_shape, batch, output_channel, out_y, + out_x)] = acc; + } + } + } + } + } + +#ifdef ENABLE_DEBUG_MSG + printf("<= dw_conv_per_channel_ref\n"); +#endif +} + +void calc_dw_conv_float_multiplier(dw_conv_test_param_t *p_param) +{ + const int input_offset = 0; // symmetric + + const int stride_width = p_param->stride_w; + const int stride_height = p_param->stride_h; + const int dilation_width_factor = 1; // params.dilation_width_factor; + const int dilation_height_factor = 1; // params.dilation_height_factor; + const int pad_width = p_param->pad_left; + ; + const int pad_height = p_param->pad_top; + const int depth_multiplier = 1; // params.depth_multiplier; + + const int batches = p_param->input_n; + const int input_height = p_param->input_h; + const int input_width = p_param->input_w; + const int input_depth = p_param->input_c; + const int filter_height = p_param->kh; + const int filter_width = p_param->kw; + const int output_depth = p_param->output_c; + const int output_height = p_param->output_h; + const int output_width = p_param->output_w; + int8_t *input_data = p_param->input_data; + int8_t *filter_data = p_param->filter_data; + int32_t *bias_data = p_param->has_bias ? p_param->bias_data : NULL; + + cvk_tl_shape_t input_shape = { + batches, input_depth, input_height, input_width}; + cvk_tl_shape_t filter_shape = { + output_depth, input_depth, filter_height, filter_width}; + + int output_accu_min = INT_MAX; + int output_accu_max = INT_MIN; + + // printf("calc_dw_conv_float_multiplier =>\n"); + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) { + for (int m = 0; m < depth_multiplier; ++m) { + const int output_channel = m + in_channel * depth_multiplier; + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; + // Zero padding by omitting the areas outside the image. 
+ const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + if (is_point_inside_image) { + int32_t input_val = input_data[Offset(input_shape, batch, + in_channel, in_y, in_x)]; + int32_t filter_val = filter_data[Offset( + filter_shape, 0, output_channel, filter_y, filter_x)]; + acc += filter_val * (input_val + input_offset); + + // printf(" + // [batch=%d][out_y=%d][out_x=%d][in_channel=%d][m=%d]" + // "[filter_y=%d][filter_x=%d] acc(%d) += %d * (%d + + // %d) = %d\n", + // batch, out_y, out_x, in_channel, m, filter_y, + // filter_x, acc - filter_val * (input_val + + // input_offset), filter_val, input_val, input_offset, + // acc); + } + } + } + if (bias_data) { + acc += bias_data[output_channel]; + } + + output_accu_max = MAX(acc, output_accu_max); + output_accu_min = MIN(acc, output_accu_min); + + // printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = + // %d, MIN = %d, MAX = %d\n", + // batch, out_y, out_x, output_channel, acc, + // output_accu_min, output_accu_max); + } + } + } + } + } + + // Since int8 ranges from -128 to 127, we need to squeeze the accumulator + // MIN/MAX fit in those ranges correspondingly as much as possible. + if (abs(output_accu_max) > abs(output_accu_min)) { + p_param->float_multiplier = 127.0f / abs(output_accu_max); + } else { + p_param->float_multiplier = 128.0f / abs(output_accu_min); + } + +#ifdef ENABLE_DEBUG_MSG + printf(" output_accu_min %d, output_accu_max %d, output_multiplier %f\n", + output_accu_min, output_accu_max, p_param->float_multiplier); +#endif + + // printf("<= calc_dw_conv_float_multiplier\n"); +} + +int simple_dw_conv_test(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + + if (rt_handle == NULL) { + return -1; + } + if (cvk_ctx == NULL) { + return -1; + } + + const int batches = 1; + const int input_depth = 8; + const int input_height = 5; + const int input_width = 6; + cvk_tl_shape_t input_shape = {batches, input_depth, input_height, input_width}; + int8_t input_data[240] = { + /* ic = 0 */ + 103, -104, 119, -4, 106, -119, 65, -85, -117, -47, -31, -3, 65, -87, -41, + -63, 117, -63, -66, -64, 84, -55, 17, 71, -8, 75, 26, 63, 105, 127, + + /* ic = 1 */ + 85, 12, 119, 76, 35, -36, 61, -51, -90, 67, -75, -11, 78, 36, 96, 82, 20, + 79, -125, 116, 75, 46, 7, -37, -29, -17, -8, 125, 14, -14, + + /* ic = 2 */ + -96, -57, -88, 76, 74, 89, 62, 52, -104, 115, 67, -14, -58, -98, 21, 1, + 12, 87, 109, 29, 21, 65, -109, 111, -125, -49, -61, -13, -89, -108, + + /* ic = 3 */ + 120, -80, 57, -52, 96, -74, -7, 76, 126, -3, -115, -4, 52, -12, 78, 112, + -88, 125, -73, 71, 24, -28, -25, 119, 58, 92, -41, 56, 0, 90, + + /* ic = 4 */ + 105, 12, 120, -92, 117, 111, -28, -120, 82, -120, 75, 37, 46, -1, -71, 50, + -93, -63, -39, -7, 12, 64, -115, -95, -42, 7, 39, -107, 83, -15, + + /* ic = 5 */ + -72, 126, 123, -127, 0, 46, -20, -47, 82, 41, -119, 75, -128, 70, -124, + -23, 67, -43, 104, 124, 59, 15, -38, -89, -29, 30, 7, 105, -10, 26, + + /* ic = 6 */ + 33, 117, 117, -21, 39, 45, 39, -58, 49, -16, -81, 53, 39, 117, 64, 100, + -90, 80, -78, -38, 106, -31, 7, 17, -87, -86, 48, -70, 9, -101, + + /* ic = 7 */ + -50, 127, -100, -100, 76, -26, -84, 95, -96, -96, -24, 107, 53, 18, 82, + -20, -70, -52, 89, -111, 49, -75, 23, -27, 109, -98, 55, 1, 11, -1}; + + const int kernel_height = 3; + const int kernel_width = 3; + cvk_tl_shape_t filter_shape = {1, input_depth, kernel_height, kernel_width}; + // Global memory layout: OcKhKw + int8_t filter_data[72] = { + 103, -104, 119, -4, 
106, -119, 65, -85, -117, 85, 12, 119, + 76, 35, -36, 61, -51, -90, -96, -57, -88, 76, 74, 89, + 62, 52, -104, 120, -80, 57, -52, 96, -74, -7, 76, 126, + 105, 12, 120, -92, 117, 111, -28, -120, 82, -72, 126, 123, + -127, 0, 46, -20, -47, 82, 33, 117, 117, -21, 39, 45, + 39, -58, 49, -50, 127, -100, -100, 76, -26, -84, 95, -96}; + + int32_t bias_data[8] = {812, 670, -746, 938, 827, -558, 265, -384}; + + uint32_t output_multiplier[8] = {1155460505, 1210948247, 1203328687, 1166122678, + 1155273687, 1196350022, 1169748238, 1183287581}; + + // Change to right shift + int8_t output_rshift[8] = {7, 6, 6, 9, 8, 6, 6, 7}; + + uint8_t per_channel_cal_data[8 * 4 + 8 * 4 + 8]; + pack_chl_quan_param(8, /*has_bias=*/true, bias_data, output_multiplier, + output_rshift, per_channel_cal_data); + + const int output_height = 3; + const int output_width = 4; + cvk_tl_shape_t output_shape = {batches, input_depth, output_height, output_width}; + int8_t ref_output_data[96] = { + /* oc = 0 */ + 127, -101, 60, -128, -46, -11, -94, 24, -62, -2, -30, -128, + + /* oc = 1 */ + 127, -65, 123, 26, -128, 48, -36, 124, 95, 111, 98, 127, + + /* oc = 2 */ + 69, 39, 31, -128, -37, -128, 127, -128, 127, 106, -21, 30, + + /* oc = 3 */ + 34, 13, 17, -21, 1, -3, 19, -1, 24, 4, -1, 32, + + /* oc = 4 */ + 36, 26, 3, 72, -6, -50, 3, -52, 10, 3, -11, -85, + + /* oc = 5 */ + 127, 6, -128, 55, 109, 37, -18, -123, -74, -128, -12, -11, + + /* oc = 6 */ + 127, 127, -58, 127, 98, -20, -40, -54, 127, 127, 58, -35, + + /* oc = 7 */ + 127, -67, -64, 94, -14, 79, -115, -1, -128, 127, -72, 34}; + + cvk_tl_t *tl_input = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, input_shape, CVK_FMT_I8, /*eu_aign=*/1); + + cvk_tl_t *tl_filter = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, filter_shape, CVK_FMT_I8, /*eu_align=*/1); + + cvk_tl_t *tl_output = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, output_shape, CVK_FMT_I8, /*eu_align=*/1); + + cvk_tl_t *tl_per_channel_cal = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape_t4(1, 8, 1, 9), CVK_FMT_U8, + /*eu_align*/ 0); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_per_channel_cal, per_channel_cal_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_input, (uint8_t *)input_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_filter, (uint8_t *)filter_data); + + { + // Reshape per channel quantization data + tl_per_channel_cal->shape = tl_shape_t4(1, 8, 1, 1); + tl_per_channel_cal->stride = cvk_ctx->ops->tl_default_stride( + cvk_ctx, tl_per_channel_cal->shape, CVK_FMT_I8, /*eu_align=*/0); + + cvk_tiu_depthwise_convolution_param_t param; + memset(¶m, 0, sizeof(param)); + param.ofmap = tl_output; + param.ifmap = tl_input; + param.weight = tl_filter; + param.chl_quan_param = tl_per_channel_cal; + param.dilation_h = 1; + param.dilation_w = 1; + param.stride_h = 1; + param.stride_w = 1; + param.has_bias = 1; + cvk_ctx->ops->tiu_depthwise_convolution(cvk_ctx, ¶m); + } + + CVI_RT_Submit(cvk_ctx); + + printf("Compare tiu and golden\n"); + int8_t *conv_output_data = + (int8_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_output); + for (int i = 0; i < (int)sizeof(ref_output_data); i++) { + if (conv_output_data[i] != ref_output_data[i]) { + printf("output_data[%d] %d != %d\n", i, conv_output_data[i], + ref_output_data[i]); + ret = -1; + } + } + + free(conv_output_data); + + int8_t output_data[96] = {0}; + memset(output_data, 0, sizeof(output_data)); + + dw_conv_test_param_t params; + memset(¶ms, 0, sizeof(params)); + params.input_n = batches; + params.input_c = input_depth; + params.input_h = input_height; + params.input_w 
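+/* Layout of the packed per-channel quantization data used above (assuming
+ * pack_chl_quan_param() emits bias, then multiplier, then shift): with bias
+ * present each channel contributes 9 bytes -- int32 bias, uint32 multiplier,
+ * int8 shift -- hence the 8 * 4 + 8 * 4 + 8 buffer for 8 channels, loaded as
+ * a (1, 8, 1, 9) U8 tensor and then reshaped to (1, 8, 1, 1) for the TIU. */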
= input_width; + params.kh = kernel_height; + params.kw = kernel_width; + params.output_c = input_depth; + params.output_h = output_height; + params.output_w = output_width; + params.stride_w = 1; + params.stride_h = 1; + params.input_data = input_data; + params.filter_data = filter_data; + params.output_data = output_data; + params.has_bias = 1; + params.bias_data = bias_data; + params.multiplier_data = output_multiplier; + params.shift_data = output_rshift; + + dw_conv_per_channel_ref(¶ms); + + printf("Compare ref and golden\n"); + int output_size = + output_shape.n * output_shape.c * output_shape.h * output_shape.w; + for (int i = 0; i < output_size; ++i) { + if (output_data[i] != ref_output_data[i]) { + printf(" output_data[%d] = %d != %d\n", i, output_data[i], + ref_output_data[i]); + ret = -1; + } + } + + // Reverse order + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_per_channel_cal); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_filter); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input); + + return ret; +} + +int choose_from_range(int table[], int size, int index) +{ + if (index >= size) { + return 0; + } + + int val = table[index]; + if (index < (size - 1)) { + int range = MAX(table[index + 1] - table[index] - 1, 1); + val += rand() % range; + } + + return val; +} + +void dump_test_param(dw_conv_test_param_t *p_param, bool dump_content) +{ + printf("Dump test parameter:\n"); + printf(" input_n %d\n", p_param->input_n); + printf(" input_c %d\n", p_param->input_c); + printf(" input_h %d\n", p_param->input_h); + printf(" input_w %d\n", p_param->input_w); + printf(" kw %d\n", p_param->kw); + printf(" kh %d\n", p_param->kh); + printf(" dh %d\n", p_param->dh); + printf(" dw %d\n", p_param->dw); + printf(" pad_top %d\n", p_param->pad_top); + printf(" pad_bot %d\n", p_param->pad_bot); + printf(" pad_left %d\n", p_param->pad_left); + printf(" pad_right %d\n", p_param->pad_right); + printf(" ins_h %d\n", p_param->ins_h); + printf(" ins_h_last %d\n", p_param->ins_h_last); + printf(" ins_w %d\n", p_param->ins_w); + printf(" ins_w_last %d\n", p_param->ins_w_last); + printf(" stride_h %d\n", p_param->stride_h); + printf(" stride_w %d\n", p_param->stride_w); + printf(" output_c %d\n", p_param->output_c); + printf(" output_h %d\n", p_param->output_h); + printf(" output_w %d\n", p_param->output_w); + printf(" has_bias %d\n", p_param->has_bias); + printf(" relu_enable %d\n", p_param->relu_enable); + + if (dump_content) { + printf("input_data(%d, %d, %d, %d) :\n", p_param->input_n, p_param->input_c, + p_param->input_h, p_param->input_w); + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + for (int i = 0; i < in; ++i) { + for (int j = 0; j < ic; ++j) { + for (int k = 0; k < ih; ++k) { + for (int l = 0; l < iw; ++l) { + int offset = i * (ic * ih * iw) + j * (ih * iw) + k * iw + l; + printf("%d, ", p_param->input_data[offset]); + } + printf("\n"); + } + } + } + printf("\n\n"); + + printf("kener_data (%d, %d, %d)\n", p_param->output_c, p_param->kh, + p_param->kw); + int kh = p_param->kh; + int kw = p_param->kw; + for (int i = 0; i < ic; ++i) { + for (int j = 0; j < kh; ++j) { + for (int k = 0; k < kw; ++k) { + int offset = i * (kh * kw) + j * kw + k; + printf("%d, ", p_param->filter_data[offset]); + } + } + printf("\n"); + } + printf("\n\n"); + + if (p_param->has_bias) { + printf("bias_data:\n"); + for (int i = 0; i < ic; ++i) { + printf("%d, ", p_param->bias_data[i]); + } + 
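+/* choose_from_range() above returns table[index] plus a random offset capped
+ * by the gap to the next entry, i.e. a value in roughly
+ * [table[index], table[index + 1] - 2]: e.g. for table = {1, 512, 4063} and
+ * index = 0 it returns 1 + rand() % 510, so something in [1, 510]; the last
+ * entry has no successor and is returned unchanged. */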
printf("\n\n"); + } + + printf("multiplier_data:\n"); + for (int i = 0; i < ic; ++i) { + printf("%d, ", p_param->multiplier_data[i]); + } + printf("\n\n"); + + printf("shift_data:\n"); + for (int i = 0; i < ic; ++i) { + printf("%d, ", p_param->shift_data[i]); + } + printf("\n\n"); + } +} + +int run_compare_dw_conv(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, + dw_conv_test_param_t *p_param) +{ + int ret = 0; + + if (rt_handle == NULL || cvk_ctx == NULL) { + return -1; + } + + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int oh = p_param->output_h; + int ow = p_param->output_w; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_last_h = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_last_w = p_param->ins_w_last; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int has_bias = p_param->has_bias; + int relu_enable = p_param->relu_enable; + + int input_size = in * ic * iw * ih; + int8_t *input_data = (int8_t *)malloc(input_size); + + int kernel_size = oc * ic * kh * kw; + int8_t *kernel_data = (int8_t *)malloc(kernel_size); + + int output_size = in * oc * oh * ow; + int8_t *output_data = (int8_t *)malloc(output_size); + memset(output_data, 0, output_size); + + int32_t *bias_data = (int32_t *)malloc(sizeof(int32_t) * oc); + uint32_t *multiplier_data = (uint32_t *)malloc(sizeof(uint32_t) * oc); + int8_t *shift_data = (int8_t *)malloc(oc); + + p_param->input_data = input_data; + p_param->filter_data = kernel_data; + p_param->output_data = output_data; + p_param->has_bias = has_bias; + p_param->bias_data = bias_data; + p_param->multiplier_data = multiplier_data; + p_param->shift_data = shift_data; + +#ifdef ENABLE_DEBUG_MSG + printf(" run_compare_dw_conv =>\n"); + printf(" input (n=%d, ic=%d, h=%d, w=%d), kernel (oc=%d, ic=%d, h=%d, " + "w=%d), output (, c=%d, h=%d, w=%d), has_bias %d\n", + in, ic, ih, iw, oc, ic, kh, kw, oc, oh, ow, has_bias); +#endif + + int retry_cnt = p_param->retry_cnt; + do { + fill_random_data_s8(input_data, input_size); + fill_random_data_s8(kernel_data, kernel_size); + if (has_bias) { + fill_random_data_s32(bias_data, oc); + } + + p_param->float_multiplier = 100.0; // should be < 1.0 + calc_dw_conv_float_multiplier(p_param); + + if (p_param->float_multiplier > 0.f && p_param->float_multiplier < 1.0) { + break; + } + + } while (--retry_cnt); + + if (p_param->float_multiplier >= 1.0) { + printf(" run_compare_dw_conv: unable to find valid multiplier\n"); + free(input_data); + free(kernel_data); + free(output_data); + free(bias_data); + free(multiplier_data); + free(shift_data); + return -1; + } + + uint32_t base_multiplier = 0; + int base_shift = 0; + QuantizeMultiplierSmallerThanOne(p_param->float_multiplier, &base_multiplier, + &base_shift); + + for (int i = 0; i < oc; ++i) { + // multipliers typically range in [2^30 ; 2^31 - 1]. + // Values in [0, 2^30 - 1] are normally unused, but harmless. + // Thus a good way to randomize multipliers is to subtract from them + // a random value smaller than 2^30 but still significant compared to it. 
+ p_param->multiplier_data[i] = base_multiplier - (rand() % (1 << 26)); + + int right_shift = base_shift - 1 + (rand() % 4); + p_param->shift_data[i] = + truncate_rshift((int8_t)right_shift, /*allow_lshift*/1); + +#ifdef ENABLE_DEBUG_MSG + printf(" [oc=%d] multiplier_data %d, shift_data %d\n", i, + p_param->multiplier_data[i], p_param->shift_data[i]); +#endif + } + + dw_conv_per_channel_ref(p_param); + + const int per_chan_cal_data_size = + p_param->has_bias ? 9 : 5; // bias(4) + multiplier(4) + shift(1) + const int cal_data_size = oc * per_chan_cal_data_size; + uint8_t *cal_data = (uint8_t *)malloc(cal_data_size); + pack_chl_quan_param(oc, p_param->has_bias, p_param->bias_data, + p_param->multiplier_data, p_param->shift_data, + cal_data); + + cvk_tl_shape_t input_shape = tl_shape_t4(in, ic, ih, iw); + cvk_tl_shape_t filter_shape = tl_shape_t4(1, oc, kh, kw); + cvk_tl_shape_t output_shape = tl_shape_t4(in, oc, oh, ow); + cvk_tl_shape_t cal_shape = tl_shape_t4(1, oc, 1, per_chan_cal_data_size); + + cvk_tl_t *tl_input = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, input_shape, CVK_FMT_I8, /*eu_aign=*/1); + + cvk_tl_t *tl_filter = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, filter_shape, CVK_FMT_I8, /*eu_align=*/1); + + cvk_tl_t *tl_output = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, output_shape, CVK_FMT_I8, /*eu_align=*/1); + + // Shape for TDMA load + cvk_tl_t *tl_cal_data = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, cal_shape, CVK_FMT_U8, /*eu_align*/ 0); + + if (tl_input == NULL) { + printf(" fail to alloc tl_input (%d, %d, %d, %d)\n", input_shape.n, + input_shape.c, input_shape.h, input_shape.w); + return -1; + } + if (tl_filter == NULL) { + printf(" fail to alloc tl_filter (%d, %d, %d, %d)\n", filter_shape.n, + filter_shape.c, filter_shape.h, filter_shape.w); + return -1; + } + if (tl_output == NULL) { + printf(" fail to alloc tl_output (%d, %d, %d, %d)\n", output_shape.n, + output_shape.c, output_shape.h, output_shape.w); + return -1; + } + if (tl_cal_data == NULL) { + printf(" fail to alloc tl_cal_data (%d, %d, %d, %d)\n", cal_shape.n, + cal_shape.c, cal_shape.h, cal_shape.w); + return -1; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_cal_data, cal_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_input, (uint8_t *)input_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_filter, (uint8_t *)kernel_data); + + { + // Reshape per channel quantization data for TIU + tl_cal_data->shape = tl_shape_t4(1, oc, 1, 1); + tl_cal_data->stride = cvk_ctx->ops->tl_default_stride( + cvk_ctx, tl_cal_data->shape, CVK_FMT_I8, /*eu_align=*/0); + + cvk_tiu_depthwise_convolution_param_t param; + memset(¶m, 0, sizeof(param)); + param.ofmap = tl_output; + param.ifmap = tl_input; + param.weight = tl_filter; + param.chl_quan_param = tl_cal_data; + param.ins_h = ins_h; + param.ins_last_h = ins_last_h; + param.ins_w = ins_w; + param.ins_last_w = ins_last_w; + param.stride_h = stride_h; + param.stride_w = stride_w; + param.dilation_h = dh; + param.dilation_w = dw; + param.pad_top = pad_top; + param.pad_bottom = pad_bot; + param.pad_left = pad_left; + param.pad_right = pad_right; + param.has_bias = has_bias; + param.relu_enable = relu_enable; + +#ifdef ENABLE_DEBUG_MSG + printf(" tiu_dw_conv_qdm:\n"); + printf(" ifmap shape (%d, %d, %d, %d)\n", param.ifmap->shape.n, + param.ifmap->shape.c, param.ifmap->shape.h, param.ifmap->shape.w); + printf(" weight shape (%d, %d, %d, %d)\n", param.weight->shape.n, + param.weight->shape.c, param.weight->shape.h, param.weight->shape.w); + printf(" ofmap shape (%d, %d, %d, 
%d)\n", param.ofmap->shape.n, + param.ofmap->shape.c, param.ofmap->shape.h, param.ofmap->shape.w); +#endif + + cvk_ctx->ops->tiu_depthwise_convolution(cvk_ctx, ¶m); + } + + CVI_RT_Submit(cvk_ctx); + +#ifdef ENABLE_DEBUG_MSG + printf(" compare result:\n"); +#endif + int8_t *conv_output_data = + (int8_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_output); + for (int i = 0; i < output_size; i++) { + if (conv_output_data[i] != output_data[i]) { + printf(" output_data[%d] %d(tiu) != %d(ref)\n", i, + conv_output_data[i], output_data[i]); + ret = -1; + break; + } + } + + if (ret) { + dump_test_param(p_param, /*dump_content=*/true); + } + + // Reverse order + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_cal_data); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_filter); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input); + + free(conv_output_data); + + free(input_data); + free(kernel_data); + free(output_data); + free(bias_data); + free(multiplier_data); + free(shift_data); + free(cal_data); + +#ifdef ENABLE_DEBUG_MSG + printf(" <= run_compare_dw_conv\n"); +#endif + + return ret; +} + +bool check_valid_test_param(cvk_context_t *cvk_ctx, dw_conv_test_param_t *p_param) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int oh = p_param->output_h; + int ow = p_param->output_w; + int kh = p_param->kh; + int kw = p_param->kw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int per_chan_cal_data_size = + p_param->has_bias ? 9 : 5; // bias(4) + multiplier(4) + shift(1) + + // Skip invalid shape + if ((kh > ih) || (kw > iw) || (stride_h > ih) || (stride_w > iw)) { + return false; + } + + // muliply random-choosen value may exceeded than int32_t + uint32_t input_size = in * ic * ih * iw; + uint32_t kernel_size = ic * kh * kw; // no oc + uint32_t output_size = in * oc * oh * ow; + + uint32_t lmem_size_per_lane = cvk_ctx->info.lmem_size; + uint32_t total_lmem_size = cvk_ctx->info.lmem_size * cvk_ctx->info.npu_num; + + uint32_t total_needed_size = input_size + kernel_size + output_size + + per_chan_cal_data_size * cvk_ctx->info.npu_num; + if (total_needed_size > total_lmem_size) { + return false; + } + + cvk_tl_shape_t input_shape = {in, ic, ih, iw}; + cvk_tl_shape_t filter_shape = {1, oc, kh, kw}; + cvk_tl_shape_t output_shape = {in, oc, oh, ow}; + cvk_tl_shape_t cal_shape = {1, oc, 1, per_chan_cal_data_size}; + + uint32_t needed_size = + cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, input_shape, CVK_FMT_I8, /*eu_align=*/1) + + cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, filter_shape, CVK_FMT_I8, /*eu_align=*/1) + + cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, output_shape, CVK_FMT_I8, /*eu_align=*/1) + + cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, cal_shape, CVK_FMT_I8, /*eu_align=*/0); + + // Skip invalid shape + if (needed_size > lmem_size_per_lane) { + return false; + } + + return true; +} + +int random_test(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + + if (rt_handle == NULL || cvk_ctx == NULL) { + return -1; + } + +#ifndef ENABLE_FULL_REGRESSION + // Input with same range size + // n: 12b, c: 12b, h: 12b(4095-32), wb: 12b(4095-32) + int batch_range[] = {1, 1, 3232}; + int input_height_range[] = {1, 512, 4095 - 32}; + int input_width_range[] = {1, 512, 4095 - 32}; + int input_depth_range[] = {1, 16, 4095 - 32}; + + // Kernel with same range size + // h: 12b, w: 12b + // stride_h: 4b, strid_w: 4b + int 
+
+int random_test(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx)
+{
+  int ret = 0;
+
+  if (rt_handle == NULL || cvk_ctx == NULL) {
+    return -1;
+  }
+
+#ifndef ENABLE_FULL_REGRESSION
+  // Input with same range size
+  // n: 12b, c: 12b, h: 12b(4095-32), w: 12b(4095-32)
+  int batch_range[] = {1, 1, 3232};
+  int input_height_range[] = {1, 512, 4095 - 32};
+  int input_width_range[] = {1, 512, 4095 - 32};
+  int input_depth_range[] = {1, 16, 4095 - 32};
+
+  // Kernel with same range size
+  // h: 12b, w: 12b
+  // stride_h: 4b, stride_w: 4b
+  int kernel_height_range[] = {1, 11, 2048, 4095};
+  int kernel_width_range[] = {1, 11, 2048, 4095};
+  int kernel_stride_height_range[] = {1, 5, 16, 31};
+  int kernel_stride_width_range[] = {1, 5, 16, 31};
+#else
+  // Input with same range size
+  // n: 12b, c: 12b, h: 12b(4095-32), w: 12b(4095-32)
+  int batch_range[] = {1, 2, 4, 8, 16, 32, 64, 4095 - 32};
+  int input_height_range[] = {1, 3, 11, 128, 512, 1024, 2048, 4095 - 32};
+  int input_width_range[] = {1, 3, 11, 128, 512, 1024, 2048, 4095 - 32};
+  int input_depth_range[] = {1, 3, 11, 32, 64, 1024, 2048, 4095 - 32};
+
+  // Kernel with same range size
+  // h: 12b, w: 12b
+  // stride_h: 4b, stride_w: 4b
+  int kernel_height_range[] = {1, 3, 11, 511, 4095};
+  int kernel_width_range[] = {1, 3, 11, 511, 4095};
+  int kernel_stride_height_range[] = {1, 3, 15, 16, 31};
+  int kernel_stride_width_range[] = {1, 3, 15, 16, 31};
+#endif /* ENABLE_FULL_REGRESSION */
+
+  const int input_range_size =
+      sizeof(input_height_range) / sizeof(input_height_range[0]);
+  const int kernel_range_size =
+      sizeof(kernel_height_range) / sizeof(kernel_height_range[0]);
+
+  int random_seed = clock();
+  srand(random_seed);
+
+  const int retry_test_count = 100;
+  bool stop_at_first_error = true;
+
+  int executed_tests = 0;
+  int failed_tests = 0;
+
+  printf("dw-conv-qm: random test =>\n");
+  for (int t = 0; t < retry_test_count; ++t) {
+    for (int i = 0; i < input_range_size; ++i) {
+      // randomly chosen from [range[i] : range[i+1]]
+      int batch = choose_from_range(batch_range, input_range_size, i);
+
+      for (int j = 0; j < input_range_size; ++j) {
+        int input_height =
+            choose_from_range(input_height_range, input_range_size, j);
+
+        for (int k = 0; k < input_range_size; ++k) {
+          int input_width =
+              choose_from_range(input_width_range, input_range_size, k);
+
+          for (int l = 0; l < input_range_size; ++l) {
+            int input_depth =
+                choose_from_range(input_depth_range, input_range_size, l);
+
+            for (int m = 0; m < kernel_range_size; ++m) {
+              int kernel_height =
+                  choose_from_range(kernel_height_range, kernel_range_size, m);
+
+              for (int n = 0; n < kernel_range_size; ++n) {
+                int kernel_width =
+                    choose_from_range(kernel_width_range, kernel_range_size, n);
+
+                for (int x = 0; x < kernel_range_size; ++x) {
+                  int kernel_stride_height = choose_from_range(
+                      kernel_stride_height_range, kernel_range_size, x);
+
+                  for (int y = 0; y < kernel_range_size; ++y) {
+                    int kernel_stride_width = choose_from_range(
+                        kernel_stride_width_range, kernel_range_size, y);
+
+                    int has_bias = rand() % 2;
+                    int dh = 1;
+                    int dw = 1;
+                    int ins_h = 0;
+                    int ins_h_last = 0;
+                    int ins_w = 0;
+                    int ins_w_last = 0;
+                    int pad_top = 0;
+                    int pad_bot = 0;
+                    int pad_left = 0;
+                    int pad_right = 0;
+
+                    int ih_ext = calc_dilute_hw(input_height, ins_h, ins_h_last,
+                                                pad_top, pad_bot);
+                    int iw_ext = calc_dilute_hw(input_width, ins_w, ins_w_last,
+                                                pad_left, pad_right);
+                    int kh_ext = calc_dilute_hw(kernel_height, dh - 1, 0, 0, 0);
+                    int kw_ext = calc_dilute_hw(kernel_width, dw - 1, 0, 0, 0);
+
+                    int oh =
+                        calc_output_hw(ih_ext, kh_ext, kernel_stride_height);
+                    int ow =
+                        calc_output_hw(iw_ext, kw_ext, kernel_stride_width);
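/*
 * Editorial note: for reference, the helpers used above implement the usual
 * dilated/padded size arithmetic (these formulas match conv_ih_ext(),
 * conv_kh_ext() and conv_oh() defined later in this patch):
 *
 *   ih_ext = (ih - 1) * (ins_h + 1) + ins_h_last + 1 + pad_top + pad_bot
 *   kh_ext = (kh - 1) * dh + 1
 *   oh     = (ih_ext - kh_ext) / stride_h + 1
 *
 * e.g. ih = 5, ins_h = ins_h_last = 0, no padding, kh = 3, dh = 1, stride_h = 2
 * gives ih_ext = 5, kh_ext = 3, oh = (5 - 3) / 2 + 1 = 2.
 */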
+
+                    // depthwise, input depth == output depth
+                    int output_depth = input_depth;
+
+                    dw_conv_test_param_t test_param;
+                    memset(&test_param, 0, sizeof(test_param));
+                    test_param.input_n = batch;
+                    test_param.input_c = input_depth;
+                    test_param.input_h = input_height;
+                    test_param.input_w = input_width;
+                    test_param.kh = kernel_height;
+                    test_param.kw = kernel_width;
+                    test_param.dh = dh;
+                    test_param.dw = dw;
+                    test_param.pad_top = pad_top;
+                    test_param.pad_bot = pad_bot;
+                    test_param.pad_left = pad_left;
+                    test_param.pad_right = pad_right;
+                    test_param.ins_h = ins_h;
+                    test_param.ins_h_last = ins_h_last;
+                    test_param.ins_w = ins_w;
+                    test_param.ins_w_last = ins_w_last;
+                    test_param.stride_h = kernel_stride_height;
+                    test_param.stride_w = kernel_stride_width;
+                    test_param.output_c = output_depth;
+                    test_param.output_h = oh;
+                    test_param.output_w = ow;
+                    test_param.has_bias = has_bias;
+                    test_param.retry_cnt = 5;
+
+                    bool is_valid_param =
+                        check_valid_test_param(cvk_ctx, &test_param);
+                    if (is_valid_param == false) {
+                      continue;
+                    }
+
+                    int ret2 = run_compare_dw_conv(rt_handle, cvk_ctx, &test_param);
+                    failed_tests = ret2 ? failed_tests + 1 : failed_tests;
+                    ret |= ret2;
+                    executed_tests++;
+
+#ifdef ENABLE_DEBUG_MSG
+                    printf("  [%d] random test: input shape(%d, %d, %d, %d)",
+                           executed_tests, batch, input_depth,
+                           input_height, input_width);
+                    printf(", kernel shape (%d, %d, %d, %d), result %d\n",
+                           output_depth, input_depth, kernel_height,
+                           kernel_width, ret);
+#endif
+
+                    // Stop at first error
+                    if (ret && stop_at_first_error) {
+                      break;
+                    }
+                  }
+
+                  // Stop at first error
+                  if (ret && stop_at_first_error) {
+                    break;
+                  }
+                }
+
+                // Stop at first error
+                if (ret && stop_at_first_error) {
+                  break;
+                }
+              }
+
+              // Stop at first error
+              if (ret && stop_at_first_error) {
+                break;
+              }
+            }
+
+            // Stop at first error
+            if (ret && stop_at_first_error) {
+              break;
+            }
+          }
+
+          // Stop at first error
+          if (ret && stop_at_first_error) {
+            break;
+          }
+        }
+
+        // Stop at first error
+        if (ret && stop_at_first_error) {
+          break;
+        }
+      }
+
+      // Stop at first error
+      if (ret && stop_at_first_error) {
+        break;
+      }
+    }
+
+    // Stop at first error
+    if (ret && stop_at_first_error) {
+      break;
+    }
+
+    if (executed_tests >= MIN_EXEC_TESTS) {
+      break;
+    }
+  }
+
+  printf("<= dw-conv-qm: random test, total %d, failed %d, ret %d\n",
+         executed_tests, failed_tests, ret);
+
+  return ret;
+}
+
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  // ret = simple_nhwc_dw_conv_test(rt_handle, cvk_ctx);
+  // ret |= test_nhwc_to_nchw();
+  ret |= simple_dw_conv_test(rt_handle, cvk_ctx);
+  ret |= random_test(rt_handle, cvk_ctx);
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
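/*
 * Editorial note: choose_from_range() is provided by the shared test utilities
 * and is not part of this patch. Based on the "randomly chosen from
 * [range[i] : range[i+1]]" comment above, a minimal equivalent could be
 * (hypothetical helper, shown only to document the sampling scheme):
 */
static int choose_from_range_sketch(const int range[], int size, int i)
{
  if (i + 1 >= size)
    return range[i];                       // last entry: no upper bound to draw to
  int span = range[i + 1] - range[i] + 1;  // inclusive interval width
  return range[i] + rand() % span;
}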
diff --git a/cviruntime/test/181x/test_181x_depthwise_max_power.c b/cviruntime/test/181x/test_181x_depthwise_max_power.c
new file mode 100644
index 000000000..3539dc77d
--- /dev/null
+++ b/cviruntime/test/181x/test_181x_depthwise_max_power.c
@@ -0,0 +1,631 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <stdint.h>
+#include <inttypes.h>
+
+#include "test_cvikernel_util.h"
+#include "test_native_ref.h"
+
+typedef cvk_tiu_depthwise_pt_convolution_param_t depthwise_conv_param_t;
+typedef cvk_tdma_l2g_tensor_copy_cw_transposed_param_t l2g_cw_param_t;
+typedef cvk_tdma_g2l_matrix_copy_row_col_transposed_param_t g2l_matrix_param_t;
+typedef cvk_tdma_l2l_tensor_copy_param_t l2l_tensor_copy_param_t;
+
+typedef struct {
+  int8_t *depthwise_conv_input;
+  int8_t *depthwise_conv_weight;
+  int16_t *depthwise_conv_bias;
+  uint8_t *depthwise_conv_output;
+  int8_t *depthwise_conv_output_ref;
+  uint8_t *l2g_cw_src;
+  uint8_t *l2g_cw_output;
+  uint8_t *l2g_cw_output_ref;
+  uint8_t *g2l_matrix_src;
+  uint8_t *g2l_matrix_output;
+  uint8_t *g2l_matrix_output_ref;
+  uint8_t *l2l_tensor_src;
+  uint8_t *l2l_tensor_output;
+  uint8_t *l2l_tensor_output_ref;
+} s_test_data;
+
+depthwise_conv_param_t depthwise_conv_param;
+l2g_cw_param_t l2g_cw_param;
+g2l_matrix_param_t g2l_matrix_param;
+l2l_tensor_copy_param_t l2l_tensor_copy_param;
+s_test_data s8_test_data;
+
+cvk_tl_t *skip_tensor_lmem[10];
+uint32_t skip_tensor_num = 0;
+
+void skip_tensor_lmem_size(cvk_context_t *cvk_ctx, const cvk_tl_t *p)
+{
+  uint32_t needed = align_up(p->shape.n * p->stride.n, cvk_ctx->info.eu_num);
+  uint32_t start_addr = p->start_address + needed;
+  // remaining bytes in the current bank, per lane
+  uint32_t remain_size = start_addr % cvk_ctx->info.lmem_bank_size ?
+      (cvk_ctx->info.lmem_bank_size - start_addr % cvk_ctx->info.lmem_bank_size) : 0;
+  if (remain_size) {
+    cvk_tl_shape_t src_shape2 = tl_shape_t4(1, cvk_ctx->info.npu_num, 1, remain_size);
+    // pad out the rest of the bank so the next allocation starts bank-aligned
+    skip_tensor_lmem[skip_tensor_num] =
+        cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, src_shape2, CVK_FMT_I8, 1);
+  }
+  skip_tensor_num++;
+}
+
+void skip_matrix_lmem_size(cvk_context_t *cvk_ctx, const cvk_ml_t *p)
+{
+  uint32_t needed = align_up(p->shape.n * p->stride.n, cvk_ctx->info.eu_num);
+
+  uint32_t start_addr = p->start_address + needed;  //src_shape.n*src_shape.c*src_shape.h*src_shape.w/32;
+  // remaining bytes in the current bank, per lane
+  uint32_t remain_size = start_addr % cvk_ctx->info.lmem_bank_size ?
+      (cvk_ctx->info.lmem_bank_size - start_addr % cvk_ctx->info.lmem_bank_size) : 0;
+  if (remain_size) {
+    cvk_tl_shape_t src_shape2 = {1, cvk_ctx->info.npu_num, 1, remain_size};
+    // pad out the rest of the bank so the next allocation starts bank-aligned
+    skip_tensor_lmem[skip_tensor_num] =
+        cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, src_shape2, CVK_FMT_I8, 1);
+  }
+  skip_tensor_num++;
+}
+
+void free_skip_tensor_lmem(cvk_context_t *cvk_ctx)
+{
+  if (skip_tensor_lmem[--skip_tensor_num] != NULL)
+    cvk_ctx->ops->lmem_free_tensor(cvk_ctx, skip_tensor_lmem[skip_tensor_num]);
+}
+
+static int8_t * alloc_input(const depthwise_conv_param_t *p)
+{
+  uint64_t size = tl_shape_size(&p->ifmap->shape, p->ifmap->fmt);
+  int8_t *buf = (int8_t *)malloc(sizeof(int8_t) * size);
+  if (!buf)
+    return NULL;
+
+  for (uint64_t i = 0; i < size; i++)
+    buf[i] = rand() % 256 - 128;
+
+  return buf;
+}
+
+static int8_t * alloc_weight(const depthwise_conv_param_t *p)
+{
+  int size = tl_shape_size(&p->weight->shape, p->weight->fmt);
+  int8_t *buf = (int8_t *)malloc(sizeof(int8_t) * size);
+  if (!buf)
+    return NULL;
+
+  for (int i = 0; i < size; i++)
+    buf[i] = rand() % 256 - 128;
+
+  return buf;
+}
+
+static int16_t * alloc_bias(const depthwise_conv_param_t *p)
+{
+  int c = p->bias->shape.c;
+  int16_t *bias = (int16_t *)malloc(sizeof(int16_t) * c);
+  if (!bias)
+    return NULL;
+
+  for (int i = 0; i < c; i++)
+    bias[i] = rand() % 65536 - 32768;
+
+  return bias;
+}
+
+static int8_t *alloc_output(depthwise_conv_param_t *p)
+{
+  uint64_t size = tl_shape_size(&p->ofmap->shape, p->ofmap->fmt);
+  int8_t *output = (int8_t *)malloc(sizeof(int8_t) * size);
+  return output;
+}
+
+static inline void relu8(int8_t *buf, uint64_t size)
+{
+  for (uint64_t i = 0; i < size; i++)
+    if (buf[i] < 0)
+      buf[i] = 0;
+}
+
+static int generate_results(
+    depthwise_conv_param_t *p,
+    int8_t input[],
+    int8_t weight[],
+    int16_t bias[]
+    )
+{
+  int in = p->ifmap->shape.n;
+  int ic = p->ifmap->shape.c;
+  int ih = p->ifmap->shape.h;
+  int iw = p->ifmap->shape.w;
+  int kh = 
p->weight->shape.h; + int kw = p->weight->shape.w; + int opd0_sign = (p->ifmap->fmt == CVK_FMT_I8); + int res0_sign = (p->ofmap->fmt == CVK_FMT_I8); + s8_test_data.depthwise_conv_output_ref = alloc_output(p); + + int ret = native_pooling_ave_int8( + input, weight, p->bias ? bias : NULL, s8_test_data.depthwise_conv_output_ref, + in, ic, ih, iw, kh, kw, + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right, + p->stride_h, p->stride_w, + p->ins_h, p->ins_w, p->ins_last_h, p->ins_last_w, + opd0_sign, res0_sign, p->rshift_bits, 0); + if (ret) + return ret; + + if(p->relu_enable ) + relu8(s8_test_data.depthwise_conv_output_ref, tl_shape_size(&p->ofmap->shape, p->ofmap->fmt)); + + return ret; +} + +static int pooling_ih_ext(depthwise_conv_param_t *p, int ih) +{ + int ins = p->ins_h; + int ins_last = p->ins_last_h; + int pad = p->pad_top + p->pad_bottom; + return (ih - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_iw_ext(depthwise_conv_param_t *p, int iw) +{ + int ins = p->ins_w; + int ins_last = p->ins_last_w; + int pad = p->pad_left + p->pad_right; + return (iw - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_oh(depthwise_conv_param_t *p, int ih, int kh) +{ + int ih_ext = pooling_ih_ext(p, ih); + return (ih_ext - kh) / p->stride_h + 1; +} + +static int pooling_ow(depthwise_conv_param_t *p, int iw, int kw) +{ + int iw_ext = pooling_iw_ext(p, iw); + return (iw_ext - kw) / p->stride_w + 1; +} + +static void free_depthwise_param( + cvk_context_t *cvk_ctx, + depthwise_conv_param_t *p) +{ + if (p->bias) + { + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->bias); + } + if (p->weight) + { + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->weight); + } + if (p->ifmap) + { + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->ifmap); + } + if (p->ofmap) + { + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->ofmap); + } +} + +static void put_bias_tensor( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + int16_t data[]) +{ + int c = tl->shape.c; + + uint8_t *lo_hi = (uint8_t *)malloc(2 * c); + if (!lo_hi) + return; + + for (int i = 0; i < c; i++) { + lo_hi[i] = data[i] & 0xff; + lo_hi[i + c] = (data[i] >> 8) & 0xff; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl, (uint8_t *)lo_hi); + + free(lo_hi); +} + +static depthwise_conv_param_t random_depthwise_param(cvk_context_t *cvk_ctx) +{ + srand(clock()); + depthwise_conv_param_t p; + int retry_cnt = 100; + + for (int i = 0; i < retry_cnt; i++) { + int using_bias = 0; + int n = 1; + int c = 1000; + int ih = 2; + int iw = 8; + int kh = 1; + int kw = 1; + int opd0_sign = 0; + + memset(&p, 0, sizeof(p)); + p.ins_h = rand() % kh; + p.ins_w = rand() % kw; + p.ins_last_h = rand() % kh; + p.ins_last_w = rand() % kw; + p.stride_h = rand() % kh + 1; + p.stride_w = rand() % kw + 1; + p.pad_top = 0; + p.pad_bottom = 0; + p.pad_left = 0; + p.pad_right = 0; + p.rshift_bits = 2; + int oh = pooling_oh(&p, ih, kh); + int ow = pooling_ow(&p, iw, kw); + cvk_tl_shape_t ofmap_shape; + ofmap_shape.n = n; + ofmap_shape.c = c; + ofmap_shape.h = oh; + ofmap_shape.w = ow; + cvk_tl_shape_t ifmap_shape; + ifmap_shape.n = n; + ifmap_shape.c = c; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + cvk_tl_shape_t weight_shape; + weight_shape.n = 1; + weight_shape.c = c; + weight_shape.h = kh; + weight_shape.w = kw; + cvk_tl_shape_t bias_shape; + bias_shape.n = 2; + bias_shape.c = c; + bias_shape.h = 1; + bias_shape.w = 1; + 
p.relu_enable = 1; + /*test case ref does not support dilation !=1*/ + p.dilation_w = 1; + p.dilation_h = 1; + cvk_fmt_t ifmt = opd0_sign ? CVK_FMT_I8: CVK_FMT_U8; + + p.ofmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ofmap_shape, CVK_FMT_I8, 1); + skip_tensor_lmem_size(cvk_ctx, p.ofmap); + p.ifmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ifmap_shape, ifmt, 1); + skip_tensor_lmem_size(cvk_ctx, p.ifmap); + p.weight = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, weight_shape, CVK_FMT_I8, 1); + skip_tensor_lmem_size(cvk_ctx, p.weight); + p.bias = NULL; + if (using_bias) + { + p.bias = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, bias_shape, CVK_FMT_I8, 0); + skip_tensor_lmem_size(cvk_ctx, p.bias); + } + if ((kh > pooling_ih_ext(&p, ih)) + || (kw > pooling_iw_ext(&p, iw)) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || !p.ofmap + || !p.ifmap + || !p.weight + || (using_bias && !p.bias)) { + printf("retry init_pooling_param\n"); + free_depthwise_param(cvk_ctx, &p); + } else + break; + } + + return p; +} + + +static int test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + depthwise_conv_param = random_depthwise_param(cvk_ctx); + + int8_t *input = alloc_input(&depthwise_conv_param); + int8_t *weight = alloc_weight(&depthwise_conv_param); + int16_t *bias = NULL; + if (depthwise_conv_param.bias) + bias = alloc_bias(&depthwise_conv_param); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, depthwise_conv_param.ifmap, (uint8_t *)input); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, depthwise_conv_param.weight, (uint8_t *)weight); + if (depthwise_conv_param.bias) + put_bias_tensor(rt_handle, cvk_ctx, depthwise_conv_param.bias, bias); + + int ret = generate_results(&depthwise_conv_param, input, weight, bias); + + free(input); + free(weight); + free(bias); + + return ret; +} + +static void l2g_tensor_copy_cw_transposed_ref( + l2g_cw_param_t *p, uint8_t ref_data[], uint8_t src_data[]) +{ + cvk_tl_shape_t s = p->src->shape; + uint32_t n = s.n; + uint32_t c = s.c; + uint32_t h = s.h; + uint32_t w = s.w; + + for (uint32_t ni = 0; ni < n; ni++) { + for (uint32_t ci = 0; ci < c; ci++) { + for (uint32_t hi = 0; hi < h; hi++) { + for (uint32_t wi = 0; wi < w; wi++) { + uint32_t src_i = ni * c * h * w + ci * h * w + hi * w + wi; + uint32_t dst_i = ni * c * h * w + wi * h * c + hi * c + ci; + ref_data[dst_i] = src_data[src_i]; + } + } + } + } +} + +static void test_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, l2g_cw_param_t *p) +{ + uint64_t size = tl_shape_size(&p->src->shape, p->src->fmt); + + s8_test_data.l2g_cw_src = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!s8_test_data.l2g_cw_src) + return; + + for (uint64_t i = 0; i < size; i++) + s8_test_data.l2g_cw_src[i] = rand()%0x100; + + s8_test_data.l2g_cw_output_ref = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!s8_test_data.l2g_cw_output_ref) + return; + + l2g_tensor_copy_cw_transposed_ref(p, s8_test_data.l2g_cw_output_ref, s8_test_data.l2g_cw_src); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, p->src, s8_test_data.l2g_cw_src); +} + +static void destroy_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, l2g_cw_param_t *p) +{ + free_tensor_dev_mem(rt_handle, p->dst); + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->src); +} + +static void test_l2g_cw_transpose(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, l2g_cw_param_t *p) +{ + cvk_tl_shape_t src_shape = {1, 0x100, 1, 0x020}; + cvk_tg_shape_t dst_shape = {1, 0x020, 1, 
0x100}; + + p->src = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, src_shape, CVK_FMT_I8, 1); + p->dst = alloc_tensor_dev_mem(rt_handle, cvk_ctx, dst_shape, CVK_FMT_I8); + skip_tensor_lmem_size(cvk_ctx, p->src); + test_param_l2g(rt_handle, cvk_ctx, p); +} + +static void g2l_matrix_copy_row_col_transposed_ref( + g2l_matrix_param_t *p, uint8_t ref_data[], uint8_t src_data[]) +{ + uint64_t row = p->src->shape.row; + uint64_t col = p->src->shape.col; + + for (uint64_t ri = 0; ri < row; ri++) { + for (uint64_t ci = 0; ci < col; ci++) { + uint64_t src_i = ri * col + ci; + uint64_t dst_i = ci * row + ri; + ref_data[dst_i] = src_data[src_i]; + } + } +} + +static void test_param_g2l(CVI_RT_HANDLE rt_handle, g2l_matrix_param_t *p) +{ + uint64_t size = ml_shape_size(&p->dst->shape, p->dst->fmt); + + s8_test_data.g2l_matrix_src = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!s8_test_data.g2l_matrix_src) + return; + + for (uint64_t i = 0; i < size; i++) + s8_test_data.g2l_matrix_src[i] = rand()%0x100; + + s8_test_data.g2l_matrix_output_ref = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!s8_test_data.g2l_matrix_output_ref) + return; + + g2l_matrix_copy_row_col_transposed_ref(p, s8_test_data.g2l_matrix_output_ref, s8_test_data.g2l_matrix_src); + + matrix_copy_s2d(rt_handle, p->src, s8_test_data.g2l_matrix_src); +} + +static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, g2l_matrix_param_t *p) +{ + free_matrix_dev_mem(rt_handle, p->src); + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->dst); +} + + +static void test_g2l_matrix_transpose(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, g2l_matrix_param_t *p) +{ + //g2l_matrix_param_t p; + /* + * Matrix transpose must be n/c stride alignment + * for TDMA limitation + */ + cvk_mg_shape_t src_shape={0x100, 0x20}; + cvk_ml_shape_t dst_shape={0x20, 0x10, 0x10, 0x100}; + + int dst_align = 1; + cvk_fmt_t fmt = CVK_FMT_I8; + p->src = alloc_matrix_dev_mem(rt_handle, src_shape, fmt); + p->dst = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, dst_shape, fmt, dst_align); + skip_matrix_lmem_size(cvk_ctx, p->dst); + test_param_g2l(rt_handle, p); +} + +static void l2l_tensor_copy_ref(l2l_tensor_copy_param_t *p, uint8_t ref_data[], uint8_t src_data[]) +{ + uint64_t size = tl_shape_size(&p->src->shape, p->src->fmt); + + for (uint64_t i = 0; i < size; i++) + ref_data[i] = src_data[i]; +} + +static void test_l2l_param(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, l2l_tensor_copy_param_t *p) +{ + uint64_t size = tl_shape_size(&p->src->shape, p->src->fmt); + + s8_test_data.l2l_tensor_src = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!s8_test_data.l2l_tensor_src) + return; + + for (uint64_t i = 0; i < size; i++) + s8_test_data.l2l_tensor_src[i] = rand()%0x100; + + s8_test_data.l2l_tensor_output_ref = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!s8_test_data.l2l_tensor_output_ref) + return; + + l2l_tensor_copy_ref(p, s8_test_data.l2l_tensor_output_ref, s8_test_data.l2l_tensor_src); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, p->src, s8_test_data.l2l_tensor_src); +} + +static void destroy_param_l2l(cvk_context_t *cvk_ctx, l2l_tensor_copy_param_t *p) +{ + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->dst); + free_skip_tensor_lmem(cvk_ctx); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->src); +} + +static void test_l2l_tensor_copy(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, l2l_tensor_copy_param_t *p) +{ + cvk_tl_shape_t src_shape = {1, 0x10, 0x1, 0x100}; + cvk_tl_shape_t dst_shape = 
{1, 0x10, 0x1, 0x100}; + + p->src = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, src_shape, CVK_FMT_I8, 1); + skip_tensor_lmem_size(cvk_ctx, p->src); + p->dst = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, dst_shape, CVK_FMT_I8, 1); + skip_tensor_lmem_size(cvk_ctx, p->dst); + test_l2l_param(rt_handle, cvk_ctx, p); +} + +void get_result(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + s8_test_data.depthwise_conv_output = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, depthwise_conv_param.ofmap); + s8_test_data.l2g_cw_output = tensor_copy_d2s(rt_handle, l2g_cw_param.dst); + s8_test_data.g2l_matrix_output = matrix_copy_l2g_d2s(rt_handle, cvk_ctx, g2l_matrix_param.dst); + s8_test_data.l2l_tensor_output = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, l2l_tensor_copy_param.dst); +} + +int check_result(void) +{ + + int cmp_res = array_cmp_int8( + "Comparing results ...\n", s8_test_data.depthwise_conv_output_ref, (int8_t *)s8_test_data.depthwise_conv_output, + tl_shape_size(&depthwise_conv_param.ofmap->shape, depthwise_conv_param.ofmap->fmt)); + if (cmp_res != 0) { + printf("Comparison FAILED!!!\n"); + return -1; + } + + for (uint64_t i = 0; i < tl_shape_size(&l2g_cw_param.src->shape, l2g_cw_param.src->fmt); i++) { + if (s8_test_data.l2g_cw_output[i] != s8_test_data.l2g_cw_output_ref[i]) { + fprintf(stderr, "l2g_cw comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, s8_test_data.l2g_cw_output[i], s8_test_data.l2g_cw_output_ref[i]); + return -1; + } + } + for (uint64_t i = 0; i < ml_shape_size(&g2l_matrix_param.dst->shape, g2l_matrix_param.dst->fmt); i++) { + if (s8_test_data.g2l_matrix_output[i] != s8_test_data.g2l_matrix_output_ref[i]) { + fprintf(stderr, "g2l_matrix comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, s8_test_data.g2l_matrix_output[i], s8_test_data.g2l_matrix_output_ref[i]); + return -1; + } + } + + for (uint64_t i = 0; i < tl_shape_size(&l2l_tensor_copy_param.src->shape, l2l_tensor_copy_param.src->fmt); i++) { + if (s8_test_data.l2l_tensor_output[i] != s8_test_data.l2l_tensor_output_ref[i]) { + fprintf(stderr, "l2l_tensor comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, s8_test_data.l2l_tensor_output[i], s8_test_data.l2l_tensor_output_ref[i]); + return -1; + } + } + + return 0; +} + +void trigger_max_power(cvk_context_t *cvk_ctx) +{ + cvk_ctx->ops->parallel_enable(cvk_ctx); + cvk_ctx->ops->tdma_l2g_tensor_copy_cw_transposed(cvk_ctx, &l2g_cw_param); + cvk_ctx->ops->tdma_g2l_matrix_copy_row_col_transposed(cvk_ctx, &g2l_matrix_param); + cvk_ctx->ops->tdma_l2l_tensor_copy(cvk_ctx, &l2l_tensor_copy_param); + cvk_ctx->ops->tiu_pt_depthwise_convolution(cvk_ctx, &depthwise_conv_param); + cvk_ctx->ops->parallel_disable(cvk_ctx); + CVI_RT_Submit(cvk_ctx); +} + +void free_s8_data() +{ + free(s8_test_data.depthwise_conv_input); + free(s8_test_data.depthwise_conv_weight); + free(s8_test_data.depthwise_conv_bias); + free(s8_test_data.depthwise_conv_output); + free(s8_test_data.depthwise_conv_output_ref); + free(s8_test_data.l2g_cw_src); + free(s8_test_data.l2g_cw_output); + free(s8_test_data.l2g_cw_output_ref); + free(s8_test_data.g2l_matrix_src); + free(s8_test_data.g2l_matrix_output); + free(s8_test_data.g2l_matrix_output_ref); + free(s8_test_data.l2l_tensor_src); + free(s8_test_data.l2l_tensor_output); + free(s8_test_data.l2l_tensor_output_ref); +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = 
CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  printf("depthwise max_power test\n");
+
+  ret |= test_pooling(rt_handle, cvk_ctx);
+  test_l2g_cw_transpose(rt_handle, cvk_ctx, &l2g_cw_param);
+  test_g2l_matrix_transpose(rt_handle, cvk_ctx, &g2l_matrix_param);
+  test_l2l_tensor_copy(rt_handle, cvk_ctx, &l2l_tensor_copy_param);
+
+  trigger_max_power(cvk_ctx);
+  get_result(rt_handle, cvk_ctx);
+  ret |= check_result();
+
+  destroy_param_l2l(cvk_ctx, &l2l_tensor_copy_param);
+  destroy_param_g2l(rt_handle, cvk_ctx, &g2l_matrix_param);
+  destroy_param_l2g(rt_handle, cvk_ctx, &l2g_cw_param);
+  free_depthwise_param(cvk_ctx, &depthwise_conv_param);
+  free_s8_data();
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/181x/test_181x_double_conv.c b/cviruntime/test/181x/test_181x_double_conv.c
new file mode 100644
index 000000000..de279cfa0
--- /dev/null
+++ b/cviruntime/test/181x/test_181x_double_conv.c
@@ -0,0 +1,807 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <stdint.h>
+#include <assert.h>
+
+#include "test_cvikernel_util.h"
+#include "test_native_ref.h"
+
+typedef struct {
+  int random_seed;
+  int input_n;
+  int input_c;
+  int input_h;
+  int input_w;
+  int kw;
+  int kh;
+  int dh;
+  int dw;
+  int pad_top;
+  int pad_bot;
+  int pad_left;
+  int pad_right;
+  int ins_h;
+  int ins_h_last;
+  int ins_w;
+  int ins_w_last;
+  int stride_h;
+  int stride_w;
+  int output_c;
+  int using_bias;
+  int bReLU_EN;
+  int r_shift_m;
+  int opd0_sign;
+  int opd1_sign;
+  int opd2_sign;
+} conv_param_t;
+
+static int index_get(int h, int w1, int w2)
+{
+  return h * w1 + w2;
+}
+
+static int matrix_dot_mult(
+    int8_t *A, int8_t *B, int dim_n, int dim_m,
+    int opd0_sign)
+{
+  int sum = 0;
+  for (int i = 0; i < dim_n; i++) {
+    for (int j = 0; j < dim_m; j++) {
+      int idx = index_get(i, dim_m, j);
+      int a = opd0_sign ? A[idx] : (uint8_t)A[idx];
+      int b = B[idx];
+      sum += a * b;
+    }
+  }
+  return sum;
+}
+
+static int conv_ref(
+    const conv_param_t *p_param,
+    const int8_t *ifmap,
+    const int8_t *weight,
+    const int16_t *bias,
+    int8_t *ofmap)
+{
+  int in = p_param->input_n;
+  int ic = p_param->input_c;
+  int ih = p_param->input_h;
+  int iw = p_param->input_w;
+  int oc = p_param->output_c;
+  int kh = p_param->kh;
+  int kw = p_param->kw;
+  int dh = p_param->dh;
+  int dw = p_param->dw;
+  int stride_h = p_param->stride_h;
+  int stride_w = p_param->stride_w;
+  int pad_top = p_param->pad_top;
+  int pad_bot = p_param->pad_bot;
+  int pad_left = p_param->pad_left;
+  int pad_right = p_param->pad_right;
+  int ins_h = p_param->ins_h;
+  int ins_h_last = p_param->ins_h_last;
+  int ins_w = p_param->ins_w;
+  int ins_w_last = p_param->ins_w_last;
+  int input_sign = p_param->opd0_sign;
+  int r_shift_bits = p_param->r_shift_m;
+  int do_relu = p_param->bReLU_EN;
+
+  int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot);
+  int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right);
+  int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0);
+  int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0);
+
+  int oh = calc_output_hw(ih_ext, kh_ext, stride_h);
+  int ow = calc_output_hw(iw_ext, kw_ext, stride_w);
+
+  int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow);
+  int8_t *i_fmap_pad_ker = (int8_t *)malloc(kh_ext * kw_ext);
+  if (!result || !i_fmap_pad_ker) {
+    free(result);
+    free(i_fmap_pad_ker);
+    return -1;
+  }
+
+  memset(result, 0, sizeof(int) * in * oc * oh * ow);
+
+  int ret = 0;
+
+  int8_t *i_fmap_pad = NULL;
+  int8_t *kernel_after = NULL;
+  for (int n = 0; n < in; ++n) {
+    for (int c = 0; c < oc; ++c) {
+      for (int cc = 0; cc < ic; ++cc) {
+        fill_pad_fmap_int8(
+            (int8_t *)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0,
+            pad_left, pad_right, pad_top, pad_bot,
+            ins_h, ins_w, ins_h_last, ins_w_last, ih, iw);
+
+        //kernel_dilation(
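+        // Note: fill_pad_fmap_int8() is reused below as the kernel dilation
+        // step: with zero padding and ins = (dh - 1, dw - 1) it inserts zeros
+        // between kernel taps, expanding the kh x kw kernel to kh_ext x kw_ext
+        // so the inner loop can treat dilated convolution as a dense dot
+        // product.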
fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c]; //bias+c ; + } + } + } + + if (do_relu) + relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + + // ofmap is int8_t, signed + ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow, + &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1, + /*sign_unsign=*/1); + + if (ret) + goto error_release; + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + +error_release: + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static uint8_t * transform_weight(const cvk_tl_shape_t *s, uint8_t before[]) +{ + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint32_t size = ic * oc * kh * kw; + uint8_t *after = (uint8_t *)malloc(size); + if (!after) + return NULL; + + /* + * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) + */ + for (uint32_t oci = 0; oci < oc; oci++) { + for (uint32_t ici = 0; ici < ic; ici++) { + for (uint32_t khi = 0; khi < kh; khi++) { + for (uint32_t kwi = 0; kwi < kw; kwi++) { + uint32_t src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + uint32_t dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + uint8_t *data) +{ +#if 0 + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + uint8_t *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. bm_memcpy_s2d regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. 
+ */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //bmmem_device_t ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = bm_memcpy_s2d(*ctx, ab_dev_mem, transformed_data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, transformed_data); + assert(ret == BM_SUCCESS); + + cvk_tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_I8; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1822_tensor_tgmem_default_stride(tdma_tg.shape, tdma_tg.fmt); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1822_tensor_lmem_default_stride(cvk_ctx, tdma_shape, FMT_I8, 0); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1822_tdma_g2l_tensor_copy(cvk_ctx, &p); + test_submit(ctx); + bmmem_device_free(*ctx, dev_mem); + delete[] transformed_data; +#endif + + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint8_t *transformed_data = transform_weight(s, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(1, oc, kh * kw, ic); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, transformed_data); + + free(transformed_data); +} + +static int8_t * transform_bias(int oc, int16_t before[]) +{ + int8_t *after = (int8_t *)malloc(2 * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = before[i] & 0xff; + after[i + oc] = (before[i] >> 8) & 0xff; + } + return after; +} + +static void put_conv_bias( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + int16_t *data) +{ +#if 0 + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2, oc, 1, 1); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + int8_t *transformed_data = transform_bias(oc, data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (uint8_t *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1822_tensor_tgmem_default_stride(tg.shape, tg.fmt); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_tensor_copy(cvk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, dev_mem); + delete[] transformed_data; +#endif + + int oc = tl->shape.c; + int8_t *transformed_data = transform_bias(oc, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(2, oc, 1, 1); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, (uint8_t *)transformed_data); + + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + 
p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static int8_t * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + + int8_t *buf = (int8_t *)malloc(sizeof(int8_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static int8_t * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + int8_t *buf = (int8_t *)malloc(sizeof(int8_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static int16_t * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + int16_t *bias = (int16_t *)malloc(sizeof(int16_t) * oc); + if (!bias) + return NULL; + + for (int i = 0; i < oc; i++) + bias[i] = rand() % 65536 - 32768; + + return bias; +} + +static cvk_tl_t * conv_ifmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd0_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 1); +} + +static cvk_tl_t * conv_weight_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd1_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static cvk_tl_t * conv_ofmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, CVK_FMT_I8, 1); +} + +static cvk_tl_t * conv_bias_tensor( + cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd2_sign? 
CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const cvk_tiu_pt_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + + dst->ifmap = conv_ifmap_tensor(cvk_ctx, p); + dst->weight = conv_weight_tensor(cvk_ctx, p); + dst->ofmap = conv_ofmap_tensor(cvk_ctx, p); + dst->bias = NULL; + dst->ps32_mode = 0; + if (p->using_bias) + dst->bias = conv_bias_tensor(cvk_ctx, p); + + dst-> w_is_const = 0; +} + +static void free_bmk_conv_param( + cvk_context_t *cvk_ctx, + cvk_tiu_pt_convolution_param_t *r, + const conv_param_t *p) +{ + if (p->using_bias && r->bias) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->bias); + if (r->ofmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ofmap); + if (r->weight) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->weight); + if (r->ifmap) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ifmap); +} + +static void init_conv_param(conv_param_t *p) +{ + printf("init_conv_param\n"); + p->random_seed = clock(); + srand(p->random_seed); + +retry: + p->input_n = rand() % 5 + 1; + p->input_c = (rand() % (5 * 32)/2)*2 + 8; + p->kh = rand() % 7 + 1; + p->kw = rand() % 7 + 1; + p->input_h = rand() % 40 + p->kh; + p->input_w = rand() % 40 + p->kw; + p->output_c = rand() % 10 + 3; + p->stride_h = rand() % (p->kh) + 1; + p->stride_w = rand() % (p->kw) + 1; + p->ins_h = rand() % p->kh; + p->ins_w = rand() % p->kw; + p->ins_h_last = rand() % p->kh;; + p->ins_w_last = rand() % p->kw;; + p->dh = rand() % 3 + 1; + p->dw = rand() % 3 + 1; + + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + p->pad_top = rand() % kh_ext; + p->pad_bot = rand() % kh_ext; + p->pad_left = rand() % kw_ext; + p->pad_right = rand() % kw_ext; + + if (!conv_param_is_ok(p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p->using_bias = rand() % 2; + p->r_shift_m = rand() % 8; + p->bReLU_EN = rand() % 2; + + p->opd0_sign = rand() % 2; + p->opd1_sign = 1; + p->opd2_sign = 1; + + assert(p->opd1_sign == 1 && p->opd2_sign == 1); + + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); +} + +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + 
printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", "p->input_w = ", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} + +// Calculate the right shift value, m +// Steps: +// 1. Get the abs() of each weight; +// 2. Summary all the abs() in one kernel; +// 3. Get Log2 of each sum; +// 4. Downward rounding; +// After every r_shift value got, sort and find the middle one. +static int calc_rshift_m(const conv_param_t *p, int8_t* weight) +{ + int kernel_cnt = p->output_c * p->input_c; + int kernel_size = p->kh * p->kw; + int *kernel_shifts = (int *)malloc(sizeof(int) * kernel_cnt); + if (!kernel_shifts) + return 1; + + memset(kernel_shifts, 0, sizeof(int) * kernel_cnt); + + // Part 1: + // Get right shift value for each kernel + int sum = 0; + for (int i = 0; i < kernel_cnt; i++) { + // Step 1 & 2: Get the sum of abs() + for (int j = 0; j < kernel_size; j++) { + sum += (int)(*weight < 0 ? 
-(*weight) : (*weight));
+      weight++;
+    }
+    // Step 3 & 4: log2 and downward rounding
+    sum >>= 1;
+    while (sum) {
+      sum >>= 1;
+      kernel_shifts[i]++;
+    }
+  }
+
+  // Part 2:
+  // Find the middle of all the values
+  int tag[32] = {0};
+  for (int cnt = 0; cnt < kernel_cnt; cnt++) {
+    tag[kernel_shifts[cnt]]++;
+  }
+
+  int rshift_m = 0;
+  int mid = 0;
+  do {
+    mid += tag[rshift_m++];
+  } while (mid < ((kernel_cnt - 1) >> 1));
+
+  free(kernel_shifts);
+
+  return rshift_m - 1;
+}
+
+static int test_conv(
+    conv_param_t *p_param, CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx)
+{
+  int ret = 0;
+  int8_t *input = alloc_input(p_param);
+  int8_t *weight = alloc_weight(p_param);
+  int16_t *bias = alloc_bias(p_param);
+  int8_t *output_ref = (int8_t *)malloc(sizeof(int8_t) * conv_output_size(p_param));
+  if (!input || !weight || !bias || !output_ref) {
+    ret = -1;
+    goto fail_exit;
+  }
+
+  p_param->r_shift_m = calc_rshift_m(p_param, weight);
+
+  ret = conv_ref(p_param, input, weight, bias, output_ref);
+  if (ret)
+    goto fail_exit;
+
+  cvk_tiu_pt_convolution_param_t conv_param;
+  make_bmk_conv_param(cvk_ctx, &conv_param, p_param);
+
+  int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param);
+  if (tl_alloc_success) {
+    tensor_copy_s2d_g2l(rt_handle, cvk_ctx, conv_param.ifmap, (uint8_t *)input);
+    put_conv_weight(rt_handle, cvk_ctx, conv_param.weight, (uint8_t *)weight);
+    if (p_param->using_bias)
+      put_conv_bias(rt_handle, cvk_ctx, conv_param.bias, bias);
+    cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &conv_param);
+    uint8_t *output = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, conv_param.ofmap);
+
+    ret = array_cmp_int8(
+        "Comparing results ...\n",
+        output_ref, (int8_t *)output, conv_output_size(p_param));
+
+    if (ret) {
+      print_conv_param(p_param);
+      printf("Comparison FAILED\n");
+    }
+    free(output);
+  }
+
+  free_bmk_conv_param(cvk_ctx, &conv_param, p_param);
+
+fail_exit:
+  free(input);
+  free(weight);
+  free(bias);
+  free(output_ref);
+
+  return ret;
+}
+
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  for (int i = 0; i < 5; i++) {
+    printf("random_test_conv iteration: %d\n", i);
+    conv_param_t test_conv_param;
+    init_conv_param(&test_conv_param);
+    ret |= test_conv(&test_conv_param, rt_handle, cvk_ctx);
+    if (!test_conv_param.using_bias)
+      test_conv_param.using_bias = 1;
+    if (test_conv_param.output_c <= 32)
+      test_conv_param.output_c += 32;
+    ret |= test_conv(&test_conv_param, rt_handle, cvk_ctx);
+  }
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
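/*
 * Editorial note: the "sum >>= 1; while (sum) { sum >>= 1; shifts++; }" idiom
 * in calc_rshift_m() above computes floor(log2(sum)) and drains sum to zero,
 * which is why no explicit reset is needed between kernels. For example,
 * sum = 100:
 *
 *   100 >> 1 = 50, then 50 -> 25 -> 12 -> 6 -> 3 -> 1 -> 0  (6 iterations)
 *   => shift = 6 = floor(log2(100))
 */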
diff --git a/cviruntime/test/181x/test_181x_double_conv_ps32.c b/cviruntime/test/181x/test_181x_double_conv_ps32.c
new file mode 100644
index 000000000..735bdcf50
--- /dev/null
+++ b/cviruntime/test/181x/test_181x_double_conv_ps32.c
@@ -0,0 +1,1506 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include "test_cvikernel_util.h"
+#include "test_native_ref.h"
+
+typedef struct {
+  int random_seed;
+  int input_n;
+  int input_c;
+  int input_h;
+  int input_w;
+  int kw;
+  int kh;
+  int dh;
+  int dw;
+  int pad_top;
+  int pad_bot;
+  int pad_left;
+  int pad_right;
+  int ins_h;
+  int ins_h_last;
+  int ins_w;
+  int ins_w_last;
+  int stride_h;
+  int stride_w;
+  int output_c;
+  int using_bias;
+  int bReLU_EN;
+  int r_shift_m;
+  int opd0_sign;
+  int opd1_sign;
+  int opd2_sign;
+} conv_param_t;
+
+static int index_get(int h, int w1, int w2)
+{
+  return h * w1 + w2;
+}
+
+static int matrix_dot_mult(
+    int8_t *A, int8_t *B, int dim_n, int dim_m,
+    int opd0_sign)
+{
+  int sum = 0;
+  for (int i = 0; i < dim_n; i++) {
+    for (int j = 0; j < dim_m; j++) {
+      int idx = index_get(i, dim_m, j);
+      int a = opd0_sign ? A[idx] : (uint8_t)A[idx];
+      int b = B[idx];
+      sum += a * b;
+    }
+  }
+  return sum;
+}
+
+static int ps32_m2_conv_ref(
+    const conv_param_t *p_param,
+    const int8_t *ifmap,
+    const int8_t *weight,
+    int8_t *ofmap)
+{
+  int in = p_param->input_n;
+  int ic = p_param->input_c;
+  int ih = p_param->input_h;
+  int iw = p_param->input_w;
+  int oc = p_param->output_c;
+  int kh = p_param->kh;
+  int kw = p_param->kw;
+  int dh = p_param->dh;
+  int dw = p_param->dw;
+  int stride_h = p_param->stride_h;
+  int stride_w = p_param->stride_w;
+  int pad_top = p_param->pad_top;
+  int pad_bot = p_param->pad_bot;
+  int pad_left = p_param->pad_left;
+  int pad_right = p_param->pad_right;
+  int ins_h = p_param->ins_h;
+  int ins_h_last = p_param->ins_h_last;
+  int ins_w = p_param->ins_w;
+  int ins_w_last = p_param->ins_w_last;
+  int input_sign = p_param->opd0_sign;
+
+  int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot);
+  int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right);
+  int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0);
+  int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0);
+
+  int oh = calc_output_hw(ih_ext, kh_ext, stride_h);
+  int ow = calc_output_hw(iw_ext, kw_ext, stride_w);
+
+  int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow);
+  int8_t *i_fmap_pad_ker = (int8_t *)malloc(kh_ext * kw_ext);
+  if (!result || !i_fmap_pad_ker) {
+    free(result);
+    free(i_fmap_pad_ker);
+    return -1;
+  }
+
+  memset(result, 0, sizeof(int) * in * oc * oh * ow);
+
+  int ret = 0;
+
+  int8_t *i_fmap_pad = NULL;
+  int8_t *kernel_after = NULL;
+  uint32_t bstride = in * oc * oh * ow;
+
+  for (int n = 0; n < in; ++n) {
+    for (int c = 0; c < oc; ++c) {
+      for (int cc = 0; cc < ic; ++cc) {
+        fill_pad_fmap_int8(
+            (int8_t *)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0,
+            pad_left, pad_right, pad_top, pad_bot,
+            ins_h, ins_w, ins_h_last, ins_w_last, ih, iw);
+
+        //kernel_dilation(
+        fill_pad_fmap_int8(
+            (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0,
+            0, 0, 0, 0,  // no padding
+            dh - 1, dw - 1, 0, 0,
+            kh, kw);
+
+        for (int ph = 0; ph < oh; ++ph) {
+          for (int pw = 0; pw < ow; ++pw) {
+            for (int idxh = 0; idxh < kh_ext; ++idxh)
+              for (int idxw = 0; idxw < kw_ext; ++idxw) {
+                i_fmap_pad_ker[idxh * kw_ext + idxw] =
+                    i_fmap_pad[(idxh + ph*stride_h) * iw_ext +
+                               idxw + pw*stride_w];
+              }
+            result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] +=
+                matrix_dot_mult(i_fmap_pad_ker, kernel_after,
+                                kh_ext, kw_ext, input_sign);
+          }
+        }
+      }
+    }  //end for (int c = 0; c < oc; ++c)
+  }  //end for (int n = 0; n < in; n++)
+
+  // ps32: spill the 32-bit partial sums as four 8-bit planes, bstride apart
+  for (int i = 0; i < in * oc * oh * ow; i++)
+    ofmap[i] = result[i];
+
+  for (int i = 0; i < in * oc * oh * ow; i++)
+    ofmap[bstride + i] = result[i] >> 8;
+
+  for (int i = 0; i < in * oc * oh * ow; i++)
+    ofmap[2 * bstride + i] = result[i] >> 16;
+
+  for (int i = 0; i < in * oc * oh * ow; i++)
+    ofmap[3 * bstride + i] = result[i] >> 24;
+
+  free(i_fmap_pad);
+  free(kernel_after);
+  free(i_fmap_pad_ker);
+  free(result);
+
+  return ret;
+}
+
+static int ps32_m1_conv_ref(
+    const conv_param_t *p_param,
+    const int8_t *ifmap,
+    const int8_t *weight,
+    const int16_t *bias,
+    int8_t *ofmap)
+{
+  int in = p_param->input_n;
+  int ic = p_param->input_c;
+  int ih = p_param->input_h;
+  int iw = p_param->input_w;
+  int oc = p_param->output_c;
+  int kh = p_param->kh;
+  int kw = p_param->kw;
+  int dh = p_param->dh;
+  int dw = p_param->dw;
+  int stride_h = p_param->stride_h;
+  int stride_w = p_param->stride_w;
+  int pad_top = p_param->pad_top;
+  int pad_bot = p_param->pad_bot;
+  int 
pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + int r_shift_bits = p_param->r_shift_m; + int do_relu = p_param->bReLU_EN; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + int8_t *i_fmap_pad_ker = (int8_t *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return -1; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = 0; + + int8_t *i_fmap_pad = NULL; + int8_t *kernel_after = NULL; + + uint32_t bstride = in * oc * oh * ow; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] = (uint8_t)ofmap[i]; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] |= (uint8_t)ofmap[bstride + i] << 8; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] |= (uint8_t)ofmap[2 * bstride + i] << 16; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] |= (uint8_t)ofmap[3 * bstride + i] << 24; + + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c]; //bias+c ; + } + } + } + + if (do_relu) + relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + + // ofmap is int8_t, signed + ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow, + &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1, + /*sign_unsign=*/1); + + if (ret) + goto error_release; + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + +error_release: + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + + +static int ps32_m3_conv_ref( + const conv_param_t *p_param, + const int8_t *ifmap, + const int8_t *weight, + int8_t *ofmap) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = 
p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + int8_t *i_fmap_pad_ker = (int8_t *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return -1; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = 0; + + int8_t *i_fmap_pad = NULL; + int8_t *kernel_after = NULL; + + if (!result || !i_fmap_pad_ker) { + ret = -1; + goto fail_exit; + } + + uint32_t bstride = in * oc * oh * ow; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] = (uint8_t)ofmap[i]; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] |= (uint8_t)ofmap[bstride + i] << 8; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] |= (uint8_t)ofmap[2 * bstride + i] << 16; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] |= (uint8_t)ofmap[3 * bstride + i] << 24; + + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[i] = result[i]; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[bstride + i] = result[i] >> 8; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[2 * bstride + i] = result[i] >> 16; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[3 * bstride + i] = result[i] >> 24; + +fail_exit: + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static int conv_ref( + const conv_param_t *p_param, + const int8_t *ifmap, + const int8_t *weight, + const int16_t *bias, + int8_t *ofmap) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int 
input_sign = p_param->opd0_sign; + int r_shift_bits = p_param->r_shift_m; + int do_relu = p_param->bReLU_EN; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + int8_t *i_fmap_pad_ker = (int8_t *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return -1; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = 0; + + int8_t *i_fmap_pad = NULL; + int8_t *kernel_after = NULL; + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c]; //bias+c ; + } + } + } + + if (do_relu) + relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + + // ofmap is int8_t, signed + ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow, + &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1, + /*sign_unsign=*/1); + + if (ret) + goto error_release; + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + +error_release: + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static uint8_t * transform_weight(const cvk_tl_shape_t *s, uint8_t before[]) +{ + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint32_t size = ic * oc * kh * kw; + uint8_t *after = (uint8_t *)malloc(size); + + /* + * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) + */ + for (uint32_t oci = 0; oci < oc; oci++) { + for (uint32_t ici = 0; ici < ic; ici++) { + for (uint32_t khi = 0; khi < kh; khi++) { + for (uint32_t kwi = 0; kwi < kw; kwi++) { + uint32_t src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + uint32_t dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + uint8_t *data) +{ +#if 0 + const tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + uint8_t *transformed_data = transform_weight(s, data); + + /* we 
put weight to region 1. bm_memcpy_s2d regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. + */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //bmmem_device_t ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = bm_memcpy_s2d(*ctx, ab_dev_mem, transformed_data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, transformed_data); + assert(ret == BM_SUCCESS); + + tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_I8; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1822_tensor_tgmem_default_stride(tdma_tg.shape, tdma_tg.fmt); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1822_tensor_lmem_default_stride(bk_ctx, tdma_shape, FMT_I8, 0); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1822_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + bmmem_device_free(*ctx, dev_mem); + delete[] transformed_data; +#endif + + const cvk_tl_shape_t *s = &tl->shape; + uint32_t ic = s->n; + uint32_t oc = s->c; + uint32_t kh = s->h; + uint32_t kw = s->w; + + uint8_t *transformed_data = transform_weight(s, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(1, oc, kh * kw, ic); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, transformed_data); + + free(transformed_data); +} + +static int8_t * transform_bias(int oc, int16_t before[]) +{ + int8_t *after = (int8_t *)malloc(2 * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = before[i] & 0xff; + after[i + oc] = (before[i] >> 8) & 0xff; + } + return after; +} + +static void put_conv_bias( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + int16_t *data) +{ +#if 0 + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2, oc, 1, 1); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + int8_t *transformed_data = transform_bias(oc, data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (uint8_t *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1822_tensor_tgmem_default_stride(tg.shape, tg.fmt); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, dev_mem); + delete[] transformed_data; +#endif + + int oc = tl->shape.c; + int8_t *transformed_data = transform_bias(oc, data); + + cvk_tl_shape_t tdma_shape = tl_shape_t4(2, oc, 1, 1); + cvk_tl_t tdma_tl; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tdma_tl, tdma_shape, tl->fmt, tl->eu_align); + tdma_tl.start_address = tl->start_address; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tdma_tl, (uint8_t *)transformed_data); + + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) 
* p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static int8_t * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + + int8_t *buf = (int8_t *)malloc(sizeof(int8_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static int8_t * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + int8_t *buf = (int8_t *)malloc(sizeof(int8_t) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static int16_t * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + int16_t *bias = (int16_t *)malloc(sizeof(int16_t) * oc); + if (!bias) + return NULL; + + for (int i = 0; i < oc; i++) + bias[i] = rand() % 65536 - 32768; + + return bias; +} + +static cvk_tl_t * conv_ifmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd0_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 1); +} + +static cvk_tl_t * conv_weight_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd1_sign? CVK_FMT_I8: CVK_FMT_U8; + cvk_tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0); +} + +static cvk_tl_t * conv_ofmap_tensor(cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_tl_shape_t s; + s.n = p->input_n * 4; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + cvk_tl_t *tl = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, CVK_FMT_I8, 1); + if (tl) + tl->shape.n = p->input_n; + return tl; +} + +static cvk_tl_t * conv_bias_tensor( + cvk_context_t *cvk_ctx, const conv_param_t *p) +{ + cvk_fmt_t fmt = p->opd2_sign? 
CVK_FMT_I8: CVK_FMT_U8;
+  cvk_tl_shape_t s;
+  s.n = 2;
+  s.c = p->output_c;
+  s.h = 1;
+  s.w = 1;
+  return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, s, fmt, 0);
+}
+
+static int conv_param_is_ok(const conv_param_t *p)
+{
+  int kh_ext = conv_kh_ext(p);
+  int kw_ext = conv_kw_ext(p);
+  int ih_ext = conv_ih_ext(p);
+  int iw_ext = conv_iw_ext(p);
+
+  if ((kh_ext > ih_ext)
+      || (kw_ext > iw_ext)
+      || (kh_ext <= p->pad_top)
+      || (kh_ext <= p->pad_bot)
+      || (kw_ext <= p->pad_left)
+      || (kw_ext <= p->pad_right)
+      || (p->pad_top >= (1 << 4))
+      || (p->pad_bot >= (1 << 4))
+      || (p->pad_left >= (1 << 4))
+      || (p->pad_right >= (1 << 4))) {
+    return 0;
+  }
+
+  return 1;
+}
+
+static int bmk_conv_param_alloc_ok(
+    const cvk_tiu_pt_convolution_param_t *p,
+    const conv_param_t *param)
+{
+  if (!p->ifmap || !p->ofmap || !p->weight)
+    return 0;
+  // end mode (ps32_mode == 1) is the only mode that consumes bias
+  if (p->ps32_mode == 1 && param->using_bias && !p->bias)
+    return 0;
+
+  return 1;
+}
+
+static void make_bmk_conv_param_ps32(
+    cvk_context_t *cvk_ctx,
+    cvk_tiu_pt_convolution_param_t *dst,
+    const conv_param_t *p, uint32_t ps32_mode)
+{
+  memset(dst, 0, sizeof(*dst));
+
+  dst->ins_h = p->ins_h;
+  dst->ins_last_h = p->ins_h_last;
+  dst->ins_w = p->ins_w;
+  dst->ins_last_w = p->ins_w_last;
+  dst->pad_top = p->pad_top;
+  dst->pad_bottom = p->pad_bot;
+  dst->pad_left = p->pad_left;
+  dst->pad_right = p->pad_right;
+  dst->stride_h = p->stride_h;
+  dst->stride_w = p->stride_w;
+  dst->dilation_h = p->dh;
+  dst->dilation_w = p->dw;
+  // Allocate ifmap/weight/ofmap for every mode, not just begin mode:
+  // bmk_conv_param_alloc_ok() requires them in all three modes, and since
+  // local-memory allocation is stack-like, re-allocating the same shapes
+  // after free_bmk_conv_param() lands on the same addresses, so the 32-bit
+  // partial sums written by the previous mode are still in place.
+  dst->ifmap = conv_ifmap_tensor(cvk_ctx, p);
+  dst->weight = conv_weight_tensor(cvk_ctx, p);
+  dst->ofmap = conv_ofmap_tensor(cvk_ctx, p);
+
+  dst->ps32_mode = ps32_mode;
+
+  dst->bias = NULL;
+  dst->relu_enable = 0;
+  dst->rshift_bits = 0;
+  if (ps32_mode == 1)
+  {
+    dst->relu_enable = p->bReLU_EN;
+    dst->rshift_bits = p->r_shift_m;
+    // only mode=1 can use bias
+    if (p->using_bias)
+      dst->bias = conv_bias_tensor(cvk_ctx, p);
+  }
+
+  dst->w_is_const = 0;
+}
+
+static void make_bmk_conv_param(
+    cvk_context_t *cvk_ctx,
+    cvk_tiu_pt_convolution_param_t *dst,
+    const conv_param_t *p)
+{
+  memset(dst, 0, sizeof(cvk_tiu_pt_convolution_param_t));
+
+  dst->ins_h = p->ins_h;
+  dst->ins_last_h = p->ins_h_last;
+  dst->ins_w = p->ins_w;
+  dst->ins_last_w = p->ins_w_last;
+  dst->pad_top = p->pad_top;
+  dst->pad_bottom = p->pad_bot;
+  dst->pad_left = p->pad_left;
+  dst->pad_right = p->pad_right;
+  dst->stride_h = p->stride_h;
+  dst->stride_w = p->stride_w;
+  dst->dilation_h = p->dh;
+  dst->dilation_w = p->dw;
+  dst->relu_enable = p->bReLU_EN;
+  dst->rshift_bits = p->r_shift_m;
+  dst->ifmap = conv_ifmap_tensor(cvk_ctx, p);
+  dst->weight = conv_weight_tensor(cvk_ctx, p);
+  dst->ofmap = conv_ofmap_tensor(cvk_ctx, p);
+  dst->bias = NULL;
+  dst->ps32_mode = 0;
+  if (p->using_bias)
+    dst->bias = conv_bias_tensor(cvk_ctx, p);
+
+  dst->w_is_const = 0;
+}
+
+static void free_bmk_conv_param(
+    cvk_context_t *cvk_ctx,
+    cvk_tiu_pt_convolution_param_t *r,
+    const conv_param_t *p)
+{
+  if (p->using_bias && r->bias)
+    cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->bias);
+
+  if (r->ofmap)
+    cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ofmap);
+
+  if (r->weight)
+    cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->weight);
+
+  if (r->ifmap)
+    cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ifmap);
+}
+
+static void init_conv_param(conv_param_t *p)
+{
+  printf("init_conv_param\n");
+  p->random_seed = clock();
+  srand(p->random_seed);
+
+retry:
+  p->input_n = 1;
+  p->input_c = rand() % 10 + 2;
+  p->kh = rand() % 7 + 1;
+  p->kw = rand() % 7 + 1;
+  p->input_h = rand() % 10 + p->kh;
+  p->input_w = rand() % 10 + p->kw;
+  p->output_c = rand() % 10 + 3;
+  p->stride_h = rand() % p->kh + 1;
+  p->stride_w = rand() % p->kw + 1;
+  p->ins_h = rand() % p->kh;
+  p->ins_w = rand() % p->kw;
+  p->ins_h_last = rand() % p->kh;
+  p->ins_w_last = rand() % p->kw;
+  p->dh = rand() % 3 + 1;
+  p->dw = rand() % 3 + 1;
+
+  int kh_ext = conv_kh_ext(p);
+  int kw_ext = conv_kw_ext(p);
+  p->pad_top = rand() % kh_ext;
+  p->pad_bot = rand() % kh_ext;
+  p->pad_left = rand() % kw_ext;
+  p->pad_right = rand() % kw_ext;
+
+  if (!conv_param_is_ok(p)) {
+    printf("retry init_conv_param\n");
+    goto retry;
+  }
+
+  p->using_bias = rand() % 2;
+  p->r_shift_m = rand() % 8;
+  p->bReLU_EN = rand() % 2;
+
+  p->opd0_sign = rand() % 2;
+  p->opd1_sign = 1;
+  p->opd2_sign = 1;
+
+  assert(p->opd1_sign == 1 && p->opd2_sign == 1);
+
+  int ih_ext = conv_ih_ext(p);
+  int iw_ext = conv_iw_ext(p);
+  assert(ih_ext >= kh_ext);
+  assert(iw_ext >= kw_ext);
+}
+
+static void print_conv_param(const conv_param_t *p)
+{
+  printf("%s\n", "Conv parameters:");
+  printf("  %s%d;\n", "p->random_seed = ", p->random_seed);
+
+  printf("  %s%d;\n", "p->input_n = ", p->input_n);
+  printf("  %s%d;\n", "p->input_c = ", p->input_c);
+  printf("  %s%d;\n", "p->input_h = ", p->input_h);
+  printf("  %s%d;\n", "p->input_w = ", p->input_w);
+  printf("  %s%d;\n", "p->output_c = ", p->output_c);
+
+  printf("  %s%d;\n", "p->kh = ", p->kh);
+  printf("  %s%d;\n", "p->kw = ", p->kw);
+  printf("  %s%d;\n", "p->dh = ", p->dh);
+  printf("  %s%d;\n", "p->dw = ", p->dw);
+  printf("  %s%d;\n", "p->pad_top = ", p->pad_top);
+  printf("  %s%d;\n", "p->pad_bot = ", p->pad_bot);
+  printf("  %s%d;\n", "p->pad_left = ", p->pad_left);
+  printf("  %s%d;\n", "p->pad_right = ", p->pad_right);
+  printf("  %s%d;\n", "p->stride_h = ", p->stride_h);
+  printf("  %s%d;\n", "p->stride_w = ", p->stride_w);
+  printf("  %s%d;\n", "p->ins_w = ", p->ins_w);
+  printf("  %s%d;\n", "p->ins_h = ", p->ins_h);
+  printf("  %s%d;\n", "p->ins_w_last = ", p->ins_w_last);
+  printf("  %s%d;\n", "p->ins_h_last = ", p->ins_h_last);
+
+  printf("  %s%d;\n", "p->r_shift_m = ", p->r_shift_m);
+  printf("  %s%d;\n", "p->opd0_sign = ", p->opd0_sign);
+  printf("  %s%d;\n", "p->opd1_sign = ", p->opd1_sign);
+  printf("  %s%d;\n", "p->opd2_sign = ", p->opd2_sign);
+  printf("  %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN);
+  printf("  %s%d;\n", "p->using_bias = ", p->using_bias);
+
+  printf("  %s%d\n", "kh_ext = ", conv_kh_ext(p));
+  printf("  %s%d\n", "kw_ext = ", conv_kw_ext(p));
+  printf("  %s%d\n", "ih_ext = ", conv_ih_ext(p));
+  printf("  %s%d\n", "iw_ext = ", conv_iw_ext(p));
+  printf("  %s%d\n", "output_h = ", conv_oh(p));
+  printf("  %s%d\n", "output_w = ", conv_ow(p));
+}
+
+/* Calculate the right shift value, m.
+ * Steps:
+ *   1. Take the abs() of each weight;
+ *   2. Sum the abs() values within one kernel;
+ *   3. Take log2 of each sum;
+ *   4. Round downward.
+ * Once every r_shift value is known, sort them and pick the median.
+ */
+
+static int calc_rshift_m(const conv_param_t *p, int8_t* weight)
+{
+  int kernel_cnt = p->output_c * p->input_c;
+  int kernel_size = p->kh * p->kw;
+  int *kernel_shifts = (int *)malloc(sizeof(int) * kernel_cnt);
+  if (!kernel_shifts)
+    return 1;
+
+  memset(kernel_shifts, 0, sizeof(int) * kernel_cnt);
+
+  // Part 1:
+  // Get the right shift value for each kernel.
+  // sum is drained back to zero by the shift loop below before the next
+  // kernel is processed, so declaring it outside the loop is safe.
+  int sum = 0;
+  for (int i = 0; i < kernel_cnt; i++) {
+    // Step 1 & 2: Get the sum of abs()
+    for (int j = 0; j < kernel_size; j++) {
+      sum += (int)(*weight < 0 ?
-(*weight) : (*weight)); + weight++; + } + // Step 3 & 4: log2 and downward rounding + sum >>= 1; + while (sum) { + sum >>= 1; + kernel_shifts[i]++; + } + } + + // Part 2: + // Find the middle of all the values + int tag[32] = {0}; + for (int cnt = 0; cnt < kernel_cnt; cnt++) { + tag[kernel_shifts[cnt]]++; + } + + int rshift_m = 0; + int mid = 0; + do { + mid += tag[rshift_m++]; + } while(mid < (kernel_cnt - 1) >> 1); + + free(kernel_shifts); + + return rshift_m - 1; +} + +static int test_ps32_ut( + conv_param_t *p_param, CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + printf("test_ps32_ut\n"); + int ret = 0; + int8_t *input = alloc_input(p_param); + int8_t *weight = alloc_weight(p_param); + int16_t *bias = alloc_bias(p_param); + int8_t *output_ref = (int8_t *)malloc(sizeof(int8_t) * conv_output_size(p_param) * sizeof(int)); + if (!input || !weight || !bias || !output_ref) { + ret = -1; + goto fail_exit; + } + + p_param->r_shift_m = calc_rshift_m(p_param, weight); + ret = ps32_m2_conv_ref(p_param, input, weight, output_ref); + if (ret) + goto fail_exit; + + cvk_tiu_pt_convolution_param_t conv_param; + make_bmk_conv_param_ps32(cvk_ctx, &conv_param, p_param, 2); + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param); + + if (tl_alloc_success) { + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, conv_param.ifmap, (uint8_t *)input); + put_conv_weight(rt_handle, cvk_ctx, conv_param.weight, (uint8_t *)weight); + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &conv_param); + + cvk_tl_t ps32_ofmap; + ps32_ofmap = *conv_param.ofmap; + ps32_ofmap.shape.n = ps32_ofmap.shape.n * sizeof(int); + uint8_t *output = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, &ps32_ofmap); + + ret = array_cmp_int8( + "Comparing begin_mode results ...\n", + output_ref, (int8_t *)output, conv_output_size(p_param) * sizeof(int)); + + if (ret) { + print_conv_param(p_param); + printf("Comparison FAILED\n"); + } + + free(output); + } + free_bmk_conv_param(cvk_ctx, &conv_param, p_param); + + printf("test_ps32_intermediate_mode\n"); + for (int i=0; i < conv_input_size(p_param); i++) + input[i] = rand() % 256 - 128; + + for (int i=0; i < conv_weight_size(p_param); i++) + weight[i] = rand() % 256 - 128; + + ret = ps32_m3_conv_ref(p_param, input, weight, output_ref); + if (ret) + goto fail_exit; + + make_bmk_conv_param_ps32(cvk_ctx, &conv_param, p_param, 3); + tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param); + + if (tl_alloc_success) { + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, conv_param.ifmap, (uint8_t *)input); + put_conv_weight(rt_handle, cvk_ctx, conv_param.weight, (uint8_t *)weight); + + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &conv_param); + + cvk_tl_t ps32_ofmap; + ps32_ofmap = *conv_param.ofmap; + ps32_ofmap.shape.n = ps32_ofmap.shape.n * sizeof(int); + + uint8_t *output = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, &ps32_ofmap); + + ret = array_cmp_int8( + "Comparing intermediate results ...\n", + output_ref, (int8_t *)output, conv_output_size(p_param) * sizeof(int)); + + if (ret) { + print_conv_param(p_param); + printf("Comparison FAILED\n"); + } + + free(output); + } + free_bmk_conv_param(cvk_ctx, &conv_param, p_param); + + printf("test_ps32_end_mode\n"); + for (int i=0; i < conv_input_size(p_param); i++) + input[i] = rand() % 256 - 128; + + for (int i=0; i < conv_weight_size(p_param); i++) + weight[i] = rand() % 256 - 128; + + ret = ps32_m1_conv_ref(p_param, input, weight, bias, output_ref); + if (ret) + goto fail_exit; + + make_bmk_conv_param_ps32(cvk_ctx, &conv_param, p_param, 1); + + 
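+  /* ps32_mode recap for the three runs in this test (not from vendor docs;
+   * inferred from the reference code in this file):
+   *   mode 2 (begin):        ofmap receives raw 32-bit partial sums stored
+   *                          as four int8 planes, bstride apart; e.g. a
+   *                          partial sum 0x12345678 is written as plane0
+   *                          0x78, plane1 0x56, plane2 0x34, plane3 0x12
+   *   mode 3 (intermediate): accumulates onto those partial sums
+   *   mode 1 (end):          adds bias, applies relu/rshift and saturates
+   *                          the result back to int8
+   * Only end mode carries bias/relu/rshift, matching
+   * make_bmk_conv_param_ps32(). */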
tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param); + + if (tl_alloc_success) { + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, conv_param.ifmap, (uint8_t *)input); + put_conv_weight(rt_handle, cvk_ctx, conv_param.weight, (uint8_t *)weight); + if (p_param->using_bias) { + put_conv_bias(rt_handle, cvk_ctx, conv_param.bias, bias); + } + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &conv_param); + uint8_t *output = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, conv_param.ofmap); + + ret = array_cmp_int8( + "Comparing end results ...\n", + output_ref, (int8_t *)output, conv_output_size(p_param)); + + if (ret) { + print_conv_param(p_param); + printf("Comparison FAILED\n"); + } + + free(output); + } + + free_bmk_conv_param(cvk_ctx, &conv_param, p_param); + +fail_exit: + free(input); + free(weight); + free(bias); + free(output_ref); + + return ret; +} + +static int test_ic_tiling_conv( + conv_param_t *p_param, CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + printf("test tiled ps32 conv\n"); + int ret = 0; + int8_t *input = alloc_input(p_param); + int8_t *weight = alloc_weight(p_param); + int16_t *bias = alloc_bias(p_param); + int8_t *output_ref = (int8_t *)malloc(sizeof(int8_t) * conv_output_size(p_param)); + if (!input || !weight || !bias || !output_ref) { + ret = -1; + goto fail_exit_2; + } + + p_param->r_shift_m = calc_rshift_m(p_param, weight); + ret = conv_ref(p_param, input, weight, bias, output_ref); + if (ret) + goto fail_exit_2; + + cvk_tiu_pt_convolution_param_t conv_tmp_param; + cvk_tiu_pt_convolution_param_t conv_param; + make_bmk_conv_param(cvk_ctx, &conv_param, p_param); + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, p_param); + if (tl_alloc_success) { + if (p_param->using_bias) { + conv_tmp_param.bias = conv_param.bias; + put_conv_bias(rt_handle, cvk_ctx, conv_param.bias, bias); + } + // for ps32_md[1] = 1, relu_enable & rshift_bits need to set to 0 + // so we store those parameters to conv_tmp_para + conv_tmp_param.relu_enable = conv_param.relu_enable; + conv_tmp_param.rshift_bits = conv_param.rshift_bits; + conv_tmp_param.bias = conv_param.bias; + + uint32_t ic_step = 1; + uint32_t n_step = 1; + cvk_tl_t ifmap = *conv_param.ifmap; + cvk_tl_t ofmap = *conv_param.ofmap; + cvk_tg_shape_t s; + s.n = conv_param.ifmap->shape.n; + s.c = conv_param.ifmap->shape.c; + s.h = conv_param.ifmap->shape.h; + s.w = conv_param.ifmap->shape.w; + cvk_tg_t *tg_ifmap = alloc_tensor_dev_mem(rt_handle, cvk_ctx, s, CVK_FMT_I8); + tensor_copy_s2d(rt_handle, tg_ifmap, (uint8_t *)input); + + s.n = conv_param.weight->shape.n; + s.c = conv_param.weight->shape.c; + s.h = conv_param.weight->shape.h; + s.w = conv_param.weight->shape.w; + uint8_t *transformed_weight = + transform_weight(&conv_param.weight->shape, (uint8_t *)weight); + cvk_tg_t *tg_weight = alloc_tensor_dev_mem(rt_handle, cvk_ctx, s, CVK_FMT_I8); + tensor_copy_s2d(rt_handle, tg_weight, (uint8_t *)transformed_weight); + free(transformed_weight); + + cvk_tl_shape_t cur_tl_ifmap_shape = { + n_step, + ic_step, + ifmap.shape.h, + ifmap.shape.w + }; + + cvk_tg_shape_t cur_tg_ifmap_shape = { + cur_tl_ifmap_shape.n, + cur_tl_ifmap_shape.c, + cur_tl_ifmap_shape.h, + cur_tl_ifmap_shape.w + }; + + cvk_tg_stride_t cur_tg_ifmap_stride = { + tg_ifmap->stride.n, + tg_ifmap->stride.c, + tg_ifmap->stride.h, + fmt_size(tg_ifmap->fmt), + }; + + cvk_tg_t cur_tg_ifmap; + cur_tg_ifmap.base_reg_index = 0; + cur_tg_ifmap.start_address = tg_ifmap->start_address; + cur_tg_ifmap.shape = cur_tg_ifmap_shape; + cur_tg_ifmap.stride = cur_tg_ifmap_stride; 
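+    /* cur_tg_ifmap is a one-ic_step-channel window into tg_ifmap: the
+     * shape is (n_step, ic_step, h, w) but the strides stay those of the
+     * full tensor, and only start_address advances in the loop below. */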
+    cur_tg_ifmap.fmt = CVK_FMT_I8;
+
+    cvk_tl_t cur_tl_ifmap;
+    cur_tl_ifmap.shape = cur_tl_ifmap_shape;
+    cur_tl_ifmap.stride =
+      cvk_ctx->ops->tl_default_stride(cvk_ctx, cur_tl_ifmap_shape, CVK_FMT_I8, 1);
+    cur_tl_ifmap.start_address = ifmap.start_address;
+    cur_tl_ifmap.fmt = ifmap.fmt;
+
+    cvk_tl_t cur_tl_ofmap;
+    cur_tl_ofmap.start_address = ofmap.start_address;
+    cur_tl_ofmap.shape = ofmap.shape;
+    cur_tl_ofmap.shape.n = n_step;
+    cur_tl_ofmap.stride =
+      cvk_ctx->ops->tl_default_stride(cvk_ctx, cur_tl_ofmap.shape, CVK_FMT_I8, 1);
+    cur_tl_ofmap.fmt = ofmap.fmt;
+
+    // weight tile strides make the input channel (n) the fastest-varying
+    // dimension (stride.n = 1), matching transform_weight()'s
+    // (1, oc, kh*kw, ic) layout
+    cvk_tl_t cur_tl_weight;
+    memset(&cur_tl_weight, 0, sizeof(cur_tl_weight));
+    cur_tl_weight.start_address = conv_param.weight->start_address;
+    cur_tl_weight.shape = conv_param.weight->shape;
+    cur_tl_weight.shape.n = ic_step;
+    cur_tl_weight.stride.n = 1;
+    cur_tl_weight.stride.c = cur_tl_weight.shape.n * cur_tl_weight.shape.h * cur_tl_weight.shape.w;
+    cur_tl_weight.stride.h = cur_tl_weight.shape.n * cur_tl_weight.shape.w;
+    cur_tl_weight.stride.w = cur_tl_weight.shape.n;
+    cur_tl_weight.fmt = conv_param.weight->fmt;
+
+    const cvk_tl_t *saved_tl_weight = conv_param.weight;
+    const cvk_tl_t *saved_tl_ifmap = conv_param.ifmap;
+    for (uint32_t ci = 0; ci < ifmap.shape.c; ci += ic_step) {
+      {
+        uint32_t ic = tg_weight->shape.n;
+        uint32_t oc = tg_weight->shape.c;
+        uint32_t kh = tg_weight->shape.h;
+        uint32_t kw = tg_weight->shape.w;
+
+        cvk_tg_t cur_tdma_tg_weight;
+        cur_tdma_tg_weight.base_reg_index = tg_weight->base_reg_index;
+        cur_tdma_tg_weight.start_address = tg_weight->start_address + ci;
+        cur_tdma_tg_weight.fmt = tg_weight->fmt;
+        // compute strides from the full (1, oc, kh*kw, ic) extent, then
+        // narrow the shape to the ic_step slice actually copied
+        cur_tdma_tg_weight.shape = tg_shape_t4(1, oc, kh * kw, ic);
+        cur_tdma_tg_weight.stride =
+          cvk_ctx->ops->tg_default_stride(cvk_ctx, cur_tdma_tg_weight.shape, cur_tdma_tg_weight.fmt);
+        cur_tdma_tg_weight.shape = tg_shape_t4(1, oc, kh * kw, ic_step);
+
+        cvk_tl_t cur_tdma_tl_weight;
+        cur_tdma_tl_weight = cur_tl_weight;
+        cur_tdma_tl_weight.shape.n = cur_tdma_tg_weight.shape.n;
+        cur_tdma_tl_weight.shape.c = cur_tdma_tg_weight.shape.c;
+        cur_tdma_tl_weight.shape.h = cur_tdma_tg_weight.shape.h;
+        cur_tdma_tl_weight.shape.w = cur_tdma_tg_weight.shape.w;
+        cur_tdma_tl_weight.stride = cvk_ctx->ops->tl_default_stride(
+            cvk_ctx, cur_tdma_tl_weight.shape, CVK_FMT_I8, 0);
+
+        cvk_tdma_g2l_tensor_copy_param_t p1;
+        memset(&p1, 0, sizeof(p1));
+        p1.src = &cur_tdma_tg_weight;
+        p1.dst = &cur_tdma_tl_weight;
+        cvk_ctx->ops->tdma_g2l_tensor_copy(cvk_ctx, &p1);
+        CVI_RT_Submit(cvk_ctx);
+      }
+      {
+        cvk_tdma_g2l_tensor_copy_param_t p2;
+        memset(&p2, 0, sizeof(p2));
+        cur_tg_ifmap.start_address =
+          tg_ifmap->start_address + ci * tg_ifmap->stride.c;
+        p2.src = &cur_tg_ifmap;
+        p2.dst = &cur_tl_ifmap;
+        cvk_ctx->ops->tdma_g2l_tensor_copy(cvk_ctx, &p2);
+        CVI_RT_Submit(cvk_ctx);
+      }
+
+      conv_param.ifmap = &cur_tl_ifmap;
+      conv_param.weight = &cur_tl_weight;
+
+      // While ps32_mode has bit[1] set (begin/intermediate), relu_enable and
+      // rshift_bits must stay 0; the real values saved in conv_tmp_param are
+      // restored only for the last (end-mode) slice.
+      if (ci == (ifmap.shape.c - 1))
+      {
+        conv_param.relu_enable = conv_tmp_param.relu_enable;
+        conv_param.rshift_bits = conv_tmp_param.rshift_bits;
+        conv_param.bias = conv_tmp_param.bias;
+        conv_param.ps32_mode = 1;
+      }
+      else if (ci == 0)
+      {
+        conv_param.relu_enable = 0;
+        conv_param.rshift_bits = 0;
+        conv_param.bias = NULL;
+        conv_param.ps32_mode = 2;
+      }
+      else
+      {
+        conv_param.relu_enable = 0;
+        conv_param.rshift_bits = 0;
+        conv_param.bias = NULL;
+        conv_param.ps32_mode = 3;
+      }
+      cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &conv_param);
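+      /* restore the full-tensor handles so the next iteration re-slices
+       * from the originals and free_bmk_conv_param() later frees the
+       * tensors that were actually allocated */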
+      conv_param.weight = saved_tl_weight;
+      conv_param.ifmap = saved_tl_ifmap;
+    }
+
+    uint8_t *output = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, conv_param.ofmap);
+
+    free_tensor_dev_mem(rt_handle, tg_ifmap);
+    free_tensor_dev_mem(rt_handle, tg_weight);
+    ret = array_cmp_int8(
+        "Comparing results ...\n",
+        output_ref, (int8_t *)output, conv_output_size(p_param));
+
+    if (ret) {
+      print_conv_param(p_param);
+      printf("Comparison FAILED\n");
+    }
+    free(output);
+  }
+  free_bmk_conv_param(cvk_ctx, &conv_param, p_param);
+
+fail_exit_2:
+  free(input);
+  free(weight);
+  free(output_ref);
+  free(bias);
+
+  return ret;
+}
+
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  for (int i = 0; i < 5; i++) {
+    printf("random_test_conv iteration: %d\n", i);
+    conv_param_t test_conv_param;
+    init_conv_param(&test_conv_param);
+    ret |= test_ic_tiling_conv(&test_conv_param, rt_handle, cvk_ctx);
+    ret |= test_ps32_ut(&test_conv_param, rt_handle, cvk_ctx);
+    if (!test_conv_param.using_bias)
+      test_conv_param.using_bias = 1;
+    if (test_conv_param.output_c <= 9)
+      test_conv_param.output_c += 3;
+    ret |= test_ic_tiling_conv(&test_conv_param, rt_handle, cvk_ctx);
+    ret |= test_ps32_ut(&test_conv_param, rt_handle, cvk_ctx);
+  }
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  if (!ret)
+    printf("%s pass\n", __FILENAME__);
+  else
+    printf("%s fail\n", __FILENAME__);
+
+  return ret;
+}
diff --git a/cviruntime/test/181x/test_181x_g2g_bf16_tensor_copy.c b/cviruntime/test/181x/test_181x_g2g_bf16_tensor_copy.c
new file mode 100644
index 000000000..c8367c045
--- /dev/null
+++ b/cviruntime/test/181x/test_181x_g2g_bf16_tensor_copy.c
@@ -0,0 +1,159 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+
+#include "test_cvikernel_util.h"
+
+typedef cvk_tdma_g2g_tensor_copy_param_t param_t;
+
+static void __print_param(const char *tag, FILE *f, param_t *p)
+{
+  fprintf(
+      f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n",
+      tag,
+      p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w,
+      p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w);
+}
+
+#define print_param(f, p) __print_param(__func__, f, p)
+
+typedef struct {
+  cvk_tg_shape_t src_shape;
+  cvk_tg_stride_t src_stride;
+  cvk_tg_shape_t dst_shape;
+  cvk_tg_stride_t dst_stride;
+} case_t;
+
+typedef struct {
+  cvk_fmt_t src_fmt;
+  cvk_fmt_t dst_fmt;
+} fmt_type_t;
+
+static fmt_type_t input_fmt[] = {
+  {CVK_FMT_BF16, CVK_FMT_BF16},
+  {CVK_FMT_I8, CVK_FMT_I8},
+};
+
+static case_t g_cases[] = {
+  {
+    {1, 3, 3, 3}, {27, 9, 3, 1},
+    {1, 3, 3, 3}, {27, 9, 3, 1},
+  },
+  {
+    // YOLOv2 concat layer
+    {1, 256, 19, 19}, {92416, 361, 19, 1},
+    {1, 256, 19, 19}, {462080, 361, 19, 1},
+  }
+};
+
+static int test_param_g2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p)
+{
+  print_param(stderr, p);
+  int ret = 0;
+  uint64_t size = p->src->shape.c * p->src->shape.h * p->src->shape.w;
+
+  uint16_t *u16src_data = (uint16_t *)malloc(sizeof(uint16_t) * size);
+  uint8_t *u8src_data = (uint8_t *)malloc(sizeof(uint8_t) * size);
+  uint8_t *src_data, *dst_data = NULL;
+  if (!u16src_data || !u8src_data) {
+    ret = -1;
+    goto fail_exit;
+  }
+
+  if (p->src->fmt == CVK_FMT_BF16) {
+    /* bf16 */
+    float val = -100;
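+    /* step val so test_generate_bf16_corner_val() (helper from
+     * test_cvikernel_util.h) yields a spread of bf16 bit patterns,
+     * covering negative, positive and non-integer values */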
+    for (uint64_t i = 0; i < size; i++) {
+      u16src_data[i] = test_generate_bf16_corner_val(val);
+      val += 0.1;
+    }
+    src_data = (uint8_t *)u16src_data;
+  } else {
+    /* int8 -> int8 */
+    for (uint64_t i = 0; i < size; i++) {
+      u8src_data[i] = 200 + i;
+    }
+    src_data = u8src_data;
+  }
+
+  tensor_copy_s2d(rt_handle, p->src, src_data);
+
+  cvk_ctx->ops->tdma_g2g_bf16_tensor_copy(cvk_ctx, p);
+  CVI_RT_Submit(cvk_ctx);
+
+  dst_data = tensor_copy_d2s(rt_handle, p->dst);
+  if (!dst_data) {
+    ret = -1;
+    goto fail_exit;
+  }
+
+  for (uint64_t i = 0; i < size; i++) {
+    if (dst_data[i] != src_data[i]) {
+      fprintf(stderr, "comparing(%d->%d) failed at dst[%" PRIu64 "], got %x, exp %x\n",
+              p->src->fmt, p->dst->fmt, i, dst_data[i], src_data[i]);
+      ret = -1;
+      break;
+    }
+  }
+
+fail_exit:
+  free(u8src_data);
+  free(u16src_data);
+  free(dst_data);
+
+  return ret;
+}
+
+static void destroy_param_g2g(CVI_RT_HANDLE rt_handle, param_t *p)
+{
+  free_tensor_dev_mem(rt_handle, p->src);
+  free_tensor_dev_mem(rt_handle, p->dst);
+}
+
+static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c)
+{
+  int ret = 0;
+  uint32_t nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]);
+
+  for (uint32_t i = 0; i < nr_fmt; i++) {
+    param_t p;
+    memset(&p, 0, sizeof(p));
+    p.src = alloc_tensor_dev_mem(rt_handle, cvk_ctx, c->src_shape, input_fmt[i].src_fmt);
+    p.dst = alloc_tensor_dev_mem(rt_handle, cvk_ctx, c->dst_shape, input_fmt[i].dst_fmt);
+    ret |= test_param_g2g(rt_handle, cvk_ctx, &p);
+    destroy_param_g2g(rt_handle, &p);
+  }
+
+  return ret;
+}
+
+int main(int argc, char **argv)
+{
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+  int ret = 0;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]);
+  for (uint32_t i = 0; i < nr_cases; i++)
+    ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]);
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
\ No newline at end of file
diff --git a/cviruntime/test/181x/test_181x_lut.c b/cviruntime/test/181x/test_181x_lut.c
new file mode 100644
index 000000000..5a05c9483
--- /dev/null
+++ b/cviruntime/test/181x/test_181x_lut.c
@@ -0,0 +1,140 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+
+#include "test_cvikernel_util.h"
+
+static uint32_t channel = -1;
+
+  cvk_tl_t *tl_ifmap =
+      cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ifmap_shape, CVK_FMT_I8, /*align*/1);
+  cvk_tl_t *tl_table =
+      cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, table_shape, CVK_FMT_I8, /*align*/1);
+  cvk_tl_t *tl_ofmap =
+      cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ofmap_shape, CVK_FMT_I8, /*align*/1);
+  uint8_t *ofmap_data = NULL;
+  if (!tl_ifmap || !tl_table || !tl_ofmap) {
+    ret = -1;
+    goto fail_exit_2;
+  }
+
+  tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_ifmap, ifmap_data);
+  tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_table, table_data);
+  cvk_tiu_lookup_table_param_t p12;
+  p12.ofmap = tl_ofmap;
+  p12.ifmap = tl_ifmap;
+  p12.table = tl_table;
+  cvk_ctx->ops->tiu_lookup_table(cvk_ctx, &p12);
+  CVI_RT_Submit(cvk_ctx);
+  ofmap_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_ofmap);
+  for (uint64_t i = 0; i < ofmap_size; i++) {
+    if (ofmap_data[i] != ref_data[i]) {
+      fprintf(stderr,
+              "comparing failed at ofmap_data[%" PRIu64 "], got %d, exp %d\n",
+              i, ofmap_data[i], ref_data[i]);
+      ret = -1;
+      break;
+    }
+  }
+
+fail_exit_2:
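+  /* note the reverse-of-allocation free order (ofmap, table, ifmap) used
+   * for local-memory tensors throughout these tests */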
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_ofmap);
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_table);
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_ifmap);
+  free(ofmap_data);
+
+fail_exit:
+  free(ifmap_data);
+  free(table_data);
+  free(ref_data);
+
+  return ret;
+}
+
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  // get channel info
+  channel = cvk_ctx->info.npu_num;
+
+  ret = test_tl_lut(rt_handle, cvk_ctx);
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/181x/test_181x_matrix_mac.c b/cviruntime/test/181x/test_181x_matrix_mac.c
new file mode 100644
index 000000000..2a959be1e
--- /dev/null
+++ b/cviruntime/test/181x/test_181x_matrix_mac.c
@@ -0,0 +1,2014 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <inttypes.h>
+
+#include "test_cvikernel_util.h"
+#include "test_native_ref.h"
+
+typedef cvk_tiu_matrix_multiplication_param_t param_t;
+
+static uint64_t matrix_size(const cvk_ml_t *ml)
+{
+  uint64_t row = ml->shape.n;
+  uint64_t col = ml->shape.col;
+  return row * col;
+}
+
+static uint64_t res_size(param_t *p)
+{
+  if (p->res_is_int8 && !p->add_result)
+    return matrix_size(p->res);
+  else
+    return matrix_size(p->res) / 2;
+}
+
+static uint8_t * alloc_left(param_t *p)
+{
+  uint64_t size = matrix_size(p->left);
+
+  uint8_t *buf = (uint8_t *)malloc(sizeof(uint8_t) * size);
+  if (!buf)
+    return NULL;
+
+  for (uint64_t i = 0; i < size; i++)
+    buf[i] = i % 17 - 9;
+
+  return buf;
+}
+
+static uint8_t * alloc_right(param_t *p)
+{
+  uint64_t size = matrix_size(p->right);
+
+  uint8_t *buf = (uint8_t *)malloc(sizeof(uint8_t) * size);
+  if (!buf)
+    return NULL;
+
+  for (uint64_t i = 0; i < size; i++)
+    buf[i] = i % 13 - 6;
+
+  return buf;
+}
+
+static uint16_t * alloc_bias(param_t *p)
+{
+  if (!p->bias)
+    return NULL;
+
+  uint64_t size = matrix_size(p->bias) / 2;
+
+  uint16_t *buf = (uint16_t *)malloc(sizeof(uint16_t) * size);
+  if (!buf)
+    return NULL;
+
+  for (uint64_t i = 0; i < size; i++)
+    buf[i] = 5 - (i % 7);
+
+  return buf;
+}
+
+static uint16_t * alloc_res(param_t *p)
+{
+  uint64_t size = res_size(p);
+
+  uint16_t *buf = (uint16_t *)malloc(sizeof(uint16_t) * size);
+  if (!buf)
+    return NULL;
+
+  for (uint64_t i = 0; i < size; i++)
+    buf[i] = 17 - (i % 35);
+
+  return buf;
+}
+
+static void right_shift(param_t *p, int32_t *buf, uint64_t size)
+{
+  int shift_bits = p->rshift_bits;
+  int round_up = 1;
+  // this reference always takes the arithmetic-shift branch; the logic
+  // variant is kept for reference
+  if (1)
+    arith_right_shift(buf, size, shift_bits, round_up);
+  else
+    logic_right_shift(buf, size, shift_bits, round_up);
+}
+
+static void matrix_mac_ref(
+    param_t *p, uint8_t left[], uint8_t right[], uint16_t bias[], uint16_t res[])
+{
+  uint64_t size = res_size(p);
+  uint32_t left_col = p->left->shape.col;
+  uint32_t right_col = p->right->shape.col;
+  uint32_t res_row = p->left->shape.n;
+  uint32_t res_col = p->res->shape.col;
+  int left_sign = (p->left->fmt == CVK_FMT_I8);
+  int right_sign = (p->right->fmt == CVK_FMT_I8);
+  int res_sign = (p->res->fmt == CVK_FMT_I8);
+
+  int32_t *tmp_res = (int32_t *)malloc(sizeof(int32_t) * size);
+  if (!tmp_res)
+    return;
+
+  if (p->add_result) {
+    for (uint32_t i = 0; i < res_row * res_col; i++) {
+      tmp_res[i] = res_sign?
(int16_t)res[i]: res[i]; + tmp_res[i] <<= p->lshift_bits; + } + } else { + for (uint32_t i = 0; i < res_row * res_col; i++) + tmp_res[i] = 0; + } + + for (uint32_t row = 0; row < res_row; row++) { + for (uint32_t col = 0; col < res_col; col++) { + for (uint32_t i = 0; i < left_col; i++) { + uint32_t li = row * left_col + i; + uint32_t ri = i * right_col + col; + int32_t l = left_sign? (int8_t)left[li]: left[li]; + int32_t r = right_sign? (int8_t)right[ri]: right[ri]; + tmp_res[row * res_col + col] += l * r; + } + } + } + + if (p->bias && bias) { + for (uint32_t row = 0; row < res_row; row++) { + for (uint32_t col = 0; col < res_col; col++) { + int bias_sign = (p->bias->fmt == CVK_FMT_I8); + int32_t b = bias_sign? (int16_t)bias[col]: bias[col]; + tmp_res[row * res_col + col] += b; + } + } + } + + if (p->relu_enable) + relu(tmp_res, size); + + right_shift(p, tmp_res, size); + + if (p->res_is_int8) + saturate_to_int8(tmp_res, size, res_sign); + else + saturate_to_int16(tmp_res, size, res_sign); + + for (uint64_t i = 0; i < size; i++) + res[i] = tmp_res[i]; + + free(tmp_res); +} + +static void put_bias( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_ml_t *ml, + uint16_t data[]) +{ + uint64_t size = ml->shape.col; + + uint8_t *tmp = (uint8_t *)malloc(sizeof(uint8_t) * size * 2); + if (!tmp) + return; + + for (uint64_t i = 0; i < size; i++) { + tmp[i] = data[i] & 0xff; + tmp[i + size] = (data[i] >> 8) & 0xff; + } + + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, ml, tmp); + + free(tmp); +} + +static void put_res( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_ml_t *ml, + uint16_t data[]) +{ + uint64_t size = ml->shape.n / 2 * ml->shape.col; + + uint8_t *tmp = (uint8_t *)malloc(sizeof(uint8_t) * size * 2); + if (!tmp) + return; + + for (uint64_t i = 0; i < size; i++) { + tmp[i] = data[i] & 0xff; + tmp[i + size] = (data[i] >> 8) & 0xff; + } + + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, ml, tmp); + + free(tmp); +} + +static uint16_t * get_res( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + param_t *p) +{ + uint64_t size = res_size(p); + uint16_t *res = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!res) + return NULL; + + uint8_t *tmp = matrix_copy_l2g_d2s(rt_handle, cvk_ctx, p->res); + if (p->res_is_int8) { + int res_sign = (p->res->fmt == CVK_FMT_I8); + for (uint64_t i = 0; i < size; i++) + res[i] = res_sign? 
(int8_t)tmp[i]: tmp[i]; + } else { + for (uint64_t i = 0; i < size; i++) + res[i] = tmp[i] + (tmp[i + size] << 8); + } + + free(tmp); + return res; +} + +static int test_param(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + int ret = 0; + uint8_t *left = alloc_left(p); + uint8_t *right = alloc_right(p); + uint16_t *bias = alloc_bias(p); + uint16_t *ref = alloc_res(p); + if (!left || !right || (p->bias && !bias) || !ref) { + ret = -1; + goto fail_exit; + } + + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, p->left, left); + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, p->right, right); + if (bias) + put_bias(rt_handle, cvk_ctx, p->bias, bias); + if (p->add_result) + put_res(rt_handle, cvk_ctx, p->res, ref); + + cvk_ctx->ops->tiu_matrix_multiplication(cvk_ctx, p); + uint16_t *res = get_res(rt_handle, cvk_ctx, p); + + matrix_mac_ref(p, left, right, bias, ref); + + uint64_t size = res_size(p); + for (uint64_t i = 0; i < size; i++) { + if (res[i] != ref[i]) { + fprintf(stderr, "comparing failed at out[%" PRIu64 "], got %d, exp %d\n", + i, (int16_t)res[i], (int16_t)ref[i]); + ret = -1; + break; + } + } + + free(res); + +fail_exit: + free(left); + free(right); + free(bias); + free(ref); + + return ret; +} + +static void destroy_param(cvk_context_t *cvk_ctx, param_t *p) +{ + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->res); + if (p->bias) + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->bias); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->right); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->left); +} + +static cvk_ml_t *alloc_param_res( + cvk_context_t *cvk_ctx, param_t *p) +{ + cvk_ml_shape_t s; + s.n = p->left->shape.n; + if (p->add_result || !p->res_is_int8) + s.n *= 2; + s.c = p->right->shape.c; + s.w = p->right->shape.w; + s.col = p->right->shape.col; + + cvk_fmt_t fmt = CVK_FMT_U8; + if (p->left->fmt == CVK_FMT_I8) + fmt = CVK_FMT_I8; + if (p->right->fmt == CVK_FMT_I8) + fmt = CVK_FMT_I8; + if (p->bias) + if (p->bias->fmt == CVK_FMT_I8) + fmt = CVK_FMT_I8; + + if (p->relu_enable) + fmt = CVK_FMT_U8; + + return cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, s, fmt, 1); +} + +static param_t param_0(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + p.ps32_mode = 0; + + uint32_t left_row = 1; + uint32_t left_col = 1; + uint32_t left_c = 1; + uint32_t left_w = 1; + + uint32_t right_row = 1; + uint32_t right_col = 1; + uint32_t right_c = 1; + uint32_t right_w = 1; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_1(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 6; + uint32_t left_col = 1; + uint32_t left_c = 1; + uint32_t left_w = 1; + + uint32_t right_row = 1; + uint32_t right_col = 1; + uint32_t right_c = 1; + uint32_t right_w = 1; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + 
left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_2(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 6; + uint32_t left_col = 25; + uint32_t left_c = 1; + uint32_t left_w = 25; + + uint32_t right_row = 25; + uint32_t right_col = 1; + uint32_t right_c = 1; + uint32_t right_w = 1; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_3(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 6; + uint32_t left_col = 25; + uint32_t left_c = 2; + uint32_t left_w = 18; + + uint32_t right_row = 25; + uint32_t right_col = 1; + uint32_t right_c = 1; + uint32_t right_w = 1; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_4(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 6; + uint32_t left_col = 39; + uint32_t left_c = 4; + uint32_t left_w = 10; + + uint32_t right_row = 39; + uint32_t right_col = 1; + uint32_t right_c = 1; + uint32_t right_w = 1; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_5(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 1; + uint32_t left_col = 1; + uint32_t left_c = 1; + uint32_t left_w = 1; + + uint32_t 
right_row = 1; + uint32_t right_col = 2; + uint32_t right_c = 1; + uint32_t right_w = 2; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_6(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 1; + uint32_t left_col = 1; + uint32_t left_c = 1; + uint32_t left_w = 1; + + uint32_t right_row = 1; + uint32_t right_col = 2; + uint32_t right_c = 2; + uint32_t right_w = 1; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_7(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 1; + uint32_t left_col = 1; + uint32_t left_c = 1; + uint32_t left_w = 1; + + uint32_t right_row = 1; + uint32_t right_col = 10; + uint32_t right_c = 3; + uint32_t right_w = 4; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_8(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 7; + uint32_t left_col = 1; + uint32_t left_c = 1; + uint32_t left_w = 1; + + uint32_t right_row = 1; + uint32_t right_col = 10; + uint32_t right_c = 3; + uint32_t right_w = 4; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_9(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 
= true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 7; + uint32_t left_col = 23; + uint32_t left_c = 3; + uint32_t left_w = 8; + + uint32_t right_row = 23; + uint32_t right_col = 10; + uint32_t right_c = 3; + uint32_t right_w = 4; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_10(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 7; + uint32_t left_col = 477; + uint32_t left_c = 60; + uint32_t left_w = 8; + + uint32_t right_row = 477; + uint32_t right_col = 10; + uint32_t right_c = 3; + uint32_t right_w = 4; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_11(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 7; + uint32_t left_col = 23; + uint32_t left_c = 3; + uint32_t left_w = 8; + + uint32_t right_row = 23; + uint32_t right_col = 477; + uint32_t right_c = 60; + uint32_t right_w = 8; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_12(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 1; + uint32_t left_col = 1; + uint32_t left_c = 1; + uint32_t left_w = 1; + + uint32_t right_row = 1; + uint32_t right_col = 1; + uint32_t right_c = 1; + uint32_t right_w = 1; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, 
right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_I8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_13(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 1; + uint32_t left_col = 1; + uint32_t left_c = 1; + uint32_t left_w = 1; + + uint32_t right_row = 1; + uint32_t right_col = 2; + uint32_t right_c = 1; + uint32_t right_w = 2; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_I8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_14(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 7; + uint32_t left_col = 477; + uint32_t left_c = 60; + uint32_t left_w = 8; + + uint32_t right_row = 477; + uint32_t right_col = 10; + uint32_t right_c = 3; + uint32_t right_w = 4; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_I8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_15(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 3; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 7; + uint32_t left_col = 477; + uint32_t left_c = 60; + uint32_t left_w = 8; + + uint32_t right_row = 477; + uint32_t right_col = 10; + uint32_t right_c = 3; + uint32_t right_w = 4; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_I8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_16(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 3; + p.res_is_int8 = true; + p.relu_enable = false; + 
p.add_result = false; + + uint32_t left_row = 7; + uint32_t left_col = 477; + uint32_t left_c = 60; + uint32_t left_w = 8; + + uint32_t right_row = 477; + uint32_t right_col = 10; + uint32_t right_c = 3; + uint32_t right_w = 4; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_17(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 3; + p.res_is_int8 = true; + p.relu_enable = true; + p.add_result = false; + + uint32_t left_row = 7; + uint32_t left_col = 477; + uint32_t left_c = 60; + uint32_t left_w = 8; + + uint32_t right_row = 477; + uint32_t right_col = 10; + uint32_t right_c = 3; + uint32_t right_w = 4; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_18(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 7; + uint32_t left_col = 23; + uint32_t left_c = 3; + uint32_t left_w = 8; + + uint32_t right_row = 23; + uint32_t right_col = 477; + uint32_t right_c = 60; + uint32_t right_w = 8; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_19(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 1; + uint32_t left_col = 1; + uint32_t left_c = 1; + uint32_t left_w = 1; + + uint32_t right_row = 1; + uint32_t right_col = 1; + uint32_t right_c = 1; + uint32_t right_w = 1; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + 
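+ /* param_19 through param_23 set res_is_int8 = false: the result is + * presumably kept as 16-bit data, which the helpers read back as two + * 8-bit planes (res_size() in the ps32 test below doubles the size for + * exactly this case). */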
+ cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_I8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_20(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 4; + uint32_t left_col = 1; + uint32_t left_c = 1; + uint32_t left_w = 1; + + uint32_t right_row = 1; + uint32_t right_col = 1; + uint32_t right_c = 1; + uint32_t right_w = 1; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_I8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_21(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 3; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 7; + uint32_t left_col = 477; + uint32_t left_c = 60; + uint32_t left_w = 8; + + uint32_t right_row = 477; + uint32_t right_col = 10; + uint32_t right_c = 3; + uint32_t right_w = 4; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_22(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 2; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 7; + uint32_t left_col = 23; + uint32_t left_c = 3; + uint32_t left_w = 8; + + uint32_t right_row = 23; + uint32_t right_col = 477; + uint32_t right_c = 60; + uint32_t right_w = 8; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, 
CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_23(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 2; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 7; + uint32_t left_col = 23; + uint32_t left_c = 3; + uint32_t left_w = 8; + + uint32_t right_row = 23; + uint32_t right_col = 477; + uint32_t right_c = 60; + uint32_t right_w = 8; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_24(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = true; + + uint32_t left_row = 1; + uint32_t left_col = 1; + uint32_t left_c = 1; + uint32_t left_w = 1; + + uint32_t right_row = 1; + uint32_t right_col = 1; + uint32_t right_c = 1; + uint32_t right_w = 1; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_I8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_25(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = true; + + uint32_t left_row = 4; + uint32_t left_col = 1; + uint32_t left_c = 1; + uint32_t left_w = 1; + + uint32_t right_row = 1; + uint32_t right_col = 1; + uint32_t right_c = 1; + uint32_t right_w = 1; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_I8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_26(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = true; + + 
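+ /* param_24 through param_35 set add_result = true, which presumably asks + * the TIU to accumulate into the existing result instead of overwriting + * it; the result is then handled as 16-bit data regardless of + * res_is_int8 (see res_size() in the ps32 test below). */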
uint32_t left_row = 4; + uint32_t left_col = 1; + uint32_t left_c = 1; + uint32_t left_w = 1; + + uint32_t right_row = 1; + uint32_t right_col = 1; + uint32_t right_c = 1; + uint32_t right_w = 1; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_I8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_27(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 3; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = true; + + uint32_t left_row = 7; + uint32_t left_col = 477; + uint32_t left_c = 60; + uint32_t left_w = 8; + + uint32_t right_row = 477; + uint32_t right_col = 10; + uint32_t right_c = 3; + uint32_t right_w = 4; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_28(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 2; + p.rshift_bits = 3; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = true; + + uint32_t left_row = 7; + uint32_t left_col = 477; + uint32_t left_c = 60; + uint32_t left_w = 8; + + uint32_t right_row = 477; + uint32_t right_col = 10; + uint32_t right_c = 3; + uint32_t right_w = 4; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_29(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 2; + p.rshift_bits = 3; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = true; + + uint32_t left_row = 7; + uint32_t left_col = 477; + uint32_t left_c = 60; + uint32_t left_w = 8; + + uint32_t right_row = 477; + uint32_t right_col = 10; + uint32_t right_c = 3; + uint32_t right_w = 4; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t 
right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_30(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 2; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = true; + + uint32_t left_row = 7; + uint32_t left_col = 23; + uint32_t left_c = 3; + uint32_t left_w = 8; + + uint32_t right_row = 23; + uint32_t right_col = 477; + uint32_t right_c = 60; + uint32_t right_w = 8; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_31(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 3; + p.rshift_bits = 2; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = true; + + uint32_t left_row = 7; + uint32_t left_col = 23; + uint32_t left_c = 3; + uint32_t left_w = 8; + + uint32_t right_row = 23; + uint32_t right_col = 477; + uint32_t right_c = 60; + uint32_t right_w = 8; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_32(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 6; + p.rshift_bits = 2; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = true; + + uint32_t left_row = 7; + uint32_t left_col = 23; + uint32_t left_c = 3; + uint32_t left_w = 8; + + uint32_t right_row = 23; + uint32_t right_col = 477; + uint32_t right_c = 60; + uint32_t right_w = 8; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + 
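+ /* Shift semantics, as implemented by the reference code in the ps32 test + * below: the 32-bit accumulator is arithmetically right-shifted with + * rounding by rshift_bits (arith_right_shift), while lshift_bits + * presumably pre-scales the 16-bit bias; param_31 through param_33 pair + * lshift_bits of 3 and 6 with rshift_bits = 2 to cover both directions. */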
p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_33(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 6; + p.rshift_bits = 2; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = true; + + uint32_t left_row = 7; + uint32_t left_col = 23; + uint32_t left_c = 3; + uint32_t left_w = 8; + + uint32_t right_row = 23; + uint32_t right_col = 477; + uint32_t right_c = 60; + uint32_t right_w = 8; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_34(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 1; + p.rshift_bits = 13; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = true; + + uint32_t left_row = 7; + uint32_t left_col = 23; + uint32_t left_c = 3; + uint32_t left_w = 8; + + uint32_t right_row = 23; + uint32_t right_col = 477; + uint32_t right_c = 60; + uint32_t right_w = 8; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_U8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_U8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_35(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 1; + p.rshift_bits = 10; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = true; + + uint32_t left_row = 7; + uint32_t left_col = 23; + uint32_t left_c = 3; + uint32_t left_w = 8; + + uint32_t right_row = 23; + uint32_t right_col = 477; + uint32_t right_c = 60; + uint32_t right_w = 8; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_U8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_U8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_36(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 1; + p.rshift_bits = 10; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + uint32_t 
left_row = 7; + uint32_t left_col = 23; + uint32_t left_c = 3; + uint32_t left_w = 8; + + uint32_t right_row = 23; + uint32_t right_col = 477; + uint32_t right_c = 60; + uint32_t right_w = 8; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_U8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_U8, 1); + p.bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, CVK_FMT_U8, 1); + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_37(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 1; + p.rshift_bits = 10; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 7; + uint32_t left_col = 23; + uint32_t left_c = 3; + uint32_t left_w = 8; + + uint32_t right_row = 23; + uint32_t right_col = 477; + uint32_t right_c = 60; + uint32_t right_w = 8; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_U8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_U8, 1); + p.bias = NULL; + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +static param_t param_38(cvk_context_t *cvk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 1; + p.rshift_bits = 6; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + uint32_t left_row = 7; + uint32_t left_col = 23; + uint32_t left_c = 3; + uint32_t left_w = 8; + + uint32_t right_row = 23; + uint32_t right_col = 477; + uint32_t right_c = 60; + uint32_t right_w = 8; + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_U8, 1); + p.right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(cvk_ctx, &p); + + return p; +} + +#define test_one_param(n) \ + do { \ + param_t p = param_##n(cvk_ctx); \ + ret |= test_param(rt_handle, cvk_ctx, &p); \ + destroy_param(cvk_ctx, &p); \ + } while (0) + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + test_one_param(0); + test_one_param(1); + test_one_param(2); + test_one_param(3); + test_one_param(4); + test_one_param(5); + test_one_param(6); + test_one_param(7); + test_one_param(8); + test_one_param(9); + test_one_param(10); + test_one_param(11); + test_one_param(12); + test_one_param(13); + test_one_param(14); 
+ test_one_param(15); + test_one_param(16); + test_one_param(17); + test_one_param(18); + test_one_param(19); + test_one_param(20); + test_one_param(21); + test_one_param(22); + test_one_param(23); + test_one_param(24); + test_one_param(25); + test_one_param(26); + test_one_param(27); + test_one_param(28); + test_one_param(29); + test_one_param(30); + test_one_param(31); + test_one_param(32); + test_one_param(33); + test_one_param(34); + test_one_param(35); + test_one_param(36); + test_one_param(37); + test_one_param(38); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_matrix_mac_ps32.c b/cviruntime/test/181x/test_181x_matrix_mac_ps32.c new file mode 100644 index 000000000..c09d6701c --- /dev/null +++ b/cviruntime/test/181x/test_181x_matrix_mac_ps32.c @@ -0,0 +1,607 @@ +#include <assert.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> + +#include "test_cvikernel_util.h" +#include "test_native_ref.h" + +typedef cvk_tiu_matrix_multiplication_param_t param_t; + +typedef struct{ + cvk_fmt_t left_sign; + uint32_t left_row ; + uint32_t left_col ; + uint32_t left_c ; + uint32_t left_w ; + cvk_fmt_t right_sign; + uint32_t right_row ; + uint32_t right_col ; + uint32_t right_c ; + uint32_t right_w ; + uint32_t lshift_bits ; + uint32_t rshift_bits ; + uint32_t relu_enable ; + uint32_t using_bias; + cvk_fmt_t bias_sign; +} matrix_init_para_t; + +matrix_init_para_t matrix_para_t; + +static void make_bmk_matrix_param_ps32(cvk_context_t *cvk_ctx, param_t *p, int ps32_mode); +static param_t param_init(void); + +void print_param(param_t *p) +{ + printf("ps32_mode =%d\n",p->ps32_mode); + printf("left_shape.n =%d\n",p->left->shape.n); + printf("left_shape.col =%d\n",p->left->shape.col); + printf("left_shape.c =%d\n",p->left->shape.c); + printf("left_shape.w =%d\n",p->left->shape.w); + printf("left_fmt =%d\n",p->left->fmt); + printf("right_shape.n =%d\n",p->right->shape.n); + printf("right_shape.col =%d\n",p->right->shape.col); + printf("right_shape.c =%d\n",p->right->shape.c); + printf("right_shape.w =%d\n",p->right->shape.w); + printf("right_fmt =%d\n",p->right->fmt); + if(p->bias) + { + printf("bias_shape.n =%d\n",p->bias->shape.n); + printf("bias_shape.col =%d\n",p->bias->shape.col); + printf("bias_shape.c =%d\n",p->bias->shape.c); + printf("bias_shape.w =%d\n",p->bias->shape.w); + printf("bias_fmt =%d\n",p->bias->fmt); + } + printf("result_shape.n =%d\n",p->res->shape.n); + printf("result_shape.col =%d\n",p->res->shape.col); + printf("result_shape.c =%d\n",p->res->shape.c); + printf("result_shape.w =%d\n",p->res->shape.w); + printf("result_fmt =%d\n",p->res->fmt); + printf("relu_enable=%d\n",p->relu_enable); + printf("rshift_bits=%d\n",p->rshift_bits); +} + + +static uint64_t matrix_size(const cvk_ml_t *ml) +{ + uint64_t row = ml->shape.n; + uint64_t col = ml->shape.col; + return row * col; +} + +static uint64_t res_ps32_size(param_t *p) +{ + return matrix_size(p->res); +} + +static uint64_t res_size(param_t *p) +{ + if (p->res_is_int8 && !p->add_result) + return matrix_size(p->res); + else + return matrix_size(p->res) * 2; +} + +static uint8_t * alloc_left(param_t *p) +{ + uint64_t size = matrix_size(p->left); + uint8_t *buf = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!buf) + return NULL; + + for (uint64_t i = 0; i < size; i++) + buf[i] = i % 17 - 9; + + return buf; +} + +static uint8_t * alloc_right(param_t *p) +{ + uint64_t size = matrix_size(p->right); + + uint8_t *buf = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!buf)
+ return NULL; + + for (uint64_t i = 0; i < size; i++) + buf[i] = i % 13 - 6; + + return buf; +} +static uint16_t * alloc_bias(param_t *p) +{ + if (!p->bias) + return NULL; + + uint64_t size = matrix_size(p->bias) / 2; + + uint16_t *buf = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!buf) + return NULL; + + for (uint64_t i = 0; i < size; i++) + buf[i] = 5 - (i % 7); + + return buf; +} + +static uint8_t * alloc_ps32_res(param_t *p) +{ + uint64_t size = res_ps32_size(p)*4; + uint8_t *buf = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!buf) + return NULL; + + for (uint64_t i = 0; i < size; i++) + buf[i] = 17 - (i % 35); + + return buf; +} + +static void right_shift(param_t *p, int32_t *buf, uint64_t size) +{ + int shift_bits = p->rshift_bits; + int round_up = 1; + if (1) + arith_right_shift(buf, size, shift_bits, round_up); + else + logic_right_shift(buf, size, shift_bits, round_up); +} + +static int ps32_m2_matrix_mac_ref( + param_t *p, + uint8_t *left, + uint8_t *right, + uint8_t *res) +{ + uint64_t size = res_ps32_size(p); + uint32_t left_col = p->left->shape.col; + uint32_t right_col = p->right->shape.col; + uint32_t res_row = p->left->shape.n; + uint32_t res_col = p->res->shape.col; + int left_sign = (p->left->fmt == CVK_FMT_I8); + int right_sign = (p->right->fmt == CVK_FMT_I8); + int ret = 0; + int bstride = res_row * res_col; + + int32_t *tmp_res = (int32_t *)malloc(sizeof(int32_t) * size); + if (!tmp_res) + return -1; + + for (uint32_t i = 0; i < res_row * res_col; i++) + tmp_res[i] = 0; + for (uint32_t row = 0; row < res_row; row++) { + for (uint32_t col = 0; col < res_col; col++) { + for (uint32_t i = 0; i < left_col; i++) { + uint32_t li = row * left_col + i; + uint32_t ri = i * right_col + col; + int32_t l = left_sign? (int8_t)left[li]: left[li]; + int32_t r = right_sign? (int8_t)right[ri]: right[ri]; + tmp_res[row * res_col + col] += l * r; + } + } + } + + for (uint64_t i = 0; i < size; i++) + res[i + bstride*0] = tmp_res[i]>>0; + for (uint64_t i = 0; i < size; i++) + res[i + bstride*1] = tmp_res[i]>>8; + for (uint64_t i = 0; i < size; i++) + res[i + bstride*2] = tmp_res[i]>>16; + for (uint64_t i = 0; i < size; i++) + res[i + bstride*3] = tmp_res[i]>>24; + + free(tmp_res); + + return ret; +} + +static int ps32_m3_matrix_mac_ref( + param_t *p, + uint8_t *left, + uint8_t *right, + uint8_t *res) +{ + uint64_t size = res_ps32_size(p); + uint32_t left_col = p->left->shape.col; + uint32_t right_col = p->right->shape.col; + uint32_t res_row = p->left->shape.n; + uint32_t res_col = p->res->shape.col; + int left_sign = (p->left->fmt == CVK_FMT_I8); + int right_sign = (p->right->fmt == CVK_FMT_I8); + int ret = 0; + int bstride = res_row * res_col; + + uint32_t *tmp_res = (uint32_t *)malloc(sizeof(uint32_t) * size); + if (!tmp_res) + return -1; + + for (uint64_t i = 0; i < size; i++) + tmp_res[i] = res[i + bstride*0]; + for (uint64_t i = 0; i < size; i++) + tmp_res[i] |= res[i + bstride*1]<<8; + for (uint64_t i = 0; i < size; i++) + tmp_res[i] |= res[i + bstride*2]<<16; + for (uint64_t i = 0; i < size; i++) + tmp_res[i] |= res[i + bstride*3]<<24; + + for (uint32_t row = 0; row < res_row; row++) { + for (uint32_t col = 0; col < res_col; col++) { + for (uint32_t i = 0; i < left_col; i++) { + uint32_t li = row * left_col + i; + uint32_t ri = i * right_col + col; + int32_t l = left_sign? (int8_t)left[li]: left[li]; + int32_t r = right_sign? 
(int8_t)right[ri]: right[ri]; + tmp_res[row * res_col + col] += l * r; + } + } + } + + for (uint64_t i = 0; i < size; i++) + res[i + bstride*0] = tmp_res[i]>>0; + for (uint64_t i = 0; i < size; i++) + res[i + bstride*1] = tmp_res[i]>>8; + for (uint64_t i = 0; i < size; i++) + res[i + bstride*2] = tmp_res[i]>>16; + for (uint64_t i = 0; i < size; i++) + res[i + bstride*3] = tmp_res[i]>>24; + + free(tmp_res); + + return ret; +} + +static int ps32_m1_matrix_mac_ref( + param_t *p, + uint8_t *left, + uint8_t *right, + uint16_t * bias, + uint8_t *res) +{ + uint64_t size = res_ps32_size(p); + uint32_t left_col = p->left->shape.col; + uint32_t right_col = p->right->shape.col; + uint32_t res_row = p->left->shape.n; + uint32_t res_col = p->res->shape.col; + int left_sign = (p->left->fmt == CVK_FMT_I8); + int right_sign = (p->right->fmt == CVK_FMT_I8); + int res_sign = (p->res->fmt == CVK_FMT_I8); + int ret = 0; + int bstride = res_row * res_col; + + int32_t *tmp_res = (int32_t *)malloc(sizeof(int32_t) * size); + if (!tmp_res) + return -1; + + for (uint64_t i = 0; i < size; i++) + tmp_res[i] = res[i + bstride*0]; + for (uint64_t i = 0; i < size; i++) + tmp_res[i] |= res[i + bstride*1]<<8; + for (uint64_t i = 0; i < size; i++) + tmp_res[i] |= res[i + bstride*2]<<16; + for (uint64_t i = 0; i < size; i++) + tmp_res[i] |= res[i + bstride*3]<<24; + + for (uint32_t row = 0; row < res_row; row++) { + for (uint32_t col = 0; col < res_col; col++) { + for (uint32_t i = 0; i < left_col; i++) { + uint32_t li = row * left_col + i; + uint32_t ri = i * right_col + col; + int32_t l = left_sign? (int8_t)left[li]: left[li]; + int32_t r = right_sign? (int8_t)right[ri]: right[ri]; + tmp_res[row * res_col + col] += l * r; + } + } + } + + if (p->bias && bias) { + for (uint32_t row = 0; row < res_row; row++) { + for (uint32_t col = 0; col < res_col; col++) { + int bias_sign = (p->bias->fmt == CVK_FMT_I8); + int32_t b = bias_sign? 
(int16_t)bias[col]: bias[col]; + tmp_res[row * res_col + col] += b; + } + } + } + + if (p->relu_enable) + relu(tmp_res, size); + right_shift(p, tmp_res, size); + if (p->res_is_int8) + saturate_to_int8(tmp_res, size, res_sign); + else + saturate_to_int16(tmp_res, size, res_sign); + + for (uint64_t i = 0; i < size; i++) + res[i + bstride*0] = tmp_res[i]>>0; + for (uint64_t i = 0; i < size; i++) + res[i + bstride*1] = tmp_res[i]>>8; + + free(tmp_res); + + return ret; +} + +static void put_bias( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_ml_t *ml, + uint16_t data[]) +{ + uint64_t size = ml->shape.col; + + uint8_t *tmp = (uint8_t *)malloc(sizeof(uint8_t) * size * 2); + if (!tmp) + return; + + for (uint64_t i = 0; i < size; i++) { + tmp[i] = data[i] & 0xff; + tmp[i + size] = (data[i] >> 8) & 0xff; + } + + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, ml, tmp); + + free(tmp); +} + + +static int test_matrix_ps32_ut(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + int ret = 0; + make_bmk_matrix_param_ps32(cvk_ctx, p, 2); + uint8_t *left = alloc_left(p); + uint8_t *right = alloc_right(p); + uint8_t *ref = alloc_ps32_res(p); + if (!left || !right || !ref) { + ret = -1; + goto fail_exit; + } + + { + ret = ps32_m2_matrix_mac_ref(p, left, right, ref); + if (ret) + goto fail_exit; + + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, p->left, left); + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, p->right, right); + cvk_ctx->ops->tiu_matrix_multiplication(cvk_ctx, p); + cvk_ml_t ps32_res; + ps32_res = *p->res; + ps32_res.shape.n *= sizeof(int); + uint8_t *res = matrix_copy_l2g_d2s(rt_handle, cvk_ctx, &ps32_res); + + ret = array_cmp_int8( + "Comparing begin_mode results ...\n", + (int8_t *)ref, (int8_t *)res ,(int)res_ps32_size(p)*sizeof(int)); + if (ret) { + printf("Comparison M2 FAILED\n"); + print_param(p); + }else + printf("Comparison M2 PASS\n"); + free(res); + } + + { + make_bmk_matrix_param_ps32(cvk_ctx, p, 3); + + ret = ps32_m3_matrix_mac_ref(p, left, right, ref); + if (ret) + goto fail_exit; + + cvk_ctx->ops->tiu_matrix_multiplication(cvk_ctx, p); + cvk_ml_t ps32_res; + ps32_res = *p->res; + ps32_res.shape.n *= sizeof(int); + uint8_t *res = matrix_copy_l2g_d2s(rt_handle, cvk_ctx, &ps32_res); + + ret = array_cmp_int8( + "Comparing m3 results ...\n", + (int8_t *)ref, (int8_t *)res ,(int)res_ps32_size(p)*sizeof(int)); + if (ret) { + printf("Comparison M3 FAILED\n"); + print_param(p); + }else + printf("Comparison M3 PASS\n"); + + free(res); + } + { + make_bmk_matrix_param_ps32(cvk_ctx, p, 1); + uint16_t *bias = alloc_bias(p); + + ret = ps32_m1_matrix_mac_ref(p, left, right, bias, ref); + if (ret) + goto fail_exit; + + if(p->bias) + put_bias(rt_handle, cvk_ctx, p->bias, bias); + + cvk_ctx->ops->tiu_matrix_multiplication(cvk_ctx, p); + cvk_ml_t ps32_res; + ps32_res = *p->res; + ps32_res.shape.n *= 2; + + uint8_t *res = matrix_copy_l2g_d2s(rt_handle, cvk_ctx, &ps32_res); + ret = array_cmp_int8( + "Comparing m1 results ...\n", + (int8_t *)ref, (int8_t *)res ,(int)res_size(p)); + if (ret) { + printf("Comparison M1 FAILED\n"); + print_param(p); + }else + printf("Comparison M1 PASS\n"); + + free(res); + free(bias); + } + +fail_exit: + free(left); + free(right); + free(ref); + + return ret; +} + +static void destroy_param(cvk_context_t *cvk_ctx, param_t *p) +{ + if (p->bias) + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->bias); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->res); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->right); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->left); +} + +static 
cvk_fmt_t modify_res_fmt() +{ + cvk_fmt_t fmt = CVK_FMT_U8; + if (matrix_para_t.left_sign == CVK_FMT_I8) + fmt = CVK_FMT_I8; + if (matrix_para_t.right_sign == CVK_FMT_I8) + fmt = CVK_FMT_I8; + if (matrix_para_t.using_bias) + if (matrix_para_t.bias_sign == CVK_FMT_I8) + fmt = CVK_FMT_I8; + +// if (matrix_para_t.relu_enable) +// fmt = CVK_FMT_U8; + + return fmt; +} + +static cvk_ml_t *alloc_param_res( + cvk_context_t *cvk_ctx, param_t *p) +{ + cvk_ml_shape_t s; + s.n = p->left->shape.n; + s.c = p->right->shape.c; + s.w = p->right->shape.w; + s.col = p->right->shape.col; + + cvk_fmt_t fmt = CVK_FMT_U8; + fmt = modify_res_fmt(); + return cvk_ctx->ops->lmem_alloc_ps32_matrix(cvk_ctx, s, fmt, 1); +} + + +static void make_bmk_matrix_param_ps32(cvk_context_t *cvk_ctx, param_t *p, int ps32_mode) +{ + + cvk_ml_shape_t left_shape; + cvk_ml_shape_t right_shape; + + p->ps32_mode = ps32_mode; + p->relu_enable = 0; + p->lshift_bits = 0; + p->rshift_bits = 0; + + if(ps32_mode==2) + { + left_shape.n = matrix_para_t.left_row; + left_shape.c = matrix_para_t.left_c; + left_shape.w = matrix_para_t.left_w; + left_shape.col = matrix_para_t.left_col; + + right_shape.n = matrix_para_t.right_row; + right_shape.c = matrix_para_t.right_c; + right_shape.w = matrix_para_t.right_w; + right_shape.col = matrix_para_t.right_col; + p->left = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, matrix_para_t.left_sign , 1); + p->right = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, matrix_para_t.right_sign, 1); + p->bias = NULL; + p->res = alloc_param_res(cvk_ctx, p); + }else if(ps32_mode==3) + { + + }else if(ps32_mode==1) + { + p->relu_enable = matrix_para_t.relu_enable; + p->rshift_bits = matrix_para_t.rshift_bits; + if(matrix_para_t.using_bias) + { + right_shape.n = matrix_para_t.right_row; + right_shape.c = matrix_para_t.right_c; + right_shape.w = matrix_para_t.right_w; + right_shape.col = matrix_para_t.right_col; + + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + p->bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, bias_shape, matrix_para_t.bias_sign, 1); + assert(p->bias); + } + } + +} +static param_t param_init(void) +{ + param_t p; + + //srand(clock()); + + memset(&p, 0, sizeof(param_t)); + memset(&matrix_para_t, 0, sizeof(matrix_init_para_t)); + + matrix_para_t.rshift_bits = rand()%4+2; + matrix_para_t.using_bias = rand()%2; + matrix_para_t.relu_enable = rand()%2; + matrix_para_t.right_sign = rand()%2? CVK_FMT_I8 : CVK_FMT_U8; + matrix_para_t.left_sign = rand()%2? CVK_FMT_I8 : CVK_FMT_U8; + + if(matrix_para_t.using_bias) + matrix_para_t.bias_sign = rand()%2? CVK_FMT_I8 : CVK_FMT_U8; + + if(matrix_para_t.right_sign != CVK_FMT_I8 && matrix_para_t.left_sign != CVK_FMT_I8) + matrix_para_t.relu_enable=0; + + matrix_para_t.left_row = rand()%60+1; + matrix_para_t.left_col = rand()%40+1; + matrix_para_t.left_w = matrix_para_t.left_col/0x10 ? ((uint32_t)rand())%8+8 : matrix_para_t.left_col; + //matrix_para_t.left_w = rand()%16+1; + matrix_para_t.left_c = + matrix_para_t.left_col%matrix_para_t.left_w? + matrix_para_t.left_col/matrix_para_t.left_w+1 : matrix_para_t.left_col/matrix_para_t.left_w; + + matrix_para_t.right_row = matrix_para_t.left_col; + matrix_para_t.right_col = rand()%50+1; + //matrix_para_t.right_w = 16; + matrix_para_t.right_w = rand()%16+1; + matrix_para_t.right_c = + matrix_para_t.right_col%matrix_para_t.right_w? 
+ matrix_para_t.right_col/matrix_para_t.right_w+1 : matrix_para_t.right_col/matrix_para_t.right_w; + + return p; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + for (int i = 0; i < 20; i++) { + printf("random_test_matrix_ps32 iteration: %d\n", i); + param_t p = param_init(); + + ret |= test_matrix_ps32_ut(rt_handle, cvk_ctx, &p); + destroy_param(cvk_ctx, &p); + } + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_matrix_mac_qm.c b/cviruntime/test/181x/test_181x_matrix_mac_qm.c new file mode 100644 index 000000000..89cf653ae --- /dev/null +++ b/cviruntime/test/181x/test_181x_matrix_mac_qm.c @@ -0,0 +1,829 @@ +#include <limits.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> + +#include "test_cvikernel_util.h" +#include "test_tf_quant_util.h" + +// #define ENABLE_DEBUG_MSG + +#define MIN_EXEC_TESTS 20 + +typedef cvk_tiu_matrix_multiplication_qm_param_t param_t; + +typedef struct { + int left_row; + int left_col; + int right_col; + int has_bias; + int relu_enable; + int8_t *input_data; + int8_t *filter_data; + int8_t *output_data; + int32_t *bias_data; + uint32_t multiplier; + int8_t right_shift; + float float_multiplier; + int retry_cnt; +} fc_test_param_t; + +void fully_connected_ref(fc_test_param_t *p_param) +{ + const int32_t input_offset = 0; + const int32_t filter_offset = 0; + const int32_t output_offset = 0; + const int32_t output_multiplier = p_param->multiplier; + const int output_rshift = p_param->right_shift; + const int batches = p_param->left_row; + const int output_depth = p_param->right_col; + const int accum_depth = p_param->left_col; + int8_t *input_data = p_param->input_data; + int8_t *filter_data = p_param->filter_data; + int8_t *output_data = p_param->output_data; + int32_t *bias_data = p_param->has_bias ? p_param->bias_data : NULL; + + const int32_t output_activation_min = -128; + const int32_t output_activation_max = 127; + +#ifdef ENABLE_DEBUG_MSG + printf("fully_connected_ref:\n"); + printf(" batches %d, output_depth %d, accum_depth %d, filter_offset %d, " + "input_offset %d\n", + batches, output_depth, accum_depth, filter_offset, input_offset); + printf(" output_multiplier %d, output_rshift %d\n", output_multiplier, + output_rshift); +#endif + + for (int b = 0; b < batches; ++b) { + for (int out_c = 0; out_c < output_depth; ++out_c) { + int32_t acc = 0; + for (int d = 0; d < accum_depth; ++d) { + int32_t input_val = input_data[b * accum_depth + d]; + // int32_t filter_val = filter_data[out_c * accum_depth + d]; + int32_t filter_val = filter_data[output_depth * d + out_c]; + acc += (filter_val + filter_offset) * (input_val + input_offset); + +#ifdef ENABLE_DEBUG_MSG + printf(" [%d][%d][%d] acc(%d) += (%d + %d) * (%d + %d) = %d\n", b, + out_c, d, + acc - (filter_val + filter_offset) * (input_val + input_offset), + filter_val, filter_offset, input_val, input_offset, acc); +#endif + } + if (bias_data) { + acc += bias_data[out_c]; + +#ifdef ENABLE_DEBUG_MSG + printf(" [%d][%d] acc %d, bias %d\n", b, out_c, acc, + bias_data ?
bias_data[out_c] : 0); +#endif + } + acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_rshift); + +#ifdef ENABLE_DEBUG_MSG + printf(" [%d][%d] acc %d, output_multiplier %d, output_rshift %d\n", b, + out_c, acc, output_multiplier, output_rshift); +#endif + + acc += output_offset; + +#ifdef ENABLE_DEBUG_MSG + printf(" [%d][%d] acc %d, output_offset %d\n", b, out_c, acc, + output_offset); +#endif + + acc = MAX(acc, output_activation_min); + acc = MIN(acc, output_activation_max); + +#ifdef ENABLE_DEBUG_MSG + printf(" [%d][%d] acc %d, output_activation_min %d, " + "output_activation_max %d\n", + b, out_c, acc, output_activation_min, output_activation_max); +#endif + + output_data[out_c + output_depth * b] = acc; + } + } +} + +void calc_fc_float_multiplier(fc_test_param_t *p_param) +{ + const int32_t input_offset = 0; + const int32_t filter_offset = 0; + const int batches = p_param->left_row; + const int output_depth = p_param->right_col; + const int accum_depth = p_param->left_col; + int8_t *input_data = p_param->input_data; + int8_t *filter_data = p_param->filter_data; + int32_t *bias_data = p_param->has_bias ? p_param->bias_data : NULL; + + int output_accu_min = INT_MAX; + int output_accu_max = INT_MIN; + +#ifdef ENABLE_DEBUG_MSG + printf("calc_fc_float_multiplier:\n"); +#endif + + for (int b = 0; b < batches; ++b) { + for (int out_c = 0; out_c < output_depth; ++out_c) { + int32_t acc = 0; + for (int d = 0; d < accum_depth; ++d) { + int32_t input_val = input_data[b * accum_depth + d]; + // int32_t filter_val = filter_data[out_c * accum_depth + d]; + int32_t filter_val = filter_data[output_depth * d + out_c]; + acc += (filter_val + filter_offset) * (input_val + input_offset); + } + if (bias_data) { + acc += bias_data[out_c]; + } + + output_accu_max = MAX(acc, output_accu_max); + output_accu_min = MIN(acc, output_accu_min); + + } + } + + // Since int8 ranges from -128 to 127, we need to squeeze the accumulator + // MIN/MAX to fit into that range as closely as possible.
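+ // Worked example: if the largest magnitude seen is output_accu_max = 12700, + // then float_multiplier = 127.0 / 12700 = 0.01 maps that accumulator + // exactly onto the int8 maximum; 128.0 is used on the negative side + // below because int8 extends down to -128.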
+ if (abs(output_accu_max) > abs(output_accu_min)) { + p_param->float_multiplier = 127.0f / abs(output_accu_max); + } else { + p_param->float_multiplier = 128.0f / abs(output_accu_min); + } + +#ifdef ENABLE_DEBUG_MSG + printf(" output_accu_min %d, output_accu_max %d, output_multiplier %f\n", + output_accu_min, output_accu_max, p_param->float_multiplier); +#endif + +#ifdef ENABLE_DEBUG_MSG + printf("<= calc_fc_float_multiplier\n"); +#endif +} + +static void put_bias32(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, const cvk_ml_t *ml, + int32_t data[]) +{ + uint64_t size = ml->shape.col; + + uint8_t *tmp = (uint8_t *)malloc(size * 4); + if (!tmp) + return; + + for (uint64_t i = 0; i < size; i++) { + uint32_t val = data[i]; + tmp[i] = val & 0xff; + tmp[i + size] = (val >> 8) & 0xff; + tmp[i + 2 * size] = (val >> 16) & 0xff; + tmp[i + 3 * size] = (val >> 24) & 0xff; + } + + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, ml, tmp); + + free(tmp); +} + +#if 0 +typedef struct { + int32_t input_offset; + int32_t weights_offset; + int32_t output_offset; + int32_t output_multiplier; + int output_rshift; +} FullyConnectedParams; + +int tfl_original_test() +{ + int ret = 0; + + // 2x10 + int8_t input_data[20] = { + 1, 3, 5, 7, 9, 11, 13, 15, -19, -21, + 1, 3, 5, 7, 9, 11, 13, -17, 17, -21}; + + // 3x10 + int8_t filter_data[30] = { + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19}; + + // 1x3 + int32_t bias_data[3] = {4, 8, 12}; + + // 2x3 + int8_t ref_output_data[6] = { + 23, 24, 25, + 57, 58, 59}; + + int8_t output_rshift = 1; // change to right shift + uint32_t output_multiplier = 1073741824; + + int32_t input_offset = 1; + int32_t filter_offset = 1; + int32_t output_offset = 1; // change to right shift + + FullyConnectedParams params; + params.input_offset = input_offset; + params.weights_offset = filter_offset; + params.output_offset = output_offset; + params.output_multiplier = output_multiplier; + params.output_rshift = output_rshift; + + cvk_tl_shape_t input_shape = {2, 10, 1, 1}; + cvk_tl_shape_t filter_shape = {3, 10, 1, 1}; + cvk_tl_shape_t bias_shape = {1, 3, 1, 1}; + cvk_tl_shape_t output_shape = {2, 3, 1, 1}; + + int8_t output_data[6]; + fully_connected_ref(params, input_shape, + input_data, filter_shape, + filter_data, bias_shape, + bias_data, output_shape, + output_data); + for (int i = 0; i < 6; i++) { + if (output_data[i] != ref_output_data[i]) { + printf(" output_data[%d] %d != %d\n", + i, output_data[i], ref_output_data[i]); + ret = -1; + } + } + + return ret; +} +#endif + +int simple_test(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + + // 2x10 + int8_t input_data[20] = {1, 3, 5, 7, 9, 11, 13, 15, -19, -21, + 1, 3, 5, 7, 9, 11, 13, -17, 17, -21}; + +#if 0 + // 3x10 + // tfl use transposed filter + int8_t filter_data_tp[30] = { + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19}; +#endif + + // 10x3 + int8_t filter_data[30] = {1, 1, 1, 3, 3, 3, 5, 5, 5, 7, + 7, 7, 9, 9, 9, 11, 11, 11, 13, 13, + 13, 15, 15, 15, 17, 17, 17, 19, 19, 19}; + + // 1x3 + int32_t bias_data[3] = {4, 8, 12}; + + // 2x3, input/kernel/output zero_point = 0 + int8_t ref_output_data[6] = {-10, -9, -8, 24, 25, 26}; + int8_t output_data[6]; + + int8_t output_rshift = 1; // change to right shift + uint32_t output_multiplier = 1073741824; + + int left_row = 2; + int left_col = 10; + int right_col = 3; + + fc_test_param_t params; + memset(¶ms, 0, sizeof(params)); + params.left_row = 
left_row; + params.left_col = left_col; + params.right_col = right_col; + params.has_bias = 1; + params.relu_enable = 0; + params.input_data = input_data; + params.filter_data = filter_data; + params.output_data = output_data; + params.bias_data = bias_data; + params.multiplier = output_multiplier; + params.right_shift = output_rshift; + fully_connected_ref(¶ms); + +#ifdef ENABLE_DEBUG_MSG + printf("Compare ref and golden\n"); +#endif + for (int i = 0; i < 6; i++) { + if (output_data[i] != ref_output_data[i]) { + printf(" output_data[%d] %d(ref) != %d(golden)\n", i, output_data[i], + ref_output_data[i]); + ret = -1; + } + } + + cvk_ml_shape_t left_shape = + cvk_ctx->ops->ml_default_shape(cvk_ctx, left_row, left_col, CVK_FMT_I8); + + cvk_ml_shape_t right_shape = + cvk_ctx->ops->ml_default_shape(cvk_ctx, left_col, right_col, CVK_FMT_I8); + + cvk_ml_shape_t b_shape = + cvk_ctx->ops->ml_default_shape(cvk_ctx, 4, right_col, CVK_FMT_I8); // 32bit + + cvk_ml_shape_t y_shape = + cvk_ctx->ops->ml_default_shape(cvk_ctx, left_row, right_col, CVK_FMT_I8); + + cvk_ml_t *tl_left = + cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, left_shape, CVK_FMT_I8, 1); + cvk_ml_t *tl_right = + cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, right_shape, CVK_FMT_I8, 1); + cvk_ml_t *tl_b = + cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, b_shape, CVK_FMT_I8, 1); + cvk_ml_t *tl_y = + cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, y_shape, CVK_FMT_I8, 1); + + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, tl_left, (uint8_t *)input_data); + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, tl_right, (uint8_t *)filter_data); + put_bias32(rt_handle, cvk_ctx, tl_b, bias_data); + + { + param_t p; + memset(&p, 0, sizeof(p)); + p.left = tl_left; + p.right = tl_right; + p.bias = tl_b; + p.res = tl_y; + p.rshift_bits = output_rshift; + p.res_is_int8 = 1; + p.ps32_mode = 0; + p.quan_m = output_multiplier; + cvk_ctx->ops->tiu_matrix_multiplication_qm(cvk_ctx, &p); + } + + int8_t *tiu_output_data = + (int8_t *)matrix_copy_l2g_d2s(rt_handle, cvk_ctx, tl_y); +#ifdef ENABLE_DEBUG_MSG + printf("Compare tiu and ref\n"); +#endif + for (int i = 0; i < 6; i++) { + if (tiu_output_data[i] != ref_output_data[i]) { + printf(" output_data[%d] %d(tiu) != %d(ref)\n", i, tiu_output_data[i], + ref_output_data[i]); + ret = -1; + } + } + + free(tiu_output_data); + + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, tl_y); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, tl_b); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, tl_right); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, tl_left); + + return ret; +} + +int choose_from_range(int table[], int size, int index) +{ + if (index >= size) { + return 0; + } + + int val = table[index]; + if (index < (size - 1)) { + int range = MAX(table[index + 1] - table[index] - 1, 1); + val += rand() % range; + } + + return val; +} + +bool check_valid_test_param(cvk_context_t *cvk_ctx, fc_test_param_t *p_param) +{ + int left_row = p_param->left_row; + int left_col = p_param->left_col; + int right_col = p_param->right_col; + int has_bias = p_param->has_bias; + + cvk_ml_shape_t tl_input_shape = + cvk_ctx->ops->ml_default_shape(cvk_ctx, left_row, left_col, CVK_FMT_I8); + cvk_ml_stride_t tl_input_stride = + cvk_ctx->ops->ml_default_stride(cvk_ctx, tl_input_shape, CVK_FMT_I8, 1); + + cvk_ml_shape_t tl_filter_shape = + cvk_ctx->ops->ml_default_shape(cvk_ctx, left_col, right_col, CVK_FMT_I8); + cvk_ml_stride_t tl_filter_stride = + cvk_ctx->ops->ml_default_stride(cvk_ctx, tl_filter_shape, CVK_FMT_I8, 1); + + cvk_ml_shape_t tl_output_shape = + cvk_ctx->ops->ml_default_shape(cvk_ctx, 
left_row, right_col, CVK_FMT_I8); + cvk_ml_stride_t tl_output_stride = + cvk_ctx->ops->ml_default_stride(cvk_ctx, tl_output_shape, CVK_FMT_I8, 1); + + uint32_t bias_size = 0; + if (has_bias) { + cvk_ml_shape_t tl_bias_shape = + cvk_ctx->ops->ml_default_shape(cvk_ctx, 4, right_col, CVK_FMT_I8); // 32bit + cvk_ml_stride_t tl_bias_stride = + cvk_ctx->ops->ml_default_stride(cvk_ctx, tl_bias_shape, CVK_FMT_I8, 1); + bias_size = tl_bias_shape.n * tl_bias_stride.n; + } + + uint32_t lmem_size_per_lane = cvk_ctx->info.lmem_size; + // uint32_t total_lmem_size = cvk_ctx->info.lmem_size * cvk_ctx->info.npu_num; + + uint32_t needed_size = tl_input_shape.n * tl_input_stride.n + + tl_filter_shape.n * tl_filter_stride.n + + tl_output_shape.n * tl_output_stride.n + bias_size; + + if (needed_size > lmem_size_per_lane) { + return false; + } + + return true; +} + +void fill_random_data_s8(int8_t *input_data, int size) +{ + for (int i = 0; i < size; ++i) { + int is_satured = ((rand() % 1000) == 1) ? 1 : 0; + int is_sign = rand() % 2 ? 1 : -1; + + if (is_satured && is_sign == -1) { + input_data[i] = -128; + } else if (is_satured) { + input_data[i] = 127; + } else { + input_data[i] = is_sign * rand() % 128; + } + } +} + +void fill_random_data_s32(int32_t *input_data, int size) +{ + for (int i = 0; i < size; ++i) { + int is_satured = ((rand() % 1000) == 1) ? 1 : 0; + int is_sign = rand() % 2 ? 1 : -1; + + if (is_satured && is_sign == -1) { + input_data[i] = INT_MIN; + } else if (is_satured) { + input_data[i] = INT_MAX; + } else { + input_data[i] = is_sign * rand() % 128; + } + } +} + +void dump_test_param(fc_test_param_t *p_param, bool dump_content) +{ + printf("Dump test parameter:\n"); + printf(" left_row %d\n", p_param->left_row); + printf(" left_col %d\n", p_param->left_col); + printf(" right_col %d\n", p_param->right_col); + printf(" has_bias %d\n", p_param->has_bias); + printf(" multiplier %d\n", p_param->multiplier); + printf(" right_shift %d\n", p_param->right_shift); + + if (dump_content) { + printf("input_data(%d, %d)\n", p_param->left_row, p_param->left_col); + int left_row = p_param->left_row; + int left_col = p_param->left_col; + for (int i = 0; i < left_row; ++i) { + for (int j = 0; j < left_col; ++j) { + int offset = i * left_col + j; + printf("%d, ", p_param->input_data[offset]); + } + printf("\n"); + } + printf("\n\n"); + + int right_col = p_param->right_col; + printf("kernel_data (%d, %d)\n", left_col, right_col); + for (int i = 0; i < left_col; ++i) { + for (int j = 0; j < right_col; ++j) { + int offset = i * right_col + j; + printf("%d, ", p_param->filter_data[offset]); + } + printf("\n"); + } + printf("\n\n"); + + if (p_param->has_bias) { + for (int i = 0; i < right_col; ++i) { + printf("%d, ", p_param->bias_data[i]); + } + printf("\n\n"); + } + } +} + +int run_compare_fc(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, fc_test_param_t *p_param) +{ + int ret = 0; + + int left_row = p_param->left_row; + int left_col = p_param->left_col; + int right_col = p_param->right_col; + int has_bias = p_param->has_bias; + + int input_size = left_row * left_col; + int8_t *input_data = (int8_t *)malloc(input_size); + + int kernel_size = left_col * right_col; + int8_t *kernel_data = (int8_t *)malloc(kernel_size); + + int output_size = left_row * right_col; + int8_t *output_data = (int8_t *)malloc(output_size); + + int32_t *bias_data = (int32_t *) malloc(sizeof(int32_t) * right_col); + + p_param->input_data = input_data; + p_param->filter_data = kernel_data; + p_param->output_data = output_data; + p_param->bias_data =
bias_data; + +#ifdef ENABLE_DEBUG_MSG + printf(" run_compare_fc =>\n"); + printf(" left (%d, %d), right (%d, %d), has_bias %d\n", left_row, + left_col, left_col, right_col, has_bias); +#endif + + int retry_cnt = p_param->retry_cnt; + do { + fill_random_data_s8(input_data, input_size); + fill_random_data_s8(kernel_data, kernel_size); + if (has_bias) { + fill_random_data_s32(bias_data, right_col); + } + + p_param->float_multiplier = 100.0; // sentinel; a usable multiplier must end up below 1.0 + calc_fc_float_multiplier(p_param); + + if (p_param->float_multiplier > 0.f && p_param->float_multiplier < 1.0) { + break; + } + + } while (--retry_cnt); + + if (p_param->float_multiplier >= 1.0) { + printf(" run_compare_fc: unable to find valid multiplier\n"); + free(input_data); + free(kernel_data); + free(output_data); + free(bias_data); + return -1; + } + + uint32_t base_multiplier = 0; + int base_shift = 0; + QuantizeMultiplierSmallerThanOne(p_param->float_multiplier, &base_multiplier, + &base_shift); + + // multipliers typically range in [2^30 ; 2^31 - 1]. + // Values in [0, 2^30 - 1] are normally unused, but harmless. + // Thus a good way to randomize multipliers is to subtract from them + // a random value smaller than 2^30 but still significant compared to it. + uint32_t output_multiplier = base_multiplier - (rand() % (1 << 26)); + + int right_shift = base_shift - 1 + (rand() % 4); + int8_t output_rshift = truncate_rshift((int8_t)right_shift, /*allow_lshift*/1); + +#ifdef ENABLE_DEBUG_MSG + printf(" multiplier_data %d, shift_data %d\n", output_multiplier, + output_rshift); +#endif + + p_param->multiplier = output_multiplier; + p_param->right_shift = output_rshift; + fully_connected_ref(p_param); + + cvk_ml_shape_t tl_input_shape = + cvk_ctx->ops->ml_default_shape(cvk_ctx, left_row, left_col, CVK_FMT_I8); + + cvk_ml_shape_t tl_filter_shape = + cvk_ctx->ops->ml_default_shape(cvk_ctx, left_col, right_col, CVK_FMT_I8); + + cvk_ml_shape_t tl_output_shape = + cvk_ctx->ops->ml_default_shape(cvk_ctx, left_row, right_col, CVK_FMT_I8); + + cvk_ml_shape_t tl_bias_shape = + cvk_ctx->ops->ml_default_shape(cvk_ctx, 4, right_col, CVK_FMT_I8); // 32bit + + cvk_ml_t *tl_input = cvk_ctx->ops->lmem_alloc_matrix( + cvk_ctx, tl_input_shape, CVK_FMT_I8, /*eu_align=*/1); + cvk_ml_t *tl_filter = cvk_ctx->ops->lmem_alloc_matrix( + cvk_ctx, tl_filter_shape, CVK_FMT_I8, /*eu_align=*/1); + cvk_ml_t *tl_output = cvk_ctx->ops->lmem_alloc_matrix( + cvk_ctx, tl_output_shape, CVK_FMT_I8, /*eu_align=*/1); + + cvk_ml_t *tl_bias = NULL; + if (has_bias) { + tl_bias = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, tl_bias_shape, CVK_FMT_I8, + /*eu_align=*/1); + } + + if (tl_input == NULL) { + printf(" fail to alloc tl_input (%d, %d)\n", left_row, left_col); + return -1; + } + if (tl_filter == NULL) { + printf(" fail to alloc tl_filter (%d, %d)\n", left_col, right_col); + return -1; + } + if (tl_output == NULL) { + printf(" fail to alloc tl_output (%d, %d)\n", left_row, right_col); + return -1; + } + if (has_bias && (tl_bias == NULL)) { + printf(" fail to alloc bias (%d, %d)\n", 4, right_col); + return -1; + } + + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, tl_input, (uint8_t *)input_data); + matrix_copy_s2d_g2l(rt_handle, cvk_ctx, tl_filter, (uint8_t *)kernel_data); + if (tl_bias) { + put_bias32(rt_handle, cvk_ctx, tl_bias, bias_data); + } + + { + param_t p; + memset(&p, 0, sizeof(p)); + p.left = tl_input; + p.right = tl_filter; + p.bias = tl_bias; + p.res = tl_output; + p.rshift_bits = (uint8_t)output_rshift; + p.res_is_int8 = 1; + p.ps32_mode = 0; + p.quan_m =
+    cvk_ctx->ops->tiu_matrix_multiplication_qm(cvk_ctx, &p);
+  }
+
+  int8_t *tiu_output_data =
+      (int8_t *)matrix_copy_l2g_d2s(rt_handle, cvk_ctx, tl_output);
+#ifdef ENABLE_DEBUG_MSG
+  printf("Compare tiu and ref\n");
+#endif
+  for (int i = 0; i < left_row; ++i) {
+    for (int j = 0; j < right_col; ++j) {
+      int offset = i * right_col + j;
+      if (tiu_output_data[offset] != output_data[offset]) {
+        printf("    output_data[%d][%d] %d(tiu) != %d(ref)\n", i, j,
+               tiu_output_data[offset], output_data[offset]);
+        ret = -1;
+      }
+    }
+  }
+
+  if (ret) {
+    dump_test_param(p_param, /*dump_content=*/true);
+  }
+
+  // Free local memory in the reverse order of allocation.
+  if (tl_bias) {
+    cvk_ctx->ops->lmem_free_matrix(cvk_ctx, tl_bias);
+  }
+
+  cvk_ctx->ops->lmem_free_matrix(cvk_ctx, tl_output);
+  cvk_ctx->ops->lmem_free_matrix(cvk_ctx, tl_filter);
+  cvk_ctx->ops->lmem_free_matrix(cvk_ctx, tl_input);
+
+  free(tiu_output_data);
+
+  free(input_data);
+  free(kernel_data);
+  free(output_data);
+  free(bias_data);
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("  <= run_compare_fc, ret %d\n", ret);
+#endif
+
+  return ret;
+}
+
+int random_test(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx)
+{
+  int ret = 0;
+
+#if 0
+  int left_row_range[] = {1};
+  int left_col_range[] = {1};
+  int right_col_range[] = {1};
+#else
+  int left_row_range[] = {1, 16, 4095};
+  int left_col_range[] = {1, 16, 4095};
+  int right_col_range[] = {1, 16, 4095};
+#endif
+
+  const int left_row_range_size =
+      sizeof(left_row_range) / sizeof(left_row_range[0]);
+  const int left_col_range_size =
+      sizeof(left_col_range) / sizeof(left_col_range[0]);
+  const int right_col_range_size =
+      sizeof(right_col_range) / sizeof(right_col_range[0]);
+
+  int random_seed = clock();
+  srand(random_seed);
+
+  const int retry_test_count = 100;
+  bool stop_at_first_error = true;
+
+  int executed_tests = 0;
+  int failed_tests = 0;
+
+  printf("1822-fc-qm: random test =>\n");
+  for (int m = 0; m < retry_test_count; ++m) {
+    for (int i = 0; i < left_row_range_size; ++i) {
+      // randomly chosen from [range[i], range[i+1]]
+      int left_row = choose_from_range(left_row_range, left_row_range_size, i);
+
+      for (int j = 0; j < left_col_range_size; ++j) {
+        int left_col =
+            choose_from_range(left_col_range, left_col_range_size, j);
+
+        for (int k = 0; k < right_col_range_size; ++k) {
+          int right_col =
+              choose_from_range(right_col_range, right_col_range_size, k);
+
+          int has_bias = rand() % 2;
+
+          fc_test_param_t test_param;
+          memset(&test_param, 0, sizeof(test_param));
+          test_param.left_row = left_row;
+          test_param.left_col = left_col;
+          test_param.right_col = right_col;
+          test_param.has_bias = has_bias;
+          test_param.retry_cnt = 5;
+
+          bool is_valid_param = check_valid_test_param(cvk_ctx, &test_param);
+          if (is_valid_param == false) {
+            continue;
+          }
+
+          int ret2 = run_compare_fc(rt_handle, cvk_ctx, &test_param);
+          failed_tests = ret2 ? failed_tests + 1 : failed_tests;
+          ret |= ret2;
+          executed_tests++;
+
+#ifdef ENABLE_DEBUG_MSG
+          printf("  [%d] random test: left(%d, %d), right (%d, %d), result %d\n",
+                 executed_tests, left_row, left_col, left_col,
+                 right_col, ret2);
+#endif
+        }
+
+        // Stop at first error
+        if (ret && stop_at_first_error) {
+          break;
+        }
+      }
+
+      // Stop at first error
+      if (ret && stop_at_first_error) {
+        break;
+      }
+    }
+
+    // Stop at first error
+    if (ret && stop_at_first_error) {
+      break;
+    }
+
+    if (executed_tests >= MIN_EXEC_TESTS) {
+      break;
+    }
+  }
+
+  printf("<= 1822-fc-qm: random test, total %d, failed %d, ret %d\n",
+         executed_tests, failed_tests, ret);
+
+  return ret;
+}
+
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  // ret |= tfl_original_test();
+  ret |= simple_test(rt_handle, cvk_ctx);
+  ret |= random_test(rt_handle, cvk_ctx);
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/181x/test_181x_max_pooling.c b/cviruntime/test/181x/test_181x_max_pooling.c
new file mode 100644
index 000000000..2e22e9275
--- /dev/null
+++ b/cviruntime/test/181x/test_181x_max_pooling.c
@@ -0,0 +1,238 @@
+#include <assert.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "test_cvikernel_util.h"
+#include "test_native_ref.h"
+
+#define INVALIDE_STRIDE (-1)
+typedef cvk_tiu_max_pooling_param_t param_t;
+
+static void print_pooling_param(param_t *p)
+{
+  int in = p->ifmap->shape.n;
+  int ic = p->ifmap->shape.c;
+  int ih = p->ifmap->shape.h;
+  int iw = p->ifmap->shape.w;
+
+  printf("  Pooling parameters:\n");
+  printf("    ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw);
+  printf("    opd0_sign = %d\n", p->ifmap->fmt == CVK_FMT_I8);
+  printf("    weight = (%d, %d)\n", p->kh, p->kw);
+  printf("    padding = (%d, %d, %d, %d)\n",
+         p->pad_top, p->pad_bottom, p->pad_left, p->pad_right);
+  printf("    stride = (%d, %d)\n", p->stride_h, p->stride_w);
+}
+
+static int pooling_ih_ext(param_t *p, int ih)
+{
+  int pad = p->pad_top + p->pad_bottom;
+  return ih + pad;
+}
+
+static int pooling_iw_ext(param_t *p, int iw)
+{
+  int pad = p->pad_left + p->pad_right;
+  return iw + pad;
+}
+
+static int pooling_oh(param_t *p, int ih)
+{
+  int ih_ext = pooling_ih_ext(p, ih);
+  return (ih_ext - p->kh) / p->stride_h + 1;
+}
+
+static int pooling_ow(param_t *p, int iw)
+{
+  int iw_ext = pooling_iw_ext(p, iw);
+  return (iw_ext - p->kw) / p->stride_w + 1;
+}
+
+static int8_t *alloc_input(param_t *p)
+{
+  uint64_t size = tl_shape_size(&p->ifmap->shape, p->ifmap->fmt);
+  int8_t *data = (int8_t *)malloc(size);
+  if (!data)
+    return NULL;
+
+  for (uint64_t i = 0; i < size; i++)
+    data[i] = rand() % 256 - 128;
+  return data;
+}
+
+static int8_t *alloc_output(param_t *p)
+{
+  uint64_t size = tl_shape_size(&p->ofmap->shape, p->ofmap->fmt);
+  return (int8_t *)malloc(size);
+}
+
+static void free_pooling_param(
+    cvk_context_t *cvk_ctx,
+    param_t *r)
+{
+  if (r->ifmap)
+    cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ifmap);
+  if (r->ofmap)
+    cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ofmap);
+}
+
+static param_t random_pooling_param(cvk_context_t *cvk_ctx, int stride_w, int stride_h)
+{
+  srand(clock());
+  param_t p;
+  int retry_cnt = 100;
+
+  for (int i = 0; i < retry_cnt; i++) {
+    int in = rand() % 5 + 1;
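+    // ic spans up to 3x the NPU lane count so single-lane, full-lane, and
+    // wrapped-lane layouts are all covered; when a stride is forced, ih/iw
+    // are enlarged by that stride so the pooled output stays at least 1x1.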
+    int ic = rand() % (3 * cvk_ctx->info.npu_num) + 1;
+    int ih = rand() % 30 + 3 + (INVALIDE_STRIDE == stride_h ? 0 : stride_h);
+    int iw = rand() % 30 + 6 + (INVALIDE_STRIDE == stride_w ? 0 : stride_w);
+    int opd0_sign = rand() % 2;
+
+    memset(&p, 0, sizeof(p));
+    p.kh = rand() % 7 + 1;
+    p.kw = rand() % 7 + 1;
+    p.stride_h = INVALIDE_STRIDE == stride_h ? rand() % (p.kh) + 1 : stride_h;
+    p.stride_w = INVALIDE_STRIDE == stride_w ? rand() % (p.kw) + 1 : stride_w;
+    p.pad_top = rand() % p.kh;
+    p.pad_bottom = rand() % p.kh;
+    p.pad_left = rand() % p.kw;
+    p.pad_right = rand() % p.kw;
+
+    cvk_tl_shape_t ifmap_shape;
+    ifmap_shape.n = in;
+    ifmap_shape.c = ic;
+    ifmap_shape.h = ih;
+    ifmap_shape.w = iw;
+    cvk_tl_shape_t ofmap_shape;
+    ofmap_shape.n = in;
+    ofmap_shape.c = ic;
+    ofmap_shape.h = pooling_oh(&p, ih);
+    ofmap_shape.w = pooling_ow(&p, iw);
+
+    cvk_fmt_t fmt = opd0_sign ? CVK_FMT_I8 : CVK_FMT_U8;
+    p.ofmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ofmap_shape, CVK_FMT_I8, 1);
+    p.ifmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ifmap_shape, fmt, 1);
+
+    if ((p.kh > pooling_ih_ext(&p, ih))
+        || (p.kw > pooling_iw_ext(&p, iw))
+        || (p.pad_top >= (1 << 4))
+        || (p.pad_bottom >= (1 << 4))
+        || (p.pad_left >= (1 << 4))
+        || (p.pad_right >= (1 << 4))
+        || (p.kh * p.kw == 1)
+        || !p.ofmap || !p.ifmap) {
+      printf("retry init_pooling_param\n");
+      free_pooling_param(cvk_ctx, &p);
+    } else
+      break;
+  }
+
+  return p;
+}
+
+static int compare_results(
+    param_t *p,
+    int8_t input[],
+    int8_t output[])
+{
+  int in = p->ifmap->shape.n;
+  int ic = p->ifmap->shape.c;
+  int ih = p->ifmap->shape.h;
+  int iw = p->ifmap->shape.w;
+  int sign = (p->ifmap->fmt == CVK_FMT_I8);
+
+  int8_t *output_ref = alloc_output(p);
+  int ret = native_pooling_max_int8(
+      input, output_ref, in, ic, ih, iw, p->kh, p->kw,
+      p->pad_top, p->pad_bottom, p->pad_left, p->pad_right,
+      p->stride_h, p->stride_w, 0, 0, 0, 0, sign);
+  if (ret)
+    return ret;
+
+  ret = array_cmp_int8(
+      "Comparing results ...\n", output_ref, output,
+      tl_shape_size(&p->ofmap->shape, p->ofmap->fmt));
+
+  if (ret != 0) {
+    printf("Comparison FAILED!!!\n");
+    print_pooling_param(p);
+  }
+
+  free(output_ref);
+
+  return ret;
+}
+
+static int _test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int stride_w, int stride_h)
+{
+  param_t param = random_pooling_param(cvk_ctx, stride_w, stride_h);
+  int8_t *input = alloc_input(&param);
+
+  tensor_copy_s2d_g2l(rt_handle, cvk_ctx, param.ifmap, (uint8_t *)input);
+  cvk_ctx->ops->tiu_max_pooling(cvk_ctx, &param);
+  int8_t *output = (int8_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, param.ofmap);
+
+  int ret = compare_results(&param, input, output);
+
+  free_pooling_param(cvk_ctx, &param);
+  free(output);
+  free(input);
+
+  return ret;
+}
+
+static int test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) {
+  return _test_pooling(rt_handle, cvk_ctx, INVALIDE_STRIDE, INVALIDE_STRIDE);
+}
+
+static int test_max_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx)
+{
+  int ret = 0;
+  for (uint64_t i = 0; i < 16; i++)
+    ret |= test_pooling(rt_handle, cvk_ctx);
+
+  // test stride extend (0, 31]
+  int stride_list[] = {15, 16, 31};
+  int stride_list_len = sizeof(stride_list) / sizeof(stride_list[0]);
+
+  for (int stride_w_idx = 0; stride_w_idx < stride_list_len; stride_w_idx++) {
+    for (int stride_h_idx = 0; stride_h_idx < stride_list_len; stride_h_idx++) {
+      int stride_w = stride_list[stride_w_idx];
+      int stride_h = stride_list[stride_h_idx];
+
+      ret |= _test_pooling(rt_handle, cvk_ctx, stride_w, stride_h);
+    }
+  }
+
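+  // ret accumulates failures from the 16 random cases and the 3x3
+  // large-stride grid above; any single mismatch fails the whole test.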
+  return ret;
+}
+
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  ret = test_max_pooling(rt_handle, cvk_ctx);
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/181x/test_181x_min_pooling.c b/cviruntime/test/181x/test_181x_min_pooling.c
new file mode 100644
index 000000000..4ac76a079
--- /dev/null
+++ b/cviruntime/test/181x/test_181x_min_pooling.c
@@ -0,0 +1,220 @@
+#include <assert.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "test_cvikernel_util.h"
+#include "test_native_ref.h"
+
+typedef cvk_tiu_min_pooling_param_t param_t;
+
+static void print_pooling_param(param_t *p)
+{
+  int in = p->ifmap->shape.n;
+  int ic = p->ifmap->shape.c;
+  int ih = p->ifmap->shape.h;
+  int iw = p->ifmap->shape.w;
+
+  printf("  Pooling parameters:\n");
+  printf("    ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw);
+  printf("    opd0_sign = %d\n", p->ifmap->fmt == CVK_FMT_I8);
+  printf("    weight = (%d, %d)\n", p->kh, p->kw);
+  printf("    padding = (%d, %d, %d, %d)\n",
+         p->pad_top, p->pad_bottom, p->pad_left, p->pad_right);
+  printf("    stride = (%d, %d)\n", p->stride_h, p->stride_w);
+}
+
+static int pooling_ih_ext(param_t *p, int ih)
+{
+  int pad = p->pad_top + p->pad_bottom;
+  return ih + pad;
+}
+
+static int pooling_iw_ext(param_t *p, int iw)
+{
+  int pad = p->pad_left + p->pad_right;
+  return iw + pad;
+}
+
+static int pooling_oh(param_t *p, int ih)
+{
+  int ih_ext = pooling_ih_ext(p, ih);
+  return (ih_ext - p->kh) / p->stride_h + 1;
+}
+
+static int pooling_ow(param_t *p, int iw)
+{
+  int iw_ext = pooling_iw_ext(p, iw);
+  return (iw_ext - p->kw) / p->stride_w + 1;
+}
+
+static int8_t *alloc_input(param_t *p)
+{
+  uint64_t size = tl_shape_size(&p->ifmap->shape, p->ifmap->fmt);
+  int8_t *data = (int8_t *)malloc(size);
+  if (!data)
+    return NULL;
+
+  for (uint64_t i = 0; i < size; i++)
+    data[i] = rand() % 256 - 128;
+  return data;
+}
+
+static int8_t *alloc_output(param_t *p)
+{
+  uint64_t size = tl_shape_size(&p->ofmap->shape, p->ofmap->fmt);
+  return (int8_t *)malloc(size);
+}
+
+static void free_pooling_param(
+    cvk_context_t *cvk_ctx,
+    param_t *r)
+{
+  if (r->ifmap)
+    cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ifmap);
+  if (r->ofmap)
+    cvk_ctx->ops->lmem_free_tensor(cvk_ctx, r->ofmap);
+}
+
+static param_t random_pooling_param(cvk_context_t *cvk_ctx)
+{
+  srand(clock());
+  param_t p;
+  int retry_cnt = 100;
+
+  for (int i = 0; i < retry_cnt; i++) {
+    int in = rand() % 5 + 1;
+    int ic = rand() % (3 * cvk_ctx->info.npu_num) + 1;
+    int ih = rand() % 30 + 3;
+    int iw = rand() % 30 + 6;
+    int opd0_sign = rand() % 2;
+
+    memset(&p, 0, sizeof(p));
+    p.kh = rand() % 7 + 1;
+    p.kw = rand() % 7 + 1;
+    p.stride_h = rand() % (p.kh) + 1;
+    p.stride_w = rand() % (p.kw) + 1;
+    p.pad_top = rand() % p.kh;
+    p.pad_bottom = rand() % p.kh;
+    p.pad_left = rand() % p.kw;
+    p.pad_right = rand() % p.kw;
+
+    cvk_tl_shape_t ifmap_shape;
+    ifmap_shape.n = in;
+    ifmap_shape.c = ic;
+    ifmap_shape.h = ih;
+    ifmap_shape.w = iw;
+    cvk_tl_shape_t ofmap_shape;
+    ofmap_shape.n = in;
+    ofmap_shape.c = ic;
+    ofmap_shape.h = pooling_oh(&p, ih);
+    ofmap_shape.w = pooling_ow(&p, iw);
+
+    cvk_fmt_t fmt = opd0_sign ? CVK_FMT_I8 : CVK_FMT_U8;
+    p.ofmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ofmap_shape, CVK_FMT_I8, 1);
+    p.ifmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, ifmap_shape, fmt, 1);
+
+    if ((p.kh > pooling_ih_ext(&p, ih))
+        || (p.kw > pooling_iw_ext(&p, iw))
+        || (p.pad_top >= (1 << 4))
+        || (p.pad_bottom >= (1 << 4))
+        || (p.pad_left >= (1 << 4))
+        || (p.pad_right >= (1 << 4))
+        || (p.kh * p.kw == 1)
+        || !p.ofmap || !p.ifmap) {
+      printf("retry init_pooling_param\n");
+      free_pooling_param(cvk_ctx, &p);
+    } else
+      break;
+  }
+
+  return p;
+}
+
+static int compare_results(
+    param_t *p,
+    int8_t input[],
+    int8_t output[])
+{
+  int in = p->ifmap->shape.n;
+  int ic = p->ifmap->shape.c;
+  int ih = p->ifmap->shape.h;
+  int iw = p->ifmap->shape.w;
+  int sign = (p->ifmap->fmt == CVK_FMT_I8);
+
+  int8_t *output_ref = alloc_output(p);
+  int ret = native_pooling_min_int8(
+      input, output_ref, in, ic, ih, iw, p->kh, p->kw,
+      p->pad_top, p->pad_bottom, p->pad_left, p->pad_right,
+      p->stride_h, p->stride_w, 0, 0, 0, 0, sign);
+  if (ret)
+    return ret;
+
+  ret = array_cmp_int8(
+      "Comparing results ...\n", output_ref, output,
+      tl_shape_size(&p->ofmap->shape, p->ofmap->fmt));
+
+  if (ret != 0) {
+    printf("Comparison FAILED!!!\n");
+    print_pooling_param(p);
+  }
+
+  free(output_ref);
+
+  return ret;
+}
+
+static int test_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx)
+{
+  param_t param = random_pooling_param(cvk_ctx);
+  int8_t *input = alloc_input(&param);
+
+  tensor_copy_s2d_g2l(rt_handle, cvk_ctx, param.ifmap, (uint8_t *)input);
+  cvk_ctx->ops->tiu_min_pooling(cvk_ctx, &param);
+  int8_t *output = (int8_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, param.ofmap);
+
+  int ret = compare_results(&param, input, output);
+
+  free_pooling_param(cvk_ctx, &param);
+  free(output);
+  free(input);
+
+  return ret;
+}
+
+static int test_min_pooling(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx)
+{
+  int ret = 0;
+  for (uint64_t i = 0; i < 16; i++)
+    ret |= test_pooling(rt_handle, cvk_ctx);
+
+  return ret;
+}
+
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  ret = test_min_pooling(rt_handle, cvk_ctx);
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/181x/test_181x_tdma_bf16_matrix_vlc_decompress_compress.c b/cviruntime/test/181x/test_181x_tdma_bf16_matrix_vlc_decompress_compress.c
new file mode 100644
index 000000000..e675e277d
--- /dev/null
+++ b/cviruntime/test/181x/test_181x_tdma_bf16_matrix_vlc_decompress_compress.c
@@ -0,0 +1,212 @@
+#include <assert.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "test_cvikernel_util.h"
+
+
+typedef cvk_tdma_g2l_matrix_copy_decompressed_param_t decompress_param_t;
+typedef cvk_tdma_l2g_matrix_copy_compressed_param_t compress_param_t;
+
+typedef struct{
+  decompress_param_t dec_p;
+  compress_param_t com_p;
+} param_t;
+
+static void __print_param(const char *tag, FILE *f, param_t *p)
+{
+  fprintf(
+      f, "%s: (%u, %u, %u, %u) => %s\n",
+      tag,
+      p->dec_p.dst->shape.n, p->dec_p.dst->shape.c, p->dec_p.dst->shape.w, p->dec_p.dst->shape.col,
+      (p->dec_p.dst->fmt == CVK_FMT_I8)?
"signed": "unsigned"); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_mg_shape_t src_shape; + cvk_ml_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 1 }, + { 0, 1, 1, 1 }, + }, + { + { 0, 2 }, + { 0, 1, 2, 2 }, + }, + { + { 0, 2 }, + { 0, 2, 1, 2 }, + }, + { + { 0, 7 }, + { 0, 1, 7, 7 }, + }, +#ifndef ENABEL_SIMPLE_VLC_TEST + { + { 0, 7 }, + { 0, 2, 4, 7 }, + }, + { + { 0, 7 }, + { 0, 7, 1, 7 }, + }, + { + { 0, 17 }, + { 0, 1, 17, 17 }, + }, + { + { 0, 17 }, + { 0, 3, 7, 17 }, + }, + { + { 0, 17 }, + { 0, 17, 1, 17 }, + }, + { + { 0, 60 }, + { 0, 1, 60, 60 }, + }, + { + { 0, 60 }, + { 0, 30, 2, 60 }, + }, + { + { 0, 60 }, + { 0, 60, 1, 60 }, + } +#endif /* ifndef ENABEL_SIMPLE_VLC_TEST*/ +}; + +static void test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p, uint16_t *src_data, + CommandInfo* cmd_info) +{ + print_param(stderr, p); + uint64_t size = ml_shape_size(&p->dec_p.dst->shape, CVK_FMT_I8); + uint64_t bytesize = size * fmt_size(p->dec_p.dst->fmt); + int is_signed = (p->dec_p.dst->fmt == CVK_FMT_I8); + + uint16_t *gmem_data; + size_t bs_size; + size_t data_type = (p->dec_p.dst->fmt == CVK_FMT_BF16) ? 1 : 0; + + gmem_data = (uint16_t* )test_vlc_compress((uint8_t* )src_data, bytesize, is_signed, data_type, &bs_size, cmd_info, NULL); + + //1. send compressed one to gaddr and decompress from gaddr to local + cmpr_matrix_copy_s2d(rt_handle, p->dec_p.src, (uint8_t* ) gmem_data); + cvk_ctx->ops->tdma_g2l_matrix_copy_decompressed(cvk_ctx, &p->dec_p); + CVI_RT_Submit(cvk_ctx); + + //2. decompress from sram + cvk_ctx->ops->tdma_l2g_matrix_copy_compressed(cvk_ctx, &p->com_p); + CVI_RT_Submit(cvk_ctx); + + //3. get final data + uint16_t *dst_data = (uint16_t* )cmpr_matrix_copy_d2s(rt_handle, p->com_p.dst); + + for (uint64_t i = 0; i < bs_size / 2; i++) { + if (dst_data[i] != gmem_data[i]) { + fprintf(stderr, "vlc compress comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], gmem_data[i]); + exit(-1); + } + } + + free(dst_data); + free(gmem_data); +} + +static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + free_cmpr_matrix_dev_mem(rt_handle, p->dec_p.src); + free_cmpr_matrix_dev_mem(rt_handle, p->com_p.dst); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->dec_p.dst); +} + +static void test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + cvk_fmt_t fmts[] = { CVK_FMT_BF16 }; + uint8_t fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + + for (uint32_t row = 1; row < 13; row += 2) { + c->src_shape.row = row; + c->dst_shape.n = row; + for (int dst_align = 0; dst_align < 2; dst_align++) { + for (uint8_t fmt_i = 0; fmt_i < fmts_sz; fmt_i++) { + cvk_fmt_t fmt = fmts[fmt_i]; + param_t p; + memset(&p, 0, sizeof(p)); + //put compressed data to gaddr ->decompress to local -> compress to gaddr + + int is_signed = (fmt == CVK_FMT_I8); + int data_type = (fmt == CVK_FMT_BF16) ? 1 : 0; + + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + cmd_info.signedness = is_signed; + cmd_info.is_bfloat16 = data_type; + cmd_info.bias0 = 127; + + // src_shape, fmt, &cmd_info); + p.dec_p.dst = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, c->dst_shape, fmt, dst_align); + + uint64_t size = ml_shape_size(&p.dec_p.dst->shape, p.dec_p.dst->fmt); + uint16_t *src_data = (uint16_t *)malloc(sizeof(uint16_t) * size); + test_vlc_init_testdata((uint8_t *)src_data, size, fmt == CVK_FMT_I8, fmt == CVK_FMT_BF16); + + assert(p.dec_p.dst); + + //2. 
alloc compress + p.com_p.src = p.dec_p.dst; //cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->lmem_shape, fmt, align); + p.com_p.dst = alloc_cmpr_matrix_dev_mem(rt_handle, c->src_shape, fmt, &cmd_info); + + //3. test: the sequence like below: + //3.1 put compressed data to gaddr + //3.2 decompress to local + //3.3 compress to gaddr + //printf ("row %u is_align %d fmt %d\n", row, dst_align, fmt); + test_param_g2l(rt_handle, cvk_ctx, &p, src_data, &cmd_info); + destroy_param_g2l(rt_handle, cvk_ctx, &p); + free(src_data); + } + } + } +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return 0; +} diff --git a/cviruntime/test/181x/test_181x_tdma_bf16_tensor_vlc_decompress_compress.c b/cviruntime/test/181x/test_181x_tdma_bf16_tensor_vlc_decompress_compress.c new file mode 100644 index 000000000..74480d13a --- /dev/null +++ b/cviruntime/test/181x/test_181x_tdma_bf16_tensor_vlc_decompress_compress.c @@ -0,0 +1,215 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_g2l_tensor_copy_decompressed_param_t decompress_param_t; +typedef cvk_tdma_l2g_tensor_copy_compressed_param_t compress_param_t; + +typedef struct{ + decompress_param_t dec_p; + compress_param_t com_p; +} param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => %d-bit %s\n", + tag, + p->dec_p.dst->shape.n, p->dec_p.dst->shape.c, p->dec_p.dst->shape.h, p->dec_p.dst->shape.w, + p->dec_p.src->bit_length, + (p->dec_p.dst->fmt == CVK_FMT_I8)? "signed": "unsigned"); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_tl_shape_t lmem_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 17, 13 } + }, + { + { 3, 39, 17, 23 } + }, +#if 0 // No enough local memory for 1810 + { + { 5, 39, 17, 23 } + }, + { + { 20, 35, 2, 2 } + }, +#endif +#ifndef ENABEL_SIMPLE_VLC_TEST + { + { 1, 1, 1, 1 } + }, + { + { 1, 1, 1, 2 } + }, + { + { 1, 1, 7, 2 } + }, + { + { 1, 1, 10, 60 } + }, + { + { 1, 2, 1, 1 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 3, 16, 1, 1 } + }, + { + { 3, 36, 16, 20 } + }, +#endif /* ifndef ENABEL_SIMPLE_VLC_TEST*/ +}; + +static int test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p, cvk_cmpr_tg_t* dst) +{ + print_param(stderr, p); + int ret = 0; + uint64_t size = tl_shape_size(&p->dec_p.dst->shape, CVK_FMT_I8); + uint64_t bytesize = size * fmt_size(p->dec_p.dst->fmt); + int is_signed = (p->dec_p.dst->fmt == CVK_FMT_I8); + uint16_t *src_data = (uint16_t *)malloc(bytesize); + uint16_t *dst_data = NULL; + uint8_t *gmem_data = NULL; + if (!src_data) { + ret = -1; + goto fail_exit; + } + + test_vlc_init_testdata((uint8_t *)src_data, size, p->dec_p.dst->fmt == CVK_FMT_I8, p->dec_p.dst->fmt == CVK_FMT_BF16); + + size_t total_size; + size_t data_type = (p->dec_p.dst->fmt == CVK_FMT_BF16) ? 
1 : 0;
+  size_t bs_buf_size = get_out_bs_buf_size(bytesize, data_type);
+  gmem_data = (uint8_t *)malloc(bs_buf_size * sizeof(uint8_t));
+  if (!gmem_data) {
+    ret = -1;
+    goto fail_exit;
+  }
+
+  // command info
+  CommandInfo cmd_info;
+  memset(&cmd_info, 0, sizeof(CommandInfo));
+  cmd_info.signedness = is_signed;
+  cmd_info.is_bfloat16 = data_type;
+  cmd_info.bias0 = 127;
+  // TODO: test
+  //cmd_info.zero_guard_en = 1;
+  // TODO: generate +-inf/+-nan inputs, see https://en.wikipedia.org/wiki/Bfloat16_floating-point_format
+
+  // Compress on the host, stage the stream into gmem_data, then upload it.
+  uint8_t *bsbuf = test_vlc_compress((uint8_t *)src_data, bytesize, is_signed, data_type, &total_size, &cmd_info, NULL);
+  if (!bsbuf) {
+    ret = -1;
+    goto fail_exit;
+  }
+  memcpy(gmem_data, bsbuf, total_size);
+  free(bsbuf);
+
+  cmpr_tensor_copy_s2d(rt_handle, p->dec_p.src, gmem_data);
+  cvk_ctx->ops->tdma_g2l_tensor_copy_decompressed(cvk_ctx, &p->dec_p);
+  CVI_RT_Submit(cvk_ctx);
+
+  dst->zero_guard_en = cmd_info.zero_guard_en;
+  dst->bias0 = cmd_info.bias0;
+  dst->bias1 = cmd_info.bias1;
+  p->com_p.dst = dst;
+  cvk_ctx->ops->tdma_l2g_tensor_copy_compressed(cvk_ctx, &p->com_p);
+  CVI_RT_Submit(cvk_ctx);
+
+  dst_data = (uint16_t *)cmpr_tensor_copy_d2s(rt_handle, p->com_p.dst);
+  uint16_t *ref_data = (uint16_t *)gmem_data;
+
+  for (uint64_t i = 0; i < total_size / 2; i++) {
+    if (dst_data[i] != ref_data[i]) {
+      fprintf(stderr, "vlc compress comparing failed at dst[%" PRIu64 "], got %d, exp %d\n",
+              i, dst_data[i], ref_data[i]);
+      ret = -1;
+      break;
+    }
+  }
+
+fail_exit:
+  free(src_data);
+  free(gmem_data);
+  free(dst_data);
+
+  return ret;
+}
+
+static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p)
+{
+  free_cmpr_tensor_dev_mem(rt_handle, p->dec_p.src);
+  free_cmpr_tensor_dev_mem(rt_handle, p->com_p.dst);
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->dec_p.dst);
+}
+
+static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c)
+{
+  int ret = 0;
+  cvk_fmt_t fmts[] = { CVK_FMT_BF16 };
+  uint8_t fmts_sz = sizeof(fmts) / sizeof(fmts[0]);
+
+  for (int align = 0; align < 2; align++) {
+    for (uint8_t fmt_i = 0; fmt_i < fmts_sz; fmt_i++) {
+      cvk_fmt_t fmt = fmts[fmt_i];
+
+      param_t p;
+      memset(&p, 0, sizeof(p));
+      cvk_tg_shape_t tg_shape =
+          tg_shape_t4(c->lmem_shape.n, c->lmem_shape.c, c->lmem_shape.h, c->lmem_shape.w);
+      p.dec_p.src = alloc_cmpr_tensor_dev_mem(rt_handle, cvk_ctx, tg_shape, fmt, NULL);
+      p.dec_p.dst = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->lmem_shape, fmt, align);
+      assert(p.dec_p.dst);
+
+      p.com_p.src = p.dec_p.dst;  //cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->lmem_shape, fmt, align);
+      assert(p.com_p.src);
+      cvk_cmpr_tg_t *dst = alloc_cmpr_tensor_dev_mem(rt_handle, cvk_ctx, tg_shape, fmt, NULL);
+
+      ret |= test_param_g2l(rt_handle, cvk_ctx, &p, dst);
+      destroy_param_g2l(rt_handle, cvk_ctx, &p);
+    }
+  }
+
+  return ret;
+}
+
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]);
+  for (uint32_t i = 0; i < nr_cases; i++)
+    ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]);
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/181x/test_181x_tdma_g2l_bf16_matrix_vlc_copy_decompressed.c b/cviruntime/test/181x/test_181x_tdma_g2l_bf16_matrix_vlc_copy_decompressed.c
new file mode 100644
index 000000000..4f8d8ea70
--- /dev/null
+++ b/cviruntime/test/181x/test_181x_tdma_g2l_bf16_matrix_vlc_copy_decompressed.c
@@ -0,0 +1,203 @@
+#include <assert.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "test_cvikernel_util.h"
+
+typedef cvk_tdma_g2l_matrix_copy_decompressed_param_t param_t;
+
+static void __print_param(const char *tag, FILE *f, param_t *p)
+{
+  fprintf(
+      f, "%s: (%u, %u) => (%u, %u, %u, %u)\n",
+      tag,
+      p->src->m.shape.row, p->src->m.shape.col,
+      p->dst->shape.n, p->dst->shape.c,
+      p->dst->shape.w, p->dst->shape.col);
+}
+
+#define print_param(f, p) __print_param(__func__, f, p)
+
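+/*
+ * Each case below exercises a host -> device -> local round trip (a sketch
+ * of the flow implemented in test_param_g2l further down):
+ *
+ *   bsbuf = test_vlc_compress((uint8_t *)src, bytes, signed, bf16, &bs_size, &cmd_info, NULL);
+ *   cmpr_matrix_copy_s2d(rt_handle, p->src, bsbuf);          // compressed stream -> device
+ *   tdma_g2l_matrix_copy_decompressed(cvk_ctx, p);           // device -> local memory
+ *   dst = matrix_copy_l2g_d2s(rt_handle, cvk_ctx, p->dst);   // local -> host
+ *   // dst must byte-match the uncompressed source
+ */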
+typedef struct { + cvk_mg_shape_t src_shape; + cvk_ml_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 17 }, + { 0, 3, 7, 17 }, + }, + { + { 0, 7 }, + { 0, 7, 1, 7 }, + }, + { + { 0, 60 }, + { 0, 30, 2, 60 }, + }, +#ifndef ENABEL_SIMPLE_VLC_TEST + { + { 0, 1 }, + { 0, 1, 1, 1 }, + }, + { + { 0, 2 }, + { 0, 1, 2, 2 }, + }, + { + { 0, 2 }, + { 0, 2, 1, 2 }, + }, + { + { 0, 7 }, + { 0, 1, 7, 7 }, + }, + { + { 0, 7 }, + { 0, 2, 4, 7 }, + }, + { + { 0, 17 }, + { 0, 1, 17, 17 }, + }, + { + { 0, 17 }, + { 0, 17, 1, 17 }, + }, + { + { 0, 60 }, + { 0, 1, 60, 60 }, + }, + { + { 0, 60 }, + { 0, 60, 1, 60 }, + } +#endif /* ifndef ENABEL_SIMPLE_VLC_TEST*/ +}; + +static void g2l_matrix_copy_ref(param_t *p, uint16_t ref_data[], uint16_t src_data[]) +{ + uint64_t size = ml_shape_size(&p->dst->shape, CVK_FMT_I8); + + for (uint64_t i = 0; i < size; i++) + ref_data[i] = src_data[i]; +} + +static void test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p, uint16_t *src_data, CommandInfo* cmd_info) +{ + print_param(stderr, p); + + uint64_t in_size = ml_shape_size(&p->dst->shape, CVK_FMT_I8); + size_t bs_size = 0; + int is_signed = (p->dst->fmt == CVK_FMT_I8); + size_t data_type = (p->dst->fmt == CVK_FMT_BF16) ? 1 : 0; + uint64_t bytesize = in_size * fmt_size(p->dst->fmt); + + uint8_t *bsbuf = test_vlc_compress((uint8_t* )src_data, bytesize, is_signed, data_type, &bs_size, cmd_info, NULL); + cmpr_matrix_copy_s2d(rt_handle, p->src, bsbuf); + free(bsbuf); + + cvk_ctx->ops->tdma_g2l_matrix_copy_decompressed(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + uint16_t *dst_data = (uint16_t*)matrix_copy_l2g_d2s(rt_handle, cvk_ctx, p->dst); + + uint16_t *ref_data = (uint16_t *)malloc(sizeof(uint16_t) * in_size); + g2l_matrix_copy_ref(p, ref_data, src_data); + + for (uint64_t i = 0; i < in_size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(dst_data); + free(ref_data); +} + +static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + free_cmpr_matrix_dev_mem(rt_handle, p->src); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + cvk_fmt_t fmts[] = { CVK_FMT_BF16 }; + uint8_t fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + + for (uint32_t row = 1; row < 13; row += 2) { + c->src_shape.row = row; + c->dst_shape.n = row; + for (int mode = 0; mode < VLC_CMP_MODE_MAX; mode++) { + for (int dst_align = 0; dst_align < 2; dst_align++) { + for (uint8_t fmt_i = 0; fmt_i < fmts_sz; fmt_i++) { + cvk_fmt_t fmt = fmts[fmt_i]; + param_t p; + memset(&p, 0, sizeof(p)); + + int is_signed = (fmt == CVK_FMT_I8); + size_t data_type = (fmt == CVK_FMT_BF16) ? 
1 : 0; + CommandInfo cmd_info; + + memset(&cmd_info, 0, sizeof(CommandInfo)); + cmd_info.signedness = is_signed; + cmd_info.is_bfloat16 = data_type; + + // ops->lmem_alloc_matrix(cvk_ctx, c->dst_shape, fmt, dst_align); + uint64_t in_size = ml_shape_size(&p.dst->shape, CVK_FMT_I8); + + // src_shape, fmt, &cmd_info); + + //printf ("row %u mode %d is_align %d fmt %d\n", row, mode, dst_align, fmt); + test_param_g2l(rt_handle, cvk_ctx, &p, src_data, &cmd_info); + + free(src_data); + destroy_param_g2l(rt_handle, cvk_ctx, &p); + } + } + } + } +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return 0; +} diff --git a/cviruntime/test/181x/test_181x_tdma_g2l_bf16_tensor_copy_nc_transposed.c b/cviruntime/test/181x/test_181x_tdma_g2l_bf16_tensor_copy_nc_transposed.c new file mode 100644 index 000000000..4f3b98919 --- /dev/null +++ b/cviruntime/test/181x/test_181x_tdma_g2l_bf16_tensor_copy_nc_transposed.c @@ -0,0 +1,301 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_g2l_tensor_copy_nc_transposed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_fmt_t src_fmt; + cvk_fmt_t dst_fmt; +} fmt_type_t; + +static fmt_type_t input_fmt[] = { + {CVK_FMT_BF16, CVK_FMT_BF16}, + {CVK_FMT_I8, CVK_FMT_BF16}, + {CVK_FMT_U8, CVK_FMT_BF16}, +}; + +typedef struct { + cvk_tg_shape_t src_shape; + cvk_tl_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 1, 2 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 2, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 7, 2 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 2, 7 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 17, 13 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 13, 17 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 10, 60 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 2, 300 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 3, 200 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 4, 150 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 5, 120 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 60, 10 }, + }, { + { 1, 1, 120, 5 }, + { 1, 1, 120, 5 }, + }, { + { 1, 2, 1, 1 }, + { 2, 1, 1, 1 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 1, 4 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 2, 2 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 4, 1 }, + }, { + { 17, 2, 2, 2 }, + { 2, 17, 2, 2 }, + }, { + { 17, 2, 4, 1 }, + { 2, 17, 4, 1 }, + }, { + { 3, 16, 1, 1 }, + { 16, 3, 1, 1 }, + }, { + { 3, 39, 23, 17 }, + { 39, 3, 23, 17 }, + }, { + { 3, 39, 17, 23 }, + { 39, 3, 17, 23 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 16, 20 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 2, 160 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 4, 80 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 8, 40 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 20, 16 
}, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 32, 10 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 64, 5 }, +#if 0 // No enough local memory for 1810 + }, { + { 5, 39, 17, 23 }, + { 39, 5, 17, 23 }, + }, { + { 20, 35, 2, 2 }, + { 35, 20, 2, 2 }, + }, { + { 35, 20, 2, 2 }, + { 20, 35, 2, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 160, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 2, 160 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 4, 80 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 8, 40 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 10, 32 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 20, 16 }, + }, { + { 39, 5, 23, 17 }, + { 5, 39, 23, 17 }, +#endif + } +}; + +static void g2l_tensor_copy_nc_transposed_ref( + param_t *p, uint16_t ref_data[], uint16_t src_data[]) +{ + cvk_tg_shape_t s = p->src->shape; + uint32_t n = s.n; + uint32_t c = s.c; + uint32_t hw = s.h * s.w; + for (uint32_t ni = 0; ni < n; ni++) { + for (uint32_t ci = 0; ci < c; ci++) { + for (uint32_t hwi = 0; hwi < hw; hwi++) { + uint32_t src_i = ni * c * hw + ci * hw + hwi; + uint32_t dst_i = ci * n * hw + ni * hw + hwi; + if(p->src->fmt == CVK_FMT_BF16 && p->dst->fmt == CVK_FMT_BF16) + ref_data[dst_i] = src_data[src_i]; + else { + uint8_t* u8src_data = (uint8_t*)src_data; + uint8_t sign = p->src->fmt == CVK_FMT_I8 ? 1 : 0; + ref_data[dst_i] = cvk_convert_int8_bf16(u8src_data[src_i], sign); + } + } + } + } +} + +static int test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + print_param(stderr, p); + int ret = 0; + uint64_t size = tl_shape_size(&p->dst->shape, CVK_FMT_I8); + + uint16_t *u16src_data = (uint16_t *)malloc(sizeof(uint16_t) * size); + uint8_t *u8src_data = (uint8_t *)malloc(sizeof(uint16_t) * size); + uint16_t *dst_data = NULL, *ref_data = NULL; + if (!u16src_data || !u8src_data) { + ret = -1; + goto fail_exit; + } + + uint8_t *src_data; + if(p->src->fmt == CVK_FMT_BF16) { + float val = -100; + for(uint64_t i = 0; i < size; i++) { + u16src_data[i] = test_generate_bf16_corner_val(val); + val += 0.1; + } + src_data = (uint8_t*)u16src_data; + } else { + for(uint64_t i = 0; i < size; i++) { + u8src_data[i] = 200 + i; + } + src_data = u8src_data; + } + + tensor_copy_s2d(rt_handle, p->src, (uint8_t*) src_data); + cvk_ctx->ops->tdma_g2l_bf16_tensor_copy_nc_transposed(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + dst_data = (uint16_t *) tensor_copy_l2g_d2s(rt_handle, cvk_ctx, p->dst); + + ref_data = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!dst_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + g2l_tensor_copy_nc_transposed_ref(p, ref_data, (uint16_t*) src_data); + + for (uint64_t i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %x, exp %x\n", + i, dst_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit: + free(u8src_data); + free(u16src_data); + free(dst_data); + free(ref_data); + + return ret; +} + + +static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->dst); + free_tensor_dev_mem(rt_handle, p->src); +} + +static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + int ret = 0; + uint32_t nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (uint32_t i = 0; i < nr_fmt; i++) { + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_tensor_dev_mem(rt_handle, cvk_ctx, c->src_shape, input_fmt[i].src_fmt); + p.dst = 
cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->dst_shape, input_fmt[i].dst_fmt, dst_align); + ret |= test_param_g2l(rt_handle, cvk_ctx, &p); + destroy_param_g2l(rt_handle, cvk_ctx, &p); + } + } + + return ret; +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + int ret = 0; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_tdma_g2l_bf16_tensor_fill_constant.c b/cviruntime/test/181x/test_181x_tdma_g2l_bf16_tensor_fill_constant.c new file mode 100644 index 000000000..4260f7aac --- /dev/null +++ b/cviruntime/test/181x/test_181x_tdma_g2l_bf16_tensor_fill_constant.c @@ -0,0 +1,172 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + + +typedef cvk_tdma_g2l_tensor_fill_constant_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: %u => (%u, %u, %u, %u)\n", + tag, p->constant, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + uint16_t constant; + cvk_tl_shape_t dst_shape; +} case_t; + +typedef struct { + cvk_fmt_t src_fmt; + cvk_fmt_t dst_fmt; +} fmt_type_t; + +static fmt_type_t input_fmt[] = { + {CVK_FMT_BF16, CVK_FMT_BF16}, +}; + +static case_t g_cases[] = { + { + 37, { 1, 1, 1, 1 } + }, { + 39, { 1, 1, 1, 2 } + }, { + 23, { 1, 1, 2, 1 } + }, { + 19, { 1, 1, 7, 2 } + }, { + 17, { 1, 1, 2, 7 } + }, { + 13, { 1, 1, 17, 13 } + }, { + 11, { 1, 1, 13, 17 } + }, { + 7, { 1, 1, 10, 60 } + }, { + 9, { 1, 1, 120, 5 } + }, { + 2, { 1, 2, 1, 1 } + }, { + 3, { 1, 1, 1, 2 } + }, { + 5, { 2, 17, 1, 4 } + }, { + 41, { 2, 1, 4, 17 } + }, { + 5, { 2, 17, 1, 4 } + }, { + 9, { 2, 1, 17, 4 } + }, { + 17, { 3, 16, 1, 1 } + }, { + 26, { 3, 1, 2, 8 } + }, { + 103, { 3, 39, 17, 23 } + }, { + 255, { 3, 17, 39, 23 } + }, { + 254, { 3, 36, 16, 20 } + }, { + 127, { 3, 18, 1, 640 } +#if 0 // No enough local memory for 1810 + }, { + 128, { 5, 39, 17, 23 } + }, { + 129, { 5, 17, 39, 23 } + }, { + 55, { 20, 35, 2, 2 } + }, { + 1, { 20, 7, 10, 2 } +#endif + } +}; + +static void g2l_tensor_fill_constant_ref(param_t *p, uint16_t ref_data[]) +{ + uint64_t size = tl_shape_size(&p->dst->shape, p->dst->fmt); + + for (uint64_t i = 0; i < size/sizeof(uint16_t); i++) + ref_data[i] = p->constant; +} + +static void test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + print_param(stderr, p); + uint64_t size = tl_shape_size(&p->dst->shape, CVK_FMT_I8); + uint64_t bytesize = size * fmt_size(p->dst->fmt); + + cvk_ctx->ops->tdma_g2l_bf16_tensor_fill_constant(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + uint16_t *dst_data = (uint16_t*) tensor_copy_l2g_d2s(rt_handle, cvk_ctx, p->dst); + + uint16_t *ref_data = (uint16_t *)malloc(bytesize); + g2l_tensor_fill_constant_ref(p, ref_data); + + for (uint64_t i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(dst_data); 
+ free(ref_data); +} + +static void destroy_param_g2l(cvk_context_t *cvk_ctx, param_t *p) +{ + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + uint32_t nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (uint32_t i = 0; i < nr_fmt; i++) { + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + p.constant = test_generate_bf16_corner_val(c->constant); + p.dst = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->dst_shape, input_fmt[i].src_fmt, dst_align); + + test_param_g2l(rt_handle, cvk_ctx, &p); + destroy_param_g2l(cvk_ctx, &p); + } + } +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return 0; +} diff --git a/cviruntime/test/181x/test_181x_tdma_g2l_bf16_tensor_vlc_copy_decompressed.c b/cviruntime/test/181x/test_181x_tdma_g2l_bf16_tensor_vlc_copy_decompressed.c new file mode 100644 index 000000000..754ee4d7f --- /dev/null +++ b/cviruntime/test/181x/test_181x_tdma_g2l_bf16_tensor_vlc_copy_decompressed.c @@ -0,0 +1,181 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_g2l_tensor_copy_decompressed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => fmt(%d) bias0/1/zero is (%u/%u/%u) %s\n", + tag, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w, + p->dst->fmt, + p->src->bias0, p->src->bias1, p->src->zero_guard_en, + (p->dst->fmt == CVK_FMT_I8)? "signed": "unsigned"); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_tl_shape_t lmem_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 17, 13 } + }, + { + { 3, 39, 17, 23 } + }, +#if 0 // No enough local memory for 1810 + { + { 5, 39, 17, 23 } + }, + { + { 20, 35, 2, 2 } + }, +#endif +#ifndef ENABEL_SIMPLE_VLC_TEST + { + { 1, 1, 1, 1 } + }, + { + { 1, 1, 1, 2 } + }, + { + { 1, 1, 7, 2 } + }, + { + { 1, 1, 10, 60 } + }, + { + { 1, 2, 1, 1 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 3, 16, 1, 1 } + }, + { + { 3, 36, 16, 20 } + }, +#endif /* ifndef ENABEL_SIMPLE_VLC_TEST*/ +}; + +static void test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p, uint16_t *src_data, CommandInfo* cmd_info) +{ + print_param(stderr, p); + uint64_t size = tl_shape_size(&p->dst->shape, CVK_FMT_I8); + uint64_t bytesize = size * fmt_size(p->dst->fmt); + size_t bs_size = 0; + int is_signed = (p->dst->fmt == CVK_FMT_I8); + uint8_t data_type = (p->dst->fmt == CVK_FMT_BF16) ? 
1 : 0; + + uint8_t *bsbuf = test_vlc_compress((uint8_t *)src_data, bytesize, is_signed, data_type, &bs_size, cmd_info, NULL); + + uint16_t *ref_data = (uint16_t *)malloc(bytesize); + cvk_vlc_dec_bf16(bsbuf, bytesize, (uint16_t* )ref_data); + + cmpr_tensor_copy_s2d(rt_handle, p->src, bsbuf); + cvk_ctx->ops->tdma_g2l_tensor_copy_decompressed(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + + uint16_t *dst_data = (uint16_t* )tensor_copy_l2g_d2s(rt_handle, cvk_ctx, p->dst); + + for (uint64_t i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "vlc decompress comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(bsbuf); + free(dst_data); + free(ref_data); +} + +static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + free_cmpr_tensor_dev_mem(rt_handle, p->src); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + cvk_fmt_t fmts[] = { CVK_FMT_BF16 }; + uint8_t fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + + for (int dst_align = 0; dst_align < 2; dst_align++) { + for (int mode = 0; mode < VLC_CMP_MODE_MAX; mode++) { + for (uint8_t fmt_i = 0; fmt_i < fmts_sz; fmt_i++) { + cvk_fmt_t fmt = fmts[fmt_i]; + param_t p; + memset(&p, 0, sizeof(p)); + p.dst = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->lmem_shape, fmt, dst_align); + assert(p.dst); + + uint64_t size = tl_shape_size(&p.dst->shape, CVK_FMT_I8); + uint16_t *src_data = (uint16_t *)malloc(sizeof(uint16_t) * size); + test_vlc_init_testdata((uint8_t *)src_data, size, fmt == CVK_FMT_I8, fmt == CVK_FMT_BF16); + + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + int is_signed = (fmt == CVK_FMT_I8); + uint8_t data_type = (fmt == CVK_FMT_BF16) ? 
1 : 0; + + cmd_info.signedness = is_signed; + cmd_info.is_bfloat16 = data_type; + + if (mode == VLC_CMP_MODE_COMPILER) { + cvk_vlc_est_weight_bias((uint8_t* )src_data, size * sizeof(uint16_t), (bool)is_signed, (bool)data_type, &cmd_info); + } + + cvk_tg_shape_t tg_shape = + tg_shape_t4(c->lmem_shape.n, c->lmem_shape.c, c->lmem_shape.h, c->lmem_shape.w); + p.src = alloc_cmpr_tensor_dev_mem(rt_handle, cvk_ctx, tg_shape, fmt, &cmd_info); + + test_param_g2l(rt_handle, cvk_ctx, &p, src_data, &cmd_info); + + free(src_data); + destroy_param_g2l(rt_handle, cvk_ctx, &p); + } + } + } +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + return 0; +} diff --git a/cviruntime/test/181x/test_181x_tdma_g2l_matrix_copy.c b/cviruntime/test/181x/test_181x_tdma_g2l_matrix_copy.c new file mode 100644 index 000000000..319df190d --- /dev/null +++ b/cviruntime/test/181x/test_181x_tdma_g2l_matrix_copy.c @@ -0,0 +1,182 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_g2l_matrix_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.row, p->src->shape.col, + p->dst->shape.n, p->dst->shape.c, + p->dst->shape.w, p->dst->shape.col); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_mg_shape_t src_shape; + cvk_ml_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 1 }, + { 0, 1, 1, 1 }, + }, { + { 0, 2 }, + { 0, 1, 2, 2 }, + }, { + { 0, 2 }, + { 0, 2, 1, 2 }, + }, { + { 0, 7 }, + { 0, 1, 7, 7 }, + }, { + { 0, 7 }, + { 0, 2, 4, 7 }, + }, { + { 0, 7 }, + { 0, 7, 1, 7 }, + }, { + { 0, 17 }, + { 0, 1, 17, 17 }, + }, { + { 0, 17 }, + { 0, 3, 7, 17 }, + }, { + { 0, 17 }, + { 0, 17, 1, 17 }, + }, { + { 0, 60 }, + { 0, 1, 60, 60 }, + }, { + { 0, 60 }, + { 0, 30, 2, 60 }, + }, { + { 0, 60 }, + { 0, 60, 1, 60 }, + } +}; + +static void g2l_matrix_copy_ref(param_t *p, uint8_t ref_data[], uint8_t src_data[]) +{ + uint64_t size = ml_shape_size(&p->dst->shape, p->dst->fmt); + + for (uint64_t i = 0; i < size; i++) + ref_data[i] = src_data[i]; +} + +static int test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + print_param(stderr, p); + uint64_t size = ml_shape_size(&p->dst->shape, p->dst->fmt); + + uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!src_data) + return -1; + + for (uint64_t i = 0; i < size; i++) + src_data[i] = 200 + i; + + matrix_copy_s2d(rt_handle, p->src, src_data); + + cvk_ctx->ops->tdma_g2l_matrix_copy(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + + uint8_t *dst_data = matrix_copy_l2g_d2s(rt_handle, cvk_ctx, p->dst); + + uint8_t *ref_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!ref_data) + return -1; + + g2l_matrix_copy_ref(p, ref_data, src_data); + + for (uint64_t i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); 
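+      // Bail out on the first mismatch; buffers leaked on this early return
+      // are reclaimed when the test process exits.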
+ return -1; + } + } + + free(src_data); + free(dst_data); + free(ref_data); + + return 0; +} + +static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + free_matrix_dev_mem(rt_handle, p->src); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->dst); +} + +static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + int ret = 0; + cvk_fmt_t fmt = CVK_FMT_I8; + + for (uint32_t row = 1; row < 13; row += 2) { + c->src_shape.row = row; + c->dst_shape.n = row; + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_matrix_dev_mem(rt_handle, c->src_shape, fmt); + p.dst = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, c->dst_shape, fmt, dst_align); + ret |= test_param_g2l(rt_handle, cvk_ctx, &p); + destroy_param_g2l(rt_handle, cvk_ctx, &p); + + if (ret) + return ret; + } + } + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) { + ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + if (ret) + break; + } + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + if (!ret) + printf("%s pass\n", __FILENAME__); + else + printf("%s fail\n", __FILENAME__); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_tdma_g2l_matrix_copy_row_col_transposed.c b/cviruntime/test/181x/test_181x_tdma_g2l_matrix_copy_row_col_transposed.c new file mode 100644 index 000000000..a14274508 --- /dev/null +++ b/cviruntime/test/181x/test_181x_tdma_g2l_matrix_copy_row_col_transposed.c @@ -0,0 +1,425 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_g2l_matrix_copy_row_col_transposed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.row, p->src->shape.col, + p->dst->shape.n, p->dst->shape.c, + p->dst->shape.w, p->dst->shape.col); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_mg_shape_t src_shape; + cvk_ml_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 2 }, + { 2, 1, 1, 1 }, + }, { + { 1, 7 }, + { 7, 1, 1, 1 }, + }, { + { 1, 17 }, + { 17, 1, 1, 1 }, + }, { + { 1, 60 }, + { 60, 1, 1, 1 }, + }, { + { 1, 139 }, + { 139, 1, 1, 1 }, + }, { + { 2, 1 }, + { 1, 1, 2, 2 }, + }, { + { 2, 1 }, + { 1, 2, 1, 2 }, + }, { + { 2, 2 }, + { 2, 1, 2, 2 }, + }, { + { 2, 2 }, + { 2, 2, 1, 2 }, + }, { + { 2, 7 }, + { 7, 1, 2, 2 }, + }, { + { 2, 7 }, + { 7, 2, 1, 2 }, + }, { + { 2, 17 }, + { 17, 1, 2, 2 }, + }, { + { 2, 17 }, + { 17, 2, 1, 2 }, + }, { + { 2, 60 }, + { 60, 1, 2, 2 }, + }, { + { 2, 60 }, + { 60, 2, 1, 2 }, + }, { + { 2, 139 }, + { 139, 1, 2, 2 }, + }, { + { 2, 139 }, + { 139, 2, 1, 2 }, + }, { + { 7, 1 }, + { 1, 1, 7, 7 }, + }, { + { 7, 1 }, + { 1, 2, 4, 7 }, + }, { + { 7, 1 }, + { 1, 2, 5, 7 }, + }, { + { 7, 1 }, + { 1, 2, 6, 7 }, + }, { + { 7, 1 }, + { 1, 3, 3, 7 }, + }, { + { 7, 1 }, + { 1, 4, 2, 7 }, + }, { + { 7, 1 }, + { 1, 7, 1, 7 }, + }, { + { 7, 2 }, + { 2, 1, 7, 7 }, + }, { + { 7, 2 }, + { 2, 2, 
4, 7 }, + }, { + { 7, 2 }, + { 2, 2, 5, 7 }, + }, { + { 7, 2 }, + { 2, 2, 6, 7 }, + }, { + { 7, 2 }, + { 2, 3, 3, 7 }, + }, { + { 7, 2 }, + { 2, 4, 2, 7 }, + }, { + { 7, 2 }, + { 2, 7, 1, 7 }, + }, { + { 7, 7 }, + { 7, 1, 7, 7 }, + }, { + { 7, 7 }, + { 7, 3, 3, 7 }, + }, { + { 7, 7 }, + { 7, 4, 2, 7 }, + }, { + { 7, 7 }, + { 7, 7, 1, 7 }, + }, { + { 7, 17 }, + { 17, 1, 7, 7 }, + }, { + { 7, 17 }, + { 17, 4, 2, 7 }, + }, { + { 7, 17 }, + { 17, 7, 1, 7 }, + }, { + { 7, 60 }, + { 60, 1, 7, 7 }, + }, { + { 7, 60 }, + { 60, 3, 3, 7 }, + }, { + { 7, 60 }, + { 60, 7, 1, 7 }, + }, { + { 7, 139 }, + { 139, 1, 7, 7 }, + }, { + { 7, 139 }, + { 139, 3, 3, 7 }, + }, { + { 7, 139 }, + { 139, 7, 1, 7 }, + }, { + { 43, 1 }, + { 1, 1, 43, 43 }, + }, { + { 43, 1 }, + { 1, 2, 22, 43 }, + }, { + { 43, 1 }, + { 1, 2, 25, 43 }, + }, { + { 43, 1 }, + { 1, 2, 37, 43 }, + }, { + { 43, 1 }, + { 1, 2, 41, 43 }, + }, { + { 43, 1 }, + { 1, 5, 9, 43 }, + }, { + { 43, 1 }, + { 1, 5, 10, 43 }, + }, { + { 43, 1 }, + { 1, 9, 5, 43 }, + }, { + { 43, 1 }, + { 1, 22, 2, 43 }, + }, { + { 43, 1 }, + { 1, 43, 1, 43 }, + }, { + { 43, 2 }, + { 2, 1, 43, 43 }, + }, { + { 43, 2 }, + { 2, 2, 27, 43 }, + }, { + { 43, 2 }, + { 2, 22, 2, 43 }, + }, { + { 43, 2 }, + { 2, 43, 1, 43 }, + }, { + { 57, 7 }, + { 7, 1, 57, 57 }, + }, { + { 57, 7 }, + { 7, 2, 37, 57 }, + }, { + { 57, 7 }, + { 7, 2, 43, 57 }, + }, { + { 57, 7 }, + { 7, 2, 55, 57 }, + }, { + { 57, 7 }, + { 7, 2, 56, 57 }, + }, { + { 57, 7 }, + { 7, 7, 9, 57 }, + }, { + { 57, 7 }, + { 7, 8, 8, 57 }, + }, { + { 57, 7 }, + { 7, 29, 2, 57 }, + }, { + { 57, 7 }, + { 7, 57, 1, 57 }, + }, { + { 67, 17 }, + { 17, 1, 67, 67 }, + }, { + { 67, 17 }, + { 17, 2, 34, 67 }, + }, { + { 67, 17 }, + { 17, 2, 49, 67 }, + }, { + { 67, 17 }, + { 17, 2, 66, 67 }, + }, { + { 67, 17 }, + { 17, 6, 12, 67 }, + }, { + { 67, 17 }, + { 17, 6, 13, 67 }, + }, { + { 67, 17 }, + { 17, 17, 4, 67 }, + }, { + { 67, 17 }, + { 17, 34, 2, 67 }, + }, { + { 67, 17 }, + { 17, 67, 1, 67 }, + }, { + { 129, 139 }, + { 139, 1, 129, 129 }, + }, { + { 129, 139 }, + { 139, 2, 65, 129 }, + }, { + { 129, 139 }, + { 139, 2, 80, 129 }, + }, { + { 129, 139 }, + { 139, 2, 120, 129 }, + }, { + { 129, 139 }, + { 139, 2, 128, 129 }, + }, { + { 129, 139 }, + { 139, 3, 43, 129 }, + }, { + { 129, 139 }, + { 139, 3, 47, 129 }, + }, { + { 129, 139 }, + { 139, 3, 59, 129 }, + }, { + { 129, 139 }, + { 139, 3, 64, 129 }, + }, { + { 129, 139 }, + { 139, 7, 19, 129 }, + }, { + { 129, 139 }, + { 139, 7, 20, 129 }, + }, { + { 129, 139 }, + { 139, 7, 21, 129 }, +#if 0 // Not enough lmem size for 1810 + }, { + { 129, 139 }, + { 139, 43, 3, 129 }, + }, { + { 129, 139 }, + { 139, 65, 2, 129 }, +#endif + } +// out of lmem size +// , { +// { 129, 139 }, +// { 139, 129, 1, 129 }, +// } +}; + +static void g2l_matrix_copy_row_col_transposed_ref( + param_t *p, uint8_t ref_data[], uint8_t src_data[]) +{ + uint64_t row = p->src->shape.row; + uint64_t col = p->src->shape.col; + + for (uint64_t ri = 0; ri < row; ri++) { + for (uint64_t ci = 0; ci < col; ci++) { + uint64_t src_i = ri * col + ci; + uint64_t dst_i = ci * row + ri; + ref_data[dst_i] = src_data[src_i]; + } + } +} + +static int test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + print_param(stderr, p); + int ret = 0; + uint64_t size = ml_shape_size(&p->dst->shape, p->dst->fmt); + + uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!src_data) + return -1; + + for (uint64_t i = 0; i < size; i++) + src_data[i] = 200 + i; + + matrix_copy_s2d(rt_handle, 
p->src, src_data); + cvk_ctx->ops->tdma_g2l_matrix_copy_row_col_transposed(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + uint8_t *dst_data = matrix_copy_l2g_d2s(rt_handle, cvk_ctx, p->dst); + + uint8_t *ref_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!dst_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + g2l_matrix_copy_row_col_transposed_ref(p, ref_data, src_data); + + for (uint64_t i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit: + free(src_data); + free(dst_data); + free(ref_data); + + return ret; +} + + +static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + free_matrix_dev_mem(rt_handle, p->src); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->dst); +} + +static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + int ret = 0; + param_t p; + /* + * Matrix transpose must be n/c stride alignment + * for TDMA limitation + */ + int dst_align = 1; + cvk_fmt_t fmt = CVK_FMT_I8; + + memset(&p, 0, sizeof(p)); + + p.src = alloc_matrix_dev_mem(rt_handle, c->src_shape, fmt); + p.dst = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, c->dst_shape, fmt, dst_align); + ret = test_param_g2l(rt_handle, cvk_ctx, &p); + destroy_param_g2l(rt_handle, cvk_ctx, &p); + + return ret; +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + int ret = 0; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_tdma_g2l_matrix_vlc_copy_decompressed.c b/cviruntime/test/181x/test_181x_tdma_g2l_matrix_vlc_copy_decompressed.c new file mode 100644 index 000000000..87d3337da --- /dev/null +++ b/cviruntime/test/181x/test_181x_tdma_g2l_matrix_vlc_copy_decompressed.c @@ -0,0 +1,202 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_g2l_matrix_copy_decompressed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->m.shape.row, p->src->m.shape.col, + p->dst->shape.n, p->dst->shape.c, + p->dst->shape.w, p->dst->shape.col); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_mg_shape_t src_shape; + cvk_ml_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 17 }, + { 0, 3, 7, 17 }, + }, + { + { 0, 7 }, + { 0, 7, 1, 7 }, + }, + { + { 0, 60 }, + { 0, 30, 2, 60 }, + }, +#ifndef ENABEL_SIMPLE_VLC_TEST + { + { 0, 1 }, + { 0, 1, 1, 1 }, + }, + { + { 0, 2 }, + { 0, 1, 2, 2 }, + }, + { + { 0, 2 }, + { 0, 2, 1, 2 }, + }, + { + { 0, 7 }, + { 0, 1, 7, 7 }, + }, + { + { 0, 7 }, + { 0, 2, 4, 7 }, + }, + { + { 0, 17 }, + { 0, 1, 17, 17 }, + }, + { + { 0, 17 }, + { 0, 17, 1, 17 }, + }, + { + { 0, 60 }, + { 0, 1, 60, 60 }, + }, + { + { 0, 60 }, + { 0, 60, 1, 60 }, + } +#endif /* ifndef ENABEL_SIMPLE_VLC_TEST*/ +}; + +static void g2l_matrix_copy_ref(param_t *p, uint8_t ref_data[], 
uint8_t src_data[]) +{ + uint64_t size = ml_shape_size(&p->dst->shape, p->dst->fmt); + + for (uint64_t i = 0; i < size; i++) + ref_data[i] = src_data[i]; +} + +static void test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p, uint8_t *src_data, CommandInfo* cmd_info) +{ + print_param(stderr, p); + + uint64_t in_size = ml_shape_size(&p->dst->shape, p->dst->fmt); + size_t bs_size = 0; + int is_signed = (p->dst->fmt == CVK_FMT_I8); + size_t data_type = (p->dst->fmt == CVK_FMT_BF16) ? 1 : 0; + + uint8_t *bsbuf = test_vlc_compress(src_data, in_size, is_signed, data_type, &bs_size, cmd_info, NULL); + cmpr_matrix_copy_s2d(rt_handle, p->src, bsbuf); + free(bsbuf); + + cvk_ctx->ops->tdma_g2l_matrix_copy_decompressed(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + uint8_t *dst_data = matrix_copy_l2g_d2s(rt_handle, cvk_ctx, p->dst); + + uint8_t *ref_data = (uint8_t *)malloc(sizeof(uint8_t) * in_size); + g2l_matrix_copy_ref(p, ref_data, src_data); + + for (uint64_t i = 0; i < in_size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(dst_data); + free(ref_data); +} + +static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + free_cmpr_matrix_dev_mem(rt_handle, p->src); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + cvk_fmt_t fmts[] = { CVK_FMT_I8, CVK_FMT_U8 }; + uint8_t fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + + for (uint32_t row = 1; row < 13; row += 2) { + c->src_shape.row = row; + c->dst_shape.n = row; + for (int mode = 0; mode < VLC_CMP_MODE_MAX; mode++) { + for (int dst_align = 0; dst_align < 2; dst_align++) { + for (uint8_t fmt_i = 0; fmt_i < fmts_sz; fmt_i++) { + cvk_fmt_t fmt = fmts[fmt_i]; + param_t p; + + memset(&p, 0, sizeof(p)); + + int is_signed = (fmt == CVK_FMT_I8); + size_t data_type = (fmt == CVK_FMT_BF16) ? 
1 : 0;
+          CommandInfo cmd_info;
+
+          memset(&cmd_info, 0, sizeof(CommandInfo));
+          cmd_info.signedness = is_signed;
+
+          p.dst = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, c->dst_shape, fmt, dst_align);
+          uint64_t in_size = ml_shape_size(&p.dst->shape, p.dst->fmt);
+
+          /* Source-data setup and the compressed-source allocation were
+           * elided in this hunk; they are reconstructed by analogy with the
+           * tensor variant of this test, so the helper names are assumed. */
+          uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * in_size);
+          test_vlc_init_testdata(src_data, in_size, fmt == CVK_FMT_I8, fmt == CVK_FMT_BF16);
+          if (mode == VLC_CMP_MODE_COMPILER)
+            cvk_vlc_est_weight_bias(src_data, in_size, (bool)is_signed, (bool)data_type, &cmd_info);
+          p.src = alloc_cmpr_matrix_dev_mem(rt_handle, c->src_shape, fmt, &cmd_info);
+
+          //printf ("row %u mode %d is_align %d fmt %d\n", row, mode, dst_align, fmt);
+          test_param_g2l(rt_handle, cvk_ctx, &p, src_data, &cmd_info);
+
+          free(src_data);
+          destroy_param_g2l(rt_handle, cvk_ctx, &p);
+        }
+      }
+    }
+  }
+}
+
+int main(int argc, char **argv)
+{
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]);
+  for (uint32_t i = 0; i < nr_cases; i++)
+    test_one_case(rt_handle, cvk_ctx, &g_cases[i]);
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return 0;
+}
diff --git a/cviruntime/test/181x/test_181x_tdma_g2l_tensor_copy.c b/cviruntime/test/181x/test_181x_tdma_g2l_tensor_copy.c
new file mode 100644
index 000000000..92ea39ce4
--- /dev/null
+++ b/cviruntime/test/181x/test_181x_tdma_g2l_tensor_copy.c
@@ -0,0 +1,163 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <inttypes.h>
+#include "test_cvikernel_util.h"
+
+typedef cvk_tdma_g2l_tensor_copy_param_t param_t;
+
+static void __print_param(const char *tag, FILE *f, param_t *p)
+{
+  fprintf(
+      f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n",
+      tag,
+      p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w,
+      p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w);
+}
+
+#define print_param(f, p) __print_param(__func__, f, p)
+
+typedef struct {
+  cvk_tg_shape_t src_shape;
+  cvk_tl_shape_t dst_shape;
+} case_t;
+
+static case_t g_cases[] = {
+  {
+    { 1, 1, 1, 1 },
+    { 1, 1, 1, 1 },
+  }, {
+    { 1, 1, 1, 2 },
+    { 1, 1, 2, 1 },
+  }, {
+    { 1, 1, 7, 2 },
+    { 1, 1, 2, 7 },
+  }, {
+    { 1, 1, 17, 13 },
+    { 1, 1, 13, 17 },
+  }, {
+    { 1, 1, 10, 60 },
+    { 1, 1, 120, 5 },
+  }, {
+    { 1, 2, 1, 1 },
+    { 1, 1, 1, 2 },
+  }, {
+    { 2, 17, 1, 4 },
+    { 2, 1, 4, 17 },
+  }, {
+    { 2, 17, 1, 4 },
+    { 2, 1, 17, 4 },
+  }, {
+    { 3, 16, 1, 1 },
+    { 3, 1, 2, 8 },
+  }, {
+    { 3, 39, 17, 23 },
+    { 3, 17, 39, 23 },
+  }, {
+    { 3, 36, 16, 20 },
+    { 3, 18, 1, 640 },
+  }, {
+    { 5, 39, 17, 23 },
+    { 5, 17, 39, 23 },
+  }, {
+    { 20, 35, 2, 2 },
+    { 20, 7, 10, 2 },
+  }
+};
+
+static void g2l_tensor_copy_ref(param_t *p, uint8_t ref_data[], uint8_t src_data[])
+{
+  uint64_t size = tl_shape_size(&p->dst->shape, p->dst->fmt);
+
+  for (uint64_t i = 0; i < size; i++)
+    ref_data[i] = src_data[i];
+}
+
+static int test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p)
+{
+  int ret = 0;
+  print_param(stderr, p);
+  uint64_t size = tl_shape_size(&p->dst->shape, p->dst->fmt);
+
+  uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * size);
+  if (!src_data)
+    return -1;
+
+  for (uint64_t i = 0; i < size; i++)
+    src_data[i] = 200 + i;
+
+  tensor_copy_s2d(rt_handle, p->src, src_data);
+
+  cvk_ctx->ops->tdma_g2l_tensor_copy(cvk_ctx, p);
+  CVI_RT_Submit(cvk_ctx);
+
+  uint8_t *dst_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, p->dst);
+
+  uint8_t *ref_data = (uint8_t *)malloc(sizeof(uint8_t) * size);
+  g2l_tensor_copy_ref(p, ref_data, src_data);
+
+  for (uint64_t i = 0; i < size; i++) {
+    if (dst_data[i] != ref_data[i]) {
+      fprintf(stderr, "comparing failed at dst[%" PRIu64
"], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + ret = -1; + break; + } + } + + free(src_data); + free(dst_data); + free(ref_data); + + return ret; +} + +static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + free_tensor_dev_mem(rt_handle, p->src); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->dst); +} + +static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + int ret = 0; + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_tensor_dev_mem(rt_handle, cvk_ctx, c->src_shape, CVK_FMT_I8); + p.dst = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->dst_shape, CVK_FMT_I8, dst_align); + ret |= test_param_g2l(rt_handle, cvk_ctx, &p); + destroy_param_g2l(rt_handle, cvk_ctx, &p); + } + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_tdma_g2l_tensor_copy_chw_rotated.c b/cviruntime/test/181x/test_181x_tdma_g2l_tensor_copy_chw_rotated.c new file mode 100644 index 000000000..8de7a5103 --- /dev/null +++ b/cviruntime/test/181x/test_181x_tdma_g2l_tensor_copy_chw_rotated.c @@ -0,0 +1,206 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_g2l_tensor_copy_chw_rotated_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.h, p->src->shape.w, p->src->shape.c, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_tg_shape_t src_shape; + cvk_tl_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 3, 1, 1 }, // nchw for neuron + { 1, 3, 1, 1 }, // nchw for neuron + }, { + { 1, 4, 1, 1 }, + { 1, 4, 1, 1 }, + }, { + { 1, 3, 1, 7 }, + { 1, 3, 1, 7 }, + }, { + { 1, 4, 1, 7 }, + { 1, 4, 1, 7 }, + }, { + { 1, 3, 1, 17 }, + { 1, 3, 1, 17 }, + }, { + { 1, 4, 1, 17 }, + { 1, 4, 1, 17 }, + }, { + { 1, 3, 2, 1 }, + { 1, 3, 2, 1 }, + }, { + { 1, 4, 2, 1 }, + { 1, 4, 2, 1 }, + }, { + { 2, 3, 17, 1 }, + { 2, 3, 17, 1 }, + }, { + { 2, 4, 17, 1 }, + { 2, 4, 17, 1 }, + }, { + { 2, 3, 17, 3 }, + { 2, 3, 17, 3 }, + }, { + { 2, 4, 17, 3 }, + { 2, 4, 17, 3 }, + }, { + { 3, 3, 16, 7 }, + { 3, 3, 16, 7 }, + }, { + { 3, 4, 16, 7 }, + { 3, 4, 16, 7 }, + }, { + { 3, 3, 39, 17 }, + { 3, 3, 39, 17 }, + }, { + { 3, 4, 39, 17 }, + { 3, 4, 39, 17 }, + }, { + { 3, 3, 36, 16 }, + { 3, 3, 36, 16 }, + }, { + { 3, 4, 36, 16 }, + { 3, 4, 36, 16 }, + }, { + { 5, 3, 39, 17 }, + { 5, 3, 39, 17 }, + }, { + { 5, 4, 39, 17 }, + { 5, 4, 39, 17 }, + }, { + { 20, 3, 35, 2 }, + { 20, 3, 35, 2 }, + }, { + { 20, 4, 35, 2 }, + { 20, 4, 35, 2 }, + }, { + { 20, 3, 35, 3 }, + { 20, 3, 35, 3 }, + }, { + { 20, 4, 35, 3 }, + { 20, 4, 35, 3 }, + } +}; + +static void g2l_tensor_copy_chw_rotated_ref( + param_t *p, 
uint8_t ref_data[], uint8_t src_data[]) +{ + cvk_tg_shape_t s = p->src->shape; + // change nhwc -> nchw by HW design automatically + uint32_t n = s.n; + uint32_t c = s.h; + uint32_t h = s.w; + uint32_t w = s.c; + for (uint32_t ni = 0; ni < n; ni++) { + for (uint32_t ci = 0; ci < c; ci++) { + for (uint32_t hi = 0; hi < h; hi++) { + for (uint32_t wi = 0; wi < w; wi++) { + uint32_t src_i = ni * c * h * w + ci * h * w + hi * w + wi; + uint32_t dst_i = ni * w * c * h + wi * c * h + ci * h + hi; + ref_data[dst_i] = src_data[src_i]; + } + } + } + } +} + +static void test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + print_param(stderr, p); + uint64_t size = tg_shape_size(&p->src->shape, p->src->fmt); + uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!src_data) + return; + + for (uint64_t i = 0; i < size; i++) + src_data[i] = 200 + i; + + tensor_copy_s2d(rt_handle, p->src, src_data); + cvk_ctx->ops->tdma_g2l_tensor_copy_chw_rotated(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + uint8_t *dst_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, p->dst); + + uint8_t *ref_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!dst_data || !ref_data) + goto fail_exit; + + g2l_tensor_copy_chw_rotated_ref(p, ref_data, src_data); + + for (uint64_t i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + +fail_exit: + free(src_data); + free(dst_data); + free(ref_data); +} + +static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + free_tensor_dev_mem(rt_handle, p->src); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_tensor_dev_mem(rt_handle, cvk_ctx, c->src_shape, CVK_FMT_I8); + p.dst = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->dst_shape, CVK_FMT_I8, 1); + test_param_g2l(rt_handle, cvk_ctx, &p); + destroy_param_g2l(rt_handle, cvk_ctx, &p); + +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return 0; +} diff --git a/cviruntime/test/181x/test_181x_tdma_g2l_tensor_copy_nc_transposed.c b/cviruntime/test/181x/test_181x_tdma_g2l_tensor_copy_nc_transposed.c new file mode 100644 index 000000000..5c6615536 --- /dev/null +++ b/cviruntime/test/181x/test_181x_tdma_g2l_tensor_copy_nc_transposed.c @@ -0,0 +1,264 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_g2l_tensor_copy_nc_transposed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_tg_shape_t 
src_shape; + cvk_tl_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 1, 2 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 2, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 7, 2 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 2, 7 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 17, 13 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 13, 17 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 10, 60 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 2, 300 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 3, 200 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 4, 150 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 5, 120 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 60, 10 }, + }, { + { 1, 1, 120, 5 }, + { 1, 1, 120, 5 }, + }, { + { 1, 2, 1, 1 }, + { 2, 1, 1, 1 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 1, 4 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 2, 2 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 4, 1 }, + }, { + { 17, 2, 2, 2 }, + { 2, 17, 2, 2 }, + }, { + { 17, 2, 4, 1 }, + { 2, 17, 4, 1 }, + }, { + { 3, 16, 1, 1 }, + { 16, 3, 1, 1 }, + }, { + { 3, 39, 23, 17 }, + { 39, 3, 23, 17 }, + }, { + { 3, 39, 17, 23 }, + { 39, 3, 17, 23 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 16, 20 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 2, 160 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 4, 80 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 8, 40 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 20, 16 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 32, 10 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 64, 5 }, + }, { + { 5, 39, 17, 23 }, + { 39, 5, 17, 23 }, + }, { + { 20, 35, 2, 2 }, + { 35, 20, 2, 2 }, + }, { + { 35, 20, 2, 2 }, + { 20, 35, 2, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 160, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 2, 160 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 4, 80 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 8, 40 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 10, 32 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 20, 16 }, + }, { + { 39, 5, 23, 17 }, + { 5, 39, 23, 17 }, + } +}; + +static void g2l_tensor_copy_nc_transposed_ref( + param_t *p, uint8_t ref_data[], uint8_t src_data[]) +{ + cvk_tg_shape_t s = p->src->shape; + uint32_t n = s.n; + uint32_t c = s.c; + uint32_t hw = s.h * s.w; + + for (uint32_t ni = 0; ni < n; ni++) { + for (uint32_t ci = 0; ci < c; ci++) { + for (uint32_t hwi = 0; hwi < hw; hwi++) { + uint32_t src_i = ni * c * hw + ci * hw + hwi; + uint32_t dst_i = ci * n * hw + ni * hw + hwi; + ref_data[dst_i] = src_data[src_i]; + } + } + } +} + +static int test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + print_param(stderr, p); + int ret = 0; + uint64_t size = tl_shape_size(&p->dst->shape, p->dst->fmt); + + uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!src_data) + return -1; + + for (uint64_t i = 0; i < size; i++) + src_data[i] = 200 + i; + + tensor_copy_s2d(rt_handle, p->src, src_data); + cvk_ctx->ops->tdma_g2l_tensor_copy_nc_transposed(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + uint8_t *dst_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, p->dst); + + uint8_t *ref_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!dst_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + g2l_tensor_copy_nc_transposed_ref(p, ref_data, src_data); + + for (uint64_t i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit: + free(src_data); + free(dst_data); + free(ref_data); + + return ret; +} + + +static void 
destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->dst); + free_tensor_dev_mem(rt_handle, p->src); +} + +static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + int ret = 0; + + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_tensor_dev_mem(rt_handle, cvk_ctx, c->src_shape, CVK_FMT_I8); + p.dst = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->dst_shape, CVK_FMT_I8, dst_align); + ret |= test_param_g2l(rt_handle, cvk_ctx, &p); + destroy_param_g2l(rt_handle, cvk_ctx, &p); + } + + return ret; +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + int ret = 0; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_tdma_g2l_tensor_vlc_copy_decompressed.c b/cviruntime/test/181x/test_181x_tdma_g2l_tensor_vlc_copy_decompressed.c new file mode 100644 index 000000000..dd01abf1d --- /dev/null +++ b/cviruntime/test/181x/test_181x_tdma_g2l_tensor_vlc_copy_decompressed.c @@ -0,0 +1,180 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_g2l_tensor_copy_decompressed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => fmt(%d) bias0/1/zero is (%u/%u/%u) %s\n", + tag, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w, + p->dst->fmt, + p->src->bias0, p->src->bias1, p->src->zero_guard_en, + (p->dst->fmt == CVK_FMT_I8)? "signed": "unsigned"); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_tl_shape_t lmem_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 17, 13 } + }, + { + { 3, 39, 17, 23 } + }, + { + { 5, 39, 17, 23 } + }, + { + { 20, 35, 2, 2 } + }, +#ifndef ENABEL_SIMPLE_VLC_TEST + { + { 1, 1, 1, 1 } + }, + { + { 1, 1, 1, 2 } + }, + { + { 1, 1, 7, 2 } + }, + { + { 1, 1, 10, 60 } + }, + { + { 1, 2, 1, 1 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 3, 16, 1, 1 } + }, + { + { 3, 36, 16, 20 } + }, +#endif /* ifndef ENABEL_SIMPLE_VLC_TEST*/ +}; + +static void g2l_tensor_copy_vlc_decompressed_ref( + uint8_t ref_data[], uint64_t ref_size, uint8_t src_data[]) +{ + cvk_vlc_dec_int8(src_data, ref_size, ref_data); +} + +static void test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p, uint8_t *src_data, CommandInfo* cmd_info) +{ + print_param(stderr, p); + uint64_t size = tl_shape_size(&p->dst->shape, p->dst->fmt); + size_t bs_size = 0; + int is_signed = (p->dst->fmt == CVK_FMT_I8); + uint8_t data_type = (p->dst->fmt == CVK_FMT_BF16) ? 
1 : 0; + + uint8_t *bsbuf = test_vlc_compress(src_data, size, is_signed, data_type, &bs_size, cmd_info, NULL); + + cmpr_tensor_copy_s2d(rt_handle, p->src, bsbuf); + cvk_ctx->ops->tdma_g2l_tensor_copy_decompressed(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + + uint8_t *dst_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, p->dst); + uint8_t *ref_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + g2l_tensor_copy_vlc_decompressed_ref(ref_data, size, bsbuf); + for (uint64_t i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "vlc decompress comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + free(bsbuf); + free(dst_data); + free(ref_data); +} + +static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + free_cmpr_tensor_dev_mem(rt_handle, p->src); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + cvk_fmt_t fmts[] = { CVK_FMT_I8, CVK_FMT_U8 }; + + for (int dst_align = 0; dst_align < 2; dst_align++) { + for (int mode = 0; mode < VLC_CMP_MODE_MAX; mode++) { + for (uint8_t fmt_i = 0; fmt_i < 2; fmt_i++) { + cvk_fmt_t fmt = fmts[fmt_i]; + param_t p; + memset(&p, 0, sizeof(p)); + p.dst = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->lmem_shape, fmt, dst_align); + assert(p.dst); + + uint64_t size = tl_shape_size(&p.dst->shape, p.dst->fmt); + uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + test_vlc_init_testdata(src_data, size, fmt == CVK_FMT_I8, fmt == CVK_FMT_BF16); + + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + int is_signed = (fmt == CVK_FMT_I8); + uint8_t data_type = (fmt == CVK_FMT_BF16) ? 1 : 0; + + cmd_info.signedness = is_signed; + + if (mode == VLC_CMP_MODE_COMPILER) { + cvk_vlc_est_weight_bias(src_data, size, (bool)is_signed, (bool)data_type, &cmd_info); + } + + cvk_tg_shape_t tg_shape = + tg_shape_t4(c->lmem_shape.n, c->lmem_shape.c, c->lmem_shape.h, c->lmem_shape.w); + p.src = alloc_cmpr_tensor_dev_mem(rt_handle, cvk_ctx, tg_shape, fmt, &cmd_info); + + test_param_g2l(rt_handle, cvk_ctx, &p, src_data, &cmd_info); + + free(src_data); + destroy_param_g2l(rt_handle, cvk_ctx, &p); + } + } + } +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return 0; +} diff --git a/cviruntime/test/181x/test_181x_tdma_l2g_bf16_matrix_vlc_copy_compressed.c b/cviruntime/test/181x/test_181x_tdma_l2g_bf16_matrix_vlc_copy_compressed.c new file mode 100644 index 000000000..e9a00907d --- /dev/null +++ b/cviruntime/test/181x/test_181x_tdma_l2g_bf16_matrix_vlc_copy_compressed.c @@ -0,0 +1,186 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_l2g_matrix_copy_compressed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.w, p->src->shape.col, + 
p->dst->m.shape.row, p->dst->m.shape.col);
+}
+
+#define print_param(f, p) __print_param(__func__, f, p)
+
+typedef struct {
+  cvk_ml_shape_t src_shape;
+  cvk_mg_shape_t dst_shape;
+} case_t;
+
+static case_t g_cases[] = {
+  {
+    { 0, 2, 4, 7 },
+    { 0, 7 },
+  },
+  {
+    { 0, 1, 2, 2 },
+    { 1, 2 },
+  },
+  {
+    { 0, 3, 7, 17 },
+    { 0, 17 },
+  },
+  {
+    { 0, 17, 1, 17 },
+    { 0, 17 },
+  },
+  {
+    { 0, 60, 1, 60 },
+    { 0, 60 },
+  },
+#ifndef ENABEL_SIMPLE_VLC_TEST
+  {
+    { 0, 1, 1, 1 },
+    { 0, 1 },
+  },
+  {
+    { 0, 2, 1, 2 },
+    { 1, 2 },
+  },
+  {
+    { 0, 1, 7, 7 },
+    { 0, 7 },
+  },
+  {
+    { 0, 7, 1, 7 },
+    { 0, 7 },
+  },
+  {
+    { 0, 1, 17, 17 },
+    { 0, 17 },
+  },
+  {
+    { 0, 1, 60, 60 },
+    { 0, 60 },
+  },
+  {
+    { 0, 30, 2, 60 },
+    { 0, 60 },
+  },
+#endif /* ifndef ENABEL_SIMPLE_VLC_TEST*/
+};
+
+static void test_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p, uint16_t* src_data, CommandInfo * cmd_info)
+{
+  print_param(stderr, p);
+  uint64_t size = ml_shape_size(&p->src->shape, CVK_FMT_I8);
+  uint64_t bytesize = size * fmt_size(p->src->fmt);
+
+  matrix_copy_s2d_g2l(rt_handle, cvk_ctx, p->src, (uint8_t*)src_data);
+  cvk_ctx->ops->tdma_l2g_matrix_copy_compressed(cvk_ctx, p);
+  CVI_RT_Submit(cvk_ctx);
+
+  int is_signed = (p->src->fmt == CVK_FMT_I8);
+  int data_type = (p->src->fmt == CVK_FMT_BF16) ? 1 : 0;
+  size_t bs_size;
+
+  uint16_t *ref_data = (uint16_t *)test_vlc_compress((uint8_t *)src_data, bytesize, is_signed, data_type, &bs_size, cmd_info, NULL);
+  uint16_t *dst_data = (uint16_t *)cmpr_matrix_copy_d2s(rt_handle, p->dst);
+
+  /* The comparison and cleanup were elided in this hunk; they are
+   * reconstructed by analogy with the bf16 tensor variant of this test,
+   * which compares the compressed bitstream 16 bits at a time. */
+  for (uint64_t i = 0; i < bs_size / 2; i++) {
+    if (dst_data[i] != ref_data[i]) {
+      fprintf(stderr, "vlc compress comparing failed at dst[%" PRIu64 "], got %d, exp %d\n",
+          i, dst_data[i], ref_data[i]);
+      exit(-1);
+    }
+  }
+
+  free(dst_data);
+  free(ref_data);
+}
+
+static void destroy_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p)
+{
+  cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->src);
+  free_cmpr_matrix_dev_mem(rt_handle, p->dst);
+}
+
+static void test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c)
+{
+  cvk_fmt_t fmts[] = { CVK_FMT_BF16 };
+  uint8_t fmts_sz = sizeof(fmts) / sizeof(fmts[0]);
+
+  for (uint32_t row = 1; row < 13; row += 2) {
+    c->src_shape.n = row;
+    c->dst_shape.row = row;
+    for (int src_align = 0; src_align < 2; src_align++) {
+      for (uint8_t fmt_i = 0; fmt_i < fmts_sz; fmt_i++) {
+        cvk_fmt_t fmt = fmts[fmt_i];
+        param_t p;
+        memset(&p, 0, sizeof(p));
+        p.src = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, c->src_shape, fmt, src_align);
+
+        uint64_t size = ml_shape_size(&p.src->shape, CVK_FMT_I8);
+        uint16_t *src_data = (uint16_t *)malloc(sizeof(uint16_t) * size);
+        test_vlc_init_testdata((uint8_t *)src_data, size, fmt == CVK_FMT_I8, fmt == CVK_FMT_BF16);
+
+        //size_t bs_size;
+        CommandInfo cmd_info;
+        int is_signed = (p.src->fmt == CVK_FMT_I8);
+        int data_type = (p.src->fmt == CVK_FMT_BF16) ? 1 : 0;
+
+        memset(&cmd_info, 0, sizeof(CommandInfo));
+        cmd_info.signedness = is_signed;
+        cmd_info.is_bfloat16 = data_type;
+        cmd_info.bias0 = 127;
+
+        // fmt, &bs_size, &cmd_info);
+
+        /* The compressed-destination allocation was elided in this hunk;
+         * the helper name is assumed from free_cmpr_matrix_dev_mem() and
+         * the surviving argument fragment. */
+        p.dst = alloc_cmpr_matrix_dev_mem(rt_handle, c->dst_shape, p.src->fmt, &cmd_info);
+
+        test_param_l2g(rt_handle, cvk_ctx, &p, src_data, &cmd_info);
+        destroy_param_l2g(rt_handle, cvk_ctx, &p);
+        free(src_data);
+      }
+    }
+  }
+}
+
+int main(int argc, char **argv)
+{
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]);
+  for (uint32_t i = 0; i < nr_cases; i++)
+    test_one_case(rt_handle, cvk_ctx, &g_cases[i]);
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return 0;
+}
diff --git a/cviruntime/test/181x/test_181x_tdma_l2g_bf16_tensor_copy_nc_transposed.c b/cviruntime/test/181x/test_181x_tdma_l2g_bf16_tensor_copy_nc_transposed.c
new file mode 100644
index 000000000..4466224c5
--- /dev/null
+++ b/cviruntime/test/181x/test_181x_tdma_l2g_bf16_tensor_copy_nc_transposed.c
@@ -0,0 +1,306 @@
+#include <assert.h>
+#include <inttypes.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "test_cvikernel_util.h"
+
+
+typedef cvk_tdma_l2g_tensor_copy_nc_transposed_param_t param_t;
+
+static void __print_param(const char *tag, FILE *f, param_t *p)
+{
+  fprintf(
+      f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n",
+      tag,
+      p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w,
+      p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w);
+}
+
+#define print_param(f, p) __print_param(__func__, f, p)
+
+typedef struct {
+  cvk_fmt_t src_fmt;
+  cvk_fmt_t dst_fmt;
+} fmt_type_t;
+
+static fmt_type_t input_fmt[] = {
+  {CVK_FMT_BF16, CVK_FMT_BF16},
+  {CVK_FMT_BF16, CVK_FMT_I8},
+  {CVK_FMT_BF16, CVK_FMT_U8},
+};
+
+typedef struct {
+  cvk_tl_shape_t src_shape;
+  cvk_tg_shape_t dst_shape;
+} case_t;
+
+static case_t g_cases[] = {
+  {
+    { 1, 1, 1, 1 },
+    { 1, 1, 1, 1 },
+  }, {
+    { 1, 1, 1, 2 },
+    { 1, 1, 1, 2 },
+  }, {
+    { 1, 1, 1, 2 },
+    { 1, 1, 2, 1 },
+  }, {
+    { 1, 1, 7, 2 },
+    { 1, 1, 7, 2 },
+  }, {
+    { 1, 1, 7, 2 },
+    { 1, 1, 2, 7 },
+  }, {
+    { 1, 1, 17, 13 },
+    { 1, 1, 17, 13 },
+  }, {
+    { 1, 1, 17, 13 },
+    { 1, 1, 13, 17 },
+  }, {
+    { 1, 1, 10, 60 },
+    { 1, 1, 10, 60 },
+  }, {
+    { 1, 1, 10, 60 },
+    { 1, 1, 2, 300 },
+  }, {
+    { 1, 1, 10, 60 },
+    { 1, 1, 3, 200 },
+  }, {
+    { 1, 1, 10, 60 },
+    { 1, 1, 4, 150 },
+  }, {
+    { 1, 1, 10, 60 },
+    { 1, 1, 5, 120 },
+  }, {
+    { 1, 1, 10, 60 },
+    { 1, 1, 60, 10 },
+  }, {
+    { 1, 1, 120, 5 },
+    { 1, 1, 120, 5 },
+  }, {
+    { 1, 2, 1, 1 },
+    { 2, 1, 1, 1 },
+  }, {
+    { 2, 17, 1, 4 },
+    { 17, 2, 1, 4 },
+  }, {
+    { 2, 17, 1, 4 },
+    { 17, 2, 2, 2 },
+  }, {
+    { 2, 17, 1, 4 },
+    { 17, 2, 4, 1 },
+  }, {
+    { 17, 2, 2, 2 },
+    { 2, 17, 2, 2 },
+  }, {
+    { 17, 2, 4, 1 },
+    { 2, 17, 4, 1 },
+  }, {
+    { 3, 16, 1, 1 },
+    { 16, 3, 1, 1 },
+  }, {
+    { 3, 39, 23, 17 },
+    { 39, 3, 23, 17 },
+  }, {
+    { 3, 39, 17, 23 },
+    { 39, 3, 17, 23 },
+  }, {
+    { 3, 36, 16, 20 },
+    { 36, 3, 16, 20 },
+  }, {
+    { 3, 36, 16, 20 },
+    { 36, 3, 2, 160 },
+  }, {
+    { 3, 36, 16, 20 },
+    { 36, 3, 4, 80 },
+  }, {
+    { 3, 36, 16, 20 },
+    { 36, 3, 8, 40 },
+  }, {
+    { 3, 36, 16, 20 },
+    { 36, 3, 20, 16 },
+  }, {
+    { 3, 36, 16, 20 },
+    { 36, 3, 32, 10 },
+  }, {
+    { 3, 36, 16, 20 },
+    { 36, 3, 64, 5 },
+#if 0 // Not enough local memory for 1810
+  }, {
+    { 5, 39, 17, 23 },
+    {
39, 5, 17, 23 }, + }, { + { 20, 35, 2, 2 }, + { 35, 20, 2, 2 }, + }, { + { 35, 20, 2, 2 }, + { 20, 35, 2, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 160, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 2, 160 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 4, 80 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 8, 40 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 10, 32 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 20, 16 }, + }, { + { 39, 5, 23, 17 }, + { 5, 39, 23, 17 }, +#endif + } +}; + +static void l2g_tensor_copy_nc_transposed_ref( + param_t *p, uint16_t ref_data[], uint16_t src_data[]) +{ + cvk_tl_shape_t s = p->src->shape; + uint32_t n = s.n; + uint32_t c = s.c; + uint32_t hw = s.h * s.w; + + for (uint32_t ni = 0; ni < n; ni++) { + for (uint32_t ci = 0; ci < c; ci++) { + for (uint32_t hwi = 0; hwi < hw; hwi++) { + uint32_t src_i = ni * c * hw + ci * hw + hwi; + uint32_t dst_i = ci * n * hw + ni * hw + hwi; + if(p->src->fmt == CVK_FMT_BF16 && p->dst->fmt == CVK_FMT_BF16) + ref_data[dst_i] = src_data[src_i]; + else if (p->src->fmt == CVK_FMT_BF16 && (p->dst->fmt == CVK_FMT_I8 || p->dst->fmt == CVK_FMT_U8)) { + uint8_t sign = p->dst->fmt == CVK_FMT_I8 ? 1 : 0; + uint8_t val = sign ? (uint8_t) cvk_convert_bf16_s8(src_data[src_i]) : (uint8_t)cvk_convert_bf16_u8(src_data[src_i]); + ref_data[dst_i] = val; + } else if(p->dst->fmt == p->src->fmt){ //i8->i8 + ref_data[dst_i] = src_data[src_i]; + } else { + fprintf(stderr, "Error src_fmt_type (%d) or dst_fmttype (%d)", p->src->fmt, p->dst->fmt); + exit(-1); + } + } + } + } +} + +static int test_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + print_param(stderr, p); + int ret = 0; + uint64_t size = tl_shape_size(&p->src->shape, CVK_FMT_I8); + + uint16_t *src_data = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!src_data) + return -1; + + float val = -100; + for (uint64_t i = 0; i < size; i++) { + src_data[i] = test_generate_bf16_corner_val(val); + val += 0.1; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, p->src, (uint8_t *)src_data); + cvk_ctx->ops->tdma_l2g_bf16_tensor_copy_nc_transposed(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + + uint16_t *dst_data = (uint16_t*) tensor_copy_d2s(rt_handle, p->dst); + uint16_t *ref_data = (uint16_t *)malloc(sizeof(uint16_t) * size); + if (!dst_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + l2g_tensor_copy_nc_transposed_ref(p, ref_data, src_data); + + if(p->dst->fmt == CVK_FMT_BF16 && p->src->fmt == CVK_FMT_BF16) { + for (uint64_t i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + ret = -1; + break; + } + } + } else if(p->dst->fmt == CVK_FMT_U8 || p->dst->fmt == CVK_FMT_I8) { + for (uint64_t i = 0; i < size; i++) { + uint32_t shift = (i%2)*8; + if ((uint8_t)(dst_data[i/2] >> shift) != (uint8_t)ref_data[i]) { + fprintf(stderr, "comparing (bf16->i8/uint8_t) failed at dst[%" PRIu64 "], got %x, exp %x\n", + i,(uint8_t) (dst_data[i/2] >> shift) , ref_data[i]); + ret = -1; + } + } + } else { + fprintf(stderr, "Error compreing type src_fmt_type (%d) or dst_fmttype (%d)", p->src->fmt, p->dst->fmt); + ret = -1; + } + +fail_exit: + free(src_data); + free(dst_data); + free(ref_data); + + return ret; +} + +static void destroy_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + free_tensor_dev_mem(rt_handle, p->dst); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->src); +} + +static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + 
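+  /*
+   * Each case handed to this function is swept over every
+   * (src_fmt, dst_fmt) pair in input_fmt -- bf16->bf16 (pure copy) plus
+   * bf16->i8 and bf16->u8 (narrowing) -- and over both local-memory
+   * allocation modes (src_align = 0 compact, 1 aligned), so one g_cases
+   * entry exercises six distinct TDMA transfers.
+   */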
int ret = 0; + uint32_t nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (uint32_t i = 0; i < nr_fmt; i++) { + for (int src_align = 0; src_align < 2; src_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->src_shape, input_fmt[i].src_fmt, src_align); + p.dst = alloc_tensor_dev_mem(rt_handle, cvk_ctx, c->dst_shape, input_fmt[i].dst_fmt); + ret |= test_param_l2g(rt_handle, cvk_ctx, &p); + destroy_param_l2g(rt_handle, cvk_ctx, &p); + } + } + return ret; +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + int ret = 0; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_tdma_l2g_bf16_tensor_fill_constant.c b/cviruntime/test/181x/test_181x_tdma_l2g_bf16_tensor_fill_constant.c new file mode 100644 index 000000000..4825d0ede --- /dev/null +++ b/cviruntime/test/181x/test_181x_tdma_l2g_bf16_tensor_fill_constant.c @@ -0,0 +1,166 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + + +typedef cvk_tdma_l2g_tensor_fill_constant_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: %u => (%u, %u, %u, %u)\n", + tag, p->constant, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + float constant; + cvk_tg_shape_t dst_shape; +} case_t; + +typedef struct { + cvk_fmt_t src_fmt; + cvk_fmt_t dst_fmt; +} fmt_type_t; + +static fmt_type_t input_fmt[] = { + {CVK_FMT_BF16, CVK_FMT_BF16}, +}; + +static case_t g_cases[] = { + { + 37, { 1, 1, 1, 1 } + }, { + 39, { 1, 1, 1, 2 } + }, { + 23, { 1, 1, 2, 1 } + }, { + 19, { 1, 1, 7, 2 } + }, { + 17, { 1, 1, 2, 7 } + }, { + 13, { 1, 1, 17, 13 } + }, { + 11, { 1, 1, 13, 17 } + }, { + 7, { 1, 1, 10, 60 } + }, { + 9, { 1, 1, 120, 5 } + }, { + 2, { 1, 2, 1, 1 } + }, { + 3, { 1, 1, 1, 2 } + }, { + 5, { 2, 17, 1, 4 } + }, { + 41, { 2, 1, 4, 17 } + }, { + 5, { 2, 17, 1, 4 } + }, { + 9, { 2, 1, 17, 4 } + }, { + 17, { 3, 16, 1, 1 } + }, { + 26, { 3, 1, 2, 8 } + }, { + 103, { 3, 39, 17, 23 } + }, { + 255, { 3, 17, 39, 23 } + }, { + 254, { 3, 36, 16, 20 } + }, { + 127, { 3, 18, 1, 640 } + }, { + 128, { 5, 39, 17, 23 } + }, { + 129, { 5, 17, 39, 23 } + }, { + 55, { 20, 35, 2, 2 } + }, { + 1, { 20, 7, 10, 2 } + } +}; + +static void l2g_tensor_fill_constant_ref(param_t *p, uint16_t ref_data[]) +{ + uint64_t size = tg_shape_size(&p->dst->shape, p->dst->fmt); + printf("float =%x\n",p->constant); + for (uint64_t i = 0; i < size/fmt_size(p->dst->fmt); i++) + ref_data[i] = p->constant; +} + +static void test_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + print_param(stderr, p); + uint64_t size = tg_shape_size(&p->dst->shape, p->dst->fmt); + + cvk_ctx->ops->tdma_l2g_tensor_fill_constant(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + uint16_t *dst_data = (uint16_t*)tensor_copy_d2s(rt_handle, p->dst); + + uint16_t *ref_data = (uint16_t *)malloc(size); + l2g_tensor_fill_constant_ref(p, ref_data); 
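+  /*
+   * p->constant carries a raw bf16 bit pattern (test_generate_bf16_corner_val
+   * returns the encoded bits, not an IEEE float). bf16 is the upper half of
+   * an fp32, so a minimal encoder -- sketched here with a hypothetical helper
+   * name, not part of this test suite -- would be:
+   *
+   *   static inline uint16_t fp32_to_bf16(float f)
+   *   {
+   *       uint32_t u;
+   *       memcpy(&u, &f, sizeof(u));   // bit-copy avoids aliasing issues
+   *       return (uint16_t)(u >> 16);  // keep sign, exponent, top mantissa bits
+   *   }
+   *
+   * which is why ref_data and dst_data can be compared as raw uint16_t
+   * words in the loop below.
+   */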
+ + for (uint64_t i = 0; i < size/sizeof(uint16_t); i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(dst_data); + free(ref_data); +} + +static void destroy_param_l2g(CVI_RT_HANDLE rt_handle, param_t *p) +{ + free_tensor_dev_mem(rt_handle, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + uint32_t nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (uint32_t i = 0; i < nr_fmt; i++) { + param_t p; + memset(&p, 0, sizeof(p)); + p.constant = test_generate_bf16_corner_val(c->constant); + p.dst = alloc_tensor_dev_mem(rt_handle, cvk_ctx, c->dst_shape, input_fmt[i].src_fmt); + test_param_l2g(rt_handle, cvk_ctx, &p); + destroy_param_l2g(rt_handle, &p); + } +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return 0; +} \ No newline at end of file diff --git a/cviruntime/test/181x/test_181x_tdma_l2g_bf16_tensor_vlc_copy_compressed.c b/cviruntime/test/181x/test_181x_tdma_l2g_bf16_tensor_vlc_copy_compressed.c new file mode 100644 index 000000000..45e7468fc --- /dev/null +++ b/cviruntime/test/181x/test_181x_tdma_l2g_bf16_tensor_vlc_copy_compressed.c @@ -0,0 +1,174 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_l2g_tensor_copy_compressed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => %d-bit %s\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->bit_length, + (p->src->fmt == CVK_FMT_I8)? "signed": "unsigned"); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_tl_shape_t lmem_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 17, 13 } + }, + { + { 3, 39, 17, 23 } + }, +#if 0 // No enough local memory for 1810 + { + { 5, 39, 17, 23 } + }, + { + { 20, 35, 2, 2 } + }, +#endif +#ifndef ENABEL_SIMPLE_VLC_TEST + { + { 1, 1, 1, 1 } + }, + { + { 1, 1, 1, 2 } + }, + { + { 1, 1, 7, 2 } + }, + { + { 1, 1, 10, 60 } + }, + { + { 1, 2, 1, 1 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 3, 16, 1, 1 } + }, + { + { 3, 36, 16, 20 } + }, +#endif /* ifndef ENABEL_SIMPLE_VLC_TEST*/ +}; + +static void test_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p, CommandInfo* cmd_info, uint16_t *src_data) +{ + print_param(stderr, p); + uint64_t bytesize = tl_shape_size(&p->src->shape, p->src->fmt); + int is_signed = (p->src->fmt == CVK_FMT_I8); + uint8_t data_type = (p->src->fmt == CVK_FMT_BF16) ? 
1 : 0;
+  size_t bs_size = 0;
+
+  tensor_copy_s2d_g2l(rt_handle, cvk_ctx, p->src, (uint8_t *)src_data);
+  cvk_ctx->ops->tdma_l2g_tensor_copy_compressed(cvk_ctx, p);
+  CVI_RT_Submit(cvk_ctx);
+
+  uint16_t *dst_data = (uint16_t *)cmpr_tensor_copy_d2s(rt_handle, p->dst);
+  uint16_t *ref_data = (uint16_t *)test_vlc_compress((uint8_t *)src_data, bytesize, is_signed, data_type, &bs_size, cmd_info, NULL);
+
+  for (uint64_t i = 0; i < bs_size / 2; i++) {
+    if (dst_data[i] != ref_data[i]) {
+      fprintf(stderr, "vlc compress comparing failed at dst[%" PRIu64 "], got %d, exp %d\n",
+          i, dst_data[i], ref_data[i]);
+
+      exit(-1);
+    }
+  }
+
+  free(dst_data);
+  free(ref_data);
+}
+
+static void destroy_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p)
+{
+  free_cmpr_tensor_dev_mem(rt_handle, p->dst);
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->src);
+}
+
+static void test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c)
+{
+  cvk_fmt_t fmts[] = { CVK_FMT_BF16 };
+  uint8_t fmts_sz = sizeof(fmts) / sizeof(fmts[0]);
+
+  for (int src_align = 0; src_align < 2; src_align++) {
+    for (uint8_t fmt_i = 0; fmt_i < fmts_sz; fmt_i++) {
+      cvk_fmt_t fmt = fmts[fmt_i];
+      uint8_t data_type = (fmt == CVK_FMT_BF16) ? 1 : 0;
+      param_t p;
+      memset(&p, 0, sizeof(p));
+
+      p.src = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->lmem_shape, fmt, src_align);
+      assert(p.src);
+
+      CommandInfo cmd_info;
+      memset(&cmd_info, 0, sizeof(CommandInfo));
+      uint64_t in_size = tl_shape_size(&p.src->shape, CVK_FMT_I8);
+
+      uint16_t *src_data = (uint16_t *)malloc(sizeof(uint16_t) * in_size);
+      test_vlc_init_testdata((uint8_t *)src_data, in_size, fmt == CVK_FMT_I8, fmt == CVK_FMT_BF16);
+
+      int is_signed = (p.src->fmt == CVK_FMT_I8);
+      cmd_info.signedness = is_signed;
+      cmd_info.is_bfloat16 = data_type;
+      cmd_info.bias0 = 127;
+
+      cvk_tg_shape_t tg_shape =
+          tg_shape_t4(c->lmem_shape.n, c->lmem_shape.c, c->lmem_shape.h, c->lmem_shape.w);
+      p.dst = alloc_cmpr_tensor_dev_mem(rt_handle, cvk_ctx, tg_shape, fmt, &cmd_info);
+      test_param_l2g(rt_handle, cvk_ctx, &p, &cmd_info, src_data);
+      destroy_param_l2g(rt_handle, cvk_ctx, &p);
+
+      free(src_data);
+    }
+  }
+}
+
+int main(int argc, char **argv)
+{
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]);
+  for (uint32_t i = 0; i < nr_cases; i++)
+    test_one_case(rt_handle, cvk_ctx, &g_cases[i]);
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return 0;
+}
diff --git a/cviruntime/test/181x/test_181x_tdma_l2g_matrix_vlc_copy_compressed.c b/cviruntime/test/181x/test_181x_tdma_l2g_matrix_vlc_copy_compressed.c
new file mode 100644
index 000000000..b9a1329a7
--- /dev/null
+++ b/cviruntime/test/181x/test_181x_tdma_l2g_matrix_vlc_copy_compressed.c
@@ -0,0 +1,182 @@
+#include <assert.h>
+#include <inttypes.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "test_cvikernel_util.h"
+
+typedef cvk_tdma_l2g_matrix_copy_compressed_param_t param_t;
+
+static void __print_param(const char *tag, FILE *f, param_t *p)
+{
+  fprintf(
+      f, "%s: (%u, %u, %u, %u) => (%u, %u)\n",
+      tag,
+      p->src->shape.n, p->src->shape.c, p->src->shape.w, p->src->shape.col,
+      p->dst->m.shape.row, p->dst->m.shape.col);
+}
+
+#define print_param(f, p) __print_param(__func__, f, p)
+
+typedef struct {
+  cvk_ml_shape_t src_shape;
+  cvk_mg_shape_t dst_shape;
+} case_t;
+
+static case_t g_cases[] = {
+  {
+    { 0, 2, 4, 7 },
+    { 0, 7 },
+  },
+  {
+    { 0, 1, 2, 2 },
+    { 1, 2 },
+  },
+  {
+    { 0, 3, 7, 17 },
+    { 0, 17 },
+  },
+  {
+    { 0, 17, 1, 17 },
+    { 0, 17 },
+  },
+  {
+    { 0, 60, 1, 60 },
+    { 0, 60 },
+  },
+#ifndef ENABEL_SIMPLE_VLC_TEST
+  {
+    { 0, 1, 1, 1 },
+    { 0, 1 },
+  },
+  {
+    { 0, 2, 1, 2 },
+    { 1, 2 },
+  },
+  {
+    { 0, 1, 7, 7 },
+    { 0, 7 },
+  },
+  {
+    { 0, 7, 1, 7 },
+    { 0, 7 },
+  },
+  {
+    { 0, 1, 17, 17 },
+    { 0, 17 },
+  },
+  {
+    { 0, 1, 60, 60 },
+    { 0, 60 },
+  },
+  {
+    { 0, 30, 2, 60 },
+    { 0, 60 },
+  },
+#endif /* ifndef ENABEL_SIMPLE_VLC_TEST*/
+};
+
+static void test_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p, uint8_t* src_data, CommandInfo * cmd_info)
+{
+  print_param(stderr, p);
+  uint64_t size = ml_shape_size(&p->src->shape, p->src->fmt);
+
+  matrix_copy_s2d_g2l(rt_handle, cvk_ctx, p->src, src_data);
+  cvk_ctx->ops->tdma_l2g_matrix_copy_compressed(cvk_ctx, p);
+  CVI_RT_Submit(cvk_ctx);
+
+  int is_signed = (p->src->fmt == CVK_FMT_I8);
+  int data_type = (p->src->fmt == CVK_FMT_BF16) ? 1 : 0;
+  size_t bs_size;
+
+  uint8_t *ref_data = test_vlc_compress(src_data, size, is_signed, data_type, &bs_size, cmd_info, NULL);
+  uint8_t *dst_data = cmpr_matrix_copy_d2s(rt_handle, p->dst);
+
+  for (uint64_t i = 0; i < bs_size; i++) {
+    if (dst_data[i] != ref_data[i]) {
+      fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n",
+          i, dst_data[i], ref_data[i]);
+      exit(-1);
+    }
+  }
+
+  free(dst_data);
+  free(ref_data);
+}
+
+
+static void destroy_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p)
+{
+  cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->src);
+  free_cmpr_matrix_dev_mem(rt_handle, p->dst);
+}
+
+static void test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c)
+{
+  cvk_fmt_t fmts[] = { CVK_FMT_I8, CVK_FMT_U8 };
+  uint8_t fmts_sz = sizeof(fmts) / sizeof(fmts[0]);
+
+  for (uint32_t row = 1; row < 13; row += 2) {
+    c->src_shape.n = row;
+    c->dst_shape.row = row;
+    for (int src_align = 0; src_align < 2; src_align++) {
+      for (uint8_t fmt_i = 0; fmt_i < fmts_sz; fmt_i++) {
+        cvk_fmt_t fmt = fmts[fmt_i];
+        param_t p;
+        memset(&p, 0, sizeof(p));
+        p.src = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, c->src_shape, fmt, src_align);
+
+        uint64_t size = ml_shape_size(&p.src->shape, p.src->fmt);
+        uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * size);
+        test_vlc_init_testdata(src_data, size, fmt == CVK_FMT_I8, fmt == CVK_FMT_BF16);
+
+        //size_t bs_size;
+        CommandInfo cmd_info;
+        memset(&cmd_info, 0, sizeof(CommandInfo));
+
+        // fmt, &bs_size, &cmd_info);
+
+        int is_signed = (p.src->fmt == CVK_FMT_I8);
+        cmd_info.signedness = is_signed;
+
+        /* The compressed-destination allocation was elided in this hunk; the
+         * helper name is assumed from free_cmpr_matrix_dev_mem() above and
+         * the surviving argument fragment. */
+        p.dst = alloc_cmpr_matrix_dev_mem(rt_handle, c->dst_shape, p.src->fmt, &cmd_info);
+
+        //printf ("row %u is_align %d fmt %d\n", row, src_align, fmt);
+        test_param_l2g(rt_handle, cvk_ctx, &p, src_data, &cmd_info);
+        destroy_param_l2g(rt_handle, cvk_ctx, &p);
+        free(src_data);
+      }
+    }
+  }
+}
+
+int main(int argc, char **argv)
+{
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]);
+  for (uint32_t i = 0; i < nr_cases; i++)
+    test_one_case(rt_handle, cvk_ctx, &g_cases[i]);
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return 0;
+}
diff --git
a/cviruntime/test/181x/test_181x_tdma_l2g_tensor_copy.c b/cviruntime/test/181x/test_181x_tdma_l2g_tensor_copy.c new file mode 100644 index 000000000..10e66fd86 --- /dev/null +++ b/cviruntime/test/181x/test_181x_tdma_l2g_tensor_copy.c @@ -0,0 +1,173 @@ +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_l2g_tensor_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_tl_shape_t src_shape; + cvk_tg_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 2, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 2, 7 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 13, 17 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 120, 5 }, + }, { + { 1, 2, 1, 1 }, + { 1, 1, 1, 2 }, + }, { + { 2, 17, 1, 4 }, + { 2, 1, 4, 17 }, + }, { + { 2, 17, 1, 4 }, + { 2, 1, 17, 4 }, + }, { + { 3, 16, 1, 1 }, + { 3, 1, 2, 8 }, + }, { + { 3, 39, 17, 23 }, + { 3, 17, 39, 23 }, + }, { + { 3, 36, 16, 20 }, + { 3, 18, 1, 640 }, + }, { + { 5, 39, 17, 23 }, + { 5, 17, 39, 23 }, + }, { + { 20, 35, 2, 2 }, + { 20, 7, 10, 2 }, + } +}; + +static void l2g_tensor_copy_ref(param_t *p, uint8_t ref_data[], uint8_t src_data[]) +{ + uint64_t size = tl_shape_size(&p->src->shape, p->src->fmt); + + for (uint64_t i = 0; i < size; i++) + ref_data[i] = src_data[i]; +} + +static int test_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + print_param(stderr, p); + int ret = 0; + uint64_t size = tl_shape_size(&p->src->shape, p->src->fmt); + + uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + uint8_t *dst_data = NULL, *ref_data = NULL; + if (!src_data) { + ret = -1; + goto fail_exit; + } + + for (uint64_t i = 0; i < size; i++) + src_data[i] = 200 + i; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, p->src, src_data); + + cvk_ctx->ops->tdma_l2g_tensor_copy(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + + dst_data = tensor_copy_d2s(rt_handle, p->dst); + ref_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!dst_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + l2g_tensor_copy_ref(p, ref_data, src_data); + + for (uint64_t i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit: + free(src_data); + free(dst_data); + free(ref_data); + + return ret; +} + +static void destroy_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + free_tensor_dev_mem(rt_handle, p->dst); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->src); +} + +static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + int ret = 0; + + for (int src_align = 0; src_align < 2; src_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->src_shape, CVK_FMT_I8, src_align); + p.dst = alloc_tensor_dev_mem(rt_handle, cvk_ctx, c->dst_shape, CVK_FMT_I8); + ret |= test_param_l2g(rt_handle, cvk_ctx, &p); + destroy_param_l2g(rt_handle, cvk_ctx, &p); + } + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = 
NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_tdma_l2g_tensor_copy_cw_transposed.c b/cviruntime/test/181x/test_181x_tdma_l2g_tensor_copy_cw_transposed.c new file mode 100644 index 000000000..d14eed5ca --- /dev/null +++ b/cviruntime/test/181x/test_181x_tdma_l2g_tensor_copy_cw_transposed.c @@ -0,0 +1,179 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_l2g_tensor_copy_cw_transposed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_tl_shape_t src_shape; + cvk_tg_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 2, 1, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 2, 7, 1 }, + }, { + { 1, 1, 17, 13 }, + { 1, 13, 17, 1 }, + }, { + { 1, 1, 10, 60 }, + { 1, 60, 10, 1 }, + }, { + { 1, 2, 1, 1 }, + { 1, 1, 1, 2 }, + }, { + { 2, 17, 1, 4 }, + { 2, 4, 1, 17 }, + }, { + { 2, 17, 3, 4 }, + { 2, 4, 3, 17 }, + }, { + { 3, 16, 7, 1 }, + { 3, 1, 7, 16 }, + }, { + { 3, 39, 17, 23 }, + { 3, 23, 17, 39 }, + }, { + { 3, 36, 16, 20 }, + { 3, 20, 16, 36 }, + }, { + { 5, 39, 17, 23 }, + { 5, 23, 17, 39 }, + }, { + { 20, 35, 2, 2 }, + { 20, 2, 2, 35 }, + }, { + { 20, 35, 3, 2 }, + { 20, 2, 3, 35 }, + } +}; + +static void l2g_tensor_copy_cw_transposed_ref( + param_t *p, uint8_t ref_data[], uint8_t src_data[]) +{ + cvk_tl_shape_t s = p->src->shape; + uint32_t n = s.n; + uint32_t c = s.c; + uint32_t h = s.h; + uint32_t w = s.w; + + for (uint32_t ni = 0; ni < n; ni++) { + for (uint32_t ci = 0; ci < c; ci++) { + for (uint32_t hi = 0; hi < h; hi++) { + for (uint32_t wi = 0; wi < w; wi++) { + uint32_t src_i = ni * c * h * w + ci * h * w + hi * w + wi; + uint32_t dst_i = ni * c * h * w + wi * h * c + hi * c + ci; + ref_data[dst_i] = src_data[src_i]; + } + } + } + } +} + +static void test_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + print_param(stderr, p); + uint64_t size = tl_shape_size(&p->src->shape, p->src->fmt); + + uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!src_data) + return; + + for (uint64_t i = 0; i < size; i++) + src_data[i] = 200 + i; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, p->src, src_data); + cvk_ctx->ops->tdma_l2g_tensor_copy_cw_transposed(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + uint8_t *dst_data = tensor_copy_d2s(rt_handle, p->dst); + + uint8_t *ref_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!dst_data || !ref_data) + goto fail_exit; + + l2g_tensor_copy_cw_transposed_ref(p, ref_data, src_data); + + for (uint64_t i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } 
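+
+  /*
+   * Quick check of the cw-transpose indexing used in the reference above:
+   * element (ni, ci, hi, wi) of the NCHW source lands at (ni, wi, hi, ci)
+   * in the destination. For (n, c, h, w) = (1, 2, 3, 1) this reduces to
+   * src_i = ci*3 + hi and dst_i = hi*2 + ci, so e.g. (ci=1, hi=0) moves
+   * from offset 3 to offset 1.
+   */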
+ +fail_exit: + free(src_data); + free(dst_data); + free(ref_data); +} + + +static void destroy_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + free_tensor_dev_mem(rt_handle, p->dst); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->src); +} + +static void test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + for (int src_align = 0; src_align < 2; src_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->src_shape, CVK_FMT_I8, src_align); + p.dst = alloc_tensor_dev_mem(rt_handle, cvk_ctx, c->dst_shape, CVK_FMT_I8); + test_param_l2g(rt_handle, cvk_ctx, &p); + destroy_param_l2g(rt_handle, cvk_ctx, &p); + + } +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return 0; +} diff --git a/cviruntime/test/181x/test_181x_tdma_l2g_tensor_copy_nc_transposed.c b/cviruntime/test/181x/test_181x_tdma_l2g_tensor_copy_nc_transposed.c new file mode 100644 index 000000000..1fffadca3 --- /dev/null +++ b/cviruntime/test/181x/test_181x_tdma_l2g_tensor_copy_nc_transposed.c @@ -0,0 +1,263 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_l2g_tensor_copy_nc_transposed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_tl_shape_t src_shape; + cvk_tg_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 1, 2 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 2, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 7, 2 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 2, 7 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 17, 13 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 13, 17 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 10, 60 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 2, 300 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 3, 200 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 4, 150 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 5, 120 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 60, 10 }, + }, { + { 1, 1, 120, 5 }, + { 1, 1, 120, 5 }, + }, { + { 1, 2, 1, 1 }, + { 2, 1, 1, 1 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 1, 4 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 2, 2 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 4, 1 }, + }, { + { 17, 2, 2, 2 }, + { 2, 17, 2, 2 }, + }, { + { 17, 2, 4, 1 }, + { 2, 17, 4, 1 }, + }, { + { 3, 16, 1, 1 }, + { 16, 3, 1, 1 }, + }, { + { 3, 39, 23, 17 }, + { 39, 3, 23, 17 }, + }, { + { 3, 39, 17, 23 }, + { 39, 3, 17, 23 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 16, 20 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 2, 160 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 4, 80 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 8, 40 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 20, 16 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 32, 10 
}, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 64, 5 }, + }, { + { 5, 39, 17, 23 }, + { 39, 5, 17, 23 }, + }, { + { 20, 35, 2, 2 }, + { 35, 20, 2, 2 }, + }, { + { 35, 20, 2, 2 }, + { 20, 35, 2, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 160, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 2, 160 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 4, 80 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 8, 40 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 10, 32 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 20, 16 }, + }, { + { 39, 5, 23, 17 }, + { 5, 39, 23, 17 }, + } +}; + +static void l2g_tensor_copy_nc_transposed_ref( + param_t *p, uint8_t ref_data[], uint8_t src_data[]) +{ + cvk_tl_shape_t s = p->src->shape; + uint32_t n = s.n; + uint32_t c = s.c; + uint32_t hw = s.h * s.w; + + for (uint32_t ni = 0; ni < n; ni++) { + for (uint32_t ci = 0; ci < c; ci++) { + for (uint32_t hwi = 0; hwi < hw; hwi++) { + uint32_t src_i = ni * c * hw + ci * hw + hwi; + uint32_t dst_i = ci * n * hw + ni * hw + hwi; + ref_data[dst_i] = src_data[src_i]; + } + } + } +} + +static int test_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + print_param(stderr, p); + int ret = 0; + uint64_t size = tl_shape_size(&p->src->shape, p->src->fmt); + + uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!src_data) + return -1; + + for (uint64_t i = 0; i < size; i++) + src_data[i] = 200 + i; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, p->src, src_data); + cvk_ctx->ops->tdma_l2g_tensor_copy_nc_transposed(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + uint8_t *dst_data = tensor_copy_d2s(rt_handle, p->dst); + + uint8_t *ref_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!dst_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + l2g_tensor_copy_nc_transposed_ref(p, ref_data, src_data); + + for (uint64_t i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit: + free(src_data); + free(dst_data); + free(ref_data); + + return ret; +} + +static void destroy_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + free_tensor_dev_mem(rt_handle, p->dst); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->src); +} + +static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + int ret = 0; + + for (int src_align = 0; src_align < 2; src_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->src_shape, CVK_FMT_I8, src_align); + p.dst = alloc_tensor_dev_mem(rt_handle, cvk_ctx, c->dst_shape, CVK_FMT_I8); + ret |= test_param_l2g(rt_handle, cvk_ctx, &p); + destroy_param_l2g(rt_handle, cvk_ctx, &p); + + } + return ret; +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + int ret = 0; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_tdma_l2g_tensor_vlc_copy_compressed.c b/cviruntime/test/181x/test_181x_tdma_l2g_tensor_vlc_copy_compressed.c new file mode 100644 index 
000000000..6dea5b6e4 --- /dev/null +++ b/cviruntime/test/181x/test_181x_tdma_l2g_tensor_vlc_copy_compressed.c @@ -0,0 +1,188 @@ +#include <assert.h> +#include <inttypes.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_l2g_tensor_copy_compressed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => %d-bit %s\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->bit_length, + (p->src->fmt == CVK_FMT_I8)? "signed": "unsigned"); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_tl_shape_t lmem_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 17, 13 } + }, + { + { 3, 39, 17, 23 } + }, + { + { 5, 39, 17, 23 } + }, + { + { 20, 35, 2, 2 } + }, +#ifndef ENABEL_SIMPLE_VLC_TEST + { + { 1, 1, 1, 1 } + }, + { + { 1, 1, 1, 2 } + }, + { + { 1, 1, 7, 2 } + }, + { + { 1, 1, 10, 60 } + }, + { + { 1, 2, 1, 1 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 3, 16, 1, 1 } + }, + { + { 3, 36, 16, 20 } + }, +#endif /* ifndef ENABEL_SIMPLE_VLC_TEST*/ +}; + +static uint64_t l2g_tensor_copy_vlc_compressed_ref( + param_t *p, uint8_t ref_data[], uint8_t src_data[], CommandInfo *cmd_info) +{ + uint64_t in_size = tl_shape_size(&p->src->shape, p->src->fmt); + size_t bs_size = 0; + + cvk_vlc_enc_int8(src_data, in_size, ref_data, &bs_size, cmd_info); + return bs_size; +} + +static int test_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p, CommandInfo* cmd_info_est, uint8_t *src_data) +{ + print_param(stderr, p); + int ret = 0; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, p->src, src_data); + cvk_ctx->ops->tdma_l2g_tensor_copy_compressed(cvk_ctx, p); + CVI_RT_Submit(cvk_ctx); + + uint8_t *dst_data = cmpr_tensor_copy_d2s(rt_handle, p->dst); + uint8_t *ref_data = (uint8_t *)malloc(sizeof(uint8_t) * p->dst->reserved_size); + if (!dst_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + uint64_t bs_size = l2g_tensor_copy_vlc_compressed_ref(p, ref_data, src_data, cmd_info_est); + + for (uint64_t i = 0; i < bs_size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "vlc compress comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit: + free(dst_data); + free(ref_data); + + return ret; +} + +static void destroy_param_l2g(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + free_cmpr_tensor_dev_mem(rt_handle, p->dst); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->src); +} + +static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + int ret = 0; + cvk_fmt_t fmts[] = { CVK_FMT_I8, CVK_FMT_U8 }; + + for (int src_align = 0; src_align < 2; src_align++) { + for (uint8_t fmt_i = 0; fmt_i < 2; fmt_i++) { + cvk_fmt_t fmt = fmts[fmt_i]; + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->lmem_shape, fmt, src_align); + assert(p.src); + + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + uint64_t in_size = tl_shape_size(&p.src->shape, p.src->fmt); + + uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * in_size); + if (!src_data) + return -1; + + test_vlc_init_testdata(src_data, in_size, fmt == CVK_FMT_I8, fmt == CVK_FMT_BF16); + + int is_signed = (p.src->fmt == CVK_FMT_I8); + cmd_info.signedness = is_signed; + + cvk_tg_shape_t tg_shape = + tg_shape_t4(c->lmem_shape.n, c->lmem_shape.c, c->lmem_shape.h, c->lmem_shape.w); + p.dst = alloc_cmpr_tensor_dev_mem(rt_handle, cvk_ctx, tg_shape, fmt, &cmd_info); + ret |= test_param_l2g(rt_handle, cvk_ctx, &p, &cmd_info, src_data); + destroy_param_l2g(rt_handle, cvk_ctx, &p); + + free(src_data); + } + } + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1;
+ } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_tdma_l2l_tensor_copy.c b/cviruntime/test/181x/test_181x_tdma_l2l_tensor_copy.c new file mode 100644 index 000000000..3c6f5d495 --- /dev/null +++ b/cviruntime/test/181x/test_181x_tdma_l2l_tensor_copy.c @@ -0,0 +1,179 @@ +#include <inttypes.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_l2l_tensor_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_tl_shape_t src_shape; + cvk_tl_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 2, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 2, 7 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 13, 17 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 120, 5 }, + }, { + { 1, 2, 1, 1 }, + { 1, 1, 1, 2 }, + }, { + { 2, 17, 1, 4 }, + { 2, 1, 4, 17 }, + }, { + { 2, 17, 1, 4 }, + { 2, 1, 17, 4 }, + }, { + { 3, 16, 1, 1 }, + { 3, 1, 2, 8 }, + }, { + { 3, 39, 17, 23 }, + { 3, 17, 39, 23 }, + }, { + { 3, 36, 16, 20 }, + { 3, 18, 1, 640 }, + }, { + { 5, 39, 17, 23 }, + { 5, 17, 39, 23 }, + }, { + { 20, 35, 2, 2 }, + { 20, 7, 10, 2 }, + } +}; + +static void destroy_param(cvk_context_t *cvk_ctx, param_t *p) +{ + if (p->dst) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->dst); + if (p->src) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->src); +} + +static void l2l_tensor_copy_ref(param_t *p, uint8_t ref_data[], uint8_t src_data[]) +{ + uint64_t size = tl_shape_size(&p->src->shape, p->src->fmt); + + for (uint64_t i = 0; i < size; i++) + ref_data[i] = src_data[i]; +} + +static int test_param(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + print_param(stderr, p); + int ret = 0; + uint64_t size = tl_shape_size(&p->src->shape, p->src->fmt); + + uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!src_data) + return -1; + + for (uint64_t i = 0; i < size; i++) + src_data[i] = 200 + i; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, p->src, src_data); + + cvk_ctx->ops->tdma_l2l_tensor_copy(cvk_ctx, p); + + uint8_t *dst_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, p->dst); + uint8_t *ref_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + if (!dst_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + l2l_tensor_copy_ref(p, ref_data, src_data); + + for (uint64_t i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit: + free(src_data); + free(dst_data); + free(ref_data); + + return ret; +} + +static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + int ret = 0; + + for (int src_align = 0; src_align < 2; src_align++) { + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + p.src = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->src_shape, CVK_FMT_I8, src_align); + p.dst = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx,
c->dst_shape, CVK_FMT_I8, dst_align); + if (p.src && p.dst) + ret |= test_param(rt_handle, cvk_ctx, &p); + else if (!p.src) + fprintf(stderr, "fail to alloc src (%u, %u, %u, %u)\n", + c->src_shape.n, c->src_shape.c, c->src_shape.h, c->src_shape.w); + else if (!p.dst) + fprintf(stderr, "fail to alloc dst (%u, %u, %u, %u)\n", + c->dst_shape.n, c->dst_shape.c, c->dst_shape.h, c->dst_shape.w); + destroy_param(cvk_ctx, &p); + } + } + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_tdma_matrix_vlc_decompress_compress.c b/cviruntime/test/181x/test_181x_tdma_matrix_vlc_decompress_compress.c new file mode 100644 index 000000000..8a46ead0a --- /dev/null +++ b/cviruntime/test/181x/test_181x_tdma_matrix_vlc_decompress_compress.c @@ -0,0 +1,208 @@ +#include <assert.h> +#include <inttypes.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_g2l_matrix_copy_decompressed_param_t decompress_param_t; +typedef cvk_tdma_l2g_matrix_copy_compressed_param_t compress_param_t; + +typedef struct{ + decompress_param_t dec_p; + compress_param_t com_p; +} param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => %s\n", + tag, + p->dec_p.dst->shape.n, p->dec_p.dst->shape.c, p->dec_p.dst->shape.w, p->dec_p.dst->shape.col, + (p->dec_p.dst->fmt == CVK_FMT_I8)? "signed": "unsigned"); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_mg_shape_t src_shape; + cvk_ml_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 1 }, + { 0, 1, 1, 1 }, + }, + { + { 0, 2 }, + { 0, 1, 2, 2 }, + }, + { + { 0, 2 }, + { 0, 2, 1, 2 }, + }, + { + { 0, 7 }, + { 0, 1, 7, 7 }, + }, +#ifndef ENABEL_SIMPLE_VLC_TEST + { + { 0, 7 }, + { 0, 2, 4, 7 }, + }, + { + { 0, 7 }, + { 0, 7, 1, 7 }, + }, + { + { 0, 17 }, + { 0, 1, 17, 17 }, + }, + { + { 0, 17 }, + { 0, 3, 7, 17 }, + }, + { + { 0, 17 }, + { 0, 17, 1, 17 }, + }, + { + { 0, 60 }, + { 0, 1, 60, 60 }, + }, + { + { 0, 60 }, + { 0, 30, 2, 60 }, + }, + { + { 0, 60 }, + { 0, 60, 1, 60 }, + } +#endif /* ifndef ENABEL_SIMPLE_VLC_TEST*/ +}; + +static void test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p, uint8_t *src_data, + CommandInfo* cmd_info) +{ + print_param(stderr, p); + uint64_t size = ml_shape_size(&p->dec_p.dst->shape, p->dec_p.dst->fmt); + int is_signed = (p->dec_p.dst->fmt == CVK_FMT_I8); + + uint8_t *gmem_data; + size_t bs_size; + size_t data_type = (p->dec_p.dst->fmt == CVK_FMT_BF16) ? 1 : 0; + + // command info + gmem_data = test_vlc_compress(src_data, size, is_signed, data_type, &bs_size, cmd_info, NULL); + + //1. send compressed one to gaddr and decompress from gaddr to local + cmpr_matrix_copy_s2d(rt_handle, p->dec_p.src, gmem_data); + cvk_ctx->ops->tdma_g2l_matrix_copy_decompressed(cvk_ctx, &p->dec_p); + CVI_RT_Submit(cvk_ctx); + + //2.
 compress the data in local memory back to gaddr + cvk_ctx->ops->tdma_l2g_matrix_copy_compressed(cvk_ctx, &p->com_p); + CVI_RT_Submit(cvk_ctx); + + //3. get final data + uint8_t *dst_data = cmpr_matrix_copy_d2s(rt_handle, p->com_p.dst); + + for (uint64_t i = 0; i < bs_size ; i++) { + if (dst_data[i] != gmem_data[i]) { + fprintf(stderr, "vlc compress comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], gmem_data[i]); + exit(-1); + } + } + + free(dst_data); + free(gmem_data); +} + +static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + free_cmpr_matrix_dev_mem(rt_handle, p->dec_p.src); + free_cmpr_matrix_dev_mem(rt_handle, p->com_p.dst); + cvk_ctx->ops->lmem_free_matrix(cvk_ctx, p->dec_p.dst); +} + +static void test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + cvk_fmt_t fmts[] = { CVK_FMT_I8, CVK_FMT_U8 }; + uint8_t fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + + for (uint32_t row = 1; row < 13; row += 2) { + c->src_shape.row = row; + c->dst_shape.n = row; + for (int dst_align = 0; dst_align < 2; dst_align++) { + for (uint8_t fmt_i = 0; fmt_i < fmts_sz; fmt_i++) { + cvk_fmt_t fmt = fmts[fmt_i]; + param_t p; + memset(&p, 0, sizeof(p)); + //put compressed data to gaddr ->decompress to local -> compress to gaddr + + int is_signed = (fmt == CVK_FMT_I8); + + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + cmd_info.signedness = is_signed; + + //1. alloc compressed source in gaddr + p.dec_p.src = alloc_cmpr_matrix_dev_mem(rt_handle, c->src_shape, fmt, &cmd_info); + p.dec_p.dst = cvk_ctx->ops->lmem_alloc_matrix(cvk_ctx, c->dst_shape, fmt, dst_align); + assert(p.dec_p.dst); + + uint64_t size = ml_shape_size(&p.dec_p.dst->shape, p.dec_p.dst->fmt); + uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + assert(src_data); + test_vlc_init_testdata(src_data, size, fmt == CVK_FMT_I8, fmt == CVK_FMT_BF16); + + //2. alloc compress + p.com_p.src = p.dec_p.dst; //cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->lmem_shape, fmt, align); + p.com_p.dst = alloc_cmpr_matrix_dev_mem(rt_handle, c->src_shape, fmt, &cmd_info); + + //3.
test: the sequence like below: + //3.1 put compressed data to gaddr + //3.2 decompress to local + //3.3 compress to gaddr + //printf ("row %u is_align %d fmt %d\n", row, dst_align, fmt); + test_param_g2l(rt_handle, cvk_ctx, &p, src_data, &cmd_info); + destroy_param_g2l(rt_handle, cvk_ctx, &p); + free(src_data); + } + } + } +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return 0; +} diff --git a/cviruntime/test/181x/test_181x_tdma_tensor_vlc_decompress_compress.c b/cviruntime/test/181x/test_181x_tdma_tensor_vlc_decompress_compress.c new file mode 100644 index 000000000..6cb282562 --- /dev/null +++ b/cviruntime/test/181x/test_181x_tdma_tensor_vlc_decompress_compress.c @@ -0,0 +1,203 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef cvk_tdma_g2l_tensor_copy_decompressed_param_t decompress_param_t; +typedef cvk_tdma_l2g_tensor_copy_compressed_param_t compress_param_t; + +typedef struct{ + decompress_param_t dec_p; + compress_param_t com_p; +} param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => %d-bit %s\n", + tag, + p->dec_p.dst->shape.n, p->dec_p.dst->shape.c, p->dec_p.dst->shape.h, p->dec_p.dst->shape.w, + p->dec_p.src->bit_length, + (p->dec_p.dst->fmt == CVK_FMT_I8)? "signed": "unsigned"); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_tl_shape_t lmem_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 17, 13 } + }, + { + { 3, 39, 17, 23 } + }, + { + { 5, 39, 17, 23 } + }, + { + { 20, 35, 2, 2 } + }, +#ifndef ENABEL_SIMPLE_VLC_TEST + { + { 1, 1, 1, 1 } + }, + { + { 1, 1, 1, 2 } + }, + { + { 1, 1, 7, 2 } + }, + { + { 1, 1, 10, 60 } + }, + { + { 1, 2, 1, 1 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 3, 16, 1, 1 } + }, + { + { 3, 36, 16, 20 } + }, +#endif /* ifndef ENABEL_SIMPLE_VLC_TEST*/ +}; + +static int test_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p, cvk_cmpr_tg_t* dst) +{ + print_param(stderr, p); + int ret = 0; + uint64_t size = tl_shape_size(&p->dec_p.dst->shape, p->dec_p.dst->fmt); + int is_signed = (p->dec_p.dst->fmt == CVK_FMT_I8); + uint8_t *src_data = (uint8_t *)malloc(sizeof(uint8_t) * size); + uint8_t *gmem_data = NULL, *dst_data = NULL; + if (!src_data) { + ret = -1; + goto fail_exit; + } + + test_vlc_init_testdata(src_data, size, p->dec_p.dst->fmt == CVK_FMT_I8, p->dec_p.dst->fmt == CVK_FMT_BF16); + + size_t total_size; + size_t data_type = (p->dec_p.dst->fmt == CVK_FMT_BF16) ? 
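/* 1 = bf16 bitstream, 0 = 8-bit bitstream (layout selector for the VLC helpers) */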
1 : 0; + size_t in_size = size; + size_t bs_buf_size = get_out_bs_buf_size(size, data_type); + gmem_data = (uint8_t *) malloc(bs_buf_size * sizeof(uint8_t)); + if (!gmem_data) { + ret = -1; + goto fail_exit; + } + + // command info + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + cmd_info.signedness = is_signed; + + cvk_vlc_enc_int8(src_data, in_size, gmem_data, &total_size, &cmd_info); + + cmpr_tensor_copy_s2d(rt_handle, p->dec_p.src, gmem_data); + cvk_ctx->ops->tdma_g2l_tensor_copy_decompressed(cvk_ctx, &p->dec_p); + CVI_RT_Submit(cvk_ctx); + + dst->zero_guard_en = cmd_info.zero_guard_en; + dst->bias0 = cmd_info.bias0; + dst->bias1 = cmd_info.bias1; + p->com_p.dst = dst; + cvk_ctx->ops->tdma_l2g_tensor_copy_compressed(cvk_ctx, &p->com_p); + CVI_RT_Submit(cvk_ctx); + + dst_data = cmpr_tensor_copy_d2s(rt_handle, p->com_p.dst); + + for (uint64_t i = 0; i < total_size ; i++) { + if (dst_data[i] != gmem_data[i]) { + fprintf(stderr, "vlc compress comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], gmem_data[i]); + ret = -1; + break; + } + } + +fail_exit: + free(src_data); + free(dst_data); + free(gmem_data); + + return ret; +} + +static void destroy_param_g2l(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, param_t *p) +{ + free_cmpr_tensor_dev_mem(rt_handle, p->dec_p.src); + free_cmpr_tensor_dev_mem(rt_handle, p->com_p.dst); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, p->dec_p.dst); +} + +static int test_one_case(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, case_t *c) +{ + cvk_fmt_t fmts[2] = { CVK_FMT_I8, CVK_FMT_U8 }; + int ret = 0; + + for (int align = 0; align < 2; align++) { + for (uint8_t fmt_i = 0; fmt_i < 2; fmt_i++) { + cvk_fmt_t fmt = fmts[fmt_i]; + + param_t p; + memset(&p, 0, sizeof(p)); + cvk_tg_shape_t tg_shape = + tg_shape_t4(c->lmem_shape.n, c->lmem_shape.c, c->lmem_shape.h, c->lmem_shape.w); + p.dec_p.src = alloc_cmpr_tensor_dev_mem(rt_handle, cvk_ctx, tg_shape, fmt, NULL); + p.dec_p.dst = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->lmem_shape, fmt, align); + assert(p.dec_p.dst); + + p.com_p.src = p.dec_p.dst; //cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, c->lmem_shape, fmt, align); + assert(p.com_p.src); + cvk_cmpr_tg_t* dst = alloc_cmpr_tensor_dev_mem(rt_handle, cvk_ctx, tg_shape, fmt, NULL); + + ret |= test_param_g2l(rt_handle, cvk_ctx, &p, dst); + destroy_param_g2l(rt_handle, cvk_ctx, &p); + } + } + + return ret; +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + int ret = 0; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) + ret |= test_one_case(rt_handle, cvk_ctx, &g_cases[i]); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_tensor_add.c b/cviruntime/test/181x/test_181x_tensor_add.c new file mode 100644 index 000000000..b81052bcc --- /dev/null +++ b/cviruntime/test/181x/test_181x_tensor_add.c @@ -0,0 +1,184 @@ +#include <assert.h> +#include <inttypes.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "test_cvikernel_util.h" + +static void tl_add_ref( + uint8_t *ref_high, uint8_t *ref_low, + uint8_t *a_high, uint8_t *a_low, + uint8_t *b_high, uint8_t *b_low, + int rshift_bits, + uint64_t size, int relu_enable) +{ + for (uint64_t i = 0; i < size; i++) { + int32_t ta = ((int8_t)a_high[i] << 8) + a_low[i]; + int32_t tb = ((int8_t)b_high[i]
<< 8) + b_low[i]; + int32_t res = ta + tb; + + res += 1 << (rshift_bits - 1); + res >>= rshift_bits; + + if(relu_enable) + { + if (res > 127) + res = 127; + else if (res < -128) + res = -128; + + if(relu_enable) + if(res<0) + res=0; + ref_high[i] = 0; + ref_low[i] = res & 0xff; + + }else{ + if (res > 32767) + res = 32767; + else if (res < -32768) + res = -32768; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } + } +} + +static int test_tl_add(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 2; // 3 -> 2 for 1810 + int c = 39; + int h = 7; + int w = 37; + int rshift_bits; + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + uint8_t *a_high_data = (uint8_t *)malloc(size); + uint8_t *a_low_data = (uint8_t *)malloc(size); + uint8_t *b_high_data = (uint8_t *)malloc(size); + uint8_t *b_low_data = (uint8_t *)malloc(size); + uint8_t *ref_high_data = (uint8_t *)malloc(size); + uint8_t *ref_low_data = (uint8_t *)malloc(size); + if (!a_high_data || !a_low_data || !b_high_data || !b_low_data || !ref_high_data || !ref_low_data) { + ret = -1; + goto fail_exit; + } + + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + for (uint32_t i = 0; i < size; i++) { + a_high_data[i] = rand() % 64+ i ; + a_low_data[i] = i; + b_high_data[i] = (i + 250) / 20; + b_low_data[i] = 100 - i; + } + if(relu_enable) + rshift_bits = 7; + else + rshift_bits = 1; + + tl_add_ref(ref_high_data, ref_low_data, + a_high_data, a_low_data, + b_high_data, b_low_data, + rshift_bits, + size, relu_enable); + + cvk_tl_t *tl_a_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_a_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + uint8_t *res_high_data = NULL, *res_low_data = NULL; + if (!tl_a_low || !tl_a_high || !tl_b_low || !tl_b_high || !tl_res_low || !tl_res_high) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_low, a_low_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_high, a_high_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b_low, b_low_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b_high, b_high_data); + cvk_tiu_add_param_t p4; + p4.res_high = relu_enable ? 
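/* with relu, the sum saturates to int8 and the high byte is forced to zero in tl_add_ref, so no high-part result tensor is needed */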
0 : tl_res_high; + p4.res_low = tl_res_low; + p4.a_high = tl_a_high; + p4.a_low = tl_a_low; + p4.b_is_const = 0; + p4.b.high = tl_b_high; + p4.b.low = tl_b_low; + p4.rshift_bits = rshift_bits; + p4.relu_enable = relu_enable; + cvk_ctx->ops->tiu_add(cvk_ctx, &p4); + res_high_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_high); + res_low_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_low); + for (uint64_t i = 0; i < size; i++) { + if(!relu_enable) + if (res_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at res_high_data[%" PRIu64 "], got %d, exp %d\n", + i, res_high_data[i], ref_high_data[i]); + ret = -1; + } + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + ret = -1; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_low); + + free(res_high_data); + free(res_low_data); + } + +fail_exit: + free(a_high_data); + free(a_low_data); + free(b_high_data); + free(b_low_data); + free(ref_high_data); + free(ref_low_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_add(rt_handle, cvk_ctx, 0); + ret |= test_tl_add(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_tensor_add_const.c b/cviruntime/test/181x/test_181x_tensor_add_const.c new file mode 100644 index 000000000..00ee18b4a --- /dev/null +++ b/cviruntime/test/181x/test_181x_tensor_add_const.c @@ -0,0 +1,178 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_add_const_ref( + uint8_t *ref_high, uint8_t *ref_low, + uint8_t *a_high, uint8_t *a_low, + int16_t b, int b_is_signed, + int rshift_bits, + uint64_t size, int relu_enable) +{ + for (uint64_t i = 0; i < size; i++) { + int32_t ta = ((int8_t)a_high[i] << 8) + a_low[i]; + int32_t tb = b_is_signed? 
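/* sign-extend or zero-extend the 16-bit constant before adding */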
b: (uint16_t)b; + int32_t res = ta + tb; + res += 1 << (rshift_bits - 1); + res >>= rshift_bits; + + if(relu_enable) + { + if (res > 127) + res = 127; + else if (res < -128) + res = -128; + + if(relu_enable) + if(res<0) + res=0; + ref_high[i] = 0; + ref_low[i] = res & 0xff; + }else{ + if (res > 32767) + res = 32767; + else if (res < -32768) + res = -32768; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } + } +} + +static int test_tl_add_const(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 3; + int c = 39; + int h = 7; + int w = 37; + int rshift_bits; + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + uint8_t *a_high_data = (uint8_t *)malloc(size); + uint8_t *a_low_data = (uint8_t *)malloc(size); + uint8_t *ref_high_data = (uint8_t *)malloc(size); + uint8_t *ref_low_data = (uint8_t *)malloc(size); + if (!a_high_data || !a_low_data || !ref_high_data || !ref_low_data) { + ret = -1; + goto fail_exit; + } + + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + int16_t b; + int b_is_signed = 1; + for (uint32_t i = 0; i < size; i++) { + a_high_data[i] = rand() % 64+ i; + a_low_data[i] = i; + } + + if(relu_enable) + { + b=-64; + rshift_bits = 7; + } + else + { + b=-278; + rshift_bits = 1; + } + + tl_add_const_ref(ref_high_data, ref_low_data, + a_high_data, a_low_data, + b, b_is_signed, rshift_bits, size,relu_enable); + + cvk_tl_t *tl_a_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_a_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + uint8_t *res_high_data = NULL, *res_low_data = NULL; + if (!tl_a_low || !tl_a_high || !tl_res_low || !tl_res_high) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_low, a_low_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_high, a_high_data); + + cvk_tiu_add_param_t p4; + p4.res_high = relu_enable ? 
0 : tl_res_high; + p4.res_low = tl_res_low; + p4.a_high = tl_a_high; + p4.a_low = tl_a_low; + p4.b_is_const = 1; + p4.b_const.val = b; + p4.b_const.is_signed = b_is_signed; + p4.rshift_bits = rshift_bits; + p4.relu_enable = relu_enable; + cvk_ctx->ops->tiu_add(cvk_ctx, &p4); + + res_high_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_high); + res_low_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_low); + for (uint64_t i = 0; i < size; i++) { + if(!relu_enable) + if (res_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at res_high_data[%" PRIu64 "], got %d, exp %d\n", + i, res_high_data[i], ref_high_data[i]); + ret = -1; + } + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + ret = -1; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_low); + free(res_high_data); + free(res_low_data); + } + +fail_exit: + free(a_high_data); + free(a_low_data); + free(ref_high_data); + free(ref_low_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_add_const(rt_handle, cvk_ctx, 0); + ret |= test_tl_add_const(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_tensor_and.c b/cviruntime/test/181x/test_181x_tensor_and.c new file mode 100644 index 000000000..c69ad8428 --- /dev/null +++ b/cviruntime/test/181x/test_181x_tensor_and.c @@ -0,0 +1,232 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_and_int8_ref(int8_t *a, int8_t *b, int8_t *res, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) + res[i] = a[i] & b[i]; +} + +static void tl_and_int16_ref( + uint8_t *ref_high, uint8_t *ref_low, + uint8_t *a_high, uint8_t *a_low, + uint8_t *b_high, uint8_t *b_low, + uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) { + int32_t ta = ((int8_t)a_high[i] << 8) + a_low[i]; + int32_t tb = ((int8_t)b_high[i] << 8) + b_low[i]; + int32_t res = ta & tb; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } +} + +static int test_tl_and_int8(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + int8_t *a_data = (int8_t *)malloc(size); + int8_t *b_data = (int8_t *)malloc(size); + int8_t *ref_data = (int8_t *)malloc(size); + if (!a_data || !b_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) + a_data[i] = (int8_t)(i % 256); + + for (uint32_t i = 0; i < size; i++) + b_data[i] = (int8_t)(100 - i % 256); + + tl_and_int8_ref(a_data, b_data, ref_data, size); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, 
CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + if (!tl_a || !tl_b || !tl_res) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b, (uint8_t *)b_data); + cvk_tiu_and_int8_param_t p9; + p9.res = tl_res; + p9.a = tl_a; + p9.b = tl_b; + cvk_ctx->ops->tiu_and_int8(cvk_ctx, &p9); + uint8_t *res_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res); + + for (uint32_t i = 0; i < size; i++) { + if ((int8_t)res_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%u], got %x, exp %x\n", + i, res_data[i], ref_data[i]); + ret = -1; + break; + } + } + + free(res_data); + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + +fail_exit: + free(a_data); + free(b_data); + free(ref_data); + + return ret; +} + +static int test_tl_and_int16(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 2; // 3 -> 2 for 1810 + int c = 39; + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + uint8_t *a_high_data = (uint8_t *)malloc(size); + uint8_t *a_low_data = (uint8_t *)malloc(size); + uint8_t *b_high_data = (uint8_t *)malloc(size); + uint8_t *b_low_data = (uint8_t *)malloc(size); + uint8_t *ref_high_data = (uint8_t *)malloc(size); + uint8_t *ref_low_data = (uint8_t *)malloc(size); + if (!a_high_data || !a_low_data || !b_high_data || !b_low_data || !ref_high_data || !ref_low_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) { + a_high_data[i] = i / 10; + a_low_data[i] = i; + b_high_data[i] = (i + 250) / 20; + b_low_data[i] = 100 - i; + } + + tl_and_int16_ref( + ref_high_data, ref_low_data, + a_high_data, a_low_data, + b_high_data, b_low_data, + size); + + cvk_tl_t *tl_a_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_a_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + if (!tl_a_low || !tl_a_high || !tl_b_low || !tl_b_high || !tl_res_low || !tl_res_high){ + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_low, a_low_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_high, a_high_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b_low, b_low_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b_high, b_high_data); + cvk_tiu_and_int16_param_t p8; + p8.res_high = tl_res_high; + p8.res_low = tl_res_low; + p8.a_high = tl_a_high; + p8.a_low = tl_a_low; + p8.b_high = tl_b_high; + p8.b_low = tl_b_low; + cvk_ctx->ops->tiu_and_int16(cvk_ctx, &p8); + uint8_t *res_high_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_high); + uint8_t *res_low_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_low); + + for (uint64_t i = 0; i < size; i++) { + if (res_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at 
res_high_data[%" PRIu64 "], got %d, exp %d\n", + i, res_high_data[i], ref_high_data[i]); + ret = 1; + break; + } + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + ret = -1; + break; + } + } + + free(res_high_data); + free(res_low_data); + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_low); + +fail_exit: + free(a_high_data); + free(a_low_data); + free(b_high_data); + free(b_low_data); + free(ref_high_data); + free(ref_low_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_and_int8(rt_handle, cvk_ctx, 0); + ret |= test_tl_and_int8(rt_handle, cvk_ctx, 1); + ret |= test_tl_and_int16(rt_handle, cvk_ctx, 0); + ret |= test_tl_and_int16(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_tensor_arith_shift.c b/cviruntime/test/181x/test_181x_tensor_arith_shift.c new file mode 100644 index 000000000..3e1fa0f43 --- /dev/null +++ b/cviruntime/test/181x/test_181x_tensor_arith_shift.c @@ -0,0 +1,154 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_arith_shift_ref( + uint8_t *ref_high, uint8_t *ref_low, + uint8_t *a_high, uint8_t *a_low, + uint8_t *bits, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) { + int32_t ta = ((int8_t)a_high[i] << 8) + a_low[i]; + int32_t tbits = (int8_t)bits[i]; + + /* + * Yes, a @tbits bigger than zero means shifting LEFT, + * no matter whether the shift type is arithmetic + * RIGHT shift or logic RIGHT shift. 
+ */ + int32_t res; + if (tbits >= 0) + res = ta << tbits; + else + res = ta >> -tbits; + + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } +} + +static int test_tl_arith_shift(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 2; // 3 -> 2 for 1810 + int c = 39; + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + uint8_t *a_high_data = (uint8_t *)malloc(size); + uint8_t *a_low_data = (uint8_t *)malloc(size); + uint8_t *bits_data = (uint8_t *)malloc(size); + uint8_t *ref_high_data = (uint8_t *)malloc(size); + uint8_t *ref_low_data = (uint8_t *)malloc(size); + if (!a_high_data || !a_low_data || !bits_data || !ref_high_data || !ref_low_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) { + a_high_data[i] = 240 + i; + a_low_data[i] = 200 + i; + bits_data[i] = (i % 33) - 16; + } + + tl_arith_shift_ref( + ref_high_data, ref_low_data, + a_high_data, a_low_data, + bits_data, size); + + cvk_tl_t *tl_a_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_a_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_bits = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + if (!tl_a_low || !tl_a_high || !tl_bits || !tl_res_low || !tl_res_high) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_low, a_low_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_high, a_high_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_bits, bits_data); + cvk_tiu_arith_shift_param_t p8; + p8.res_high = tl_res_high; + p8.res_low = tl_res_low; + p8.a_high = tl_a_high; + p8.a_low = tl_a_low; + p8.bits = tl_bits; + cvk_ctx->ops->tiu_arith_shift(cvk_ctx, &p8); + uint8_t *res_high_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_high); + uint8_t *res_low_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_low); + + for (uint32_t i = 0; i < size; i++) { + if (res_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at res_high_data[%u], got %d, exp %d\n", + i, res_high_data[i], ref_high_data[i]); + ret = -1; + break; + } + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%u], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + ret = -1; + break; + } + } + + free(res_high_data); + free(res_low_data); + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_bits); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_low); + +fail_exit: + free(a_high_data); + free(a_low_data); + free(bits_data); + free(ref_high_data); + free(ref_low_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_arith_shift(rt_handle, cvk_ctx, 0); + 
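/* run with both local-memory layouts: eu_align = 0 (compact) and eu_align = 1 (aligned) */ +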
ret |= test_tl_arith_shift(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_tensor_ge.c b/cviruntime/test/181x/test_181x_tensor_ge.c new file mode 100644 index 000000000..e7f15612b --- /dev/null +++ b/cviruntime/test/181x/test_181x_tensor_ge.c @@ -0,0 +1,122 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_ge_ref(int8_t *a, int8_t *b, int8_t *result, uint64_t size, cvk_fmt_t fmt) +{ + for (uint64_t i = 0; i < size; i++) { + int32_t a32 = (fmt == CVK_FMT_I8) ? (int8_t)a[i] : (uint8_t)a[i]; + int32_t b32 = (fmt == CVK_FMT_I8) ? (int8_t)b[i] : (uint8_t)b[i]; + if (a32 >= b32) + result[i] = 1; + else + result[i] = 0; + } +} + +static int test_tl_ge(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + int8_t *a_data = (int8_t *)malloc(size); + int8_t *b_data = (int8_t *)malloc(size); + int8_t *ref_data = (int8_t *)malloc(size); + if (!a_data || !b_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (int i = 0; i < 2; i++) { + for (uint32_t i = 0; i < size; i++) + a_data[i] = (int8_t)(i % 256); + + for (uint32_t i = 0; i < size; i++) + b_data[i] = (int8_t)(100 - i % 256); + + cvk_fmt_t fmt = (i == 0) ? CVK_FMT_I8 : CVK_FMT_U8; + tl_ge_ref(a_data, b_data, ref_data, size, fmt); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt, eu_align); + cvk_tl_t *tl_b = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt, eu_align); + cvk_tl_t *tl_ge = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt, eu_align); + if (!tl_a || !tl_b || !tl_ge) { + ret = -1; + goto fail_exit; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b, (uint8_t *)b_data); + + cvk_tiu_ge_param_t p; + memset(&p, 0, sizeof(p)); + p.a = tl_a; + p.b_is_const = 0; + p.b = tl_b; + p.ge = tl_ge; + cvk_ctx->ops->tiu_ge(cvk_ctx, &p); + uint8_t *ge_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_ge); + + for (uint64_t i = 0; i < size; i++) { + if ((int8_t)ge_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, ge_data[i], ref_data[i]); + ret = -1; + } + } + + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_ge); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + free(ge_data); + } + +fail_exit: + free(a_data); + free(b_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_ge(rt_handle, cvk_ctx, 0); + ret |= test_tl_ge(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_tensor_ge_const.c b/cviruntime/test/181x/test_181x_tensor_ge_const.c new file mode 100644 index 000000000..f2392a952 --- /dev/null +++ b/cviruntime/test/181x/test_181x_tensor_ge_const.c @@ -0,0 +1,119 @@ +#include +#include 
+#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_ge_const_ref(int8_t *a, int8_t b, int8_t *result, uint64_t size, cvk_fmt_t fmt) +{ + for (uint64_t i = 0; i < size; i++) { + int32_t a32 = (fmt == CVK_FMT_I8) ? (int8_t)a[i] : (uint8_t)a[i]; + int32_t b32 = (fmt == CVK_FMT_I8) ? (int8_t)b : (uint8_t)b; + if (a32 >= b32) + result[i] = 1; + else + result[i] = 0; + } +} + +static int test_tl_ge_const(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + int8_t *a_data = (int8_t *)malloc(size); + int8_t *ref_data = (int8_t *)malloc(size); + if (!a_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (int i = 0; i < 2; i++) { + for (uint32_t i = 0; i < size; i++) + a_data[i] = (int8_t)(i % 256); + + int8_t b = 47; + + cvk_fmt_t fmt = (i == 1) ? CVK_FMT_I8 : CVK_FMT_U8; + tl_ge_const_ref(a_data, b, ref_data, size, fmt); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt, eu_align); + cvk_tl_t *tl_ge = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, fmt, eu_align); + uint8_t *ge_data = NULL; + if (!tl_a || !tl_ge) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + cvk_tiu_ge_param_t p; + memset(&p, 0, sizeof(p)); + p.ge = tl_ge; + p.a = tl_a; + p.b_is_const = 1; + p.b_const.val = b; + p.b_const.is_signed = i; + cvk_ctx->ops->tiu_ge(cvk_ctx, &p); + ge_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_ge); + + for (uint64_t i = 0; i < size; i++) { + if ((int8_t)ge_data[i] != (int8_t)ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, ge_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_ge); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + free(ge_data); + } + +fail_exit: + free(a_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_ge_const(rt_handle, cvk_ctx, 0); + ret |= test_tl_ge_const(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_tensor_mac.c b/cviruntime/test/181x/test_181x_tensor_mac.c new file mode 100644 index 000000000..28077cdc7 --- /dev/null +++ b/cviruntime/test/181x/test_181x_tensor_mac.c @@ -0,0 +1,184 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_mac_ref( + uint8_t *ref_high, uint8_t *ref_low, + uint8_t *a, uint8_t *b, uint8_t *c_high, uint8_t *c_low, + int lshift_bits, int rshift_bits, uint64_t size, int relu_enable) +{ + for (uint64_t i = 0; i < size; i++) { + int32_t ta = (int8_t)a[i]; + int32_t tb = (int8_t)b[i]; + int32_t tc = ((int8_t)c_high[i] << 8) + c_low[i]; + tc <<= lshift_bits; + int32_t res = ta * tb + tc; + + res += 1 << (rshift_bits - 1); + res >>= rshift_bits; + if(relu_enable) + { + if (res > 127) + res = 127; + else if (res < -128) + res = -128; + + if(relu_enable) + 
if(res<0) + res=0; + ref_high[i] = 0; + ref_low[i] = res & 0xff; + + }else{ + if (res > 32767) + res = 32767; + else if (res < -32768) + res = -32768; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } + } +} + +static int test_tl_mac(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int lshift_bits; + int rshift_bits; + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + uint8_t *a_data = (uint8_t *)malloc(size); + uint8_t *b_data = (uint8_t *)malloc(size); + uint8_t *c_high_data = (uint8_t *)malloc(size); + uint8_t *c_low_data = (uint8_t *)malloc(size); + uint8_t *ref_high_data = (uint8_t *)malloc(size); + uint8_t *ref_low_data = (uint8_t *)malloc(size); + if (!a_data || !b_data || !c_high_data || !c_low_data || !ref_high_data || !ref_low_data) { + ret = -1; + goto fail_exit; + } + + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + for (uint32_t i = 0; i < size; i++) { + a_data[i] = rand() % 128; + b_data[i] = 100 - i; + c_high_data[i] = rand() % 64; + c_low_data[i] = 200 + 2 * i; + } + + if(relu_enable) { + lshift_bits= 1; + rshift_bits = 7; + }else { + lshift_bits = 1; + rshift_bits = 3; + } + + tl_mac_ref(ref_high_data, ref_low_data, + a_data, b_data, c_high_data, c_low_data, + lshift_bits, rshift_bits, size, relu_enable); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_c_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_c_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + uint8_t *mac_high_data = NULL, *mac_low_data = NULL; + if (!tl_a || !tl_b || !tl_c_low || !tl_c_high) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, a_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b, b_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_c_low, c_low_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_c_high, c_high_data); + cvk_tiu_mac_param_t p2; + p2.res_high = tl_c_high; + p2.res_low = tl_c_low; + p2.res_is_int8 = relu_enable; + p2.a = tl_a; + p2.b_is_const = 0; + p2.b = tl_b; + p2.lshift_bits = lshift_bits; + p2.rshift_bits = rshift_bits; + p2.relu_enable = relu_enable; + cvk_ctx->ops->tiu_mac(cvk_ctx, &p2); + mac_high_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_c_high); + mac_low_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_c_low); + + for (uint32_t i = 0; i < size; i++) { + if(!relu_enable) + if (mac_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at mac_high_data[%u], got %d, exp %d\n", + i, mac_high_data[i], ref_high_data[i]); + ret = -1; + } + if (mac_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at mac_low_data[%u], got %d, exp %d\n", + i, mac_low_data[i], ref_low_data[i]); + ret = -1; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_c_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_c_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + + free(mac_high_data); + free(mac_low_data); + } + +fail_exit: + free(a_data); + free(b_data); + free(c_high_data); + free(c_low_data); + free(ref_high_data); + free(ref_low_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + 
CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_mac(rt_handle, cvk_ctx, 0); + ret |= test_tl_mac(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_tensor_mac_const.c b/cviruntime/test/181x/test_181x_tensor_mac_const.c new file mode 100644 index 000000000..732e9af7e --- /dev/null +++ b/cviruntime/test/181x/test_181x_tensor_mac_const.c @@ -0,0 +1,184 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_mac_const_ref( + uint8_t *ref_high, uint8_t *ref_low, + uint8_t *a, uint8_t b_const, int b_is_signed, + uint8_t *c_high, uint8_t *c_low, + int lshift_bits, int rshift_bits, uint64_t size, int relu_enable) +{ + for (uint64_t i = 0; i < size; i++) { + int32_t ta = (int8_t)a[i]; + int32_t tb = b_is_signed? (int8_t)b_const: (uint8_t)b_const; + int32_t tc = ((int8_t)c_high[i] << 8) + c_low[i]; + tc <<= lshift_bits; + int32_t res = ta * tb + tc; + + res += 1 << (rshift_bits - 1); + res >>= rshift_bits; + + if(relu_enable) + { + if (res > 127) + res = 127; + else if (res < -128) + res = -128; + + if(relu_enable) + if(res<0) + res=0; + ref_high[i] = 0; + ref_low[i] = res & 0xff; + + }else{ + if (res > 32767) + res = 32767; + else if (res < -32768) + res = -32768; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } + } +} + +static int test_tl_mac_const(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int lshift_bits; + int rshift_bits; + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + uint8_t *a_data = (uint8_t *)malloc(size); + uint8_t *c_high_data = (uint8_t *)malloc(size); + uint8_t *c_low_data = (uint8_t *)malloc(size); + uint8_t *ref_high_data = (uint8_t *)malloc(size); + uint8_t *ref_low_data = (uint8_t *)malloc(size); + if (!a_data || !c_high_data || !c_low_data || !ref_high_data || !ref_low_data) { + ret = -1; + goto fail_exit; + } + + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + + for (uint64_t i = 0; i < size; i++) { + a_data[i] = rand() % 256; + c_high_data[i] = rand() % 64; + c_low_data[i] = 200 + 2 * i; + } + + uint8_t b_const = 37; + int b_is_signed = 1; + if(relu_enable) { + lshift_bits = 1; + rshift_bits = 8; + }else { + lshift_bits = 1; + rshift_bits = 3; + } + + tl_mac_const_ref(ref_high_data, ref_low_data, + a_data, b_const, b_is_signed, c_high_data, c_low_data, + lshift_bits, rshift_bits, size, relu_enable); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_c_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_c_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + uint8_t *mac_high_data = NULL, *mac_low_data = NULL; + if (!tl_a || !tl_c_low || !tl_c_high) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, a_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_c_low, c_low_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_c_high, c_high_data); + cvk_tiu_mac_param_t p3; + p3.res_high = 
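/* accumulate in place: the result re-uses the 16-bit (c_high, c_low) pair */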
tl_c_high; + p3.res_low = tl_c_low; + p3.res_is_int8 = relu_enable; + p3.a = tl_a; + p3.b_is_const = 1; + p3.b_const.val = b_const; + p3.b_const.is_signed = b_is_signed; + p3.lshift_bits = lshift_bits; + p3.rshift_bits = rshift_bits; + p3.relu_enable = relu_enable; + cvk_ctx->ops->tiu_mac(cvk_ctx, &p3); + mac_high_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_c_high); + mac_low_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_c_low); + for (uint64_t i = 0; i < size; i++) { + if(!relu_enable) + if (mac_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at mac_high_data[%" PRIu64 "], got %d, exp %d\n", + i, mac_high_data[i], ref_high_data[i]); + ret = -1; + break; + } + if (mac_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at mac_low_data[%" PRIu64 "], got %d, exp %d\n", + i, mac_low_data[i], ref_low_data[i]); + ret = -1; + break; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_c_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_c_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + free(mac_high_data); + free(mac_low_data); + } + +fail_exit: + free(a_data); + free(c_high_data); + free(c_low_data); + free(ref_high_data); + free(ref_low_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_mac_const(rt_handle, cvk_ctx, 0); + ret |= test_tl_mac_const(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_tensor_max.c b/cviruntime/test/181x/test_181x_tensor_max.c new file mode 100644 index 000000000..6dbf9f7c7 --- /dev/null +++ b/cviruntime/test/181x/test_181x_tensor_max.c @@ -0,0 +1,115 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_max_ref(int8_t *a, int8_t *b, int8_t *max, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) { + if (a[i] > b[i]) + max[i] = a[i]; + else + max[i] = b[i]; + } +} + +static int test_tl_max(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + int ret = 0; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + int8_t *a_data = (int8_t *)malloc(size); + int8_t *b_data = (int8_t *)malloc(size); + int8_t *ref_data = (int8_t *)malloc(size); + uint8_t *max_data = NULL; + if (!a_data || !b_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) + a_data[i] = (int8_t)(i % 256); + + for (uint32_t i = 0; i < size; i++) + b_data[i] = (int8_t)(100 - i % 256); + + tl_max_ref(a_data, b_data, ref_data, size); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_max = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b, (uint8_t *)b_data); + + cvk_tiu_max_param_t p; + memset(&p, 0, sizeof(p)); + p.max = tl_max; + p.a = tl_a; + 
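+  /* b_is_const = 0 takes the tensor-operand path of tiu_max; the scalar
+   * (b_const) path is exercised separately by test_181x_tensor_max_const.c. */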
p.b_is_const = 0; + p.b = tl_b; + cvk_ctx->ops->tiu_max(cvk_ctx, &p); + max_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_max); + + for (uint64_t i = 0; i < size; i++) { + if ((int8_t)max_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, max_data[i], ref_data[i]); + ret = -1; + goto fail_exit; + } + } + + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_max); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + +fail_exit: + free(a_data); + free(b_data); + free(ref_data); + free(max_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_max(rt_handle, cvk_ctx, 0); + ret |= test_tl_max(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_tensor_max_const.c b/cviruntime/test/181x/test_181x_tensor_max_const.c new file mode 100644 index 000000000..8287a7d9f --- /dev/null +++ b/cviruntime/test/181x/test_181x_tensor_max_const.c @@ -0,0 +1,109 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_max_const_ref(int8_t *a, int8_t b, int8_t *max, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) { + if (a[i] > b) + max[i] = a[i]; + else + max[i] = b; + } +} + +static int test_tl_max_const(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + int8_t *a_data = (int8_t *)malloc(size); + int8_t *ref_data = (int8_t *)malloc(size); + uint8_t *max_data = NULL; + if (!a_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) + a_data[i] = (int8_t)(i % 256); + + int8_t b = 47; + + tl_max_const_ref(a_data, b, ref_data, size); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_max = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + cvk_tiu_max_param_t p; + memset(&p, 0, sizeof(p)); + p.max = tl_max; + p.a = tl_a; + p.b_is_const = 1; + p.b_const.val = b; + p.b_const.is_signed = 1; + cvk_ctx->ops->tiu_max(cvk_ctx, &p); + max_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_max); + + for (uint32_t i = 0; i < size; i++) { + if ((int8_t)max_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%u], got %x, exp %x\n", + i, max_data[i], ref_data[i]); + ret = -1; + goto fail_exit; + } + } + + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_max); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + +fail_exit: + free(a_data); + free(ref_data); + free(max_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", 
__FILENAME__);
+    return -1;
+  }
+
+  ret |= test_tl_max_const(rt_handle, cvk_ctx, 0);
+  ret |= test_tl_max_const(rt_handle, cvk_ctx, 1);
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/181x/test_181x_tensor_min.c b/cviruntime/test/181x/test_181x_tensor_min.c
new file mode 100644
index 000000000..c035e5544
--- /dev/null
+++ b/cviruntime/test/181x/test_181x_tensor_min.c
@@ -0,0 +1,117 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "test_cvikernel_util.h"
+
+static void tl_min_ref(int8_t *a, int8_t *b, int8_t *max, uint64_t size)
+{
+  for (uint64_t i = 0; i < size; i++) {
+    if (a[i] > b[i])
+      max[i] = b[i];
+    else
+      max[i] = a[i];
+  }
+}
+
+static int test_tl_min(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align)
+{
+  int n = 3;
+  int c = 39;
+  int h = 7;
+  int w = 37;
+  int ret = 0;
+
+  cvk_tl_shape_t tl_shape;
+  tl_shape.n = n;
+  tl_shape.c = c;
+  tl_shape.h = h;
+  tl_shape.w = w;
+
+  uint32_t size = n * c * h * w;
+  int8_t *a_data = (int8_t *)malloc(size);
+  int8_t *b_data = (int8_t *)malloc(size);
+  int8_t *ref_data = (int8_t *)malloc(size);
+  if (!a_data || !b_data || !ref_data) {
+    ret = -1;
+    goto fail_exit;
+  }
+
+  for (uint32_t i = 0; i < size; i++)
+    a_data[i] = (int8_t)(i % 256);
+
+  for (uint32_t i = 0; i < size; i++)
+    b_data[i] = (int8_t)(100 - i % 256);
+
+  tl_min_ref(a_data, b_data, ref_data, size);
+
+  cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align);
+  cvk_tl_t *tl_b = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align);
+  cvk_tl_t *tl_min = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align);
+  if (!tl_a || !tl_b || !tl_min) {
+    ret = -1;
+    goto fail_exit;
+  }
+
+  tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data);
+  tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b, (uint8_t *)b_data);
+  cvk_tiu_min_param_t p6;
+  p6.min = tl_min;
+  p6.a = tl_a;
+  p6.b_is_const = 0;
+  p6.b = tl_b;
+  cvk_ctx->ops->tiu_min(cvk_ctx, &p6);
+  uint8_t *min_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_min);
+
+  for (uint32_t i = 0; i < size; i++) {
+    if ((int8_t)min_data[i] != ref_data[i]) {
+      fprintf(stderr, "comparing failed at ofmap_data[%u], got %x, exp %x\n",
+              i, min_data[i], ref_data[i]);
+      ret = -1;
+      break;
+    }
+  }
+
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_min);
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b);
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a);
+
+  free(min_data);
+
+fail_exit:
+  free(a_data);
+  free(b_data);
+  free(ref_data);
+
+  return ret;
+}
+
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  ret |= test_tl_min(rt_handle, cvk_ctx, 0);
+  ret |= test_tl_min(rt_handle, cvk_ctx, 1);
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/181x/test_181x_tensor_min_const.c b/cviruntime/test/181x/test_181x_tensor_min_const.c
new file mode 100644
index 000000000..6010c3af2
--- /dev/null
+++ b/cviruntime/test/181x/test_181x_tensor_min_const.c
@@ -0,0 +1,113 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "test_cvikernel_util.h"
+
+static void tl_min_const_ref(int8_t *a, int8_t b, int8_t *max, uint64_t 
size) +{ + for (uint64_t i = 0; i < size; i++) { + if (a[i] > b) + max[i] = b; + else + max[i] = a[i]; + } +} + +static int test_tl_min_const(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + int8_t *a_data = (int8_t *)malloc(size); + int8_t *ref_data = (int8_t *)malloc(size); + if (!a_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) + a_data[i] = (int8_t)(i % 256); + + int8_t b = 47; + + tl_min_const_ref(a_data, b, ref_data, size); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_min = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + uint8_t *min_data = NULL; + if (!tl_a || !tl_min) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + cvk_tiu_min_param_t p7; + p7.min = tl_min; + p7.a = tl_a; + p7.b_is_const = 1; + p7.b_const.val = b; + p7.b_const.is_signed = 1; + cvk_ctx->ops->tiu_min(cvk_ctx, &p7); + min_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_min); + + for (uint32_t i = 0; i < size; i++) { + if ((int8_t)min_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%u], got %x, exp %x\n", + i, min_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_min); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + free(min_data); + +fail_exit: + free(a_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_min_const(rt_handle, cvk_ctx, 0); + ret |= test_tl_min_const(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_tensor_mul.c b/cviruntime/test/181x/test_181x_tensor_mul.c new file mode 100644 index 000000000..f889c7148 --- /dev/null +++ b/cviruntime/test/181x/test_181x_tensor_mul.c @@ -0,0 +1,135 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_mul_ref(int8_t *ofmap, int8_t *a, int8_t *b, uint64_t size, int shift_bits, int relu_enable) +{ + for (uint64_t i = 0; i < size; i++) { + int32_t tmp = a[i] * b[i]; + tmp += 1 << (shift_bits - 1); + tmp >>= shift_bits; + if (tmp > 127) + tmp = 127; + else if (tmp < -128) + tmp = -128; + if(relu_enable) + if(tmp<0) + tmp=0; + ofmap[i] = tmp; + + } +} + +static int test_tl_mul(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + int ret = 0; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + int shift_bits = 1; + + int8_t *a_data = (int8_t *)malloc(size); + int8_t *b_data = (int8_t *)malloc(size); + int8_t *ref_data = (int8_t *)malloc(size); + if (!a_data || !b_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t relu_enable = 0; relu_enable < 2; relu_enable++) + { + for 
(uint32_t i = 0; i < size; i++) { + a_data[i] = random()%0x10; + b_data[i] = 128 - i; + } + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + uint8_t *res_low_data = NULL; + if (!tl_a || !tl_b || !tl_res_low) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b, (uint8_t *)b_data); + + cvk_tiu_mul_param_t p1; + p1.res_high = NULL; + p1.res_low = tl_res_low; + p1.a = tl_a; + p1.b_is_const = 0; + p1.b = tl_b; + p1.rshift_bits = shift_bits; + p1.relu_enable = relu_enable; + cvk_ctx->ops->tiu_mul(cvk_ctx, &p1); + + res_low_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_low); + + tl_mul_ref(ref_data, a_data, b_data, size, shift_bits, relu_enable); + + for (uint32_t i = 0; i < size; i++) { + if ((int8_t)res_low_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%u], got %x, exp %x\n", + i, res_low_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + free(res_low_data); + } + +fail_exit: + free(a_data); + free(b_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_mul(rt_handle, cvk_ctx, 0); + ret |= test_tl_mul(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_tensor_mul_const.c b/cviruntime/test/181x/test_181x_tensor_mul_const.c new file mode 100644 index 000000000..c704963d6 --- /dev/null +++ b/cviruntime/test/181x/test_181x_tensor_mul_const.c @@ -0,0 +1,134 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_mul_const_ref( + int8_t *ofmap, int8_t *ifmap, uint64_t size, int8_t mul_const, int shift_bits, int relu_enable) +{ + for (uint64_t i = 0; i < size; i++) { + int32_t tmp = ifmap[i] * mul_const; + tmp += 1 << (shift_bits - 1); + tmp >>= shift_bits; + if (tmp > 127) + tmp = 127; + else if (tmp < -128) + tmp = -128; + if(relu_enable) + if(tmp<0) + tmp=0; + + ofmap[i] = tmp; + } +} + +static int test_tl_mul_const(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + + int8_t *ifmap_data = (int8_t *)malloc(size); + int8_t *ref_data = (int8_t *)malloc(size); + if (!ifmap_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t relu_enable = 0; relu_enable < 2; relu_enable++) + { + for (uint32_t i = 0; i < size; i++) + ifmap_data[i] = (uint8_t)(random() % 256); + + int8_t mul_const = 20; + int shift_bits = 1; + + tl_mul_const_ref(ref_data, ifmap_data, size, mul_const, shift_bits, relu_enable); + + cvk_tl_t *tl_ifmap 
= cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_ofmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + uint8_t *ofmap_data = NULL; + if (!tl_ifmap || !tl_ofmap) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_ifmap, (uint8_t *)ifmap_data); + + cvk_tiu_mul_param_t p; + memset(&p, 0, sizeof(p)); + p.res_high = NULL; + p.res_low = tl_ofmap; + p.a = tl_ifmap; + p.b_is_const = 1; + p.b_const.val = mul_const; + p.b_const.is_signed = 1; + p.rshift_bits = shift_bits; + p.relu_enable = relu_enable; + + cvk_ctx->ops->tiu_mul(cvk_ctx, &p); + + ofmap_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_ofmap); + + for (uint32_t i = 0; i < size; i++) { + if ((int8_t)ofmap_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%u], got %x, exp %x\n", + i, ofmap_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_ofmap); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_ifmap); + free(ofmap_data); + } + +fail_exit: + free(ifmap_data); + free(ref_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_mul_const(rt_handle, cvk_ctx, 0); + ret |= test_tl_mul_const(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_tensor_mul_qm.c b/cviruntime/test/181x/test_181x_tensor_mul_qm.c new file mode 100644 index 000000000..711dc5781 --- /dev/null +++ b/cviruntime/test/181x/test_181x_tensor_mul_qm.c @@ -0,0 +1,589 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" +#include "test_tf_quant_util.h" + +// #define ENABLE_DEBUG_MSG + +#define MIN_EXEC_TESTS 20 + +typedef struct { + int input_n; + int input_c; + int input_h; + int input_w; + int relu_enable; + int8_t *input1_data; + int8_t *input2_data; + int8_t *output_data; + uint32_t multiplier; + int8_t right_shift; + float float_multiplier; + int retry_cnt; +} elt_mul_test_param_t; + +void elt_mul_ref(elt_mul_test_param_t *p_param) +{ + int input_n = p_param->input_n; + int input_c = p_param->input_c; + int input_h = p_param->input_h; + int input_w = p_param->input_w; + int32_t output_multiplier = p_param->multiplier; + int8_t output_rshift = p_param->right_shift; + int8_t *input1_data = p_param->input1_data; + int8_t *input2_data = p_param->input2_data; + int8_t *output_data = p_param->output_data; + + int32_t quantized_activation_min = -128; + int32_t quantized_activation_max = 127; + + int size = input_n * input_c * input_h * input_w; +#ifdef ENABLE_DEBUG_MSG + printf("elt_mul_ref:\n"); + printf(" shape (%d, %d, %d, %d)\n", input_n, input_c, input_h, input_w); +#endif + for (int i = 0; i < size; ++i) { + const int32_t input1_val = input1_data[i]; + const int32_t input2_val = input2_data[i]; + const int32_t unclamped_result = MultiplyByQuantizedMultiplier( + input1_val * input2_val, output_multiplier, output_rshift); + const int32_t clamped_output = + MIN(quantized_activation_max, + MAX(quantized_activation_min, unclamped_result)); + +#ifdef ENABLE_DEBUG_MSG + printf(" [%d] unclamped_result %d, 
clamped_output %d\n", i, + unclamped_result, clamped_output); +#endif + + output_data[i] = clamped_output; + } +} + +void calc_elt_mul_float_multiplier(elt_mul_test_param_t *p_param) +{ + int input_n = p_param->input_n; + int input_c = p_param->input_c; + int input_h = p_param->input_h; + int input_w = p_param->input_w; + int8_t *input1_data = p_param->input1_data; + int8_t *input2_data = p_param->input2_data; + + int output_min = INT_MAX; + int output_max = INT_MIN; + +#ifdef ENABLE_DEBUG_MSG + printf("calc_elt_mul_float_multiplier =>\n"); +#endif + + int size = input_n * input_c * input_h * input_w; + for (int i = 0; i < size; ++i) { + const int32_t input1_val = input1_data[i]; + const int32_t input2_val = input2_data[i]; + + const int32_t val = input1_val * input2_val; + + output_max = MAX(val, output_max); + output_min = MIN(val, output_min); + } + + // Since int8 ranges from -128 to 127, we need to squeeze the accumulator + // MIN/MAX fit in those ranges correspondingly as much as possible. + if (abs(output_max) > abs(output_min)) { + p_param->float_multiplier = 127.0f / abs(output_max); + } else { + p_param->float_multiplier = 128.0f / abs(output_min); + } + +#ifdef ENABLE_DEBUG_MSG + printf(" output_accu_min %d, output_accu_max %d, output_multiplier %f\n", + output_min, output_max, p_param->float_multiplier); +#endif + +#ifdef ENABLE_DEBUG_MSG + printf("<= calc_elt_mul_float_multiplier\n"); +#endif +} + +int simple_test(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + + // TFL: QuantizedMulOpTest.NoActivationInt8 + int size = 4; + int8_t input1_data[4] = {-102, 25, 115, 89}; + int8_t input2_data[4] = {77, 51, 115, 102}; + int8_t ref_output_data[4] = {-62, 10, 104, 71}; + int8_t output_data[4]; + uint32_t output_multiplier = 1077952640; + int8_t output_rshift = 6; // change to right shift + + elt_mul_test_param_t test_param; + memset(&test_param, 0, sizeof(test_param)); + + test_param.input_n = 1; + test_param.input_c = 1; + test_param.input_h = 1; + test_param.input_w = 4; + test_param.input1_data = input1_data; + test_param.input2_data = input2_data; + test_param.output_data = output_data; + test_param.multiplier = output_multiplier; + test_param.right_shift = output_rshift; + elt_mul_ref(&test_param); + + for (int i = 0; i < size; ++i) { + if (output_data[i] != ref_output_data[i]) { + printf(" Error ! output_data[%d] = %d != %d\n", i, output_data[i], + ref_output_data[i]); + ret = -1; + } + } + + cvk_tl_shape_t tl_shape = {1, 1, 1, size}; + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, /*align=*/1); + cvk_tl_t *tl_b = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, /*align=*/1); + cvk_tl_t *tl_res = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, /*align=*/1); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)input1_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b, (uint8_t *)input2_data); + + { + cvk_tiu_mul_qm_param_t p1; + p1.res_high = NULL; + p1.res_low = tl_res; + p1.a = tl_a; + p1.b_is_const = 0; + p1.b = tl_b; + p1.rshift_bits = output_rshift; + p1.relu_enable = 0; + p1.multiplier = output_multiplier; + cvk_ctx->ops->tiu_mul_qm(cvk_ctx, &p1); + } + + int8_t *res_tiu_data = + (int8_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res); + for (int i = 0; i < size; ++i) { + if (res_tiu_data[i] != ref_output_data[i]) { + printf(" Error ! 
result[%d] %d != %d\n", i, res_tiu_data[i], + ref_output_data[i]); + ret = -1; + } + } + + free(res_tiu_data); + + // Reserver order + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + + return ret; +} + +int choose_from_range(int table[], int size, int index) +{ + if (index >= size) { + return 0; + } + + int val = table[index]; + if (index < (size - 1)) { + int range = MAX(table[index + 1] - table[index] - 1, 1); + val += rand() % range; + } + + return val; +} + +bool check_valid_test_param(cvk_context_t *cvk_ctx, elt_mul_test_param_t *p_param) +{ + uint32_t input_n = p_param->input_n; + uint32_t input_c = p_param->input_c; + uint32_t input_h = p_param->input_h; + uint32_t input_w = p_param->input_w; + + // input1, input2, output + uint32_t total_needed_size = 3 * input_n * input_c * input_h * input_w; + + uint32_t lmem_size_per_lane = cvk_ctx->info.lmem_size; + uint32_t total_lmem_size = cvk_ctx->info.lmem_size * cvk_ctx->info.npu_num; + + if (total_needed_size > total_lmem_size) { + return false; + } + + cvk_tl_shape_t input_shape = {input_n, input_c, input_h, input_w}; + + uint32_t needed_size = + 3 * cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, input_shape, CVK_FMT_I8, /*eu_align=*/1); + + // Skip invalid shape + if (needed_size > lmem_size_per_lane) { + return false; + } + + return true; +} + +void fill_random_data_s8(int8_t *input_data, int size) +{ + for (int i = 0; i < size; ++i) { + int is_satured = ((rand() % 1000) == 1) ? 1 : 0; + int is_sign = rand() % 2 ? 1 : -1; + + if (is_satured && is_sign) { + input_data[i] = -128; + } else if (is_satured) { + input_data[i] = 127; + } else { + input_data[i] = is_sign * rand() % 128; + } + } +} + +void dump_test_param(elt_mul_test_param_t *p_param, bool dump_content) +{ + printf("Dump test parameter:\n"); + printf(" input_n %d\n", p_param->input_n); + printf(" input_c %d\n", p_param->input_c); + printf(" input_h %d\n", p_param->input_h); + printf(" input_w %d\n", p_param->input_w); + printf(" multiplier %d\n", p_param->multiplier); + printf(" right_shift %d\n", p_param->right_shift); + + if (dump_content) { + printf("input1_data(%d, %d, %d, %d) :\n", p_param->input_n, + p_param->input_c, p_param->input_h, p_param->input_w); + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + for (int i = 0; i < in; ++i) { + for (int j = 0; j < ic; ++j) { + for (int k = 0; k < ih; ++k) { + for (int l = 0; l < iw; ++l) { + int offset = i * (ic * ih * iw) + j * (ih * iw) + k * iw + l; + printf("%d, ", p_param->input1_data[offset]); + } + printf("\n"); + } + } + } + printf("\n\n"); + + printf("input2_data(%d, %d, %d, %d) :\n", p_param->input_n, + p_param->input_c, p_param->input_h, p_param->input_w); + for (int i = 0; i < in; ++i) { + for (int j = 0; j < ic; ++j) { + for (int k = 0; k < ih; ++k) { + for (int l = 0; l < iw; ++l) { + int offset = i * (ic * ih * iw) + j * (ih * iw) + k * iw + l; + printf("%d, ", p_param->input2_data[offset]); + } + printf("\n"); + } + } + } + printf("\n\n"); + } +} + +int run_compare_elt_mul(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, + elt_mul_test_param_t *p_param) +{ + int ret = 0; + + int input_n = p_param->input_n; + int input_c = p_param->input_c; + int input_h = p_param->input_h; + int input_w = p_param->input_w; + + int input_size = input_n * input_c * input_h * input_w; + int8_t *input1_data = (int8_t *)malloc(input_size); + int8_t *input2_data = (int8_t 
*)malloc(input_size); + int8_t *output_data = (int8_t *)malloc(input_size); + + p_param->input1_data = input1_data; + p_param->input2_data = input2_data; + p_param->output_data = output_data; + +#ifdef ENABLE_DEBUG_MSG + printf(" run_compare_elt_mul => \n"); +#endif + + int retry_cnt = p_param->retry_cnt; + do { + fill_random_data_s8(input1_data, input_size); + fill_random_data_s8(input2_data, input_size); + + p_param->float_multiplier = 100.0; // should be < 1.0 + calc_elt_mul_float_multiplier(p_param); + + if (p_param->float_multiplier > 0.f && p_param->float_multiplier < 1.0) { + break; + } + + } while (--retry_cnt); + + if (p_param->float_multiplier >= 1.0) { + printf(" run_compare_elt_mul: unable to find valid multiplier\n"); + free(input1_data); + free(input2_data); + free(output_data); + return -1; + } + + uint32_t base_multiplier = 0; + int base_shift = 0; + QuantizeMultiplierSmallerThanOne(p_param->float_multiplier, &base_multiplier, + &base_shift); + + // multipliers typically range in [2^30 ; 2^31 - 1]. + // Values in [0, 2^30 - 1] are normally unused, but harmless. + // Thus a good way to randomize multipliers is to subtract from them + // a random value smaller than 2^30 but still significant compared to it. + uint32_t output_multiplier = base_multiplier - (rand() % (1 << 26)); + + int right_shift = base_shift - 1 + (rand() % 4); + int8_t output_right_shift = truncate_rshift((int8_t)right_shift, /*allow_lshift*/1); + +#ifdef ENABLE_DEBUG_MSG + printf(" multiplier_data %d, shift_data %d\n", output_multiplier, + output_right_shift); +#endif + + p_param->multiplier = output_multiplier; + p_param->right_shift = output_right_shift; + + elt_mul_ref(p_param); + + cvk_tl_shape_t input_shape = {input_n, input_c, input_h, input_w}; + + cvk_tl_t *tl_input1 = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, input_shape, CVK_FMT_I8, /*eu_aign=*/1); + + cvk_tl_t *tl_input2 = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, input_shape, CVK_FMT_I8, /*eu_aign=*/1); + + cvk_tl_t *tl_output = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, input_shape, CVK_FMT_I8, /*eu_aign=*/1); + + if (tl_input1 == NULL) { + printf(" fail to alloc tl_input1 (%d, %d, %d, %d)\n", input_n, input_c, + input_h, input_w); + return -1; + } + if (tl_input2 == NULL) { + printf(" fail to alloc tl_input2 (%d, %d, %d, %d)\n", input_n, input_c, + input_h, input_w); + return -1; + } + if (tl_output == NULL) { + printf(" fail to alloc tl_output (%d, %d, %d, %d)\n", input_n, input_c, + input_h, input_w); + return -1; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_input1, (uint8_t *)input1_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_input2, (uint8_t *)input2_data); + + { + cvk_tiu_mul_qm_param_t p1; + p1.res_high = NULL; + p1.res_low = tl_output; + p1.a = tl_input1; + p1.b_is_const = 0; + p1.b = tl_input2; + p1.rshift_bits = (uint8_t)output_right_shift; + p1.relu_enable = 0; + p1.multiplier = output_multiplier; + cvk_ctx->ops->tiu_mul_qm(cvk_ctx, &p1); + } + + + CVI_RT_Submit(cvk_ctx); + +#ifdef ENABLE_DEBUG_MSG + printf(" compare result:\n"); +#endif + int8_t *tiu_output_data = + (int8_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_output); + for (int i = 0; i < input_n; ++i) { + for (int j = 0; j < input_c; ++j) { + for (int k = 0; k < input_h; ++k) { + for (int l = 0; l < input_w; ++l) { + int offset = i * (input_c * input_h * input_w) + + j * (input_h * input_w) + k * input_w + l; + if (tiu_output_data[offset] != output_data[offset]) { + printf(" [ni=%d][oci=%d][ohi=%d][owi=%d] output %d(tiu) != " + "%d(ref)\n", + i, j, k, l, 
tiu_output_data[offset], output_data[offset]); + ret = -1; + break; + } + } + } + } + } + + free(tiu_output_data); + + if (ret) { + dump_test_param(p_param, /*dump_content=*/true); + } + + // Reverse order + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input2); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input1); + + free(input1_data); + free(input2_data); + free(output_data); + +#ifdef ENABLE_DEBUG_MSG + printf(" <= run_compare_elt_mul, ret %d\n", ret); +#endif + + return ret; +} + +int random_test(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + +#if 0 + int input_n_range[] = {1}; + int input_c_range[] = {1}; + int input_h_range[] = {1}; + int input_w_range[] = {1}; +#else + int input_n_range[] = {1, 2, 4095 - 32}; + int input_c_range[] = {1, 512, 4095 - 32}; + int input_h_range[] = {1, 512, 4095 - 32}; + int input_w_range[] = {1, 512, 4095 - 32}; +#endif + + const int input_n_range_size = + sizeof(input_n_range) / sizeof(input_n_range[0]); + const int input_c_range_size = + sizeof(input_c_range) / sizeof(input_c_range[0]); + const int input_h_range_size = + sizeof(input_h_range) / sizeof(input_h_range[0]); + const int input_w_range_size = + sizeof(input_w_range) / sizeof(input_w_range[0]); + + int random_seed = clock(); + srand(random_seed); + + const int retry_test_count = 100; + bool stop_at_first_error = true; + + int executed_tests = 0; + int failed_tests = 0; + + printf("1822-mul-qm: random Test =>\n"); + for (int m = 0; m < retry_test_count; ++m) { + for (int i = 0; i < input_n_range_size; ++i) { + int input_n = choose_from_range(input_n_range, input_n_range_size, i); + + for (int j = 0; j < input_c_range_size; ++j) { + int input_c = choose_from_range(input_c_range, input_c_range_size, j); + + for (int k = 0; k < input_h_range_size; ++k) { + int input_h = choose_from_range(input_h_range, input_h_range_size, k); + + for (int l = 0; l < input_w_range_size; ++l) { + int input_w = + choose_from_range(input_w_range, input_w_range_size, l); + + elt_mul_test_param_t test_param; + memset(&test_param, 0, sizeof(test_param)); + test_param.input_n = input_n; + test_param.input_c = input_c; + test_param.input_h = input_h; + test_param.input_w = input_w; + test_param.retry_cnt = 5; + + bool is_valid_param = check_valid_test_param(cvk_ctx, &test_param); + if (is_valid_param == false) + continue; + + int ret2 = run_compare_elt_mul(rt_handle, cvk_ctx, &test_param); + failed_tests = ret2 ? 
failed_tests + 1 : failed_tests;
+          ret |= ret2;
+          executed_tests++;
+
+#ifdef ENABLE_DEBUG_MSG
+          printf(" [%d] random test: input shape (%d, %d, %d, %d), ret %d\n",
+                 executed_tests, input_n, input_c, input_h, input_w, ret2);
+#endif
+        }
+
+        // Stop at first error
+        if (ret && stop_at_first_error) {
+          break;
+        }
+      }
+
+      // Stop at first error
+      if (ret && stop_at_first_error) {
+        break;
+      }
+    }
+
+    // Stop at first error
+    if (ret && stop_at_first_error) {
+      break;
+    }
+  }
+
+  // Stop at first error
+  if (ret && stop_at_first_error) {
+    break;
+  }
+
+  if (executed_tests >= MIN_EXEC_TESTS) {
+    break;
+  }
+  }
+
+  printf("<= 1822-mul-qm: random test, total %d, failed %d, ret %d\n",
+         executed_tests, failed_tests, ret);
+
+  return ret;
+}
+
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  ret = simple_test(rt_handle, cvk_ctx);
+  ret |= random_test(rt_handle, cvk_ctx);
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/181x/test_181x_tensor_or.c b/cviruntime/test/181x/test_181x_tensor_or.c
new file mode 100644
index 000000000..83ed10f8a
--- /dev/null
+++ b/cviruntime/test/181x/test_181x_tensor_or.c
@@ -0,0 +1,236 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "test_cvikernel_util.h"
+
+static void tl_or_int8_ref(int8_t *a, int8_t *b, int8_t *res, uint64_t size)
+{
+  for (uint64_t i = 0; i < size; i++)
+    res[i] = a[i] | b[i];
+}
+
+static void tl_or_int16_ref(
+    uint8_t *ref_high, uint8_t *ref_low,
+    uint8_t *a_high, uint8_t *a_low,
+    uint8_t *b_high, uint8_t *b_low,
+    uint64_t size)
+{
+  for (uint64_t i = 0; i < size; i++) {
+    int32_t ta = ((int8_t)a_high[i] << 8) + a_low[i];
+    int32_t tb = ((int8_t)b_high[i] << 8) + b_low[i];
+    int32_t res = ta | tb;
+    ref_high[i] = (res >> 8) & 0xff;
+    ref_low[i] = res & 0xff;
+  }
+}
+
+static int test_tl_or_int8(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align)
+{
+  int n = 3;
+  int c = 39;
+  int h = 7;
+  int w = 37;
+  int ret = 0;
+
+  cvk_tl_shape_t tl_shape;
+  tl_shape.n = n;
+  tl_shape.c = c;
+  tl_shape.h = h;
+  tl_shape.w = w;
+
+  uint32_t size = n * c * h * w;
+  int8_t *a_data = (int8_t *)malloc(size);
+  int8_t *b_data = (int8_t *)malloc(size);
+  int8_t *ref_data = (int8_t *)malloc(size);
+  if (!a_data || !b_data || !ref_data) {
+    ret = -1;
+    goto fail_exit;
+  }
+
+  for (uint32_t i = 0; i < size; i++)
+    a_data[i] = (int8_t)(i % 256);
+
+  for (uint32_t i = 0; i < size; i++)
+    b_data[i] = (int8_t)(100 - i % 256);
+
+  tl_or_int8_ref(a_data, b_data, ref_data, size);
+
+  cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align);
+  cvk_tl_t *tl_b = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align);
+  cvk_tl_t *tl_res = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align);
+  uint8_t *res_data = NULL;
+  if (!tl_a || !tl_b || !tl_res) {
+    ret = -1;
+    goto fail_exit_2;
+  }
+
+  tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data);
+  tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b, (uint8_t *)b_data);
+
+  cvk_tiu_or_int8_param_t p9;
+  p9.res = tl_res;
+  p9.a = tl_a;
+  p9.b = tl_b;
+  cvk_ctx->ops->tiu_or_int8(cvk_ctx, &p9);
+  res_data = tensor_copy_l2g_d2s(rt_handle, 
cvk_ctx, tl_res); + + for (uint32_t i = 0; i < size; i++) { + if ((int8_t)res_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%u], got %x, exp %x\n", + i, res_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + free(res_data); + +fail_exit: + free(a_data); + free(b_data); + free(ref_data); + + return ret; +} + +static int test_tl_or_int16(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int n = 2; // 3 -> 2 for 1810 + int c = 39; + int h = 7; + int w = 37; + int ret = 0; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + uint8_t *a_high_data = (uint8_t *)malloc(size); + uint8_t *a_low_data = (uint8_t *)malloc(size); + uint8_t *b_high_data = (uint8_t *)malloc(size); + uint8_t *b_low_data = (uint8_t *)malloc(size); + uint8_t *ref_high_data = (uint8_t *)malloc(size); + uint8_t *ref_low_data = (uint8_t *)malloc(size); + if (!a_high_data || !a_low_data || !b_high_data || + !b_low_data || !ref_high_data || !ref_low_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) { + a_high_data[i] = i / 10; + a_low_data[i] = i; + b_high_data[i] = (i + 250) / 20; + b_low_data[i] = 100 - i; + } + + + tl_or_int16_ref( + ref_high_data, ref_low_data, + a_high_data, a_low_data, + b_high_data, b_low_data, + size); + + cvk_tl_t *tl_a_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_a_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + uint8_t *res_high_data = NULL, *res_low_data = NULL; + if (!tl_a_low || !tl_a_high || !tl_b_low || !tl_b_high || !tl_res_low || !tl_res_high) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_low, a_low_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_high, a_high_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b_low, b_low_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b_high, b_high_data); + cvk_tiu_or_int16_param_t p9; + p9.res_high = tl_res_high; + p9.res_low = tl_res_low; + p9.a_high = tl_a_high; + p9.a_low = tl_a_low; + p9.b_high = tl_b_high; + p9.b_low = tl_b_low; + cvk_ctx->ops->tiu_or_int16(cvk_ctx, &p9); + res_high_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_high); + res_low_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_low); + + for (uint32_t i = 0; i < size; i++) { + if (res_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at res_high_data[%u], got %d, exp %d\n", + i, res_high_data[i], ref_high_data[i]); + ret = -1; + break; + } + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%u], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + ret = -1; + break; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b_high); + 
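+  /* Presumably the local-memory allocator is stack-like: every test here
+   * releases tensors in reverse allocation order (see the "Reverse order"
+   * notes in the mul_qm test). */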
cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_low); + + free(res_high_data); + free(res_low_data); + +fail_exit: + free(a_high_data); + free(a_low_data); + free(b_high_data); + free(b_low_data); + free(ref_high_data); + free(ref_low_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_or_int8(rt_handle, cvk_ctx, 0); + ret |= test_tl_or_int8(rt_handle, cvk_ctx, 1); + ret |= test_tl_or_int16(rt_handle, cvk_ctx, 0); + ret |= test_tl_or_int16(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_tensor_sub.c b/cviruntime/test/181x/test_181x_tensor_sub.c new file mode 100644 index 000000000..f17cf7df5 --- /dev/null +++ b/cviruntime/test/181x/test_181x_tensor_sub.c @@ -0,0 +1,158 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_sub_ref( + uint8_t *ref_high, uint8_t *ref_low, + uint8_t *a_high, uint8_t *a_low, + uint8_t *b_high, uint8_t *b_low, + uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) { + int32_t ta = ((int8_t)a_high[i] << 8) + a_low[i]; + int32_t tb = ((int8_t)b_high[i] << 8) + b_low[i]; + int32_t res = ta - tb; + if (res > 32767) + res = 32767; + else if (res < -32768) + res = -32768; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } +} + +static int test_tl_sub(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 2; // 3 -> 2 for 1810 + int c = 39; + int h = 7; + int w = 37; + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + uint8_t *a_high_data = (uint8_t *)malloc(size); + uint8_t *a_low_data = (uint8_t *)malloc(size); + uint8_t *b_high_data = (uint8_t *)malloc(size); + uint8_t *b_low_data = (uint8_t *)malloc(size); + if (!a_high_data || !a_low_data || !b_high_data || !b_low_data) + return -1; + + for (uint32_t i = 0; i < size; i++) { + a_high_data[i] = i / 10; + a_low_data[i] = i; + b_high_data[i] = (i + 250) / 20; + b_low_data[i] = 100 - i; + } + + uint8_t *ref_high_data = (uint8_t *)malloc(size); + uint8_t *ref_low_data = (uint8_t *)malloc(size); + if (!ref_high_data || !ref_low_data) { + ret = -1; + goto fail_exit; + } + + tl_sub_ref(ref_high_data, ref_low_data, + a_high_data, a_low_data, + b_high_data, b_low_data, + size); + + cvk_tl_t *tl_a_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_a_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_low = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_high = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + if (!tl_a_low || !tl_a_high || !tl_b_low || !tl_b_high || !tl_res_low || !tl_res_high) { + ret = -1; + goto fail_exit_2; + } + + 
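+  /* The 16-bit operands are carried as separate high/low int8 tensors:
+   * for a value v, high = (v >> 8) & 0xff and low = v & 0xff, mirroring
+   * tl_sub_ref above (e.g. v = 300 -> high 0x01, low 0x2c). */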
tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_low, a_low_data);
+  tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_high, a_high_data);
+  tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b_low, b_low_data);
+  tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b_high, b_high_data);
+  cvk_tiu_sub_param_t p5;
+  p5.res_high = tl_res_high;
+  p5.res_low = tl_res_low;
+  p5.a_high = tl_a_high;
+  p5.a_low = tl_a_low;
+  p5.b_high = tl_b_high;
+  p5.b_low = tl_b_low;
+  p5.rshift_bits = 0;
+  cvk_ctx->ops->tiu_sub(cvk_ctx, &p5);
+  uint8_t *res_high_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_high);
+  uint8_t *res_low_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_low);
+
+  for (uint32_t i = 0; i < size; i++) {
+    if (res_high_data[i] != ref_high_data[i]) {
+      fprintf(stderr, "comparing failed at res_high_data[%u], got %d, exp %d\n",
+              i, res_high_data[i], ref_high_data[i]);
+      ret = -1;
+      break;
+    }
+    if (res_low_data[i] != ref_low_data[i]) {
+      fprintf(stderr, "comparing failed at res_low_data[%u], got %d, exp %d\n",
+              i, res_low_data[i], ref_low_data[i]);
+      ret = -1;
+      break;
+    }
+  }
+
+  free(res_high_data);
+  free(res_low_data);
+
+fail_exit_2:
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_high);
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_low);
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b_high);
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b_low);
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_high);
+  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_low);
+
+fail_exit:
+  free(a_high_data);
+  free(a_low_data);
+  free(b_high_data);
+  free(b_low_data);
+  free(ref_high_data);
+  free(ref_low_data);
+
+  return ret;
+}
+
+int main(int argc, char **argv)
+{
+  int ret = 0;
+  CVI_RT_HANDLE rt_handle = NULL;
+  cvk_context_t *cvk_ctx = NULL;
+
+  if (!argc)
+    return -1;
+  if (!argv)
+    return -1;
+
+  CVI_RT_Init(&rt_handle);
+  cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE);
+  if (!rt_handle || !cvk_ctx) {
+    printf("%s fail\n", __FILENAME__);
+    return -1;
+  }
+
+  ret |= test_tl_sub(rt_handle, cvk_ctx, 0);
+  ret |= test_tl_sub(rt_handle, cvk_ctx, 1);
+
+  CVI_RT_UnRegisterKernel(cvk_ctx);
+  CVI_RT_DeInit(rt_handle);
+
+  return ret;
+}
diff --git a/cviruntime/test/181x/test_181x_tensor_transfer.c b/cviruntime/test/181x/test_181x_tensor_transfer.c
new file mode 100644
index 000000000..cb913b192
--- /dev/null
+++ b/cviruntime/test/181x/test_181x_tensor_transfer.c
@@ -0,0 +1,119 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "test_cvikernel_util.h"
+#include "test_native_ref.h"
+
+static int test_put_and_get_tensor_l2g(
+    CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx)
+{
+  int n = 2;
+  int c = 66;
+  int h = 3;
+  int w = 15;
+  int size = n * c * h * w;
+  uint8_t *data_x = (uint8_t *)malloc(size);
+  uint8_t *data_y = (uint8_t *)malloc(size);
+  if (!data_x || !data_y)
+    return -1;
+
+  for (int i = 0; i < size; i++)
+    data_x[i] = i - 100;
+
+  for (int i = 0; i < size; i++)
+    data_y[i] = -i;
+
+  /*
+   * Interleave two tensors in case the same devmem is reused between
+   * tensor_copy_s2d_g2l() and tensor_copy_l2g_d2s(), in which case the content of
+   * devmem is already what is expected before tdma_store(cvk_ctx, ).
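+   * With two differently-initialized tensors in flight, stale device
+   * memory cannot satisfy both readbacks, so a pass implies the store
+   * path really ran.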
+ */ + + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + cvk_tl_t *tl_x = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, 1); + cvk_tl_t *tl_y = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, 1); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_x, data_x); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_y, data_y); + + uint8_t *result_x = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_x); + uint8_t *result_y = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_y); + + for (int i = 0; i < size; i++) { + if (result_x[i] != data_x[i]) { + printf("compare failed at result_x[%d]\n", i); + return -1; + } + if (result_y[i] != data_y[i]) { + printf("compare failed at result_y[%d]\n", i); + return -1; + } + } + free(result_x); + free(result_y); + + /* + * Get result_y before result_x. + */ + + + result_y = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_y); + result_x = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_x); + for (int i = 0; i < size; i++) { + if (result_x[i] != data_x[i]) { + printf("compare failed at result_x[%d]\n", i); + return -1; + } + if (result_y[i] != data_y[i]) { + printf("compare failed at result_y[%d]\n", i); + return -1; + } + } + free(result_x); + free(result_y); + + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_y); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_x); + free(data_x); + free(data_y); + + return 0; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_put_and_get_tensor_l2g(rt_handle, cvk_ctx); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/181x/test_181x_tensor_xor.c b/cviruntime/test/181x/test_181x_tensor_xor.c new file mode 100644 index 000000000..0ee2f8cad --- /dev/null +++ b/cviruntime/test/181x/test_181x_tensor_xor.c @@ -0,0 +1,235 @@ +#include +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_xor_int8_ref(int8_t *a, int8_t *b, int8_t *res, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) + res[i] = a[i] ^ b[i]; +} + +static void tl_xor_int16_ref( + uint8_t *ref_high, uint8_t *ref_low, + uint8_t *a_high, uint8_t *a_low, + uint8_t *b_high, uint8_t *b_low, + uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) { + int32_t ta = ((int8_t)a_high[i] << 8) + a_low[i]; + int32_t tb = ((int8_t)b_high[i] << 8) + b_low[i]; + int32_t res = ta ^ tb; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } +} + +static int test_tl_xor_int8(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + int8_t *a_data = (int8_t *)malloc(size); + int8_t *b_data = (int8_t *)malloc(size); + int8_t *ref_data = (int8_t *)malloc(size); + if (!a_data || !b_data || !ref_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) + a_data[i] = (int8_t)(i % 256); + + for (uint32_t i = 0; i < size; i++) + b_data[i] = (int8_t)(100 - i % 256); + + tl_xor_int8_ref(a_data, b_data, ref_data, size); + + cvk_tl_t *tl_a = 
cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + uint8_t *res_data = NULL; + if (!tl_a || !tl_b || !tl_res) { + ret = -1; + goto fail_exit_2; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b, (uint8_t *)b_data); + + cvk_tiu_xor_int8_param_t p; + memset(&p, 0, sizeof(p)); + p.res = tl_res; + p.a = tl_a; + p.b = tl_b; + cvk_ctx->ops->tiu_xor_int8(cvk_ctx, &p); + res_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res); + + for (uint32_t i = 0; i < size; i++) { + if ((int8_t)res_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%u], got %x, exp %x\n", + i, res_data[i], ref_data[i]); + ret = -1; + break; + } + } + +fail_exit_2: + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + + free(res_data); + +fail_exit: + free(a_data); + free(b_data); + free(ref_data); + + return ret; +} + +static int test_tl_xor_int16(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int n = 2; // 3 -> 2 for 1810 + int c = 35; // 35 -> 2 for 1810 + int h = 7; + int w = 37; + int ret = 0; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + uint8_t *a_high_data = (uint8_t *)malloc(size); + uint8_t *a_low_data = (uint8_t *)malloc(size); + uint8_t *b_high_data = (uint8_t *)malloc(size); + uint8_t *b_low_data = (uint8_t *)malloc(size); + uint8_t *ref_high_data = (uint8_t *)malloc(size); + uint8_t *ref_low_data = (uint8_t *)malloc(size); + uint8_t *res_high_data = NULL; + uint8_t *res_low_data = NULL; + if (!a_high_data || !a_low_data || !b_high_data || + !b_low_data || !ref_high_data || !ref_low_data) { + ret = -1; + goto fail_exit; + } + + for (uint32_t i = 0; i < size; i++) { + a_high_data[i] = i / 10; + a_low_data[i] = i; + b_high_data[i] = (i + 250) / 20; + b_low_data[i] = 100 - i; + } + + + tl_xor_int16_ref( + ref_high_data, ref_low_data, + a_high_data, a_low_data, + b_high_data, b_low_data, + size); + + cvk_tl_t *tl_a_low = cvk_ctx->ops->lmem_alloc_tensor( cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_a_high = cvk_ctx->ops->lmem_alloc_tensor( cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b_low = cvk_ctx->ops->lmem_alloc_tensor( cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_b_high = cvk_ctx->ops->lmem_alloc_tensor( cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_low = cvk_ctx->ops->lmem_alloc_tensor( cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res_high = cvk_ctx->ops->lmem_alloc_tensor( cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + if (!tl_a_low || !tl_a_high || !tl_b_low || !tl_b_high || !tl_res_low || !tl_res_high) + return -1; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_low, a_low_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a_high, a_high_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b_low, b_low_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_b_high, b_high_data); + + cvk_tiu_xor_int16_param_t p; + memset(&p, 0, sizeof(p)); + p.res_high = tl_res_high; + p.res_low = tl_res_low; + p.a_high = tl_a_high; + p.a_low = tl_a_low; + p.b_high = tl_b_high; + p.b_low = tl_b_low; + cvk_ctx->ops->tiu_xor_int16(cvk_ctx, &p); + res_high_data = 
tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_high); + res_low_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res_low); + + for (uint64_t i = 0; i < size; i++) { + if (res_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at res_high_data[%" PRIu64 "], got %d, exp %d\n", + i, res_high_data[i], ref_high_data[i]); + return -1; + } + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + return -1; + } + } + + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_b_low); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_high); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a_low); + +fail_exit: + free(a_high_data); + free(a_low_data); + free(b_high_data); + free(b_low_data); + free(ref_high_data); + free(ref_low_data); + free(res_high_data); + free(res_low_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + CVI_RT_HANDLE rt_handle = NULL; + cvk_context_t *cvk_ctx = NULL; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + if (!rt_handle || !cvk_ctx) { + printf("%s fail\n", __FILENAME__); + return -1; + } + + ret |= test_tl_xor_int8(rt_handle, cvk_ctx, 0); + ret |= test_tl_xor_int8(rt_handle, cvk_ctx, 1); + ret |= test_tl_xor_int16(rt_handle, cvk_ctx, 0); + ret |= test_tl_xor_int16(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + return ret; +} diff --git a/cviruntime/test/1822/1822_test_util.h b/cviruntime/test/1822/1822_test_util.h new file mode 100644 index 000000000..362109c2d --- /dev/null +++ b/cviruntime/test/1822/1822_test_util.h @@ -0,0 +1,1268 @@ +#ifndef INC_1822_TEST_UTIL_H +#define INC_1822_TEST_UTIL_H + +#include +#include +#include +#include "test_native_ref.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compression.h" +#include "bm_vlc_compress.h" +#include "1822_vlc_random_gen_nn_data.h" +#include +#include +#include +#include "test_neuron_dump.hpp" + +#define math_min(x, y) ((x) < (y) ? (x) : (y)) +#define math_max(x, y) ((x) > (y) ? 
(x) : (y)) +#define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask)) +#define ALIGN(x,a) __ALIGN_MASK(x,(__typeof__(x))(a)-1) + +typedef uint32_t laddr_t; +typedef uint64_t gaddr_t; + +//#define ENABEL_SIMPLE_BMK1822_VLC_TEST +#define ENABEL_GAUSSIANRANDOM_BMK1822_VLC_TEST + +typedef bmk1822_context_t bmk_ctx_t; + +typedef bmk1822_tensor_lmem_shape_t tl_shape_t; +typedef bmk1822_matrix_lmem_shape_t ml_shape_t; +typedef bmk1822_tensor_tgmem_shape_t tg_shape_t; +typedef bmk1822_matrix_tgmem_shape_t mg_shape_t; + +typedef bmk1822_tensor_lmem_t tl_t; +typedef bmk1822_matrix_lmem_t ml_t; +typedef bmk1822_tensor_tgmem_t tg_t; +typedef bmk1822_matrix_tgmem_t mg_t; +typedef bmk1822_compressed_tensor_tgmem_t compressed_tg_t; +typedef bmk1822_compressed_matrix_tgmem_t compressed_mg_t; + +typedef bmk1822_tensor_tgmem_stride_t tg_stride_t; +typedef bmk1822_matrix_tgmem_stride_t mg_stride_t; + +typedef struct { + tg_t tg; + bmmem_device_t mem; +} tg_wrapper_t; + +typedef struct { + mg_t mg; + bmmem_device_t mem; +} mg_wrapper_t; + +typedef struct { + compressed_tg_t tg; + bmmem_device_t mem; +} compressed_tg_wrapper_t; + +typedef struct { + compressed_mg_t mg; + bmmem_device_t mem; +} compressed_mg_wrapper_t; + +typedef enum { + VLC_CMP_MODE_HW = 0, // dim_size <= BM_SHAPE_MAX_DIM, NULL); + size_t size = BM_FMT_BPP(s->fmt) / 8; + + for (int i = 0; i < s->dim_size; i++) { + TPU_ASSERT(s->dim[i] > 0, NULL); + size *= s->dim[i]; + } + return size; +} + +static inline int dim_size(const dim_t *dim) +{ + return dim->n * dim->c * dim->h * dim->w; +} + +static inline u64 tl_shape_size(const tl_shape_t *s) +{ + return (u64)s->n * s->c * s->h * s->w; +} + +static inline u64 ml_shape_size(const ml_shape_t *s) +{ + return (u64)s->n * s->col; +} + +static inline u64 mg_shape_size(const mg_shape_t *s) +{ + return (u64)s->row * s->col; +} + +static inline u64 tg_shape_size(const tg_shape_t *s) +{ + return (u64)s->n * s->c * s->h * s->w; +} + +static inline dim_t dim_of_ith_element(int i, dim_t *dim, int transpose) +{ + int channel_offset = i % (dim->h * dim->w); + int hidx = channel_offset / dim->w; + int widx = channel_offset % dim->w; + int channel_index = i / (dim->h * dim->w); + int nidx = channel_index / dim->c; + int cidx = channel_index % dim->c; + if (transpose) { + nidx = channel_index % dim->n; + cidx = channel_index / dim->n; + } + dim_t r = { nidx, cidx, hidx, widx }; + return r; +} + +static inline void * xmalloc(size_t size) +{ + void *p = malloc(size); + assert(p); + return p; +} + +static inline void test_init(bmctx_t *ctx, bmk_ctx_t **bmk) +{ + int ret = bm_init(0, ctx); + if (ret != BM_SUCCESS) { + fprintf(stderr, "bm_init failed, err %d\n", ret); + exit(-1); + } + + cviruntime_cvikernel_create(*ctx, (void**)bmk); +} + +static inline void test_submit(bmctx_t *ctx) +{ + cviruntime_cvikernel_submit(*ctx); +} + +static inline void test_exit(bmctx_t *ctx) +{ + cviruntime_cvikernel_destroy(*ctx); + bm_exit(*ctx); +} + +static inline tl_t * alloc_tl(bmk_ctx_t *bmk, tl_shape_t s, fmt_t f, int align) +{ + tl_t *t = bmk1822_lmem_alloc_tensor(bmk, s, f, align); + t->cmprs_fmt = f; + assert(t); + return t; +} + +static inline ml_t * alloc_ml(bmk_ctx_t *bmk, ml_shape_t s, int align) +{ + ml_t *m = bmk1822_lmem_alloc_matrix(bmk, s, FMT_I8, align); + assert(m); + return m; +} + +static inline ml_t * alloc_ml(bmk_ctx_t *bmk, ml_shape_t s, fmt_t f, int align) +{ + ml_t *m = bmk1822_lmem_alloc_matrix(bmk, s, f, align); + assert(m); + return m; +} + +static inline ml_t * alloc_ml_bf16(bmk_ctx_t *bmk, ml_shape_t s, fmt_t 
f,int align) +{ + ml_t *m = bmk1822_lmem_alloc_matrix(bmk, s, f, align); + assert(m); + return m; +} + +static inline tg_t * alloc_tg_gmem(bmctx_t *ctx, tg_shape_t s, fmt_t fmt) +{ + bmshape_t bms = BM_TENSOR_INT8( + (int)s.n, + (int)s.c, + (int)s.h, + (int)s.w); + + tg_wrapper_t *w = (tg_wrapper_t *)malloc(sizeof(tg_wrapper_t)); + + //w->mem = bmmem_device_alloc(*ctx, &bms); + w->mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + w->tg.base_reg_index = 0; + w->tg.start_address = bmmem_device_addr(w->mem); + w->tg.fmt = fmt; + w->tg.shape = s; + w->tg.stride = bmk1822_tensor_tgmem_default_stride(s, w->tg.fmt); + + return &w->tg; +} + +static inline tg_t * _alloc_tg_bf16_gmem(bmctx_t *ctx, tg_shape_t s, fmt_t fmt, + bmk1822_tensor_tgmem_stride_t* tg_stride) +{ + u32 val = (fmt == FMT_BF16) ? 2 : 1; + bmshape_t bms = BM_TENSOR_INT8( + (int)s.n, + (int)s.c, + (int)s.h, + (int)s.w * (int)val); + + tg_wrapper_t *w = (tg_wrapper_t *)malloc(sizeof(tg_wrapper_t)); + //w->mem = bmmem_device_alloc(*ctx, &bms); + w->mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + w->tg.base_reg_index = 0; + w->tg.start_address = bmmem_device_addr(w->mem); + w->tg.fmt = fmt; + w->tg.shape = s; + if (tg_stride) { + w->tg.stride = *tg_stride; + } + else { + w->tg.stride = bmk1822_tensor_tgmem_default_stride(s, fmt); + } + return &w->tg; +} + +static inline tg_t * alloc_tg_bf16_gmem(bmctx_t *ctx, tg_shape_t s, fmt_t fmt) +{ + return _alloc_tg_bf16_gmem(ctx, s, fmt, NULL); +} + +static inline mg_t * alloc_mg_gmem(bmctx_t *ctx, mg_shape_t s) +{ + bmshape_t bms = BM_MATRIX_INT8((int)s.row, (int)s.col); + mg_wrapper_t *w = (mg_wrapper_t *)malloc(sizeof(mg_wrapper_t)); + //w->mem = bmmem_device_alloc(*ctx, &bms); + w->mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + w->mg.base_reg_index = 0; + w->mg.start_address = bmmem_device_addr(w->mem); + w->mg.shape = s; + w->mg.stride.row = s.col; + + return &w->mg; +} + +static inline compressed_mg_t* alloc_compressed_mg_gmem(bmctx_t *ctx, mg_shape_t s) +{ + bmshape_t bms = BM_MATRIX_INT8((int)s.row, (int)s.col); + compressed_mg_wrapper_t *w = (compressed_mg_wrapper_t *)malloc(sizeof(compressed_mg_wrapper_t)); + //w->mem = bmmem_device_alloc(*ctx, &bms); + w->mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + w->mg.m.base_reg_index = 0; + w->mg.m.start_address = bmmem_device_addr(w->mem); + w->mg.m.shape = s; + w->mg.m.stride.row = s.col; + + return &w->mg; +} + +static inline mg_t * alloc_mg_bf16_gmem(bmctx_t *ctx, mg_shape_t s, fmt_t fmt) +{ + + u32 val = (fmt == FMT_BF16) ? 
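+  /* BF16 elements occupy two bytes, so the bf16-aware allocators reuse the
+   * int8 shape descriptors with the innermost dimension scaled by the
+   * element size. A sketch of the sizing rule used here and in
+   * _alloc_tg_bf16_gmem above:
+   *
+   *   bytes = row * col * ((fmt == FMT_BF16) ? 2 : 1)        // matrix
+   *   bytes = n * c * h * (w * ((fmt == FMT_BF16) ? 2 : 1))  // tensor
+   */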
2 : 1;
+  bmshape_t bms = BM_MATRIX_INT8((int)s.row, (int)s.col * (int)val);
+  mg_wrapper_t *w = (mg_wrapper_t *)malloc(sizeof(mg_wrapper_t));
+  //w->mem = bmmem_device_alloc(*ctx, &bms);
+  w->mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms));
+  w->mg.base_reg_index = 0;
+  w->mg.start_address = bmmem_device_addr(w->mem);
+  w->mg.shape = s;
+  w->mg.fmt = fmt;
+  w->mg.stride.row = s.col * val;
+  //printf("w->mg.stride.row =%x s.col=%x val=%x\n",w->mg.stride.row, s.col, val);
+  return &w->mg;
+}
+
+static inline compressed_tg_t * alloc_compressed_tg_gmem(
+    bmctx_t *ctx, tl_shape_t *s, u8 bit_length)
+{
+  u64 size = tl_shape_size(s);
+  u64 header_bytes = 16;
+  u64 map_bytes = compression_map_bytes(size);
+  u64 data_bytes = compression_data_bytes(size, bit_length);
+  u64 total_bytes = header_bytes + map_bytes + data_bytes;
+  compressed_tg_wrapper_t *w = (compressed_tg_wrapper_t *)malloc(sizeof(compressed_tg_wrapper_t));
+  w->mem = bmmem_device_alloc_raw(*ctx, total_bytes);
+  w->tg.t.base_reg_index = 0;
+  w->tg.t.start_address = bmmem_device_addr(w->mem);
+  w->tg.reserved_size = total_bytes;
+  w->tg.bit_length = bit_length;
+  w->tg.t.shape.n = s->n;
+  w->tg.t.shape.c = s->c;
+  w->tg.t.shape.h = s->h;
+  w->tg.t.shape.w = s->w;
+  w->tg.t.stride = bmk1822_tensor_tgmem_default_stride(w->tg.t.shape, w->tg.t.fmt);
+  return &w->tg;
+}
+
+static inline compressed_tg_t * _alloc_vlc_compressed_tg_gmem(
+    bmctx_t *ctx, tl_shape_t *s, fmt_t fmt, CommandInfo* cmd_info)
+{
+  u64 in_size = tl_shape_size(s);
+  u8 data_type = (fmt == FMT_BF16) ? 1 : 0;
+  in_size *= bytesize_of_fmt(fmt);
+
+  size_t bs_buf_size = get_out_bs_buf_size(in_size, data_type);
+
+  compressed_tg_wrapper_t *w = (compressed_tg_wrapper_t *)malloc(sizeof(compressed_tg_wrapper_t));
+
+  w->mem = bmmem_device_alloc_raw(*ctx, bs_buf_size);
+  w->tg.t.base_reg_index = 0;
+  w->tg.t.start_address = bmmem_device_addr(w->mem);
+  w->tg.reserved_size = bs_buf_size;
+  w->tg.t.fmt = fmt;
+
+  if (cmd_info) {
+    w->tg.bias0 = cmd_info->bias0;
+    w->tg.bias1 = cmd_info->bias1;
+    w->tg.zero_guard_en = cmd_info->zero_guard_en;
+  }
+  else {
+    if (fmt == FMT_BF16) {
+      w->tg.bias0 = 127;
+    }
+    else if (fmt == FMT_I8 || fmt == FMT_U8) {
+      w->tg.bias0 = 0;
+    }
+    else {
+      printf("only FMT_BF16/FMT_I8/FMT_U8 are accepted, got format %d\n", fmt);
+      assert(0);
+    }
+
+    w->tg.bias1 = 0;
+    // tg.zero_guard_en = 0;
+  }
+  w->tg.t.shape.n = s->n;
+  w->tg.t.shape.c = s->c;
+  w->tg.t.shape.h = s->h;
+  w->tg.t.shape.w = s->w;
+  w->tg.t.stride = bmk1822_tensor_tgmem_default_stride(w->tg.t.shape, fmt);
+
+  return &w->tg;
+}
+
+static inline compressed_tg_t * alloc_vlc_compressed_tg_gmem(
+    bmctx_t *ctx, tl_shape_t *s, fmt_t fmt)
+{
+  return _alloc_vlc_compressed_tg_gmem(ctx, s, fmt, NULL);
+}
+
+/**
+ * \shape_size number of elements
+ * \signedness 0 means unsigned, 1 means signed
+ * \data_type  0 means int8, 1 means bf16
+ */
+static inline void vlc_init_testdata(u16 *src_data, u64 shape_size, bool signedness, bool data_type) {
+#ifdef ENABEL_GAUSSIANRANDOM_BMK1822_VLC_TEST
+  float zero_ratio = 0;
+  assert(signedness == 0); // zero_range) {
+    for (u64 i = 0; i < shape_size - zero_range; i++) {
+      src_data[i] = 0;
+    }
+  }
+#endif /* ifdef ENABEL_GAUSSIANRANDOM_BMK1822_VLC_TEST */
+}
+
+static inline void vlc_init_testdata(u8 *src_data, u64 shape_size, bool signedness, bool data_type) {
+  memset(src_data, 0x00, shape_size);
+#ifdef ENABEL_GAUSSIANRANDOM_BMK1822_VLC_TEST
+  float zero_ratio = 0;
+  assert(data_type == 0); // zero_range) {
+    for (u64 i = 0; i < shape_size - zero_range; i++) {
+      src_data[i] = 0;
+    }
+  }
+#endif /* ifdef ENABEL_GAUSSIANRANDOM_BMK1822_VLC_TEST */
+}
+
+static inline compressed_mg_t * alloc_vlc_compressed_mg_gmem(
+    bmctx_t *ctx, mg_shape_t s, fmt_t fmt, CommandInfo* cmd_info)
+{
+  u64 in_size = mg_shape_size(&s);
+  u8 data_type = (fmt == FMT_BF16) ? 1 : 0;
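+  /* VLC buffers are sized for the worst case: get_out_bs_buf_size() takes
+   * the raw payload size in bytes (elements * bytesize_of_fmt(fmt)), so the
+   * bitstream can never outgrow its allocation even for incompressible
+   * data. The fixed-compression variant above instead budgets explicitly:
+   * 16-byte header + compression_map_bytes(size) +
+   * compression_data_bytes(size, bit_length). */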
+  in_size *= bytesize_of_fmt(fmt);
+
+  size_t bs_buf_size = get_out_bs_buf_size(in_size, data_type);
+
+  compressed_mg_wrapper_t *w = (compressed_mg_wrapper_t *)malloc(sizeof(compressed_mg_wrapper_t));
+
+  w->mem = bmmem_device_alloc_raw(*ctx, bs_buf_size);
+  w->mg.m.shape = s;
+  w->mg.m.stride.row = s.col * bytesize_of_fmt(fmt);
+  w->mg.m.base_reg_index = 0;
+  w->mg.m.fmt = fmt;
+  w->mg.m.start_address = bmmem_device_addr(w->mem);
+
+  if (cmd_info) {
+    w->mg.bias0 = cmd_info->bias0;
+    w->mg.bias1 = cmd_info->bias1;
+    w->mg.zero_guard_en = cmd_info->zero_guard_en;
+  }
+  else {
+    w->mg.bias0 = 0;
+
+    if (fmt == FMT_BF16) {
+      w->mg.bias0 = 127;
+    }
+    else if (fmt == FMT_I8 || fmt == FMT_U8) {
+      w->mg.bias0 = 0;
+    }
+    else {
+      printf("only FMT_BF16/FMT_I8/FMT_U8 are accepted, got format %d\n", fmt);
+      assert(0);
+    }
+
+    w->mg.bias1 = 0;
+    // mg.zero_guard_en = 0;
+  }
+
+  return &w->mg;
+}
+
+/**
+ * \cmd_info_est_in  manually-set compression parameters; possible inputs:
+ *   1. NULL: parameters are estimated via \bm_vlc_est_weight_bias
+ *   2. non-NULL: passed directly to \bm_vlc_enc_int8 / \bm_vlc_enc_bf16
+ * \cmd_info_est_out receives the estimation result; possible values:
+ *   1. *cmd_info_est_out = *cmd_info_est_in once cmd_info_est_in != NULL
+ *   2. *cmd_info_est_out = the estimated result once cmd_info_est_in == NULL
+ *   3. pass NULL if you don't care
+ */
+static inline u8 *vlc_compress (
+    u8 *src_data, u64 size, int is_signed, int data_type, size_t* bs_size, const CommandInfo* cmd_info_est_in, CommandInfo* cmd_info_est_out)
+{
+  CommandInfo cmd_info;
+  size_t bs_buf_size = get_out_bs_buf_size(size, data_type);
+
+  u8 *bsbuf = (u8 *)malloc(sizeof(u8) * bs_buf_size);
+  memset(&cmd_info, 0x00, sizeof(CommandInfo));
+
+  /* generate compressed data (bsbuf) */
+  if (cmd_info_est_in) {
+    memcpy(&cmd_info, cmd_info_est_in, sizeof(CommandInfo));
+  }
+  else {
+    bm_vlc_est_weight_bias(src_data, size, (bool)is_signed, (bool)data_type, &cmd_info);
+  }
+
+  if (cmd_info_est_out) {
+    memcpy(cmd_info_est_out, &cmd_info, sizeof(CommandInfo));
+  }
+
+  if (data_type) {
+    bm_vlc_enc_bf16((u16*)src_data, size, bsbuf, bs_size, &cmd_info);
+  }
+  else {
+    bm_vlc_enc_int8(src_data, size, bsbuf, bs_size, &cmd_info);
+  }
+
+  return bsbuf;
+}
+
+static inline int get_vlc_compressed_meta(
+    u8 *src_data, u64 in_size, fmt_t fmt, size_t* bs_size, CommandInfo* cmd_info)
+{
+  int is_signed = (fmt == FMT_I8);
+  int data_type = (fmt == FMT_BF16) ? 
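+  /* A typical host-side call pattern for vlc_compress() (an illustrative
+   * sketch): let it estimate the bias parameters and capture them for
+   * programming the compressed descriptor,
+   *
+   *   CommandInfo ci;
+   *   size_t bs_size = 0;
+   *   u8 *bs = vlc_compress(src, size, (fmt == FMT_I8), (fmt == FMT_BF16),
+   *                         &bs_size, NULL, &ci);
+   *   // ...copy bs into the compressed tg/mg, program ci.bias0/ci.bias1...
+   *   free(bs);
+   */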
1 : 0; + //bm_vlc_est_weight_bias(src_data, in_size, (bool)is_signed, (bool)data_type, cmd_info); + + u8 *ref_data = vlc_compress(src_data, in_size, is_signed, data_type, bs_size, cmd_info, NULL); + free(ref_data); + return 0; +} + +static inline void free_tl(bmk_ctx_t *bmk, const tl_t *t) +{ + return bmk1822_lmem_free_tensor(bmk, t); +} + +static inline void free_ml(bmk_ctx_t *bmk, const ml_t *m) +{ + return bmk1822_lmem_free_matrix(bmk, m); +} + +static inline void free_tg_gmem(bmctx_t *ctx, const tg_t *tg) +{ + tg_wrapper_t *w = (typeof(w))tg; + bmmem_device_free(*ctx, w->mem); + free(w); +} + +static inline void free_mg_gmem(bmctx_t *ctx, const mg_t *mg) +{ + mg_wrapper_t *w = (typeof(w))mg; + bmmem_device_free(*ctx, w->mem); + free(w); +} + +static inline void free_compressed_tg_gmem( + bmctx_t *ctx, const compressed_tg_t *t) +{ + compressed_tg_wrapper_t *w = (typeof(w))t; + bmmem_device_free(*ctx, w->mem); + free(w); +} + +static inline void free_compressed_mg_gmem( + bmctx_t *ctx, const compressed_mg_t *t) +{ + compressed_mg_wrapper_t *w = (typeof(w))t; + bmmem_device_free(*ctx, w->mem); + free(w); +} + +static inline u8 * get_tg_gmem(bmctx_t *ctx, const tg_t *tg) +{ + tg_shape_t s = tg->shape; + u32 size = s.n * s.c * s.h * s.w; + u8 *data = (u8 *)malloc(sizeof(u8) * size); + + tg_wrapper_t *w = (typeof(w))tg; + int ret = bm_memcpy_d2s(*ctx, data, w->mem); + assert(ret == BM_SUCCESS); + + return data; +} + +static inline u8 * get_tg_bf16_gmem(bmctx_t *ctx, const tg_t *tg) +{ + tg_shape_t s = tg->shape; + u32 size = s.n * s.c * s.h * s.w * (tg->fmt == FMT_BF16 ? 2 : 1); + u8 *data = (u8 *)malloc(sizeof(u8) * size); + tg_wrapper_t *w = (typeof(w))tg; + int ret = bm_memcpy_d2s(*ctx, data, w->mem); + assert(ret == BM_SUCCESS); + + return data; +} + +static inline u8 * get_mg_gmem(bmctx_t *ctx, const mg_t *mg) +{ + mg_shape_t s = mg->shape; + u32 size = s.row * s.col; + u8 *data = (u8 *) malloc(sizeof(u8) * size); + + mg_wrapper_t *w = (typeof(w))mg; + int ret = bm_memcpy_d2s(*ctx, data, w->mem); + assert(ret == BM_SUCCESS); + + return data; +} + +static inline u8 * get_compressed_mg_gmem(bmctx_t *ctx, const compressed_mg_t *mg, size_t bs_size) +{ + //mg_shape_t s = mg->m.shape; + //u32 size = s.row * s.col; + u8 *data = (u8 *)malloc(sizeof(u8) * bs_size); + + compressed_mg_wrapper_t *w = (typeof(w))mg; + int ret = bm_memcpy_d2s(*ctx, data, w->mem); + assert(ret == BM_SUCCESS); + + return data; +} + +static inline u8 * get_mg_bf16_gmem(bmctx_t *ctx, const mg_t *mg) +{ + mg_shape_t s = mg->shape; + u32 size = s.row * s.col * (mg->fmt == FMT_BF16 ? 
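+  /* All free_* / get_* helpers rely on the wrapper layout: the public
+   * descriptor is the *first* member, so its address equals the wrapper's
+   * address and the device handle can be recovered with a cast:
+   *
+   *   typedef struct { tg_t tg; bmmem_device_t mem; } tg_wrapper_t;
+   *   tg_wrapper_t *w = (tg_wrapper_t *)tg;  // valid for the first member only
+   *   bmmem_device_free(*ctx, w->mem);
+   */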
2 : 1); + u8 *data = (u8 *)malloc(sizeof(u8) *size); + + mg_wrapper_t *w = (typeof(w))mg; + int ret = bm_memcpy_d2s(*ctx, data, w->mem); + assert(ret == BM_SUCCESS); + + return data; +} + +static inline u8 * get_compressed_tg_gmem( + bmctx_t *ctx, const compressed_tg_t *t) +{ + compressed_tg_wrapper_t *w = (typeof(w))t; + + u8 *data = (u8 *)malloc(sizeof(u8) * t->reserved_size); + int ret = bm_memcpy_d2s(*ctx, data, w->mem); + assert(ret == BM_SUCCESS); + + return data; +} + +static inline u8 * get_bytes_gmem(bmctx_t *ctx, bmmem_device_t mem, u64 size) +{ + //bmmem_device_t mem = bmmem_device_prealloc_raw(*ctx, NULL, addr, size); + + u8 *data = (u8 *)malloc(sizeof(u8) * size); + int ret = bm_memcpy_d2s(*ctx, data, mem); + assert(ret == BM_SUCCESS); + + return data; +} + +static inline void put_tg_gmem(bmctx_t *ctx, const tg_t *tg, u8 data[]) +{ + tg_wrapper_t *w = (typeof(w))tg; + int ret = bm_memcpy_s2d(*ctx, w->mem, data); + assert(ret == BM_SUCCESS); +} + +static inline void put_tg_bf16_gmem(bmctx_t *ctx, const tg_t *tg, u8 data[]) +{ + tg_wrapper_t *w = (typeof(w))tg; + int ret = bm_memcpy_s2d(*ctx, w->mem, data); + assert(ret == BM_SUCCESS); +} + +static inline void put_mg_gmem(bmctx_t *ctx, const mg_t *mg, u8 data[]) +{ + mg_wrapper_t *w = (typeof(w))mg; + int ret = bm_memcpy_s2d(*ctx, w->mem, data); + assert(ret == BM_SUCCESS); +} + +static inline void put_mg_bf16_gmem(bmctx_t *ctx, const mg_t *mg, u8 data[]) +{ + mg_wrapper_t *w = (typeof(w))mg; + int ret = bm_memcpy_s2d(*ctx, w->mem, data); + assert(ret == BM_SUCCESS); +} + +#if 0 +static inline void put_bytes_gmem(bmctx_t *ctx, u64 addr, u64 size, u8 data[]) +{ + bmmem_device_t mem = bmmem_device_prealloc_raw(*ctx, NULL, addr, size); + + int ret = bm_memcpy_s2d(*ctx, mem, data); + assert(ret == BM_SUCCESS); +} +#else +static inline void put_bytes_gmem(bmctx_t *ctx, bmmem_device_t mem, u8 data[]) +{ + //bmmem_device_t mem = bmmem_device_prealloc_raw(*ctx, NULL, addr, size); + + int ret = bm_memcpy_s2d(*ctx, mem, data); + assert(ret == BM_SUCCESS); +} +#endif + +static inline void put_compressed_tg_gmem( + bmctx_t *ctx, const compressed_tg_t *t, u8 buf[], u64 size) +{ + assert(size <= t->reserved_size); + + compressed_tg_wrapper_t *w = (typeof(w))t; + //u64 addr = bmmem_device_addr(w->mem); + + //put_bytes_gmem(ctx, addr, size, buf); + put_bytes_gmem(ctx, w->mem, buf); +} + +static inline void put_compressed_mg_gmem( + bmctx_t *ctx, const compressed_mg_t *t, u8 buf[], u64 size) +{ + assert(size != 0); + + compressed_mg_wrapper_t *w = (typeof(w))t; + //u64 addr = bmmem_device_addr(w->mem); + + put_bytes_gmem(ctx, w->mem, buf); +} + +static inline void put_tensor_g2l( + bmctx_t *ctx, bmk_ctx_t *bmk, const tl_t *tl, u8 data[]) +{ + tg_shape_t s; + s.n = tl->shape.n; + s.c = tl->shape.c; + s.h = tl->shape.h; + s.w = tl->shape.w; + tg_t *tg = alloc_tg_gmem(ctx, s, FMT_I8); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = tg; + p.dst = tl; + + put_tg_gmem(ctx, tg, data); + bmk1822_tdma_g2l_tensor_copy(bmk, &p); + test_submit(ctx); + + free_tg_gmem(ctx, tg); +} + +/** + * prepard mean you alloc address but not submit it + * once submit it could re-assign from head + */ +static inline tg_t* prepare_put_bf16_tensor_g2l( + bmctx_t *ctx, bmk_ctx_t *bmk, const tl_t *tl, u16 data[], fmt_t tg_data_format, +bmk1822_tdma_tg2l_tensor_copy_param_t* p) +{ + tg_shape_t s; + s.n = tl->shape.n; + s.c = tl->shape.c; + s.h = tl->shape.h; + s.w = tl->shape.w; + tg_t *tg = alloc_tg_bf16_gmem(ctx, s, tg_data_format); + + 
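+  /* The prepare/launch split lets a caller set up addresses first and submit
+   * later. A sketch of the intended pairing with launch_put_bf16_tensor_g2l()
+   * below:
+   *
+   *   bmk1822_tdma_tg2l_tensor_copy_param_t p;
+   *   memset(&p, 0, sizeof(p));
+   *   tg_t *tg = prepare_put_bf16_tensor_g2l(&ctx, bmk, tl, data, FMT_BF16, &p);
+   *   launch_put_bf16_tensor_g2l(&ctx, bmk, tg, &p);  // submits and frees tg
+   */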
p->src = tg; + p->dst = tl; + + assert(bmk); + + put_tg_bf16_gmem(ctx, tg, (u8 *)data); + return tg; +} + +/** + * issue prepared one + */ +static inline void launch_put_bf16_tensor_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, +const tg_t *tg, bmk1822_tdma_tg2l_tensor_copy_param_t* p) { + bmk1822_tdma_g2l_bf16_tensor_copy(bmk, p); + test_submit(ctx); + free_tg_gmem(ctx, tg); +} + +static inline void put_bf16_tensor_g2l( + bmctx_t *ctx, bmk_ctx_t *bmk, const tl_t *tl, u16 data[], fmt_t tg_data_format) +{ + tg_shape_t s; + s.n = tl->shape.n; + s.c = tl->shape.c; + s.h = tl->shape.h; + s.w = tl->shape.w; + tg_t *tg = alloc_tg_bf16_gmem(ctx, s, tg_data_format); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = tg; + p.dst = tl; + + put_tg_bf16_gmem(ctx, tg, (u8 *)data); + bmk1822_tdma_g2l_bf16_tensor_copy(bmk, &p); + test_submit(ctx); + free_tg_gmem(ctx, tg); +} + +static inline void put_matrix_g2l( + bmctx_t *ctx, bmk_ctx_t *bmk, const ml_t *ml, u8 data[]) +{ + mg_shape_t s; + s.row = ml->shape.n; + s.col = ml->shape.col; + mg_t *mg = alloc_mg_gmem(ctx, s); + + bmk1822_tdma_tg2l_matrix_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = mg; + p.dst = ml; + + put_mg_gmem(ctx, mg, data); + bmk1822_tdma_g2l_matrix_copy(bmk, &p); + test_submit(ctx); + + free_mg_gmem(ctx, mg); +} + + +static inline void put_bf16_matrix_g2l( + bmctx_t *ctx, bmk_ctx_t *bmk, const ml_t *ml, u8 data[], fmt_t mg_data_format) +{ + mg_shape_t s; + s.row = ml->shape.n; + s.col = ml->shape.col; + mg_t *mg = alloc_mg_bf16_gmem(ctx, s, mg_data_format); + + bmk1822_tdma_tg2l_matrix_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = mg; + p.dst = ml; + + put_mg_bf16_gmem(ctx, mg, data); + bmk1822_tdma_g2l_bf16_matrix_copy(bmk, &p); + test_submit(ctx); + + free_mg_gmem(ctx, mg); +} + +static inline void put_bytes_g2l( + bmctx_t *ctx, bmk_ctx_t *bmk, u32 lmem_addr, u64 size, u8 data[]) +{ + bmmem_device_t mem = bmmem_device_alloc_raw(*ctx, size); + u64 gmem_addr = bmmem_device_addr(mem); + + bmk1822_tdma_tg2l_general_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src_base_reg_index = 0; + p.src_address = gmem_addr; + p.dst_address = lmem_addr; + p.bytes = size; + + //put_bytes_gmem(ctx, gmem_addr, size, data); + put_bytes_gmem(ctx, mem, data); + + bmk1822_tdma_g2l_general_copy(bmk, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, mem); +} + +static inline u8 * get_tensor_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, const tl_t *tl) +{ + tg_shape_t s; + s.n = tl->shape.n; + s.c = tl->shape.h; + s.h = tl->shape.w; + s.w = tl->shape.c; + tg_t *tg = alloc_tg_gmem(ctx, s, FMT_I8); + + bmk1822_tdma_l2tg_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = tl; + p.dst = tg; + + bmk1822_tdma_l2g_tensor_copy(bmk, &p); + test_submit(ctx); + u8 *data = get_tg_gmem(ctx, tg); + + free_tg_gmem(ctx, tg); + return data; +} + +static inline u8 * get_bf16_tensor_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, const tl_t *tl, fmt_t tg_data_format) +{ + tg_shape_t s; + s.n = tl->shape.n; + s.c = tl->shape.h; + s.h = tl->shape.w; + s.w = tl->shape.c; + + tg_t *tg = alloc_tg_bf16_gmem(ctx, s, tg_data_format); // alloc tg to bf16 or int8 mode + + bmk1822_tdma_l2tg_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = tl; + p.dst = tg; + + bmk1822_tdma_l2g_bf16_tensor_copy(bmk, &p); + test_submit(ctx); + u8 *data = get_tg_bf16_gmem(ctx, tg); + + free_tg_gmem(ctx, tg); + return data; +} + + +static inline u8 * get_matrix_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, const ml_t *ml) +{ + mg_shape_t s; + s.row = ml->shape.n; + s.col = 
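+  /* Readback follows the same two-hop path as the put_* helpers, in reverse:
+   *
+   *   lmem (tl/ml) --TDMA l2g--> gmem (tg/mg) --bm_memcpy_d2s--> host buffer
+   *
+   * so every get_* helper allocates a temporary global-memory buffer, submits
+   * the copy, pulls the bytes to the host, and frees the staging memory. */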
ml->shape.col; + mg_t *mg = alloc_mg_gmem(ctx, s); + + bmk1822_tdma_l2tg_matrix_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = ml; + p.dst = mg; + + bmk1822_tdma_l2g_matrix_copy(bmk, &p); + test_submit(ctx); + u8 *data = get_mg_gmem(ctx, mg); + + free_mg_gmem(ctx, mg); + return data; +} + +static inline u8 * get_bf16_matrix_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, const ml_t *ml, fmt_t mg_data_format) +{ + mg_shape_t s; + s.row = ml->shape.n; + s.col = ml->shape.col; + mg_t *mg = alloc_mg_bf16_gmem(ctx, s, mg_data_format); + + bmk1822_tdma_l2tg_matrix_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = ml; + p.dst = mg; + + bmk1822_tdma_l2g_bf16_matrix_copy(bmk, &p); + test_submit(ctx); + u8 *data = get_mg_bf16_gmem(ctx, mg); + + free_mg_gmem(ctx, mg); + return data; +} + +static inline u8 * get_bytes_l2g( + bmctx_t *ctx, bmk_ctx_t *bmk, u32 lmem_addr, u64 size) +{ + bmmem_device_t mem = bmmem_device_alloc_raw(*ctx, size); + u64 gmem_addr = bmmem_device_addr(mem); + + bmk1822_tdma_l2tg_general_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src_address = lmem_addr; + p.dst_base_reg_index = 0; + p.dst_address = gmem_addr; + p.bytes = size; + + bmk1822_tdma_l2g_general_copy(bmk, &p); + test_submit(ctx); + //u8 *data = get_bytes_gmem(ctx, gmem_addr, size); + u8 *data = get_bytes_gmem(ctx, mem, size); + + bmmem_device_free(*ctx, mem); + return data; +} + +/* + * tensor dump utility + * detail = 1, dump all tensor and indicate N and C number + * detail = 0, only dump 3 byte closing to begin and end point. + */ +static inline void dump_tensor(u8 src[], u32 n, u32 c, u32 h, u32 w, u8 detail) +{ + if (detail) { + for (u32 ni = 0; ni < n; ni++) { + for (u32 ci = 0; ci < c; ci++) { + for (u32 hi = 0; hi < h; hi++) { + for (u32 wi = 0; wi < w; wi++) { + u32 i = ni * c * h * w + ci * h * w + hi * w + wi; + printf("%4d ", src[i]); + + if (hi == 0 && wi == w-1) + printf("| <= C: %d ", ci); + + if (ci == 0 && hi == 0 && wi == w-1) + printf("@ <= N: %d ", ni); + } + printf("\n"); + } + } + } + } else { + u64 end = (n-1) * c * h * w + (c-1) * h * w + (h-1) * w + (w-1); + printf("["); + printf("%4d", src[0]); + printf("%4d", src[1]); + printf("%4d", src[2]); + printf(" ... 
"); + printf("%4d", src[end - 2]); + printf("%4d", src[end - 1]); + printf("%4d", src[end]); + printf(" ]\n"); + } +} + +static inline void saturate_to_int8(s32 *buf, u64 size, int res_sign) +{ + s32 max, min; + if (res_sign) { + max = 127; + min = -128; + } else { + max = 255; + min = 0; + } + + for (u64 i = 0; i < size; i++) { + if (buf[i] > max) + buf[i] = max; + else if (buf[i] < min) + buf[i] = min; + } +} + +static inline void saturate_to_int16(s32 *buf, u64 size, int res_sign) +{ + s32 max, min; + if (res_sign) { + max = 32767; + min = -32768; + } else { + max = 65535; + min = 0; + } + + for (u64 i = 0; i < size; i++) { + if (buf[i] > max) + buf[i] = max; + else if (buf[i] < min) + buf[i] = min; + } +} + +static inline void arith_right_shift( + s32 *buf, u64 size, int shift_bits, int round_up) +{ + if (shift_bits == 0) + return; + + for (u64 i = 0; i < size; i++) { + buf[i] >>= shift_bits - 1; + if (round_up) + buf[i] += 1; + buf[i] >>= 1; + } +} + +static inline void logic_right_shift( + s32 *buf, u64 size, int shift_bits, int round_up) +{ + if (shift_bits == 0) + return; + + for (u64 i = 0; i < size; i++) { + buf[i] = (u32)buf[i] >> (shift_bits - 1); + if (round_up) + buf[i] += 1; + buf[i] = (u32)buf[i] >> 1; + } +} + +#if 0 +/* + * \return closest large or equal divisor, -1 means no divisors >= \match_divisor + */ +static inline int get_all_divisors(std::vector *v, int n, int match_divisor) +{ + int match = -1; + for (int i=1; i<=sqrt(n); i++) + { + if (n%i==0) + { + if (n/i == i) // check if divisors are equal + printf("%d ", i); + else + { + printf("%d ", i); + + // push the second divisor in the vector + v->push_back(n/i); + } + } + } + + // The vector will be printed in reverse + for (int i=v->size()-1; i>=0; i--) { + int d = (*v)[i]; + if (match == -1 && d >= match_divisor && d != 1) { + match = d; + } + printf("%d ", d); + } + + return match; +} + +/* + * \return -1 means fail to reshape, 0 means success + */ +static inline int get_dup_shape(int in, int ic, int ih, int iw, int dilation_h, + bmk1822_tensor_lmem_shape_t* tl_shape, bmk1822_tensor_lmem_stride_t* tl_stride, + bmk1822_tensor_tgmem_shape_t* tg_shape, bmk1822_tensor_tgmem_stride_t* tg_stride, + fmt_t src_tg_fmt, fmt_t dst_tl_fmt + ) { + + // 1. reshape and extend c,h axis in order + int ret = 0; + int ch = ic * ih; + int c_h_gcd = std::__gcd(ch, 32); + if (c_h_gcd == 1) { + printf("cant reshape it\n"); + c_h_gcd = ic; + ret = -1; + } + + int oc = ch / c_h_gcd; + int oh = ch / oc; + + if (oh < dilation_h) { + // TODO: get property c h + std::vector all_divisors; + oh = get_all_divisors(&all_divisors, ch, dilation_h); + if (oh == -1) { + printf("cant reshape it with dilation_h %d\n", dilation_h); + ret = -1; + oh = ih; + } + oc = ch / oh; + } + + // 2 means 2 bytes + int src_tg_fmt_sz = src_tg_fmt == FMT_BF16 ? 2 : 1; + int dst_tl_fmt_sz = dst_tl_fmt == FMT_BF16 ? 
2 : 1; + + printf ("ic:ih is %d %d, oc:oh is %d:%d, c_h_gcd %d\n", ic, ih, oc, oh, c_h_gcd); + + assert(dilation_h * oc * iw <= oh * oc * iw); + tl_shape->n = tg_shape->n = in * 2; + tl_shape->c = tg_shape->c = oc; + tl_shape->h = tg_shape->h = oh; + tl_shape->w = tg_shape->w = iw; + + //tl_stride->n = tg_stride->n = iw * oh * oc; + + tl_stride->c = tg_stride->c = iw * oh; + tl_stride->h = tg_stride->h = iw; + + tg_stride->n = iw * (oh) * src_tg_fmt_sz; + tg_stride->c *= src_tg_fmt_sz; + tg_stride->h *= src_tg_fmt_sz; + + tl_stride->n = iw * oh * oc * dst_tl_fmt_sz; + tl_stride->c *= dst_tl_fmt_sz; + tl_stride->h *= dst_tl_fmt_sz; + tl_stride->w = dst_tl_fmt_sz; + + return ret; +} + +static inline void get_dup_first_channel_shape(int in, int ic, int ih, int iw, + bmk1822_tensor_lmem_shape_t* tl_shape, bmk1822_tensor_lmem_stride_t* tl_stride, + bmk1822_tensor_tgmem_shape_t* tg_shape, bmk1822_tensor_tgmem_stride_t* tg_stride, + fmt_t src_tg_fmt, fmt_t dst_tl_fmt + ) { + + // 2 means 2 bytes + int src_tg_fmt_sz = src_tg_fmt == FMT_BF16 ? 2 : 1; + int dst_tl_fmt_sz = dst_tl_fmt == FMT_BF16 ? 2 : 1; + + tl_shape->n = tg_shape->n = in; + tl_shape->c = tg_shape->c = ic; + tl_shape->h = tg_shape->h = ih; + tl_shape->w = tg_shape->w = iw; + + tl_stride->c = tg_stride->c = iw * ih; + tl_stride->h = tg_stride->h = iw; + + tg_stride->n = 0; + tg_stride->c = 0; + tg_stride->h *= src_tg_fmt_sz; + + tl_stride->n = iw * ih * ic * dst_tl_fmt_sz; + tl_stride->c *= dst_tl_fmt_sz; + tl_stride->h *= dst_tl_fmt_sz; + tl_stride->w = dst_tl_fmt_sz; +} +#endif +#endif /* INC_1822_TEST_UTIL_H */ diff --git a/cviruntime/test/1822/1822_vlc_random_gen_nn_data.h b/cviruntime/test/1822/1822_vlc_random_gen_nn_data.h new file mode 100644 index 000000000..7ccbcf71b --- /dev/null +++ b/cviruntime/test/1822/1822_vlc_random_gen_nn_data.h @@ -0,0 +1,91 @@ +/** + * copy from git@gitlab-ai.bitmain.vip:2290/wesley.teng/tpu_compress.git tpu_compress/test_vlc_compress.c + only include random_gen_nn_data relative function + */ + +#ifndef __BM_VLC_COMPRESS_RANDOM_GEN_NN_DATA_H__ +#define __BM_VLC_COMPRESS_RANDOM_GEN_NN_DATA_H__ +#include +#include +#ifdef __cplusplus +extern "C" +{ +#endif + +#include +#include +#include +#include +#include + +// --- contrain random test --- +double getGaussianRandomVar(double mean, double std) +{ + double PI = 3.1415926; + double u0 = (double)rand() / RAND_MAX; + double u1 = (double)rand() / RAND_MAX; + double n = sqrt(-2 * log(u0)) * cos(2 * PI * u1); + return n * std + mean; +} + +double getExpRandomVar(double lambda) +{ + double x = (double)rand() / RAND_MAX; + return log(1 - x) / (-lambda); +} + +void random_gen_nn_data(uint8_t *ibuf, size_t in_num, bool signedness, bool data_type, double zero_ratio) +{ + float *random_buf = (float *)malloc(in_num * sizeof(float)); + int zero_thr = (int)(100 * zero_ratio); + double lambda = getGaussianRandomVar(0, 0.5); + double mean = getGaussianRandomVar(0, 8); + bool pdf_sel = ((rand() % 10) < 9); // 9 over 10 choose exponential pdf + double max_v = 0; + double eps = 0.0001; + lambda += (lambda > 0) ? eps : -eps; + for (size_t i = 0; i < in_num; i++) + { + double val = (pdf_sel) ? getExpRandomVar(lambda) : getGaussianRandomVar(mean, lambda); + val = ((signedness || data_type) && rand() % 2) ? -val : val; + random_buf[i] = ((rand() % 100) < zero_thr) ? 0 : val; + max_v = (fabs(random_buf[i]) > max_v) ? fabs(random_buf[i]) : max_v; + } + + if (data_type == 0) // INT8 + { + double cali_decay = (signedness) ? 
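+    /* getGaussianRandomVar() above is the Box-Muller transform: for
+     * independent U0, U1 ~ Uniform(0,1),
+     *
+     *   sqrt(-2*ln(U0)) * cos(2*pi*U1) ~ N(0, 1),
+     *
+     * then scaled by std and shifted by mean. getExpRandomVar() is
+     * inverse-CDF sampling: -ln(1 - U)/lambda ~ Exp(lambda). */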
(rand() / (double)RAND_MAX) + 1 : 1; // weight dacay by calibration + uint8_t pruned_thr = (signedness && !data_type && (rand() % 2)) ? rand() % 12 : 0; + for (size_t i = 0; i < in_num; i++) + { + int val = (int)((random_buf[i] * 127) / (max_v * cali_decay)); + ibuf[i] = (abs(val) < pruned_thr) + ? 0 + : (val > 127) + ? 127 + : (val < (-128)) + ? -128 + : val; + } + } + else // BFloat16 + { + uint16_t *bf16_buf = (uint16_t *)random_buf; + for (size_t i = 0; i < in_num; i++) + { + short bf16_val = bf16_buf[(i << 1) + 1]; + // WARNING: set subnormal value to zero since HW do NOT support + int exp = ((bf16_val >> 7) & 0xFF); + bf16_val = (exp) ? bf16_val : 0; + + ibuf[i << 1] = (uint8_t)(bf16_val & 0xFF); + ibuf[(i << 1) + 1] = (uint8_t)(bf16_val >> 8); + } + } + free(random_buf); +} + #ifdef __cplusplus +} +#endif + +#endif /* __BM_VLC_COMPRESS_RANDOM_GEN_NN_DATA_H__ */ diff --git a/cviruntime/test/1822/bf16/1822_bf16_util.h b/cviruntime/test/1822/bf16/1822_bf16_util.h new file mode 100644 index 000000000..2f035bc28 --- /dev/null +++ b/cviruntime/test/1822/bf16/1822_bf16_util.h @@ -0,0 +1,60 @@ +#ifndef INC_1822_BF16_UTIL_H +#define INC_1822_BF16_UTIL_H + +#define RAND_SEED_MOD 10 +#define COMPARE_PASS 0 + +u16 corner_val[] = { + 0x0000, // 0 00000000 0000000 = zero + 0x8000, // 1 00000000 0000000 = −zero + 0x7f80, // 0 11111111 0000000 = infinity + 0xff80, // 1 11111111 0000000 = −infinity + 0x4049, // 0 10000000 1001001 = 3.140625 ≈ π ( pi ) + 0x3eab, // 0 01111101 0101011 = 0.333984375 ≈ 1/3 + 0xffc1, // x 11111111 1000001 => qNaN + 0xff81, // x 11111111 0000001 => sNaN + 0x00ff, // x 00000000 1111111 => denormal +}; + +u16 generate_bf16_corner_val(float val) +{ + if( rand()%RAND_SEED_MOD == 0 ) { + return corner_val[ rand() % (sizeof(corner_val)/sizeof(u16)) ]; + } else { + return convert_fp32_bf16(val); + } +} + +int compare_result( void *ref_x, void *result_x , fmt_t fmt, int stride_size) +{ + u8 *u8result_x = NULL; + u16 *u16result_x = NULL; + u8 *u8ref_x = NULL; + u16 *u16ref_x = NULL; + + if(fmt == FMT_BF16) { + u16result_x = (u16 *)result_x; + u16ref_x = (u16 *)ref_x; + for (int i = 0; i < stride_size; i++) { + if (u16result_x[i] != u16ref_x[i]) { + printf("compare failed at result_x[%d], got %d, exp %d\n", + i, u16result_x[i], u16ref_x[i]); + return -1; + } + } + } else { + u8result_x = (u8 *)result_x; + u8ref_x = (u8 *)ref_x; + for (int i = 0; i < stride_size; i++) { + if (u8result_x[i] != u8ref_x[i]) { + printf("compare failed at result_x[%d], got %d, exp %d\n", + i, u8result_x[i], u8ref_x[i]); + return -1; + } + } + } + + return 0; +} + +#endif /* INC_1822_BF16_UTIL_H */ diff --git a/cviruntime/test/1822/bf16/test_1822_bf16_avg_pooling.cpp b/cviruntime/test/1822/bf16/test_1822_bf16_avg_pooling.cpp new file mode 100644 index 000000000..2dc950f8f --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_bf16_avg_pooling.cpp @@ -0,0 +1,324 @@ +#include "../1822_test_util.h" + +#define INVALIDE_STRIDE (-1) +typedef bmk1822_tiu_average_pooling_param_t param_t; +int random_seed; +static void print_pooling_param(const param_t *p) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + + printf(" Pooling parameters:\n"); + printf(" random_seed : %d \n", random_seed); + printf(" ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw); + printf(" opd0_sign = %d\n", p->ifmap->fmt == FMT_I8); + printf(" weight = (%d, %d)\n", p->kh, p->kw); + printf(" padding = (%d, %d, %d, %d)\n", + p->pad_top, p->pad_bottom, p->pad_left, 
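+  /* ins0 is the input-dilation tuple: ins_h/ins_w zeros are inserted between
+   * adjacent rows/columns and ins_last_h/ins_last_w after the final row and
+   * column, so the effective input height is
+   *
+   *   ih_ext = (ih - 1)*(ins_h + 1) + ins_last_h + 1 + pad_top + pad_bottom
+   *
+   * (see pooling_ih_ext() below and calc_dilute_hw() in the reference). */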
p->pad_right); + printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w); + printf(" ins0 = (%d, %d, %d, %d)\n", + p->ins_h, p->ins_last_h, p->ins_w, p->ins_last_w); + printf(" avg_pooling_const = %d\n", p->avg_pooling_const); + printf(" rshift_bits = %d\n", p->rshift_bits); +} +static int index_get(int h, int w1, int w2) +{ + return h * w1 + w2; +} + +int native_pooling_avg_bf16( + const u16* i_fmap, + const void* weight, + const u32 *bias, + u16 * o_fmap, + int input_n, int input_c, int input_h, int input_w, + int kh, int kw, + int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, + int stride_h, int stride_w, + int ins_h, int ins_w, + int ins_h_last, int ins_w_last, + int const_weight) +{ + if (kh * kw <= 0) + return BM_ERR_INVALID_ARGUMENT; + + float *avg_pooling_mac_a = (float *)malloc(kh * kw * sizeof(float)); + float *avg_pooling_mac_b = (float *)malloc(kh * kw * sizeof(float)); + + u16 avg_const_weight = *(u16 *)weight; + const u16 *weight_arr = (u16*)weight; + + int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); + int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); + + int output_h = calc_output_hw(h_after, kh, stride_h); + int output_w = calc_output_hw(w_after, kw, stride_w); + u16 *i_fmap_pad = NULL; + for (int n = 0; n < input_n; n++) { + if (const_weight == 0) + weight_arr = (u16*)weight; + + for (int c = 0; c < input_c; ++c) { + fill_pad_fmap_bf16(i_fmap, &i_fmap_pad, convert_fp32_bf16(0), + pad_w_l, pad_w_r, pad_h_t, pad_h_b, + ins_h, ins_w, ins_h_last, ins_w_last, + input_h, input_w); + + for (int ph = 0; ph < output_h; ++ph) { + for (int pw = 0; pw < output_w; ++pw) { + int hstart = ph * stride_h; + int wstart = pw * stride_w; + int pool_index = index_get(ph, output_w, pw); + int mac_index = 0; + float avg_pool_result=0; + for (int h = 0; h < kh; h++) { + for (int w = 0; w < kw; w++) { + int index = index_get((hstart+h), w_after, (w+wstart)); + mac_index = index_get(h, kw, w); + float a = convert_bf16_fp32(i_fmap_pad[index]); + float b = const_weight ? 
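+          /* Average pooling is modeled as convolution with a kernel that is
+           * either per-position weights or one broadcast constant. With
+           * b = avg_pooling_const / (kh*kw) -- the pre-scaling done in
+           * compare_results() -- each output reduces to
+           *
+           *   out = avg_pooling_const * mean(window)
+           */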
+ convert_bf16_fp32(avg_const_weight) : convert_bf16_fp32(weight_arr[mac_index]); + + avg_pool_result += a*b; + } + } + + if(bias) { + avg_pool_result += convert_hex_fp32(bias[c]); + } + *(o_fmap+pool_index) = convert_fp32_bf16(avg_pool_result); + } + } + i_fmap += input_w * input_h; + if (const_weight == 0) + weight_arr += kh * kw; + + o_fmap += output_w * output_h; + } + } + free(i_fmap_pad); + + free(avg_pooling_mac_a); + free(avg_pooling_mac_b); + + return BM_SUCCESS; +} + +static u16 *alloc_input(param_t *p) +{ + u64 size = tl_shape_size(&p->ifmap->shape); + u16 *data = (u16 *)xmalloc(size*2); + if (!data) + return NULL; + + for (u64 i = 0; i < size; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX/2; //100 ~ -100 + val = (float)(rand()-RAND_MAX2)*1000 / (float)RAND_MAX; + data[i] = convert_fp32_bf16(val);//rand() % 256 - 128; + } + return data; +} + +static u16 *alloc_output(param_t *p) +{ + u64 size = tl_shape_size(&p->ofmap->shape); + return (u16 *)xmalloc(size*2); +} + +static int pooling_ih_ext(param_t *p, int ih) +{ + int ins = p->ins_h; + int ins_last = p->ins_last_h; + int pad = p->pad_top + p->pad_bottom; + return (ih - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_iw_ext(param_t *p, int iw) +{ + int ins = p->ins_w; + int ins_last = p->ins_last_w; + int pad = p->pad_left + p->pad_right; + return (iw - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_oh(param_t *p, int ih) +{ + int ih_ext = pooling_ih_ext(p, ih); + return (ih_ext - p->kh) / p->stride_h + 1; +} + +static int pooling_ow(param_t *p, int iw) +{ + int iw_ext = pooling_iw_ext(p, iw); + return (iw_ext - p->kw) / p->stride_w + 1; +} + +static void free_pooling_param( + bmk_ctx_t *ctx, + param_t *p) +{ + if (p->ifmap) + free_tl(ctx, p->ifmap); + if (p->ofmap) + free_tl(ctx, p->ofmap); +} + +static param_t random_pooling_param(bmk_ctx_t *ctx, int stride_w, int stride_h) +{ + param_t p; + memset(&p, 0, sizeof(p)); + +retry: + random_seed = clock(); + srand(random_seed); + + int in = rand() % 5 + 1; + int ic = rand() % (3 * BM1822_HW_NPU_NUM) + 1; + int ih = rand() % 30 + 3 + (INVALIDE_STRIDE == stride_h ? 0 : stride_h); + int iw = rand() % 30 + 6 + (INVALIDE_STRIDE == stride_w ? 0 : stride_w); + + p.kh = rand() % 7 + 1; + p.kw = rand() % 7 + 1; + p.stride_h = INVALIDE_STRIDE == stride_h ? rand() % (p.kh) + 1 : stride_h; + p.stride_w = INVALIDE_STRIDE == stride_w ? 
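+  /* Output extent follows pooling_oh/pooling_ow above:
+   *
+   *   oh = (ih_ext - kh) / stride_h + 1
+   *
+   * Example: ih = 5, no insertion, pad_top = pad_bottom = 1 gives
+   * ih_ext = 4*1 + 0 + 1 + 2 = 7; with kh = 3, stride_h = 2:
+   * oh = (7 - 3)/2 + 1 = 3. */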
rand() % (p.kh) + 1 : stride_w; + p.ins_h = rand() % p.kh; + p.ins_w = rand() % p.kw; + p.ins_last_h = rand() % p.kh; + p.ins_last_w = rand() % p.kw; + p.pad_top = rand() % p.kh; + p.pad_bottom = rand() % p.kh; + p.pad_left = rand() % p.kw; + p.pad_right= rand() % p.kw; + p.rshift_bits = rand() % 32; + p.avg_pooling_const = convert_fp32_bf16(rand()%0x1000);//rand() % 256; + tl_shape_t ifmap_shape; + ifmap_shape.n = in; + ifmap_shape.c = ic; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + + int on = in; + int oc = ic; + int oh = pooling_oh(&p, ih); + int ow = pooling_ow(&p, iw); + tl_shape_t ofmap_shape; + ofmap_shape.n = on; + ofmap_shape.c = oc; + ofmap_shape.h = oh; + ofmap_shape.w = ow; + + p.ofmap = bmk1822_lmem_alloc_tensor(ctx, ofmap_shape, FMT_BF16, 1); + p.ifmap = bmk1822_lmem_alloc_tensor(ctx, ifmap_shape, FMT_BF16, 1); + + if ((p.kh > pooling_ih_ext(&p, ih)) + || (p.kw > pooling_iw_ext(&p, iw)) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || !p.ofmap + || !p.ifmap) { + printf("retry init_pooling_param\n"); + free_pooling_param(ctx, &p); + goto retry; + } + + return p; +} + +static void compare_results( + param_t *p, + u16 input[], + u16 output[]) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + + u16 *output_ref = alloc_output(p); + p->avg_pooling_const = convert_fp32_bf16(convert_bf16_fp32(p->avg_pooling_const)/(p->kh * p->kw)); + bmerr_t ret = native_pooling_avg_bf16( + input, &p->avg_pooling_const, NULL, output_ref, + in, ic, ih, iw, p->kh, p->kw, + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right, + p->stride_h, p->stride_w, + p->ins_h, p->ins_w, p->ins_last_h, p->ins_last_w,1 + ); + assert(ret == BM_SUCCESS); + int cmp_res = array_cmp_int8( + "Comparing results ...\n", (s8*)output_ref, (s8*) output, + tl_shape_size(&p->ofmap->shape)*2); + + if (cmp_res != 0) { + printf("Comparison FAILED!!!\n"); + print_pooling_param(p); + exit(-1); + } + + free(output_ref); +} + +static int _test_pooling(bmctx_t ctx, bmk_ctx_t *bk_ctx, int stride_w, int stride_h) +{ + param_t p = random_pooling_param(bk_ctx, stride_w, stride_h); +// print_pooling_param(&p); + + u16 *input = alloc_input(&p); + + put_bf16_tensor_g2l(&ctx, bk_ctx, p.ifmap, (u16 *)input, FMT_BF16); + bmk1822_tiu_average_pooling(bk_ctx, &p); + u16 *output = (u16 *)get_bf16_tensor_l2g(&ctx, bk_ctx, p.ofmap, FMT_BF16); + + compare_results(&p, input, output); + + free_pooling_param(bk_ctx, &p); + free(output); + free(input); + + return 1; +} + + +static int test_pooling(bmctx_t ctx, bmk_ctx_t *bk_ctx) { + return _test_pooling(ctx, bk_ctx, INVALIDE_STRIDE, INVALIDE_STRIDE); +} + +static void test_avg_pooling(bmctx_t *ctx, bmk_ctx_t *bk_ctx) +{ + int test_finished_num = 0; + for (u64 i = 0; i < 20; i++) + test_finished_num += test_pooling(*ctx, bk_ctx); + + // test stride extend (0, 31] + int stride_list[] = {15, 16, 31}; + int stride_list_len = sizeof(stride_list) / sizeof(stride_list[0]); + + for (int stride_w_idx = 0; stride_w_idx < stride_list_len; stride_w_idx++) { + for (int stride_h_idx = 0; stride_h_idx < stride_list_len; stride_h_idx++) { + int stride_w = stride_list[stride_w_idx]; + int stride_h = stride_list[stride_h_idx]; + + test_finished_num += _test_pooling(*ctx, bk_ctx, stride_w, stride_h); + } + } + + printf("Test finished %d\n", test_finished_num); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + int round_mode; + round_mode 
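+  /* set_store_feround()/restore_feround() bracket the run: presumably the
+   * helper saves the current FP rounding mode and installs the one the
+   * fp32<->bf16 conversion helpers assume, so the reference and the hardware
+   * round identically. */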
= set_store_feround(); + test_avg_pooling(&ctx, bk_ctx); + restore_feround(round_mode); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_bf16_conv.cpp b/cviruntime/test/1822/bf16/test_1822_bf16_conv.cpp new file mode 100644 index 000000000..d3a847efb --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_bf16_conv.cpp @@ -0,0 +1,715 @@ +#include "../1822_test_util.h" +//#include +//#undef printf +//#define printf(...) {} + +#define INVALIDE_STRIDE (-1) +typedef struct { + int random_seed; + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int stride_w; + int output_c; + int using_bias; + int bReLU_EN; + int r_shift_m; + int opd0_sign; + int opd1_sign; + int opd2_sign; + int bf16_enable; +} conv_param_t; + +static void print_conv_param(const conv_param_t *p); + +static inline void bf16_relu(float *buf, u64 size) +{ + for (u64 i = 0; i < size; i++) + if (buf[i] < 0) + buf[i] = 0; +} + +static int conv_ref( + const conv_param_t *p_param, + const u16 *ifmap, + const u16 *weight, + const u32 *bias, + u16 *ofmap) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int do_relu = p_param->bReLU_EN; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + float *result = (float *)malloc(sizeof(float) * in * oc * oh * ow); + if (!result) + return BM_ERR_FAILURE; + + memset(result, 0, sizeof(float) * in * oc * oh * ow); + int ret = BM_SUCCESS; + + + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + u16 *i_fmap_pad[ic]; + u16 *kernel_pad[ic]; + + for (int iic = 0; iic < ic; ++iic) { + i_fmap_pad[iic] = NULL; + kernel_pad[iic] = NULL; + fill_pad_fmap_bf16( + (ifmap + n*ic*ih*iw + iic*ih*iw), &i_fmap_pad[iic], convert_fp32_bf16(0), + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + //kernel_dilation( + fill_pad_fmap_bf16( + (weight + c*ic*kh*kw + iic*kh*kw), &kernel_pad[iic], convert_fp32_bf16(0), + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + } + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + float result_val = result[n*oc*oh*ow + c*oh*ow + ph*ow + pw]; + for (int idxh = 0; idxh < kh_ext; ++idxh) { + for (int idxw = 0; idxw < kw_ext; ++idxw) { + for (int iic = 0; iic < ic; ++iic){ + float ifv = convert_bf16_fp32(i_fmap_pad[iic][(idxh+ph*stride_h) * iw_ext + idxw + pw*stride_w]); + float ikv = convert_bf16_fp32(kernel_pad[iic][idxh* kw_ext + idxw]); + result_val += ifv*ikv; + } + } + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] = result_val; 
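+          /* The reference accumulates in fp32: each bf16 operand is widened
+           * per multiply-accumulate and only the finished sum is rounded
+           * back to bf16 when the ofmap is stored, i.e. one rounding per
+           * output element:
+           *
+           *   out = fp32_to_bf16( sum_i fp32(a_i)*fp32(w_i) [+ bias] )
+           */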
+        }
+      }
+
+      if (p_param->using_bias) {
+        for (int ph = 0; ph < oh; ++ph) {
+          for (int pw = 0; pw < ow; ++pw) {
+            result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += convert_hex_fp32(bias[c]); //bias+c ;
+          }
+        }
+      }
+
+      if (do_relu)
+        bf16_relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow);
+
+      for (int i = 0; i < ic; i++) {
+        free(i_fmap_pad[i]);
+        free(kernel_pad[i]);
+      }
+    }
+  }
+
+  for (int i = 0; i < in * oc * oh * ow; i++)
+    ofmap[i] = convert_fp32_bf16(result[i]);
+
+  free(result);
+  return ret;
+}
+
+static u16 * transform_weight(const tl_shape_t *s, u16 before[])
+{
+  u32 ic = s->n;
+  u32 oc = s->c;
+  u32 kh = s->h;
+  u32 kw = s->w;
+
+  u32 size = ic * oc * kh * kw;
+  u16 *after = (u16 *)malloc(sizeof(u16) * size);
+
+  /*
+   * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic)
+   */
+  for (u32 oci = 0; oci < oc; oci++) {
+    for (u32 ici = 0; ici < ic; ici++) {
+      for (u32 khi = 0; khi < kh; khi++) {
+        for (u32 kwi = 0; kwi < kw; kwi++) {
+          u32 src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi;
+          u32 dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici;
+          after[dst_i] = before[src_i];
+        }
+      }
+    }
+  }
+
+  return after;
+}
+
+static void put_conv_weight(
+    bmctx_t *ctx,
+    bmk_ctx_t *bk_ctx,
+    const tl_t *tl,
+    u16 *data)
+{
+  const tl_shape_t *s = &tl->shape;
+  u32 ic = s->n;
+  u32 oc = s->c;
+  u32 kh = s->h;
+  u32 kw = s->w;
+
+  bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw*2);
+  bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms));
+  gaddr_t gaddr = bmmem_device_addr(dev_mem);
+  u16 *transformed_data = transform_weight(s, data);
+
+  /* The weight goes to region 1. bm_memcpy_s2d regards dev_mem as an
+   * absolute address, so an absolute address would be needed to copy the
+   * weight to the right place.
+   */
+
+  //u64 ab_addr = bm_device_read_base_reg(*ctx, 1);
+  //bmmem_device_t ab_dev_mem =
+  //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms);
+
+  //int ret = bm_memcpy_s2d(*ctx, ab_dev_mem, (u8*)transformed_data);
+  int ret = bm_memcpy_s2d(*ctx, dev_mem, (u8*)transformed_data);
+
+  assert(ret == BM_SUCCESS);
+  tl_shape_t tdma_shape = { 1, oc, kh * kw, ic };
+
+  tg_t tdma_tg;
+  tdma_tg.base_reg_index = 0;
+  tdma_tg.start_address = gaddr;
+  tdma_tg.fmt = FMT_BF16;
+  tdma_tg.shape.n = tdma_shape.n;
+  tdma_tg.shape.c = tdma_shape.c;
+  tdma_tg.shape.h = tdma_shape.h;
+  tdma_tg.shape.w = tdma_shape.w;
+  tdma_tg.stride = bmk1822_tensor_tgmem_default_stride(tdma_tg.shape, FMT_BF16);
+  tdma_tg.base_reg_index = 1;
+
+  tl_t tdma_tl = *tl;
+  tdma_tl.shape = tdma_shape;
+  tdma_tl.stride = bmk1822_tensor_lmem_default_stride(bk_ctx, tdma_shape, FMT_BF16, 0);
+
+  bmk1822_tdma_tg2l_tensor_copy_param_t p;
+  memset(&p, 0, sizeof(p));
+  p.src = &tdma_tg;
+  p.dst = &tdma_tl;
+
+  bmk1822_tdma_g2l_bf16_tensor_copy(bk_ctx, &p);
+  test_submit(ctx);
+  bmmem_device_free(*ctx, dev_mem);
+
+  free(transformed_data);
+}
+
+static u16 * transform_bias(int oc, u32 before[])
+{
+  u16 *after = (u16 *)malloc(sizeof(u16) * 2 * oc);
+  if (!after)
+    return NULL;
+
+  for (int i = 0; i < oc; i++){
+    after[i] = (before[i] >> 16) & 0xffff;
+    after[i + oc] = before[i] & 0xffff;
+  }
+  return after;
+}
+
+static void put_conv_bias(
+    bmctx_t *ctx,
+    bmk_ctx_t *bk_ctx,
+    const tl_t *tl,
+    u32 *data)
+{
+  int oc = tl->shape.c;
+
+  bmshape_t bms = BM_TENSOR_INT8(2 * sizeof(short), oc, 1, 1);
+  bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms));
+  gaddr_t gaddr = bmmem_device_addr(dev_mem);
+  u16 *transformed_data = transform_bias(oc, data);
+  int ret = bm_memcpy_s2d(*ctx, dev_mem, (u8 *)transformed_data);
+  free(transformed_data);
+  assert(ret == BM_SUCCESS);
+
+  tg_t tg;
+  tg.base_reg_index = 0;
+  tg.start_address = gaddr;
+  tg.fmt = FMT_BF16;
+  tg.shape.n = 2;
+  tg.shape.c = oc;
+  tg.shape.h = 1;
+  tg.shape.w = 1;
+  tg.stride = 
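+  /* The 32-bit bias is the raw fp32 bit pattern (convert_fp32_hex); the
+   * transform_bias() step above splits it into two 16-bit planes laid out as
+   * an (n=2, c=oc, 1, 1) tensor:
+   *
+   *   plane 0: bits 31..16 (sign/exponent/high mantissa, i.e. the bf16 part)
+   *   plane 1: bits 15..0  (low mantissa)
+   */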
bmk1822_tensor_tgmem_default_stride(tg.shape, FMT_BF16); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, dev_mem); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static u16 * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + u16 *buf = (u16 *)malloc(sizeof(u16) * size); + for (int i = 0; i < size; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX/2; //5 ~ -5 + val = (float)(rand()-RAND_MAX2)*5 / (float)RAND_MAX; + buf[i] = convert_fp32_bf16(val); + } + return buf; +} + +static u16 * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + u16 *buf = (u16 *)malloc(sizeof(u16) * size); + for (int i = 0; i < size; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX/2; // 5 ~ -5 + val = (float)(rand()-RAND_MAX2)*5 / (float)RAND_MAX; + buf[i] = convert_fp32_bf16(val); + } + + return buf; +} + +static u32 * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + u32 *bias = (u32 *)malloc(sizeof(u32) * oc); + for (int i = 0; i < oc; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX/2; // 5 ~ -5 + val = (float)(rand()-RAND_MAX2)*5 / (float)RAND_MAX; + bias[i] = convert_fp32_hex(val); + } + + return bias; +} + +static tl_t * conv_ifmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = FMT_BF16;//p->opd0_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return bmk1822_lmem_alloc_tensor(ctx, s, fmt, 1); +} + +static tl_t * conv_weight_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = FMT_BF16;//p->opd1_sign? 
FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + + return bmk1822_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static tl_t * conv_ofmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + tl_shape_t s; + fmt_t fmt = FMT_BF16; + s.n = p->input_n; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return bmk1822_lmem_alloc_tensor(ctx, s, fmt, 1); +} + +static tl_t * conv_bias_tensor( + bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = FMT_BF16; + tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + + return bmk1822_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const bmk1822_tiu_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1822_tiu_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + + dst->ifmap = conv_ifmap_tensor(ctx, p); + dst->weight = conv_weight_tensor(ctx, p); + dst->ofmap = conv_ofmap_tensor(ctx, p); + dst->bias = NULL; + dst->ps32_mode = 0; + if (p->using_bias) + dst->bias = conv_bias_tensor(ctx, p); + dst->w_is_const = 0; +} + +static void free_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1822_tiu_convolution_param_t *r, + const conv_param_t *p) +{ + if (p->using_bias && r->bias) + free_tl(ctx, r->bias); + if (r->ofmap) + free_tl(ctx, r->ofmap); + if (r->weight) + free_tl(ctx, r->weight); + if (r->ifmap) + free_tl(ctx, r->ifmap); +} + +static void _init_conv_param(conv_param_t &p, int stride_w, int stride_h) +{ + printf("init_conv_param\n"); + memset(&p, 0, sizeof(p)); + p.random_seed = clock(); + srand(p.random_seed); + +retry: + + p.input_n = rand() % 5 + 1; + p.input_c = rand() % (5 * 32) + 1; + p.kh = rand() % 7 + 1; + p.kw = rand() % 7 + 1; + p.input_h = rand() % 40 + p.kh + (INVALIDE_STRIDE == stride_h ? 0 : stride_h); + p.input_w = rand() % 40 + p.kw + (INVALIDE_STRIDE == stride_w ? 0 : stride_w); + p.output_c = rand() % 10 + 3; + p.stride_h = INVALIDE_STRIDE == stride_h ? rand() % (p.kh) + 1 : stride_h; + p.stride_w = INVALIDE_STRIDE == stride_w ? 
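+  /* The surrounding sampling keeps retrying until conv_param_is_ok() accepts
+   * the set: every pad must satisfy pad < (1 << 4) -- presumably a 4-bit
+   * descriptor field -- and the dilated kernel, kh_ext = (kh - 1)*dh + 1,
+   * must fit inside the extended input (conv_ih_ext/conv_iw_ext). */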
rand() % (p.kh) + 1 : stride_w; + p.ins_h = rand() % p.kh; + p.ins_w = rand() % p.kw; + p.ins_h_last = rand() % p.kh;; + p.ins_w_last = rand() % p.kw;; + p.dh = rand() % 3 + 1; + p.dw = rand() % 3 + 1; + int kh_ext = conv_kh_ext(&p); + int kw_ext = conv_kw_ext(&p); + p.pad_top = rand() % kh_ext; + p.pad_bot = rand() % kh_ext; + p.pad_left = rand() % kw_ext; + p.pad_right = rand() % kw_ext; + + if (!conv_param_is_ok(&p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p.using_bias = rand() % 2; + p.bReLU_EN = rand() % 2; + + p.opd0_sign = rand() % 2; + p.opd1_sign = 1; + p.opd2_sign = 1; + + assert(p.opd1_sign == 1 && p.opd2_sign == 1); + + int ih_ext = conv_ih_ext(&p); + int iw_ext = conv_iw_ext(&p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); +} + +static void init_conv_param(conv_param_t &p) { + _init_conv_param(p, INVALIDE_STRIDE, INVALIDE_STRIDE); +} + +#if 1 +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", "p->input_w = ", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} +#endif + +static int test_conv( + conv_param_t &p_param, bmctx_t ctx, bmk_ctx_t *bk_ctx) +{ + u16 *input = alloc_input(&p_param); + u16 *weight = alloc_weight(&p_param); + u32 *bias = alloc_bias(&p_param); + + //print_conv_param(&p_param); + + u16 *output_ref = (u16 *)malloc(sizeof(u16) * conv_output_size(&p_param)); + if (!output_ref) + return BM_ERR_FAILURE; + + bmerr_t ret = conv_ref(&p_param, input, weight, bias, output_ref); + + assert(ret == BM_SUCCESS); + + bmk1822_tiu_convolution_param_t conv_param; + make_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + if (tl_alloc_success) { + put_bf16_tensor_g2l(&ctx, bk_ctx, conv_param.ifmap, (u16 *)input, FMT_BF16); + put_conv_weight(&ctx, bk_ctx, conv_param.weight, (u16 *)weight); + if (p_param.using_bias) + put_conv_bias(&ctx, bk_ctx, conv_param.bias, bias); + 
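+  /* bf16 results are compared bit-exactly: array_cmp_int8() treats the u16
+   * output as raw bytes (hence conv_output_size(...) * 2), so even a one-bit
+   * mismatch between the fp32 reference and the TIU result fails the test. */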
bmk1822_tiu_convolution(bk_ctx, &conv_param); + u16 *output = (u16 *) get_bf16_tensor_l2g(&ctx, bk_ctx, conv_param.ofmap, FMT_BF16 ); + + int has_error = array_cmp_int8( + "Comparing results ...\n", + (s8*)output_ref, (s8*)output, conv_output_size(&p_param)*2); + + if (has_error) { + print_conv_param(&p_param); + printf("Comparison FAILED\n"); + exit(-1); + } + free(output); + } + + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + free(input); + free(weight); + free(output_ref); + free(bias); + return tl_alloc_success ? 1 : 0; +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + int round_mode; + round_mode = set_store_feround(); + int test_finished_num = 0; + + for (int i = 0; i < 20; i++) { + printf("random_test_conv iteration: %d\n", i); + conv_param_t test_conv_param; + init_conv_param(test_conv_param); + + test_finished_num += test_conv(test_conv_param, ctx, bk_ctx); + + if (!test_conv_param.using_bias) + test_conv_param.using_bias = 1; + + if (test_conv_param.output_c <= 32) + { + test_conv_param.output_c += 32; + } + test_finished_num += test_conv(test_conv_param, ctx, bk_ctx); + } + + // test stride extend (0, 31] + int stride_list[] = {15, 16, 31}; + int stride_list_len = sizeof(stride_list) / sizeof(stride_list[0]); + + for (int stride_w_idx = 0; stride_w_idx < stride_list_len; stride_w_idx++) { + for (int stride_h_idx = 0; stride_h_idx < stride_list_len; stride_h_idx++) { + int stride_w = stride_list[stride_w_idx]; + int stride_h = stride_list[stride_h_idx]; + conv_param_t test_conv_param; + _init_conv_param(test_conv_param, stride_w, stride_h); + + test_finished_num += test_conv(test_conv_param, ctx, bk_ctx); + } + } + + restore_feround(round_mode); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_bf16_conv_ps32.cpp b/cviruntime/test/1822/bf16/test_1822_bf16_conv_ps32.cpp new file mode 100644 index 000000000..c4fa6ca4b --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_bf16_conv_ps32.cpp @@ -0,0 +1,1095 @@ +#include "../1822_test_util.h" +typedef struct { + int random_seed; + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int stride_w; + int output_c; + int using_bias; + int bReLU_EN; + int r_shift_m; + int opd0_sign; + int opd1_sign; + int opd2_sign; + int bf16_enable; +} conv_param_t; + +static inline void bf16_relu(float *buf, u64 size) +{ + for (u64 i = 0; i < size; i++) + if (buf[i] < 0) + buf[i] = 0; +} + +static int ps32_conv_ref( + const conv_param_t *p_param, + const u16 *ifmap, + const u16 *weight, + const u32 *bias, + u16 *ofmap, int ps32_mode) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = 
calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + float *result = (float *)malloc(sizeof(float) * in * oc * oh * ow); + if (!result) + return BM_ERR_FAILURE; + + u32 bstride = in * oc * oh * ow; + int ret = BM_SUCCESS; + + if (ps32_mode == 2 || ps32_mode == 0) + memset(result, 0, sizeof(float) * in * oc * oh * ow); + else { + for (int i = 0; i < in * oc * oh * ow; i++) { + result[i] = convert_hex_fp32((ofmap[i + bstride * 0] << 16) | ofmap[i + bstride * 1]); + } + } + + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + u16 *i_fmap_pad[ic]; + u16 *kernel_pad[ic]; + + for (int iic = 0; iic < ic; ++iic) { + i_fmap_pad[iic] = NULL; + kernel_pad[iic] = NULL; + fill_pad_fmap_bf16( + (ifmap + n*ic*ih*iw + iic*ih*iw), &i_fmap_pad[iic], convert_fp32_bf16(0), + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + //kernel_dilation( + fill_pad_fmap_bf16( + (weight + c*ic*kh*kw + iic*kh*kw), &kernel_pad[iic], convert_fp32_bf16(0), + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + } + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + float result_val= result[n*oc*oh*ow + c*oh*ow + ph*ow + pw]; + for (int idxh = 0; idxh < kh_ext; ++idxh) { + for (int idxw = 0; idxw < kw_ext; ++idxw) { + for (int iic = 0; iic < ic; ++iic){ + float ifv = convert_bf16_fp32(i_fmap_pad[iic][(idxh+ph*stride_h) * iw_ext + idxw + pw*stride_w]); + float ikv = convert_bf16_fp32(kernel_pad[iic][idxh* kw_ext + idxw]); + result_val += ifv*ikv; + } + } + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] = result_val; + } + } + for(int i = 0; i < ic; i++) { + free(i_fmap_pad[i]); + free(kernel_pad[i]); + } + } //end for (int c = 0; c < oc; ++c) + } + + if( ps32_mode & 0x2) { + for (int i = 0; i < in * oc * oh * ow; i ++) { + ofmap[i] = convert_fp32_hex(result[i]) >> 16; + ofmap[bstride + i] = convert_fp32_hex(result[i]) & 0xFFFF; + } + } else { + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += convert_hex_fp32(bias[c]); //bias+c ; + } + } + } + if (p_param->bReLU_EN) + bf16_relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + } + } + for (int i = 0; i < in * oc * oh * ow; i++) { + ofmap[i] = convert_fp32_bf16(result[i]); + } + } + free(result); + return ret; +} + +static u16 * transform_weight(const tl_shape_t *s, u16 before[]) +{ + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + u32 size = ic * oc * kh * kw; + u16 *after = (u16 *)malloc(sizeof(u16) * size); + + /* + * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) + */ + for (u32 oci = 0; oci < oc; oci++) { + for (u32 ici = 0; ici < ic; ici++) { + for (u32 khi = 0; khi < kh; khi++) { + for (u32 kwi = 0; kwi < kw; kwi++) { + u32 src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + u32 dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + u16 *data) +{ + const tl_shape_t *s = &tl->shape; + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw*2); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + 
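+  // The bmshape above is declared as INT8, so kw is doubled to cover the
+  // 2-byte bf16 elements when sizing the device buffer.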
gaddr_t gaddr = bmmem_device_addr(dev_mem); + u16 *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. bm_memcpy_s2d regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. + */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //bmmem_device_t ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = bm_memcpy_s2d(*ctx, ab_dev_mem, (u8*)transformed_data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (u8*)transformed_data); + assert(ret == BM_SUCCESS); + tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_BF16; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1822_tensor_tgmem_default_stride(tdma_tg.shape, FMT_BF16); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1822_tensor_lmem_default_stride(bk_ctx, tdma_shape, FMT_BF16, 0); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1822_tdma_g2l_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + bmmem_device_free(*ctx, dev_mem); + free(transformed_data); +} + +static u16 * transform_bias(int oc, u32 before[]) +{ + u16 *after = (u16 *)malloc(sizeof(u16) * 2 * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = (before[i] >> 16) & 0xFFFF; + after[i + oc] = before[i] & 0xFFFF; + } + return after; +} + +static void put_conv_bias( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + u32 *data) +{ + + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2 * sizeof(short), oc, 1, 1); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + u16 *transformed_data = transform_bias(oc, data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (u8 *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_BF16; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1822_tensor_tgmem_default_stride(tg.shape, FMT_BF16); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, dev_mem); + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = 
p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static u16 * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + u16 *buf = (u16 *)malloc(sizeof(u16) * size); + for (int i = 0; i < size; i++) { + buf[i] = convert_fp32_bf16(i); + } + + return buf; +} + +static u16 * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + u16 *buf = (u16 *)malloc(sizeof(u16) * size); + for (int i = 0; i < size; i++) { + buf[i] = convert_fp32_bf16(i); + } + + return buf; +} + +static u32 * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + u32 *bias = (u32 *)malloc(sizeof(u32) * oc); + float val = 100; + for (int i = 0; i < oc; i++) { + bias[i] = convert_fp32_hex(val); + val += 1; + } + return bias; +} + +static tl_t * conv_ifmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = FMT_BF16; + tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return bmk1822_lmem_alloc_tensor(ctx, s, fmt, 1); +} + +static u32 conv_ifmap_tensor_size(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = FMT_BF16; + tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return bmk1822_lmem_tensor_to_size(ctx, s, fmt, 1); +} + +static tl_t * conv_weight_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = FMT_BF16; //p->opd1_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return bmk1822_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static u32 conv_weight_tensor_to_size(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = FMT_BF16; //p->opd1_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return bmk1822_lmem_tensor_to_size(ctx, s, fmt, 0); +} + +static tl_t * conv_ofmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + tl_shape_t s; + s.n = p->input_n; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return bmk1822_lmem_alloc_ps32_tensor(ctx, s, FMT_BF16, 1); +} + +static u32 conv_ofmap_tensor_to_size(bmk_ctx_t *ctx, const conv_param_t *p) +{ + tl_shape_t s; + s.n = p->input_n * sizeof(u32) / sizeof(u8); + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return bmk1822_lmem_tensor_to_size(ctx, s, FMT_BF16, 1); +} + +static tl_t * conv_bias_tensor( + bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = FMT_BF16;//p->opd2_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return bmk1822_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static u32 conv_bias_tensor_size( + bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = FMT_BF16;//p->opd2_sign? 
FMT_I8: FMT_U8; + tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return bmk1822_lmem_tensor_to_size(ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const bmk1822_tiu_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + if(p->ps32_mode==1) + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param_ps32( + bmk_ctx_t *ctx, + bmk1822_tiu_convolution_param_t *dst, + const conv_param_t *p, u32 ps32_mode) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + if(ps32_mode==2) + { + u32 ifmap_size = conv_ifmap_tensor_size(ctx, p); + u32 weight_size = conv_weight_tensor_to_size(ctx, p); + u32 ofmap_size = conv_ofmap_tensor_to_size(ctx, p); + u32 bias_size = p->using_bias ? conv_bias_tensor_size(ctx, p) : 0; + u32 total_size = ifmap_size + weight_size + ofmap_size + bias_size; + + // Allocation if size fit. + // Assertion check in bmk1822_lmem_alloc_ps32_tensor(). + bmk1822_chip_info_t chip_info = bmk1822_chip_info(); + if (total_size <= chip_info.lmem_size) { + dst->ifmap = conv_ifmap_tensor(ctx, p); + dst->weight = conv_weight_tensor(ctx, p); + dst->ofmap = conv_ofmap_tensor(ctx, p); + } else { + dst->ifmap = nullptr; + dst->weight = nullptr; + dst->ofmap = nullptr; + } + } + + dst->ps32_mode = ps32_mode; + + dst->bias = NULL; + dst->relu_enable = 0; + dst->rshift_bits = 0; + if(ps32_mode==1) + { + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + // only mode=1 can use bias + if (p->using_bias) + dst->bias = conv_bias_tensor(ctx, p); + } +} + +static void make_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1822_tiu_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(*dst)); + + u32 ifmap_size = conv_ifmap_tensor_size(ctx, p); + u32 weight_size = conv_weight_tensor_to_size(ctx, p); + u32 ofmap_size = conv_ofmap_tensor_to_size(ctx, p); + u32 bias_size = p->using_bias ? conv_bias_tensor_size(ctx, p) : 0; + u32 total_size = ifmap_size + weight_size + ofmap_size + bias_size; + + // Allocation if size fit. + // Assertion check in bmk1822_lmem_alloc_ps32_tensor(). 
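+  // Rough LMEM budget check: the bf16 working set (ifmap + weight + ps32
+  // ofmap + optional bias) must fit in local memory (chip_info.lmem_size
+  // below), otherwise the tensors stay unallocated and the case is skipped.
+  // Note the ps32 ofmap size is queried with n scaled by sizeof(u32),
+  // presumably reserving room for the 32-bit partial sums.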
+ bmk1822_chip_info_t chip_info = bmk1822_chip_info(); + if (total_size <= chip_info.lmem_size) { + dst->ifmap = conv_ifmap_tensor(ctx, p); + dst->weight = conv_weight_tensor(ctx, p); + dst->ofmap = conv_ofmap_tensor(ctx, p); + + if (p->using_bias) + dst->bias = conv_bias_tensor(ctx, p); + } else { + dst->ifmap = nullptr; + dst->weight = nullptr; + dst->ofmap = nullptr; + } + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + // dst->ifmap = conv_ifmap_tensor(ctx, p); + // dst->weight = conv_weight_tensor(ctx, p); + // dst->ofmap = conv_ofmap_tensor(ctx, p); + // dst->bias = NULL; + dst->ps32_mode = 0; + // if (p->using_bias) + // dst->bias = conv_bias_tensor(ctx, p); +} + +static void free_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1822_tiu_convolution_param_t *r, + const conv_param_t *p) +{ + if (p->using_bias && r->bias) + free_tl(ctx, r->bias); + + if (r->ofmap) + free_tl(ctx, r->ofmap); + + if (r->weight) + free_tl(ctx, r->weight); + + if (r->ifmap) + free_tl(ctx, r->ifmap); +} + +static void init_conv_param(conv_param_t &p) +{ + printf("init_conv_param\n"); + p.random_seed = clock(); + srand(p.random_seed); + +retry: + + p.input_n = 1; + p.input_c = rand() % (10) + 2; + p.kh = rand() % 6 + 1; + p.kw = rand() % 6 + 1; + p.input_h = rand() % 10 + p.kh; + p.input_w = rand() % 10 + p.kw; + p.output_c = rand() % 10 + 3; + p.stride_h = rand() % (p.kh) + 1; + p.stride_w = rand() % (p.kw) + 1; + p.ins_h = rand() % p.kh; + p.ins_w = rand() % p.kw; + p.ins_h_last = rand() % p.kh;; + p.ins_w_last = rand() % p.kw;; + p.dh = rand() % 3 + 1; + p.dw = rand() % 3 + 1; + + int kh_ext = conv_kh_ext(&p); + int kw_ext = conv_kw_ext(&p); + p.pad_top = rand() % kh_ext; + p.pad_bot = rand() % kh_ext; + p.pad_left = rand() % kw_ext; + p.pad_right = rand() % kw_ext; + + if (!conv_param_is_ok(&p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p.using_bias = rand() % 2; + p.r_shift_m = rand() % 8; + p.bReLU_EN = rand() % 2; + p.opd0_sign = rand() % 2; + p.opd1_sign = 1; + p.opd2_sign = 1; + + assert(p.opd1_sign == 1 && p.opd2_sign == 1); + + int ih_ext = conv_ih_ext(&p); + int iw_ext = conv_iw_ext(&p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); +} + +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", "p->input_w = ", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", 
p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} + +static int test_ps32_ut( + conv_param_t &p_param, bmctx_t ctx, bmk_ctx_t *bk_ctx) +{ + printf("test_ps32_ut\n"); + u16 *input = alloc_input(&p_param); + u16 *weight = alloc_weight(&p_param); + u32 *bias = alloc_bias(&p_param); + u16 *output_ref = (u16 *)malloc(sizeof(u16) * conv_output_size(&p_param) * sizeof(short)); + if (!output_ref) + return BM_ERR_FAILURE; + + bmerr_t ret = ps32_conv_ref(&p_param, input, weight, bias, output_ref, 2); + assert(ret == BM_SUCCESS); + bmk1822_tiu_convolution_param_t conv_param; + make_bmk_conv_param_ps32(bk_ctx, &conv_param, &p_param, 2); + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + if (tl_alloc_success) { + + put_bf16_tensor_g2l(&ctx, bk_ctx, conv_param.ifmap, (u16 *)input, FMT_BF16); + put_conv_weight(&ctx, bk_ctx, conv_param.weight, (u16 *)weight); + bmk1822_tiu_convolution(bk_ctx, &conv_param); + bmk1822_tensor_lmem_t ps32_ofmap; + ps32_ofmap = *conv_param.ofmap; + ps32_ofmap.shape.n = ps32_ofmap.shape.n * sizeof(short); + u16 *output = (u16*) get_bf16_tensor_l2g(&ctx, bk_ctx, &ps32_ofmap, FMT_BF16); + + int has_error = array_cmp_int8( + "Comparing M2 begin_mode results ...\n", + (s8*)output_ref, (s8 *)output, conv_output_size(&p_param) * sizeof(int)); + + if (has_error) { + print_conv_param(&p_param); + printf("Comparison M2 FAILED\n"); + exit(-1); + } else + printf("Comparison M2 PASS\n"); + + free(output); + } + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + printf("test_ps32_intermediate_mode\n"); + for (int i=0; i < conv_input_size(&p_param); i++) + input[i] = convert_fp32_bf16(i); + + for (int i=0; i < conv_weight_size(&p_param); i++) + weight[i] = convert_fp32_bf16(i); + + ret = ps32_conv_ref(&p_param, input, weight, bias, output_ref, 3); + assert(ret == BM_SUCCESS); + + make_bmk_conv_param_ps32(bk_ctx, &conv_param, &p_param, 3); + tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + + if (tl_alloc_success) { + put_bf16_tensor_g2l(&ctx, bk_ctx, conv_param.ifmap, (u16 *)input, FMT_BF16); + put_conv_weight(&ctx, bk_ctx, conv_param.weight, (u16 *)weight); + + bmk1822_tiu_convolution(bk_ctx, &conv_param); + + bmk1822_tensor_lmem_t ps32_ofmap; + ps32_ofmap = *conv_param.ofmap; + ps32_ofmap.shape.n = ps32_ofmap.shape.n * sizeof(short); + + u16 *output = (u16*) get_bf16_tensor_l2g(&ctx, bk_ctx, &ps32_ofmap, FMT_BF16); + + int has_error = array_cmp_int8( + "Comparing M3 intermediate results ...\n", + (s8*)output_ref, (s8 *)output, conv_output_size(&p_param) * sizeof(int)); + + if (has_error) { + print_conv_param(&p_param); + printf("Comparison M3 FAILED\n"); + exit(-1); + } else + printf("Comparison M3 PASS\n"); + + free(output); + } + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + printf("test_ps32_end_mode\n"); + for (int i=0; i < 
conv_input_size(&p_param); i++) + input[i] = convert_fp32_bf16(i); + + for (int i=0; i < conv_weight_size(&p_param); i++) + weight[i] = convert_fp32_bf16(i); + + ret = ps32_conv_ref(&p_param, input, weight, bias, output_ref, 1); + assert(ret == BM_SUCCESS); + + make_bmk_conv_param_ps32(bk_ctx, &conv_param, &p_param, 1); + + tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + + if (tl_alloc_success) { + + put_bf16_tensor_g2l(&ctx, bk_ctx, conv_param.ifmap, (u16 *)input, FMT_BF16); + put_conv_weight(&ctx, bk_ctx, conv_param.weight, (u16 *)weight); + if (p_param.using_bias) { + put_conv_bias(&ctx, bk_ctx, conv_param.bias, bias); + } + bmk1822_tiu_convolution(bk_ctx, &conv_param); + u16 *output = (u16*) get_bf16_tensor_l2g(&ctx, bk_ctx, conv_param.ofmap, FMT_BF16); + + int has_error = array_cmp_int8( + "Comparing M1 end results ...\n", + (s8*)output_ref, (s8 *)output, conv_output_size(&p_param) * 2); + + if (has_error) { + print_conv_param(&p_param); + printf("Comparison M1 FAILED\n"); + exit(-1); + } else + printf("Comparison M1 PASS\n"); + + free(output); + } + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + free(input); + free(weight); + free(bias); + free(output_ref); + + return tl_alloc_success ? 1 : 0; +} + +static int test_ic_tiling_conv( + conv_param_t &p_param, bmctx_t ctx, bmk_ctx_t *bk_ctx) +{ + printf("test tiled ps32 conv\n"); + u16 *input = alloc_input(&p_param); + u16 *weight = alloc_weight(&p_param); + u32 *bias = alloc_bias(&p_param); + p_param.r_shift_m = 0; + u16 *output_ref = (u16 *)malloc(sizeof(u16) * conv_output_size(&p_param)); + if (!output_ref) + return BM_ERR_FAILURE; + + memset((u8*)output_ref, 0, conv_output_size(&p_param)*2); + bmerr_t ret = ps32_conv_ref(&p_param, input, weight, bias, output_ref, 0); + assert(ret == BM_SUCCESS); + + bmk1822_tiu_convolution_param_t conv_tmp_param; + bmk1822_tiu_convolution_param_t conv_param; + make_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + if (tl_alloc_success) { + if (p_param.using_bias) { + conv_tmp_param.bias = conv_param.bias; + put_conv_bias(&ctx, bk_ctx, conv_param.bias, bias); + } + // for ps32_md[1] = 1, relu_enable & rshift_bits need to set to 0 + // so we store those parameters to conv_tmp_para + conv_tmp_param.relu_enable = conv_param.relu_enable; + conv_tmp_param.rshift_bits = conv_param.rshift_bits; + conv_tmp_param.bias = conv_param.bias; + + u32 ic_step = 1; + u32 n_step = 1; + tl_t ifmap = *conv_param.ifmap; + tl_t ofmap = *conv_param.ofmap; + tg_shape_t s; + s.n = conv_param.ifmap->shape.n; + s.c = conv_param.ifmap->shape.c; + s.h = conv_param.ifmap->shape.h; + s.w = conv_param.ifmap->shape.w; + tg_t *tg_ifmap = alloc_tg_bf16_gmem(&ctx, s, FMT_BF16); + put_tg_bf16_gmem(&ctx, tg_ifmap, (u8 *)input); + + s.n = conv_param.weight->shape.n; + s.c = conv_param.weight->shape.c; + s.h = conv_param.weight->shape.h; + s.w = conv_param.weight->shape.w; + u16 *transformed_weight = + transform_weight(&conv_param.weight->shape, (u16 *)weight); + tg_t *tg_weight = alloc_tg_bf16_gmem(&ctx, s, FMT_BF16); + put_tg_bf16_gmem(&ctx, tg_weight, (u8 *)transformed_weight); + free(transformed_weight); + + tl_shape_t cur_tl_ifmap_shape = { + n_step, + ic_step, + ifmap.shape.h, + ifmap.shape.w + }; + + tg_shape_t cur_tg_ifmap_shape = { + cur_tl_ifmap_shape.n, + cur_tl_ifmap_shape.c, + cur_tl_ifmap_shape.h, + cur_tl_ifmap_shape.w + }; + + tg_stride_t cur_tg_ifmap_stride = { + tg_ifmap->stride.n, + tg_ifmap->stride.c, + 
tg_ifmap->stride.h, + }; + + tg_t cur_tg_ifmap; + cur_tg_ifmap.base_reg_index = 0; + cur_tg_ifmap.start_address = tg_ifmap->start_address; + cur_tg_ifmap.shape = cur_tg_ifmap_shape; + cur_tg_ifmap.stride = cur_tg_ifmap_stride; + cur_tg_ifmap.fmt = FMT_BF16; + + tl_t cur_tl_ifmap; + cur_tl_ifmap.shape = cur_tl_ifmap_shape; + cur_tl_ifmap.stride = + bmk1822_tensor_lmem_default_stride(bk_ctx, cur_tl_ifmap_shape, FMT_BF16, 1); + cur_tl_ifmap.start_address = ifmap.start_address; + cur_tl_ifmap.fmt = ifmap.fmt; + + tl_t cur_tl_ofmap; + cur_tl_ofmap.start_address = ofmap.start_address; + cur_tl_ofmap.shape = ofmap.shape; + cur_tl_ofmap.shape.n = n_step; + cur_tl_ofmap.stride = + bmk1822_tensor_lmem_default_stride(bk_ctx, cur_tl_ofmap.shape, FMT_BF16, 1); + cur_tl_ofmap.fmt = ofmap.fmt; + + tl_t cur_tl_weight; + cur_tl_weight.start_address = conv_param.weight->start_address; + cur_tl_weight.shape = conv_param.weight->shape; + cur_tl_weight.shape.n = ic_step; + cur_tl_weight.stride = { + 2, + cur_tl_weight.shape.n * cur_tl_weight.shape.h * cur_tl_weight.shape.w * 2, + cur_tl_weight.shape.n * cur_tl_weight.shape.w * 2, + cur_tl_weight.shape.n * 2 + }; + cur_tl_weight.fmt = conv_param.weight->fmt; + + const tl_t *saved_tl_weight = conv_param.weight; + const tl_t *saved_tl_ifmap = conv_param.ifmap; + for (u32 ci = 0; ci < ifmap.shape.c; ci += ic_step) { + { + u32 ic = tg_weight->shape.n; + u32 oc = tg_weight->shape.c; + u32 kh = tg_weight->shape.h; + u32 kw = tg_weight->shape.w; + + tg_t cur_tdma_tg_weight; + cur_tdma_tg_weight.base_reg_index = tg_weight->base_reg_index; + cur_tdma_tg_weight.start_address = tg_weight->start_address + ci * (tg_weight->fmt == FMT_BF16 ? 2 : 1); + cur_tdma_tg_weight.fmt = tg_weight->fmt; + cur_tdma_tg_weight.shape = {1, oc, kh * kw, ic}; + cur_tdma_tg_weight.stride = + bmk1822_tensor_tgmem_default_stride(cur_tdma_tg_weight.shape, FMT_BF16); + cur_tdma_tg_weight.shape = {1, oc, kh * kw, ic_step}; + + tl_t cur_tdma_tl_weight; + cur_tdma_tl_weight = cur_tl_weight; + cur_tdma_tl_weight.shape.n = cur_tdma_tg_weight.shape.n; + cur_tdma_tl_weight.shape.c = cur_tdma_tg_weight.shape.c; + cur_tdma_tl_weight.shape.h = cur_tdma_tg_weight.shape.h; + cur_tdma_tl_weight.shape.w = cur_tdma_tg_weight.shape.w; + cur_tdma_tl_weight.stride = bmk1822_tensor_lmem_default_stride( + bk_ctx, cur_tdma_tl_weight.shape, cur_tdma_tl_weight.fmt, 0); + + bmk1822_tdma_tg2l_tensor_copy_param_t p1; + memset(&p1, 0, sizeof(p1)); + p1.src = &cur_tdma_tg_weight; + p1.dst = &cur_tdma_tl_weight; + bmk1822_tdma_g2l_bf16_tensor_copy(bk_ctx, &p1); + test_submit(&ctx); + } + { + bmk1822_tdma_tg2l_tensor_copy_param_t p2; + memset(&p2, 0, sizeof(p2)); + cur_tg_ifmap.start_address = + tg_ifmap->start_address + ci * tg_ifmap->stride.c; + p2.src = &cur_tg_ifmap; + p2.dst = &cur_tl_ifmap; + bmk1822_tdma_g2l_bf16_tensor_copy(bk_ctx, &p2); + test_submit(&ctx); + } + + conv_param.ifmap = &cur_tl_ifmap; + conv_param.weight = &cur_tl_weight; + + // for ps32_md[1] = 1, relu_enable & rshift_bits need to set to 0 + // so we store those parameters to conv_tmp_para + if (ci == (ifmap.shape.c - 1)) + { + conv_param.relu_enable = conv_tmp_param.relu_enable; + conv_param.rshift_bits = conv_tmp_param.rshift_bits; + conv_param.bias = conv_tmp_param.bias; + conv_param.ps32_mode = 1; + } + else if (ci == 0) + { + conv_param.relu_enable = 0; + conv_param.rshift_bits = 0; + conv_param.bias = 0;; + conv_param.ps32_mode = 2; + } + else + { + conv_param.relu_enable = 0; + conv_param.rshift_bits = 0; + conv_param.bias = 0;; + 
conv_param.ps32_mode = 3; + } + bmk1822_tiu_convolution(bk_ctx, &conv_param); + conv_param.weight = saved_tl_weight; + conv_param.ifmap = saved_tl_ifmap; + } + + u16 *output = (u16*) get_bf16_tensor_l2g(&ctx, bk_ctx, conv_param.ofmap, FMT_BF16); + + free_tg_gmem(&ctx, tg_ifmap); + free_tg_gmem(&ctx, tg_weight); + + int has_error = array_cmp_int8( + "Comparing results ...\n", + (s8*) output_ref, (s8 *)output, conv_output_size(&p_param)*2); + if (has_error) { + print_conv_param(&p_param); + printf("Comparison FAILED\n"); + exit(-1); + } + free(output); + } + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + free(input); + free(weight); + free(output_ref); + free(bias); + + return tl_alloc_success ? 1 : 0; +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + int test_finished_num = 0; + int round_mode; + round_mode = set_store_feround(); + + for (int i = 0; i < 15; i++) { + printf("random_test_conv iteration: %d\n", i); + conv_param_t test_conv_param; + init_conv_param(test_conv_param); + //print_conv_param(&test_conv_param); + test_finished_num += test_ic_tiling_conv(test_conv_param, ctx, bk_ctx); + test_finished_num += test_ps32_ut(test_conv_param, ctx, bk_ctx); + if (!test_conv_param.using_bias) + test_conv_param.using_bias = 1; + if (test_conv_param.output_c <= 9) + test_conv_param.output_c += 3; + //print_conv_param(&test_conv_param); + test_finished_num += test_ic_tiling_conv(test_conv_param, ctx, bk_ctx); + test_finished_num += test_ps32_ut(test_conv_param, ctx, bk_ctx); + } + printf("test_finished_num: %d\n", test_finished_num); + restore_feround(round_mode); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_bf16_conv_zero_ratio.cpp b/cviruntime/test/1822/bf16/test_1822_bf16_conv_zero_ratio.cpp new file mode 100644 index 000000000..a74a2040d --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_bf16_conv_zero_ratio.cpp @@ -0,0 +1,741 @@ +#include "../1822_test_util.h" + +typedef struct{ + u16 *conv_input; + u16 *conv_weight; + u32 *conv_bias; + u16 *conv_output; + u16 *conv_output_ref; +}u_test_data; + +typedef struct { + int random_seed; + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int stride_w; + int output_c; + int using_bias; + int bReLU_EN; + int r_shift_m; + int opd0_sign; + int opd1_sign; + int opd2_sign; + int izratio; + int kzratio; +} conv_param_t; + +conv_param_t conv_param; +u_test_data u16_test_data; +bmk1822_tiu_convolution_param_t bmk_conv_param; + +bmk1822_tensor_lmem_t *skip_tensor_lmem[10]; +u32 skip_tensor_num=0; + +/* need to make sure the free order of alloc_tl for skip_tensor_lmem*/ +void skip_tensor_lmem_size(bmk_ctx_t *bmk, const bmk1822_tensor_lmem_t *p) +{ + u32 needed = align_up(p->shape.n * p->stride.n, BM1822_HW_EU_NUM); + u32 start_addr = p->start_address + needed; //src_shape.n*src_shape.c*src_shape.h*src_shape.w/32; + u32 remain_size = start_addr % BM1822_HW_LMEM_BANK_SIZE ? 
(BM1822_HW_LMEM_BANK_SIZE - start_addr % BM1822_HW_LMEM_BANK_SIZE) : 0; // remaining bytes up to the next bank boundary, per lane
+  if (remain_size)
+  {
+    tl_shape_t src_shape2 = {1, BM1822_HW_NPU_NUM, 1, remain_size};
+    skip_tensor_lmem[skip_tensor_num] = alloc_tl(bmk, src_shape2, FMT_BF16, 1); // consume the remainder so the next tl allocation starts on a bank boundary
+  }
+  skip_tensor_num++;
+}
+
+void free_skip_tensor_lmem(bmk_ctx_t *ctx)
+{
+  if (skip_tensor_lmem[--skip_tensor_num] != NULL)
+    free_tl(ctx, skip_tensor_lmem[skip_tensor_num]);
+}
+
+static inline void bf16_relu(float *buf, u64 size)
+{
+  for (u64 i = 0; i < size; i++)
+    if (buf[i] < 0)
+      buf[i] = 0;
+}
+
+static int conv_ref(
+    const conv_param_t *p_param,
+    const u16 *ifmap,
+    const u16 *weight,
+    const u32 *bias,
+    u16 *ofmap)
+{
+  int in = p_param->input_n;
+  int ic = p_param->input_c;
+  int ih = p_param->input_h;
+  int iw = p_param->input_w;
+  int oc = p_param->output_c;
+  int kh = p_param->kh;
+  int kw = p_param->kw;
+  int dh = p_param->dh;
+  int dw = p_param->dw;
+  int stride_h = p_param->stride_h;
+  int stride_w = p_param->stride_w;
+  int pad_top = p_param->pad_top;
+  int pad_bot = p_param->pad_bot;
+  int pad_left = p_param->pad_left;
+  int pad_right = p_param->pad_right;
+  int ins_h = p_param->ins_h;
+  int ins_h_last = p_param->ins_h_last;
+  int ins_w = p_param->ins_w;
+  int ins_w_last = p_param->ins_w_last;
+  int do_relu = p_param->bReLU_EN;
+
+  int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot);
+  int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right);
+  int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0);
+  int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0);
+  int oh = calc_output_hw(ih_ext, kh_ext, stride_h);
+  int ow = calc_output_hw(iw_ext, kw_ext, stride_w);
+  float *result = (float *)malloc(sizeof(float) * in * oc * oh * ow);
+  if (!result)
+    return BM_ERR_FAILURE;
+
+  memset(result, 0, sizeof(float) * in * oc * oh * ow);
+  int ret = BM_SUCCESS;
+
+  for (int n = 0; n < in; ++n) {
+    for (int c = 0; c < oc; ++c) {
+      u16 *i_fmap_pad[ic];
+      u16 *kernel_pad[ic];
+
+      for (int iic = 0; iic < ic; ++iic) {
+        i_fmap_pad[iic] = NULL;
+        kernel_pad[iic] = NULL;
+        fill_pad_fmap_bf16(
+            (ifmap + n*ic*ih*iw + iic*ih*iw), &i_fmap_pad[iic], convert_fp32_bf16(0),
+            pad_left, pad_right, pad_top, pad_bot,
+            ins_h, ins_w, ins_h_last, ins_w_last, ih, iw);
+        // kernel dilation
+        fill_pad_fmap_bf16(
+            (weight + c*ic*kh*kw + iic*kh*kw), &kernel_pad[iic], convert_fp32_bf16(0),
+            0, 0, 0, 0, // no padding
+            dh - 1, dw - 1, 0, 0,
+            kh, kw);
+      }
+      for (int ph = 0; ph < oh; ++ph) {
+        for (int pw = 0; pw < ow; ++pw) {
+          float result_val = result[n*oc*oh*ow + c*oh*ow + ph*ow + pw];
+          for (int idxh = 0; idxh < kh_ext; idxh += dh) {
+            for (int idxw = 0; idxw < kw_ext; idxw += dw) {
+              for (int iic = 0; iic < ic; ++iic) {
+                float ifv = convert_bf16_fp32(i_fmap_pad[iic][(idxh+ph*stride_h) * iw_ext + idxw + pw*stride_w]);
+                float ikv = convert_bf16_fp32(kernel_pad[iic][idxh * kw_ext + idxw]);
+                result_val += ifv * ikv;
+              }
+            }
+          }
+          result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] = result_val;
+        }
+      }
+      if (p_param->using_bias) {
+        for (int ph = 0; ph < oh; ++ph) {
+          for (int pw = 0; pw < ow; ++pw) {
+            result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += convert_hex_fp32(bias[c]);
+          }
+        }
+      }
+      if (do_relu)
+        bf16_relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow);
+      for (int i = 0; i < ic; i++) {
+        free(i_fmap_pad[i]);
+        free(kernel_pad[i]);
+      }
+    } // end for (int c = 0; c < oc; ++c)
+  }
+
+  for (int i = 0; i < in * oc * oh * ow; i++) {
+    ofmap[i] = convert_fp32_bf16(result[i]);
+  }
+  free(result);
+  return ret;
+}
+
+static u16 * transform_weight(const tl_shape_t *s, u16 before[])
+{
+  u32 ic = s->n;
+  u32 oc = s->c;
+  u32 kh = s->h;
+  u32 kw = s->w;
+
+  u32 size = ic * oc * kh * kw;
+  u16 *after = (u16 *)malloc(sizeof(u16) * size);
+
+  /*
+   * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic)
+   */
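+  /*
+   * Worked example of the mapping with ic = 2, kh = 1, kw = 2
+   * (naming weights w[oc][ic][kw]):
+   *   before = { w000, w001, w010, w011, ... }  // ic-major within each oc
+   *   after  = { w000, w010, w001, w011, ... }  // ic innermost
+   * i.e. per output channel the kh*kw taps form the h dimension and the
+   * input channels become the innermost w dimension consumed by the TIU.
+   */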
+ for (u32 oci = 0; oci < oc; oci++) { + for (u32 ici = 0; ici < ic; ici++) { + for (u32 khi = 0; khi < kh; khi++) { + for (u32 kwi = 0; kwi < kw; kwi++) { + u32 src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + u32 dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + u16 *data) +{ + const tl_shape_t *s = &tl->shape; + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw * 2); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + u16 *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. bm_memcpy_s2d regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. + */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //bmmem_device_t ab_dev_mem = bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + //bmmem_device_t ab_dev_mem = bmmem_device_prealloc_raw(*ctx, NULL, ab_addr + gaddr, bmshape_get_size(&bms)); + + int ret = bm_memcpy_s2d(*ctx, dev_mem, (u8 *)transformed_data); + assert(ret == BM_SUCCESS); + + tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_BF16; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1822_tensor_tgmem_default_stride(tdma_tg.shape, FMT_BF16); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1822_tensor_lmem_default_stride(bk_ctx, tdma_shape, FMT_BF16, 0); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1822_tdma_g2l_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + bmmem_device_free(*ctx, dev_mem); + //bmmem_device_free(*ctx, ab_dev_mem); + free(transformed_data); +} + +static u16 * transform_bias(int oc, u32 before[]) +{ + u16 *after = (u16 *)malloc(sizeof(u16) * 2 * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = (before[i] >> 16) & 0xffff; + after[i + oc] = before[i] & 0xffff; + } + return after; +} + +static void put_conv_bias( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + u32 *data) +{ + + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2 * sizeof(short), oc, 1, 1); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + u16 *transformed_data = transform_bias(oc, data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (u8 *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_BF16; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1822_tensor_tgmem_default_stride(tg.shape, FMT_BF16); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, dev_mem); + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 
1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static u16 * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + u16 *buf = (u16 *)malloc(sizeof(u16) * size); + for (int i = 0; i < size; i++) { + if (p->izratio == 0) //almost 100% not zero + buf[i] = convert_fp32_bf16(rand() % 256 - 128); + else if (p->izratio == 1) + buf[i] = convert_fp32_bf16(rand() % 2 ? rand() % 256 - 128 : 0); + else + buf[i] = 0; + } + return buf; +} + +static u16 * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + u16 *buf = (u16 *)malloc(sizeof(u16) * size); + for (int i = 0; i < size; i++) { + if (p->kzratio == 0) //almost 100% not zero + buf[i] = convert_fp32_bf16(rand() % 256 - 128); + else if (p->kzratio == 1) + buf[i] = convert_fp32_bf16(rand() % 2 ? rand() % 256 - 128 : 0); + else + buf[i] = 0; + } + return buf; +} + +static u32 * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + u32 *bias = (u32 *)malloc(sizeof(u32) * oc); + for (int i = 0; i < oc; i++) + bias[i] = convert_fp32_hex(rand() % 65536 - 32768); + + return bias; +} + +static tl_t * conv_ifmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + //fmt_t fmt = p->opd0_sign? FMT_I8: FMT_U8; + fmt_t fmt = FMT_BF16; + tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return bmk1822_lmem_alloc_tensor(ctx, s, fmt, 1); +} + +static tl_t * conv_weight_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + //fmt_t fmt = p->opd1_sign? FMT_I8: FMT_U8; + fmt_t fmt = FMT_BF16; + tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return bmk1822_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static tl_t * conv_ofmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + tl_shape_t s; + s.n = p->input_n; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return bmk1822_lmem_alloc_tensor(ctx, s, FMT_BF16, 1); +} + +static tl_t * conv_bias_tensor( + bmk_ctx_t *ctx, const conv_param_t *p) +{ + //fmt_t fmt = p->opd2_sign? 
FMT_I8: FMT_U8; + fmt_t fmt = FMT_BF16; + tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return bmk1822_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const bmk1822_tiu_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1822_tiu_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + + dst->ifmap = conv_ifmap_tensor(ctx, p); + skip_tensor_lmem_size(ctx, dst->ifmap); + dst->weight = conv_weight_tensor(ctx, p); + skip_tensor_lmem_size(ctx, dst->weight); + dst->ofmap = conv_ofmap_tensor(ctx, p); + skip_tensor_lmem_size(ctx, dst->ofmap); + dst->bias = NULL; + dst->ps32_mode = 0; + if (p->using_bias) + { + dst->bias = conv_bias_tensor(ctx, p); + skip_tensor_lmem_size(ctx, dst->bias); + } + dst->w_is_const = 0; +} + +static void free_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1822_tiu_convolution_param_t *r, + const conv_param_t *p) +{ + if (p->using_bias && r->bias) + { + free_skip_tensor_lmem(ctx); + free_tl(ctx, r->bias); + } + if (r->ofmap) + { + free_skip_tensor_lmem(ctx); + free_tl(ctx, r->ofmap); + } + if (r->weight) + { + free_skip_tensor_lmem(ctx); + free_tl(ctx, r->weight); + } + if (r->ifmap) + { + free_skip_tensor_lmem(ctx); + free_tl(ctx, r->ifmap); + } +} + +static void init_conv_param(conv_param_t &p) +{ +retry: + p.input_n = 1; + p.input_c = 16; + p.input_h = 2; + p.input_w = 600; + + p.kh = 2; + p.kw = 16; + p.output_c = 16; + + p.stride_h = 1; + p.stride_w = 15; + p.ins_h = 0; + p.ins_w = 0; + p.ins_h_last = 0;; + p.ins_w_last = 0;; + p.dh = 1; + p.dw = 1; + + int kh_ext = conv_kh_ext(&p); + int kw_ext = conv_kw_ext(&p); + p.pad_top = 1; + p.pad_bot = 0; + p.pad_left = 0; + p.pad_right = 0; + + if (!conv_param_is_ok(&p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p.using_bias = 0; + p.r_shift_m = 7; + p.bReLU_EN = 1; + + p.opd0_sign = 0; + p.opd1_sign = 1; + p.opd2_sign = 1; + + assert(p.opd1_sign == 1 && p.opd2_sign == 1); + + int ih_ext = conv_ih_ext(&p); + int iw_ext = conv_iw_ext(&p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); + +} + +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", 
"p->input_w = ", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} + +static int setup_conv( + conv_param_t &p_param, bmctx_t ctx, bmk_ctx_t *bk_ctx) +{ + u16_test_data.conv_input = alloc_input(&p_param); + u16_test_data.conv_weight = alloc_weight(&p_param); + u16_test_data.conv_bias = alloc_bias(&p_param); + //p_param.r_shift_m = calc_rshift_m(&p_param, s8_test_data.conv_weight); + u16_test_data.conv_output_ref = (u16 *)malloc(sizeof(u16) * conv_output_size(&p_param)); + if (!u16_test_data.conv_output_ref) + return BM_ERR_FAILURE; + + bmerr_t ret = conv_ref(&p_param, u16_test_data.conv_input, u16_test_data.conv_weight, u16_test_data.conv_bias, u16_test_data.conv_output_ref); + assert(ret == BM_SUCCESS); + make_bmk_conv_param(bk_ctx, &bmk_conv_param , &p_param); + + bmk_conv_param_alloc_ok(&bmk_conv_param, &p_param); + + put_bf16_tensor_g2l(&ctx, bk_ctx, bmk_conv_param.ifmap, (u16 *)u16_test_data.conv_input, FMT_BF16); + put_conv_weight(&ctx, bk_ctx, bmk_conv_param.weight, (u16 *)u16_test_data.conv_weight); + if (p_param.using_bias) + put_conv_bias(&ctx, bk_ctx, bmk_conv_param.bias, u16_test_data.conv_bias); + + return 1; +} + +void get_result(bmctx_t *ctx, bmk_ctx_t *bmk) +{ + u16_test_data.conv_output = (u16*) get_bf16_tensor_l2g(ctx, bmk, bmk_conv_param.ofmap, FMT_BF16); +} + +void check_result() +{ + int has_error = array_cmp_int8( + "conv Comparing results ...\n", + (s8*)u16_test_data.conv_output_ref, (s8 *)u16_test_data.conv_output, conv_output_size(&conv_param)*2); + + if (has_error) { + print_conv_param(&conv_param); + printf("Comparison FAILED\n"); + exit(-1); + } + +} + +void trigger_max_power(bmctx_t *ctx, bmk_ctx_t *bmk) +{ + bmk1822_tiu_convolution(bmk, &bmk_conv_param); + test_submit(ctx); +} + +void free_s8_data() +{ + free(u16_test_data.conv_input); + free(u16_test_data.conv_weight); + free(u16_test_data.conv_bias); + free(u16_test_data.conv_output); + free(u16_test_data.conv_output_ref); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + for (int i = 0; i < 3; i++) { + for (int k = 0; k < 3; k++) { + printf("bf16 conv zero ratio test: ( %d ) ( %d )\n",i,k); + init_conv_param(conv_param); + conv_param.izratio = i; + 
conv_param.kzratio = k; + setup_conv(conv_param, ctx, bk_ctx); + + trigger_max_power(&ctx, bk_ctx); + get_result(&ctx, bk_ctx); + check_result(); + + free_bmk_conv_param(bk_ctx, &bmk_conv_param, &conv_param); + free_s8_data(); + } + } + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_bf16_depthwise.cpp b/cviruntime/test/1822/bf16/test_1822_bf16_depthwise.cpp new file mode 100644 index 000000000..bad465cbb --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_bf16_depthwise.cpp @@ -0,0 +1,444 @@ +#include "../1822_test_util.h" + +#define INVALIDE_STRIDE (-1) +typedef bmk1822_tiu_depthwise_convolution_param_t param_t; +int random_seed; +static void print_pooling_param(param_t *p) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int kh = p->weight->shape.h; + int kw = p->weight->shape.w; + + printf(" Pooling parameters:\n"); + printf(" random_seed : %d \n", random_seed); + printf(" ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw); + printf(" opd0_sign = %d\n", p->ifmap->fmt == FMT_I8); + printf(" weight = (%d, %d)\n", kh, kw); + printf(" padding = (%d, %d, %d, %d)\n", + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right); + printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w); + printf(" ins0 = (%d, %d, %d, %d)\n", + p->ins_h, p->ins_last_h, p->ins_w, p->ins_last_w); + printf(" dilation = (%d, %d)\n",p->dilation_h, p->dilation_w); + printf(" rshift_bits = %d\n", p->rshift_bits); + printf(" relu_enable = %d\n", p->relu_enable); + printf(" res0_sign = %d\n", p->ofmap->fmt == FMT_I8); +} + +static u16 *alloc_input(param_t *p) +{ + u64 size = tl_shape_size(&p->ifmap->shape); + u16 *data = (u16 *)xmalloc(size * 2); + if (!data) + return NULL; + + for (u64 i = 0; i < size; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX/2; //5 ~ -5 + val = (float)(rand()-RAND_MAX2)*5 / (float)RAND_MAX; + data[i] = convert_fp32_bf16(val); + } + return data; +} + +static u16 *alloc_weight(param_t *p) +{ + int size = tl_shape_size(&p->weight->shape); + u16 *data = (u16 *)xmalloc(size * 2); + if (!data) + return NULL; + + for (int i = 0; i < size; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX/2; //5 ~ -5 + val = (float)(rand()-RAND_MAX2)*5 / (float)RAND_MAX; + data[i] = convert_fp32_bf16(val); + } + return data; +} + +static u32 *alloc_bias(param_t *p) +{ + int c = p->bias->shape.c; + u32 *bias = (u32 *)malloc(sizeof(u32) * c); + if (!bias) + return NULL; + + for (int i = 0; i < c; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX/2; //2 ~ -2 + val = (float)(rand()-RAND_MAX2)*2 / (float)RAND_MAX; + bias[i] = convert_fp32_hex(val); + } + return bias; +} + +static u16 *alloc_output(param_t *p) +{ + u64 size = tl_shape_size(&p->ofmap->shape); + return (u16 *)xmalloc(size * 2); +} + +static inline void bf16_relu(u16 *buf, u64 size) +{ + for (u64 i = 0; i < size; i++) + if (convert_bf16_fp32(buf[i]) < 0) + buf[i] = convert_fp32_bf16(0); +} + +static int index_get(int h, int w1, int w2) +{ + return h * w1 + w2; +} + +int native_pooling_avg_bf16( + const u16* i_fmap, + const void* weight, + const u32 *bias, + u16 * o_fmap, + int input_n, int input_c, int input_h, int input_w, + int kh, int kw, + int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, + int stride_h, int stride_w, + int ins_h, int ins_w, + int ins_h_last, int ins_w_last, + int dh, int dw, + int const_weight) +{ + if (kh * kw <= 0) + return BM_ERR_INVALID_ARGUMENT; + + u16 avg_const_weight = *(u16 *)weight; + u16 *weight_arr = (u16*)weight; + 
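+  // Reference flow: dilute the ifmap with the ins_* insertions plus padding,
+  // dilute the kernel by (dh, dw), then take the inner product of each
+  // d_kh x d_kw window per output pixel (inner_float_product below), add the
+  // per-channel fp32 bias if present, and round the result back to bf16.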
int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); + int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); + int d_kh = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int d_kw = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int output_h = calc_output_hw(h_after, d_kh, stride_h); + int output_w = calc_output_hw(w_after, d_kw, stride_w); + float *avg_pooling_mac_a = (float *)malloc(d_kh * d_kw * sizeof(float)); + float *avg_pooling_mac_b = (float *)malloc(d_kh * d_kw * sizeof(float)); + + u16 *i_fmap_pad = NULL; + u16 *i_kmap_pad = NULL; + for (int n = 0; n < input_n; n++) { + if (const_weight == 0) + weight_arr = (u16*)weight; + + for (int c = 0; c < input_c; ++c) { + fill_pad_fmap_bf16(i_fmap, &i_fmap_pad, 0, + pad_w_l, pad_w_r, pad_h_t, pad_h_b, + ins_h, ins_w, ins_h_last, ins_w_last, + input_h, input_w); + + //kernel_dilation( + if (const_weight == 0) + fill_pad_fmap_bf16( + (weight_arr ), &i_kmap_pad, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + float avg_pool_result; + for (int ph = 0; ph < output_h; ++ph) { + for (int pw = 0; pw < output_w; ++pw) { + int hstart = ph * stride_h; + int wstart = pw * stride_w; + int pool_index = index_get(ph, output_w, pw); + int mac_index = 0; + + for (int h = 0; h < d_kh; h++) { + for (int w = 0; w < d_kw; w++) { + int index = index_get((hstart+h), w_after, (w+wstart)); + mac_index = h*d_kw + w; + + avg_pooling_mac_a[mac_index] = convert_bf16_fp32(i_fmap_pad[index]); + + avg_pooling_mac_b[h*d_kw+w] = const_weight ? + convert_bf16_fp32(avg_const_weight) : convert_bf16_fp32(i_kmap_pad[mac_index]); + } + } + inner_float_product(avg_pooling_mac_a, avg_pooling_mac_b, d_kh * d_kw, + &avg_pool_result); + + if(bias) { + avg_pool_result += convert_hex_fp32(bias[c]); + } + *(o_fmap+pool_index) = convert_fp32_bf16(avg_pool_result); + } + } + weight_arr += kh * kw; + i_fmap += input_w * input_h; + o_fmap += output_w * output_h; + } + } + free(i_fmap_pad); + free(i_kmap_pad); + free(avg_pooling_mac_a); + free(avg_pooling_mac_b); + + return BM_SUCCESS; +} + +static void compare_results( + param_t *p, + u16 input[], + u16 weight[], + u32 bias[], + u16 output[]) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int kh = p->weight->shape.h; + int kw = p->weight->shape.w; + + u16 *output_ref = alloc_output(p); + bmerr_t ret = native_pooling_avg_bf16( + input, weight, p->bias ? 
bias : NULL, output_ref, + in, ic, ih, iw, kh, kw, + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right, + p->stride_h, p->stride_w, + p->ins_h, p->ins_w, p->ins_last_h, p->ins_last_w, + p->dilation_h, p->dilation_w, 0 + ); + assert(ret == BM_SUCCESS); + + if(p->relu_enable ) + bf16_relu(output_ref, tl_shape_size(&p->ofmap->shape)); + + int cmp_res = array_cmp_int8( + "Comparing results ...\n", (s8*) output_ref, (s8*) output, + tl_shape_size(&p->ofmap->shape) * 2); + + if (cmp_res != 0) { + printf("Comparison FAILED!!!\n"); + print_pooling_param(p); + exit(-1); + } + + free(output_ref); +} + +static int pooling_ih_ext(param_t *p, int ih) +{ + int ins = p->ins_h; + int ins_last = p->ins_last_h; + int pad = p->pad_top + p->pad_bottom; + return (ih - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_iw_ext(param_t *p, int iw) +{ + int ins = p->ins_w; + int ins_last = p->ins_last_w; + int pad = p->pad_left + p->pad_right; + return (iw - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_oh(param_t *p, int ih, int kh, int dh) +{ + int ih_ext = pooling_ih_ext(p, ih); + int d_h = (kh -1) * dh + 1; + return (ih_ext - d_h) / p->stride_h + 1; +} + +static int pooling_ow(param_t *p, int iw, int kw, int dw) +{ + int iw_ext = pooling_iw_ext(p, iw); + int d_w = (kw -1) * dw +1; + return (iw_ext - d_w) / p->stride_w + 1; +} + +static void free_depthwise_param( + bmk_ctx_t *ctx, + param_t *p) +{ + if (p->bias) + free_tl(ctx, p->bias); + + if (p->weight) + free_tl(ctx, p->weight); + + if (p->ifmap) + free_tl(ctx, p->ifmap); + + if (p->ofmap) + free_tl(ctx, p->ofmap); +} + +static param_t random_depthwise_param(bmk_ctx_t *ctx, int stride_w, int stride_h) +{ + param_t p; + + memset(&p, 0, sizeof(p)); + +retry: + random_seed = clock(); + srand(random_seed); + int using_bias = rand() % 2; + int n = rand() % 5 + 1; + int c = rand() % (3 * BM1822_HW_NPU_NUM) + 1; + int ih = rand() % 30 + 3 + (INVALIDE_STRIDE == stride_h ? 0 : stride_h); + int iw = rand() % 30 + 6 + (INVALIDE_STRIDE == stride_w ? 0 : stride_w); + int kh = rand() % 7 + 1; + int kw = rand() % 7 + 1; + + p.ins_h = rand() % kh; + p.ins_w = rand() % kw; + p.ins_last_h = rand() % kh; + p.ins_last_w = rand() % kw; + p.stride_h = INVALIDE_STRIDE == stride_h ? rand() % (kh) + 1 : stride_h; + p.stride_w = INVALIDE_STRIDE == stride_w ? 
rand() % (kw) + 1 : stride_w;
+  p.pad_top = rand() % kh;
+  p.pad_bottom = rand() % kh;
+  p.pad_left = rand() % kw;
+  p.pad_right = rand() % kw;
+  p.rshift_bits = rand() % 32;
+  p.dilation_h = rand()%4 + 1;
+  p.dilation_w = rand()%4 + 1;
+
+  int oh = pooling_oh(&p, ih, kh, p.dilation_h);
+  int ow = pooling_ow(&p, iw, kw, p.dilation_w);
+  int d_kh = calc_dilute_hw(kh, p.dilation_h - 1, 0, 0, 0);
+  int d_kw = calc_dilute_hw(kw, p.dilation_w - 1, 0, 0, 0);
+
+  tl_shape_t ofmap_shape;
+  ofmap_shape.n = n;
+  ofmap_shape.c = c;
+  ofmap_shape.h = oh;
+  ofmap_shape.w = ow;
+  tl_shape_t ifmap_shape;
+  ifmap_shape.n = n;
+  ifmap_shape.c = c;
+  ifmap_shape.h = ih;
+  ifmap_shape.w = iw;
+  tl_shape_t weight_shape;
+  weight_shape.n = 1;
+  weight_shape.c = c;
+  weight_shape.h = kh;
+  weight_shape.w = kw;
+  tl_shape_t bias_shape;
+  bias_shape.n = 2;
+  bias_shape.c = c;
+  bias_shape.h = 1;
+  bias_shape.w = 1;
+  p.relu_enable = rand()%2;
+
+  fmt_t ifmt = FMT_BF16;
+  p.ofmap = bmk1822_lmem_alloc_tensor(ctx, ofmap_shape, FMT_BF16, 1);
+  p.ifmap = bmk1822_lmem_alloc_tensor(ctx, ifmap_shape, ifmt, 1);
+  p.weight = bmk1822_lmem_alloc_tensor(ctx, weight_shape, FMT_BF16, 1);
+  p.bias = NULL;
+  if (using_bias)
+    p.bias = bmk1822_lmem_alloc_tensor(ctx, bias_shape, FMT_BF16, 0);
+
+  if ((kh > pooling_ih_ext(&p, ih))
+      || (kw > pooling_iw_ext(&p, iw))
+      || (oh < d_kh)
+      || (ow < d_kw)
+      || (p.pad_top >= (1 << 4))
+      || (p.pad_bottom >= (1 << 4))
+      || (p.pad_left >= (1 << 4))
+      || (p.pad_right >= (1 << 4))
+      || !p.ofmap
+      || !p.ifmap
+      || !p.weight
+      || (using_bias && !p.bias)) {
+    printf("retry init_pooling_param\n");
+    free_depthwise_param(ctx, &p);
+    goto retry;
+  }
+  return p;
+}
+
+static void put_bias_tensor(
+    bmctx_t *ctx,
+    bmk_ctx_t *bk_ctx,
+    const tl_t *tl,
+    u32 data[])
+{
+  int c = tl->shape.c;
+
+  u16 *hi_lo = (u16 *)xmalloc(2 * c * 2);
+  if (!hi_lo)
+    return;
+
+  for (int i = 0; i < c; i++) {
+    hi_lo[i] = (data[i] >> 16) & 0xffff;
+    hi_lo[i + c] = (data[i] & 0xffff);
+  }
+  put_bf16_tensor_g2l(ctx, bk_ctx, tl, (u16 *)hi_lo, FMT_BF16);
+
+  free(hi_lo);
+}
+
+static int _test_pooling(bmctx_t ctx, bmk_ctx_t *bk_ctx, int stride_w, int stride_h)
+{
+  param_t param = random_depthwise_param(bk_ctx, stride_w, stride_h);
+  //print_pooling_param(&param);
+  u16 *input = alloc_input(&param);
+  u16 *weight = alloc_weight(&param);
+  u32 *bias = NULL;
+  if (param.bias)
+    bias = alloc_bias(&param);
+
+  put_bf16_tensor_g2l(&ctx, bk_ctx, param.ifmap, (u16 *)input, FMT_BF16);
+  put_bf16_tensor_g2l(&ctx, bk_ctx, param.weight, (u16 *)weight, FMT_BF16);
+  if (param.bias)
+    put_bias_tensor(&ctx, bk_ctx, param.bias, bias);
+
+  bmk1822_tiu_depthwise_convolution(bk_ctx, &param);
+  u16 *output = (u16 *)get_bf16_tensor_l2g(&ctx, bk_ctx, param.ofmap, FMT_BF16);
+  compare_results(&param, input, weight, bias, output);
+
+  free_depthwise_param(bk_ctx, &param);
+  free(input);
+  free(weight);
+  free(bias);
+  free(output);
+
+  return 1;
+}
+
+static int test_pooling(bmctx_t ctx, bmk_ctx_t *bk_ctx) {
+  return _test_pooling(ctx, bk_ctx, INVALIDE_STRIDE, INVALIDE_STRIDE);
+}
+
+static void test_depthwise_pooling(bmctx_t *ctx, bmk_ctx_t *bk_ctx)
+{
+  int test_finished_num = 0;
+  for (u64 i = 0; i < 20; i++)
+    test_finished_num += test_pooling(*ctx, bk_ctx);
+
+  // test stride extend (0, 31]
+  int stride_list[] = {15, 16, 31};
+  int stride_list_len = sizeof(stride_list) / sizeof(stride_list[0]);
+
+  for (int stride_w_idx = 0; stride_w_idx < stride_list_len; stride_w_idx++) {
+    for (int stride_h_idx = 0; stride_h_idx < stride_list_len; stride_h_idx++) {
+      int stride_w =
stride_list[stride_w_idx]; + int stride_h = stride_list[stride_h_idx]; + test_finished_num += _test_pooling(*ctx, bk_ctx, stride_w, stride_h); + } + } + printf("Test finished %d\n", test_finished_num); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + int round_mode; + round_mode = set_store_feround(); + test_depthwise_pooling(&ctx, bk_ctx); + restore_feround(round_mode); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_bf16_inv_sqrt.cpp b/cviruntime/test/1822/bf16/test_1822_bf16_inv_sqrt.cpp new file mode 100644 index 000000000..f7f6c4f3d --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_bf16_inv_sqrt.cpp @@ -0,0 +1,2493 @@ +/** + */ +#include "../1822_test_util.h" +#define OUT +#define IN + +using namespace std; +static u32 channel = -1; // -128.0 && input < 128.0; + if (!ret) { + printf("invalid int8 range, input is %f\n", input); + } + return ret; +} + +// -2 * (-2^-30) + m = base; + if (p == 0) { + m = 1; // pow(base, 0) its fine + } + else if (p > 0) { + p = p - 1; + } + else { + // p < 0 + p = p + 1; + } + } + + double f = (double) (m * pow(base, p * -0.5)); + + if (isnan(f)) { + assert(0); + } + return f; +} + +static void tl_lut_ref( + u16 *ofmap, + u16 *ifmap, + u16 *table, + u16 *table_slope, + tl_shape_t ifmap_shape, + tl_shape_t table_shape) +{ + int tn, th, tw; + + tn = table_shape.n; + th = table_shape.h; + tw = table_shape.w; + assert(tn == 1); + assert(th * tw == 256); + assert(ofmap); + assert(ifmap); + assert(table); + assert(table_slope); + assert(tl_shape_size(&ifmap_shape)); + + // TODO: use c function + // TODO: cal error with `eval_lut.py` +#if 0 + // 1. dump all input as binary file + #define INFP32FILE "inv_infp32file.bin" + #define OUTBF16FILE "inv_lutbf16out.bin" + FILE* pFile; + pFile = fopen(INFP32FILE, "wb"); + fwrite(ifmap, 1, tl_shape_size(&ifmap_shape) *sizeof(u16), pFile); + fclose(pFile); + + // 2. read result from `eval_lut.py` + char command[256]; + // func_id 4 means invsqrt + // lut_type_id 1 means exp + sprintf(command, "python eval_lut.py --lut_input_range_start %d --lut_input_range_end %d --func_id 4 --lut_type_id 1 --inputfloat32 %s --outputbf16 %s 2>&1 > 2\n", + exp_start, exp_end, + INFP32FILE, OUTBF16FILE); + + // printf ("command is %s\n", command); + system(command); + + pFile = fopen(OUTBF16FILE, "rb"); + if (!pFile) { + fprintf(stderr, "open golden %s fail\n", OUTBF16FILE); + exit(-1); + } + + fread(ofmap, sizeof(u16), tl_shape_size(&ifmap_shape), pFile); + fclose(pFile); +#endif + +#if 0 + for (u64 i = 0; i < tl_shape_size(&ifmap_shape); i++) { + printf ("ref %" PRIu64 " input %x golden %x\n", i, ifmap[i], ofmap[i]); + } +#endif +} + +static void gen_sqrt_inv(u16 *table_data, u64 table_size) { + // S(x) = 1 / (1 + (e^-x)) + // 0, exp from 0 -62 -61 .. 
62 63 + for (int i = 0; i < half; i++) { + float exp = exp_start + i; + double s = _gen_sqrt_inv(2, exp); + sqrt_hw[idx] = s; + table_data[idx] = convert_fp32_bf16(s); +#if 0 + printf("t [%" PRIu64 "] is %f(%e - %.8lf)[2^%f] bf %x\n", idx, convert_bf16_fp32(table_data[idx]), s, s, exp, table_data[idx]); +#endif + idx++; + } + + //// idx = 127 dont care + s = _gen_sqrt_inv(2, -0); + sqrt_hw[idx] = s; + table_data[idx] = convert_fp32_bf16(s); +#if 0 + printf("t [%" PRIu64 "] is %f[%d] bf %x\n", idx, convert_bf16_fp32(table_data[idx]), 0, table_data[idx]); +#endif + idx++; + + for (int i = 1; i < half; i++) { + float exp = exp_start + i; + double s = _gen_sqrt_inv(-2, exp); + sqrt_hw[idx] = s; + table_data[idx] = convert_fp32_bf16(s); +#if 0 + printf("t [%" PRIu64 "] is %f(%e - %.8lf)[(-2)^%f] bf %x\n", idx, convert_bf16_fp32(table_data[idx]), s, s, exp, table_data[idx]); +#endif + idx++; + } + + // idx = 255 dont care + //s = _gen_sqrt_inv(2, 0); + //table_data[idx] = convert_fp32_bf16(s); + //printf("t [%" PRIu64 "] is %f[%d]\n", idx, convert_bf16_fp32(table_data[idx]), 0); + //idx++; + +#if 0 + for (u32 i = 0; i < table_hw; i++) { + printf("t [%u] is %f\n", i, convert_bf16_fp32(table_data[i])); + } +#endif + + // duplicate channel #1 to #31 + //TODO: tensor copy + for (u32 i = 1; i < channel; i++) { + memcpy(&table_data[i * table_hw], &table_data[0], sizeof(u16) * table_hw); + } +} + +static void gen_sqrt_inv_slope(u16 IN *table_data, u16* OUT table_slope, u64 table_size) { + + u32 half = table_size / channel / 2; + assert(half == 128); + assert(table_data); + + int idx = 0; + int i = 0; + double f_x0 = sqrt_hw[i]; + double f_x1 = sqrt_hw[i+1]; + double x0 = 0; + double x1 = pow(2.0, exp_start); + double s = (f_x1 - f_x0) / (x1 - x0); + table_slope[idx] = convert_fp32_bf16(s); +#if 0 + printf ("slope [%u] = %f, 0x%x(org:%e(%.8lf)) f_x0 %lf f_x1 %lf\n", + i, convert_bf16_fp32(table_slope[i]), table_slope[i], s, s, f_x0, f_x1); +#endif + idx++; + + for (u32 i = 0; i < table_hw; i++) { + double f_x0 = sqrt_hw[idx]; + double f_x1 = sqrt_hw[idx+1]; + int shift = 0; + int sign = 1; + if (idx >= 128) { + shift = 128; + sign = -1; + } + double exp = exp_start + (double)i - (double)shift; + double x0 = pow(sign * 2.0, exp); + double x1 = pow(sign * 2.0, exp + 1); + if (idx == 127 || idx >= 255) { + double s = 0.0; + table_slope[idx] = convert_fp32_bf16(s); // not used + idx++; + continue; + } + else if (idx == 128) { + x0 = 0; + exp = exp_start; //= exp_start && exp <= exp_end); + + double s = (f_x1 - f_x0) / (x1 - x0); + table_slope[idx] = convert_fp32_bf16(s); +#if 0 + printf ("slope [%u] = %f, 0x%x(org:%e(%.8lf)) (%.8lf - %.8lf) / (%.8lf - %.8lf), diif is %d\n", + idx, convert_bf16_fp32(table_slope[idx]), table_slope[idx], s, s, + f_x1, f_x0, x1, x0, exp_start + i - shift); +#endif + idx++; + } + + // duplicate channel #1 to #31 + //TODO: tensor copy + for (u64 i = 1; i < channel; i++) { + memcpy(&table_slope[table_hw * i], &table_slope[0], sizeof(u16) * table_hw); + } +} + +static bool verify(u16 *ofmap_data, u16 *ref_data, u64 ofmap_size) { + u64 size = ofmap_size; + if (mode == PRE_DATA_COMPARE_FIX) { + //size = sizeof(sigmode_golden_bf16) / sizeof(sigmode_golden_bf16[0]); + } + + for (u64 i = 0; i < size; i++) { + u16 ref = ref_data[i]; + if (mode == PRE_DATA_COMPARE_FIX) { + ref = sigmode_golden_bf16[i]; + } + + if (ofmap_data[i] != ref) { + fprintf(stderr, + "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, ofmap_data[i], ref); + exit(-1); + +#if 0 + for (u64 i = 0; i < 
ofmap_size; i++) { + printf("error, dump all to [%" PRIx64 "]%" PRIu64 " source %x ref %x\n", i, i, ofmap_data[i], ref_data[i]); + } + +#endif + } + } + return true; +} + +static void test_tl_int8_lut_bf16(bmctx_t *ctx, bmk_ctx_t *bmk) +{ + // TODO: check more shape / align + tl_shape_t ifmap_shape; + if (mode == PRE_DATA_COMPARE_FIX) { + ifmap_shape = {1, channel, 8, 8}; + } + else { + ifmap_shape = {1, channel, 16, 16}; + } + + tl_shape_t table_shape = {1, channel, table_h, table_w}; // hard code for hw, hw:32x8 + tl_shape_t ofmap_shape = ifmap_shape; + + u64 ifmap_size = tl_shape_size(&ifmap_shape); + u64 table_size = tl_shape_size(&table_shape); + u64 ofmap_size = tl_shape_size(&ofmap_shape); + + fmt_t fmt = FMT_BF16; + + int data_type_size = bytesize_of_fmt(fmt); + u64 ifmap_bytesize = ifmap_size * data_type_size; + u64 table_bytesize = table_size * data_type_size; + u64 ofmap_bytesize = ofmap_size * data_type_size; + + // hw ONLY support index in int8 + u16 *ifmap = (u16 *)xmalloc(ifmap_bytesize); + memset(ifmap, 0x00, ifmap_bytesize); + + u16 *ifmap_slope = (u16 *)xmalloc(table_bytesize); + memset(ifmap_slope, 0x00, table_bytesize); + + if (mode == PRE_DATA_COMPARE_FIX) { + memcpy(ifmap, &test_pattern, ifmap_bytesize); +#if 0 + for (u64 i = 0; i < ifmap_size; i++) { + printf("source if[%" PRIu64 "] is %e bf16 %f (bf16)with 0x%x log2f is %f\n", i, convert_bf16_fp32(ifmap[i]), convert_bf16_fp32(ifmap[i]), ifmap[i], + log2f(convert_bf16_fp32(ifmap[i]))); + } +#endif + } + else { + for (u64 i = 0; i < ifmap_size; i++) { + // input range 0.001 - 32 + float input = ((int)i % 31) + (i % 100) * 0.012; + assert(check_input_int8_range(input)); + ifmap[i] = convert_fp32_bf16(input); +#if 0 + printf("source if[%" PRIu64 "] is bf16 %f, input is %f (bf16)with 0x%x\n", i, convert_bf16_fp32(ifmap[i]), input, ifmap[i]); +#endif + } + } + + u16 *table_data = (u16 *)xmalloc(table_bytesize); + gen_sqrt_inv (table_data, table_size); + + u16 *table_data_slope = (u16 *)xmalloc(table_bytesize); + gen_sqrt_inv_slope(table_data, table_data_slope, table_size); + + u16 *ref_data = (u16 *)xmalloc(ofmap_bytesize); + tl_lut_ref(ref_data, ifmap, table_data, table_data_slope, ifmap_shape, table_shape); + + tl_t *tl_ifmap = + alloc_tl(bmk,ifmap_shape, fmt, /*align*/1); + tl_t *tl_table_answer = + alloc_tl(bmk, table_shape, fmt, /*align*/1); + tl_t *tl_table_answer_slope = + alloc_tl(bmk, table_shape, fmt, /*align*/1); + + tl_t *tl_ofmap_A_idx = + alloc_tl(bmk,ofmap_shape, fmt, /*align*/1); + tl_t *tl_ofmap_B_slope = + alloc_tl(bmk,ofmap_shape, fmt, /*align*/1); + tl_t *tl_ofmap_A_base_val = + alloc_tl(bmk,ofmap_shape, fmt, /*align*/1); + tl_t *tl_ofmap_A_base = + alloc_tl(bmk,ofmap_shape, fmt, /*align*/1); + tl_t *tl_ofmap_C = + alloc_tl(bmk,ofmap_shape, fmt, /*align*/1); + + // (i); + printf ("test mode %d...\n", mode); + test_tl_int8_lut_bf16(&ctx, bmk); + } + + test_exit(&ctx); + restore_feround(round_mode); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_bf16_lut.cpp b/cviruntime/test/1822/bf16/test_1822_bf16_lut.cpp new file mode 100644 index 000000000..15dab8dd0 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_bf16_lut.cpp @@ -0,0 +1,112 @@ +#include "../1822_test_util.h" + +static u32 channel = -1; //shape.n; + u64 col = ml->shape.col; + return row * col; +} + +static u64 res_size(param_t *p) +{ + return matrix_size(p->res); +} + +static u16 * alloc_left(param_t *p) +{ + u64 size = matrix_size(p->left); + u16 *buf = (u16 *)malloc(sizeof(u16) * size); + for (u64 i = 0; i < size; i++) { + 
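+    // Note: bf16 keeps only 8 significand bits, so convert_fp32_bf16(i)
+    // rounds once i grows past 256. The reference in matrix_mac_ref() reads
+    // back the same rounded bf16 values, so input rounding by itself does
+    // not break the bit-exact compare in test_param().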
buf[i] = convert_fp32_bf16(i); + } + + return buf; +} + +static u16 * alloc_right(param_t *p) +{ + u64 size = matrix_size(p->right); + u16 *buf = (u16 *)malloc(sizeof(u16) * size); + for (u64 i = 0; i < size; i++) { + float val = 0.01; + buf[i] = convert_fp32_bf16(i); + val += 0.01; + } + return buf; +} + +static u32 * alloc_bias(param_t *p) +{ + if (!p->bias) + return NULL; + + u64 size = matrix_size(p->bias); + u32 *buf = (u32 *)malloc(sizeof(u32) * size); + for (u64 i = 0; i < size; i++) { + buf[i] = convert_fp32_hex(i); + } + return buf; +} + +static u32 * alloc_res(param_t *p) +{ + u64 size = res_size(p); + u32 *buf = (u32 *)malloc(sizeof(u32) * size); + for (u64 i = 0; i < size; i++) { + buf[i] = convert_fp32_bf16(i); + } + return buf; +} + +static inline void bf16_relu(float *buf, u64 size) +{ + for (u64 i = 0; i < size; i++) + if (buf[i] < 0) + buf[i] = 0; +} + +static void matrix_mac_ref( + param_t *p, u16 left[], u16 right[], u32 bias[], u32 res[]) +{ + u64 size = res_size(p); + u32 left_col = p->left->shape.col; + u32 right_col = p->right->shape.col; + u32 res_row = p->left->shape.n; + u32 res_col = p->res->shape.col; + u32 left_c = p->left->shape.c; + u32 left_w = p->left->shape.w; + + float *tmp_res = (float *)malloc(sizeof(float) * size); + if (p->add_result) { + for (u32 i = 0; i < res_row * res_col; i++) + tmp_res[i] = convert_bf16_fp32(res[i]); + } else { + for (u32 i = 0; i < res_row * res_col; i++) + tmp_res[i] = 0; + } + for (u32 row = 0; row < res_row; row++) { + for (u32 col = 0; col < res_col; col++) { + for (u32 wi = 0; wi < left_w; wi++) { + for (u32 ci = 0; ci < left_c; ci++) { + if ((wi + (ci*left_w)) >= left_col) + continue; + u32 li = row * left_col + left_w * ci + wi; + u32 ri = (ci* left_w + wi )* right_col + col; + + float l = convert_bf16_fp32(left[li]); + float r = convert_bf16_fp32(right[ri]); + tmp_res[row * res_col + col] += l * r; + } + } + } + } + + if (p->bias && bias) { + for (u32 row = 0; row < res_row; row++) { + for (u32 col = 0; col < res_col; col++) { + float b = convert_hex_fp32(bias[col]); + tmp_res[row * res_col + col] += b; + } + } + } + + if (p->relu_enable) + bf16_relu(tmp_res, size); + + for (u64 i = 0; i < size; i++) { + res[i] = convert_fp32_bf16(tmp_res[i]); + } + free(tmp_res); +} + +static void put_bias( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + const ml_t *ml, + u32 data[]) +{ + u64 size = ml->shape.col; + + u16 *tmp = (u16 *)malloc(sizeof(u16) * size * 2); + if (!tmp) + return; + + for (u64 i = 0; i < size; i++) { + tmp[i] = (data[i] >> 16) & 0xFFFF; + tmp[i + size] = (data[i] & 0xFFFF); + } + + put_bf16_matrix_g2l(ctx, bk_ctx, ml, (u8*)tmp, FMT_BF16); + + free(tmp); +} + +static void put_res( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + const ml_t *ml, + u32 data[]) +{ + u64 size = ml->shape.n * ml->shape.col; + + u16 *tmp = (u16 *)malloc(sizeof(u16) * size); + if (!tmp) + return; + + for (u64 i = 0; i < size; i++) { + tmp[i] = (data[i] & 0xFFFF); + } + + put_bf16_matrix_g2l(ctx, bk_ctx, ml, (u8*)tmp, FMT_BF16); + + free(tmp); +} + +static u32 * get_res( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + param_t *p) +{ + u64 size = res_size(p); + u32 *res = (u32 *)malloc(sizeof(u32) * size); + + u16 *tmp = (u16 *)get_bf16_matrix_l2g(ctx, bk_ctx, p->res, FMT_BF16); + for (u64 i = 0; i < size; i++) + res[i] = tmp[i]; + + free(tmp); + return res; +} + +static void test_param(bmctx_t *ctx, bmk_ctx_t *bk_ctx, param_t *p) +{ + u16 *left = alloc_left(p); + u16 *right = alloc_right(p); + u32 *bias = alloc_bias(p); + u32 *ref = alloc_res(p); + 
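+  // Test flow: stage left/right (plus the optional bias, and the previous
+  // result when add_result is set) into local memory, run the TIU matrix
+  // multiply, read the product back, and require bit-exact equality with
+  // the float reference computed by matrix_mac_ref() above.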
put_bf16_matrix_g2l(ctx, bk_ctx, p->left, (u8*)left, FMT_BF16); + put_bf16_matrix_g2l(ctx, bk_ctx, p->right, (u8*)right, FMT_BF16); + if (bias) + put_bias(ctx, bk_ctx, p->bias, bias); + if (p->add_result) + put_res(ctx, bk_ctx, p->res, ref); + + bmk1822_tiu_matrix_multiplication(bk_ctx, p); + u32 *res = get_res(ctx, bk_ctx, p); + matrix_mac_ref(p, left, right, bias, ref); + u64 size = res_size(p); + for (u64 i = 0; i < size; i++) { + if (res[i] != ref[i]) { + fprintf(stderr, "comparing failed at out[%" PRIu64 "], got %x, exp %x\n", + i, res[i], ref[i]); + fprintf(stderr, "random_seed=%d\n", random_seed); + exit(-1); + } + } + free(left); + free(right); + free(bias); + free(ref); + free(res); +} + +static void destroy_param(bmk_ctx_t *bk_ctx, param_t *p) +{ + if (p->bias) + bmk1822_lmem_free_matrix(bk_ctx, p->bias); + if (p->res) + bmk1822_lmem_free_matrix(bk_ctx, p->res); + if (p->right) + bmk1822_lmem_free_matrix(bk_ctx, p->right); + if (p->left) + bmk1822_lmem_free_matrix(bk_ctx, p->left); +} + +static ml_t *alloc_param_res( + bmk_ctx_t *bk_ctx, param_t *p) +{ + ml_shape_t s; + + s.n = p->left->shape.n; + s.c = p->right->shape.c; + s.w = p->right->shape.w; + s.col = p->right->shape.col; + fmt_t fmt = FMT_BF16; + return bmk1822_lmem_alloc_matrix(bk_ctx, s, fmt, 1); +} + +static param_t param_0(bmk_ctx_t *bk_ctx) +{ + +retry: + random_seed = clock(); + srand(random_seed); + + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = rand()%2; + p.add_result = 0; /*bf16 HW does not support add_result*/ + p.ps32_mode = 0; + + u32 left_row = rand() % 100 +1; + u32 left_col = rand() % 100 + 1; + u32 left_w = rand() % (left_col/5+1) + 1; // c is generate by w, and make c is larger + u32 left_c = left_col / left_w + (left_col % left_w ? 1: 0); + + u32 right_row = left_col; + u32 right_col = rand() % 100 + 1; + u32 right_w = (rand() % (right_col/5+1) + 1); // make c is larger + u32 right_c = right_col / right_w + (right_col % right_w ? 
1: 0) ; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + u32 bias = rand()%2; + p.bias = NULL; + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_BF16, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_BF16, 1); + if (!p.left || !p.right) { + printf("retry init_matrix_param\n"); + destroy_param(bk_ctx, &p); + goto retry; + } + + p.res = alloc_param_res(bk_ctx, &p); + if (bias) { + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + p.bias = bmk1822_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_BF16, 1); + } + + if (!p.res || (bias && !p.bias)) { + printf("retry init_matrix_param\n"); + destroy_param(bk_ctx, &p); + goto retry; + } + + return p; +} + + +#define test_one_param(n) \ + do { \ + param_t p = param_##n(bk_ctx); \ + test_param(&ctx, bk_ctx, &p); \ + destroy_param(bk_ctx, &p); \ + } while (0) + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + int round_mode; + round_mode = set_store_feround(); + + for (int i = 0 ; i < 30 ; i++) + test_one_param(0); + + restore_feround(round_mode); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_bf16_matrix_mac_ps32.cpp b/cviruntime/test/1822/bf16/test_1822_bf16_matrix_mac_ps32.cpp new file mode 100644 index 000000000..35558e9d3 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_bf16_matrix_mac_ps32.cpp @@ -0,0 +1,519 @@ +#include "../1822_test_util.h" + +typedef bmk1822_tiu_matrix_multiplication_param_t param_t; + +typedef struct{ + u32 left_sign; + u32 left_row ; + u32 left_col ; + u32 left_c ; + u32 left_w ; + u32 right_sign; + u32 right_row ; + u32 right_col ; + u32 right_c ; + u32 right_w ; + u32 lshift_bits ; + u32 rshift_bits ; + u32 relu_enable ; + u32 using_bias; + u32 bias_sign; +} matrix_init_para_t; + +u32 random_seed; +matrix_init_para_t matrix_para_t; + +static void make_bmk_matrix_param_ps32(bmk_ctx_t *bk_ctx, param_t *p, int ps32_mode); +static param_t param_init(); + +void print_param(param_t *p) +{ + printf("random_seed =%d\n", random_seed); + printf("ps32_mode =%d\n",p->ps32_mode); + printf("left_shape.n =%d\n",p->left->shape.n); + printf("left_shape.col =%d\n",p->left->shape.col); + printf("left_shape.c =%d\n",p->left->shape.c); + printf("left_shape.w =%d\n",p->left->shape.w); + printf("left_fmt =%d\n",p->left->fmt); + printf("right_shape.n =%d\n",p->right->shape.n); + printf("right_shape.col =%d\n",p->right->shape.col); + printf("right_shape.c =%d\n",p->right->shape.c); + printf("right_shape.w =%d\n",p->right->shape.w); + printf("right_fmt =%d\n",p->right->fmt); + if(p->bias) + { + printf("bias_shape.n =%d\n",p->bias->shape.n); + printf("bias_shape.col =%d\n",p->bias->shape.col); + printf("bias_shape.c =%d\n",p->bias->shape.c); + printf("bias_shape.w =%d\n",p->bias->shape.w); + printf("bias_fmt =%d\n",p->bias->fmt); + } + printf("result_shape.n =%d\n",p->res->shape.n); + printf("result_shape.col =%d\n",p->res->shape.col); + printf("result_shape.c =%d\n",p->res->shape.c); + printf("result_shape.w =%d\n",p->res->shape.w); + printf("result_fmt =%d\n",p->res->fmt); + printf("relu_enable=%d\n",p->relu_enable); + printf("rshift_bits=%d\n",p->rshift_bits); +} + + +static u64 matrix_size(const ml_t *ml) +{ + u64 row = ml->shape.n; + u64 col = ml->shape.col; + return row * col; +} + +static u64 
res_ps32_size(param_t *p) +{ + return matrix_size(p->res); +} + +static u64 res_size(param_t *p) +{ + return matrix_size(p->res); +} + +static u16 * alloc_left(param_t *p) +{ + u64 size = matrix_size(p->left); + u16 *buf = (u16 *)malloc(sizeof(u16) * size); + for (u64 i = 0; i < size; i++) + buf[i] = convert_fp32_bf16(i); + + return buf; +} + +static u16 * alloc_right(param_t *p) +{ + u64 size = matrix_size(p->right); + + u16 *buf = (u16 *)malloc(sizeof(u16) * size); + for (u64 i = 0; i < size; i++) + buf[i] = convert_fp32_bf16(i); + + return buf; +} +static u32 * alloc_bias(param_t *p) +{ + if (!p->bias) + return NULL; + + u64 size = matrix_size(p->bias) / 2; + + u32 *buf = (u32 *)malloc(sizeof(u32) * size); + for (u64 i = 0; i < size; i++) + buf[i] = convert_fp32_hex(i); + + return buf; +} + +static u16 * alloc_ps32_res(param_t *p) +{ + u64 size = res_ps32_size(p)*2; + u16 *buf = (u16 *)malloc(sizeof(u16) * size); + for (u64 i = 0; i < size; i++) + buf[i] = convert_fp32_bf16(i); + + return buf; +} + +static inline void bf16_relu(float *buf, u64 size) +{ + for (u64 i = 0; i < size; i++) + if (buf[i] < 0) + buf[i] = 0; +} + +static int ps32_m2_matrix_mac_ref( + param_t *p, + u16 *left, + u16 *right, + u16 *res) +{ + u64 size = res_ps32_size(p); + u32 left_col = p->left->shape.col; + u32 right_col = p->right->shape.col; + u32 res_row = p->left->shape.n; + u32 res_col = p->res->shape.col; + u32 left_c = p->left->shape.c; + u32 left_w = p->left->shape.w; + + int ret = BM_SUCCESS; + int bstride = res_row * res_col; + + float *tmp_res = (float *)malloc(sizeof(float) * size); + for (u32 i = 0; i < res_row * res_col; i++) + tmp_res[i] = 0; + for (u32 row = 0; row < res_row; row++) { + for (u32 col = 0; col < res_col; col++) { + for (u32 wi = 0; wi < left_w; wi++) { + for (u32 ci = 0; ci < left_c; ci++) { + if ((wi + (ci*left_w)) >= left_col) + continue; + u32 li = row * left_col + left_w * ci + wi; + u32 ri = (ci* left_w + wi )* right_col + col; + + float l = convert_bf16_fp32(left[li]); + float r = convert_bf16_fp32(right[ri]); + tmp_res[row * res_col + col] += l * r; + } + } + } + } + + for (u64 i = 0; i < size; i++) + res[i + bstride*0] = (convert_fp32_hex(tmp_res[i]) >> 16) & 0xFFFF; + for (u64 i = 0; i < size; i++) + res[i + bstride*1] = (convert_fp32_hex(tmp_res[i]) >> 0) & 0xFFFF; + + free(tmp_res); + + return ret; +} + +static int ps32_m3_matrix_mac_ref( + param_t *p, + u16 *left, + u16 *right, + u16 *res) +{ + u64 size = res_ps32_size(p); + u32 left_col = p->left->shape.col; + u32 right_col = p->right->shape.col; + u32 res_row = p->left->shape.n; + u32 res_col = p->res->shape.col; + u32 left_c = p->left->shape.c; + u32 left_w = p->left->shape.w; + + int ret = BM_SUCCESS; + int bstride = res_row * res_col; + + float *tmp_res = (float *)malloc(sizeof(float) * size); + + for (u64 i = 0; i < size; i++) + tmp_res[i] = convert_hex_fp32((res[i + bstride*0] << 16) | res[i + bstride*1]); + + for (u32 row = 0; row < res_row; row++) { + for (u32 col = 0; col < res_col; col++) { + for (u32 wi = 0; wi < left_w; wi++) { + for (u32 ci = 0; ci < left_c; ci++) { + if ((wi + (ci*left_w)) >= left_col) + continue; + u32 li = row * left_col + left_w * ci + wi; + u32 ri = (ci* left_w + wi )* right_col + col; + + float l = convert_bf16_fp32(left[li]); + float r = convert_bf16_fp32(right[ri]); + tmp_res[row * res_col + col] += l * r; + } + } + } + } + + for (u64 i = 0; i < size; i++) + res[i + bstride*0] = (convert_fp32_hex(tmp_res[i]) >> 16) & 0xFFFF; + for (u64 i = 0; i < size; i++) + res[i + bstride*1] = 
(convert_fp32_hex(tmp_res[i]) >> 0) & 0xFFFF; + + free(tmp_res); + + return ret; +} + +static int ps32_m1_matrix_mac_ref( + param_t *p, + u16 *left, + u16 *right, + u32 * bias, + u16 *res) +{ + u64 size = res_ps32_size(p); + u32 left_col = p->left->shape.col; + u32 right_col = p->right->shape.col; + u32 res_row = p->left->shape.n; + u32 res_col = p->res->shape.col; + u32 left_c = p->left->shape.c; + u32 left_w = p->left->shape.w; + + int ret = BM_SUCCESS; + int bstride = res_row * res_col; + + float *tmp_res = (float *)malloc(sizeof(float) * size); + + for (u64 i = 0; i < size; i++) { + tmp_res[i] = convert_hex_fp32((res[i + bstride*0] << 16) | res[i + bstride*1]); + } + + for (u32 row = 0; row < res_row; row++) { + for (u32 col = 0; col < res_col; col++) { + for (u32 wi = 0; wi < left_w; wi++) { + for (u32 ci = 0; ci < left_c; ci++) { + if ((wi + (ci*left_w)) >= left_col) + continue; + u32 li = row * left_col + left_w * ci + wi; + u32 ri = (ci* left_w + wi )* right_col + col; + + float l = convert_bf16_fp32(left[li]); + float r = convert_bf16_fp32(right[ri]); + tmp_res[row * res_col + col] += l * r; + } + } + } + } + + if (p->bias && bias) { + for (u32 row = 0; row < res_row; row++) { + for (u32 col = 0; col < res_col; col++) { + float b = convert_hex_fp32(bias[col]); + tmp_res[row * res_col + col] += b; + } + } + } + + if (p->relu_enable) + bf16_relu(tmp_res, size); + + for (u64 i = 0; i < size; i++) + res[i] = convert_fp32_bf16(tmp_res[i]); + + free(tmp_res); + + return ret; +} + +static void put_bias( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + const ml_t *ml, + u32 data[]) +{ + u64 size = ml->shape.col; + + u16 *tmp = (u16 *)malloc(sizeof(u16) * size * 2); + if(!tmp) + return; + + for (u64 i = 0; i < size; i++) { + tmp[i] = data[i] >> 16; + tmp[i + size] = data[i] & 0xFFFF; + } + put_bf16_matrix_g2l(ctx, bk_ctx, ml, (u8*) tmp, FMT_BF16); + + free(tmp); +} + + +static int test_matrix_ps32_ut(bmctx_t *ctx, bmk_ctx_t *bk_ctx, param_t *p) +{ + make_bmk_matrix_param_ps32(bk_ctx, p, 2); + u16 *left = alloc_left(p); + u16 *right = alloc_right(p); + u16 *ref = alloc_ps32_res(p); + + { + bmerr_t ret = ps32_m2_matrix_mac_ref(p, left, right, ref); + assert(ret == BM_SUCCESS); + + put_bf16_matrix_g2l(ctx, bk_ctx, p->left, (u8*) left, FMT_BF16); + put_bf16_matrix_g2l(ctx, bk_ctx, p->right, (u8*) right, FMT_BF16); + bmk1822_tiu_matrix_multiplication(bk_ctx, p); + bmk1822_matrix_lmem_t ps32_res; + ps32_res = *p->res; + ps32_res.shape.n *= sizeof(short); + u16 *res = (u16*) get_bf16_matrix_l2g(ctx, bk_ctx, &ps32_res, FMT_BF16); + + int has_error = array_cmp_int8( + "Comparing begin_mode results ...\n", + (s8 *)ref, (s8 *)res ,(int)res_ps32_size(p)*sizeof(int)); + if (has_error) { + printf("Comparison M2 FAILED\n"); + print_param(p); + exit(-1); + }else + printf("Comparison M2 PASS\n"); + free(res); + } + + { + make_bmk_matrix_param_ps32(bk_ctx, p, 3); + + bmerr_t ret = ps32_m3_matrix_mac_ref(p, left, right, ref); + assert(ret == BM_SUCCESS); + + bmk1822_tiu_matrix_multiplication(bk_ctx, p); + bmk1822_matrix_lmem_t ps32_res; + ps32_res = *p->res; + ps32_res.shape.n *= sizeof(short); + u16 *res = (u16 *) get_bf16_matrix_l2g(ctx, bk_ctx, &ps32_res, FMT_BF16); + + int has_error = array_cmp_int8( + "Comparing m3 results ...\n", + (s8 *)ref, (s8 *)res ,(int)res_ps32_size(p)*sizeof(int)); + if (has_error) { + printf("Comparison M3 FAILED\n"); + print_param(p); + exit(-1); + }else + printf("Comparison M3 PASS\n"); + + free(res); + } + { + make_bmk_matrix_param_ps32(bk_ctx, p, 1); + u32 *bias = alloc_bias(p); + + 
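+    // ps32_mode 1 is the closing pass of the partial-sum chain: it folds the
+    // 32-bit partial sums written by modes 2/3 together with this pass's
+    // products, then applies bias/ReLU and rounds down to bf16, mirroring
+    // ps32_m1_matrix_mac_ref() above.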
bmerr_t ret = ps32_m1_matrix_mac_ref(p, left, right, bias, ref); + assert(ret == BM_SUCCESS); + + if(p->bias) + put_bias(ctx, bk_ctx, p->bias, bias); + + bmk1822_tiu_matrix_multiplication(bk_ctx, p); + bmk1822_matrix_lmem_t ps32_res; + ps32_res = *p->res; + ps32_res.shape.n *= 2; + + u16 *res = (u16 *) get_bf16_matrix_l2g(ctx, bk_ctx, &ps32_res, FMT_BF16); + + int has_error = array_cmp_int8( + "Comparing m1 results ...\n", + (s8 *)ref, (s8 *)res ,(int)res_size(p)*2); + if (has_error) { + printf("Comparison M1 FAILED\n"); + print_param(p); + exit(-1); + }else + printf("Comparison M1 PASS\n"); + + free(res); + free(bias); + } + free(left); + free(right); + free(ref); + return 1; +} + +static void destroy_param(bmk_ctx_t *bk_ctx, param_t *p) +{ + if (p->bias) + bmk1822_lmem_free_matrix(bk_ctx, p->bias); + bmk1822_lmem_free_matrix(bk_ctx, p->res); + bmk1822_lmem_free_matrix(bk_ctx, p->right); + bmk1822_lmem_free_matrix(bk_ctx, p->left); +} + +static ml_t *alloc_param_res( + bmk_ctx_t *bk_ctx, param_t *p) +{ + ml_shape_t s; + s.n = p->left->shape.n; + s.c = p->right->shape.c; + s.w = p->right->shape.w; + s.col = p->right->shape.col; + + fmt_t fmt = FMT_BF16; + return bmk1822_lmem_alloc_ps32_matrix(bk_ctx, s, fmt, 1); +} + + +static void make_bmk_matrix_param_ps32(bmk_ctx_t *bk_ctx, param_t *p, int ps32_mode) +{ + + ml_shape_t left_shape; + ml_shape_t right_shape; + + p->ps32_mode = ps32_mode; + p->relu_enable = 0; + p->lshift_bits = 0; + p->rshift_bits = 0; + if(ps32_mode==2) + { + left_shape.n = matrix_para_t.left_row; + left_shape.c = matrix_para_t.left_c; + left_shape.w = matrix_para_t.left_w; + left_shape.col = matrix_para_t.left_col; + + right_shape.n = matrix_para_t.right_row; + right_shape.c = matrix_para_t.right_c; + right_shape.w = matrix_para_t.right_w; + right_shape.col = matrix_para_t.right_col; + p->left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_BF16, 1); + p->right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_BF16, 1); + p->bias = NULL; + p->res = alloc_param_res(bk_ctx, p); + }else if(ps32_mode==3) + { + + }else if(ps32_mode==1) + { + p->relu_enable = matrix_para_t.relu_enable; + p->rshift_bits = matrix_para_t.rshift_bits; + if(matrix_para_t.using_bias) + { + right_shape.n = matrix_para_t.right_row; + right_shape.c = matrix_para_t.right_c; + right_shape.w = matrix_para_t.right_w; + right_shape.col = matrix_para_t.right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + p->bias = bmk1822_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_BF16, 1); + assert(p->bias); + } + } + //print_param(p); +} + +static param_t param_init(void) +{ + param_t p; + + random_seed = clock(); + srand(random_seed); + + memset(&p, 0, sizeof(param_t)); + memset(&matrix_para_t, 0, sizeof(matrix_init_para_t)); + + matrix_para_t.using_bias = rand()%2; + matrix_para_t.relu_enable = rand()%2; + + matrix_para_t.left_row = rand()%60+1; + matrix_para_t.left_col = rand()%40+1; + matrix_para_t.left_w = matrix_para_t.left_col/0x10 ? rand()%8+8 : matrix_para_t.left_col; + matrix_para_t.left_c = + matrix_para_t.left_col%matrix_para_t.left_w? + matrix_para_t.left_col/matrix_para_t.left_w+1 : matrix_para_t.left_col/matrix_para_t.left_w; + + matrix_para_t.right_row = matrix_para_t.left_col; + matrix_para_t.right_col = rand()%50+1; + matrix_para_t.right_w = rand()%16+1; + matrix_para_t.right_c = + matrix_para_t.right_col%matrix_para_t.right_w? 
+ matrix_para_t.right_col/matrix_para_t.right_w+1 : matrix_para_t.right_col/matrix_para_t.right_w; + return p; +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + int round_mode; + round_mode = set_store_feround(); + + int test_finished_num = 0; + for (int i = 0; i < 30; i++) { + printf("random_test_conv iteration: %d\n", i); + param_t p = param_init(); + + test_finished_num += test_matrix_ps32_ut(&ctx, bk_ctx, &p); + destroy_param(bk_ctx, &p); + } + printf("test_finished_num: %d\n", test_finished_num); + restore_feround(round_mode); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_bf16_matrix_transfer.cpp b/cviruntime/test/1822/bf16/test_1822_bf16_matrix_transfer.cpp new file mode 100644 index 000000000..769d83162 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_bf16_matrix_transfer.cpp @@ -0,0 +1,101 @@ +#include "../1822_test_util.h" +#include "1822_bf16_util.h" + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_I8}, +}; + +static void test_put_and_get_matrix_l2g( + bmctx_t *ctx, bmk_ctx_t *bk_ctx, fmt_t fmt, int eu_align) +{ + int row = 5; + int col = 16 * 5 + 2; + int size = row * col; + float val = -100; + + ml_shape_t s = bmk1822_matrix_lmem_default_shape(bk_ctx, row, col, fmt); + + u16 *u16data_x = (u16 *)malloc(sizeof(u16) * size); + u16 *u16data_y = (u16 *)malloc(sizeof(u16) * size); + s8 *s8data_x = (s8 *)malloc(sizeof(s8) * size); + s8 *s8data_y = (s8 *)malloc(sizeof(s8) * size); + void *data_x = NULL; + void *data_y = NULL; + u8 *result_x =NULL; + u8 *result_y = NULL; + + // prepare source data + for (int i = 0; i < size; i++) { + if(fmt == FMT_BF16) { + u16data_x[i] = generate_bf16_corner_val(val); + u16data_y[i] = generate_bf16_corner_val(val); + val += 0.1; + } else { + s8data_x[i] = i - 100; + s8data_y[i] = -i; + } + } + data_x = (fmt == FMT_BF16) ? (void *)u16data_x : (void *)s8data_x; + data_y = (fmt == FMT_BF16) ? (void *)u16data_y : (void *)s8data_y; + + // run tpu operations + ml_t *ml_x = bmk1822_lmem_alloc_matrix(bk_ctx, s, fmt, eu_align); + ml_t *ml_y = bmk1822_lmem_alloc_matrix(bk_ctx, s, fmt, eu_align); + /* + * Interleave two matrice in case the same devmem is reused between + * put_matrix_g2l() and get_matrix_l2g(), in which case the content of + * devmem is already what is expected before bmk1822_gdma_store_matrix(). + */ + put_bf16_matrix_g2l(ctx, bk_ctx, ml_x, (u8 *)data_x, fmt); + put_bf16_matrix_g2l(ctx, bk_ctx, ml_y, (u8 *)data_y, fmt); + + + // compare data + //// Get result_x before result_y. + result_x = get_bf16_matrix_l2g(ctx, bk_ctx, ml_x, fmt); + result_y = get_bf16_matrix_l2g(ctx, bk_ctx, ml_y, fmt); + if( COMPARE_PASS != compare_result( data_x, result_x, fmt, size)) + exit(-1); + if( COMPARE_PASS != compare_result( data_y, result_y, fmt, size)) + exit(-1); + free(result_x); + free(result_y); + + //// Get result_y before result_x. 
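+  // Reading back in the opposite order re-checks the same aliasing hazard in
+  // the store direction: if both matrices shared one device buffer, at least
+  // one of the two orders would return the other matrix's data.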
+ result_y = get_bf16_matrix_l2g(ctx, bk_ctx, ml_y, fmt); + result_x = get_bf16_matrix_l2g(ctx, bk_ctx, ml_x, fmt); + if( COMPARE_PASS != compare_result( data_x, result_x, fmt, size)) + exit(-1); + if( COMPARE_PASS != compare_result( data_y, result_y, fmt, size)) + exit(-1); + free(result_x); + free(result_y); + + // free variables + bmk1822_lmem_free_matrix(bk_ctx, ml_y); + bmk1822_lmem_free_matrix(bk_ctx, ml_x); + free(u16data_x); + free(u16data_y); + free(s8data_x); + free(s8data_y); +} + +#define TEST_ALIGNED 2 // 1: test unalign only, 2: test both align/unalign +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + + test_init(&ctx, &bk_ctx); + + for (u32 i = 0; i < nr_fmt; i++) { + for (u8 u8_align = 0; u8_align < TEST_ALIGNED; u8_align++) { + test_put_and_get_matrix_l2g(&ctx, bk_ctx, input_fmt[i].src_fmt, u8_align); + } + } + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_bf16_max_pooling.cpp b/cviruntime/test/1822/bf16/test_1822_bf16_max_pooling.cpp new file mode 100644 index 000000000..06a01e1fe --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_bf16_max_pooling.cpp @@ -0,0 +1,315 @@ +#include "../1822_test_util.h" +#include + +#define INVALIDE_STRIDE (-1) +typedef bmk1822_tiu_max_pooling_param_t param_t; +int random_seed; +static void print_pooling_param(param_t *p) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int on = p->ofmap->shape.n; + int oc = p->ofmap->shape.c; + int oh = p->ofmap->shape.h; + int ow = p->ofmap->shape.w; + + printf(" Pooling parameters:\n"); + printf(" random_seed : %d \n", random_seed); + printf(" ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw); + printf(" opd0_sign = %d\n", p->ifmap->fmt == FMT_I8); + printf(" weight = (%d, %d)\n", p->kh, p->kw); + printf(" padding = (%d, %d, %d, %d)\n", + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right); + printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w); + printf(" ofmap = (%d, %d, %d, %d)\n", on, oc, oh, ow); +} + +static int pooling_ih_ext(param_t *p, int ih) +{ + int pad = p->pad_top + p->pad_bottom; + return ih + pad; +} + +static int pooling_iw_ext(param_t *p, int iw) +{ + int pad = p->pad_left + p->pad_right; + return iw + pad; +} + +static int pooling_oh(param_t *p, int ih) +{ + int ih_ext = pooling_ih_ext(p, ih); + return (ih_ext - p->kh) / p->stride_h + 1; +} + +static int pooling_ow(param_t *p, int iw) +{ + int iw_ext = pooling_iw_ext(p, iw); + return (iw_ext - p->kw) / p->stride_w + 1; +} + +static u16 *alloc_input(param_t *p) +{ + u64 size = tl_shape_size(&p->ifmap->shape); + u16 *data = (u16 *)xmalloc(size*2); + if (!data) + return NULL; + + for (u64 i = 0; i < size; i++) { + float val; + int RAND_MAX2 = RAND_MAX/2; //100 ~ -100 + val = (float)(rand()-RAND_MAX2)*100 / (float)RAND_MAX; + data[i] = convert_fp32_bf16(val); + } + return data; +} + +static u16 *alloc_output(param_t *p) +{ + u64 size = tl_shape_size(&p->ofmap->shape); + return (u16 *)xmalloc(size * 2); +} + +static void free_pooling_param( + bmk_ctx_t *ctx, + param_t *r) +{ + if (r->ifmap) + free_tl(ctx, r->ifmap); + if (r->ofmap) + free_tl(ctx, r->ofmap); +} + +static param_t random_pooling_param(bmk_ctx_t *ctx, int stride_w, int stride_h) +{ + + param_t p; + + memset(&p, 0, sizeof(p)); + +retry: + random_seed = clock(); +// random_seed = 3058538; + srand(random_seed); + +#if 0 + int in = 1; + int ic = 1; + int ih = 6; + int iw = 6; + //int opd0_sign = rand() % 2; 
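+  // Pinned 6x6 input / 3x3 kernel / padding-3 case kept for hand debugging;
+  // the #else branch below draws every parameter at random.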
+ + p.kh = 3; + p.kw = 3; + p.stride_h = p.kh; + p.stride_w = p.kw; + p.pad_top = 3;//rand() % p.kh; + p.pad_bottom = 3;//rand() % p.kh; + p.pad_left = 3;//rand() % p.kw; + p.pad_right = 3;//rand() % p.kw; + + tl_shape_t ifmap_shape; + ifmap_shape.n = in; + ifmap_shape.c = ic; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + tl_shape_t ofmap_shape; + ofmap_shape.n = in; + ofmap_shape.c = ic; + ofmap_shape.h = pooling_oh(&p, ih); + ofmap_shape.w = pooling_ow(&p, iw); + +#else + int in = rand() % 5 + 1; + int ic = rand() % (3 * BM1822_HW_NPU_NUM) + 1; + int ih = rand() % 30 + 3 + (INVALIDE_STRIDE == stride_h ? 0 : stride_h); + int iw = rand() % 30 + 6 + (INVALIDE_STRIDE == stride_w ? 0 : stride_w); + //int opd0_sign = rand() % 2; + + p.kh = rand() % 5 + 1; + p.kw = rand() % 5 + 1; + p.stride_h = INVALIDE_STRIDE == stride_h ? rand() % (p.kh) + 1 : stride_h; + p.stride_w = INVALIDE_STRIDE == stride_w ? rand() % (p.kh) + 1 : stride_w; + p.pad_top = rand() % p.kh; + p.pad_bottom = rand() % p.kh; + p.pad_left = rand() % p.kw; + p.pad_right = rand() % p.kw; + + tl_shape_t ifmap_shape; + ifmap_shape.n = in; + ifmap_shape.c = ic; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + tl_shape_t ofmap_shape; + ofmap_shape.n = in; + ofmap_shape.c = ic; + ofmap_shape.h = pooling_oh(&p, ih); + ofmap_shape.w = pooling_ow(&p, iw); +#endif +// fmt_t fmt = opd0_sign? FMT_I8: FMT_U8; + p.ofmap = bmk1822_lmem_alloc_tensor(ctx, ofmap_shape, FMT_BF16, 1); + p.ifmap = bmk1822_lmem_alloc_tensor(ctx, ifmap_shape, FMT_BF16, 1); + + int RAND_MAX2 = RAND_MAX/2; //20 ~ -20 + float ins_val = (float)(rand()-RAND_MAX2)*20 / (float)RAND_MAX; + p.ins_fp = convert_fp32_bf16(ins_val); + + if ((p.kh > pooling_ih_ext(&p, ih)) + || (p.kw > pooling_iw_ext(&p, iw)) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || (p.kh * p.kw == 1) + || !p.ofmap || !p.ifmap) { + printf("retry init_pooling_param\n"); + free_pooling_param(ctx, &p); + goto retry; + } + + return p; +} +static int index_get(int h, int w1, int w2) +{ + return h * w1 + w2; +} + +int native_pooling_max_bf16( + const u16* i_fmap, + u16* o_fmap, + int input_n, int input_c, int input_h, int input_w, + int kh, int kw, + int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, + int stride_h, int stride_w, + int ins_h, int ins_w, + int ins_h_last, int ins_w_last, + u16 ins_fp + ) +{ + if (ins_h != 0 || ins_w != 0 || ins_h_last != 0 || ins_w_last !=0) + return BM_ERR_INVALID_ARGUMENT; + + int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); + int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); + + int output_h = calc_output_hw(h_after, kh, stride_h); + int output_w = calc_output_hw(w_after, kw, stride_w); + + const float max_init = -FLT_MAX;//convert_bf16_fp32(ins_fp); + u16 *i_fmap_pad = NULL; + for (int nc = 0; nc < input_n * input_c; nc++) { + fill_pad_fmap_bf16(i_fmap, &i_fmap_pad, ins_fp, + pad_w_l, pad_w_r, pad_h_t, pad_h_b, + 0, 0, 0, 0, input_h, input_w); + + for (int ph = 0; ph < output_h; ++ph) { + for (int pw = 0; pw < output_w; ++pw) { + int hstart = ph * stride_h; + int wstart = pw * stride_w; + int pool_index = index_get(ph, output_w, pw); + float max = max_init; + for (int h = 0; h < kh; h++) { + for (int w = 0; w < kw; w++) { + int index = index_get((hstart + h), (input_w + pad_w_l + pad_w_r), + (w + wstart)); + float val = convert_bf16_fp32(i_fmap_pad[index]); + max = (val > max)? 
val: max;
+        }
+      }
+      o_fmap[pool_index] = convert_fp32_bf16(max);
+      }
+    }
+    i_fmap += input_w * input_h;
+    o_fmap += output_w * output_h;
+  }
+  free(i_fmap_pad);
+
+  return BM_SUCCESS;
+}
+
+
+static void compare_results(
+    param_t *p,
+    u16 input[],
+    u16 output[])
+{
+  int in = p->ifmap->shape.n;
+  int ic = p->ifmap->shape.c;
+  int ih = p->ifmap->shape.h;
+  int iw = p->ifmap->shape.w;
+
+  u16 *output_ref = alloc_output(p);
+  bmerr_t ret = native_pooling_max_bf16(
+      input, output_ref, in, ic, ih, iw, p->kh, p->kw,
+      p->pad_top, p->pad_bottom, p->pad_left, p->pad_right,
+      p->stride_h, p->stride_w, 0, 0, 0, 0, p->ins_fp);
+  assert(ret == BM_SUCCESS);
+  int cmp_res = array_cmp_int8(
+      "Comparing results ...\n", (s8*) output_ref, (s8*)output,
+      tl_shape_size(&p->ofmap->shape)*2);
+  if (cmp_res != 0) {
+    printf("Comparison FAILED!!!\n");
+    print_pooling_param(p);
+    exit(-1);
+  }
+  free(output_ref);
+}
+
+static int _test_pooling(bmctx_t ctx, bmk_ctx_t *bk_ctx, int stride_w, int stride_h)
+{
+  param_t param = random_pooling_param(bk_ctx, stride_w, stride_h);
+  //print_pooling_param(&param);
+  u16 *input = alloc_input(&param);
+  put_bf16_tensor_g2l(&ctx, bk_ctx, param.ifmap, (u16 *)input, FMT_BF16);
+  bmk1822_tiu_max_pooling(bk_ctx, &param);
+
+  u16 *output = (u16 *)get_bf16_tensor_l2g(&ctx, bk_ctx, param.ofmap, FMT_BF16);
+
+  compare_results(&param, input, output);
+
+  free_pooling_param(bk_ctx, &param);
+  free(output);
+  free(input);
+
+  return 1;
+}
+
+
+static int test_pooling(bmctx_t ctx, bmk_ctx_t *bk_ctx) {
+  return _test_pooling(ctx, bk_ctx, INVALIDE_STRIDE, INVALIDE_STRIDE);
+}
+
+static void test_max_pooling(bmctx_t *ctx, bmk_ctx_t *bk_ctx)
+{
+  int test_finished_num = 0;
+  for (u64 i = 0; i < 20; i++)
+    test_finished_num += test_pooling(*ctx, bk_ctx);
+
+  // test stride extend (0, 31]
+  int stride_list[] = {15, 16, 31};
+  int stride_list_len = sizeof(stride_list) / sizeof(stride_list[0]);
+
+  for (int stride_w_idx = 0; stride_w_idx < stride_list_len; stride_w_idx++) {
+    for (int stride_h_idx = 0; stride_h_idx < stride_list_len; stride_h_idx++) {
+      int stride_w = stride_list[stride_w_idx];
+      int stride_h = stride_list[stride_h_idx];
+
+      test_finished_num += _test_pooling(*ctx, bk_ctx, stride_w, stride_h);
+    }
+  }
+  printf("Test finished %d\n", test_finished_num);
+}
+
+int main()
+{
+  bmctx_t ctx;
+  bmk_ctx_t *bk_ctx;
+  test_init(&ctx, &bk_ctx);
+
+  test_max_pooling(&ctx, bk_ctx);
+
+  test_exit(&ctx);
+  return 0;
+}
diff --git a/cviruntime/test/1822/bf16/test_1822_bf16_min_pooling.cpp b/cviruntime/test/1822/bf16/test_1822_bf16_min_pooling.cpp
new file mode 100644
index 000000000..9f502261f
--- /dev/null
+++ b/cviruntime/test/1822/bf16/test_1822_bf16_min_pooling.cpp
@@ -0,0 +1,296 @@
+#include "../1822_test_util.h"
+#include <float.h> /* for FLT_MAX */
+
+typedef bmk1822_tiu_min_pooling_param_t param_t;
+int random_seed;
+static void print_pooling_param(param_t *p)
+{
+  int in = p->ifmap->shape.n;
+  int ic = p->ifmap->shape.c;
+  int ih = p->ifmap->shape.h;
+  int iw = p->ifmap->shape.w;
+  int on = p->ofmap->shape.n;
+  int oc = p->ofmap->shape.c;
+  int oh = p->ofmap->shape.h;
+  int ow = p->ofmap->shape.w;
+
+  printf(" Pooling parameters:\n");
+  printf(" random_seed : %d \n", random_seed);
+  printf(" ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw);
+  printf(" opd0_sign = %d\n", p->ifmap->fmt == FMT_I8);
+  printf(" weight = (%d, %d)\n", p->kh, p->kw);
+  printf(" padding = (%d, %d, %d, %d)\n",
+      p->pad_top, p->pad_bottom, p->pad_left, p->pad_right);
+  printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w);
+  printf(" ofmap = (%d, %d, %d,
%d)\n", on, oc, oh, ow); +} + +static int pooling_ih_ext(param_t *p, int ih) +{ + int pad = p->pad_top + p->pad_bottom; + return ih + pad; +} + +static int pooling_iw_ext(param_t *p, int iw) +{ + int pad = p->pad_left + p->pad_right; + return iw + pad; +} + +static int pooling_oh(param_t *p, int ih) +{ + int ih_ext = pooling_ih_ext(p, ih); + return (ih_ext - p->kh) / p->stride_h + 1; +} + +static int pooling_ow(param_t *p, int iw) +{ + int iw_ext = pooling_iw_ext(p, iw); + return (iw_ext - p->kw) / p->stride_w + 1; +} + +static u16 *alloc_input(param_t *p) +{ + u64 size = tl_shape_size(&p->ifmap->shape); + u16 *data = (u16 *)xmalloc(size*2); + if (!data) + return NULL; + + for (u64 i = 0; i < size; i++) { + float val; + int RAND_MAX2 = RAND_MAX/2; //100 ~ -100 + val = (float)(rand()-RAND_MAX2)*100 / (float)RAND_MAX; + data[i] = convert_fp32_bf16(val); + } + return data; +} + +static u16 *alloc_output(param_t *p) +{ + u64 size = tl_shape_size(&p->ofmap->shape); + return (u16 *)xmalloc(size * 2); +} + +static void free_pooling_param( + bmk_ctx_t *ctx, + param_t *r) +{ + if (r->ifmap) + free_tl(ctx, r->ifmap); + if (r->ofmap) + free_tl(ctx, r->ofmap); +} + +static param_t random_pooling_param(bmk_ctx_t *ctx) +{ + + param_t p; + + memset(&p, 0, sizeof(p)); + +retry: + random_seed = clock(); +// random_seed = 3058538; + srand(random_seed); + +#if 0 + int in = 1; + int ic = 1; + int ih = 6; + int iw = 6; + //int opd0_sign = rand() % 2; + + p.kh = 3; + p.kw = 3; + p.stride_h = p.kh; + p.stride_w = p.kw; + p.pad_top = 3;//rand() % p.kh; + p.pad_bottom = 3;//rand() % p.kh; + p.pad_left = 3;//rand() % p.kw; + p.pad_right = 3;//rand() % p.kw; + + tl_shape_t ifmap_shape; + ifmap_shape.n = in; + ifmap_shape.c = ic; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + tl_shape_t ofmap_shape; + ofmap_shape.n = in; + ofmap_shape.c = ic; + ofmap_shape.h = pooling_oh(&p, ih); + ofmap_shape.w = pooling_ow(&p, iw); + +#else + int in = rand() % 5 + 1; + int ic = rand() % (3 * BM1822_HW_NPU_NUM) + 1; + int ih = rand() % 30 + 3; + int iw = rand() % 30 + 6; + //int opd0_sign = rand() % 2; + + p.kh = rand() % 5 + 1; + p.kw = rand() % 5 + 1; + p.stride_h = rand() % (p.kh) + 1; + p.stride_w = rand() % (p.kw) + 1; + p.pad_top = rand() % p.kh; + p.pad_bottom = rand() % p.kh; + p.pad_left = rand() % p.kw; + p.pad_right = rand() % p.kw; + + tl_shape_t ifmap_shape; + ifmap_shape.n = in; + ifmap_shape.c = ic; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + tl_shape_t ofmap_shape; + ofmap_shape.n = in; + ofmap_shape.c = ic; + ofmap_shape.h = pooling_oh(&p, ih); + ofmap_shape.w = pooling_ow(&p, iw); +#endif +// fmt_t fmt = opd0_sign? 
FMT_I8: FMT_U8; + p.ofmap = bmk1822_lmem_alloc_tensor(ctx, ofmap_shape, FMT_BF16, 1); + p.ifmap = bmk1822_lmem_alloc_tensor(ctx, ifmap_shape, FMT_BF16, 1); + + int RAND_MAX2 = RAND_MAX/2; //20 ~ -20 + float ins_val = (float)(rand()-RAND_MAX2)*20 / (float)RAND_MAX; + p.ins_fp = convert_fp32_bf16(ins_val); + + if ((p.kh > pooling_ih_ext(&p, ih)) + || (p.kw > pooling_iw_ext(&p, iw)) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || (p.kh * p.kw == 1) + || !p.ofmap || !p.ifmap) { + printf("retry init_pooling_param\n"); + free_pooling_param(ctx, &p); + goto retry; + } + + return p; +} +static int index_get(int h, int w1, int w2) +{ + return h * w1 + w2; +} + +int native_pooling_min_bf16( + const u16* i_fmap, + u16* o_fmap, + int input_n, int input_c, int input_h, int input_w, + int kh, int kw, + int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, + int stride_h, int stride_w, + int ins_h, int ins_w, + int ins_h_last, int ins_w_last, + u16 ins_fp + ) +{ + if (ins_h != 0 || ins_w != 0 || ins_h_last != 0 || ins_w_last !=0) + return BM_ERR_INVALID_ARGUMENT; + + int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); + int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); + + int output_h = calc_output_hw(h_after, kh, stride_h); + int output_w = calc_output_hw(w_after, kw, stride_w); + + const float min_init = FLT_MAX;//convert_bf16_fp32(ins_fp); + u16 *i_fmap_pad = NULL; + for (int nc = 0; nc < input_n * input_c; nc++) { + fill_pad_fmap_bf16(i_fmap, &i_fmap_pad, ins_fp, + pad_w_l, pad_w_r, pad_h_t, pad_h_b, + 0, 0, 0, 0, input_h, input_w); + + for (int ph = 0; ph < output_h; ++ph) { + for (int pw = 0; pw < output_w; ++pw) { + int hstart = ph * stride_h; + int wstart = pw * stride_w; + int pool_index = index_get(ph, output_w, pw); + float min = min_init; + for (int h = 0; h < kh; h++) { + for (int w = 0; w < kw; w++) { + int index = index_get((hstart + h), (input_w + pad_w_l + pad_w_r), + (w + wstart)); + float val = convert_bf16_fp32(i_fmap_pad[index]); + min = (val < min)? 
val: min;
+        }
+      }
+      o_fmap[pool_index] = convert_fp32_bf16(min);
+      }
+    }
+    i_fmap += input_w * input_h;
+    o_fmap += output_w * output_h;
+  }
+  free(i_fmap_pad);
+
+  return BM_SUCCESS;
+}
+
+
+static void compare_results(
+    param_t *p,
+    u16 input[],
+    u16 output[])
+{
+  int in = p->ifmap->shape.n;
+  int ic = p->ifmap->shape.c;
+  int ih = p->ifmap->shape.h;
+  int iw = p->ifmap->shape.w;
+
+  u16 *output_ref = alloc_output(p);
+  bmerr_t ret = native_pooling_min_bf16(
+      input, output_ref, in, ic, ih, iw, p->kh, p->kw,
+      p->pad_top, p->pad_bottom, p->pad_left, p->pad_right,
+      p->stride_h, p->stride_w, 0, 0, 0, 0, p->ins_fp);
+  assert(ret == BM_SUCCESS);
+  int cmp_res = array_cmp_int8(
+      "Comparing results ...\n", (s8*) output_ref, (s8*)output,
+      tl_shape_size(&p->ofmap->shape)*2);
+  if (cmp_res != 0) {
+    printf("Comparison FAILED!!!\n");
+    print_pooling_param(p);
+    exit(-1);
+  }
+  free(output_ref);
+}
+
+static int test_pooling(bmctx_t ctx, bmk_ctx_t *bk_ctx)
+{
+  param_t param = random_pooling_param(bk_ctx);
+  //print_pooling_param(&param);
+  u16 *input = alloc_input(&param);
+  put_bf16_tensor_g2l(&ctx, bk_ctx, param.ifmap, (u16 *)input, FMT_BF16);
+  bmk1822_tiu_bf16_min_pooling(bk_ctx, &param);
+
+  u16 *output = (u16 *)get_bf16_tensor_l2g(&ctx, bk_ctx, param.ofmap, FMT_BF16);
+
+  compare_results(&param, input, output);
+
+  free_pooling_param(bk_ctx, &param);
+  free(output);
+  free(input);
+
+  return 1;
+}
+
+static void test_min_pooling(bmctx_t *ctx, bmk_ctx_t *bk_ctx)
+{
+  int test_finished_num = 0;
+  for (u64 i = 0; i < 20; i++)
+    test_finished_num += test_pooling(*ctx, bk_ctx);
+  printf("Test finished %d\n", test_finished_num);
+}
+
+int main()
+{
+  bmctx_t ctx;
+  bmk_ctx_t *bk_ctx;
+  test_init(&ctx, &bk_ctx);
+
+  test_min_pooling(&ctx, bk_ctx);
+
+  test_exit(&ctx);
+  return 0;
+}
diff --git a/cviruntime/test/1822/bf16/test_1822_bf16_sigmoid.cpp b/cviruntime/test/1822/bf16/test_1822_bf16_sigmoid.cpp
new file mode 100644
index 000000000..64f4b6839
--- /dev/null
+++ b/cviruntime/test/1822/bf16/test_1822_bf16_sigmoid.cpp
@@ -0,0 +1,145 @@
+#include "../1822_test_util.h"
+#include <math.h> /* for exp(), fabs() */
+
+static u32 channel = -1;
+
+static bool check_input_int8_range(float input) {
+  bool ret = input > -128.0 && input < 128.0;
+  if (!ret) {
+    printf("invalid int8 range, input is %f\n", input);
+  }
+  return ret;
+}
+
+static double _gen_sigmoid(float x) {
+  return 1.0 / (1.0 + exp(-(x)));
+}
+
+static void tl_lut_ref(
+    u16 *ofmap,
+    u16 *ifmap,
+    u16 *table,
+    u16 *table_slope,
+    tl_shape_t ifmap_shape,
+    tl_shape_t table_shape)
+{
+  int tn, th, tw;
+
+  tn = table_shape.n;
+  th = table_shape.h;
+  tw = table_shape.w;
+  assert(tn == 1);
+  assert(th * tw == 256);
+  assert(table);
+  assert(table_slope);
+  assert(ifmap_shape.n);
+  assert(ifmap);
+  assert(ofmap);
+
+  // TODO: use c function
+  // 1. dump all input as binary file
+#if 0
+  #define INFP32FILE "infp32file.bin"
+  #define OUTBF16FILE "lutbf16out.bin"
+  FILE* pFile;
+  pFile = fopen(INFP32FILE, "wb");
+  int shape_sz = tl_shape_size(&ifmap_shape);
+  float *f = (float *)malloc(sizeof(float) * shape_sz);
+  for (int i = 0; i < shape_sz; i++) {
+    f[i] = convert_bf16_fp32(ifmap[i]);
+  }
+  fwrite(f, 1, shape_sz *sizeof(float), pFile);
+  fclose(pFile);
+
+  // 2.
read result from `eval_lut.py` + char command[256]; + sprintf(command, "python eval_lut.py --lut_input_range_start %d --lut_input_range_end %d --inputfloat32 %s --outputbf16 %s 2>&1 > 2\n", + range_start, range_end, + INFP32FILE, OUTBF16FILE); + + int r; + r = system(command); + printf ("command is %s, return %d\n", command, r); + + pFile = fopen(OUTBF16FILE, "rb"); + if (!pFile) { + fprintf(stderr, "open golden %s fail\n", OUTBF16FILE); + exit(-1); + } + + size_t file_length; + file_length = fread(ofmap, sizeof(u16), tl_shape_size(&ifmap_shape), pFile); + printf("read from golden, file size %" PRIu64 "\n", file_length); + fclose(pFile); +#endif + +#if 0 + for (u64 i = 0; i < tl_shape_size(&ifmap_shape); i++) { + printf ("ref %" PRIu64 " input %x golden %x\n", i, ifmap[i], ofmap[i]); + } +#endif +} + +static void gen_sigmoid(u16 *table_data, u64 table_size) { + // S(x) = 1 / (1 + (e^-x)) + // half) { + x0 = sigmode_hw[i]; + x1 = sigmode_hw[i-1]; + delta = -1.0; + } + double s = (x1 - x0) / delta; // x1 already scale up + table_slope[i] = convert_fp32_bf16((float)s); +#if 0 + printf ("slope table [%u] = (bf16 %f double %.8lf float %f), 0x%x, %.8lf - %.8lf(%.8lf)\n", + i, convert_bf16_fp32(table_slope[i]), s, (float)s, table_slope[i], x1, x0, x1-x0); +#endif + } + + // duplicate channel #1 to #31 + + //TODO: tensor copy + for (u64 i = 1; i < channel; i++) { + memcpy(&table_slope[table_hw * i], &table_slope[0], sizeof(u16) * table_hw); + } +} + +static bool verify(u16 *ofmap_data, u16 *ref_data, u64 ofmap_size) { + int count = 0; + u64 size = ofmap_size; + //if (mode == PRE_DATA_COMPARE_FIX) { + // size = sizeof(sigmode_golden_bf16) / sizeof(u16); + //} + //else if (PRE_DATA_MAX_ERROR) { + // size = sizeof(sigmode_golden) / sizeof(double); + //} + + for (u64 i = 0; i < size; i++) { + if (mode == PRE_DATA_COMPARE_FIX) { + if (ofmap_data[i] != sigmode_golden_bf16[i]) { + fprintf(stderr, + "[%d] comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + count, i, ofmap_data[i], sigmode_golden_bf16[i]); + exit(-1); + } + } + else { + float got = convert_bf16_fp32(ofmap_data[i]); + float exp = convert_bf16_fp32(ref_data[i]); + + if (mode == PRE_DATA_MAX_ERROR) { + // cus we have better accuracy ~ 0.0039 + exp = sigmode_golden[i]; + } + + if (fabs(got - exp) > MAX_ERROR) { + fprintf(stderr, + "[%d] comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x, diff(%f - %f) is %f\n", + count, i, ofmap_data[i], ref_data[i], got, exp, fabs(got - exp) + ); + count++; + } + } + } + + if (count != 0) { + printf("error count is %d\n", count); + exit(-1); + } + + return true; +} + +static void test_tl_int8_lut_bf16(bmctx_t *ctx, bmk_ctx_t *bmk) +{ + // TODO: check more shape / align + tl_shape_t ifmap_shape; + if (mode == PRE_DATA_COMPARE_FIX || mode == PRE_DATA_MAX_ERROR) { + ifmap_shape = {1, channel, 8, 8}; + } + else { + ifmap_shape = {1, channel, 16, 16}; + } + + tl_shape_t table_shape = {1, channel, table_h, table_w}; // hard code for hw, hw:32x8 + tl_shape_t ofmap_shape = ifmap_shape; + + u64 ifmap_size = tl_shape_size(&ifmap_shape); + u64 table_size = tl_shape_size(&table_shape); + u64 ofmap_size = tl_shape_size(&ofmap_shape); + + fmt_t fmt = FMT_BF16; + + int data_type_size = bytesize_of_fmt(fmt); + u64 ifmap_bytesize = ifmap_size * data_type_size; + u64 table_bytesize = table_size * data_type_size; + u64 ofmap_bytesize = ofmap_size * data_type_size; + + // hw ONLY support index in int8 + u16 *ifmap = (u16 *)xmalloc(ifmap_bytesize); + memset(ifmap, 0x00, ifmap_bytesize); + + u16 *ifmap_slope 
= (u16 *)xmalloc(ifmap_bytesize);
+ memset(ifmap_slope, 0x00, ifmap_bytesize);
+
+ if (mode == PRE_DATA_COMPARE_FIX || mode == PRE_DATA_MAX_ERROR) {
+ memcpy(ifmap, &test_pattern, ifmap_bytesize);
+
+#if 0
+ for (u64 i = 0; i < ifmap_size; i++) {
+ printf("source if[%" PRIu64 "] is bf16 %f (bf16) with 0x%x\n", i, convert_bf16_fp32(ifmap[i]), ifmap[i]);
+ }
+#endif
+ }
+ else {
+ for (u64 i = 0; i < ifmap_size; i++) {
+ // input range is -8 ~ +8
+ float input = ((int)i % 7) * (((int)i % 2) ? 1 : -1) + 0.03 + (i % table_hw) * 0.002;
+ //float input = ((int)i % 10) * (((int)i % 2) ? 1 : -1) + 0.03 + (i % table_hw) * 0.002;
+ assert(check_input_int8_range(input));
+ ifmap[i] = convert_fp32_bf16(input);
+#if 1
+ printf("source if[%" PRIu64 "] is bf16 %f, input is %f (bf16) with 0x%x\n", i, convert_bf16_fp32(ifmap[i]), input, ifmap[i]);
+#endif
+ }
+ }
+
+ u16 *table_data = (u16 *)xmalloc(table_bytesize);
+ gen_sigmoid(table_data, table_size);
+
+ u16 *table_data_slope = (u16 *)xmalloc(table_bytesize);
+ gen_sigmoid_slope(table_data, table_data_slope, table_size);
+
+ u16 *ref_data = (u16 *)xmalloc(ofmap_bytesize);
+ tl_lut_ref(ref_data, ifmap, table_data, table_data_slope, ifmap_shape, table_shape);
+
+ tl_t *tl_ifmap = alloc_tl(bmk, ifmap_shape, fmt, /*align*/1);
+ tl_t *tl_table_answer = alloc_tl(bmk, table_shape, fmt, /*align*/1);
+ tl_t *tl_table_answer_slope = alloc_tl(bmk, table_shape, fmt, /*align*/1);
+
+ tl_t *tl_ofmap_A_idx_int8 = alloc_tl(bmk, ofmap_shape, fmt, /*align*/1);
+ tl_t *tl_ofmap_A_idx = alloc_tl(bmk, ofmap_shape, fmt, /*align*/1);
+ tl_t *tl_ofmap_B_slope = alloc_tl(bmk, ofmap_shape, fmt, /*align*/1);
+ tl_t *tl_ofmap_A_base = alloc_tl(bmk, ofmap_shape, fmt, /*align*/1);
+ tl_t *tl_ofmap_C = alloc_tl(bmk, ofmap_shape, fmt, /*align*/1);
+
+ tl_shape_t tl_ofmap_A_idx_int8_shape = {1, channel, ofmap_shape.h * ofmap_shape.w, 1};
+ tl_t *tl_ofmap_A_idx_int8_reshape = alloc_tl(bmk, tl_ofmap_A_idx_int8_shape, FMT_I8, /*align*/1);
+
+ bmk1822_tdma_tg2l_tensor_copy_param_t copy_p1, copy_p2, copy_p3;
+ memset(&copy_p1, 0, sizeof(copy_p1));
+ memset(&copy_p2, 0, sizeof(copy_p2));
+ memset(&copy_p3, 0, sizeof(copy_p3));
+
+ // pre-allocate gaddr
+ prepare_put_bf16_tensor_g2l(ctx, bmk, tl_ifmap, ifmap, fmt, &copy_p1);
+ prepare_put_bf16_tensor_g2l(ctx, bmk, tl_table_answer, table_data, fmt, &copy_p2);
+ prepare_put_bf16_tensor_g2l(ctx, bmk, tl_table_answer_slope, table_data_slope, fmt, &copy_p3);
+
+ // load the input
+ launch_put_bf16_tensor_g2l(ctx, bmk, copy_p1.src, &copy_p1);
+ // load table f(x0) and the slope table
+ launch_put_bf16_tensor_g2l(ctx, bmk, copy_p2.src, &copy_p2);
+ launch_put_bf16_tensor_g2l(ctx, bmk, copy_p3.src, &copy_p3);
+
+ bmk1822_tdma_l2l_tensor_copy_param_t p10;
+
+ // scale the input to remap its index range (-x~x) onto (-127~127); this dirties tl_ifmap
+ bmk1822_tiu_element_wise_mul_param_t p1;
+ p1.res_high = NULL;
+ p1.res_low = tl_ifmap;
+ p1.a = tl_ifmap;
+ p1.b_is_const = 1;
+ p1.b_const.val = convert_fp32_bf16(scale);
+ p1.rshift_bits = 0;
+ p1.relu_enable = 0;
+ bmk1822_tiu_element_wise_mul(bmk, &p1);
+
+#if 0
+ // int8
+ // save by stride
+ memset(&p10, 0x00, sizeof(bmk1822_tdma_l2l_tensor_copy_param_t));
+ bmk1822_tensor_lmem_t dst;
+ memcpy(&dst, tl_ofmap_A_idx_int8_reshape, sizeof(bmk1822_tensor_lmem_t));
+ dst.stride.h = dst.stride.h * 2;
+ dst.int8_rnd_mode = 1;
+ p10.dst = &dst;
+ p10.src = tl_ifmap;
+ bmk1822_tdma_l2l_bf16_tensor_copy(bmk, &p10);
+ test_submit(ctx);
+ dst.int8_rnd_mode = 0; // reset
+
+ // 16
+ dst.fmt = fmt;
+ dst.shape = tl_ofmap_B_slope->shape;
+ dst.stride = tl_ofmap_B_slope->stride;
+
+ // int8
+ // save by stride + memset(&p10, 0x00, sizeof(bmk1822_tdma_l2l_tensor_copy_param_t)); + bmk1822_tensor_lmem_t dst; + memcpy(&dst, tl_ofmap_A_idx_int8, sizeof(bmk1822_tensor_lmem_t)); + dst.stride = tl_ofmap_A_idx_int8_reshape->stride; + dst.stride.h = dst.stride.h * 2; + dst.fmt = FMT_I8; + dst.int8_rnd_mode = 1; + p10.dst = &dst; + p10.src = tl_ifmap; + bmk1822_tdma_l2l_bf16_tensor_copy(bmk, &p10); + test_submit(ctx); + dst.int8_rnd_mode = 0; // reset + + // get f(x0) and slope(x) + // reshape, 16->16 + dst.fmt = fmt; + dst.shape = tl_ofmap_B_slope->shape; + dst.stride = tl_ofmap_B_slope->stride; + + // stride; + dst.shape = tl_ofmap_A_idx_int8_reshape->shape; + dst.stride.h = dst.stride.h * 2; + dst.int8_rnd_mode = 0; // reset + memset(&p10, 0x00, sizeof(bmk1822_tdma_l2l_tensor_copy_param_t)); + p10.dst = tl_ofmap_A_idx_int8; //int8 + // save by stride + assert(tl_ofmap_A_idx_int8_reshape); + memset(&p10, 0x00, sizeof(bmk1822_tdma_l2l_tensor_copy_param_t)); + bmk1822_tensor_lmem_t dst; + memcpy(&dst, tl_ofmap_A_base, sizeof(bmk1822_tensor_lmem_t)); + dst.fmt = FMT_I8; + dst.shape = tl_ofmap_A_idx_int8_shape; + dst.stride = bmk1822_tensor_lmem_default_stride(bmk, dst.shape, dst.fmt, /*eu_align*/ 1); + dst.stride.h = dst.stride.h * 2; + dst.int8_rnd_mode = 1; + p10.dst = &dst; + p10.src = tl_ifmap; + bmk1822_tdma_l2l_bf16_tensor_copy(bmk, &p10); + test_submit(ctx); + dst.int8_rnd_mode = 0; // reset + + // 16 + dst.fmt = fmt; + dst.shape = tl_ofmap_B_slope->shape; + dst.stride = tl_ofmap_B_slope->stride; + + // (i); + printf ("test mode %d...\n", mode); + test_tl_int8_lut_bf16(&ctx, bmk); + } + + test_exit(&ctx); + restore_feround(round_mode); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_bf16_tensor_add.cpp b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_add.cpp new file mode 100644 index 000000000..7937d3ac5 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_add.cpp @@ -0,0 +1,105 @@ +#include "../1822_test_util.h" + +static void tl_add_ref( + u16 *ref_low, + u16 *a_low, + u16 *b_low, + u64 size, int relu_enable) +{ + for (u64 i = 0; i < size; i++) { + float ta = convert_bf16_fp32(a_low[i]); + float tb = convert_bf16_fp32(b_low[i]); + float res = ta + tb; + if(relu_enable && res <0) + res = 0; + ref_low[i] = convert_fp32_bf16(res); + } +} + +static void test_tl_add(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + int rshift_bits; + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + fmt_t fmt_type = FMT_BF16; + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + u64 size = n * c * h * w; + u64 data_size = size * (fmt_type == FMT_BF16 ? 
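+ // bf16 elements are 2 bytes wide, int8 elements 1 byte; this ternary
+ // sizes the host buffers accordingly. As in the other element-wise
+ // tests, the reference widens to fp32, computes, rounds back to bf16,
+ // and the TIU result is then compared bit-for-bit.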
2 : 1); + fmt_t fmt_type = FMT_BF16; + u16 *a_low_data = (u16 *)xmalloc(data_size); + u16 *b_low_data = (u16 *)xmalloc(data_size); + + for (u64 i = 0; i < size; i++) { + a_low_data[i] = convert_fp32_bf16(i); + b_low_data[i] = convert_fp32_bf16(i); + } + rshift_bits = 0; + + u16 *ref_low_data = (u16 *)xmalloc(data_size); + + tl_add_ref(ref_low_data, + a_low_data, + b_low_data, + size, relu_enable); + + tl_t *tl_a_low = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_b_low = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_res_low = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + + put_bf16_tensor_g2l(ctx, bk_ctx, tl_a_low, (u16*)a_low_data, fmt_type); + put_bf16_tensor_g2l(ctx, bk_ctx, tl_b_low, (u16*)b_low_data, fmt_type); + bmk1822_tiu_element_wise_add_param_t p4; + p4.res_high = 0; + p4.res_low = tl_res_low; + p4.a_high = 0; + p4.a_low = tl_a_low; + p4.b_is_const = 0; + p4.b_high = 0; + p4.b_low = tl_b_low; + p4.rshift_bits = rshift_bits; + p4.relu_enable = relu_enable; + bmk1822_tiu_element_wise_add(bk_ctx, &p4); + u16 *res_low_data = (u16*)get_bf16_tensor_l2g(ctx, bk_ctx, tl_res_low, fmt_type); + + for (u64 i = 0; i < size; i++) { + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %x, exp %x\n", + i, res_low_data[i], ref_low_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res_low); + free_tl(bk_ctx, tl_b_low); + free_tl(bk_ctx, tl_a_low); + + free(a_low_data); + free(b_low_data); + free(ref_low_data); + free(res_low_data); + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + int round_mode; + round_mode = set_store_feround(); + + test_tl_add(&ctx, bk_ctx, 0); + test_tl_add(&ctx, bk_ctx, 1); + + restore_feround(round_mode); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_bf16_tensor_add_const.cpp b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_add_const.cpp new file mode 100644 index 000000000..1356f4d1d --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_add_const.cpp @@ -0,0 +1,95 @@ +#include "../1822_test_util.h" + +static void tl_add_const_ref( + u16 *ref_low, + u16 *a_low, + u16 b, + u64 size, int relu_enable) +{ + for (u64 i = 0; i < size; i++) { + float ta = convert_bf16_fp32(a_low[i]); + float tb = convert_bf16_fp32(b); + float res = ta + tb; + if(relu_enable && res <0) + res = 0; + ref_low[i] = convert_fp32_bf16(res); + } +} + +static void test_tl_add_const(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + fmt_t fmt_type = FMT_BF16; + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + u64 size = n * c * h * w; + u64 data_size = size * (fmt_type == FMT_BF16 ? 
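+ // only the a tensor needs a full host buffer here: the constant
+ // operand is broadcast by the TIU via b_is_const/b_const.val below.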
2 : 1); + + u16 *a_low_data = (u16 *)xmalloc(data_size); + u16 b = convert_fp32_bf16(-3); + + for (u64 i = 0; i < size; i++) { + a_low_data[i] = convert_fp32_bf16(i); + } + + u16 *ref_low_data = (u16 *)xmalloc(data_size); + tl_add_const_ref(ref_low_data, + a_low_data, + b, size,relu_enable); + + tl_t *tl_a_low = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_res_low = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + + put_bf16_tensor_g2l(ctx, bk_ctx, tl_a_low, (u16*) a_low_data, fmt_type); + + bmk1822_tiu_element_wise_add_param_t p4; + p4.res_high = 0; + p4.res_low = tl_res_low; + p4.a_high = 0; + p4.a_low = tl_a_low; + p4.b_is_const = 1; + p4.b_const.val = b; +// p4.b_const.is_signed = b_is_signed; +// p4.rshift_bits = rshift_bits; + p4.relu_enable = relu_enable; + bmk1822_tiu_element_wise_add(bk_ctx, &p4); + +// u8 *res_high_data = get_tensor_l2g(ctx, bk_ctx, tl_res_high); + u16 *res_low_data = (u16 *) get_bf16_tensor_l2g(ctx, bk_ctx, tl_res_low, fmt_type); + for (u64 i = 0; i < size; i++) { + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res_low); + free_tl(bk_ctx, tl_a_low); + + free(a_low_data); + free(ref_low_data); + free(res_low_data); + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_add_const(&ctx, bk_ctx, 0); + test_tl_add_const(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_bf16_tensor_copy.cpp b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_copy.cpp new file mode 100644 index 000000000..b8b7a46b0 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_copy.cpp @@ -0,0 +1,76 @@ +#include "../1822_test_util.h" + +static void tl_copy_ref(u16 *a, u16 *res, u64 size, fmt_t fmt_type) +{ + if(fmt_type == FMT_BF16) { + for (u64 i = 0; i < size; i++) + res[i] = a[i]; + } else { + u8* u8res = (u8*) res; + u8* u8a = (u8*) a; + for (u64 i = 0; i < size; i++) + u8res[i] = u8a[i]; + } +} + +static void test_tl_copy(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + fmt_t fmt_type = FMT_BF16; + u64 size = n * c * h * w; + u64 data_size = size * (fmt_type == FMT_BF16 ? 
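+ // the element-wise copy runs through the TIU datapath (not the TDMA
+ // engine), so a bit-exact round trip of the random pattern is expected.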
2 : 1); + u16 *a_data = (u16 *)xmalloc(data_size); + for (u64 i = 0; i < size; i++) + a_data[i] = convert_fp32_bf16(rand()); + + u16 *ref_data = (u16 *)xmalloc(data_size); + tl_copy_ref(a_data, ref_data, size, fmt_type); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_res = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + + put_bf16_tensor_g2l(ctx, bk_ctx, tl_a, (u16 *)a_data, fmt_type); + bmk1822_tiu_element_wise_copy_param_t p10; + p10.dst = tl_res; + p10.src = tl_a; + bmk1822_tiu_element_wise_copy(bk_ctx, &p10); + u16 *res_data = (u16 *)get_bf16_tensor_l2g(ctx, bk_ctx, tl_res, fmt_type); + + for (u64 i = 0; i < size; i++) { + if (res_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, res_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(ref_data); + free(res_data); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_copy(&ctx, bk_ctx, 0); + test_tl_copy(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_bf16_tensor_copy_with_stride.cpp b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_copy_with_stride.cpp new file mode 100644 index 000000000..ce233c612 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_copy_with_stride.cpp @@ -0,0 +1,213 @@ +#include "../1822_test_util.h" +#include "1822_bf16_util.h" + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_I8}, +}; + +static int npu_num = BM1822_HW_NPU_NUM; + +static u64 shape_size(tl_shape_t s) +{ + return s.n * s.c * s.h * s.w; +} + +static tl_shape_t shape_of_stride( + tl_shape_t tl_shape, + bmk1822_tensor_lmem_stride_t tl_stride, + fmt_t fmt) +{ + tl_shape_t shape; + shape.n = tl_shape.n; + shape.c = npu_num; + shape.h = tl_stride.n / ((fmt == FMT_BF16) ?2:1); + shape.w = 1; + return shape; +} + +static void tl_copy_with_stride_ref( + void *src, + void *dst, + tl_shape_t shape, + bmk1822_tensor_lmem_stride_t src_stride, + bmk1822_tensor_lmem_stride_t dst_stride, + fmt_t fmt) +{ + int nsrc_byte = ((fmt == FMT_BF16) ? 2 : 1); + int n = shape.n; + int c = shape.c; + int h = shape.h; + int w = shape.w; + + tl_shape_t dst_stride_shape = shape_of_stride(shape, dst_stride, fmt); + + u16 *u16_ref = NULL; + u16 *u16_src = NULL; + u8 *u8_ref = NULL; + u8 *u8_src = NULL; + + int dst_size = + dst_stride_shape.n * + dst_stride_shape.c * + dst_stride_shape.h * + dst_stride_shape.w; + + if (fmt == FMT_BF16) { + u16_ref = (u16 *)dst; + u16_src = (u16 *)src; + } else { + u8_ref = (u8 *)dst; + u8_src = (u8 *)src; + } + + for (int i = 0; i < dst_size; i++) { + if (fmt == FMT_BF16) { + u16_ref[i] = 0x0; + } else { + u8_ref[i] = 0x0; + } + } + + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < c; ci++) { + for (int hi = 0; hi < h; hi++) { + for (int wi = 0; wi < w; wi++) { + int src_i = (ni * npu_num + ci % npu_num) * src_stride.n / nsrc_byte + + ci / npu_num * src_stride.c / nsrc_byte + + hi * src_stride.h / nsrc_byte + + wi; + int dst_i = (ni * npu_num + ci % npu_num) * dst_stride.n / nsrc_byte + + ci / npu_num * dst_stride.c / nsrc_byte + + hi * dst_stride.h / nsrc_byte + + wi; + if (fmt == FMT_BF16) { + u16_ref[dst_i] = u16_src[src_i]; + } else { + u8_ref[dst_i] = u8_src[src_i]; + } + } + } + } + } + + dst = (fmt == FMT_BF16) ? 
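+ // this trailing assignment merely rebinds the local pointer; the
+ // reference output was already written through u16_ref/u8_ref, which
+ // alias the caller's dst buffer.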
(void *)u16_ref : (void *)u8_ref; +} + +static void test_tl_copy_with_stride( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + fmt_t fmt, + int eu_align) +{ + int n = 3; + int c = 38; + int h = 2; + int w = 3; + int c_layers = ALIGN(c, npu_num) / npu_num; + int nsrc_byte = ((fmt == FMT_BF16) ? 2 : 1); + + bmk1822_tensor_lmem_stride_t src_stride; + src_stride.w = nsrc_byte; + src_stride.h = (w + 3) * nsrc_byte; + src_stride.c = h * src_stride.h + (13 * nsrc_byte); + src_stride.n = c_layers * src_stride.c + (7 * nsrc_byte); + + bmk1822_tensor_lmem_stride_t dst_stride; + dst_stride.w = nsrc_byte; + dst_stride.h = (w + 1) * nsrc_byte; + dst_stride.c = h * dst_stride.h + (5 * nsrc_byte); + dst_stride.n = c_layers * dst_stride.c + (19 * nsrc_byte); + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + tl_shape_t src_stride_shape = shape_of_stride(tl_shape, src_stride, fmt); + tl_shape_t dst_stride_shape = shape_of_stride(tl_shape, dst_stride, fmt); + + int src_size = shape_size(src_stride_shape); + int dst_size = shape_size(dst_stride_shape); + + float val = -100; + void *src_data = NULL; + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * src_size); + s8 *s8src_data = (s8 *)malloc(sizeof(s8) * src_size); + void *dst_data = NULL; + u16 *u16dst_data = (u16 *)malloc(sizeof(u16) * dst_size); + s8 *s8dst_data = (s8 *)malloc(sizeof(s8) * dst_size); + u8 *result_x = NULL; + u8 *ref_x = NULL; + + // prepare source data + ref_x = (u8 *)xmalloc(dst_size * nsrc_byte); + for (int i = 0; i < src_size; i++) { + if(fmt == FMT_BF16) { + u16src_data[i] = convert_fp32_bf16(val); + val += 0.1; + } else { + s8src_data[i] = i; + } + } + for (int i = 0; i < dst_size; i++) { + u16dst_data[i] = s8dst_data[i] = 0; + } + src_data = (fmt == FMT_BF16) ? (void *)u16src_data : (void *)s8src_data; + dst_data = (fmt == FMT_BF16) ? 
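+ // select the host view that matches the element width; dst_data was
+ // zeroed above so stale bytes cannot mask a missing strided write.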
(void *)u16dst_data : (void *)s8dst_data; + + // run tpu operations + tl_t *tl_src = alloc_tl( bk_ctx, src_stride_shape, fmt, eu_align); + tl_t *tl_dst = alloc_tl( bk_ctx, dst_stride_shape, fmt, eu_align); + put_bf16_tensor_g2l(ctx, bk_ctx, tl_src, (u16 *)src_data, fmt); + put_bf16_tensor_g2l(ctx, bk_ctx, tl_dst, (u16 *)dst_data, fmt); + { + tl_t src = *tl_src; + tl_t dst = *tl_dst; + src.shape = dst.shape = tl_shape; + src.stride = src_stride; + dst.stride = dst_stride; + bmk1822_tiu_element_wise_copy_param_t p11; + p11.dst = &dst; + p11.src = &src; + bmk1822_tiu_element_wise_copy(bk_ctx, &p11); + + } + result_x = get_bf16_tensor_l2g(ctx, bk_ctx, tl_dst, fmt); + + tl_copy_with_stride_ref(src_data, ref_x, tl_shape, src_stride, dst_stride, fmt); + + // compare data + if( COMPARE_PASS != compare_result( ref_x, result_x, fmt, dst_size)) + exit(-1); + + // free variables + free_tl(bk_ctx, tl_dst); + free_tl(bk_ctx, tl_src); + free(s8src_data); + free(u16src_data); + free(s8dst_data); + free(u16dst_data); + free(result_x); + free(ref_x); +} + +#define TEST_ALIGNED 1 // 1: test unalign only, 2: test both align/unalign +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + + test_init(&ctx, &bk_ctx); + + for (u32 i = 0; i < nr_fmt; i++) { + for (u8 u8_align = 0; u8_align < TEST_ALIGNED; u8_align++) { + test_tl_copy_with_stride(&ctx, bk_ctx, input_fmt[i].src_fmt, u8_align); + } + } + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_bf16_tensor_ge.cpp b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_ge.cpp new file mode 100644 index 000000000..086105583 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_ge.cpp @@ -0,0 +1,89 @@ +#include "../1822_test_util.h" + +static void tl_ge_ref(u16 *a, u16 *b, u16 *result, u64 size) +{ + for (u64 i = 0; i < size; i++) { + float fa = convert_bf16_fp32(a[i]); + float fb = convert_bf16_fp32(b[i]); + float fge; + if (fa >= fb) + fge = 1; + else + fge = 0; + result[i] = convert_fp32_bf16(fge); + } +} + +static void test_tl_ge(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + fmt_t fmt_type = FMT_BF16; + u64 size = n * c * h * w; + u64 data_size = size * (fmt_type == FMT_BF16 ? 
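+ // ge writes bf16 1.0 or 0.0 per element; tl_ge_ref performs the same
+ // comparison after widening to fp32, so results match bit-exactly.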
2 : 1); + u16 *a_data = (u16 *)xmalloc(data_size); + for (u64 i = 0; i < size; i++) + a_data[i] = convert_fp32_bf16((s8)(i % 256)); + + u16 *b_data = (u16 *)xmalloc(data_size); + for (u64 i = 0; i < size; i++) + b_data[i] = convert_fp32_bf16((s8)(100 - i % 256)); + + u16 *ref_data = (u16 *)xmalloc(data_size); + tl_ge_ref(a_data, b_data, ref_data, size); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_b = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_ge = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + + put_bf16_tensor_g2l(ctx, bk_ctx, tl_a, (u16 *)a_data, fmt_type); + put_bf16_tensor_g2l(ctx, bk_ctx, tl_b, (u16 *)b_data, fmt_type); + + bmk1822_tiu_element_wise_ge_param_t p; + memset(&p, 0, sizeof(p)); + p.ge = tl_ge; + p.a = tl_a; + p.b_is_const = 0; + p.b = tl_b; + bmk1822_tiu_bf16_element_wise_ge(bk_ctx, &p); + u16 *ge_data = (u16*)get_bf16_tensor_l2g(ctx, bk_ctx, tl_ge, fmt_type); + + for (u64 i = 0; i < size; i++) { + if (ge_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, ge_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_ge); + free_tl(bk_ctx, tl_b); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(b_data); + free(ref_data); + free(ge_data); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_ge(&ctx, bk_ctx, 0); + test_tl_ge(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_bf16_tensor_ge_const.cpp b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_ge_const.cpp new file mode 100644 index 000000000..c2f8b04e3 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_ge_const.cpp @@ -0,0 +1,86 @@ +#include "../1822_test_util.h" + +static void tl_ge_const_ref(u16 *a, u16 b, u16 *result, u64 size) +{ + for (u64 i = 0; i < size; i++) { + float fa = convert_bf16_fp32(a[i]); + float fb = convert_bf16_fp32(b); + float fge; + if (fa >= fb) + fge = 1; + else + fge = 0; + result[i] = convert_fp32_bf16(fge); + } +} + +static void test_tl_ge_const(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + fmt_t fmt_type = FMT_BF16; + u64 size = n * c * h * w; + u64 data_size = size * (fmt_type == FMT_BF16 ? 
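+ // bf16 keeps the top 16 bits of an IEEE-754 fp32 value (1 sign, 8
+ // exponent, 7 mantissa bits); e.g. 20.0f = 0x41A00000, so the
+ // threshold b below is stored as 0x41A0.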
2 : 1); + + u16 *a_data = (u16 *)xmalloc(data_size); + for (u64 i = 0; i < size; i++) + a_data[i] = convert_fp32_bf16(i); + //a_data[i] = convert_fp32_bf16(rand()%100 - 50); + + u16 b = convert_fp32_bf16(20); + u16 *ref_data = (u16 *)xmalloc(data_size); + tl_ge_const_ref(a_data, b, ref_data, size); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_ge = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + + put_bf16_tensor_g2l(ctx, bk_ctx, tl_a, (u16 *)a_data, fmt_type); + bmk1822_tiu_element_wise_ge_param_t p; + memset(&p, 0, sizeof(p)); + p.ge = tl_ge; + p.a = tl_a; + p.b_is_const = 1; + p.b_const.val = b; + p.b_const.is_signed = 1; + + bmk1822_tiu_bf16_element_wise_ge(bk_ctx, &p); + + u16 *ge_data = (u16*) get_bf16_tensor_l2g(ctx, bk_ctx, tl_ge, fmt_type); + + for (u64 i = 0; i < size; i++) { + if (ge_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, ge_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_ge); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(ref_data); + free(ge_data); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_ge_const(&ctx, bk_ctx, 0); + test_tl_ge_const(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_bf16_tensor_mac.cpp b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_mac.cpp new file mode 100644 index 000000000..e87c9fa15 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_mac.cpp @@ -0,0 +1,109 @@ +#include "../1822_test_util.h" + +static void tl_mac_ref( + u16 *ref, + u16 *a, u16 *b, u16 *c, + u64 size, int relu_enable) +{ + for (u64 i = 0; i < size; i++) { + float ta = convert_bf16_fp32(a[i]); + float tb = convert_bf16_fp32(b[i]); + float tc = convert_bf16_fp32(c[i]); + float res = ta * tb + tc; + + if(relu_enable) + if(res<0) + res=0; + ref[i] = convert_fp32_bf16(res); + } +} + +static void test_tl_mac(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int lshift_bits = 1; + int rshift_bits = 3; + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + fmt_t fmt_type = FMT_BF16; + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + u64 size = n * c * h * w; + u64 data_size = size * (fmt_type == FMT_BF16 ? 
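+ // mac computes a * b + c and accumulates in place into tl_c; the
+ // reference forms the product-sum in fp32 and rounds once to bf16.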
2 : 1); + u16 *a_data = (u16 *)xmalloc(data_size); + u16 *b_data = (u16 *)xmalloc(data_size); + u16 *c_data = (u16 *)xmalloc(data_size); + + for (u64 i = 0; i < size; i++) { + a_data[i] = convert_fp32_bf16(rand()); + b_data[i] = convert_fp32_bf16(rand()); + c_data[i] = convert_fp32_bf16(rand()); + } + + u16 *ref_data = (u16 *)xmalloc(data_size); + + tl_mac_ref(ref_data, + a_data, b_data, c_data, + size, relu_enable); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_b = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_c = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + + put_bf16_tensor_g2l(ctx, bk_ctx, tl_a, a_data, fmt_type); + put_bf16_tensor_g2l(ctx, bk_ctx, tl_b, b_data, fmt_type); + put_bf16_tensor_g2l(ctx, bk_ctx, tl_c, c_data, fmt_type); + + bmk1822_tiu_element_wise_mac_param_t p2; + p2.res_high = 0; + p2.res_low = tl_c; + p2.res_is_int8 = relu_enable; + p2.a = tl_a; + p2.b_is_const = 0; + p2.b = tl_b; + p2.lshift_bits = lshift_bits; + p2.rshift_bits = rshift_bits; + p2.relu_enable = relu_enable; + bmk1822_tiu_element_wise_mac(bk_ctx, &p2); + u16 *mac_data = (u16 *)get_bf16_tensor_l2g(ctx, bk_ctx, tl_c, fmt_type); + + for (u64 i = 0; i < size; i++) { + if (mac_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at mac_data[%" PRIu64 "], got %d, exp %d\n", + i, mac_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_c); + free_tl(bk_ctx, tl_b); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(b_data); + free(c_data); + free(ref_data); + free(mac_data); + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + int round_mode; + round_mode = set_store_feround(); + test_tl_mac(&ctx, bk_ctx, 0); + test_tl_mac(&ctx, bk_ctx, 1); + restore_feround(round_mode); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_bf16_tensor_mac_const.cpp b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_mac_const.cpp new file mode 100644 index 000000000..75dd2a3aa --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_mac_const.cpp @@ -0,0 +1,104 @@ +#include "../1822_test_util.h" + +static void tl_mac_const_ref( + u16 *ref_low, + u16 *a, u16 b_const, + u16 *c_low, + u64 size, int relu_enable) +{ + for (u64 i = 0; i < size; i++) { + float ta = convert_bf16_fp32(a[i]); + float tb = convert_bf16_fp32(b_const); + float tc = convert_bf16_fp32(c_low[i]); + float res = ta * tb + tc; + + if(relu_enable) + { + if(res<0) + res=0; + } + ref_low[i] = convert_fp32_bf16(res); + } +} + +static void test_tl_mac_const(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + fmt_t fmt_type = FMT_BF16; + u64 size = n * c * h * w; + u64 data_size = size * (fmt_type == FMT_BF16 ? 
2 : 1); + + u16 *a_data = (u16 *)xmalloc(data_size); + u16 *c_low_data = (u16 *)xmalloc(data_size); + for (u64 i = 0; i < size; i++) { + a_data[i] = convert_fp32_bf16(rand() % 256); + c_low_data[i] = convert_fp32_bf16(i); + } + + u16 b_const = convert_fp32_bf16(37); + + u16 *ref_low_data = (u16 *)xmalloc(data_size); + tl_mac_const_ref(ref_low_data, + a_data, b_const, c_low_data, + size, relu_enable); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_c_low = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + + put_bf16_tensor_g2l(ctx, bk_ctx, tl_a, (u16*) a_data, fmt_type); + put_bf16_tensor_g2l(ctx, bk_ctx, tl_c_low, (u16*) c_low_data, fmt_type); + bmk1822_tiu_element_wise_mac_param_t p3; + p3.res_high = 0; + p3.res_low = tl_c_low; + p3.res_is_int8 = 1;//relu_enable; + p3.a = tl_a; + p3.b_is_const = 1; + p3.b_const.val = b_const; + p3.relu_enable = relu_enable; + + bmk1822_tiu_element_wise_mac(bk_ctx, &p3); + u16 *mac_low_data = (u16*) get_bf16_tensor_l2g(ctx, bk_ctx, tl_c_low, fmt_type); + for (u64 i = 0; i < size; i++) { + if (mac_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at mac_low_data[%" PRIu64 "], got %d, exp %d\n", + i, mac_low_data[i], ref_low_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_c_low); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(c_low_data); + free(ref_low_data); + free(mac_low_data); + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + int round_mode; + round_mode = set_store_feround(); + + test_tl_mac_const(&ctx, bk_ctx, 0); + test_tl_mac_const(&ctx, bk_ctx, 1); + + restore_feround(round_mode); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_bf16_tensor_max.cpp b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_max.cpp new file mode 100644 index 000000000..c79f29e0c --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_max.cpp @@ -0,0 +1,89 @@ +#include "../1822_test_util.h" + +static void tl_max_ref(u16 *a, u16 *b, u16 *max, u64 size) +{ + for (u64 i = 0; i < size; i++) { + float fa = convert_bf16_fp32(a[i]); + float fb = convert_bf16_fp32(b[i]); + float fmax; + if (fa > fb) + fmax = fa; + else + fmax = fb; + max[i] = convert_fp32_bf16(fmax); + } +} + +static void test_tl_max(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + fmt_t fmt_type = FMT_BF16; + u64 size = n * c * h * w; + u64 data_size = size * (fmt_type == FMT_BF16 ? 
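+ // operands are compared after widening to fp32; bf16 -> fp32 is exact,
+ // so the ordering is the same as a native bf16 compare.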
2 : 1); + u16 *a_data = (u16 *)xmalloc(data_size); + for (u64 i = 0; i < size; i++) + a_data[i] = convert_fp32_bf16((s8)(i % 256)); + + u16 *b_data = (u16 *)xmalloc(data_size); + for (u64 i = 0; i < size; i++) + b_data[i] = convert_fp32_bf16((s8)(100 - i % 256)); + + u16 *ref_data = (u16 *)xmalloc(data_size); + tl_max_ref(a_data, b_data, ref_data, size); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_b = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_max = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + + put_bf16_tensor_g2l(ctx, bk_ctx, tl_a, (u16 *)a_data, fmt_type); + put_bf16_tensor_g2l(ctx, bk_ctx, tl_b, (u16 *)b_data, fmt_type); + + bmk1822_tiu_element_wise_max_param_t p; + memset(&p, 0, sizeof(p)); + p.max = tl_max; + p.a = tl_a; + p.b_is_const = 0; + p.b = tl_b; + bmk1822_tiu_element_wise_max(bk_ctx, &p); + u16 *max_data = (u16*)get_bf16_tensor_l2g(ctx, bk_ctx, tl_max, fmt_type); + + for (u64 i = 0; i < size; i++) { + if (max_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, max_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_max); + free_tl(bk_ctx, tl_b); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(b_data); + free(ref_data); + free(max_data); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_max(&ctx, bk_ctx, 0); + test_tl_max(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_bf16_tensor_max_const.cpp b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_max_const.cpp new file mode 100644 index 000000000..0fec965fe --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_max_const.cpp @@ -0,0 +1,82 @@ +#include "../1822_test_util.h" + +static void tl_max_const_ref(u16 *a, u16 b, u16 *max, u64 size) +{ + for (u64 i = 0; i < size; i++) { + if (convert_bf16_fp32(a[i]) > convert_bf16_fp32(b)) + max[i] = a[i]; + else + max[i] = b; + } +} + +static void test_tl_max_const(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + fmt_t fmt_type = FMT_BF16; + u64 size = n * c * h * w; + u64 data_size = size * (fmt_type == FMT_BF16 ? 
2 : 1); + + u16 *a_data = (u16 *)xmalloc(data_size); + for (u64 i = 0; i < size; i++) + a_data[i] = convert_fp32_bf16(i); + //a_data[i] = convert_fp32_bf16(rand()%100 - 50); + + u16 b = convert_fp32_bf16(20); + u16 *ref_data = (u16 *)xmalloc(data_size); + tl_max_const_ref(a_data, b, ref_data, size); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_max = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + + put_bf16_tensor_g2l(ctx, bk_ctx, tl_a, (u16 *)a_data, fmt_type); + bmk1822_tiu_element_wise_max_param_t p; + memset(&p, 0, sizeof(p)); + p.max = tl_max; + p.a = tl_a; + p.b_is_const = 1; + p.b_const.val = b; + p.b_const.is_signed = 1; + + bmk1822_tiu_element_wise_max(bk_ctx, &p); + + u16 *max_data = (u16*) get_bf16_tensor_l2g(ctx, bk_ctx, tl_max, fmt_type); + + for (u64 i = 0; i < size; i++) { + if (max_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, max_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_max); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(ref_data); + free(max_data); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_max_const(&ctx, bk_ctx, 0); + test_tl_max_const(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_bf16_tensor_min.cpp b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_min.cpp new file mode 100644 index 000000000..0d86b06a7 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_min.cpp @@ -0,0 +1,87 @@ +#include "../1822_test_util.h" + +static void tl_min_ref(u16 *a, u16 *b, u16 *max, u64 size) +{ + for (u64 i = 0; i < size; i++) { + float fa = convert_bf16_fp32(a[i]); + float fb = convert_bf16_fp32(b[i]); + float fmax; + if (fa > fb) + fmax = fb; + else + fmax = fa; + max[i] = convert_fp32_bf16(fmax); + } +} + +static void test_tl_min(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + fmt_t fmt_type = FMT_BF16; + u64 size = n * c * h * w; + u64 data_size = size * (fmt_type == FMT_BF16 ? 
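+ // b_data is drawn from rand()/2, giving the two operands different
+ // ranges so both branches of the min are exercised.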
2 : 1); + + u16 *a_data = (u16 *)xmalloc(data_size); + for (u64 i = 0; i < size; i++) + a_data[i] = convert_fp32_bf16(rand()); + + u16 *b_data = (u16 *)xmalloc(data_size); + for (u64 i = 0; i < size; i++) + b_data[i] = convert_fp32_bf16(rand()/2); + + u16 *ref_data = (u16 *)xmalloc(data_size); + tl_min_ref(a_data, b_data, ref_data, size); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_b = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_min = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + + put_bf16_tensor_g2l(ctx, bk_ctx, tl_a, (u16 *)a_data, fmt_type); + put_bf16_tensor_g2l(ctx, bk_ctx, tl_b, (u16 *)b_data, fmt_type); + bmk1822_tiu_element_wise_min_param_t p6; + p6.min = tl_min; + p6.a = tl_a; + p6.b_is_const = 0; + p6.b = tl_b; + bmk1822_tiu_element_wise_min(bk_ctx, &p6); + u16 *min_data = (u16*)get_bf16_tensor_l2g(ctx, bk_ctx, tl_min, fmt_type); + + for (u64 i = 0; i < size; i++) { + if (min_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, min_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_min); + free_tl(bk_ctx, tl_b); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(b_data); + free(ref_data); + free(min_data); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_min(&ctx, bk_ctx, 0); + test_tl_min(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_bf16_tensor_min_const.cpp b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_min_const.cpp new file mode 100644 index 000000000..8b50c3de6 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_min_const.cpp @@ -0,0 +1,79 @@ +#include "../1822_test_util.h" + +static void tl_min_const_ref(u16 *a, u16 b, u16 *max, u64 size) +{ + for (u64 i = 0; i < size; i++) { + if (convert_bf16_fp32(a[i]) > convert_bf16_fp32(b)) + max[i] = b; + else + max[i] = a[i]; + } +} + +static void test_tl_min_const(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + fmt_t fmt_type = FMT_BF16; + u64 size = n * c * h * w; + u64 data_size = size * (fmt_type == FMT_BF16 ? 
2 : 1); + + u16 *a_data = (u16 *)xmalloc(data_size); + for (u64 i = 0; i < size; i++) + a_data[i] = convert_fp32_bf16(rand() % 100 -50); + + u16 b = convert_fp32_bf16(20); + + u16 *ref_data = (u16 *)xmalloc(data_size); + tl_min_const_ref(a_data, b, ref_data, size); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_min = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + + put_bf16_tensor_g2l(ctx, bk_ctx, tl_a, (u16 *)a_data, fmt_type); + bmk1822_tiu_element_wise_min_param_t p7; + p7.min = tl_min; + p7.a = tl_a; + p7.b_is_const = 1; + p7.b_const.val = b; + p7.b_const.is_signed = 1; + bmk1822_tiu_element_wise_min(bk_ctx, &p7); + u16 *min_data = (u16*) get_bf16_tensor_l2g(ctx, bk_ctx, tl_min, fmt_type); + + for (u64 i = 0; i < size; i++) { + if (min_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, min_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_min); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(ref_data); + free(min_data); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_min_const(&ctx, bk_ctx, 0); + test_tl_min_const(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_bf16_tensor_mul.cpp b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_mul.cpp new file mode 100644 index 000000000..4fdf392c3 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_mul.cpp @@ -0,0 +1,112 @@ +#include "../1822_test_util.h" + +static void tl_mul_ref(u16 *ofmap, u16 *a, u16 *b, u64 size, int shift_bits, int relu_enable, fmt_t fmt_type) +{ + if(fmt_type == FMT_BF16) { + for (u64 i = 0; i < size; i++) { + float tmp = convert_bf16_fp32(a[i]) * convert_bf16_fp32(b[i]); + if(relu_enable) + if(tmp<0) + tmp=0; + ofmap[i] = convert_fp32_bf16(tmp); + } + } else { + for (u64 i = 0; i < size; i++) { + s32 tmp = a[i] * b[i]; + tmp += 1 << (shift_bits - 1); + tmp >>= shift_bits; + if (tmp > 127) + tmp = 127; + else if (tmp < -128) + tmp = -128; + if(relu_enable) + if(tmp<0) + tmp=0; + ofmap[i] = tmp; + } + } +} + +static void test_tl_mul(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + fmt_t fmt_type = FMT_BF16; + u64 size = n * c * h * w; + u64 data_size = size * (fmt_type == FMT_BF16 ? 
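+ // tl_mul_ref applies the rshift/rounding path only to the int8 case;
+ // for bf16 the product is formed in fp32 and rounded once, so
+ // shift_bits does not affect the expected bf16 results.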
2 : 1); + int shift_bits = 1; + + for (u32 relu_enable = 0; relu_enable < 2; relu_enable++) + { + u16 *a_data = (u16 *)xmalloc(data_size); + u16 *b_data = (u16 *)xmalloc(data_size); + for (u64 i = 0; i < size; i++) { + a_data[i] = convert_fp32_bf16(random()%0x10); + b_data[i] = convert_fp32_bf16(random()); + } + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_b = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_res_low = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + + put_bf16_tensor_g2l(ctx, bk_ctx, tl_a, (u16 *)a_data, fmt_type); + put_bf16_tensor_g2l(ctx, bk_ctx, tl_b, (u16 *)b_data, fmt_type); + + bmk1822_tiu_element_wise_mul_param_t p1; + p1.res_high = NULL; + p1.res_low = tl_res_low; + p1.a = tl_a; + p1.b_is_const = 0; + p1.b = tl_b; + p1.rshift_bits = shift_bits; + p1.relu_enable = relu_enable; + bmk1822_tiu_element_wise_mul(bk_ctx, &p1); + + u16 *res_low_data = (u16 *)get_bf16_tensor_l2g(ctx, bk_ctx, tl_res_low, fmt_type); + + u16 *ref_data = (u16 *)xmalloc(data_size); + tl_mul_ref(ref_data, a_data, b_data, size, shift_bits, relu_enable, fmt_type); + + for (u64 i = 0; i < size; i++) { + if (res_low_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %x, exp %x\n", + i, res_low_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res_low); + free_tl(bk_ctx, tl_b); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(b_data); + free(ref_data); + free(res_low_data); + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + int round_mode; + round_mode = set_store_feround(); + + test_tl_mul(&ctx, bk_ctx, 0); + test_tl_mul(&ctx, bk_ctx, 1); + + restore_feround(round_mode); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_bf16_tensor_mul_const.cpp b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_mul_const.cpp new file mode 100644 index 000000000..bf2e59815 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_mul_const.cpp @@ -0,0 +1,107 @@ +#include "../1822_test_util.h" + +static void tl_mul_const_ref( + u16 *ofmap, u16 *ifmap, u64 size, u16 mul_const, int shift_bits, int relu_enable, fmt_t fmt_type) +{ + + if(fmt_type == FMT_BF16) { + for (u64 i = 0; i < size; i++) { + float tmp = convert_bf16_fp32(ifmap[i]) * convert_bf16_fp32(mul_const); + if(relu_enable) + if(tmp<0) + tmp=0; + ofmap[i] = convert_fp32_bf16(tmp); + } + } else { + for (u64 i = 0; i < size; i++) { + s32 tmp = ifmap[i] * (s16) mul_const; + tmp += 1 << (shift_bits - 1); + tmp >>= shift_bits; + if (tmp > 127) + tmp = 127; + else if (tmp < -128) + tmp = -128; + if(relu_enable) + if(tmp<0) + tmp=0; + ofmap[i] = tmp; + } + } +} + +static void test_tl_mul_const(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + fmt_t fmt_type = FMT_BF16; + u64 data_size = size * (fmt_type == FMT_BF16 ? 
2 : 1); + int shift_bits = 1; + + for (u32 relu_enable = 0; relu_enable < 2; relu_enable++) + { + u16 *ifmap_data = (u16 *)xmalloc(data_size); + for (u64 i = 0; i < size; i++) + ifmap_data[i] = convert_fp32_bf16(random() % 256); + + u16 mul_const = convert_fp32_bf16(20); + + u16 *ref_data = (u16 *)xmalloc(data_size); + tl_mul_const_ref(ref_data, ifmap_data, size, mul_const, shift_bits, relu_enable, fmt_type); + + tl_t *tl_ifmap = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_ofmap = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + + put_bf16_tensor_g2l(ctx, bk_ctx, tl_ifmap, (u16 *)ifmap_data, fmt_type); + + bmk1822_tiu_element_wise_mul_param_t p; + memset(&p, 0, sizeof(p)); + p.res_high = NULL; + p.res_low = tl_ofmap; + p.a = tl_ifmap; + p.b_is_const = 1; + p.b_const.val = mul_const; + p.relu_enable = relu_enable; + + bmk1822_tiu_element_wise_mul(bk_ctx, &p); + + u16 *ofmap_data = (u16*) get_bf16_tensor_l2g(ctx, bk_ctx, tl_ofmap, fmt_type); + + for (u64 i = 0; i < size; i++) { + if (ofmap_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, ofmap_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_ofmap); + free_tl(bk_ctx, tl_ifmap); + + free(ifmap_data); + free(ref_data); + free(ofmap_data); + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_mul_const(&ctx, bk_ctx, 0); + test_tl_mul_const(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_bf16_tensor_sub.cpp b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_sub.cpp new file mode 100644 index 000000000..21c5645d3 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_sub.cpp @@ -0,0 +1,95 @@ +#include "../1822_test_util.h" + +static void tl_sub_ref( + u16 *ref_low, + u16 *a_low, + u16 *b_low, + u64 size) +{ + for (u64 i = 0; i < size; i++) { + float ta = convert_bf16_fp32(a_low[i]); + float tb = convert_bf16_fp32(b_low[i]); + float res = ta - tb; + + ref_low[i] = convert_fp32_bf16(res); + } +} + +static void test_tl_sub(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + fmt_t fmt_type = FMT_BF16; + u64 size = n * c * h * w; + u64 data_size = size * (fmt_type == FMT_BF16 ? 
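+ // sub has no relu or const variants in this test; the reference simply
+ // widens to fp32, subtracts, and rounds back to bf16.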
2 : 1); + u16 *a_low_data = (u16 *)xmalloc(data_size); + u16 *b_low_data = (u16 *)xmalloc(data_size); + for (u64 i = 0; i < size; i++) { + a_low_data[i] = convert_fp32_bf16(rand()); + b_low_data[i] = convert_fp32_bf16(rand()); + } + + u16 *ref_low_data = (u16 *)xmalloc(data_size); + tl_sub_ref(ref_low_data, + a_low_data, + b_low_data, + size); + + tl_t *tl_a_low = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_b_low = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_res_low = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + + put_bf16_tensor_g2l(ctx, bk_ctx, tl_a_low, a_low_data, fmt_type); + put_bf16_tensor_g2l(ctx, bk_ctx, tl_b_low, b_low_data, fmt_type); + bmk1822_tiu_element_wise_sub_param_t p5; + p5.res_high = 0; + p5.res_low = tl_res_low; + p5.a_high = 0; + p5.a_low = tl_a_low; + p5.b_high = 0; + p5.b_low = tl_b_low; + p5.rshift_bits = 0; + bmk1822_tiu_element_wise_sub(bk_ctx, &p5); + u16 *res_low_data = (u16*)get_bf16_tensor_l2g(ctx, bk_ctx, tl_res_low, fmt_type); + + for (u64 i = 0; i < size ; i++) { + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res_low); + free_tl(bk_ctx, tl_b_low); + free_tl(bk_ctx, tl_a_low); + + free(a_low_data); + free(b_low_data); + free(ref_low_data); + free(res_low_data); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + int round_mode; + round_mode = set_store_feround(); + + test_tl_sub(&ctx, bk_ctx, 0); + test_tl_sub(&ctx, bk_ctx, 1); + + restore_feround(round_mode); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_bf16_tensor_transfer.cpp b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_transfer.cpp new file mode 100644 index 000000000..60f18125d --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_bf16_tensor_transfer.cpp @@ -0,0 +1,133 @@ +#include "../1822_test_util.h" +#include "1822_bf16_util.h" + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_I8}, +}; + +static void test_put_and_get_tensor_l2g( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + fmt_t fmt) +{ + int n = 2; + int c = 66; + int h = 3; + int w = 15; + u64 size = n * c * h * w; + s8 *s8data_x = (s8 *)malloc(sizeof(s8) * size); + s8 *s8data_y = (s8 *)malloc(sizeof(s8) * size); + u16 *u16data_x = (u16 *)malloc(sizeof(u16) * size); + u16 *u16data_y = (u16 *)malloc(sizeof(u16) * size); + u8 *u8src_data_x; + u8 *u8src_data_y; + + if(fmt == FMT_BF16) { + /* bf16*/ + float val = -100; + for(u64 i = 0; i < size; i++) { + u16data_x[i] = generate_bf16_corner_val(val); + u16data_y[i] = generate_bf16_corner_val(val); + val += 0.1; + } + u8src_data_x = (u8 *)u16data_x; + u8src_data_y = (u8 *)u16data_y; + } else { + /* int8 -> bf16*/ + for(u64 i = 0; i < size; i++) { + s8data_x[i] = i-100; + s8data_y[i] = -i; + } + u8src_data_x = (u8 *)s8data_x; + u8src_data_y = (u8 *)s8data_y; + } + /* + * Interleave two tensors in case the same devmem is reused between + * put_tensor_g2l() and get_tensor_l2g(), in which case the content of + * devmem is already what is expected before bmk1822_gdma_store(bk_ctx, ). 
+ */ + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + tg_shape_t ts_shape; + ts_shape.n = n; + ts_shape.c = c; + ts_shape.h = h; + ts_shape.w = w; + + tl_t *tl_x = alloc_tl( bk_ctx, tl_shape, fmt, 1); + tl_t *tl_y = alloc_tl( bk_ctx, tl_shape, fmt, 1); + + tg_t ts_x; + ts_x.base_reg_index = 0; + ts_x.start_address = 0; + ts_x.shape = ts_shape; + ts_x.stride = bmk1822_tensor_tgmem_default_stride(ts_shape, fmt); + + put_bf16_tensor_g2l( ctx, bk_ctx, tl_x, (u16 *)u8src_data_x, fmt); + put_bf16_tensor_g2l( ctx, bk_ctx, tl_y, (u16 *)u8src_data_y, fmt); + + u8 *result_x = get_bf16_tensor_l2g( ctx, bk_ctx, tl_x, fmt); + u8 *result_y = get_bf16_tensor_l2g( ctx, bk_ctx, tl_y, fmt); + + for (u64 i = 0; i < size; i++) { + if (result_x[i] != u8src_data_x[i]) { + printf("compare 1 failed at result_x[%d]\n", (int)i); + exit(-1); + } + if (result_y[i] != u8src_data_y[i]) { + printf("compare 1 failed at result_y[%d]\n", (int)i); + exit(-1); + } + } + free(result_x); + free(result_y); + + /* + * Get result_y before result_x. + */ + + + result_y = get_bf16_tensor_l2g(ctx, bk_ctx, tl_y, fmt); + result_x = get_bf16_tensor_l2g(ctx, bk_ctx, tl_x, fmt); + for (u64 i = 0; i < size; i++) { + if (result_x[i] != u8src_data_x[i]) { + printf("compare 2 failed at result_x[%d]\n", (int)i); + exit(-1); + } + if (result_y[i] != u8src_data_y[i]) { + printf("compare 2 failed at result_y[%d]\n", (int)i); + exit(-1); + } + } + free(result_x); + free(result_y); + + free_tl(bk_ctx, tl_y); + free_tl(bk_ctx, tl_x); + + free(s8data_x); + free(s8data_y); + free(u16data_x); + free(u16data_y); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + + test_init(&ctx, &bk_ctx); + + for (u32 i = 0; i < nr_fmt; i++) { + test_put_and_get_tensor_l2g(&ctx, bk_ctx, input_fmt[i].src_fmt); + } + test_exit(&ctx); + + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_g2g_bf16_tensor_copy.cpp b/cviruntime/test/1822/bf16/test_1822_g2g_bf16_tensor_copy.cpp new file mode 100644 index 000000000..e1b1726e7 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_g2g_bf16_tensor_copy.cpp @@ -0,0 +1,120 @@ +#include "../1822_test_util.h" +#include "1822_bf16_util.h" + +typedef bmk1822_tdma_tg2tg_tensor_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tg_shape_t src_shape; + tg_stride_t src_stride; + tg_shape_t dst_shape; + tg_stride_t dst_stride; +} case_t; + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_I8}, +}; + +static case_t g_cases[] = { + { + {1, 3, 3, 3}, {27, 9, 3}, + {1, 3, 3, 3}, {27, 9, 3}, + }, + { + // YOLOv2 concat layer + {1, 256, 19, 19}, {92416, 361, 19}, + {1, 256, 19, 19}, {462080, 361, 19}, + } +}; + +static void test_param_g2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + + u64 size = p->src->shape.c * p->src->shape.h * p->src->shape.w; + + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * size); + u8 *u8src_data = (u8 *)malloc(sizeof(u8) * size); + u8 *src_data; + + if(p->src->fmt == FMT_BF16) { + /* bf16*/ + float val = -100; + for(u64 i = 0; i < size; i++) { + u16src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } + 
src_data = (u8*)u16src_data; + } else { + /* int8 -> bf16*/ + for(u64 i = 0; i < size; i++) { + u8src_data[i] = 200 + i; + } + src_data = u8src_data; + } + + put_tg_bf16_gmem(ctx, p->src, src_data); + + bmk1822_tdma_tg2tg_bf16_tensor_copy(bmk, p); + + test_submit(ctx); + + u8 *dst_data = get_tg_bf16_gmem(ctx, p->dst); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != src_data[i]) { + fprintf(stderr, "comparing(%d->%d) failed at dst[%" PRIu64 "], got %x, exp %x\n", + p->src->fmt, p->dst->fmt, i, dst_data[i], src_data[i]); + exit(-1); + } + } + + free(u8src_data); + free(u16src_data); + free(dst_data); +} + +static void destroy_param_g2g(bmctx_t *ctx, param_t *p) +{ + free_tg_gmem(ctx, p->src); + free_tg_gmem(ctx, p->dst); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + + for (u32 i = 0; i < nr_fmt; i++) { + param_t p; + memset(&p, 0, sizeof(p)); + p.src = alloc_tg_bf16_gmem(ctx, c->src_shape, input_fmt[i].src_fmt); + p.dst = alloc_tg_bf16_gmem(ctx, c->dst_shape, input_fmt[i].dst_fmt); + test_param_g2g(ctx, bmk, &p); + destroy_param_g2g(ctx, &p); + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} \ No newline at end of file diff --git a/cviruntime/test/1822/bf16/test_1822_get_bf16_matrix_stride.cpp b/cviruntime/test/1822/bf16/test_1822_get_bf16_matrix_stride.cpp new file mode 100644 index 000000000..d5d8b38ce --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_get_bf16_matrix_stride.cpp @@ -0,0 +1,185 @@ +#include "../1822_test_util.h" +#include "1822_bf16_util.h" + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_I8}, +}; + +static void get_matrix_l2g_stride_ref( + void *ref, + void *a, + ml_shape_t ml_shape, + bmk1822_matrix_tgmem_stride_t gmem_stride, + fmt_t fmt) +{ + int row = ml_shape.n; + int col = ml_shape.col; + int row_stride = gmem_stride.row / ((fmt == FMT_BF16) ?2:1); + int stride_size = row * row_stride; + u16 *u16_ref = NULL; + u16 *u16_src = NULL; + u8 *u8_ref = NULL; + u8 *u8_src = NULL; + + if (fmt == FMT_BF16) { + u16_ref = (u16 *)ref; + u16_src = (u16 *)a; + for (int i = 0; i < stride_size; i++) + u16_ref[i] = 0xaf; + } else { + u8_ref = (u8 *)ref; + u8_src = (u8 *)a; + for (int i = 0; i < stride_size; i++) + u8_ref[i] = 0xaf; + } + + for (int ri = 0; ri < row; ri++) { + for (int ci = 0; ci < col; ci++) { + if (fmt == FMT_BF16) { + u16_ref[ri * row_stride + ci] = u16_src[ri * col + ci]; + } else { + u8_ref[ri * row_stride + ci] = u8_src[ri * col + ci]; + } + } + } + + if (fmt == FMT_BF16) { + ref = (void *)u16_ref; + } else { + ref = (void *)u8_ref; + } +} + +static u8 * get_matrix_l2g_stride( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + ml_t *ml, + bmk1822_matrix_tgmem_stride_t mg_stride, + fmt_t fmt) +{ + int row = ml->shape.n; + int row_stride = mg_stride.row; + int col = ml->shape.col; + int stride_size = row * row_stride; + + u8 *data = NULL; + u8 *u8data = (u8 *)malloc(sizeof(u8) * stride_size); + u16 *u16data = (u16 *)malloc(sizeof(u16) * stride_size); + if (!u8data || !u16data) { + free(u8data); + free(u16data); + return NULL; + } + + for (int i = 0; i < stride_size; i++) + { + if(fmt == FMT_BF16) { + u16data[i] = 0xaf; + } else { + u8data[i] = 0xaf; + } + } + + if (fmt == FMT_BF16) { + data = (u8 *)u16data; + free(u8data); + } else { + data = u8data; 
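+ // keep only the buffer that matches the element width; the 0xaf fill
+ // lets the comparison detect any byte the strided store should not
+ // have touched.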
+ free(u16data); + } + + bmshape_t bms = BM_TENSOR_WITH_FMT( row, row_stride, 1, 1, + (fmt == FMT_BF16)? BM_FMT_BF16 : BM_FMT_INT8); + bmmem_device_t devmem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + int ret = bm_memcpy_s2d(*ctx, devmem, data); + assert(ret == BM_SUCCESS); + + mg_t mg; + mg.base_reg_index = 0; + mg.start_address = bmmem_device_addr(devmem); + mg.shape.row = row; + mg.shape.col = col; + mg.stride = mg_stride; + mg.fmt = fmt; + + bmk1822_tdma_l2tg_matrix_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = ml; + p.dst = &mg; + + bmk1822_tdma_l2g_bf16_matrix_copy(bk_ctx, &p); + test_submit(ctx); + + ret = bm_memcpy_d2s(*ctx, data, devmem); + assert(ret == BM_SUCCESS); + + bmmem_device_free(*ctx, devmem); + return data; +} + +static void test_get_matrix_l2g_stride( + bmctx_t *ctx, bmk_ctx_t *bk_ctx, fmt_t fmt, int eu_align) +{ + int row = 80; + int col = 70; + float val = -100; + int size = row * col; + int row_stride = col * 2; + int stride_size = row * row_stride; + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * size); + s8 *s8src_data = (s8 *)malloc(sizeof(s8) * size); + void *src_data = NULL; + u8 *result_x = NULL; + void *ref_x = NULL; + + ml_shape_t ml_shape = bmk1822_matrix_lmem_default_shape(bk_ctx, row, col, fmt); + bmk1822_matrix_tgmem_stride_t gmem_stride; + gmem_stride.row = row_stride * ((fmt == FMT_BF16) ?2:1); + + // prepare source data + for (int i = 0; i < size; i++) { + if(fmt == FMT_BF16) { + u16src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } else { + s8src_data[i] = i; + } + } + ref_x = (u8 *)xmalloc(stride_size * ((fmt == FMT_BF16) ?2:1)); + src_data = (fmt == FMT_BF16) ? (void *)u16src_data : (void *)s8src_data; + + // run tpu operations + ml_t *ml_x = bmk1822_lmem_alloc_matrix(bk_ctx,ml_shape, fmt, eu_align); + put_bf16_matrix_g2l(ctx, bk_ctx, ml_x, (u8 *)src_data, fmt); + result_x = get_matrix_l2g_stride(ctx, bk_ctx, ml_x, gmem_stride, fmt); + get_matrix_l2g_stride_ref(ref_x, src_data, ml_shape, gmem_stride, fmt); + + // compare data + if( COMPARE_PASS != compare_result( ref_x, result_x, fmt, stride_size)) + exit(-1); + + // free variables + bmk1822_lmem_free_matrix(bk_ctx, ml_x); + free(s8src_data); + free(u16src_data); + free(ref_x); + free(result_x); +} + +#define TEST_ALIGNED 2 // 1: test unalign only, 2: test both align/unalign +int main (void) +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + + test_init(&ctx, &bk_ctx); + for (u32 i = 0; i < nr_fmt; i++) { + for (u8 u8_align = 0; u8_align < TEST_ALIGNED; u8_align++) { + test_get_matrix_l2g_stride(&ctx, bk_ctx, input_fmt[i].src_fmt, u8_align); + } + } + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_get_bf16_tensor_gl_stride.cpp b/cviruntime/test/1822/bf16/test_1822_get_bf16_tensor_gl_stride.cpp new file mode 100644 index 000000000..dfecab9e0 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_get_bf16_tensor_gl_stride.cpp @@ -0,0 +1,225 @@ +#include "../1822_test_util.h" +#include "1822_bf16_util.h" + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_I8}, +}; + +static void get_tensor_l2g_stride_ref( + void *ref, void *a, + tl_shape_t tl_shape, + bmk1822_tensor_lmem_stride_t tl_stride, + bmk1822_tensor_tgmem_stride_t tg_stride, + fmt_t fmt) +{ + int nsrc_byte = 1; + int n = tl_shape.n; + int c = tl_shape.c; + int h = tl_shape.h; + int w = tl_shape.w; + u16 *u16_ref = NULL; + u16 *u16_src = NULL; + u8 *u8_ref = NULL; + u8 *u8_src = NULL; + int stride_size = n * 
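+ // tg_stride.n is expressed in bytes, so stride_size is a byte-based
+ // count; the ref and result buffers are allocated consistently from it
+ // and pre-filled with 0xcf so untouched gap bytes remain detectable.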
tg_stride.n; + + if (fmt == FMT_BF16) { + nsrc_byte = 2; // FMT_BF16 + u16_ref = (u16 *)ref; + u16_src = (u16 *)a; + } else { + nsrc_byte = 1; // FMT_U8, FMT_I8 + u8_ref = (u8 *)ref; + u8_src = (u8 *)a; + } + + int n_str = tg_stride.n / nsrc_byte; + int c_str = tg_stride.c / nsrc_byte; + int h_str = tg_stride.h / nsrc_byte; + int w_str = 1; + + for (int i = 0; i < stride_size; i++) { + if (fmt == FMT_BF16) { + u16_ref[i] = 0xcf; + } else { + u8_ref[i] = 0xcf; + } + } + + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < c; ci++) { + for (int hi = 0; hi < h; hi++) { + for (int wi = 0; wi < w; wi++) { + u64 src_i = (ni * c + ci) * tl_stride.c/nsrc_byte + hi * tl_stride.h/nsrc_byte + wi * 1; + u64 dst_i = ni * n_str + ci * c_str + hi * h_str + wi * w_str; + if (fmt == FMT_BF16) { + u16_ref[dst_i] = u16_src[src_i]; + } else { + u8_ref[dst_i] = u8_src[src_i]; + } + } + } + } + } + if (fmt == FMT_BF16) { + ref = (void *)u16_ref; + } else { + ref = (void *)u8_ref; + } +} + +static inline u8 * get_tensor_l2g_stride( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + tl_t *tl, + bmk1822_tensor_tgmem_stride_t tg_stride, + fmt_t fmt) +{ + u8 *data = NULL; + int n = tl->shape.n; + int n_stride = tg_stride.n; + int stride_size = n * n_stride; + u16 *u16_data = (u16 *)malloc(sizeof(u16) * stride_size); + u8 *u8_data = (u8 *)malloc(sizeof(u8) * stride_size); + if (!u16_data || !u8_data) { + free(u16_data); + free(u8_data); + return NULL; + } + + for (int i = 0; i < stride_size; i++) { + if (fmt == FMT_BF16) { + u16_data[i] = 0xcf; + } else { + u8_data[i] = 0xcf; + } + } + + if (fmt == FMT_BF16) { + data = (u8 *)u16_data; + free(u8_data); + } else { + data = u8_data; + free(u16_data); + } + + bmshape_t bms = BM_TENSOR_WITH_FMT(n, n_stride, 1, 1, + (fmt == FMT_BF16)? BM_FMT_BF16 : BM_FMT_INT8); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (u8 *)data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = fmt; + tg.shape.n = tl->shape.n; + tg.shape.c = tl->shape.c; + tg.shape.h = tl->shape.h; + tg.shape.w = tl->shape.w; + tg.stride = tg_stride; + tg.base_reg_index = 0; + + bmk1822_tdma_l2tg_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = tl; + p.dst = &tg; + bmk1822_tdma_l2g_bf16_tensor_copy(bk_ctx, &p); + + test_submit(ctx); + + ret = bm_memcpy_d2s(*ctx, (u8 *)data, dev_mem); + assert(ret == BM_SUCCESS); + bmmem_device_free(*ctx, dev_mem); + + return data; +} + +static void test_get_tensor_l2g_gl_stride( + bmctx_t *ctx, bmk_ctx_t *bk_ctx, fmt_t fmt, int eu_align) +{ + tl_t *tl_x = NULL; + int n = 2; + int c = 35; + int h = 2; + int w = 3; + + tg_shape_t tg_shape; + tg_shape.n = n; + tg_shape.c = c; + tg_shape.h = h; + tg_shape.w = w; + + bmk1822_tensor_tgmem_stride_t tg_stride = + bmk1822_tensor_tgmem_default_stride( tg_shape, fmt); + + int stride_size = n * tg_stride.n; + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h * w; + tl_shape.w = 1; + float val = -100; + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * stride_size); + s8 *s8src_data = (s8 *)malloc(sizeof(s8) * stride_size); + void *src_data; + u8 *result_x = NULL; + void *ref_x = NULL; + u8 *u8ref_x = NULL; + u16 *u16ref_x = NULL; + + // prepare source data + ref_x = (u8 *)xmalloc(stride_size * ((fmt == FMT_BF16) ?2:1) ); + for (int i = 0; i < stride_size; i++) { + if(fmt == FMT_BF16) { + u16src_data[i] = generate_bf16_corner_val(val); + val 
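+        // advance the seed so generate_bf16_corner_val() produces a
+        // different bf16 corner-case value on every iteration.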
+= 0.1; + } else { + s8src_data[i] = i; + } + } + src_data = (fmt == FMT_BF16) ? (void *)u16src_data : (void *)s8src_data; + + // run tpu operations + tl_x = alloc_tl( bk_ctx, tl_shape, fmt, eu_align); + put_bf16_tensor_g2l( ctx, bk_ctx, tl_x, (u16 *)src_data, fmt); + tl_x->shape.n = n; + tl_x->shape.c = c; + tl_x->shape.h = h; + tl_x->shape.w = w; + tl_x->stride = bmk1822_tensor_lmem_default_stride(bk_ctx, tl_x->shape, fmt, eu_align); + result_x = get_tensor_l2g_stride(ctx, bk_ctx, tl_x, tg_stride, fmt); + get_tensor_l2g_stride_ref( ref_x, src_data, tl_x->shape, tl_x->stride, tg_stride, fmt); + + // compare data + compare_result( ref_x, result_x, fmt, stride_size); + + // free variables + free_tl(bk_ctx, tl_x); + free(ref_x); + free(result_x); + free(u8ref_x); + free(u16ref_x); + free(s8src_data); + free(u16src_data); +} + +#define TEST_ALIGNED 1 // 1: test unalign only, 2: test both align/unalign +int main (void) +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + + test_init(&ctx, &bk_ctx); + for (u32 i = 0; i < nr_fmt; i++) { + for (u8 u8_align = 0; u8_align < TEST_ALIGNED; u8_align++) { + test_get_tensor_l2g_gl_stride(&ctx, bk_ctx, input_fmt[i].src_fmt, u8_align); + } + } + test_exit(&ctx); + + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_get_bf16_tensor_stride.cpp b/cviruntime/test/1822/bf16/test_1822_get_bf16_tensor_stride.cpp new file mode 100644 index 000000000..be3e6e0af --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_get_bf16_tensor_stride.cpp @@ -0,0 +1,212 @@ +#include "../1822_test_util.h" +#include "1822_bf16_util.h" + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_I8}, +}; + +static void get_tensor_l2g_stride_ref( + void *ref, + void *a, + tl_shape_t tl_shape, + bmk1822_tensor_tgmem_stride_t tg_stride, + fmt_t fmt) +{ + int n = tl_shape.n; + int c = tl_shape.c; + int h = tl_shape.h; + int w = tl_shape.w; + int nsrc_byte = 1; + u16 *u16_ref = NULL; + u16 *u16_src = NULL; + u8 *u8_ref = NULL; + u8 *u8_src = NULL; + int stride_size = n * tg_stride.n; + + if (fmt == FMT_BF16) { + nsrc_byte = 2; // FMT_BF16 + u16_ref = (u16 *)ref; + u16_src = (u16 *)a; + } else { + nsrc_byte = 1; // FMT_U8, FMT_I8 + u8_ref = (u8 *)ref; + u8_src = (u8 *)a; + } + int n_str = tg_stride.n / nsrc_byte; + int c_str = tg_stride.c / nsrc_byte; + int h_str = tg_stride.h / nsrc_byte; + int w_str = 1; + + for (int i = 0; i < stride_size; i++) { + if (fmt == FMT_BF16) { + u16_ref[i] = 0xcf; + } else { + u8_ref[i] = 0xcf; + } + } + + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < c; ci++) { + for (int hi = 0; hi < h; hi++) { + for (int wi = 0; wi < w; wi++) { + u64 src_i = ni * c * h * w + ci * h * w + hi * w + wi * w_str; + u64 dst_i = ni * n_str + ci * c_str + hi * h_str + wi * w_str; + if (fmt == FMT_BF16) { + u16_ref[dst_i] = u16_src[src_i]; + } else { + u8_ref[dst_i] = u8_src[src_i]; + } + } + } + } + } + + if (fmt == FMT_BF16) { + ref = (void *)u16_ref; + } else { + ref = (void *)u8_ref; + } +} + +static inline u8 * get_tensor_l2g_stride( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + tl_t *tl, + bmk1822_tensor_tgmem_stride_t tg_stride, + fmt_t fmt) +{ + u8 *data = NULL; + int n = tl->shape.n; + int n_stride = tg_stride.n; + int stride_size = n * n_stride; + u16 *u16_data = (u16 *)malloc(sizeof(u16) * stride_size); + u8 *u8_data = (u8 *)malloc(sizeof(u8) * stride_size); + if (!u16_data || !u8_data) { + free(u16_data); + free(u8_data); + return NULL; + } + + for (int i = 0; i < stride_size; i++) { + if (fmt 
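+    // poison the whole buffer with the 0xcf sentinel first, so any byte
+    // the TDMA copy leaves untouched (stride padding) shows up on compare.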
== FMT_BF16) { + u16_data[i] = 0xcf; + } else { + u8_data[i] = 0xcf; + } + } + + if (fmt == FMT_BF16) { + data = (u8 *)u16_data; + free(u8_data); + } else { + data = u8_data; + free(u16_data); + } + + bmshape_t bms = BM_TENSOR_WITH_FMT(n, n_stride, 1, 1, + (fmt == FMT_BF16)? BM_FMT_BF16 : BM_FMT_INT8); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (u8 *)data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = fmt; + tg.shape.n = tl->shape.n; + tg.shape.c = tl->shape.c; + tg.shape.h = tl->shape.h; + tg.shape.w = tl->shape.w; + tg.stride = tg_stride; + + bmk1822_tdma_l2tg_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = tl; + p.dst = &tg; + bmk1822_tdma_l2g_bf16_tensor_copy(bk_ctx, &p); + + test_submit(ctx); + + ret = bm_memcpy_d2s(*ctx, (u8 *)data, dev_mem); + assert(ret == BM_SUCCESS); + bmmem_device_free(*ctx, dev_mem); + + return data; +} + +static void test_get_tensor_l2g_stride( + bmctx_t *ctx, bmk_ctx_t *bk_ctx, fmt_t fmt, int eu_align) +{ + tl_t *tl_x = NULL; + int n = 2; + int c = 15; + int h = 10; + int w = 8; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + bmk1822_tensor_tgmem_stride_t tg_stride; + tg_stride.h = w * 2; + tg_stride.c = tg_stride.h * h * 2; + tg_stride.n = tg_stride.c * c * 2; + int stride_size = n * tg_stride.n; + + float val = -100; + void *src_data = NULL; + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * stride_size); + s8 *s8src_data = (s8 *)malloc(sizeof(s8) * stride_size); + u8 *result_x = NULL; + void *ref_x = NULL; + + // prepare source data + ref_x = (u8 *)xmalloc(stride_size * ((fmt == FMT_BF16) ?2:1) ); + for (int i = 0; i < stride_size; i++) { + if(fmt == FMT_BF16) { + u16src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } else { + s8src_data[i] = i; + } + } + src_data = (fmt == FMT_BF16) ? 
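+  // only the buffer matching fmt was populated above; alias it as the
+  // generic source pointer for the g2l transfer and the reference.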
(void *)u16src_data : (void *)s8src_data; + + // run tpu operations + tl_x = alloc_tl(bk_ctx, tl_shape, fmt, eu_align); + put_bf16_tensor_g2l(ctx, bk_ctx, tl_x, (u16 *)src_data, fmt); + result_x = get_tensor_l2g_stride(ctx, bk_ctx ,tl_x, tg_stride, fmt); + get_tensor_l2g_stride_ref( ref_x, src_data, tl_shape, tg_stride, fmt); + + // compare data + compare_result( ref_x, result_x, fmt, stride_size); + + // free variables + free_tl(bk_ctx, tl_x); + free(ref_x); + free(result_x); + free(s8src_data); + free(u16src_data); +} + +#define TEST_ALIGNED 2 // 1: test unalign only, 2: test both align/unalign +int main (void) +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + + test_init(&ctx, &bk_ctx); + for (u32 i = 0; i < nr_fmt; i++) { + for (u8 u8_align = 0; u8_align < TEST_ALIGNED; u8_align++) { + test_get_tensor_l2g_stride(&ctx, bk_ctx, input_fmt[i].src_fmt, u8_align); + } + } + test_exit(&ctx); + + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_get_bf16_tensor_stride_unalign.cpp b/cviruntime/test/1822/bf16/test_1822_get_bf16_tensor_stride_unalign.cpp new file mode 100644 index 000000000..7a5c5cbd0 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_get_bf16_tensor_stride_unalign.cpp @@ -0,0 +1,232 @@ +#include "../1822_test_util.h" +#include "1822_bf16_util.h" + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_I8}, +}; + +static void get_tensor_l2g_stride_unalign_ref( + void *ref, + void *a, + tl_shape_t tl_shape, + bmk1822_tensor_tgmem_stride_t gmem_stride, + fmt_t fmt) +{ + int n = tl_shape.n; + int c = tl_shape.c; + int h = tl_shape.h; + int w = tl_shape.w; + int nsrc_byte = 1; + int new_n = n * 2; + int new_h = h / 2; + u16 *u16_ref = NULL; + u16 *u16_src = NULL; + u8 *u8_ref = NULL; + u8 *u8_src = NULL; + + if (fmt == FMT_BF16) { + nsrc_byte = 2; // FMT_BF16 + u16_ref = (u16 *)ref; + u16_src = (u16 *)a; + } else { + nsrc_byte = 1; // FMT_U8, FMT_I8 + u8_ref = (u8 *)ref; + u8_src = (u8 *)a; + } + int n_str = gmem_stride.n / nsrc_byte; + int c_str = gmem_stride.c / nsrc_byte; + int h_str = gmem_stride.h / nsrc_byte; + /* + * Same as in get_tensor_l2g_stride_unalign(). 
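+   * The destination must cover new_n rows of gmem_stride.n bytes each,
+   * since the copy writes through the reshaped (n * 2, c, h / 2, w) view.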
+ */ + int stride_size = new_n * gmem_stride.n; + for (int i = 0; i < stride_size; i++) { + if (fmt == FMT_BF16) { + u16_ref[i] = 0xcf; + } else { + u8_ref[i] = 0xcf; + } + } + /* + * (n, c, h, w) => (n * 2, c, h / 2, w) + */ + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < c; ci++) { + for (int hi = 0; hi < h; hi++) { + for (int wi = 0; wi < w; wi++) { + u64 src_i = ni * c * h * w + ci * h * w + hi * w + wi; + u64 dst_i = (ni * 2 + hi / new_h) * n_str + + ci * c_str + (hi % new_h) * h_str + wi; + if (fmt == FMT_BF16) { + u16_ref[dst_i] = u16_src[src_i]; + } else { + u8_ref[dst_i] = u8_src[src_i]; + } + } + } + } + } + + if (fmt == FMT_BF16) { + ref = (void *)u16_ref; + } else { + ref = (void *)u8_ref; + } +} + +static inline u8 * get_tensor_l2g_stride( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + tl_t *tl, + bmk1822_tensor_tgmem_stride_t tg_stride, + fmt_t fmt) +{ + bmk1822_tdma_l2tg_tensor_copy_param_t p; + int n = tl->shape.n; + int n_stride = tg_stride.n; + int stride_size = n * n_stride; + u8 *data = NULL; + u16 *u16_data = (u16 *)malloc(sizeof(u16) * stride_size); + u8 *u8_data = (u8 *)malloc(sizeof(u8) * stride_size); + if (!u16_data || !u8_data) { + free(u16_data); + free(u8_data); + return NULL; + } + + for (int i = 0; i < stride_size; i++) { + if (fmt == FMT_BF16) { + u16_data[i] = 0xcf; + } else { + u8_data[i] = 0xcf; + } + } + + if (fmt == FMT_BF16) { + data = (u8 *)u16_data; + free(u8_data); + } else { + data = u8_data; + free(u16_data); + } + + bmshape_t bms = BM_TENSOR_WITH_FMT(n, n_stride, 1, 1, + (fmt == FMT_BF16)? BM_FMT_BF16 : BM_FMT_INT8); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (u8 *)data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = fmt; + tg.shape.n = tl->shape.n; + tg.shape.c = tl->shape.c; + tg.shape.h = tl->shape.h; + tg.shape.w = tl->shape.w; + tg.stride = tg_stride; + tg.base_reg_index = 0; + + memset(&p, 0, sizeof(p)); + p.src = tl; + p.dst = &tg; + + bmk1822_tdma_l2g_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + ret = bm_memcpy_d2s(*ctx, (u8 *)data, dev_mem); + assert(ret == BM_SUCCESS); + bmmem_device_free(*ctx, dev_mem); + + return data; +} + +static void test_get_tensor_l2g_stride_unalign( + bmctx_t *ctx, bmk_ctx_t *bk_ctx, fmt_t fmt) +{ + bmk1822_tensor_tgmem_stride_t tg_stride; + /* + * Make sure (h / 2 * w) is not eu-aligned. + */ + int n = 1; + int c = 5; + int h = 18; + int w = 7; + tl_t *tl_x = NULL; + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + int new_n = n * 2; + int new_h = h / 2; + tg_stride.h = w * 2; + tg_stride.c = w * 2 * new_h * 2; + tg_stride.n = w * 2 * new_h * 2 * c * 2; + + float val = -100; + int stride_size = new_n * tg_stride.n; + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * stride_size); + s8 *s8src_data = (s8 *)malloc(sizeof(s8) * stride_size); + void *src_data = NULL; + u8 *result_x = NULL; + void *ref_x = NULL; + u8 *u8ref_x = NULL; + u16 *u16ref_x = NULL; + + // prepare source data + ref_x = (u8 *)xmalloc(stride_size * ((fmt == FMT_BF16) ?2:1) ); + for (int i = 0; i < stride_size; i++) { + if(fmt == FMT_BF16) { + u16src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } else { + s8src_data[i] = i; + } + } + src_data = (fmt == FMT_BF16) ? 
(void *)u16src_data : (void *)s8src_data; + + // run tpu operations + tl_x = alloc_tl(bk_ctx, tl_shape, fmt, 1); + put_bf16_tensor_g2l(ctx, bk_ctx, tl_x, (u16 *)src_data, fmt); + tl_x->shape.n = new_n; + tl_x->shape.c = c; + tl_x->shape.h = new_h; + tl_x->shape.w = w; + tl_x->stride = bmk1822_tensor_lmem_default_stride(bk_ctx, tl_x->shape, fmt, 0); + result_x = get_tensor_l2g_stride(ctx, bk_ctx, tl_x, tg_stride, fmt); + tl_x->shape = tl_shape; + tl_x->stride = bmk1822_tensor_lmem_default_stride(bk_ctx, tl_x->shape, fmt, 1); + get_tensor_l2g_stride_unalign_ref(ref_x, (u16 *)src_data, tl_shape, tg_stride, fmt); + + // compare data + compare_result( ref_x, result_x, fmt, stride_size); + + // free variables + free_tl(bk_ctx, tl_x); + free(ref_x); + free(result_x); + free(u8ref_x); + free(u16ref_x); + free(s8src_data); + free(u16src_data); +} + +#define TEST_ALIGNED 1 // 1: test unalign only, 2: test both align/unalign +int main (void) +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + + test_init(&ctx, &bk_ctx); + for (u32 i = 0; i < nr_fmt; i++) { + for (u8 u8_align = 0; u8_align < TEST_ALIGNED; u8_align++) { + test_get_tensor_l2g_stride_unalign(&ctx, bk_ctx, input_fmt[i].src_fmt); + } + } + test_exit(&ctx); + + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_put_bf16_matrix_stride.cpp b/cviruntime/test/1822/bf16/test_1822_put_bf16_matrix_stride.cpp new file mode 100644 index 000000000..ab4fa16dd --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_put_bf16_matrix_stride.cpp @@ -0,0 +1,151 @@ +#include "../1822_test_util.h" +#include "1822_bf16_util.h" + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_I8}, +}; + +static void put_matrix_g2l_stride_ref( + void *ref, + void *a, + ml_shape_t lmem_shape, + bmk1822_matrix_tgmem_stride_t gmem_stride, + fmt_t fmt) +{ + int row = lmem_shape.n; + int col = lmem_shape.col; + int row_stride = gmem_stride.row / ((fmt == FMT_BF16) ?2:1); + u16 *u16_ref = NULL; + u16 *u16_src = NULL; + u8 *u8_ref = NULL; + u8 *u8_src = NULL; + + if (fmt == FMT_BF16) { + u16_ref = (u16 *)ref; + u16_src = (u16 *)a; + } else { + u8_ref = (u8 *)ref; + u8_src = (u8 *)a; + } + + for (int ri = 0; ri < row; ri++) { + for (int ci = 0; ci < col; ci++) { + if (fmt == FMT_BF16) { + u16_ref[ri * col + ci] = u16_src[ri * row_stride + ci]; + } else { + u8_ref[ri * col + ci] = u8_src[ri * row_stride + ci]; + } + } + } + + if (fmt == FMT_BF16) { + ref = (void *)u16_ref; + } else { + ref = (void *)u8_ref; + } +} + +static void put_matrix_g2l_stride( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + ml_t *ml, + bmk1822_matrix_tgmem_stride_t gmem_stride, + void *data, + fmt_t fmt) +{ + int row = ml->shape.n; + int col = ml->shape.col; + int row_stride = gmem_stride.row; + + bmshape_t bms = BM_MATRIX_INT16(row, row_stride ); + bmmem_device_t devmem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + int ret = bm_memcpy_s2d(*ctx, devmem, (u8 *)data); + assert(ret == BM_SUCCESS); + + gaddr_t gaddr = bmmem_device_addr(devmem); + mg_t mg; + mg.base_reg_index = 0; + mg.start_address = gaddr; + mg.shape.row = row; + mg.shape.col = col; + mg.stride = gmem_stride; + mg.fmt = fmt; + mg.base_reg_index = 0; + + bmk1822_tdma_tg2l_matrix_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.dst = ml; + p.src = &mg; + bmk1822_tdma_g2l_bf16_matrix_copy(bk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, devmem); + return ; +} + +static void test_put_matrix_g2l_stride( + bmctx_t *ctx, bmk_ctx_t *bk_ctx, fmt_t fmt, int 
eu_align) +{ + int row = 80; + int col = 70; + float val = -100; + int size = row * col; + int row_stride = col * 2; + int stride_size = row * row_stride; + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * stride_size); + s8 *s8src_data = (s8 *)malloc(sizeof(s8) * stride_size); + void *src_data = NULL; + u8 *result_x = NULL; + u8 *ref_x = NULL; + + ml_shape_t mls = bmk1822_matrix_lmem_default_shape(bk_ctx, row, col, fmt); + ml_t *ml = bmk1822_lmem_alloc_matrix(bk_ctx, mls, fmt, eu_align); + bmk1822_matrix_tgmem_stride_t gmem_stride; + gmem_stride.row = row_stride * ((fmt == FMT_BF16) ?2:1); + + // prepare source data + for (int i = 0; i < stride_size; i++) { + if(fmt == FMT_BF16) { + u16src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } else { + s8src_data[i] = i; + } + } + ref_x = (u8 *)xmalloc(size * ((fmt == FMT_BF16) ?2:1)); + src_data = (fmt == FMT_BF16) ? (void *)u16src_data : (void *)s8src_data; + + // run tpu operations + put_matrix_g2l_stride(ctx, bk_ctx, ml, gmem_stride, src_data, fmt); + result_x = get_bf16_matrix_l2g(ctx, bk_ctx, ml, fmt); + put_matrix_g2l_stride_ref(ref_x, src_data, mls, gmem_stride, fmt); + + // compare data + if( COMPARE_PASS != compare_result( ref_x, result_x, fmt, size)) + exit(-1); + + // free variables + bmk1822_lmem_free_matrix(bk_ctx, ml); + free(s8src_data); + free(u16src_data); + free(result_x); + free(ref_x); +} + +#define TEST_ALIGNED 2 // 1: test unalign only, 2: test both align/unalign +int main () +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + + test_init(&ctx, &bk_ctx); + for (u32 i = 0; i < nr_fmt; i++) { + for (u8 u8_align = 0; u8_align < TEST_ALIGNED; u8_align++) { + test_put_matrix_g2l_stride(&ctx, bk_ctx, input_fmt[i].src_fmt, u8_align); + } + } + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_put_bf16_tensor_stride.cpp b/cviruntime/test/1822/bf16/test_1822_put_bf16_tensor_stride.cpp new file mode 100644 index 000000000..5c65a13d9 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_put_bf16_tensor_stride.cpp @@ -0,0 +1,181 @@ +#include "../1822_test_util.h" +#include "1822_bf16_util.h" + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_I8}, +}; + +static void put_tensor_g2l_stride_ref( + void *ref, + void *a, + tl_shape_t lmem_shape, + bmk1822_tensor_tgmem_stride_t gmem_stride, + fmt_t fmt) +{ + uint32_t n = lmem_shape.n; + uint32_t c = lmem_shape.c; + uint32_t h = lmem_shape.h; + uint32_t w = lmem_shape.w; + uint32_t nsrc_byte = 1; + u16 *u16_ref = NULL; + u16 *u16_src = NULL; + u8 *u8_ref = NULL; + u8 *u8_src = NULL; + if (fmt == FMT_BF16) { + nsrc_byte = 2; // FMT_BF16 + u16_ref = (u16 *)ref; + u16_src = (u16 *)a; + } else { + nsrc_byte = 1; // FMT_U8, FMT_I8 + u8_ref = (u8 *)ref; + u8_src = (u8 *)a; + } + uint32_t n_str = gmem_stride.n / nsrc_byte; + uint32_t c_str = gmem_stride.c / nsrc_byte; + uint32_t h_str = gmem_stride.h / nsrc_byte; + uint32_t w_str = 1; + + /* + * put stride ddr tensor to local memory in default stride. 
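+   * src_i walks the strided DDR layout (n_str/c_str/h_str are the byte
+   * strides divided by the element size), while dst_i is the dense
+   * row-major index that local memory ends up holding.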
+ */ + + for (uint32_t ni = 0; ni < n; ni++) { + for (uint32_t ci = 0; ci < c; ci++) { + for (uint32_t hi = 0; hi < h; hi++) { + for (uint32_t wi = 0; wi < w; wi++) { + uint32_t src_i = ni * n_str + ci * c_str + hi * h_str + wi * w_str; + uint32_t dst_i = ni * c * h * w + ci * h * w + hi * w + wi; + if (fmt == FMT_BF16) { + u16_ref[dst_i] = u16_src[src_i]; + } else { + u8_ref[dst_i] = u8_src[src_i]; + } + } + } + } + } + + if (fmt == FMT_BF16) { + ref = (void *)u16_ref; + } else { + ref = (void *)u8_ref; + } +} + +static inline void put_tensor_g2l_stride( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + tl_t *tl, + bmk1822_tensor_tgmem_stride_t tg_stride, + void *data, + fmt_t fmt) +{ + int n = tl->shape.n; + int n_stride = tg_stride.n; + bmshape_t bms = BM_TENSOR_WITH_FMT(n, n_stride, 1, 1, + (fmt == FMT_BF16)? BM_FMT_BF16 : BM_FMT_INT8); + bmmem_device_t devmem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + int ret = bm_memcpy_s2d(*ctx, devmem, (u8 *)data); + assert(ret == BM_SUCCESS); + + gaddr_t gaddr = bmmem_device_addr(devmem); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = fmt; + tg.shape.n = tl->shape.n; + tg.shape.c = tl->shape.c; + tg.shape.h = tl->shape.h; + tg.shape.w = tl->shape.w; + tg.stride = tg_stride; + tg.base_reg_index = 0; + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, devmem); +} + +static void test_put_tensor_g2l_stride( + bmctx_t *ctx, bmk_ctx_t *bk_ctx, fmt_t fmt, int eu_align) +{ + tl_t *tl_x = NULL; + int n = 2; + int c = 15; + int h = 10; + int w = 8; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + bmk1822_tensor_tgmem_stride_t gmem_stride; + gmem_stride.h = w * 2; + gmem_stride.c = gmem_stride.h * h * 2; + gmem_stride.n = gmem_stride.c * c * 2; + + int size = n * c * h * w; + int stride_size = gmem_stride.n * n; + float val = -100; + void *src_data = NULL; + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * stride_size); + s8 *s8src_data = (s8 *)malloc(sizeof(s8) * stride_size); + u8 *result_x = NULL; + u8 *ref_x = NULL; + + // prepare source data + ref_x = (u8 *)xmalloc(size * ((fmt == FMT_BF16) ?2:1) ); + for (int i = 0; i < stride_size; i++) { + if(fmt == FMT_BF16) { + u16src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } else { + s8src_data[i] = i; + } + } + src_data = (fmt == FMT_BF16) ? 
(void *)u16src_data : (void *)s8src_data; + + // run tpu operations + tl_x = alloc_tl(bk_ctx, tl_shape, fmt, eu_align); + put_tensor_g2l_stride(ctx, bk_ctx, tl_x, gmem_stride, (u8 *)src_data, fmt); + result_x = get_bf16_tensor_l2g(ctx, bk_ctx, tl_x, fmt); + put_tensor_g2l_stride_ref(ref_x, src_data, tl_shape, gmem_stride, fmt); + + // compare data + if( COMPARE_PASS != compare_result( ref_x, result_x, fmt, size)) + exit(-1); + + // free variables + free_tl(bk_ctx, tl_x); + free(s8src_data); + free(u16src_data); + free(result_x); + free(ref_x); +} + +#define TEST_ALIGNED 2 // 1: test unalign only, 2: test both align/unalign +int main (void) +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + + test_init(&ctx, &bk_ctx); + for (u32 i = 0; i < nr_fmt; i++) { + for (u8 u8_align = 0; u8_align < TEST_ALIGNED; u8_align++) { + test_put_tensor_g2l_stride(&ctx, bk_ctx, input_fmt[i].src_fmt, u8_align); + } + } + test_exit(&ctx); + + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_put_bf16_tensor_stride_unalign.cpp b/cviruntime/test/1822/bf16/test_1822_put_bf16_tensor_stride_unalign.cpp new file mode 100644 index 000000000..85fd9cd57 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_put_bf16_tensor_stride_unalign.cpp @@ -0,0 +1,181 @@ +#include "../1822_test_util.h" +#include "1822_bf16_util.h" + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_I8}, +}; + +static void put_tensor_g2l_stride_unalign_ref( + void *ref, + void *a, + tl_shape_t tl_shape, + bmk1822_tensor_tgmem_stride_t gmem_stride, + fmt_t fmt) +{ + int n = tl_shape.n; + int c = tl_shape.c; + int h = tl_shape.h; + int w = tl_shape.w; + int nsrc_byte = 1; + u16 *u16_ref = NULL; + u16 *u16_src = NULL; + u8 *u8_ref = NULL; + u8 *u8_src = NULL; + if (fmt == FMT_BF16) { + nsrc_byte = 2; // FMT_BF16 + u16_ref = (u16 *)ref; + u16_src = (u16 *)a; + } else { + nsrc_byte = 1; // FMT_U8, FMT_I8 + u8_ref = (u8 *)ref; + u8_src = (u8 *)a; + } + int n_str = gmem_stride.n / nsrc_byte; + int c_str = gmem_stride.c / nsrc_byte; + int h_str = gmem_stride.h / nsrc_byte; + int w_str = 1; + + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < c; ci++) { + for (int hi = 0; hi < h; hi++) { + for (int wi = 0; wi < w; wi++) { + u64 src_i = ni * n_str + ci * c_str + hi * h_str + wi * w_str; + u64 dst_i = ci * n * h * w + ni * h * w + hi * w + wi; + if (fmt == FMT_BF16) { + u16_ref[dst_i] = u16_src[src_i]; + } else { + u8_ref[dst_i] = u8_src[src_i]; + } + } + } + } + } + + if (fmt == FMT_BF16) { + ref = (void *)u16_ref; + } else { + ref = (void *)u8_ref; + } +} + +static inline void put_tensor_g2l_stride( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + tl_t *tl, + bmk1822_tensor_tgmem_stride_t tg_stride, + void *data, + fmt_t fmt) +{ + int n = tl->shape.n; + int n_stride = tg_stride.n; + bmshape_t bms = BM_TENSOR_WITH_FMT(n, n_stride, 1, 1, + (fmt == FMT_BF16)? 
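+  // declare the device buffer as (n, n_stride, 1, 1) so bmshape_get_size()
+  // covers the entire strided span, padding included.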
BM_FMT_BF16 : BM_FMT_INT8); + bmmem_device_t devmem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + int ret = bm_memcpy_s2d(*ctx, devmem, (u8 *)data); + assert(ret == BM_SUCCESS); + + gaddr_t gaddr = bmmem_device_addr(devmem); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = fmt; + tg.shape.n = tl->shape.n; + tg.shape.c = tl->shape.c; + tg.shape.h = tl->shape.h; + tg.shape.w = tl->shape.w; + tg.stride = tg_stride; + tg.base_reg_index = 0; + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, devmem); +} + +static void test_put_tensor_g2l_stride_unalign( + bmctx_t *ctx, bmk_ctx_t *bk_ctx, fmt_t fmt, int eu_align) +{ + tl_t *tl_x = NULL; + int n = 6; + int c = (BM1822_HW_NPU_NUM/2+1); //just larger than (npu_num/2) + int h = 1; + int w = 8; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + bmk1822_tensor_tgmem_stride_t gmem_stride; + gmem_stride.h = w * 2; + gmem_stride.c = gmem_stride.h * h * 2; + gmem_stride.n = gmem_stride.c * c * 2; + + int size = n * c * h * w; + int stride_size = gmem_stride.n * n; + float val = -100; + void *src_data = NULL; + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * stride_size); + s8 *s8src_data = (s8 *)malloc(sizeof(s8) * stride_size); + u8 *result_x = NULL; + void *ref_x = NULL; + + // prepare source data + ref_x = (u8 *)xmalloc(size * ((fmt == FMT_BF16) ?2:1) ); + for (int i = 0; i < stride_size; i++) { + if(fmt == FMT_BF16) { + u16src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } else { + s8src_data[i] = i; + } + } + src_data = (fmt == FMT_BF16) ? (void *)u16src_data : (void *)s8src_data; + + // run tpu operations + tl_x = alloc_tl(bk_ctx, tl_shape, fmt, eu_align); + put_tensor_g2l_stride(ctx, bk_ctx, tl_x, gmem_stride, (u8 *)src_data, fmt); + tl_x->shape.n = 1; + tl_x->shape.c = c; + tl_x->shape.h = n * h; + tl_x->shape.w = w; + result_x = get_bf16_tensor_l2g(ctx, bk_ctx, tl_x, fmt); + put_tensor_g2l_stride_unalign_ref(ref_x, src_data, tl_shape, gmem_stride, fmt); + + // compare data + if( COMPARE_PASS != compare_result( ref_x, result_x, fmt, size)) + exit(-1); + + // free variables + free_tl(bk_ctx, tl_x); + free(ref_x); + free(result_x); + free(s8src_data); + free(u16src_data); +} + +#define TEST_ALIGNED 1 // 1: test unalign only, 2: test both align/unalign +int main (void) +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + + test_init(&ctx, &bk_ctx); + for (u32 i = 0; i < nr_fmt; i++) { + for (u8 u8_align = 0; u8_align < TEST_ALIGNED; u8_align++) { + test_put_tensor_g2l_stride_unalign(&ctx, bk_ctx, input_fmt[i].src_fmt, u8_align); + } + } + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_put_bf16_tensor_tp_unalign.cpp b/cviruntime/test/1822/bf16/test_1822_put_bf16_tensor_tp_unalign.cpp new file mode 100644 index 000000000..8369220e4 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_put_bf16_tensor_tp_unalign.cpp @@ -0,0 +1,168 @@ +#include "../1822_test_util.h" +#include "1822_bf16_util.h" + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_I8}, +}; + +static void put_tensor_g2l_tp_unalign_ref( + void *ref, + void *a, + tl_shape_t tl_shape, + fmt_t fmt) +{ + /* + * (c, n, h, w) => (n, c, h, w) => (1, c, n * h, w) + */ + int n = tl_shape.n; + int c = tl_shape.c; + int h = tl_shape.h; + int w = 
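+  // The nc-transposed g2l copy followed by the reshaped (1, c, n*h, w)
+  // readback restores source order, so the reference below reduces to a
+  // plain element-wise copy.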
tl_shape.w; + u16 *u16_ref = NULL; + u16 *u16_src = NULL; + u8 *u8_ref = NULL; + u8 *u8_src = NULL; + + int size = n * c * h * w; + + if (fmt == FMT_BF16) { + u16_ref = (u16 *)ref; + u16_src = (u16 *)a; + } else { + u8_ref = (u8 *)ref; + u8_src = (u8 *)a; + } + + for (int i = 0; i < size; i++) + { + if (fmt == FMT_BF16) { + u16_ref[i] = u16_src[i]; + } else { + u8_ref[i] = u8_src[i]; + } + } + + if (fmt == FMT_BF16) { + ref = (void *)u16_ref; + } else { + ref = (void *)u8_ref; + } +} + +static void put_tensor_g2l_tp( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + tl_t *tl, + void *data, + fmt_t fmt) +{ + int n = tl->shape.n; + int c = tl->shape.c; + int h = tl->shape.h; + int w = tl->shape.w; + + bmshape_t bms = BM_TENSOR_WITH_FMT(n, c, h, w, + (fmt == FMT_BF16)? BM_FMT_BF16 : BM_FMT_INT8); + + bmmem_device_t devmem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + int ret = bm_memcpy_s2d(*ctx, devmem, (u8 *)data); + assert(ret == BM_SUCCESS); + + gaddr_t gaddr = bmmem_device_addr(devmem); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = fmt; + tg.shape.n = tl->shape.c; + tg.shape.c = tl->shape.n; + tg.shape.h = tl->shape.h; + tg.shape.w = tl->shape.w; + tg.stride = bmk1822_tensor_tgmem_default_stride(tg.shape, fmt); + tg.base_reg_index = 0; + + bmk1822_tdma_tg2l_tensor_copy_nc_transposed_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_bf16_tensor_copy_nc_transposed(bk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, devmem); +} + +static void test_put_tensor_g2l_tp_unalign( + bmctx_t *ctx, bmk_ctx_t *bk_ctx, fmt_t fmt, int eu_align) +{ + tl_t *tl_x = NULL; + int n = 2; + int c = 15; + int h = 1; + int w = 8; + int size = n * c * h * w; + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + float val = -100; + void *src_data = NULL; + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * size); + s8 *s8src_data = (s8 *)malloc(sizeof(s8) * size); + u8 *result_x = NULL; + u8 *ref_x = NULL; + + // prepare source data + ref_x = (u8 *)xmalloc(size * ((fmt == FMT_BF16) ?2:1) ); + for (int i = 0; i < size; i++) { + if(fmt == FMT_BF16) { + u16src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } else { + s8src_data[i] = i; + } + } + src_data = (fmt == FMT_BF16) ? 
(void *)u16src_data : (void *)s8src_data; + + // run tpu operations + tl_x = alloc_tl(bk_ctx, tl_shape, fmt, eu_align); + put_tensor_g2l_tp(ctx, bk_ctx, tl_x, src_data, fmt); + tl_x->shape.n = 1; + tl_x->shape.c = c; + tl_x->shape.h = n * h; + tl_x->shape.w = w; + result_x = get_bf16_tensor_l2g(ctx, bk_ctx, tl_x, fmt); + tl_x->shape = tl_shape; + put_tensor_g2l_tp_unalign_ref( ref_x, src_data, tl_shape, fmt); + + // compare data + compare_result( ref_x, result_x, fmt, size); + + // free variables + free_tl(bk_ctx, tl_x); + free(ref_x); + free(s8src_data); + free(u16src_data); + free(result_x); +} + +#define TEST_ALIGNED 2 // 1: test unalign only, 2: test both align/unalign +int main (void) +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + + test_init(&ctx, &bk_ctx); + for (u32 i = 0; i < nr_fmt; i++) { + for (u8 u8_align = 0; u8_align < TEST_ALIGNED; u8_align++) { + test_put_tensor_g2l_tp_unalign(&ctx, bk_ctx, input_fmt[i].src_fmt, u8_align); + } + } + test_exit(&ctx); + + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_put_bf16_tensor_unalign.cpp b/cviruntime/test/1822/bf16/test_1822_put_bf16_tensor_unalign.cpp new file mode 100644 index 000000000..3dacc9351 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_put_bf16_tensor_unalign.cpp @@ -0,0 +1,131 @@ +#include "../1822_test_util.h" +#include "1822_bf16_util.h" + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_I8}, +}; + +static void put_tensor_g2l_unalign_ref( + void *ref, + void *a, + tl_shape_t tl_shape, + fmt_t fmt) +{ + int n = tl_shape.n; + int c = tl_shape.c; + int h = tl_shape.h; + int w = tl_shape.w; + u16 *u16_ref = NULL; + u16 *u16_src = NULL; + u8 *u8_ref = NULL; + u8 *u8_src = NULL; + + if (fmt == FMT_BF16) { + u16_ref = (u16 *)ref; + u16_src = (u16 *)a; + } else { + u8_ref = (u8 *)ref; + u8_src = (u8 *)a; + } + /* + * (n, c, h, w) => (1, c, n * h, w) + */ + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < c; ci++) { + for (int hi = 0; hi < h; hi++) { + for (int wi = 0; wi < w; wi++) { + u64 src_i = ni * c * h * w + ci * h * w + hi * w + wi; + u64 dst_i = ci * n * h * w + ni * h * w + hi * w + wi; + if (fmt == FMT_BF16) { + u16_ref[dst_i] = u16_src[src_i]; + } else { + u8_ref[dst_i] = u8_src[src_i]; + } + } + } + } + } + + if (fmt == FMT_BF16) { + ref = (void *)u16_ref; + } else { + ref = (void *)u8_ref; + } +} + +static void test_put_tensor_g2l_unalign( + bmctx_t *ctx, bmk_ctx_t *bk_ctx, fmt_t fmt, int eu_align) +{ + tl_t *tl_x = NULL; + int n = 4; + int c = 9; //just larger than (npu_num/2) + int h = 1; + int w = 8; + int size = n * c * h * w; + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + float val = -100; + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * size); + s8 *s8src_data = (s8 *)malloc(sizeof(s8) * size); + void *src_data; + void *result_x = NULL; + void *ref_x = NULL; + u8 *u8ref_x = NULL; + u16 *u16ref_x = NULL; + + // prepare source data + ref_x = (u8 *)xmalloc(size * ((fmt == FMT_BF16) ?2:1) ); + for (int i = 0; i < size; i++) { + if(fmt == FMT_BF16) { + u16src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } else { + s8src_data[i] = i; + } + } + src_data = (fmt == FMT_BF16) ? 
(void *)u16src_data : (void *)s8src_data; + + // run tpu operations + tl_x = alloc_tl(bk_ctx, tl_shape, fmt, eu_align); + put_bf16_tensor_g2l(ctx, bk_ctx, tl_x, (u16 *)src_data, fmt); + tl_x->shape.n = 1; + tl_x->shape.c = c; + tl_x->shape.h = n * h; + tl_x->shape.w = w; + result_x = get_bf16_tensor_l2g(ctx, bk_ctx, tl_x, fmt); + put_tensor_g2l_unalign_ref(ref_x, src_data, tl_shape, fmt); + + // compare data + compare_result( ref_x, result_x, fmt, size); + + // free variables + free_tl(bk_ctx, tl_x); + free(ref_x); + free(u8ref_x); + free(u16ref_x); + free(s8src_data); + free(u16src_data); + free(result_x); +} + +#define TEST_ALIGNED 1 // 1: test unalign only, 2: test both align/unalign +int main (void) +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + + test_init(&ctx, &bk_ctx); + for (u32 i = 0; i < nr_fmt; i++) { + for (u8 u8_align = 0; u8_align < TEST_ALIGNED; u8_align++) { + test_put_tensor_g2l_unalign(&ctx, bk_ctx, input_fmt[i].src_fmt, u8_align); + } + } + test_exit(&ctx); + + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_tdma_bf16_matrix_vlc_decompress_compress.cpp b/cviruntime/test/1822/bf16/test_1822_tdma_bf16_matrix_vlc_decompress_compress.cpp new file mode 100644 index 000000000..f8b905d2f --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_tdma_bf16_matrix_vlc_decompress_compress.cpp @@ -0,0 +1,192 @@ +#include "../1822_test_util.h" + +typedef bmk1822_tdma_tg2l_matrix_copy_decompressed_param_t decompress_param_t; +typedef bmk1822_tdma_l2tg_matrix_copy_compressed_param_t compress_param_t; + +typedef struct{ + decompress_param_t dec_p; + compress_param_t com_p; +} param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => %s\n", + tag, + p->dec_p.dst->shape.n, p->dec_p.dst->shape.c, p->dec_p.dst->shape.w, p->dec_p.dst->shape.col, + (p->dec_p.dst->fmt == FMT_I8)? "signed": "unsigned"); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + mg_shape_t src_shape; + ml_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 1 }, + { 0, 1, 1, 1 }, + }, + { + { 0, 2 }, + { 0, 1, 2, 2 }, + }, + { + { 0, 2 }, + { 0, 2, 1, 2 }, + }, + { + { 0, 7 }, + { 0, 1, 7, 7 }, + }, +#ifndef ENABEL_SIMPLE_BMK1822_VLC_TEST + { + { 0, 7 }, + { 0, 2, 4, 7 }, + }, + { + { 0, 7 }, + { 0, 7, 1, 7 }, + }, + { + { 0, 17 }, + { 0, 1, 17, 17 }, + }, + { + { 0, 17 }, + { 0, 3, 7, 17 }, + }, + { + { 0, 17 }, + { 0, 17, 1, 17 }, + }, + { + { 0, 60 }, + { 0, 1, 60, 60 }, + }, + { + { 0, 60 }, + { 0, 30, 2, 60 }, + }, + { + { 0, 60 }, + { 0, 60, 1, 60 }, + } +#endif /* ifndef ENABEL_SIMPLE_BMK1822_VLC_TEST*/ +}; + +static void test_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p, u16 *src_data, + CommandInfo* cmd_info) +{ + print_param(stderr, p); + u64 size = ml_shape_size(&p->dec_p.dst->shape); + u64 bytesize = size * bytesize_of_fmt(p->dec_p.dst->fmt); + int is_signed = (p->dec_p.dst->fmt == FMT_I8); + + u16 *gmem_data; + size_t bs_size; + size_t data_type = (p->dec_p.dst->fmt == FMT_BF16) ? 1 : 0; + + gmem_data = (u16* ) vlc_compress((u8* )src_data, bytesize, is_signed, data_type, &bs_size, cmd_info, NULL); + + //1. send compressed one to gaddr and decompress from gaddr to local + put_compressed_mg_gmem(ctx, p->dec_p.src, (u8* ) gmem_data, bs_size); + bmk1822_tdma_g2l_matrix_copy_decompressed(bmk, &p->dec_p); + test_submit(ctx); + + //2. 
decompress from sram + bmk1822_tdma_l2g_matrix_copy_compressed(bmk, &p->com_p); + test_submit(ctx); + + //3. get final data + size_t bs_buf_size = get_out_bs_buf_size(bytesize, data_type); + u16 *dst_data = (u16* )get_compressed_mg_gmem(ctx, p->com_p.dst, bs_buf_size); + + for (u64 i = 0; i < bs_size / 2; i++) { + if (dst_data[i] != gmem_data[i]) { + fprintf(stderr, "vlc compress comparing failed at dst[%" PRIx64 "], got %d, exp %d\n", + i, dst_data[i], gmem_data[i]); + exit(-1); + } + } + + free(dst_data); + free(gmem_data); +} + +static void destroy_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_compressed_mg_gmem(ctx, p->dec_p.src); + free_compressed_mg_gmem(ctx, p->com_p.dst); + free_ml(bmk, p->dec_p.dst); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + fmt_t fmts[] = { FMT_BF16 }; + u8 fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + + for (u32 row = 1; row < 13; row += 2) { + c->src_shape.row = row; + c->dst_shape.n = row; + for (int dst_align = 0; dst_align < 2; dst_align++) { + for (u8 fmt_i = 0; fmt_i < fmts_sz; fmt_i++) { + fmt_t fmt = fmts[fmt_i]; + param_t p; + memset(&p, 0, sizeof(p)); + //put compressed data to gaddr ->decompress to local -> compress to gaddr + + int is_signed = (fmt == FMT_I8); + int data_type = (fmt == FMT_BF16) ? 1 : 0; + + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + cmd_info.signedness = is_signed; + cmd_info.is_bfloat16 = data_type; + cmd_info.bias0 = 127; + + // src_shape, fmt, &cmd_info); + p.dec_p.dst = alloc_ml_bf16(bmk, c->dst_shape, fmt, dst_align); + + u64 size = ml_shape_size(&p.dec_p.dst->shape); + u16 *src_data = (u16 *)malloc(sizeof(u16) * size); + vlc_init_testdata(src_data, size, fmt == FMT_I8, fmt == FMT_BF16); + + assert(p.dec_p.dst); + + //2. alloc compress + p.com_p.src = p.dec_p.dst; //alloc_tl(bmk, c->lmem_shape, fmt, align); + p.com_p.dst = alloc_vlc_compressed_mg_gmem(ctx, c->src_shape, fmt, &cmd_info); + + //3. test: the seqence like below: + //3.1 put compressed data to gaddr + //3.2 decompress to local + //3.3 compress to gaddr + //printf ("row %u is_align %d fmt %d\n", row, dst_align, fmt); + test_param_g2l(ctx, bmk, &p, src_data, &cmd_info); + destroy_param_g2l(ctx, bmk, &p); + free(src_data); + } + } + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_tdma_bf16_tensor_vlc_decompress_compress.cpp b/cviruntime/test/1822/bf16/test_1822_tdma_bf16_tensor_vlc_decompress_compress.cpp new file mode 100644 index 000000000..dbc8629e1 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_tdma_bf16_tensor_vlc_decompress_compress.cpp @@ -0,0 +1,174 @@ +#include "../1822_test_util.h" + +typedef bmk1822_tdma_tg2l_tensor_copy_decompressed_param_t decompress_param_t; +typedef bmk1822_tdma_l2tg_tensor_copy_compressed_param_t compress_param_t; + +typedef struct{ + decompress_param_t dec_p; + compress_param_t com_p; +} param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => %d-bit %s\n", + tag, + p->dec_p.dst->shape.n, p->dec_p.dst->shape.c, p->dec_p.dst->shape.h, p->dec_p.dst->shape.w, + p->dec_p.src->bit_length, + (p->dec_p.dst->fmt == FMT_I8)? 
"signed": "unsigned"); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tl_shape_t lmem_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 17, 13 } + }, + { + { 3, 39, 17, 23 } + }, + { + { 5, 39, 17, 23 } + }, + { + { 20, 35, 2, 2 } + }, +#ifndef ENABEL_SIMPLE_BMK1822_VLC_TEST + { + { 1, 1, 1, 1 } + }, + { + { 1, 1, 1, 2 } + }, + { + { 1, 1, 7, 2 } + }, + { + { 1, 1, 10, 60 } + }, + { + { 1, 2, 1, 1 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 3, 16, 1, 1 } + }, + { + { 3, 36, 16, 20 } + }, +#endif /* ifndef ENABEL_SIMPLE_BMK1822_VLC_TEST*/ +}; + +static void test_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p, compressed_tg_t* dst) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->dec_p.dst->shape); + u64 bytesize = size * bytesize_of_fmt(p->dec_p.dst->fmt); + int is_signed = (p->dec_p.dst->fmt == FMT_I8); + u16 *src_data = (u16 *)malloc(sizeof(u16) * size); + vlc_init_testdata(src_data, size, p->dec_p.dst->fmt == FMT_I8, p->dec_p.dst->fmt == FMT_BF16); + + u8 *gmem_data; + size_t total_size; + size_t data_type = (p->dec_p.dst->fmt == FMT_BF16) ? 1 : 0; + size_t bs_buf_size = get_out_bs_buf_size(bytesize, data_type); + gmem_data = (uint8_t *) malloc(bs_buf_size * sizeof(uint8_t)); + + // command info + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + cmd_info.signedness = is_signed; + cmd_info.is_bfloat16 = data_type; + cmd_info.bias0 = 127; + // TODO: test + //cmd_info.zero_guard_en = 1; + // TODO generate +-inf +-nan, plz refere https://en.wikipedia.org/wiki/Bfloat16_floating-point_format + + // dec_p.src, gmem_data, total_size); + bmk1822_tdma_g2l_tensor_copy_decompressed(bmk, &p->dec_p); + test_submit(ctx); + + dst->zero_guard_en = cmd_info.zero_guard_en; + dst->bias0 = cmd_info.bias0; + dst->bias1 = cmd_info.bias1; + p->com_p.dst = dst; + bmk1822_tdma_l2g_tensor_copy_compressed(bmk, &p->com_p); + test_submit(ctx); + + u16 *dst_data = (u16* ) get_compressed_tg_gmem(ctx, p->com_p.dst); + u16* ref_data = (u16* ) gmem_data; + + //dec_p.src); + free_compressed_tg_gmem(ctx, p->com_p.dst); + free_tl(bmk, p->dec_p.dst); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + fmt_t fmts[] = { FMT_BF16 }; + u8 fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + + for (int align = 0; align < 2; align++) { + for (u8 fmt_i = 0; fmt_i < fmts_sz; fmt_i++) { + fmt_t fmt = fmts[fmt_i]; + + param_t p; + memset(&p, 0, sizeof(p)); + p.dec_p.src = alloc_vlc_compressed_tg_gmem(ctx, + &c->lmem_shape, fmt); + p.dec_p.dst = alloc_tl(bmk, c->lmem_shape, fmt, align); + assert(p.dec_p.dst); + + p.com_p.src = p.dec_p.dst; //alloc_tl(bmk, c->lmem_shape, fmt, align); + assert(p.com_p.src); + compressed_tg_t* dst = alloc_vlc_compressed_tg_gmem(ctx, + &c->lmem_shape, fmt); + + test_param_g2l(ctx, bmk, &p, dst); + destroy_param_g2l(ctx, bmk, &p); + } + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_tdma_l2l_bf16_tensor_copy.cpp b/cviruntime/test/1822/bf16/test_1822_tdma_l2l_bf16_tensor_copy.cpp new file mode 100644 index 000000000..2cbd6f113 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_tdma_l2l_bf16_tensor_copy.cpp @@ -0,0 +1,196 @@ +#include "../1822_test_util.h" +#include "1822_bf16_util.h" + +typedef bmk1822_tdma_l2l_tensor_copy_param_t 
param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_BF16, FMT_I8}, + {FMT_BF16, FMT_U8}, + {FMT_I8, FMT_BF16}, + {FMT_U8, FMT_BF16}, + {FMT_U8, FMT_U8}, + {FMT_I8, FMT_I8}, +}; + +typedef struct { + tl_shape_t src_shape; + tl_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 2, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 2, 7 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 13, 17 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 120, 5 }, + }, { + { 1, 2, 1, 1 }, + { 1, 1, 1, 2 }, + }, { + { 2, 17, 1, 4 }, + { 2, 1, 4, 17 }, + }, { + { 2, 17, 1, 4 }, + { 2, 1, 17, 4 }, + }, { + { 3, 16, 1, 1 }, + { 3, 1, 2, 8 }, + }, { + { 3, 39, 17, 23 }, + { 3, 17, 39, 23 }, + }, { + { 3, 36, 16, 20 }, + { 3, 18, 1, 640 }, + }, { + { 5, 31, 13, 25 }, + { 5, 13, 31, 25 }, + }, { + { 20, 35, 2, 2 }, + { 20, 7, 10, 2 }, + } +}; + +static void destroy_param(bmk_ctx_t *bmk, param_t *p) +{ + free_tl(bmk, p->dst); + free_tl(bmk, p->src); +} + +static void l2l_tensor_copy_ref(param_t *p, u16 ref_data[], u16 src_data[]) +{ + u64 size = tl_shape_size(&p->src->shape); + + for (u64 i = 0; i < size; i++) { + if(p->src->fmt == FMT_BF16 && p->dst->fmt == FMT_BF16) { + ref_data[i] = src_data[i]; + } else if(p->src->fmt == FMT_BF16 && (p->dst->fmt == FMT_I8 || p->dst->fmt == FMT_U8)){ + ref_data[i] = (p->dst->fmt == FMT_I8) ? (u8) convert_bf16_s8(src_data[i]) : (u8) convert_bf16_u8(src_data[i]); + } else if(p->dst->fmt == FMT_BF16 && (p->src->fmt == FMT_I8 || p->src->fmt == FMT_U8)){ + u8* u8src_data = (u8*)src_data; + u8 sign = p->src->fmt == FMT_I8 ? 
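+      // the flag tells convert_int8_bf16() whether to treat the byte as
+      // signed (FMT_I8) or unsigned before widening it to bf16.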
1 : 0; + ref_data[i] = convert_int8_bf16(u8src_data[i], sign); + } else if(p->dst->fmt == p->src->fmt){ // fix8b -> fix8b + u8* u8src_data; + u8src_data = (u8*) src_data; + ref_data[i] = u8src_data[i]; + } else { + fprintf(stderr, "Error src_fmt_type (%d) or dst_fmttype (%d)", p->src->fmt, p->dst->fmt); + exit(-1); + } + } +} + +static void test_param(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->src->shape); + + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * size); + u8 *u8src_data = (u8 *)malloc(sizeof(u8) * size); + u8 *src_data; + + if(p->src->fmt == FMT_BF16) { + /* bf16*/ + float val = -100; + for(u64 i = 0; i < size; i++) { + u16src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } + src_data = (u8*)u16src_data; + } else { + /* int8 -> bf16*/ + for(u64 i = 0; i < size; i++) { + u8src_data[i] = 200 + i; + } + src_data = u8src_data; + } + + put_bf16_tensor_g2l(ctx, bmk, p->src, (u16*)src_data, p->src->fmt); + bmk1822_tdma_l2l_bf16_tensor_copy(bmk, p); + u16 *dst_data = (u16*) get_bf16_tensor_l2g(ctx, bmk, p->dst, p->dst->fmt); + + u16 *ref_data = (u16 *)malloc(sizeof(u16) * size); + l2l_tensor_copy_ref(p, ref_data, (u16*)src_data); + + if(p->dst->fmt == FMT_BF16) { + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing(%d->%d) failed at dst[%" PRIu64 "], got %x, exp %x\n", + p->src->fmt, p->dst->fmt, i, dst_data[i], ref_data[i]); + exit(-1); + } + } + } else if(p->dst->fmt == FMT_U8 || p->dst->fmt == FMT_I8) { + for (u64 i = 0; i < size; i++) { + u32 shift = (i%2)*8; + if ((u8)(dst_data[i/2] >> shift) != (u8)ref_data[i]) { + fprintf(stderr, "comparing(%d->%d) failed at dst[%" PRIu64 "], got %x, exp %x\n", + p->src->fmt, p->dst->fmt, i, (dst_data[i/2] >> shift), ref_data[i]); + exit(-1); + } + } + } else { + fprintf(stderr, "Error compreing type src_fmt_type (%d) or dst_fmttype (%d)", p->src->fmt, p->dst->fmt); + } + + + + free(u8src_data); + free(u16src_data); + free(dst_data); + free(ref_data); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (u32 i = 0; i < nr_fmt; i++) { + for (int src_align = 0; src_align < 2; src_align++) { + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + p.src = alloc_tl(bmk, c->src_shape, input_fmt[i].src_fmt, src_align); + p.dst = alloc_tl(bmk, c->dst_shape, input_fmt[i].dst_fmt, dst_align); + test_param(ctx, bmk, &p); + destroy_param(bmk, &p); + } + } + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_tdma_l2tg_bf16_general_copy.cpp b/cviruntime/test/1822/bf16/test_1822_tdma_l2tg_bf16_general_copy.cpp new file mode 100644 index 000000000..2c68e4180 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_tdma_l2tg_bf16_general_copy.cpp @@ -0,0 +1,92 @@ +#include "../1822_test_util.h" +#include "1822_bf16_util.h" + +typedef bmk1822_tdma_l2tg_bf16_general_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: %u bytes from %" PRIu32 " to %u:%" PRIx64 "\n", tag, + p->src_bytes, p->src_address, p->dst_base_reg_index, p->dst_address); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef param_t 
case_t; + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, +}; + +static case_t g_cases[] = { + { 0, 0, 0, 1 * 2, FMT_F32, FMT_F32 }, + { 0, 0, 0, 39 * 2, FMT_F32, FMT_F32 }, + { 0, 0, 0, 4096 * 2, FMT_F32, FMT_F32 }, + { 0, 0, 100, 1 * 2, FMT_F32, FMT_F32 }, + { 0, 0, 200, 39 * 2, FMT_F32, FMT_F32 }, + { 0, 0, 1024, 4096 * 2, FMT_F32, FMT_F32 }, + { 39, 0, 100, 1 * 2, FMT_F32, FMT_F32 }, + { 47, 0, 200, 39 * 2, FMT_F32, FMT_F32 }, + { 2048, 0, 1024, 4096 * 2, FMT_F32, FMT_F32 }, +}; + +static void l2tg_general_copy_ref(param_t *p, u16 ref_data[], u16 src_data[]) +{ + for (u32 i = 0; i < p->src_bytes/2; i++) + ref_data[i] = src_data[i]; +} + +static void test_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = p->src_bytes/2 ; + u16 *src_data = (u16 *)malloc(sizeof(u16) * size); + static float val = -100; + for (u64 i = 0; i < size; i++) { + src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } + put_bytes_g2l(ctx, bmk, p->src_address, size * 2, (u8*)src_data); + + bmk1822_tdma_l2g_bf16_general_copy(bmk, p); + test_submit(ctx); + //u16 *dst_data = (u16*) get_bytes_gmem(ctx, p->dst_address, size * 2); + u16 *dst_data = (u16*)get_bytes_l2g(ctx, bmk, p->src_address, size * 2); + + u16 *ref_data = (u16 *)malloc(sizeof(u16) * size); + l2tg_general_copy_ref(p, ref_data, src_data); + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + exit(-1); + } + } + free(src_data); + free(dst_data); + free(ref_data); +} + + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + param_t *p = c; + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (u32 i = 0; i < nr_fmt; i++) { + p->src_fmt = input_fmt[i].src_fmt; + p->dst_fmt = input_fmt[i].dst_fmt; + test_param_l2g(ctx, bmk, p); + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_tdma_l2tg_bf16_matrix_copy.cpp b/cviruntime/test/1822/bf16/test_1822_tdma_l2tg_bf16_matrix_copy.cpp new file mode 100644 index 000000000..55e58e3d5 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_tdma_l2tg_bf16_matrix_copy.cpp @@ -0,0 +1,191 @@ +#include "../1822_test_util.h" +#include "1822_bf16_util.h" + +typedef bmk1822_tdma_l2tg_matrix_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.w, p->src->shape.col, + p->dst->shape.row, p->dst->shape.col); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_BF16, FMT_I8}, + {FMT_BF16, FMT_U8}, + {FMT_U8, FMT_U8}, + {FMT_I8, FMT_I8}, +}; + +typedef struct { + ml_shape_t src_shape; + mg_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 1, 1, 1 }, + { 0, 1 }, + }, { + { 0, 1, 2, 2 }, + { 1, 2 }, + }, { + { 0, 2, 1, 2 }, + { 1, 2 }, + }, { + { 0, 1, 7, 7 }, + { 0, 7 }, + }, { + { 0, 2, 4, 7 }, + { 0, 7 }, + }, { + { 0, 7, 1, 7 }, + { 0, 7 }, + }, { + { 0, 1, 17, 17 }, + { 0, 17 }, + }, { + { 0, 3, 7, 17 }, + { 0, 17 }, + }, { + { 0, 17, 1, 17 }, + { 0, 17 }, + }, { + { 0, 1, 60, 60 }, + { 0, 60 }, + }, { + { 0, 30, 2, 60 }, + { 0, 60 }, + }, { + { 0, 60, 1, 60 }, + { 0, 60 }, + } +}; + +static void l2tg_matrix_copy_ref(param_t *p, u16 
ref_data[], u16 src_data[]) +{ + u64 size = ml_shape_size(&p->src->shape); + + for (u64 i = 0; i < size; i++) { + if(p->src->fmt == FMT_BF16 && p->dst->fmt == FMT_BF16) // bf16 -> bf16 + ref_data[i] = src_data[i]; + else if(p->src->fmt == FMT_BF16 && (p->dst->fmt == FMT_I8 || p->dst->fmt == FMT_U8)){ // i8/u8 -> bf16 + u8 sign = p->dst->fmt == FMT_I8 ? 1 : 0; + u8 val = sign ? (u8) convert_bf16_s8(src_data[i]) : (u8) convert_bf16_u8(src_data[i]); + ref_data[i] = (u16) val; + } else if(p->dst->fmt == p->src->fmt) { // i8/u8 -> i8/u8 + u8* u8src_data; + u8src_data = (u8*) src_data; + ref_data[i] = u8src_data[i]; + } else { + fprintf(stderr, "Error src_fmt_type (%d) or dst_fmttype (%d)", p->src->fmt, p->dst->fmt); + } + } +} + +static void test_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = ml_shape_size(&p->src->shape); + + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * size); + u8 *u8src_data = (u8 *)malloc(sizeof(u8) * size); + u8 *src_data; + + if(p->src->fmt == FMT_BF16) { + /* bf16*/ + float val = -100; + for(u64 i = 0; i < size; i++) { + u16src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } + src_data = (u8*)u16src_data; + } else { + /* int8 -> bf16*/ + for(u64 i = 0; i < size; i++) { + u8src_data[i] = 200 + i; + } + src_data = u8src_data; + } + + put_bf16_matrix_g2l(ctx, bmk, p->src, (u8*)src_data, p->src->fmt); + bmk1822_tdma_l2g_bf16_matrix_copy(bmk, p); + test_submit(ctx); + u16 *dst_data = (u16*) get_mg_bf16_gmem(ctx, p->dst); + + u16 *ref_data = (u16 *)malloc(sizeof(u16) * size); + l2tg_matrix_copy_ref(p, ref_data, (u16*) src_data); + + if(p->dst->fmt == FMT_BF16 && p->src->fmt == FMT_BF16) { + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing(%d->%d) failed at dst[%" PRIu64 "], got %x, exp %x\n", + p->src->fmt, p->dst->fmt, i, dst_data[i], ref_data[i]); + exit(-1); + } + } + } else if(p->dst->fmt == FMT_U8 || p->dst->fmt == FMT_I8) { + for (u64 i = 0; i < size; i++) { + u32 shift = (i%2)*8; + if ((u8)(dst_data[i/2] >> shift) != (u8)ref_data[i]) { + fprintf(stderr, "comparing(%d->%d) failed at dst[%" PRIu64 "], got %x, exp %x\n", + p->src->fmt, p->dst->fmt, i, (dst_data[i/2] >> shift), ref_data[i]); + exit(-1); + } + } + } else { + fprintf(stderr, "Error compreing type src_fmt_type (%d) or dst_fmttype (%d)", p->src->fmt, p->dst->fmt); + } + + free(u8src_data); + free(u16src_data); + free(dst_data); + free(ref_data); +} + + +static void destroy_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_ml(bmk, p->src); + free_mg_gmem(ctx, p->dst); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (u32 i = 0; i < nr_fmt; i++) { + for (u32 row = 1; row < 13; row += 2) { + c->src_shape.n = row; + c->dst_shape.row = row; + for (int src_align = 0; src_align < 2; src_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_ml_bf16(bmk, c->src_shape, input_fmt[i].src_fmt, src_align); + p.dst = alloc_mg_bf16_gmem(ctx, c->dst_shape, input_fmt[i].dst_fmt); + test_param_l2g(ctx, bmk, &p); + destroy_param_l2g(ctx, bmk, &p); + + } + } + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_tdma_l2tg_bf16_matrix_vlc_copy_compressed.cpp 
b/cviruntime/test/1822/bf16/test_1822_tdma_l2tg_bf16_matrix_vlc_copy_compressed.cpp new file mode 100644 index 000000000..baa94063a --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_tdma_l2tg_bf16_matrix_vlc_copy_compressed.cpp @@ -0,0 +1,167 @@ +#include "../1822_test_util.h" + +typedef bmk1822_tdma_l2tg_matrix_copy_compressed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.w, p->src->shape.col, + p->dst->m.shape.row, p->dst->m.shape.col); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + ml_shape_t src_shape; + mg_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 2, 4, 7 }, + { 0, 7 }, + }, + { + { 0, 1, 2, 2 }, + { 1, 2 }, + }, + { + { 0, 3, 7, 17 }, + { 0, 17 }, + }, + { + { 0, 17, 1, 17 }, + { 0, 17 }, + }, + { + { 0, 60, 1, 60 }, + { 0, 60 }, + }, +#ifndef ENABEL_SIMPLE_BMK1822_VLC_TEST + { + { 0, 1, 1, 1 }, + { 0, 1 }, + }, + { + { 0, 2, 1, 2 }, + { 1, 2 }, + }, + { + { 0, 1, 7, 7 }, + { 0, 7 }, + }, + { + { 0, 7, 1, 7 }, + { 0, 7 }, + }, + { + { 0, 1, 17, 17 }, + { 0, 17 }, + }, + { + { 0, 1, 60, 60 }, + { 0, 60 }, + }, + { + { 0, 30, 2, 60 }, + { 0, 60 }, + }, +#endif /* ifndef ENABEL_SIMPLE_BMK1822_VLC_TEST*/ +}; + +static void test_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p, u16* src_data, CommandInfo * cmd_info) +{ + print_param(stderr, p); + u64 size = ml_shape_size(&p->src->shape); + u64 bytesize = size * bytesize_of_fmt(p->src->fmt); + + put_bf16_matrix_g2l(ctx, bmk, p->src, (u8*)src_data, p->src->fmt); + bmk1822_tdma_l2g_matrix_copy_compressed(bmk, p); + test_submit(ctx); + + int is_signed = (p->src->fmt == FMT_I8); + int data_type = (p->src->fmt == FMT_BF16) ? 1 : 0; + size_t bs_size; + + size_t bs_buf_size = get_out_bs_buf_size(bytesize, data_type); + u16 *ref_data = (u16* ) vlc_compress((u8* )src_data, bytesize, is_signed, data_type, &bs_size, cmd_info, NULL); + u16 *dst_data = (u16* ) get_compressed_mg_gmem(ctx, p->dst, bs_buf_size); + + /* the comparison block was garbled here; reconstructed by analogy with the tensor VLC compress test: only the first bs_size bytes of the stream are meaningful */ + for (u64 i = 0; i < bs_size / 2; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "vlc compress comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(dst_data); + free(ref_data); +} + +/* helper reconstructed: the original header line was lost; the body mirrors destroy_param_l2g in the tensor VLC compress test */ +static void destroy_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_ml(bmk, p->src); + free_compressed_mg_gmem(ctx, p->dst); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + fmt_t fmts[] = { FMT_BF16 }; + u8 fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + + for (u32 row = 1; row < 13; row += 2) { + c->src_shape.n = row; + c->dst_shape.row = row; + for (int src_align = 0; src_align < 2; src_align++) { + for (u8 fmt_i = 0; fmt_i < fmts_sz; fmt_i++) { + fmt_t fmt = fmts[fmt_i]; + param_t p; + memset(&p, 0, sizeof(p)); + p.src = alloc_ml_bf16(bmk, c->src_shape, fmt, src_align); + + u64 size = ml_shape_size(&p.src->shape); + u16 *src_data = (u16 *)malloc(sizeof(u16) * size); + vlc_init_testdata(src_data, size, fmt == FMT_I8, fmt == FMT_BF16); + + //size_t bs_size; + CommandInfo cmd_info; + int is_signed = (p.src->fmt == FMT_I8); + int data_type = (p.src->fmt == FMT_BF16) ? 1 : 0; + + memset(&cmd_info, 0, sizeof(CommandInfo)); + cmd_info.signedness = is_signed; + cmd_info.is_bfloat16 = data_type; + cmd_info.bias0 = 127; + + /* destination allocation reconstructed from the surviving argument tail; _alloc_vlc_compressed_mg_gmem is an assumed matrix analogue of the _alloc_vlc_compressed_tg_gmem helper used by the tensor tests */ + p.dst = _alloc_vlc_compressed_mg_gmem(ctx, &c->dst_shape, p.src->fmt, &cmd_info); + + test_param_l2g(ctx, bmk, &p, src_data, &cmd_info); + destroy_param_l2g(ctx, bmk, &p); + free(src_data); + } + } + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_tdma_l2tg_bf16_tensor_copy.cpp b/cviruntime/test/1822/bf16/test_1822_tdma_l2tg_bf16_tensor_copy.cpp new file mode 100644 index 000000000..9409a9a81 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_tdma_l2tg_bf16_tensor_copy.cpp @@ -0,0 +1,174 @@ +#include "../1822_test_util.h" +#include "1822_bf16_util.h" + +typedef bmk1822_tdma_l2tg_tensor_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_BF16, FMT_I8}, + {FMT_BF16, FMT_U8}, +}; + +typedef struct { + tl_shape_t src_shape; + tg_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 2, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 2, 7 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 13, 17 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 120, 5 }, + }, { + { 1, 2, 1, 1 }, + { 1, 1, 1, 2 }, + }, { + { 2, 17, 1, 4 }, + { 2, 1, 4, 17 }, + }, { + { 2, 17, 1, 4 }, + { 2, 1, 17, 4 }, + }, { + { 3, 16, 1, 1 }, + { 3, 1, 2, 8 }, + }, { + { 3, 39, 17, 23 }, + { 3, 17, 39, 23 }, + }, { + { 3, 36, 16, 20 }, + { 3, 18, 1, 640 }, + }, { + { 5, 39, 17, 23 }, + { 5, 17, 39, 23 }, + }, { + { 20, 35, 2, 2 }, + { 20, 7, 10, 2 }, + } +}; + +static void l2tg_tensor_copy_ref(param_t *p, u16 ref_data[], u16 src_data[]) +{ + u64 size = tl_shape_size(&p->src->shape); + + for (u64 i = 0; i < size; i++) { + if(p->src->fmt == FMT_BF16 && p->dst->fmt == FMT_BF16) + ref_data[i] = src_data[i]; + else if (p->src->fmt == FMT_BF16 && (p->dst->fmt == FMT_I8 || p->dst->fmt == FMT_U8)) { + u8 sign = p->dst->fmt == FMT_I8 ? 1 : 0; + s16 val = sign ?
(s16) convert_bf16_s8(src_data[i]) : (u16) convert_bf16_u8(src_data[i]); + ref_data[i] = u16 (val); + } else if(p->dst->fmt == p->src->fmt){ //i8->i8 + ref_data[i] = src_data[i]; + } else { + fprintf(stderr, "Error src_fmt_type (%d) or dst_fmttype (%d)", p->src->fmt, p->dst->fmt); + exit(-1); + } + } +} + +static void test_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->src->shape); + + u16 *src_data = (u16 *)malloc(sizeof(u16) * size); + float val = -100; + for(u64 i = 0; i < size; i++) { + src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } + + put_bf16_tensor_g2l(ctx, bmk, p->src, src_data, p->src->fmt); + bmk1822_tdma_l2g_bf16_tensor_copy(bmk, p); + test_submit(ctx); + u16 *dst_data = (u16*)get_tg_bf16_gmem(ctx, p->dst); + + u16 *ref_data = (u16 *)malloc(sizeof(u16) * size); + l2tg_tensor_copy_ref(p, ref_data, src_data); + + if(p->dst->fmt == FMT_BF16 && p->src->fmt == FMT_BF16) { + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing(%d->%d) failed at dst[%" PRIu64 "], got %x, exp %x\n", + p->src->fmt, p->dst->fmt, i, dst_data[i], ref_data[i]); + exit(-1); + } + } + } else if(p->dst->fmt == FMT_U8 || p->dst->fmt == FMT_I8) { + for (u64 i = 0; i < size; i++) { + u32 shift = (i%2)*8; + if ((u8)(dst_data[i/2] >> shift) != (u8)ref_data[i]) { + fprintf(stderr, "comparing (bf16->i8/u8) failed at dst[%" PRIu64 "], got %x, exp %x\n", + i, (dst_data[i/2] >> shift) , ref_data[i]); + exit(-1); + } + } + } else { + fprintf(stderr, "Error compreing type src_fmt_type (%d) or dst_fmttype (%d)", p->src->fmt, p->dst->fmt); + exit(-1); + } + + free(src_data); + free(dst_data); + free(ref_data); +} + + +static void destroy_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_tg_gmem(ctx, p->dst); + free_tl(bmk, p->src); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (u32 i = 0; i < nr_fmt; i++) { + for (int src_align = 0; src_align < 2; src_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_tl(bmk, c->src_shape, input_fmt[i].src_fmt, src_align); + p.dst = alloc_tg_bf16_gmem(ctx, c->dst_shape, input_fmt[i].dst_fmt); + test_param_l2g(ctx, bmk, &p); + destroy_param_l2g(ctx, bmk, &p); + + } + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_tdma_l2tg_bf16_tensor_copy_nc_transposed.cpp b/cviruntime/test/1822/bf16/test_1822_tdma_l2tg_bf16_tensor_copy_nc_transposed.cpp new file mode 100644 index 000000000..c08c9bf54 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_tdma_l2tg_bf16_tensor_copy_nc_transposed.cpp @@ -0,0 +1,263 @@ +#include "../1822_test_util.h" +#include "1822_bf16_util.h" + +typedef bmk1822_tdma_l2tg_tensor_copy_nc_transposed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_BF16, FMT_I8}, + {FMT_BF16, FMT_U8}, +}; + +typedef struct { + 
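/* Each case below pairs a local-memory source shape with a global-memory destination shape holding the same number of elements; the nc-transposed copy swaps the n and c axes, and several cases additionally refold the h*w plane on the destination side, so reshape and transpose are exercised together. */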
tl_shape_t src_shape; + tg_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 1, 2 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 2, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 7, 2 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 2, 7 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 17, 13 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 13, 17 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 10, 60 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 2, 300 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 3, 200 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 4, 150 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 5, 120 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 60, 10 }, + }, { + { 1, 1, 120, 5 }, + { 1, 1, 120, 5 }, + }, { + { 1, 2, 1, 1 }, + { 2, 1, 1, 1 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 1, 4 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 2, 2 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 4, 1 }, + }, { + { 17, 2, 2, 2 }, + { 2, 17, 2, 2 }, + }, { + { 17, 2, 4, 1 }, + { 2, 17, 4, 1 }, + }, { + { 3, 16, 1, 1 }, + { 16, 3, 1, 1 }, + }, { + { 3, 39, 23, 17 }, + { 39, 3, 23, 17 }, + }, { + { 3, 39, 17, 23 }, + { 39, 3, 17, 23 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 16, 20 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 2, 160 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 4, 80 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 8, 40 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 20, 16 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 32, 10 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 64, 5 }, + }, { + { 5, 39, 17, 23 }, + { 39, 5, 17, 23 }, + }, { + { 20, 35, 2, 2 }, + { 35, 20, 2, 2 }, + }, { + { 35, 20, 2, 2 }, + { 20, 35, 2, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 160, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 2, 160 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 4, 80 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 8, 40 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 10, 32 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 20, 16 }, + }, { + { 39, 5, 23, 17 }, + { 5, 39, 23, 17 }, + } +}; + +static void l2tg_tensor_copy_nc_transposed_ref( + param_t *p, u16 ref_data[], u16 src_data[]) +{ + tl_shape_t s = p->src->shape; + u32 n = s.n; + u32 c = s.c; + u32 hw = s.h * s.w; + + for (u32 ni = 0; ni < n; ni++) { + for (u32 ci = 0; ci < c; ci++) { + for (u32 hwi = 0; hwi < hw; hwi++) { + u32 src_i = ni * c * hw + ci * hw + hwi; + u32 dst_i = ci * n * hw + ni * hw + hwi; + if(p->src->fmt == FMT_BF16 && p->dst->fmt == FMT_BF16) + ref_data[dst_i] = src_data[src_i]; + else if (p->src->fmt == FMT_BF16 && (p->dst->fmt == FMT_I8 || p->dst->fmt == FMT_U8)) { + u8 sign = p->dst->fmt == FMT_I8 ? 1 : 0; + u8 val = sign ? 
(u8) convert_bf16_s8(src_data[src_i]) : (u8) convert_bf16_u8(src_data[src_i]); + ref_data[dst_i] = u8 (val); + } else if(p->dst->fmt == p->src->fmt){ //i8->i8 + ref_data[dst_i] = src_data[src_i]; + } else { + fprintf(stderr, "Error src_fmt_type (%d) or dst_fmttype (%d)", p->src->fmt, p->dst->fmt); + exit(-1); + } + } + } + } +} + +static void test_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->src->shape); + + u16 *src_data = (u16 *)malloc(sizeof(u16) * size); + float val = -100; + for (u64 i = 0; i < size; i++) { + src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } + + put_bf16_tensor_g2l(ctx, bmk, p->src, src_data, p->src->fmt); + bmk1822_tdma_l2g_bf16_tensor_copy_nc_transposed(bmk, p); + test_submit(ctx); + u16 *dst_data = (u16*) get_tg_bf16_gmem(ctx, p->dst); + + u16 *ref_data = (u16 *)malloc(sizeof(u16) * size); + l2tg_tensor_copy_nc_transposed_ref(p, ref_data, src_data); + + if(p->dst->fmt == FMT_BF16 && p->src->fmt == FMT_BF16) { + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + } else if(p->dst->fmt == FMT_U8 || p->dst->fmt == FMT_I8) { + for (u64 i = 0; i < size; i++) { + u32 shift = (i%2)*8; + if ((u8)(dst_data[i/2] >> shift) != (u8)ref_data[i]) { + fprintf(stderr, "comparing (bf16->i8/u8) failed at dst[%" PRIu64 "], got %x, exp %x\n", + i,(u8) (dst_data[i/2] >> shift) , ref_data[i]); + exit(-1); + } + } + } else { + fprintf(stderr, "Error compreing type src_fmt_type (%d) or dst_fmttype (%d)", p->src->fmt, p->dst->fmt); + } + + free(src_data); + free(dst_data); + free(ref_data); +} + +static void destroy_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_tg_gmem(ctx, p->dst); + free_tl(bmk, p->src); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (u32 i = 0; i < nr_fmt; i++) { + for (int src_align = 0; src_align < 2; src_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_tl(bmk, c->src_shape, input_fmt[i].src_fmt, src_align); + p.dst = alloc_tg_bf16_gmem(ctx, c->dst_shape, input_fmt[i].dst_fmt); + test_param_l2g(ctx, bmk, &p); + destroy_param_l2g(ctx, bmk, &p); + + } + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_tdma_l2tg_bf16_tensor_fill_constant.cpp b/cviruntime/test/1822/bf16/test_1822_tdma_l2tg_bf16_tensor_fill_constant.cpp new file mode 100644 index 000000000..452157c1d --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_tdma_l2tg_bf16_tensor_fill_constant.cpp @@ -0,0 +1,141 @@ +#include "../1822_test_util.h" +#include "1822_bf16_util.h" + +typedef bmk1822_tdma_l2tg_tensor_fill_constant_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: %u => (%u, %u, %u, %u)\n", + tag, p->constant, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + float constant; + tg_shape_t dst_shape; +} case_t; + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, +}; + +static case_t g_cases[] = { + { + 37, { 1, 1, 1, 1 } + }, { + 39, { 1, 1, 
1, 2 } + }, { + 23, { 1, 1, 2, 1 } + }, { + 19, { 1, 1, 7, 2 } + }, { + 17, { 1, 1, 2, 7 } + }, { + 13, { 1, 1, 17, 13 } + }, { + 11, { 1, 1, 13, 17 } + }, { + 7, { 1, 1, 10, 60 } + }, { + 9, { 1, 1, 120, 5 } + }, { + 2, { 1, 2, 1, 1 } + }, { + 3, { 1, 1, 1, 2 } + }, { + 5, { 2, 17, 1, 4 } + }, { + 41, { 2, 1, 4, 17 } + }, { + 5, { 2, 17, 1, 4 } + }, { + 9, { 2, 1, 17, 4 } + }, { + 17, { 3, 16, 1, 1 } + }, { + 26, { 3, 1, 2, 8 } + }, { + 103, { 3, 39, 17, 23 } + }, { + 255, { 3, 17, 39, 23 } + }, { + 254, { 3, 36, 16, 20 } + }, { + 127, { 3, 18, 1, 640 } + }, { + 128, { 5, 39, 17, 23 } + }, { + 129, { 5, 17, 39, 23 } + }, { + 55, { 20, 35, 2, 2 } + }, { + 1, { 20, 7, 10, 2 } + } +}; + +static void l2tg_tensor_fill_constant_ref(param_t *p, u16 ref_data[]) +{ + u64 size = tg_shape_size(&p->dst->shape); + printf("float =%x\n",p->constant); + for (u64 i = 0; i < size; i++) + ref_data[i] = p->constant; +} + +static void test_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tg_shape_size(&p->dst->shape); + + bmk1822_tdma_l2g_tensor_fill_constant(bmk, p); + test_submit(ctx); + u16 *dst_data = (u16*)get_tg_bf16_gmem(ctx, p->dst); + + u16 *ref_data = (u16 *)malloc(sizeof(u16) * size); + l2tg_tensor_fill_constant_ref(p, ref_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %x, exp %x\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(dst_data); + free(ref_data); +} + +static void destroy_param_l2g(bmctx_t *ctx, param_t *p) +{ + free_tg_gmem(ctx, p->dst); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (u32 i = 0; i < nr_fmt; i++) { + param_t p; + memset(&p, 0, sizeof(p)); + p.constant = generate_bf16_corner_val(c->constant); + p.dst = alloc_tg_bf16_gmem(ctx, c->dst_shape, input_fmt[i].src_fmt); + test_param_l2g(ctx, bmk, &p); + destroy_param_l2g(ctx, &p); + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} \ No newline at end of file diff --git a/cviruntime/test/1822/bf16/test_1822_tdma_l2tg_bf16_tensor_vlc_copy_compressed.cpp b/cviruntime/test/1822/bf16/test_1822_tdma_l2tg_bf16_tensor_vlc_copy_compressed.cpp new file mode 100644 index 000000000..166245247 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_tdma_l2tg_bf16_tensor_vlc_copy_compressed.cpp @@ -0,0 +1,153 @@ +#include "../1822_test_util.h" + + +typedef bmk1822_tdma_l2tg_tensor_copy_compressed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => %d-bit %s\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->bit_length, + (p->src->fmt == FMT_I8)? 
"signed": "unsigned"); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tl_shape_t lmem_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 17, 13 } + }, + { + { 3, 39, 17, 23 } + }, + { + { 5, 39, 17, 23 } + }, + { + { 20, 35, 2, 2 } + }, +#ifndef ENABEL_SIMPLE_BMK1822_VLC_TEST + { + { 1, 1, 1, 1 } + }, + { + { 1, 1, 1, 2 } + }, + { + { 1, 1, 7, 2 } + }, + { + { 1, 1, 10, 60 } + }, + { + { 1, 2, 1, 1 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 3, 16, 1, 1 } + }, + { + { 3, 36, 16, 20 } + }, +#endif /* ifndef ENABEL_SIMPLE_BMK1822_VLC_TEST*/ +}; + +static void test_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p, CommandInfo* cmd_info, u16 *src_data) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->src->shape); + u64 bytesize = size * bytesize_of_fmt(p->src->fmt); + int is_signed = (p->src->fmt == FMT_I8); + u8 data_type = (p->src->fmt == FMT_BF16) ? 1 : 0; + size_t bs_size = 0; + + put_bf16_tensor_g2l(ctx, bmk, p->src, src_data, p->src->fmt); + bmk1822_tdma_l2g_tensor_copy_compressed(bmk, p); + test_submit(ctx); + + u16 *dst_data = (u16* ) get_compressed_tg_gmem(ctx, p->dst); + u16 *ref_data = (u16* ) vlc_compress((u8 *)src_data, bytesize, is_signed, data_type, &bs_size, cmd_info, NULL); + + for (u64 i = 0; i < bs_size / 2 ; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "vlc compress comparing failed at dst[%" PRIx64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + + exit(-1); + } + } + + free(dst_data); + free(ref_data); +} + +static void destroy_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_compressed_tg_gmem(ctx, p->dst); + free_tl(bmk, p->src); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + fmt_t fmts[] = { FMT_BF16 }; + u8 fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + + for (int src_align = 0; src_align < 2; src_align++) { + for (u8 fmt_i = 0; fmt_i < fmts_sz; fmt_i++) { + fmt_t fmt = fmts[fmt_i]; + u8 data_type = (fmt == FMT_BF16) ? 
1 : 0; + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_tl(bmk, c->lmem_shape, fmt, src_align); + assert(p.src); + + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + u64 in_size = tl_shape_size(&p.src->shape); + + u16 *src_data = (u16 *)malloc(sizeof(u16) * in_size); + vlc_init_testdata(src_data, in_size, fmt == FMT_I8, fmt == FMT_BF16); + + int is_signed = (p.src->fmt == FMT_I8); + cmd_info.signedness = is_signed; + cmd_info.is_bfloat16 = data_type; + cmd_info.bias0 = 127; + + /* destination allocation reconstructed from the surviving argument tail, using the _alloc_vlc_compressed_tg_gmem helper that the matching decompress test calls with the same arguments */ + p.dst = _alloc_vlc_compressed_tg_gmem(ctx, &c->lmem_shape, fmt, &cmd_info); + test_param_l2g(ctx, bmk, &p, &cmd_info, src_data); + destroy_param_l2g(ctx, bmk, &p); + + free(src_data); + } + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_tdma_tg2l_bf16_general_copy.cpp b/cviruntime/test/1822/bf16/test_1822_tdma_tg2l_bf16_general_copy.cpp new file mode 100644 index 000000000..bee441157 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_tdma_tg2l_bf16_general_copy.cpp @@ -0,0 +1,104 @@ +#include "../1822_test_util.h" +#include "1822_bf16_util.h" + +typedef bmk1822_tdma_tg2l_bf16_general_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: %u bytes from %u:%" PRIu64 " to %" PRIu32 "\n", tag, + p->src_bytes, p->src_base_reg_index, p->src_address, p->dst_address); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef param_t case_t; + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, +}; + +static case_t g_cases[] = { + { 0, 0, 0, 1 * 2, FMT_F32, FMT_F32 }, + { 0, 0, 0, 39 * 2, FMT_F32, FMT_F32 }, + { 0, 0, 0, 4096 * 2, FMT_F32, FMT_F32 }, + { 0, 1, 0, 1 * 2, FMT_F32, FMT_F32 }, + { 0, 1, 0, 39 * 2, FMT_F32, FMT_F32 }, + { 0, 1, 0, 4096 * 2, FMT_F32, FMT_F32 }, + { 0, 1, 100, 1 * 2, FMT_F32, FMT_F32 }, + { 0, 1, 200, 39 * 2, FMT_F32, FMT_F32 }, + { 0, 1, 4096, 4096 * 2, FMT_F32, FMT_F32 }, + { 0, 257, 100, 1 * 2, FMT_F32, FMT_F32 }, + { 0, 349, 200, 39 * 2, FMT_F32, FMT_F32 }, + { 0, 3356, 4096, 4096 * 2, FMT_F32, FMT_F32 }, +}; + +static void tg2l_general_copy_ref(param_t *p, u16 ref_data[], u16 src_data[]) +{ + for (u32 i = 0; i < p->src_bytes/2; i++) + ref_data[i] = src_data[i]; +} + +static void test_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = p->src_bytes/2; + float val = -100; + u16 *src_data = (u16 *)malloc(sizeof(u16) * size); + for (u64 i = 0; i < size; i++) { + src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } + + bmmem_device_t mem = bmmem_device_alloc_raw(*ctx, size * 2); + u64 gmem_addr = bmmem_device_addr(mem); + put_bytes_gmem(ctx, mem, (u8*)src_data); + + p->src_address = gmem_addr; + bmk1822_tdma_g2l_bf16_general_copy(bmk, p); + test_submit(ctx); + bmmem_device_free(*ctx, mem); + + u16 *dst_data = (u16*) get_bytes_l2g(ctx, bmk, p->dst_address, size * 2); + + u16 *ref_data = (u16 *)malloc(sizeof(u16) * size); + tg2l_general_copy_ref(p, ref_data, src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); + free(ref_data); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + param_t *p = c; + u32 nr_fmt =
sizeof(input_fmt) / sizeof(input_fmt[0]); + for (u32 i = 0; i < nr_fmt; i++) { + p->src_fmt = input_fmt[i].src_fmt; + p->dst_fmt = input_fmt[i].dst_fmt; + test_param_g2l(ctx, bmk, p); + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_tdma_tg2l_bf16_matrix_copy.cpp b/cviruntime/test/1822/bf16/test_1822_tdma_tg2l_bf16_matrix_copy.cpp new file mode 100644 index 000000000..63547ed13 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_tdma_tg2l_bf16_matrix_copy.cpp @@ -0,0 +1,180 @@ +#include "../1822_test_util.h" +#include "1822_bf16_util.h" + +typedef bmk1822_tdma_tg2l_matrix_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.row, p->src->shape.col, + p->dst->shape.n, p->dst->shape.c, + p->dst->shape.w, p->dst->shape.col); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_BF16}, + {FMT_U8, FMT_BF16}, +}; + +typedef struct { + mg_shape_t src_shape; + ml_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 1 }, + { 0, 1, 1, 1 }, + }, { + { 0, 2 }, + { 0, 1, 2, 2 }, + }, { + { 0, 2 }, + { 0, 2, 1, 2 }, + }, { + { 0, 7 }, + { 0, 1, 7, 7 }, + }, { + { 0, 7 }, + { 0, 2, 4, 7 }, + }, { + { 0, 7 }, + { 0, 7, 1, 7 }, + }, { + { 0, 17 }, + { 0, 1, 17, 17 }, + }, { + { 0, 17 }, + { 0, 3, 7, 17 }, + }, { + { 0, 17 }, + { 0, 17, 1, 17 }, + }, { + { 0, 60 }, + { 0, 1, 60, 60 }, + }, { + { 0, 60 }, + { 0, 30, 2, 60 }, + }, { + { 0, 60 }, + { 0, 60, 1, 60 }, + } +}; + +static void tg2l_matrix_copy_ref(param_t *p, u16 ref_data[], u16 src_data[]) +{ + u64 size = ml_shape_size(&p->dst->shape); + + for (u64 i = 0; i < size; i++) { + if(p->src->fmt == FMT_BF16) { + ref_data[i] = src_data[i]; + } else { + u8* u8src_data = (u8*)src_data; + u8 sign = p->src->fmt == FMT_I8 ? 
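/* i8/u8 sources are widened to bf16 during the g2l copy, so the reference converts each source byte with convert_int8_bf16; the sign flag selects signed vs. unsigned interpretation of the byte. */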
1 : 0; + ref_data[i] = (u16)convert_int8_bf16(u8src_data[i], sign); + } + } +} + +static void test_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = ml_shape_size(&p->dst->shape); + + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * size); + u8 *u8src_data = (u8 *)malloc(sizeof(u8) * size); + u8 *src_data; + + if(p->src->fmt == FMT_BF16) { + float val = -10; + for(u64 i = 0; i < size; i++) { + u16src_data[i] = generate_bf16_corner_val(val); + val += 0.2; + } + src_data = (u8*)u16src_data; + } else { + for(u64 i = 0; i < size; i++) { + u8src_data[i] = 200 + i; + } + src_data = u8src_data; + } + + put_mg_bf16_gmem(ctx, p->src, (u8*) src_data); + bmk1822_tdma_g2l_bf16_matrix_copy(bmk, p); + test_submit(ctx); + u16 *dst_data = (u16*) get_bf16_matrix_l2g(ctx, bmk, p->dst, p->dst->fmt); + + u16 *ref_data = (u16 *)malloc(sizeof(u16) * size); + tg2l_matrix_copy_ref(p, ref_data, (u16*)src_data); + + if(p->dst->fmt == FMT_BF16 && p->src->fmt == FMT_BF16) { + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing(%d->%d) failed at dst[%" PRIu64 "], got %x, exp %x\n", + p->src->fmt, p->dst->fmt, i, dst_data[i], ref_data[i]); + exit(-1); + } + } + } else if(p->src->fmt == FMT_U8 || p->src->fmt == FMT_I8) { + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing(%d->%d) failed at dst[%" PRIu64 "], got %x, exp %x\n", + p->src->fmt, p->dst->fmt, i, dst_data[i] , ref_data[i]); + exit(-1); + } + } + } else { + fprintf(stderr, "Error compreing type src_fmt_type (%d) or dst_fmttype (%d)", p->src->fmt, p->dst->fmt); + exit(-1); + } + + free(u8src_data); + free(u16src_data); + free(dst_data); + free(ref_data); +} + +static void destroy_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_mg_gmem(ctx, p->src); + free_ml(bmk, p->dst); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (u32 i = 0; i < nr_fmt; i++) { + for (u32 row = 1; row < 13; row += 2) { + c->src_shape.row = row; + c->dst_shape.n = row; + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_mg_bf16_gmem(ctx, c->src_shape, input_fmt[i].src_fmt); + p.dst = alloc_ml_bf16(bmk, c->dst_shape, input_fmt[i].dst_fmt, dst_align); + test_param_g2l(ctx, bmk, &p); + destroy_param_g2l(ctx, bmk, &p); + } + } + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_tdma_tg2l_bf16_matrix_vlc_copy_decompressed.cpp b/cviruntime/test/1822/bf16/test_1822_tdma_tg2l_bf16_matrix_vlc_copy_decompressed.cpp new file mode 100644 index 000000000..8ba0fd2b5 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_tdma_tg2l_bf16_matrix_vlc_copy_decompressed.cpp @@ -0,0 +1,184 @@ +#include "../1822_test_util.h" + +typedef bmk1822_tdma_tg2l_matrix_copy_decompressed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->m.shape.row, p->src->m.shape.col, + p->dst->shape.n, p->dst->shape.c, + p->dst->shape.w, p->dst->shape.col); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + mg_shape_t src_shape; + ml_shape_t dst_shape; +} 
case_t; + +static case_t g_cases[] = { + { + { 0, 17 }, + { 0, 3, 7, 17 }, + }, + { + { 0, 7 }, + { 0, 7, 1, 7 }, + }, + { + { 0, 60 }, + { 0, 30, 2, 60 }, + }, +#ifndef ENABEL_SIMPLE_BMK1822_VLC_TEST + { + { 0, 1 }, + { 0, 1, 1, 1 }, + }, + { + { 0, 2 }, + { 0, 1, 2, 2 }, + }, + { + { 0, 2 }, + { 0, 2, 1, 2 }, + }, + { + { 0, 7 }, + { 0, 1, 7, 7 }, + }, + { + { 0, 7 }, + { 0, 2, 4, 7 }, + }, + { + { 0, 17 }, + { 0, 1, 17, 17 }, + }, + { + { 0, 17 }, + { 0, 17, 1, 17 }, + }, + { + { 0, 60 }, + { 0, 1, 60, 60 }, + }, + { + { 0, 60 }, + { 0, 60, 1, 60 }, + } +#endif /* ifndef ENABEL_SIMPLE_BMK1822_VLC_TEST*/ +}; + +static void tg2l_matrix_copy_ref(param_t *p, u16 ref_data[], u16 src_data[]) +{ + u64 size = ml_shape_size(&p->dst->shape); + + for (u64 i = 0; i < size; i++) + ref_data[i] = src_data[i]; +} + +static void test_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p, u16 *src_data, CommandInfo* cmd_info) +{ + print_param(stderr, p); + + u64 in_size = ml_shape_size(&p->dst->shape); + size_t bs_size = 0; + int is_signed = (p->dst->fmt == FMT_I8); + size_t data_type = (p->dst->fmt == FMT_BF16) ? 1 : 0; + u64 size = ml_shape_size(&p->dst->shape); + u64 bytesize = size * bytesize_of_fmt(p->dst->fmt); + + u8 *bsbuf = vlc_compress((u8* )src_data, bytesize, is_signed, data_type, &bs_size, cmd_info, NULL); + + put_compressed_mg_gmem(ctx, p->src, bsbuf, bs_size); + bmk1822_tdma_g2l_matrix_copy_decompressed(bmk, p); + test_submit(ctx); + u16 *dst_data = (u16*) get_bf16_matrix_l2g(ctx, bmk, p->dst, p->dst->fmt); + + u16 *ref_data = (u16 *)malloc(sizeof(u16) * in_size); + tg2l_matrix_copy_ref(p, ref_data, src_data); + + for (u64 i = 0; i < in_size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(dst_data); + free(ref_data); + free(bsbuf); +} + +static void destroy_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_compressed_mg_gmem(ctx, p->src); + free_ml(bmk, p->dst); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + fmt_t fmts[] = { FMT_BF16 }; + u8 fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + + for (u32 row = 1; row < 13; row += 2) { + c->src_shape.row = row; + c->dst_shape.n = row; + for (int mode = 0; mode < VLC_CMP_MODE_MAX; mode++) { + for (int dst_align = 0; dst_align < 2; dst_align++) { + for (u8 fmt_i = 0; fmt_i < fmts_sz; fmt_i++) { + fmt_t fmt = fmts[fmt_i]; + param_t p; + memset(&p, 0, sizeof(p)); + + int is_signed = (fmt == FMT_I8); + size_t data_type = (fmt == FMT_BF16) ? 
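/* The mode loop covers both VLC configurations: the default bias settings and VLC_CMP_MODE_COMPILER, where bm_vlc_est_weight_bias scans the payload to estimate bias0/bias1 the way the compiler would (mirroring the tensor decompress test). */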
1 : 0; + CommandInfo cmd_info; + + memset(&cmd_info, 0, sizeof(CommandInfo)); + cmd_info.signedness = is_signed; + cmd_info.is_bfloat16 = data_type; + + /* allocation and test-data setup reconstructed from the surviving argument tails, mirroring the tensor VLC decompress test; _alloc_vlc_compressed_mg_gmem is an assumed matrix analogue of _alloc_vlc_compressed_tg_gmem */ + p.dst = alloc_ml_bf16(bmk, c->dst_shape, fmt, dst_align); + u64 in_size = ml_shape_size(&p.dst->shape); + u16 *src_data = (u16 *)malloc(sizeof(u16) * in_size); + vlc_init_testdata(src_data, in_size, fmt == FMT_I8, fmt == FMT_BF16); + if (mode == VLC_CMP_MODE_COMPILER) + bm_vlc_est_weight_bias((u8* )src_data, in_size * sizeof(u16), (bool)is_signed, (bool)data_type, &cmd_info); + p.src = _alloc_vlc_compressed_mg_gmem(ctx, &c->src_shape, fmt, &cmd_info); + + //printf ("row %u mode %d is_align %d fmt %d\n", row, mode, dst_align, fmt); + test_param_g2l(ctx, bmk, &p, src_data, &cmd_info); + + free(src_data); + destroy_param_g2l(ctx, bmk, &p); + } + } + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_tdma_tg2l_bf16_tensor_copy.cpp b/cviruntime/test/1822/bf16/test_1822_tdma_tg2l_bf16_tensor_copy.cpp new file mode 100644 index 000000000..506369f48 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_tdma_tg2l_bf16_tensor_copy.cpp @@ -0,0 +1,162 @@ +#include "../1822_test_util.h" +#include "1822_bf16_util.h" + +typedef bmk1822_tdma_tg2l_tensor_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_BF16}, + {FMT_U8, FMT_BF16}, +}; + +typedef struct { + tg_shape_t src_shape; + tl_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 2, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 2, 7 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 13, 17 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 120, 5 }, + }, { + { 1, 2, 1, 1 }, + { 1, 1, 1, 2 }, + }, { + { 2, 17, 1, 4 }, + { 2, 1, 4, 17 }, + }, { + { 2, 17, 1, 4 }, + { 2, 1, 17, 4 }, + }, { + { 3, 16, 1, 1 }, + { 3, 1, 2, 8 }, + }, { + { 3, 39, 17, 23 }, + { 3, 17, 39, 23 }, + }, { + { 3, 36, 16, 20 }, + { 3, 18, 1, 640 }, + }, { + { 5, 39, 17, 23 }, + { 5, 17, 39, 23 }, + }, { + { 20, 35, 2, 2 }, + { 20, 7, 10, 2 }, + } +}; + +static void tg2l_tensor_copy_ref(param_t *p, u16 ref_data[], u16 src_data[]) +{ + u64 size = tl_shape_size(&p->dst->shape); + for (u64 i = 0; i < size; i++) { + if(p->src->fmt == FMT_BF16) { + ref_data[i] = src_data[i]; + } else { + u8* u8src_data = (u8*)src_data; + u8 sign = p->src->fmt == FMT_I8 ?
1 : 0; + ref_data[i] = convert_int8_bf16(u8src_data[i], sign); + } + } +} + +static void test_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->dst->shape); + + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * size); + u8 *u8src_data = (u8 *)malloc(sizeof(u8) * size); + u8 *src_data; + if(p->src->fmt == FMT_BF16) { + float val = -100; + for(u64 i = 0; i < size; i++) { + u16src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } + src_data = (u8*)u16src_data; + } else { + for(u64 i = 0; i < size; i++) { + u8src_data[i] = 200 + i; + } + src_data = u8src_data; + } + + put_tg_bf16_gmem(ctx, p->src, (u8*) src_data); + bmk1822_tdma_g2l_bf16_tensor_copy(bmk, p); + test_submit(ctx); + u16 *dst_data = (u16*) get_bf16_tensor_l2g(ctx, bmk, p->dst, p->dst->fmt); + + u16 *ref_data = (u16 *)malloc(sizeof(u16) * size); + tg2l_tensor_copy_ref(p, ref_data, (u16*) src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %x, exp %x\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + free(u8src_data); + free(u16src_data); + free(dst_data); + free(ref_data); +} + +static void destroy_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_tg_gmem(ctx, p->src); + free_tl(bmk, p->dst); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (u32 i = 0; i < nr_fmt; i++) { + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_tg_bf16_gmem(ctx, c->src_shape, input_fmt[i].src_fmt); + p.dst = alloc_tl(bmk, c->dst_shape, input_fmt[i].dst_fmt, dst_align); + test_param_g2l(ctx, bmk, &p); + destroy_param_g2l(ctx, bmk, &p); + } + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_tdma_tg2l_bf16_tensor_copy_nc_transposed.cpp b/cviruntime/test/1822/bf16/test_1822_tdma_tg2l_bf16_tensor_copy_nc_transposed.cpp new file mode 100644 index 000000000..95c228d79 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_tdma_tg2l_bf16_tensor_copy_nc_transposed.cpp @@ -0,0 +1,256 @@ +#include "../1822_test_util.h" +#include "1822_bf16_util.h" + +typedef bmk1822_tdma_tg2l_tensor_copy_nc_transposed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_BF16}, + {FMT_U8, FMT_BF16}, +}; + +typedef struct { + tg_shape_t src_shape; + tl_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 1, 2 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 2, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 7, 2 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 2, 7 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 17, 13 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 13, 17 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 10, 60 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 2, 300 }, + }, { + { 1, 1, 10, 60 }, + { 1, 
1, 3, 200 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 4, 150 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 5, 120 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 60, 10 }, + }, { + { 1, 1, 120, 5 }, + { 1, 1, 120, 5 }, + }, { + { 1, 2, 1, 1 }, + { 2, 1, 1, 1 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 1, 4 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 2, 2 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 4, 1 }, + }, { + { 17, 2, 2, 2 }, + { 2, 17, 2, 2 }, + }, { + { 17, 2, 4, 1 }, + { 2, 17, 4, 1 }, + }, { + { 3, 16, 1, 1 }, + { 16, 3, 1, 1 }, + }, { + { 3, 39, 23, 17 }, + { 39, 3, 23, 17 }, + }, { + { 3, 39, 17, 23 }, + { 39, 3, 17, 23 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 16, 20 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 2, 160 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 4, 80 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 8, 40 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 20, 16 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 32, 10 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 64, 5 }, + }, { + { 5, 39, 17, 23 }, + { 39, 5, 17, 23 }, + }, { + { 20, 35, 2, 2 }, + { 35, 20, 2, 2 }, + }, { + { 35, 20, 2, 2 }, + { 20, 35, 2, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 160, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 2, 160 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 4, 80 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 8, 40 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 10, 32 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 20, 16 }, + }, { + { 39, 5, 23, 17 }, + { 5, 39, 23, 17 }, + } +}; + +static void tg2l_tensor_copy_nc_transposed_ref( + param_t *p, u16 ref_data[], u16 src_data[]) +{ + tg_shape_t s = p->src->shape; + u32 n = s.n; + u32 c = s.c; + u32 hw = s.h * s.w; + for (u32 ni = 0; ni < n; ni++) { + for (u32 ci = 0; ci < c; ci++) { + for (u32 hwi = 0; hwi < hw; hwi++) { + u32 src_i = ni * c * hw + ci * hw + hwi; + u32 dst_i = ci * n * hw + ni * hw + hwi; + if(p->src->fmt == FMT_BF16 && p->dst->fmt == FMT_BF16) + ref_data[dst_i] = src_data[src_i]; + else { + u8* u8src_data = (u8*)src_data; + u8 sign = p->src->fmt == FMT_I8 ? 
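/* Index math above: the source is walked in (n, c, h*w) order and written at dst_i = ci*n*hw + ni*hw + hwi, i.e. the n and c axes are exchanged while each h*w plane stays contiguous. */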
1 : 0; + ref_data[dst_i] = convert_int8_bf16(u8src_data[src_i], sign); + } + } + } + } +} + +static void test_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->dst->shape); + + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * size); + u8 *u8src_data = (u8 *)malloc(sizeof(u8) * size); + u8 *src_data; + if(p->src->fmt == FMT_BF16) { + float val = -100; + for(u64 i = 0; i < size; i++) { + u16src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } + src_data = (u8*)u16src_data; + } else { + for(u64 i = 0; i < size; i++) { + u8src_data[i] = 200 + i; + } + src_data = u8src_data; + } + + put_tg_bf16_gmem(ctx, p->src, (u8*) src_data); + bmk1822_tdma_g2l_bf16_tensor_copy_nc_transposed(bmk, p); + test_submit(ctx); + u16 *dst_data = (u16 *) get_bf16_tensor_l2g(ctx, bmk, p->dst, p->dst->fmt); + + u16 *ref_data = (u16 *)malloc(sizeof(u16) * size); + tg2l_tensor_copy_nc_transposed_ref(p, ref_data, (u16*) src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %x, exp %x\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(u8src_data); + free(u16src_data); + free(dst_data); + free(ref_data); +} + + +static void destroy_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_tl(bmk, p->dst); + free_tg_gmem(ctx, p->src); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (u32 i = 0; i < nr_fmt; i++) { + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_tg_bf16_gmem(ctx, c->src_shape, input_fmt[i].src_fmt); + p.dst = alloc_tl(bmk, c->dst_shape, input_fmt[i].dst_fmt, dst_align); + test_param_g2l(ctx, bmk, &p); + destroy_param_g2l(ctx, bmk, &p); + + } + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_tdma_tg2l_bf16_tensor_fill_constant.cpp b/cviruntime/test/1822/bf16/test_1822_tdma_tg2l_bf16_tensor_fill_constant.cpp new file mode 100644 index 000000000..be86789ef --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_tdma_tg2l_bf16_tensor_fill_constant.cpp @@ -0,0 +1,144 @@ +#include "../1822_test_util.h" +#include "1822_bf16_util.h" + +typedef bmk1822_tdma_tg2l_tensor_fill_constant_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: %u => (%u, %u, %u, %u)\n", + tag, p->constant, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + u16 constant; + tl_shape_t dst_shape; +} case_t; + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, +}; + +static case_t g_cases[] = { + { + 37, { 1, 1, 1, 1 } + }, { + 39, { 1, 1, 1, 2 } + }, { + 23, { 1, 1, 2, 1 } + }, { + 19, { 1, 1, 7, 2 } + }, { + 17, { 1, 1, 2, 7 } + }, { + 13, { 1, 1, 17, 13 } + }, { + 11, { 1, 1, 13, 17 } + }, { + 7, { 1, 1, 10, 60 } + }, { + 9, { 1, 1, 120, 5 } + }, { + 2, { 1, 2, 1, 1 } + }, { + 3, { 1, 1, 1, 2 } + }, { + 5, { 2, 17, 1, 4 } + }, { + 41, { 2, 1, 4, 17 } + }, { + 5, { 2, 17, 1, 4 } + }, { + 9, { 2, 1, 17, 4 } + }, { + 17, { 3, 16, 1, 1 } + }, { + 26, { 3, 1, 2, 8 } + }, { + 103, { 3, 39, 17, 23 } + }, { + 255, 
{ 3, 17, 39, 23 } + }, { + 254, { 3, 36, 16, 20 } + }, { + 127, { 3, 18, 1, 640 } + }, { + 128, { 5, 39, 17, 23 } + }, { + 129, { 5, 17, 39, 23 } + }, { + 55, { 20, 35, 2, 2 } + }, { + 1, { 20, 7, 10, 2 } + } +}; + +static void tg2l_tensor_fill_constant_ref(param_t *p, u16 ref_data[]) +{ + u64 size = tl_shape_size(&p->dst->shape); + + for (u64 i = 0; i < size; i++) + ref_data[i] = p->constant; +} + +static void test_param_tg2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->dst->shape); + + bmk1822_tdma_tg2l_bf16_tensor_fill_constant(bmk, p); + test_submit(ctx); + u16 *dst_data = (u16 *) get_bf16_tensor_l2g(ctx, bmk, p->dst, FMT_BF16); + + u16 *ref_data = (u16 *)malloc(sizeof(u16) * size); + tg2l_tensor_fill_constant_ref(p, ref_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(dst_data); + free(ref_data); +} + +static void destroy_param_tg2l(bmk_ctx_t *bmk, param_t *p) +{ + free_tl(bmk, p->dst); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (u32 i = 0; i < nr_fmt; i++) { + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + p.constant = generate_bf16_corner_val(c->constant); + p.dst = alloc_tl(bmk, c->dst_shape, input_fmt[i].src_fmt, dst_align); + + test_param_tg2l(ctx, bmk, &p); + destroy_param_tg2l(bmk, &p); + } + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_tdma_tg2l_bf16_tensor_vlc_copy_decompressed.cpp b/cviruntime/test/1822/bf16/test_1822_tdma_tg2l_bf16_tensor_vlc_copy_decompressed.cpp new file mode 100644 index 000000000..334eb9fc1 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_tdma_tg2l_bf16_tensor_vlc_copy_decompressed.cpp @@ -0,0 +1,160 @@ +#include "../1822_test_util.h" +#include "../bm_vlc_compress.h" + +typedef bmk1822_tdma_tg2l_tensor_copy_decompressed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => fmt(%d) bias0/1/zero is (%u/%u/%u) %s\n", + tag, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w, + p->dst->fmt, + p->src->bias0, p->src->bias1, p->src->zero_guard_en, + (p->dst->fmt == FMT_I8)? 
"signed": "unsigned"); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tl_shape_t lmem_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 17, 13 } + }, + { + { 3, 39, 17, 23 } + }, + { + { 5, 39, 17, 23 } + }, + { + { 20, 35, 2, 2 } + }, +#ifndef ENABEL_SIMPLE_BMK1822_VLC_TEST + { + { 1, 1, 1, 1 } + }, + { + { 1, 1, 1, 2 } + }, + { + { 1, 1, 7, 2 } + }, + { + { 1, 1, 10, 60 } + }, + { + { 1, 2, 1, 1 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 3, 16, 1, 1 } + }, + { + { 3, 36, 16, 20 } + }, +#endif /* ifndef ENABEL_SIMPLE_BMK1822_VLC_TEST*/ +}; + +static void test_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p, u16 *src_data, CommandInfo* cmd_info) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->dst->shape); + size_t bs_size = 0; + int is_signed = (p->dst->fmt == FMT_I8); + u8 data_type = (p->dst->fmt == FMT_BF16) ? 1 : 0; + u64 bytesize = size * bytesize_of_fmt(p->dst->fmt); + + u8 *bsbuf = vlc_compress((u8 *)src_data, bytesize, is_signed, data_type, &bs_size, cmd_info, NULL); + + u16 *ref_data = (u16 *)malloc(sizeof(u16) * size); + bm_vlc_dec_bf16(bsbuf, bytesize, (u16* )ref_data); + + put_compressed_tg_gmem(ctx, p->src, bsbuf, bs_size); + bmk1822_tdma_g2l_tensor_copy_decompressed(bmk, p); + test_submit(ctx); + + u16 *dst_data = (u16* )get_bf16_tensor_l2g(ctx, bmk, p->dst, p->dst->fmt); + + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "vlc decompress comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(bsbuf); + free(dst_data); + free(ref_data); +} + +static void destroy_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_compressed_tg_gmem(ctx, p->src); + free_tl(bmk, p->dst); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + fmt_t fmts[] = { FMT_BF16 }; + u8 fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + + for (int dst_align = 0; dst_align < 2; dst_align++) { + for (int mode = 0; mode < VLC_CMP_MODE_MAX; mode++) { + for (u8 fmt_i = 0; fmt_i < fmts_sz; fmt_i++) { + fmt_t fmt = fmts[fmt_i]; + param_t p; + memset(&p, 0, sizeof(p)); + p.dst = alloc_tl(bmk, c->lmem_shape, fmt, dst_align); + assert(p.dst); + + u64 size = tl_shape_size(&p.dst->shape); + u16 *src_data = (u16 *)malloc(sizeof(u16) * size); + vlc_init_testdata(src_data, size, fmt == FMT_I8, fmt == FMT_BF16); + + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + int is_signed = (fmt == FMT_I8); + u8 data_type = (fmt == FMT_BF16) ? 
1 : 0; + + cmd_info.signedness = is_signed; + cmd_info.is_bfloat16 = data_type; + + if (mode == VLC_CMP_MODE_COMPILER) { + bm_vlc_est_weight_bias((u8* )src_data, size * sizeof(u16), (bool)is_signed, (bool)data_type, &cmd_info); + } + + p.src = _alloc_vlc_compressed_tg_gmem(ctx, &c->lmem_shape, fmt, &cmd_info); + + test_param_g2l(ctx, bmk, &p, src_data, &cmd_info); + + free(src_data); + destroy_param_g2l(ctx, bmk, &p); + } + } + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bf16/test_1822_tg_copy_bf16_tensor.cpp b/cviruntime/test/1822/bf16/test_1822_tg_copy_bf16_tensor.cpp new file mode 100644 index 000000000..9c1f34726 --- /dev/null +++ b/cviruntime/test/1822/bf16/test_1822_tg_copy_bf16_tensor.cpp @@ -0,0 +1,110 @@ +#include "../1822_test_util.h" +#include "1822_bf16_util.h" + +typedef bmk1822_tdma_tg2tg_tensor_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tg_shape_t src_shape; + tg_stride_t src_stride; + tg_shape_t dst_shape; + tg_stride_t dst_stride; +} case_t; + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, +}; + +static case_t g_cases[] = { + { + {1, 3, 3, 3}, {27*2, 9*2, 3*2}, + {1, 3, 3, 3}, {27*2, 9*2, 3*2}, + }, + { + // YOLOv2 concat layer + {1, 256, 19, 19}, {92416*2, 361*2, 19*2}, + {1, 256, 19, 19}, {462080*2, 361*2, 19*2}, + } +}; + +static void test_param_g2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + + u64 size = p->src->shape.c * p->src->shape.h * p->src->shape.w; + u16 *src_data = (u16 *)malloc(sizeof(u16) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 0x1234 + i; + + put_tg_bf16_gmem(ctx, p->src, (u8*)src_data); + + bmk1822_tdma_tg2tg_bf16_tensor_copy(bmk, p); + test_submit(ctx); + + u16 *dst_data = (u16*) get_tg_bf16_gmem(ctx, p->dst); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != src_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %x, exp %x\n", + i, dst_data[i], src_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); +} + +static void destroy_param_g2g(bmctx_t *ctx, param_t *p) +{ + free_tg_gmem(ctx, p->src); + free_tg_gmem(ctx, p->dst); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (u32 i = 0; i < nr_fmt; i++) { + param_t p; + bmk1822_tensor_tgmem_t *src, *dst; + src = alloc_tg_bf16_gmem(ctx, c->src_shape, input_fmt[i].src_fmt); + src->stride.n = c->src_stride.n; + src->stride.c = c->src_stride.c; + src->stride.h = c->src_stride.h; + + dst = alloc_tg_bf16_gmem(ctx, c->dst_shape, input_fmt[i].dst_fmt); + dst->stride.n = c->dst_stride.n; + dst->stride.c = c->dst_stride.c; + dst->stride.h = c->dst_stride.h; + + memset(&p, 0, sizeof(p)); + p.src = src; + p.dst = dst; + test_param_g2g(ctx, bmk, &p); + destroy_param_g2g(ctx, &p); + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + 
test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/bm_vlc_compress.h b/cviruntime/test/1822/bm_vlc_compress.h new file mode 100644 index 000000000..e82a96abd --- /dev/null +++ b/cviruntime/test/1822/bm_vlc_compress.h @@ -0,0 +1,672 @@ +#ifndef __BM_VLC_COMPRESS_H__ +#define __BM_VLC_COMPRESS_H__ +#include +#include +#ifdef __cplusplus +extern "C" +{ +#endif + +#define MAX_UNARY_FIELD_SIZE 47 +#define MAX_ORDER_K 5 + + /** + * \data_type 0 means 8bit, 1 means 16bit + */ + static inline size_t get_out_bs_buf_size(u64 in_size, u8 data_type) { + size_t blk_num = (data_type) ? ((in_size + 31) >> 5) : ((in_size + 15) >> 4); + size_t in_size_pad = blk_num << (4 + data_type); + size_t bs_buf_size = in_size_pad + (ceiling_func(blk_num, 16) << 4) + 16; + return bs_buf_size; + } + + typedef struct + { + bool signedness; + bool is_bfloat16; + uint8_t bias0; + uint8_t bias1; + bool zero_guard_en; + } CommandInfo; + typedef struct + { + uint8_t *stream; // stream buffer pointer + int bit_pos; // current pointer (in bit) + int buf_size; // in byte + } StreamBuffer; + +static inline int8_t two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1); +static inline int8_t inv_two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1); +static inline uint8_t center_shift(uint8_t val, uint8_t bias, bool zero_guard); +static inline uint8_t inv_center_shift(uint8_t val, uint8_t bias, bool zero_guard); + +static inline void init_stream(StreamBuffer *bs, const uint8_t *buf, int buf_size, bool read_only); + +static inline void bm_vlc_est_weight_bias(const uint8_t *ibuf, size_t isz, bool signedness, bool isBfloat16, CommandInfo *cmd_info); +static inline void bm_vlc_enc_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info); +static inline void bm_vlc_dec_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf); +static inline void bm_vlc_enc_bf16(const uint16_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info); + static inline void bm_vlc_dec_bf16(const uint8_t *ibuf, size_t isz, uint16_t *obuf); + +static inline uint8_t get_bit_val(uint8_t *buf, int byte_idx, int bit_idx) + { + return (buf[byte_idx] >> bit_idx) & 0x1; + } + +static inline uint8_t sign_to_unsign(uint8_t val) + { + uint8_t sign_i = (val >> 7) & 0x1; + int abs_data_i = abs(((int8_t)val)); + return ((abs_data_i << 1) - sign_i); + } + +static inline int8_t unsign_to_sign(uint8_t val) + { + uint8_t sign_i = val & 0x1; + int abs_data_i = (((int)val) + 1) >> 1; + return (uint8_t)((sign_i == 1) ? 
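/* sign_to_unsign/unsign_to_sign form a zig-zag mapping: 0, -1, 1, -2, 2, ... <-> 0, 1, 2, 3, 4, ..., so small-magnitude signed residuals become small unsigned symbols for the Golomb-Rice coder below. */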
(-abs_data_i) : abs_data_i); + } + +static inline void dispatch_bf16_data(const uint16_t *bf16_in, uint8_t *exp, uint8_t *frac, size_t isz) +{ + for (size_t i = 0; i < isz; i++) + { + exp[i] = (uint8_t)((bf16_in[i] >> 7) & 0xFF); + frac[i] = (uint8_t)(((bf16_in[i] >> 15) << 7) | (bf16_in[i] & 0x7F)); + } +} + +static inline void merge_bf16_data(const uint8_t *exp_in, const uint8_t *frac_in, uint16_t *bf16_out, size_t isz) +{ + memset(bf16_out, 0, sizeof(uint16_t)); + for (size_t i = 0; i < isz; i++) + { + bf16_out[i] = ((frac_in[i] >> 7) << 15) | (exp_in[i] << 7) | (frac_in[i] & 0x7F); + } +} + +// -- streaming operation handler -- +static inline void init_stream(StreamBuffer *bs, const uint8_t *buf, int buf_size, bool read_only) +{ + bs->bit_pos = 0; + bs->stream = (uint8_t *)buf; + bs->buf_size = buf_size; + if (!read_only) + memset((uint8_t *)buf, 0, sizeof(uint8_t) * buf_size); +} + +static inline void write_stream(StreamBuffer *bs, uint8_t *src, int bit_len) +{ + for (int bit = 0; bit < bit_len; bit++) + { + int src_byte_i = bit / 8; + int src_bit_i = bit % 8; + int dest_byte_i = (bs->bit_pos + bit) / 8; + int dest_bit_i = (bs->bit_pos + bit) % 8; + bs->stream[dest_byte_i] |= (get_bit_val(src, src_byte_i, src_bit_i) << dest_bit_i); + } + bs->bit_pos += bit_len; +} + +static inline void move_stream_ptr(StreamBuffer *bs, int bit_len) +{ + bs->bit_pos += bit_len; +} + +static inline void parse_stream(StreamBuffer *bs, uint8_t *dest, int bit_len) +{ + memset(dest, 0, sizeof(uint8_t) * (bit_len + 7) >> 3); + for (int bit = 0; bit < bit_len; bit++) + { + int dest_byte_i = bit / 8; + int dest_bit_i = bit % 8; + int bs_byte_i = (bs->bit_pos + bit) / 8; + int bs_bit_i = (bs->bit_pos + bit) % 8; + dest[dest_byte_i] |= (get_bit_val(bs->stream, bs_byte_i, bs_bit_i) << dest_bit_i); + } + bs->bit_pos += bit_len; +} + +// -- header read/write operation handler -- +static inline void vlc_enc_header(StreamBuffer *bs_header, CommandInfo *cmd_info, size_t blk_bs_size) +{ + write_stream(bs_header, (uint8_t *)&blk_bs_size, 24); // bit[23:0] compressed block stream size + move_stream_ptr(bs_header, 4); // bit[27:24] reserved + write_stream(bs_header, (uint8_t *)&cmd_info->signedness, 1); // bit[28] signedness + write_stream(bs_header, (uint8_t *)&cmd_info->is_bfloat16, 1); // bit[29] data type + move_stream_ptr(bs_header, 2); // bit[31:30] bit depth + write_stream(bs_header, (uint8_t *)&cmd_info->bias0, 8); // bit[39:32] bias0 for symbol remapping + write_stream(bs_header, (uint8_t *)&cmd_info->bias1, 7); // bit[46:40] bias1 for symbol remapping + write_stream(bs_header, (uint8_t *)&cmd_info->zero_guard_en, 1); // bit[47] zero guard +} + +static inline void vlc_dec_header(StreamBuffer *bs_header, CommandInfo *cmd_info) +{ + move_stream_ptr(bs_header, 28); // bit[27:24] reserved + parse_stream(bs_header, (uint8_t *)&cmd_info->signedness, 1); // bit[28] signedness + parse_stream(bs_header, (uint8_t *)&cmd_info->is_bfloat16, 1); // bit[29] data type + move_stream_ptr(bs_header, 2); + parse_stream(bs_header, (uint8_t *)&cmd_info->bias0, 8); // bit[39:32] bias0 for symbol remapping + parse_stream(bs_header, (uint8_t *)&cmd_info->bias1, 7); // bit[46:40] bias1 for symbol remapping + parse_stream(bs_header, (uint8_t *)&cmd_info->zero_guard_en, 1); // bit[47] zero guard +} + +// -- symbol remmaping handler -- +static inline uint8_t center_shift(uint8_t val, uint8_t bias, bool zero_guard) +{ + if (val == 0 && zero_guard) + return 0; + + int16_t shift_data_i = val - bias; + uint8_t range = (bias <= 128) ? 
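/* center_shift folds symbols symmetrically around bias so that values near the bias (the common case for the bf16 exponent stream) map to small codes; zero_guard reserves code 0 for exact zeros. */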
bias : 255 - bias; + if (bias <= 128) + { + return (val >= (range << 1)) ? val : sign_to_unsign(shift_data_i) + zero_guard; + } + else + { + return (val < (bias - range)) ? (range + bias - val + zero_guard) : (sign_to_unsign(shift_data_i) + zero_guard); + } +} + +static inline uint8_t inv_center_shift(uint8_t val, uint8_t bias, bool zero_guard) +{ + if (val == 0 && zero_guard) + return 0; + + uint8_t unsign_data_i = val - zero_guard; + uint8_t range = (bias <= 128) ? bias : 255 - bias; + if (bias <= 128) + { + return (val >= (range << 1)) ? val : unsign_to_sign(unsign_data_i) + bias; + } + else + { + return (unsign_data_i > (range << 1)) ? (range + bias - val + zero_guard) : unsign_to_sign(unsign_data_i) + bias; + } +} + +static inline int8_t two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1) +{ + if (val == 0) + return 0; + + bool sign = (val < 0) ? true : false; + int32_t abs_val = abs(val); + abs_val -= (sign) ? bias1 : bias0; + abs_val += (abs_val <= 0) ? (127 + sign) : 0; + return (sign) ? -abs_val : abs_val; +} + +static inline int8_t inv_two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1) +{ + if (val == 0) + return 0; + + bool sign = (val < 0) ? true : false; + uint32_t abs_val = abs(val); + abs_val += (sign) ? bias1 : bias0; + int32_t abs_val_minus = abs_val - (127 + sign); + uint8_t abs_val_lsb = ((abs_val_minus <= 0) + ? abs_val + : abs_val_minus) & + 0xFF; + return (sign) ? -abs_val_lsb : abs_val_lsb; +} + +static inline void symbol_remapping(uint8_t *blk_in, uint8_t *blk_out, uint8_t bias0, uint8_t bias1, bool signedness, bool is_bf16_exp, bool zero_guard) +{ + if (is_bf16_exp == false && signedness == false) + { + // remapping bypass + memcpy(blk_out, blk_in, sizeof(uint8_t) * 16); + return; + } + + if (is_bf16_exp == true) + { + // center circular shift + for (int i = 0; i < 16; i++) + { + blk_out[i] = center_shift(blk_in[i], bias0, zero_guard); + } + } + else + { + // two-side circular shift + for (int i = 0; i < 16; i++) + { + int8_t shift_data_i = two_side_circular_shift((int8_t)blk_in[i], bias0, bias1); + blk_out[i] = sign_to_unsign(shift_data_i); + } + } +} + +static inline void inv_symbol_remapping(uint8_t *blk_in, uint8_t *blk_out, uint8_t bias0, uint8_t bias1, bool signedness, bool is_bf16_exp, bool zero_guard) +{ + if (is_bf16_exp == false && signedness == false) + { + // remapping bypass + memcpy(blk_out, blk_in, sizeof(uint8_t) * 16); + return; + } + + if (is_bf16_exp == true) + { + // center circular shift + for (int i = 0; i < 16; i++) + { + blk_out[i] = inv_center_shift(blk_in[i], bias0, zero_guard); + } + } + else + { + // two-side circular shift + for (int i = 0; i < 16; i++) + { + int8_t sign_data_i = unsign_to_sign(blk_in[i]); + blk_out[i] = (uint8_t)inv_two_side_circular_shift(sign_data_i, bias0, bias1); + } + } +} + +static inline int vlc_estimate_block_order(uint8_t *blk_in, bool bf16_zvc_en) +{ + int best_k = 0; + int best_bs_size = 0x7FFFFFFF; + + for (int k = 0; k <= (int)MAX_ORDER_K; k++) + { + uint8_t remain_field_size = k << 4; + int unary_field_len = 0; + for (int i = 0; i < 16; i++) + { + uint8_t group_idx = blk_in[i] >> k; + unary_field_len += (group_idx + 1); + } + int znum_bit = (bf16_zvc_en && k > 0) ? 4 : 0; + int blk_size = (unary_field_len <= MAX_UNARY_FIELD_SIZE) + ? remain_field_size + unary_field_len + znum_bit + : 255; + if (blk_size < best_bs_size) + { + best_k = k; + best_bs_size = blk_size; + } + } + + best_k = (best_bs_size > 128) ? 
-1 : best_k;
+ return best_k;
+}
+// -- vlc block parallel GR encode/decode --
+static inline uint8_t vlc_gr_enc_block_data(uint8_t *blk_in, StreamBuffer *bs, int order_k, bool bf16_zvc_en)
+{
+ // uncompressed mode
+ if (order_k == -1)
+ {
+ write_stream(bs, blk_in, 128);
+ return 128;
+ }
+
+ // remain field
+ uint8_t remain_field[16] = {0};
+ uint8_t unary_field[8] = {0};
+ uint8_t sym_end_pos[16] = {0};
+ uint8_t unary_field_len = 0;
+ int sym_end_pos_accum = -1;
+
+ // bit plane encode for remain field
+ for (int k = 0; k < order_k; k++)
+ {
+ uint8_t bit_plane0 = 0, bit_plane1 = 0;
+ for (int i = 0; i < 8; i++)
+ {
+ bit_plane0 |= (get_bit_val(blk_in, i, k) << i);
+ bit_plane1 |= (get_bit_val(blk_in, i + 8, k) << i);
+ }
+ remain_field[k << 1] = bit_plane0;
+ remain_field[(k << 1) + 1] = bit_plane1;
+ }
+ write_stream(bs, remain_field, order_k << 4);
+
+ if (bf16_zvc_en && order_k > 0)
+ {
+ int zero_num = 0;
+ for (int i = 0; i < 16; i++)
+ {
+ if (blk_in[i] == 0)
+ zero_num++;
+ }
+ assert(zero_num < 16);
+ write_stream(bs, (uint8_t *)&zero_num, 4);
+ }
+
+ // unary encode for unary field
+ for (int i = 0; i < 16; i++)
+ {
+ int group_idx = blk_in[i] >> order_k;
+ sym_end_pos_accum += (group_idx + 1);
+ sym_end_pos[i] = sym_end_pos_accum;
+ int byte_idx = sym_end_pos[i] / 8;
+ int bit_idx = sym_end_pos[i] % 8;
+ unary_field[byte_idx] |= (1 << (bit_idx));
+ }
+ unary_field_len = sym_end_pos[15] + 1;
+ assert(unary_field_len <= MAX_UNARY_FIELD_SIZE);
+ uint8_t ulen = (unary_field_len - 16) & 0x1F;
+ write_stream(bs, unary_field, unary_field_len);
+
+ return ulen;
+}
+
+static inline void vlc_gr_dec_block_data(StreamBuffer *bs, uint8_t bs_size, uint8_t *rec, int order_k, bool bf16_zvc_en)
+{
+ assert(bs_size <= 128);
+ // uncompressed mode
+ if (order_k == -1)
+ {
+ parse_stream(bs, rec, 128);
+ return;
+ }
+
+ // remain field
+ uint8_t remain_data[16] = {0};
+ uint8_t remain_bs[16] = {0};
+ uint8_t unary_field[8] = {0};
+ uint8_t sym_end_pos[16] = {0};
+ uint8_t unary_sym[16] = {0};
+ uint8_t remain_field_size = order_k << 4;
+
+ parse_stream(bs, remain_bs, remain_field_size);
+ // bit plane decode for remain field
+ for (int k = 0; k < order_k; k++)
+ {
+ for (int i = 0; i < 8; i++)
+ {
+ remain_data[i] |= (get_bit_val(remain_bs, k << 1, i) << k);
+ remain_data[i + 8] |= (get_bit_val(remain_bs, (k << 1) + 1, i) << k);
+ }
+ }
+
+ // zero number info
+ int znum_bit = (bf16_zvc_en && order_k > 0) ? 4 : 0;
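+ /*
+ * Note (added for clarity): when bf16 zero-value compression is active
+ * and order k > 0, a 4-bit zero-count field sits between the remain
+ * field and the unary field. Example of the block-size math for k = 2
+ * and unary_field_len = 20: the encoder stores ulen = (20 - 16) & 0x1F = 4
+ * and k_info = (2 << 5) + 4 = 0x44; the decoder then recovers
+ * blk_bs_size = (2 << 4) + 4 + 16 (+ znum_bit) = 52 bits.
+ */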
+ uint8_t znum = 0;
+ parse_stream(bs, &znum, znum_bit);
+
+ // unary decode for unary field
+ uint8_t unary_field_len = bs_size - remain_field_size - znum_bit;
+ parse_stream(bs, unary_field, unary_field_len);
+
+ int sym_cnt = 0;
+ for (uint8_t ubit_i = 0; ubit_i < unary_field_len; ubit_i++)
+ {
+ int byte_idx = ubit_i / 8;
+ int bit_idx = ubit_i % 8;
+ if (get_bit_val(unary_field, byte_idx, bit_idx) == 1)
+ {
+ sym_end_pos[sym_cnt] = ubit_i;
+ sym_cnt++;
+ }
+ }
+ unary_sym[0] = sym_end_pos[0];
+ for (int i = 1; i < 16; i++)
+ {
+ unary_sym[i] = sym_end_pos[i] - sym_end_pos[i - 1] - 1;
+ }
+ for (int i = 0; i < 16; i++)
+ {
+ rec[i] = (unary_sym[i] << order_k) + remain_data[i];
+ }
+}
+
+// -- vlc encode int8 entry function --
+static inline void bm_vlc_enc_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info)
+{
+ StreamBuffer bs_header, bs_kmap, bs_data;
+ size_t blk_num = (isz + 15) >> 4;
+ size_t header_size = 16;
+ size_t kmap_size = ceiling_func(blk_num, 16) << 4;
+ size_t bs_buf_size = header_size + kmap_size + (blk_num << 4);
+ uint8_t *bsbuf = (uint8_t *)calloc(bs_buf_size, sizeof(uint8_t));
+
+ // block encode
+ init_stream(&bs_kmap, bsbuf + header_size, kmap_size, false);
+ init_stream(&bs_data, bsbuf + header_size + kmap_size, blk_num << 4, false);
+
+ for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++)
+ {
+ uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0};
+ size_t in_size = (blk_idx == (blk_num - 1)) ? isz - (blk_idx << 4) : 16;
+ memcpy(blk_data, &ibuf[blk_idx << 4], sizeof(uint8_t) * in_size);
+
+ symbol_remapping(blk_data, blk_sr_data, cmd_info->bias0, cmd_info->bias1, cmd_info->signedness, false, false);
+
+ int k = vlc_estimate_block_order(blk_sr_data, false);
+ uint8_t ulen = vlc_gr_enc_block_data(blk_sr_data, &bs_data, k, false);
+ uint8_t k_info = (k == -1) ? 0xE0 : (k << 5) + ulen;
+ write_stream(&bs_kmap, &k_info, 8);
+ }
+
+ int blk_bs_size = ceiling_func(((bs_data.bit_pos + 7) >> 3), 16) << 4; // 16 byte align
+ *osz = header_size + kmap_size + blk_bs_size;
+
+ // write header
+ init_stream(&bs_header, bsbuf, header_size, false);
+ vlc_enc_header(&bs_header, cmd_info, blk_bs_size);
+
+ memcpy(obuf, bsbuf, (*osz) * sizeof(uint8_t));
+ free(bsbuf);
+}
+
+// -- vlc decode int8 entry function --
+static inline void bm_vlc_dec_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf)
+{
+ StreamBuffer bs_header, bs_kmap, bs_data;
+ CommandInfo cmd_info;
+ memset(&cmd_info, 0, sizeof(CommandInfo));
+
+ size_t blk_num = (isz + 15) >> 4;
+ int header_size = 16;
+ int kmap_size = ceiling_func(blk_num, 16) << 4;
+
+ // parse header
+ init_stream(&bs_header, ibuf, header_size, true);
+ vlc_dec_header(&bs_header, &cmd_info);
+
+ // block decode
+ init_stream(&bs_kmap, ibuf + header_size, kmap_size, true);
+ init_stream(&bs_data, ibuf + header_size + kmap_size, blk_num << 4, true);
+
+ for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++)
+ {
+ uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0};
+ uint8_t k_info = 0;
+ parse_stream(&bs_kmap, &k_info, 8);
+ uint8_t ulen = k_info & 0x1F;
+ int k = (k_info >> 5 == 7) ? -1 : k_info >> 5;
+ int blk_bs_size = (k == -1) ? 128 : (k << 4) + ulen + 16;
+ vlc_gr_dec_block_data(&bs_data, blk_bs_size, blk_data, k, false);
+
+ inv_symbol_remapping(blk_data, blk_sr_data, cmd_info.bias0, cmd_info.bias1, cmd_info.signedness, false, false);
+
+ int out_size = (blk_idx == (blk_num - 1)) ?
isz - (blk_idx << 4) : 16;
+ memcpy(&obuf[blk_idx << 4], blk_sr_data, sizeof(uint8_t) * out_size);
+ }
+}
+
+// -- vlc encode bfloat16 entry function --
+static inline void bm_vlc_enc_bf16(const uint16_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info)
+{
+ StreamBuffer bs_header, bs_kmap, bs_data;
+ size_t blk_num = (isz + 31) >> 5; // 32 bytes per block
+ size_t header_size = 16;
+ size_t kmap_size = ceiling_func(blk_num, 16) << 4;
+ size_t bs_buf_size = header_size + kmap_size + (blk_num << 5);
+ uint8_t *bsbuf = (uint8_t *)calloc(bs_buf_size, sizeof(uint8_t));
+
+ // block encode
+ init_stream(&bs_kmap, bsbuf + header_size, kmap_size, false);
+ init_stream(&bs_data, bsbuf + header_size + kmap_size, blk_num << 5, false);
+
+ for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++)
+ {
+ uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0}, blk_data_frac[16] = {0};
+ size_t in_num = (blk_idx == (blk_num - 1)) ? ((isz >> 1) - (blk_idx << 4)) : 16;
+ dispatch_bf16_data(&ibuf[blk_idx << 4], blk_data, blk_data_frac, in_num);
+
+ // exp: BGR encode
+ symbol_remapping(blk_data, blk_sr_data, cmd_info->bias0, cmd_info->bias1, false, true, cmd_info->zero_guard_en);
+
+ int k = vlc_estimate_block_order(blk_sr_data, cmd_info->zero_guard_en);
+ uint8_t ulen = vlc_gr_enc_block_data(blk_sr_data, &bs_data, k, cmd_info->zero_guard_en);
+ uint8_t k_info = (k == -1) ? 0xE0 : (k << 5) + ulen;
+ write_stream(&bs_kmap, &k_info, 8);
+
+ // frac: implicit zero compression
+ for (size_t i = 0; i < 16; i++)
+ {
+ if (!cmd_info->zero_guard_en || blk_data[i] != 0)
+ {
+ write_stream(&bs_data, &blk_data_frac[i], 8);
+ }
+ }
+ }
+
+ int blk_bs_size = ceiling_func(((bs_data.bit_pos + 7) >> 3), 16) << 4; // 16 byte align
+ *osz = header_size + kmap_size + blk_bs_size;
+
+ // write header
+ init_stream(&bs_header, bsbuf, header_size, false);
+ vlc_enc_header(&bs_header, cmd_info, blk_bs_size);
+
+ memcpy(obuf, bsbuf, (*osz) * sizeof(uint8_t));
+ free(bsbuf);
+}
+
+// -- vlc decode bfloat16 entry function --
+static inline void bm_vlc_dec_bf16(const uint8_t *ibuf, size_t isz, uint16_t *obuf)
+{
+ StreamBuffer bs_header, bs_kmap, bs_data;
+ CommandInfo cmd_info;
+ memset(&cmd_info, 0, sizeof(CommandInfo));
+
+ size_t blk_num = (isz + 31) >> 5; // 32 bytes per block
+ int header_size = 16;
+ int kmap_size = ceiling_func(blk_num, 16) << 4;
+
+ // parse header
+ init_stream(&bs_header, ibuf, header_size, true);
+ vlc_dec_header(&bs_header, &cmd_info);
+
+ // block decode
+ init_stream(&bs_kmap, ibuf + header_size, kmap_size, true);
+ init_stream(&bs_data, ibuf + header_size + kmap_size, blk_num << 5, true);
+
+ for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++)
+ {
+ uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0}, blk_data_frac[16] = {0};
+ uint8_t k_info = 0;
+ parse_stream(&bs_kmap, &k_info, 8);
+ uint8_t ulen = k_info & 0x1F;
+ int k = (k_info >> 5 == 7) ? -1 : k_info >> 5;
+ int znum_bit = (cmd_info.zero_guard_en && k > 0) ? 4 : 0;
+ uint8_t blk_bs_size = (k == -1) ? 128 : (k << 4) + ulen + 16 + znum_bit;
+
+ // exp: BGR decode
+ vlc_gr_dec_block_data(&bs_data, blk_bs_size, blk_data, k, cmd_info.zero_guard_en);
+
+ inv_symbol_remapping(blk_data, blk_sr_data, cmd_info.bias0, cmd_info.bias1, false, true, cmd_info.zero_guard_en);
+
+ size_t out_num = (blk_idx == (blk_num - 1)) ?
((isz >> 1) - (blk_idx << 4)) : 16; + + // frac: implicit zero compression + for (size_t i = 0; i < out_num; i++) + { + if (!cmd_info.zero_guard_en || blk_sr_data[i] != 0) + { + parse_stream(&bs_data, &blk_data_frac[i], 8); + } + } + merge_bf16_data(blk_sr_data, blk_data_frac, &obuf[blk_idx << 4], out_num); + } +} + +// -- offline estimate model weight params -- +static inline void bm_vlc_est_weight_bias(const uint8_t *ibuf, size_t isz, bool signedness, bool isBfloat16, CommandInfo *cmd_info) +{ + assert(!(isBfloat16 && signedness)); // WARNING: signedness MUST be 0 as isBfloat16==True + + cmd_info->is_bfloat16 = isBfloat16; + if (isBfloat16 == false && signedness == true) + { + // two-side circular shift + int hist[256] = {0}; + for (size_t i = 0; i < isz; i++) + { + hist[ibuf[i]]++; + } + + int8_t pos_v = 1; + //while (pos_v < 128) + // comparison is always true due to limited range of data type [-Werror=type-limits] + while (true) + { + if (hist[((uint8_t)pos_v)] == 0) + { + pos_v++; + } + else + { + break; + } + } + //cmd_info->bias0 = (pos_v > 1 && pos_v < 128) ? (pos_v - 1) : 0; + // comparison is always true due to limited range of data type [-Werror=type-limits] + cmd_info->bias0 = (pos_v > 1) ? (pos_v - 1) : 0; + int8_t neg_v = -1; + //while (neg_v >= (-128)) // comparison is always true due to limited range of data type [-Werror=type-limits] + while (true) + { + if (hist[(uint8_t)neg_v] == 0) + { + neg_v--; + } + else + { + break; + } + } + //cmd_info->bias1 = (neg_v < -1 && neg_v >= -128) ? abs(neg_v + 1) : 0; + // comparison is always true due to limited range of data type [-Werror=type-limits] + cmd_info->bias1 = (neg_v < -1) ? abs(neg_v + 1) : 0; + cmd_info->signedness = true; + } + + if (isBfloat16 == true) + { + // center shift + int64_t exp_accum = 0; + uint16_t *bf16_in = (uint16_t *)ibuf; + size_t inum = (isz >> 1), cnt = 0; + for (size_t i = 0; i < inum; i++) + { + uint8_t exp = ((bf16_in[i] >> 7) & 0xFF); + if (exp != 0) + { + exp_accum += exp; + cnt++; + } + } + if (cnt > 0) + { + cmd_info->bias0 = (uint8_t)((exp_accum / (float)cnt) + 0.5); + } + cmd_info->zero_guard_en = (inum == cnt) ? 
false : true; + cmd_info->signedness = false; + } +} + #ifdef __cplusplus +} +#endif + +#endif /* __BM_VLC_COMPRESS_H__ */ diff --git a/cviruntime/test/1822/compression.h b/cviruntime/test/1822/compression.h new file mode 100644 index 000000000..10452c738 --- /dev/null +++ b/cviruntime/test/1822/compression.h @@ -0,0 +1,367 @@ +#ifndef COMPRESSION_H +#define COMPRESSION_H + +typedef struct { + u32 compress_md; + u32 bit_length; + int is_signed; + + u64 total_data_num; + u32 non_zero_data_num; + + u64 header_bytes; + u64 map_bytes; + u64 data_bytes; + u64 total_bytes; + + int compressed_min; + int compressed_max; +} compression_info_t; + +typedef struct { + u64 header_offset; + u64 header_size; + u64 map_offset; + u64 map_size; + u64 data_offset; + u64 data_size; + u64 total_size; +} compress_addr_info; + +static u64 compression_map_bytes(u64 total_data_num) +{ + u64 bit_alignment = 16 * 8; + u64 bits = total_data_num; + + return ceiling_func(bits, bit_alignment)*16; +} + +static u64 compression_map_clear_bytes(u64 total_data_num) +{ + u64 bit_alignment = 2 * 8; + u64 bits = total_data_num; + + return ceiling_func(bits, bit_alignment)*2; +} + + +static u64 compression_data_bytes(u64 non_zero_data_num, u32 bit_length) +{ + if (bit_length == 1) + return 0; + + u64 bit_alignment = 8; + u64 bits = non_zero_data_num * bit_length; + + return ceiling_func(bits, bit_alignment); +} + +static inline u32 compression_bit_length(u32 compress_md) +{ + switch (compress_md) { + case 0: + return 8; + case 1: + return 4; + case 2: + return 2; + case 3: + return 1; + default: + assert(0); + } +} + +static inline void compute_compressed_range( + u32 bit_length, int is_signed, int *min, int *max) +{ + if (is_signed) { + switch (bit_length) { + case 1: + *min = -1; + *max = 0; + return; + case 2: + *min = -2; + *max = 1; + return; + case 4: + *min = -8; + *max = 7; + return; + case 8: + *min = -128; + *max = 127; + return; + } + } else { + *min = 0; + switch (bit_length) { + case 1: + *max = 1; + return; + case 2: + *max = 3; + return; + case 4: + *max = 15; + return; + case 8: + *max = 255; + return; + } + } + assert(0); +} + +static inline int saturate(int val, int max, int min) +{ + if (val < min) + return min; + else if (val > max) + return max; + else + return val; +} + +static inline u64 count_non_zero_results( + u8 buf[], u64 size, int is_signed, int max, int min) +{ + u64 n = 0; + + for (u64 i = 0; i < size; i++) { + int val = is_signed? 
(s8)buf[i]: buf[i]; + int res = saturate(val, max, min); + if (res != 0) + n++; + } + + return n; +} + +static inline void set_map_bit(u8 map[], u64 i) +{ + u64 byte_i = i / 8; + u64 bit_i = i % 8; + + map[byte_i] |= (1 << bit_i); +} + +static inline u8 read_map_bit(u8 map[], u64 i) +{ + u64 byte_i = i / 8; + u64 bit_i = i % 8; + + return (map[byte_i] >> bit_i) & 1; +} + +static inline void parse_header( + u32 header, int *is_signed, u32 *compress_md, u32 *nz_num) +{ + *is_signed = (header >> 29) & 1; + *compress_md = (header >> 24) & 0b11; + *nz_num = header & 0xffffff; +} + +static inline void fill_header(u32 *hdr, compression_info_t *info) +{ + if(compression_bit_length(info->compress_md)!=1) + { + *hdr = (info->is_signed << 29) | (1 << 28) | + (info->compress_md << 24) | + info->non_zero_data_num; + }else + { + *hdr = (info->is_signed << 29) | (1 << 28) | + (info->compress_md << 24); + } +} + +static inline void fill_map(u8 map[], u8 buf[], compression_info_t *info) +{ + int min = info->compressed_min; + int max = info->compressed_max; + + u64 clear_map = compression_map_clear_bytes(info->total_data_num); + for (u64 i = 0; i < clear_map; i++) + map[i] = 0; + + for (u64 i = 0; i < info->total_data_num; i++) { + int val = info->is_signed? (s8)buf[i]: buf[i]; + int res = saturate(val, max, min); + if (res != 0) + set_map_bit(map, i); + } +} + +static inline void compress_one_data( + u8 data[], u64 i, u8 val, compression_info_t *info) +{ + u32 bit_len = info->bit_length; + u32 data_per_byte = 8 / bit_len; + + u32 byte_i = i / data_per_byte; + u32 bit_i = (i % data_per_byte) * bit_len; + u8 mask = (1 << bit_len) - 1; + + data[byte_i] |= (val & mask) << bit_i; +} + +static inline u8 sign_extend(u8 val, u32 bit_len) +{ + int shift = 8 - bit_len; + return (s8)(val << shift) >> shift; +} + +static inline u8 decompress_one_data( + u8 data[], u64 i, compression_info_t *info) +{ + u32 bit_len = info->bit_length; + u32 data_per_byte = 8 / bit_len; + + u32 byte_i = i / data_per_byte; + u32 bit_i = (i % data_per_byte) * bit_len; + u8 mask = (1 << bit_len) - 1; + + u8 val = (data[byte_i] >> bit_i) & mask; + if (info->is_signed) + val = sign_extend(val, bit_len); + + return val; +} + +static inline void fill_data(u8 data[], u8 buf[], compression_info_t *info) +{ + int min = info->compressed_min; + int max = info->compressed_max; + + for (u64 i = 0; i < info->data_bytes; i++) + data[i] = 0; + + u64 nz_i = 0; + for (u64 i = 0; i < info->total_data_num; i++) { + int val = info->is_signed? 
(s8)buf[i]: buf[i]; + int res = saturate(val, max, min); + if (res != 0) { + compress_one_data(data, nz_i, res, info); + nz_i++; + } + } +} + +static inline compression_info_t make_compression_info( + u8 buf[], u64 size, u32 compress_md, int is_signed) +{ + u32 bit_length = compression_bit_length(compress_md); + + int min, max; + compute_compressed_range(bit_length, is_signed, &min, &max); + + u32 nz_num = count_non_zero_results(buf, size, is_signed, max, min); + assert(nz_num <= 0xffffff); + + compression_info_t info; + info.compress_md = compress_md; + info.bit_length = bit_length; + info.is_signed = is_signed; + info.total_data_num = size; + info.non_zero_data_num = nz_num; + info.header_bytes = 16; + info.map_bytes = compression_map_bytes(size); + info.data_bytes = compression_data_bytes(nz_num, bit_length); + info.total_bytes = info.header_bytes + info.map_bytes + info.data_bytes; + info.compressed_min = min; + info.compressed_max = max; + return info; +} + +static inline compression_info_t parse_compression_info( + u8 compressed_buf[], u64 max_size, u64 total_data_num) +{ + u64 header_bytes = 16; + assert(header_bytes <= max_size); + + int is_signed; + u32 compress_md, nz_num; + parse_header(*(u32 *)compressed_buf, &is_signed, &compress_md, &nz_num); + + u32 bit_length = compression_bit_length(compress_md); + int min, max; + compute_compressed_range(bit_length, is_signed, &min, &max); + + compression_info_t info; + info.compress_md = compress_md; + info.bit_length = compression_bit_length(compress_md); + info.is_signed = is_signed; + info.total_data_num = total_data_num; + info.non_zero_data_num = nz_num; + info.header_bytes = header_bytes; + info.map_bytes = compression_map_bytes(total_data_num); + info.data_bytes = compression_data_bytes(nz_num, info.bit_length); + info.total_bytes = header_bytes + info.map_bytes + info.data_bytes; + info.compressed_min = min; + info.compressed_max = max; + + assert(info.total_bytes <= max_size); + + return info; +} + +static inline u8 * compress( + u8 buf[], u64 size, u32 compress_md, int is_signed, compress_addr_info *compressed_data) +{ + compression_info_t info = + make_compression_info(buf, size, compress_md, is_signed); + + assert(info.total_bytes < 0x100000); + static u8 *result = (u8 *)malloc(sizeof(u8) * 0x100000); + u32 *hdr = (u32 *)result; + u8 *map = &result[info.header_bytes]; + u8 *data = &map[info.map_bytes]; + + fill_header(hdr, &info); + fill_map(map, buf, &info); + if (info.bit_length != 1) + fill_data(data, buf, &info); + + compressed_data->header_offset = 0; + compressed_data->header_size = 4; + compressed_data->map_offset = info.header_bytes; + compressed_data->map_size = compression_map_clear_bytes(info.total_data_num); + compressed_data->data_offset = info.map_bytes + info.header_bytes; + compressed_data->data_size = info.data_bytes; + compressed_data->total_size = info.total_bytes; + + return result; +} + +static inline void decompress( + u8 buf[], u64 size, u8 compressed_buf[], u64 max_size) +{ + compression_info_t info = + parse_compression_info(compressed_buf, max_size, size); + assert(info.total_bytes <= max_size); + assert(info.total_data_num == size); + + u8 *map = &compressed_buf[info.header_bytes]; + if (info.bit_length == 1) { + for (u64 i = 0; i < size; i++) { + u8 val = read_map_bit(map, i); + buf[i] = info.is_signed? 
sign_extend(val, 1): val; + } + } else { + u8 *data = &map[info.map_bytes]; + u64 data_i = 0; + for (u64 i = 0; i < size; i++) { + u8 val = read_map_bit(map, i); + if (val == 0) { + buf[i] = 0; + } else { + buf[i] = decompress_one_data(data, data_i, &info); + data_i++; + } + } + } +} + +#endif /* COMPRESSION_H */ diff --git a/cviruntime/test/1822/test_1822_avg_pooling.cpp b/cviruntime/test/1822/test_1822_avg_pooling.cpp new file mode 100644 index 000000000..c3d368bdc --- /dev/null +++ b/cviruntime/test/1822/test_1822_avg_pooling.cpp @@ -0,0 +1,234 @@ +#include "1822_test_util.h" + +#define INVALIDE_STRIDE (-1) +typedef bmk1822_tiu_average_pooling_param_t param_t; + +static void print_pooling_param(const param_t *p) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + + printf(" Pooling parameters:\n"); + printf(" ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw); + printf(" opd0_sign = %d\n", p->ifmap->fmt == FMT_I8); + printf(" weight = (%d, %d)\n", p->kh, p->kw); + printf(" padding = (%d, %d, %d, %d)\n", + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right); + printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w); + printf(" ins0 = (%d, %d, %d, %d)\n", + p->ins_h, p->ins_last_h, p->ins_w, p->ins_last_w); + printf(" avg_pooling_const = %d\n", p->avg_pooling_const); + printf(" rshift_bits = %d\n", p->rshift_bits); +} + +static s8 *alloc_input(param_t *p) +{ + u64 size = tl_shape_size(&p->ifmap->shape); + s8 *data = (s8 *)xmalloc(size); + if (!data) + return NULL; + + for (u64 i = 0; i < size; i++) + data[i] = rand() % 256 - 128; + return data; +} + +static s8 *alloc_output(param_t *p) +{ + u64 size = tl_shape_size(&p->ofmap->shape); + return (s8 *)xmalloc(size); +} + +static int pooling_ih_ext(param_t *p, int ih) +{ + int ins = p->ins_h; + int ins_last = p->ins_last_h; + int pad = p->pad_top + p->pad_bottom; + return (ih - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_iw_ext(param_t *p, int iw) +{ + int ins = p->ins_w; + int ins_last = p->ins_last_w; + int pad = p->pad_left + p->pad_right; + return (iw - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_oh(param_t *p, int ih) +{ + int ih_ext = pooling_ih_ext(p, ih); + return (ih_ext - p->kh) / p->stride_h + 1; +} + +static int pooling_ow(param_t *p, int iw) +{ + int iw_ext = pooling_iw_ext(p, iw); + return (iw_ext - p->kw) / p->stride_w + 1; +} + +static void free_pooling_param( + bmk_ctx_t *ctx, + param_t *p) +{ + if (p->ifmap) + free_tl(ctx, p->ifmap); + if (p->ofmap) + free_tl(ctx, p->ofmap); +} + +static param_t random_pooling_param(bmk_ctx_t *ctx, int stride_w, int stride_h) +{ + srand(clock()); + param_t p; + + memset(&p, 0, sizeof(p)); + +retry: + int in = rand() % 5 + 1; + int ic = rand() % (3 * BM1822_HW_NPU_NUM) + 1; + int ih = rand() % 30 + 3 + (INVALIDE_STRIDE == stride_h ? 0 : stride_h); + int iw = rand() % 30 + 6 + (INVALIDE_STRIDE == stride_w ? 0 : stride_w); + int opd0_sign = rand() % 2; + + p.kh = rand() % 7 + 1; + p.kw = rand() % 7 + 1; + p.stride_h = INVALIDE_STRIDE == stride_h ? rand() % (p.kh) + 1 : stride_h; + p.stride_w = INVALIDE_STRIDE == stride_w ? 
rand() % (p.kh) + 1 : stride_w; + p.ins_h = rand() % p.kh; + p.ins_w = rand() % p.kw; + p.ins_last_h = rand() % p.kh; + p.ins_last_w = rand() % p.kw; + p.pad_top = rand() % p.kh; + p.pad_bottom = rand() % p.kh; + p.pad_left = rand() % p.kw; + p.pad_right= rand() % p.kw; + p.avg_pooling_const = rand() % 256; + p.rshift_bits = rand() % 32; + + tl_shape_t ifmap_shape; + ifmap_shape.n = in; + ifmap_shape.c = ic; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + + int on = in; + int oc = ic; + int oh = pooling_oh(&p, ih); + int ow = pooling_ow(&p, iw); + tl_shape_t ofmap_shape; + ofmap_shape.n = on; + ofmap_shape.c = oc; + ofmap_shape.h = oh; + ofmap_shape.w = ow; + + fmt_t fmt = opd0_sign? FMT_I8: FMT_U8; + p.ofmap = bmk1822_lmem_alloc_tensor(ctx, ofmap_shape, FMT_I8, 1); + p.ifmap = bmk1822_lmem_alloc_tensor(ctx, ifmap_shape, fmt, 1); + + if ((p.kh > pooling_ih_ext(&p, ih)) + || (p.kw > pooling_iw_ext(&p, iw)) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || !p.ofmap + || !p.ifmap) { + printf("retry init_pooling_param\n"); + free_pooling_param(ctx, &p); + goto retry; + } + + return p; +} + +static void compare_results( + param_t *p, + s8 input[], + s8 output[]) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int opd0_sign = (p->ifmap->fmt == FMT_I8); + + s8 *output_ref = alloc_output(p); + bmerr_t ret = native_pooling_ave_int8( + input, &p->avg_pooling_const, NULL, output_ref, + in, ic, ih, iw, p->kh, p->kw, + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right, + p->stride_h, p->stride_w, + p->ins_h, p->ins_w, p->ins_last_h, p->ins_last_w, + opd0_sign, opd0_sign, p->rshift_bits, 1); + assert(ret == BM_SUCCESS); + + int cmp_res = array_cmp_int8( + "Comparing results ...\n", output_ref, output, + tl_shape_size(&p->ofmap->shape)); + + if (cmp_res != 0) { + printf("Comparison FAILED!!!\n"); + print_pooling_param(p); + exit(-1); + } + + free(output_ref); +} + +static int _test_pooling(bmctx_t ctx, bmk_ctx_t *bk_ctx, int stride_w, int stride_h) +{ + param_t p = random_pooling_param(bk_ctx, stride_w, stride_h); + s8 *input = alloc_input(&p); + + put_tensor_g2l(&ctx, bk_ctx, p.ifmap, (u8 *)input); + bmk1822_tiu_average_pooling(bk_ctx, &p); + s8 *output = (s8 *)get_tensor_l2g(&ctx, bk_ctx, p.ofmap); + + compare_results(&p, input, output); + + free_pooling_param(bk_ctx, &p); + free(output); + free(input); + + return 1; +} + +static int test_pooling(bmctx_t ctx, bmk_ctx_t *bk_ctx) { + return _test_pooling(ctx, bk_ctx, INVALIDE_STRIDE, INVALIDE_STRIDE); +} + +static void test_avg_pooling(bmctx_t *ctx, bmk_ctx_t *bk_ctx) +{ + int test_finished_num = 0; + for (u64 i = 0; i < 16; i++) + test_finished_num += test_pooling(*ctx, bk_ctx); + + // test stride extend (0, 31] + int stride_list[] = {15, 16, 31}; + int stride_list_len = sizeof(stride_list) / sizeof(stride_list[0]); + + for (int stride_w_idx = 0; stride_w_idx < stride_list_len; stride_w_idx++) { + for (int stride_h_idx = 0; stride_h_idx < stride_list_len; stride_h_idx++) { + int stride_w = stride_list[stride_w_idx]; + int stride_h = stride_list[stride_h_idx]; + + test_finished_num += _test_pooling(*ctx, bk_ctx, stride_w, stride_h); + } + } + printf("Test finished %d\n", test_finished_num); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_avg_pooling(&ctx, bk_ctx); + + test_exit(&ctx); + return 0; +} diff --git 
a/cviruntime/test/1822/test_1822_base_reg_selection.cpp b/cviruntime/test/1822/test_1822_base_reg_selection.cpp new file mode 100644 index 000000000..a45771082 --- /dev/null +++ b/cviruntime/test/1822/test_1822_base_reg_selection.cpp @@ -0,0 +1,284 @@ +#include "1822_test_util.h" + +typedef struct { + long index; + long offset; +}Base_reg; + +Base_reg base_reg[]={ + {0, 0x000000 }, + {1, 0x100000 }, + {2, 0x200000 }, + {3, 0x300000 }, + {4, 0x400000 }, + {5, 0x500000 }, + {6, 0x600000 }, + {7, 0x700000 }, +}; +static void test_tensor_base_selection( + bmctx_t *ctx, bmk_ctx_t *bk_ctx, u32 reg_index, int offset) +{ + int n = 2; + int c = 66; + int h = 3; + int w = 15; + + int size = n * c * h * w; + u8 *data_x = (u8 *)xmalloc(size); + + for (int i = 0; i < size; i++) + data_x[i] = i - 100; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + tg_shape_t ts_shape; + ts_shape.n = n; + ts_shape.c = c; + ts_shape.h = h; + ts_shape.w = w; + + tl_t *tl_x = alloc_tl(bk_ctx, tl_shape, FMT_I8, 1); + + /* + * Copy test data to the fixed address.(gaddr + offset) + */ + bmshape_t bms = BM_TENSOR_INT8((int)n, (int)c, (int)h, (int)w); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + + gaddr_t gaddr = bmmem_device_addr(dev_mem); + //bmmem_device_t ab_dev_mem = bmmem_device_prealloc(*ctx, NULL, gaddr + offset, &bms); + bmmem_device_t ab_dev_mem = bmmem_device_prealloc_raw(*ctx, NULL, gaddr + offset, bmshape_get_size(&bms)); + + int ret = bm_memcpy_s2d(*ctx, ab_dev_mem, data_x); + assert(ret == BM_SUCCESS); + + /* + * tensor transfer + * g2l array base = offset, index = reg_index + * l2g array base = 0, index = 0 + */ + bm_device_set_base_reg(*ctx, reg_index, offset); + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape = ts_shape; + tg.stride = bmk1822_tensor_tgmem_default_stride(ts_shape, tg.fmt); + tg.base_reg_index = reg_index; + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl_x; + + bmk1822_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + bm_device_set_base_reg(*ctx, 0, 0); + u8 *result_x = get_tensor_l2g(ctx, bk_ctx, tl_x); + for (int i = 0; i < size; i++) { + if (result_x[i] != data_x[i]) { + fprintf(stderr, "compare failed at result_x[%d]\n", i); + exit(-1); + } + } + free(result_x); + + /* + * tensor transfer + * g2l array base = 0, index = reg_index + * l2g array base = 0, index = 0 + */ + bm_device_set_base_reg(*ctx, reg_index, 0); + tg.start_address = gaddr + offset; + bmk1822_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + bm_device_set_base_reg(*ctx, 0, 0); + result_x = get_tensor_l2g(ctx, bk_ctx, tl_x); + for (int i = 0; i < size; i++) { + if (result_x[i] != data_x[i]) { + fprintf(stderr, "compare failed at result_x[%d]\n", i); + exit(-1); + } + } + + /* + * tensor transfer + * g2l, array base = offset, index = reg_index + * l2g, array_base = offset, index = reg_index + */ + bm_device_set_base_reg(*ctx, reg_index, offset); + tg.start_address = gaddr; + tg.base_reg_index = reg_index; + bmk1822_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + bm_device_set_base_reg(*ctx, reg_index, offset); + tg.start_address = gaddr; + bmk1822_tdma_l2tg_tensor_copy_param_t l2g_p; + memset(&l2g_p, 0, sizeof(l2g_p)); + l2g_p.src = tl_x; + l2g_p.dst = &tg; + bmk1822_tdma_l2g_tensor_copy(bk_ctx, &l2g_p); + test_submit(ctx); + ret = bm_memcpy_d2s(*ctx, result_x,ab_dev_mem); + assert(ret == BM_SUCCESS); + + for (int i = 
0; i < size; i++) { + if (result_x[i] != data_x[i]) { + fprintf(stderr, "compare failed at result_x[%d]\n", i); + exit(-1); + } + } + free(result_x); + + bm_device_set_base_reg(*ctx, 0, 0); + bm_device_set_base_reg(*ctx, 1, 0); + free_tl(bk_ctx, tl_x); + bmmem_device_free(*ctx, dev_mem); + bmmem_device_free(*ctx, ab_dev_mem); + free(data_x); +} +static void test_matrix_base_selection( + bmctx_t *ctx, bmk_ctx_t *bk_ctx, u32 reg_index, int offset) +{ + int row = 5; + int col = 16 * 5 + 2; + int size = row * col; + + u8 *data_x = (u8 *)xmalloc(size); + + for (int i = 0; i < size; i++) + data_x[i] = i - 100; + + ml_shape_t ml_shape = + bmk1822_matrix_lmem_default_shape(bk_ctx, row, col, FMT_I8); + mg_shape_t mg_shape; + mg_shape.row = row; + mg_shape.col = col; + + ml_t *ml = + bmk1822_lmem_alloc_matrix(bk_ctx, ml_shape, FMT_I8, 1); + + /* + * Copy test data to the specified offset address. + */ + + bmshape_t bms = BM_MATRIX_INT8(row,col); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + //bmmem_device_t ab_dev_mem = bmmem_device_prealloc(*ctx, NULL, gaddr + offset, &bms); + bmmem_device_t ab_dev_mem = bmmem_device_prealloc_raw(*ctx, NULL, gaddr + offset, bmshape_get_size(&bms)); + + int ret = bm_memcpy_s2d(*ctx, ab_dev_mem, data_x); + assert(ret == BM_SUCCESS); + + /* + * matrix transfer + * g2l array base = offset, index = reg_index + * l2g array base = 0, index = 0 + */ + bm_device_set_base_reg(*ctx, reg_index, offset); + mg_t mg; + mg.base_reg_index = 0; + mg.start_address = gaddr; + mg.shape = mg_shape; + mg.stride.row = mg_shape.col; + mg.base_reg_index = reg_index; + + bmk1822_tdma_tg2l_matrix_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &mg; + p.dst = ml; + + bmk1822_tdma_g2l_matrix_copy(bk_ctx, &p); + test_submit(ctx); + + bm_device_set_base_reg(*ctx, 0, 0); + u8 *result_x = get_matrix_l2g(ctx, bk_ctx, ml); + for (int i = 0; i < size; i++) { + if (result_x[i] != data_x[i]) { + fprintf(stderr, "compare failed at result_x[%d]\n", i); + exit(-1); + } + } + free(result_x); + + /* + * matrix transfer + * g2l array base = 0, index = reg_index + * l2g array base = 0, index = 0 + */ + bm_device_set_base_reg(*ctx, reg_index, 0); + mg.start_address = gaddr + offset; + bmk1822_tdma_g2l_matrix_copy(bk_ctx, &p); + test_submit(ctx); + + bm_device_set_base_reg(*ctx, 0, 0); + result_x = get_matrix_l2g(ctx, bk_ctx, ml); + for (int i = 0; i < size; i++) { + if (result_x[i] != data_x[i]) { + fprintf(stderr, "compare failed at result_x[%d]\n", i); + exit(-1); + } + } + + /* + * Matrix transfer + * g2l, array base = offset, index = reg_index + * l2g, array_base = offset, index = reg_index + */ + bm_device_set_base_reg(*ctx, reg_index, offset); + mg.start_address = gaddr; + mg.base_reg_index = reg_index; + bmk1822_tdma_g2l_matrix_copy(bk_ctx, &p); + test_submit(ctx); + + mg.start_address = gaddr; + bmk1822_tdma_l2tg_matrix_copy_param_t l2g_p; + memset(&l2g_p, 0, sizeof(l2g_p)); + l2g_p.src = ml; + l2g_p.dst = &mg; + + bm_device_set_base_reg(*ctx, reg_index, offset); + + bmk1822_tdma_l2g_matrix_copy(bk_ctx, &l2g_p); + test_submit(ctx); + + ret = bm_memcpy_d2s(*ctx, result_x,ab_dev_mem); + assert(ret == BM_SUCCESS); + + for (int i = 0; i < size; i++) { + if (result_x[i] != data_x[i]) { + fprintf(stderr, "compare failed at result_x[%d]\n", i); + exit(-1); + } + } + + bm_device_set_base_reg(*ctx, 0, 0); + bm_device_set_base_reg(*ctx, 1, 0); + free(result_x); + bmk1822_lmem_free_matrix(bk_ctx, ml); + bmmem_device_free(*ctx, 
dev_mem);
+ bmmem_device_free(*ctx, ab_dev_mem);
+ free(data_x);
+}
+
+int main()
+{
+ bmctx_t ctx;
+ bmk_ctx_t *bk_ctx;
+ test_init(&ctx, &bk_ctx);
+
+ for(int i=0; i<8; i ++)
+ {
+ test_matrix_base_selection(&ctx, bk_ctx, base_reg[i].index, base_reg[i].offset );
+ test_tensor_base_selection(&ctx, bk_ctx, base_reg[i].index, base_reg[i].offset);
+ }
+ test_exit(&ctx);
+ return 0;
+}
diff --git a/cviruntime/test/1822/test_1822_conv.cpp b/cviruntime/test/1822/test_1822_conv.cpp
new file mode 100644
index 000000000..78e8e451f
--- /dev/null
+++ b/cviruntime/test/1822/test_1822_conv.cpp
@@ -0,0 +1,758 @@
+#include "1822_test_util.h"
+
+#define INVALIDE_STRIDE (-1)
+typedef struct {
+ int random_seed;
+ int input_n;
+ int input_c;
+ int input_h;
+ int input_w;
+ int kw;
+ int kh;
+ int dh;
+ int dw;
+ int pad_top;
+ int pad_bot;
+ int pad_left;
+ int pad_right;
+ int ins_h;
+ int ins_h_last;
+ int ins_w;
+ int ins_w_last;
+ int stride_h;
+ int stride_w;
+ int output_c;
+ int using_bias;
+ int bReLU_EN;
+ int r_shift_m;
+ int opd0_sign;
+ int opd1_sign;
+ int opd2_sign;
+} conv_param_t;
+
+static int index_get(int h, int w1, int w2)
+{
+ return h * w1 + w2;
+}
+
+static int matrix_dot_mult(
+ s8 *A, s8 *B, int dim_n, int dim_m,
+ int opd0_sign)
+{
+ int sum = 0;
+ for (int i = 0; i < dim_n; i++) {
+ for (int j = 0; j < dim_m; j++) {
+ int a = opd0_sign ? A[index_get(i, dim_m, j)] : (u8)A[index_get(i, dim_m, j)];
+ int b = B[index_get(i, dim_m, j)];
+ sum += a * b;
+ }
+ }
+ return sum;
+}
+
+static int conv_ref(
+ const conv_param_t *p_param,
+ s8 *ifmap,
+ s8 *weight,
+ s16 *bias,
+ s8 *ofmap)
+{
+ int in = p_param->input_n;
+ int ic = p_param->input_c;
+ int ih = p_param->input_h;
+ int iw = p_param->input_w;
+ int oc = p_param->output_c;
+ int kh = p_param->kh;
+ int kw = p_param->kw;
+ int dh = p_param->dh;
+ int dw = p_param->dw;
+ int stride_h = p_param->stride_h;
+ int stride_w = p_param->stride_w;
+ int pad_top = p_param->pad_top;
+ int pad_bot = p_param->pad_bot;
+ int pad_left = p_param->pad_left;
+ int pad_right = p_param->pad_right;
+ int ins_h = p_param->ins_h;
+ int ins_h_last = p_param->ins_h_last;
+ int ins_w = p_param->ins_w;
+ int ins_w_last = p_param->ins_w_last;
+ int input_sign = p_param->opd0_sign;
+ int r_shift_bits = p_param->r_shift_m;
+ int do_relu = p_param->bReLU_EN;
+
+ int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot);
+ int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right);
+ int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0);
+ int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0);
+
+ int oh = calc_output_hw(ih_ext, kh_ext, stride_h);
+ int ow = calc_output_hw(iw_ext, kw_ext, stride_w);
+
+ int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow);
+ s8 *i_fmap_pad_ker = (s8 *)malloc(kh_ext * kw_ext);
+ if (!result || !i_fmap_pad_ker) {
+ free(result);
+ free(i_fmap_pad_ker);
+ return BM_ERR_FAILURE;
+ }
+
+ memset(result, 0, sizeof(int) * in * oc * oh * ow);
+
+ int ret = BM_SUCCESS;
+
+ s8 *i_fmap_pad = NULL;
+ s8 *kernel_after = NULL;
+ for (int n = 0; n < in; ++n) {
+ for (int c = 0; c < oc; ++c) {
+ for (int cc = 0; cc < ic; ++cc) {
+ fill_pad_fmap_int8(
+ (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0,
+ pad_left, pad_right, pad_top, pad_bot,
+ ins_h, ins_w, ins_h_last, ins_w_last, ih, iw);
+
+ // kernel dilation
+ fill_pad_fmap_int8(
+ (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0,
+ 0, 0, 0, 0, // no padding
+ dh - 1, dw - 1, 0, 0,
+ kh, kw);
+
+ for (int ph = 0; ph < oh; ++ph) {
+ for (int pw = 0; pw < ow; ++pw) {
+ for (int idxh = 0; idxh < kh_ext; ++idxh)
+ for (int idxw = 0; idxw < kw_ext; ++idxw){
+ i_fmap_pad_ker[idxh * kw_ext + idxw] =
+ i_fmap_pad[(idxh+ph*stride_h) * iw_ext +
+ idxw + pw*stride_w];
+ }
+ result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] +=
+ matrix_dot_mult(i_fmap_pad_ker, kernel_after,
+ kh_ext, kw_ext, input_sign);
+ }
+ }
+ }
+
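+ /*
+ * Note (added for clarity): at this point result[] holds the raw
+ * int32 accumulators for output channel c. The per-channel bias[c]
+ * is then added to every oh*ow position, ReLU is applied if enabled,
+ * and satu_2_8bit() right-shifts by r_shift_bits and saturates to int8.
+ */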
+ if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c]; //bias+c ; + } + } + } + + if (do_relu) + relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + + // ofmap is s8, signed + ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow, + &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1, + /*sign_unsign=*/1); + + if (ret != BM_SUCCESS) + goto error_release; + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + +error_release: + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static u8 * transform_weight(const tl_shape_t *s, u8 before[]) +{ + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + u32 size = ic * oc * kh * kw; + u8 *after = (u8 *)malloc(sizeof(u8) * size); + + /* + * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) + */ + for (u32 oci = 0; oci < oc; oci++) { + for (u32 ici = 0; ici < ic; ici++) { + for (u32 khi = 0; khi < kh; khi++) { + for (u32 kwi = 0; kwi < kw; kwi++) { + u32 src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + u32 dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + u8 *data) +{ + const tl_shape_t *s = &tl->shape; + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + u8 *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. bm_memcpy_s2d regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. 
+ */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //bmmem_device_t ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = bm_memcpy_s2d(*ctx, ab_dev_mem, transformed_data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, transformed_data); + assert(ret == BM_SUCCESS); + + tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_I8; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1822_tensor_tgmem_default_stride(tdma_tg.shape, tdma_tg.fmt); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1822_tensor_lmem_default_stride(bk_ctx, tdma_shape, FMT_I8, 0); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1822_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + bmmem_device_free(*ctx, dev_mem); + free(transformed_data); +} + +static s8 * transform_bias(int oc, s16 before[]) +{ + s8 *after = (s8 *)malloc(sizeof(s8) * 2 * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = before[i] & 0xff; + after[i + oc] = (before[i] >> 8) & 0xff; + } + return after; +} + +static void put_conv_bias( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + s16 *data) +{ + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2, oc, 1, 1); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + s8 *transformed_data = transform_bias(oc, data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (u8 *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1822_tensor_tgmem_default_stride(tg.shape, tg.fmt); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, dev_mem); + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static s8 * alloc_input(const conv_param_t *p) 
+{ + int size = conv_input_size(p); + + s8 *buf = (s8 *)malloc(sizeof(s8) * size); + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static s8 * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + s8 *buf = (s8 *)malloc(sizeof(s8) * size); + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static s16 * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + s16 *bias = (s16 *)malloc(sizeof(s16) * oc); + for (int i = 0; i < oc; i++) + bias[i] = rand() % 65536 - 32768; + + return bias; +} + +static tl_t * conv_ifmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd0_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return bmk1822_lmem_alloc_tensor(ctx, s, fmt, 1); +} + +static tl_t * conv_weight_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd1_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return bmk1822_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static tl_t * conv_ofmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + tl_shape_t s; + s.n = p->input_n; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return bmk1822_lmem_alloc_tensor(ctx, s, FMT_I8, 1); +} + +static tl_t * conv_bias_tensor( + bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd2_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return bmk1822_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const bmk1822_tiu_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1822_tiu_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + + dst->ifmap = conv_ifmap_tensor(ctx, p); + dst->weight = conv_weight_tensor(ctx, p); + dst->ofmap = conv_ofmap_tensor(ctx, p); + dst->bias = NULL; + dst->ps32_mode = 0; + if (p->using_bias) + dst->bias = conv_bias_tensor(ctx, p); + + dst->w_is_const = 0; +} + +static void free_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1822_tiu_convolution_param_t *r, + const conv_param_t *p) +{ + if (p->using_bias && r->bias) + free_tl(ctx, r->bias); + if (r->ofmap) + free_tl(ctx, r->ofmap); + if (r->weight) + free_tl(ctx, r->weight); + if (r->ifmap) + free_tl(ctx, r->ifmap); +} + +static void 
_init_conv_param(conv_param_t &p, int stride_w, int stride_h) +{ + printf("init_conv_param\n"); + memset(&p, 0, sizeof(p)); + p.random_seed = clock(); + srand(p.random_seed); + +retry: + p.input_n = rand() % 5 + 1; + p.input_c = rand() % (5 * 32) + 1; + p.kh = rand() % 7 + 1; + p.kw = rand() % 7 + 1; + p.input_h = rand() % 40 + p.kh + (INVALIDE_STRIDE == stride_h ? 0 : stride_h); + p.input_w = rand() % 40 + p.kw + (INVALIDE_STRIDE == stride_w ? 0 : stride_w); + p.output_c = rand() % 10 + 3; + p.stride_h = INVALIDE_STRIDE == stride_h ? rand() % (p.kh) + 1 : stride_h; + p.stride_w = INVALIDE_STRIDE == stride_w ? rand() % (p.kh) + 1 : stride_w; + p.ins_h = rand() % p.kh; + p.ins_w = rand() % p.kw; + p.ins_h_last = rand() % p.kh;; + p.ins_w_last = rand() % p.kw;; + p.dh = rand() % 3 + 1; + p.dw = rand() % 3 + 1; + + int kh_ext = conv_kh_ext(&p); + int kw_ext = conv_kw_ext(&p); + p.pad_top = rand() % kh_ext; + p.pad_bot = rand() % kh_ext; + p.pad_left = rand() % kw_ext; + p.pad_right = rand() % kw_ext; + + if (!conv_param_is_ok(&p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p.using_bias = rand() % 2; + p.r_shift_m = rand() % 8; + p.bReLU_EN = rand() % 2; + + p.opd0_sign = rand() % 2; + p.opd1_sign = 1; + p.opd2_sign = 1; + + assert(p.opd1_sign == 1 && p.opd2_sign == 1); + + int ih_ext = conv_ih_ext(&p); + int iw_ext = conv_iw_ext(&p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); +} + +static void init_conv_param(conv_param_t &p) { + _init_conv_param(p, INVALIDE_STRIDE, INVALIDE_STRIDE); +} + +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", "p->input_w = ", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} + +// Calculate the right shift value, m +// Steps: +// 1. Get the abs() of each weight; +// 2. Summary all the abs() in one kernel; +// 3. Get Log2 of each sum; +// 4. Downward rounding; +// After every r_shift value got, sort and find the middle one. 
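+// Worked example (added for clarity): for a kernel whose abs() sum is 37,
+// the loop below computes 37 >> 1 = 18, then shifts 18 -> 9 -> 4 -> 2 -> 1 -> 0,
+// giving kernel_shifts = 5, i.e. floor(log2(37)). The median of these
+// per-kernel shift values is what calc_rshift_m() returns as r_shift_m.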
+static int calc_rshift_m(const conv_param_t *p, s8* weight) +{ + int kernel_cnt = p->output_c * p->input_c; + int kernel_size = p->kh * p->kw; + int *kernel_shifts = (int *)malloc(sizeof(int) * kernel_cnt); + + // Tscan does not recognize C++ zero-initialized. + memset(kernel_shifts, 0, sizeof(int) * kernel_cnt); + + // Part 1: + // Get right shift value for each kernel + int sum = 0; + for (int i = 0; i < kernel_cnt; i++) { + // Step 1 & 2: Get the sum of abs() + for (int j = 0; j < kernel_size; j++) { + sum += (int)(*weight < 0 ? -(*weight) : (*weight)); + weight++; + } + // Step 3 & 4: log2 and downward rounding + sum >>= 1; + while (sum) { + sum >>= 1; + kernel_shifts[i]++; + } + } + + // Part 2: + // Find the middle of all the values + int tag[32] = {0}; + for (int cnt = 0; cnt < kernel_cnt; cnt++) { + tag[kernel_shifts[cnt]]++; + } + + int rshift_m = 0; + int mid = 0; + do { + mid += tag[rshift_m++]; + } while(mid < (kernel_cnt - 1) >> 1); + + free(kernel_shifts); + + return rshift_m - 1; +} + +static int test_conv( + conv_param_t &p_param, bmctx_t ctx, bmk_ctx_t *bk_ctx) +{ + s8 *input = alloc_input(&p_param); + s8 *weight = alloc_weight(&p_param); + s16 *bias = alloc_bias(&p_param); + p_param.r_shift_m = calc_rshift_m(&p_param, weight); + + s8 *output_ref = (s8 *)malloc(sizeof(s8) * conv_output_size(&p_param)); + if (!output_ref) + return BM_ERR_FAILURE; + + bmerr_t ret = conv_ref(&p_param, input, weight, bias, output_ref); + assert(ret == BM_SUCCESS); + + bmk1822_tiu_convolution_param_t conv_param; + make_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + if (tl_alloc_success) { + put_tensor_g2l(&ctx, bk_ctx, conv_param.ifmap, (u8 *)input); + put_conv_weight(&ctx, bk_ctx, conv_param.weight, (u8 *)weight); + if (p_param.using_bias) + put_conv_bias(&ctx, bk_ctx, conv_param.bias, bias); + bmk1822_tiu_convolution(bk_ctx, &conv_param); + u8 *output = get_tensor_l2g(&ctx, bk_ctx, conv_param.ofmap); + + int has_error = array_cmp_int8( + "Comparing results ...\n", + output_ref, (s8 *)output, conv_output_size(&p_param)); + + if (has_error) { + print_conv_param(&p_param); + printf("Comparison FAILED\n"); + exit(-1); + } + free(output); + } + + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + free(input); + free(weight); + free(output_ref); + free(bias); + + return tl_alloc_success ? 
1 : 0; +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + int test_finished_num = 0; + for (int i = 0; i < 5; i++) { + printf("random_test_conv iteration: %d\n", i); + conv_param_t test_conv_param; + init_conv_param(test_conv_param); + test_finished_num += test_conv(test_conv_param, ctx, bk_ctx); + if (!test_conv_param.using_bias) + test_conv_param.using_bias = 1; + if (test_conv_param.output_c <= 32) + test_conv_param.output_c += 32; + test_finished_num += test_conv(test_conv_param, ctx, bk_ctx); + } + + // test stride extend (0, 31] + int stride_list[] = {15, 16, 31}; + int stride_list_len = sizeof(stride_list) / sizeof(stride_list[0]); + + for (int stride_w_idx = 0; stride_w_idx < stride_list_len; stride_w_idx++) { + for (int stride_h_idx = 0; stride_h_idx < stride_list_len; stride_h_idx++) { + int stride_w = stride_list[stride_w_idx]; + int stride_h = stride_list[stride_h_idx]; + conv_param_t test_conv_param; + _init_conv_param(test_conv_param, stride_w, stride_h); + + test_finished_num += test_conv(test_conv_param, ctx, bk_ctx); + } + } + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_conv_max_power.cpp b/cviruntime/test/1822/test_1822_conv_max_power.cpp new file mode 100644 index 000000000..01dd31562 --- /dev/null +++ b/cviruntime/test/1822/test_1822_conv_max_power.cpp @@ -0,0 +1,1059 @@ +#include "1822_test_util.h" + +typedef bmk1822_tdma_l2tg_tensor_copy_cw_transposed_param_t l2tg_cw_param_t; +typedef bmk1822_tdma_tg2l_matrix_copy_row_col_transposed_param_t tg2l_matrix_param_t; +typedef bmk1822_tdma_l2l_tensor_copy_param_t l2l_tensor_copy_param_t; + +typedef struct{ + s8 *conv_input; + s8 *conv_weight; + s16 *conv_bias; + u8 *conv_output; + s8 *conv_output_ref; + u8 *l2g_cw_src; + u8 *l2g_cw_output; + u8 *l2g_cw_output_ref; + u8 *g2l_matrix_src; + u8 *g2l_matrix_output; + u8 *g2l_matrix_output_ref; + u8 *l2l_tensor_src; + u8 *l2l_tensor_output; + u8 *l2l_tensor_output_ref; +}s_test_data; + +typedef struct { + int random_seed; + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int stride_w; + int output_c; + int using_bias; + int bReLU_EN; + int r_shift_m; + int opd0_sign; + int opd1_sign; + int opd2_sign; +} conv_param_t; + +conv_param_t conv_param; +l2tg_cw_param_t l2tg_cw_param; +tg2l_matrix_param_t tg2l_matrix_param; +l2l_tensor_copy_param_t l2l_tensor_copy_param; +s_test_data s8_test_data; +bmk1822_tiu_convolution_param_t bmk_conv_param; + +bmk1822_tensor_lmem_t *skip_tensor_lmem[10]; +u32 skip_tensor_num=0; + +/* need to make sure the free order of alloc_tl for skip_tensor_lmem*/ +void skip_tensor_lmem_size(bmk_ctx_t *bmk, const bmk1822_tensor_lmem_t *p) +{ + u32 needed = align_up(p->shape.n * p->stride.n, BM1822_HW_EU_NUM); + u32 start_addr = p->start_address + needed; //src_shape.n*src_shape.c*src_shape.h*src_shape.w/32; + u32 remain_size = start_addr % BM1822_HW_LMEM_BANK_SIZE ? 
(BM1822_HW_LMEM_BANK_SIZE - start_addr % BM1822_HW_LMEM_BANK_SIZE) : 0; // remain size for each lane + if(remain_size) + { +// tl_shape_t src_shape2 = {1, BM1822_HW_EU_NUM, 1, remain_size}; + tl_shape_t src_shape2 = {1, BM1822_HW_NPU_NUM, 1, remain_size}; + skip_tensor_lmem[skip_tensor_num] = alloc_tl(bmk, src_shape2, FMT_I8, 1); // skip the lmem size and next tl can alignment to bank size + } + skip_tensor_num++; +} + +void skip_matrix_lmem_size(bmk_ctx_t *bmk, const bmk1822_matrix_lmem_t *p) +{ + u32 needed = align_up(p->shape.n * p->stride.n, BM1822_HW_EU_NUM); + u32 start_addr = p->start_address + needed; //src_shape.n*src_shape.c*src_shape.h*src_shape.w/32; + u32 remain_size = start_addr % BM1822_HW_LMEM_BANK_SIZE ? (BM1822_HW_LMEM_BANK_SIZE - start_addr % BM1822_HW_LMEM_BANK_SIZE) : 0; // remain size for each lane + if(remain_size) + { + tl_shape_t src_shape2 = {1, BM1822_HW_NPU_NUM, 1, remain_size}; + //tl_shape_t src_shape2 = {1, BM1822_HW_EU_NUM, 1, remain_size}; + skip_tensor_lmem[skip_tensor_num] = alloc_tl(bmk, src_shape2, FMT_I8, 1); // skip the lmem size and next tl can alignment to bank size + } + skip_tensor_num++; +} + +void free_skip_tensor_lmem(bmk_ctx_t *ctx) +{ + if(skip_tensor_lmem[--skip_tensor_num]!=NULL) + free_tl(ctx, skip_tensor_lmem[skip_tensor_num]); +} + +static int index_get(int h, int w1, int w2) +{ + return h * w1 + w2; +} + +static int matrix_dot_mult( + s8 *A, s8 *B, int dim_n, int dim_m, + int opd0_sign) +{ + int sum = 0; + for (int i=0; iinput_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + int r_shift_bits = p_param->r_shift_m; + int do_relu = p_param->bReLU_EN; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + s8 *i_fmap_pad_ker = (s8 *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return BM_ERR_FAILURE; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = BM_SUCCESS; + + s8 *i_fmap_pad = NULL; + s8 *kernel_after = NULL; + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + 
i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c]; //bias+c ; + } + } + } + + if (do_relu) + relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + + // ofmap is s8, signed + ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow, + &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1, + /*sign_unsign=*/1); + + if (ret != BM_SUCCESS) + goto error_release; + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + +error_release: + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static u8 * transform_weight(const tl_shape_t *s, u8 before[]) +{ + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + u32 size = ic * oc * kh * kw; + u8 *after = (u8 *)malloc(sizeof(u8) * size); + + /* + * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) + */ + for (u32 oci = 0; oci < oc; oci++) { + for (u32 ici = 0; ici < ic; ici++) { + for (u32 khi = 0; khi < kh; khi++) { + for (u32 kwi = 0; kwi < kw; kwi++) { + u32 src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + u32 dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + u8 *data) +{ + const tl_shape_t *s = &tl->shape; + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + u8 *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. bm_memcpy_s2d regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. 
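+ * (The region-1 prealloc variant is kept commented out below; the active
+ * path hands dev_mem straight to bm_memcpy_s2d and relies on setting
+ * base_reg_index = 1 on the TDMA source descriptor instead.)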
+ */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //bmmem_device_t ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = bm_memcpy_s2d(*ctx, ab_dev_mem, transformed_data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, transformed_data); + + assert(ret == BM_SUCCESS); + + tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_I8; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1822_tensor_tgmem_default_stride(tdma_tg.shape, tdma_tg.fmt); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1822_tensor_lmem_default_stride(bk_ctx, tdma_shape, FMT_I8, 0); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1822_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + bmmem_device_free(*ctx, dev_mem); + free(transformed_data); +} + +static s8 * transform_bias(int oc, s16 before[]) +{ + s8 *after = (s8 *)malloc(sizeof(s8) * 2 * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = before[i] & 0xff; + after[i + oc] = (before[i] >> 8) & 0xff; + } + return after; +} + +static void put_conv_bias( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + s16 *data) +{ + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2, oc, 1, 1); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + s8 *transformed_data = transform_bias(oc, data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (u8 *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1822_tensor_tgmem_default_stride(tg.shape, tg.fmt); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, dev_mem); + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static s8 * alloc_input(const conv_param_t 
*p) +{ + int size = conv_input_size(p); + + s8 *buf = (s8 *)malloc(sizeof(s8) * size); + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static s8 * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + s8 *buf = (s8 *)malloc(sizeof(s8) * size); + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static s16 * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + s16 *bias = (s16 *)malloc(sizeof(s16) * oc); + for (int i = 0; i < oc; i++) + bias[i] = rand() % 65536 - 32768; + + return bias; +} + +static tl_t * conv_ifmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd0_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return bmk1822_lmem_alloc_tensor(ctx, s, fmt, 1); +} + +static tl_t * conv_weight_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd1_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return bmk1822_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static tl_t * conv_ofmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + tl_shape_t s; + s.n = p->input_n; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return bmk1822_lmem_alloc_tensor(ctx, s, FMT_I8, 1); +} + +static tl_t * conv_bias_tensor( + bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd2_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return bmk1822_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const bmk1822_tiu_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1822_tiu_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + + dst->ifmap = conv_ifmap_tensor(ctx, p); + skip_tensor_lmem_size(ctx, dst->ifmap); + dst->weight = conv_weight_tensor(ctx, p); + skip_tensor_lmem_size(ctx, dst->weight); + dst->ofmap = conv_ofmap_tensor(ctx, p); + skip_tensor_lmem_size(ctx, dst->ofmap); + dst->bias = NULL; + dst->ps32_mode = 0; + if (p->using_bias) + { + dst->bias = conv_bias_tensor(ctx, p); + skip_tensor_lmem_size(ctx, dst->bias); + } + dst->w_is_const = 0; +} + +static void free_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1822_tiu_convolution_param_t *r, + const conv_param_t *p) +{ + if (p->using_bias && r->bias) + { + 
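+    /* Free in reverse of the allocation order in make_bmk_conv_param
+     * (ifmap, weight, ofmap, bias): free_skip_tensor_lmem pops the
+     * bank-padding skip tensors LIFO, as the comment at skip_tensor_lmem's
+     * definition warns. */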
free_skip_tensor_lmem(ctx);
+    free_tl(ctx, r->bias);
+  }
+  if (r->ofmap)
+  {
+    free_skip_tensor_lmem(ctx);
+    free_tl(ctx, r->ofmap);
+  }
+  if (r->weight)
+  {
+    free_skip_tensor_lmem(ctx);
+    free_tl(ctx, r->weight);
+  }
+  if (r->ifmap)
+  {
+    free_skip_tensor_lmem(ctx);
+    free_tl(ctx, r->ifmap);
+  }
+}
+
+static void init_conv_param(conv_param_t &p)
+{
+retry:
+  p.input_n = 1;
+  p.input_c = 16;
+  p.input_h = 2;
+  p.input_w = 600;
+
+  p.kh = 2;
+  p.kw = 16;
+  p.output_c = 16;
+
+  p.stride_h = 1;
+  p.stride_w = 15;
+  p.ins_h = 0;
+  p.ins_w = 0;
+  p.ins_h_last = 0;
+  p.ins_w_last = 0;
+  p.dh = 1;
+  p.dw = 1;
+
+  int kh_ext = conv_kh_ext(&p);
+  int kw_ext = conv_kw_ext(&p);
+  p.pad_top = 1;
+  p.pad_bot = 0;
+  p.pad_left = 0;
+  p.pad_right = 0;
+
+  if (!conv_param_is_ok(&p)) {
+    printf("retry init_conv_param\n");
+    goto retry;
+  }
+
+  p.using_bias = 0;
+  p.r_shift_m = 7;
+  p.bReLU_EN = 1;
+
+  p.opd0_sign = 0;
+  p.opd1_sign = 1;
+  p.opd2_sign = 1;
+
+  assert(p.opd1_sign == 1 && p.opd2_sign == 1);
+
+  int ih_ext = conv_ih_ext(&p);
+  int iw_ext = conv_iw_ext(&p);
+  assert(ih_ext >= kh_ext);
+  assert(iw_ext >= kw_ext);
+}
+
+static void print_conv_param(const conv_param_t *p)
+{
+  printf("%s\n", "Conv parameters:");
+  printf("  %s%d;\n", "p->random_seed = ", p->random_seed);
+
+  printf("  %s%d;\n", "p->input_n = ", p->input_n);
+  printf("  %s%d;\n", "p->input_c = ", p->input_c);
+  printf("  %s%d;\n", "p->input_h = ", p->input_h);
+  printf("  %s%d;\n", "p->input_w = ", p->input_w);
+  printf("  %s%d;\n", "p->output_c = ", p->output_c);
+
+  printf("  %s%d;\n", "p->kh = ", p->kh);
+  printf("  %s%d;\n", "p->kw = ", p->kw);
+  printf("  %s%d;\n", "p->dh = ", p->dh);
+  printf("  %s%d;\n", "p->dw = ", p->dw);
+  printf("  %s%d;\n", "p->pad_top = ", p->pad_top);
+  printf("  %s%d;\n", "p->pad_bot = ", p->pad_bot);
+  printf("  %s%d;\n", "p->pad_left = ", p->pad_left);
+  printf("  %s%d;\n", "p->pad_right = ", p->pad_right);
+  printf("  %s%d;\n", "p->stride_h = ", p->stride_h);
+  printf("  %s%d;\n", "p->stride_w = ", p->stride_w);
+  printf("  %s%d;\n", "p->ins_w = ", p->ins_w);
+  printf("  %s%d;\n", "p->ins_h = ", p->ins_h);
+  printf("  %s%d;\n", "p->ins_w_last = ", p->ins_w_last);
+  printf("  %s%d;\n", "p->ins_h_last = ", p->ins_h_last);
+
+  printf("  %s%d;\n", "p->r_shift_m = ", p->r_shift_m);
+  printf("  %s%d;\n", "p->opd0_sign = ", p->opd0_sign);
+  printf("  %s%d;\n", "p->opd1_sign = ", p->opd1_sign);
+  printf("  %s%d;\n", "p->opd2_sign = ", p->opd2_sign);
+  printf("  %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN);
+  printf("  %s%d;\n", "p->using_bias = ", p->using_bias);
+
+  printf("  %s%d\n", "kh_ext = ", conv_kh_ext(p));
+  printf("  %s%d\n", "kw_ext = ", conv_kw_ext(p));
+  printf("  %s%d\n", "ih_ext = ", conv_ih_ext(p));
+  printf("  %s%d\n", "iw_ext = ", conv_iw_ext(p));
+  printf("  %s%d\n", "output_h = ", conv_oh(p));
+  printf("  %s%d\n", "output_w = ", conv_ow(p));
+}
+
+// Calculate the right shift value, m
+// Steps:
+// 1. Get the abs() of each weight;
+// 2. Sum all the abs() values in one kernel;
+// 3. Take log2 of each sum;
+// 4. Round downward;
+// After all r_shift values are obtained, sort them and take the middle one.
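+// Worked example (hypothetical values, not from this test): a kernel whose
+// |weight| values sum to 100 gets kernel_shifts[i] = floor(log2(100)) = 6,
+// since sum >>= 1 leaves 50 and the while loop then shifts six times
+// (50, 25, 12, 6, 3, 1); the same loop also drains sum back to 0 for the
+// next kernel. Part 2 is a counting sort over tag[]: with per-kernel shifts
+// {4, 5, 6, 6, 7} and kernel_cnt = 5, the cumulative count reaches
+// (5 - 1) >> 1 = 2 inside bucket 5, so calc_rshift_m returns 5, i.e.
+// approximately the median shift.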
+static int calc_rshift_m(const conv_param_t *p, s8* weight) +{ + int kernel_cnt = p->output_c * p->input_c; + int kernel_size = p->kh * p->kw; + int *kernel_shifts = (int *)malloc(sizeof(int) * kernel_cnt); + + memset(kernel_shifts, 0, sizeof(int) * kernel_cnt); + + // Part 1: + // Get right shift value for each kernel + int sum = 0; + for (int i = 0; i < kernel_cnt; i++) { + // Step 1 & 2: Get the sum of abs() + for (int j = 0; j < kernel_size; j++) { + sum += (int)(*weight < 0 ? -(*weight) : (*weight)); + weight++; + } + // Step 3 & 4: log2 and downward rounding + sum >>= 1; + while (sum) { + sum >>= 1; + kernel_shifts[i]++; + } + } + + // Part 2: + // Find the middle of all the values + int tag[32] = {0}; + for (int cnt = 0; cnt < kernel_cnt; cnt++) { + tag[kernel_shifts[cnt]]++; + } + + int rshift_m = 0; + int mid = 0; + do { + mid += tag[rshift_m++]; + } while(mid < (kernel_cnt - 1) >> 1); + + free(kernel_shifts); + + return rshift_m - 1; +} + + +static void l2tg_tensor_copy_cw_transposed_ref( + l2tg_cw_param_t *p, u8 ref_data[], u8 src_data[]) +{ + tl_shape_t s = p->src->shape; + u32 n = s.n; + u32 c = s.c; + u32 h = s.h; + u32 w = s.w; + + for (u32 ni = 0; ni < n; ni++) { + for (u32 ci = 0; ci < c; ci++) { + for (u32 hi = 0; hi < h; hi++) { + for (u32 wi = 0; wi < w; wi++) { + u32 src_i = ni * c * h * w + ci * h * w + hi * w + wi; + u32 dst_i = ni * c * h * w + wi * h * c + hi * c + ci; + ref_data[dst_i] = src_data[src_i]; + } + } + } + } +} + +static void test_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, l2tg_cw_param_t *p) +{ + u64 size = tl_shape_size(&p->src->shape); + + s8_test_data.l2g_cw_src = (u8 *)malloc(sizeof(u8) * size); + if (!s8_test_data.l2g_cw_src) + return; + + for (u64 i = 0; i < size; i++) + s8_test_data.l2g_cw_src[i] = rand()%0x100; + + s8_test_data.l2g_cw_output_ref = (u8 *)malloc(sizeof(u8) * size); + if (!s8_test_data.l2g_cw_output_ref) + return; + + l2tg_tensor_copy_cw_transposed_ref(p, s8_test_data.l2g_cw_output_ref, s8_test_data.l2g_cw_src); + + put_tensor_g2l(ctx, bmk, p->src, s8_test_data.l2g_cw_src); +} + +static void destroy_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, l2tg_cw_param_t *p) +{ + free_tg_gmem(ctx, p->dst); + free_skip_tensor_lmem(bmk); + free_tl(bmk, p->src); +} + +static void test_l2tg_cw_transpose(bmctx_t *ctx, bmk_ctx_t *bmk, l2tg_cw_param_t *p) +{ + tl_shape_t src_shape = {1, 0x100, 1, 0x020}; + tg_shape_t dst_shape = {1, 0x020, 1, 0x100}; + +// tl_shape_t src_shape = {1, 0x100, 1, 0x080}; +// tg_shape_t dst_shape = {1, 0x080, 1, 0x100}; + + p->src = alloc_tl(bmk, src_shape, FMT_I8, 1); + p->dst = alloc_tg_gmem(ctx, dst_shape, FMT_I8); + skip_tensor_lmem_size(bmk, p->src); + test_param_l2g(ctx, bmk, p); +} + +static void tg2l_matrix_copy_row_col_transposed_ref( + tg2l_matrix_param_t *p, u8 ref_data[], u8 src_data[]) +{ + u64 row = p->src->shape.row; + u64 col = p->src->shape.col; + + for (u64 ri = 0; ri < row; ri++) { + for (u64 ci = 0; ci < col; ci++) { + u64 src_i = ri * col + ci; + u64 dst_i = ci * row + ri; + ref_data[dst_i] = src_data[src_i]; + } + } +} + +static void test_param_g2l(bmctx_t *ctx, tg2l_matrix_param_t *p) +{ + u64 size = ml_shape_size(&p->dst->shape); + + s8_test_data.g2l_matrix_src = (u8 *)malloc(sizeof(u8) * size); + if (!s8_test_data.g2l_matrix_src) + return; + + for (u64 i = 0; i < size; i++) + s8_test_data.g2l_matrix_src[i] = rand()%0x100; + + s8_test_data.g2l_matrix_output_ref = (u8 *)malloc(sizeof(u8) * size); + if (!s8_test_data.g2l_matrix_output_ref) + return; + + tg2l_matrix_copy_row_col_transposed_ref(p, 
s8_test_data.g2l_matrix_output_ref, s8_test_data.g2l_matrix_src); + + put_mg_gmem(ctx, p->src, s8_test_data.g2l_matrix_src); +} + +static void destroy_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, tg2l_matrix_param_t *p) +{ + free_mg_gmem(ctx, p->src); + free_skip_tensor_lmem(bmk); + free_ml(bmk, p->dst); +} + + +static void test_tg2l_matrix_transpose(bmctx_t *ctx, bmk_ctx_t *bmk, tg2l_matrix_param_t *p) +{ + //tg2l_matrix_param_t p; + /* + * Matrix transpose must be n/c stride alignment + * for TDMA limitation + */ + mg_shape_t src_shape={0x100, 0x20}; + ml_shape_t dst_shape={0x20, 0x10, 0x10, 0x100}; + +// mg_shape_t src_shape={0x100, 0x80}; +// ml_shape_t dst_shape={0x80, 0x10, 0x10, 0x100}; + + int dst_align = 1; + + p->src = alloc_mg_gmem(ctx, src_shape); + p->dst = alloc_ml(bmk, dst_shape, dst_align); + skip_matrix_lmem_size(bmk, p->dst); + test_param_g2l(ctx, p); +} + +static void l2l_tensor_copy_ref(l2l_tensor_copy_param_t *p, u8 ref_data[], u8 src_data[]) +{ + u64 size = tl_shape_size(&p->src->shape); + + for (u64 i = 0; i < size; i++) + ref_data[i] = src_data[i]; +} + +static void test_l2l_param(bmctx_t *ctx, bmk_ctx_t *bmk, l2l_tensor_copy_param_t *p) +{ + u64 size = tl_shape_size(&p->src->shape); + + s8_test_data.l2l_tensor_src = (u8 *)malloc(sizeof(u8) * size); + if (!s8_test_data.l2l_tensor_src) + return; + + for (u64 i = 0; i < size; i++) + s8_test_data.l2l_tensor_src[i] = rand()%0x100; + + s8_test_data.l2l_tensor_output_ref = (u8 *)malloc(sizeof(u8) * size); + if (!s8_test_data.l2l_tensor_output_ref) + return; + + l2l_tensor_copy_ref(p, s8_test_data.l2l_tensor_output_ref, s8_test_data.l2l_tensor_src); + + put_tensor_g2l(ctx, bmk, p->src, s8_test_data.l2l_tensor_src); +} + +static void destroy_param_l2l(bmk_ctx_t *bmk, l2l_tensor_copy_param_t *p) +{ + free_skip_tensor_lmem(bmk); + free_tl(bmk, p->dst); + free_skip_tensor_lmem(bmk); + free_tl(bmk, p->src); +} + +static void test_l2l_tensor_copy(bmctx_t *ctx, bmk_ctx_t *bmk, l2l_tensor_copy_param_t *p) +{ + tl_shape_t src_shape = {1, 0x10, 0x1, 0x100}; + tl_shape_t dst_shape = {1, 0x10, 0x1, 0x100}; + +// tl_shape_t src_shape = {1, 0x10, 0x1, 0x400}; +// tl_shape_t dst_shape = {1, 0x10, 0x1, 0x400}; + + p->src = alloc_tl(bmk, src_shape, FMT_I8, 1); + skip_tensor_lmem_size(bmk, p->src); + p->dst = alloc_tl(bmk, dst_shape, FMT_I8, 1); + skip_tensor_lmem_size(bmk, p->dst); + test_l2l_param(ctx, bmk, p); +} + +static int setup_conv( + conv_param_t &p_param, bmctx_t ctx, bmk_ctx_t *bk_ctx) +{ + s8_test_data.conv_input = alloc_input(&p_param); + s8_test_data.conv_weight = alloc_weight(&p_param); + s8_test_data.conv_bias = alloc_bias(&p_param); + p_param.r_shift_m = calc_rshift_m(&p_param, s8_test_data.conv_weight); + s8_test_data.conv_output_ref = (s8 *)malloc(sizeof(s8) * conv_output_size(&p_param)); + if (!s8_test_data.conv_output_ref) + return BM_ERR_FAILURE; + + bmerr_t ret = conv_ref(&p_param, s8_test_data.conv_input, s8_test_data.conv_weight, s8_test_data.conv_bias, s8_test_data.conv_output_ref); + assert(ret == BM_SUCCESS); + make_bmk_conv_param(bk_ctx, &bmk_conv_param, &p_param); + + bmk_conv_param_alloc_ok(&bmk_conv_param, &p_param); + + put_tensor_g2l(&ctx, bk_ctx, bmk_conv_param.ifmap, (u8 *)s8_test_data.conv_input); + put_conv_weight(&ctx, bk_ctx, bmk_conv_param.weight, (u8 *)s8_test_data.conv_weight); + if (p_param.using_bias) + put_conv_bias(&ctx, bk_ctx, bmk_conv_param.bias, s8_test_data.conv_bias); + + return 1; +} + +void get_result(bmctx_t *ctx, bmk_ctx_t *bmk) +{ + s8_test_data.conv_output = get_tensor_l2g(ctx, bmk, 
bmk_conv_param.ofmap); + s8_test_data.l2g_cw_output = get_tg_gmem(ctx, l2tg_cw_param.dst); + s8_test_data.g2l_matrix_output = get_matrix_l2g(ctx, bmk, tg2l_matrix_param.dst); + s8_test_data.l2l_tensor_output = get_tensor_l2g(ctx, bmk, l2l_tensor_copy_param.dst); +} + +void check_result() +{ + int has_error = array_cmp_int8( + "conv Comparing results ...\n", + s8_test_data.conv_output_ref, (s8 *)s8_test_data.conv_output, conv_output_size(&conv_param)); + + if (has_error) { + print_conv_param(&conv_param); + printf("Comparison FAILED\n"); + exit(-1); + } + + for (u64 i = 0; i < tl_shape_size(&l2tg_cw_param.src->shape); i++) { + if (s8_test_data.l2g_cw_output[i] != s8_test_data.l2g_cw_output_ref[i]) { + fprintf(stderr, "l2g_cw comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, s8_test_data.l2g_cw_output[i], s8_test_data.l2g_cw_output_ref[i]); + exit(-1); + } + } + for (u64 i = 0; i < ml_shape_size(&tg2l_matrix_param.dst->shape); i++) { + if (s8_test_data.g2l_matrix_output[i] != s8_test_data.g2l_matrix_output_ref[i]) { + fprintf(stderr, "g2l_matrix comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, s8_test_data.g2l_matrix_output[i], s8_test_data.g2l_matrix_output_ref[i]); + exit(-1); + } + } + + for (u64 i = 0; i < tl_shape_size(&l2l_tensor_copy_param.src->shape); i++) { + if (s8_test_data.l2l_tensor_output[i] != s8_test_data.l2l_tensor_output_ref[i]) { + fprintf(stderr, "l2l_tensor comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, s8_test_data.l2l_tensor_output[i], s8_test_data.l2l_tensor_output_ref[i]); + exit(-1); + } + } + + +} + +void trigger_max_power(bmctx_t *ctx, bmk_ctx_t *bmk) +{ + bmk1822_parallel_enable(bmk); + bmk1822_tdma_l2g_tensor_copy_cw_transposed(bmk, &l2tg_cw_param); + bmk1822_tdma_g2l_matrix_copy_row_col_transposed(bmk, &tg2l_matrix_param); + bmk1822_tdma_l2l_tensor_copy(bmk, &l2l_tensor_copy_param); + bmk1822_tiu_convolution(bmk, &bmk_conv_param); + bmk1822_parallel_disable(bmk); + bmk1822_parallel_enable(bmk); + bmk1822_tdma_l2g_tensor_copy_cw_transposed(bmk, &l2tg_cw_param); + bmk1822_tdma_g2l_matrix_copy_row_col_transposed(bmk, &tg2l_matrix_param); + bmk1822_tdma_l2l_tensor_copy(bmk, &l2l_tensor_copy_param); + bmk1822_tiu_convolution(bmk, &bmk_conv_param); + bmk1822_parallel_disable(bmk); + test_submit(ctx); +} + +void free_s8_data() +{ + free(s8_test_data.conv_input); + free(s8_test_data.conv_weight); + free(s8_test_data.conv_bias); + free(s8_test_data.conv_output); + free(s8_test_data.conv_output_ref); + free(s8_test_data.l2g_cw_src); + free(s8_test_data.l2g_cw_output); + free(s8_test_data.l2g_cw_output_ref); + free(s8_test_data.g2l_matrix_src); + free(s8_test_data.g2l_matrix_output); + free(s8_test_data.g2l_matrix_output_ref); + free(s8_test_data.l2l_tensor_src); + free(s8_test_data.l2l_tensor_output); + free(s8_test_data.l2l_tensor_output_ref); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + printf("conv max_power test\n"); + init_conv_param(conv_param); + setup_conv(conv_param, ctx, bk_ctx); + + test_l2tg_cw_transpose(&ctx, bk_ctx, &l2tg_cw_param); + test_tg2l_matrix_transpose(&ctx, bk_ctx, &tg2l_matrix_param); + test_l2l_tensor_copy(&ctx, bk_ctx, &l2l_tensor_copy_param); + + trigger_max_power(&ctx, bk_ctx); + get_result(&ctx, bk_ctx); + check_result(); + + destroy_param_l2l(bk_ctx,&l2l_tensor_copy_param); + destroy_param_g2l(&ctx, bk_ctx, &tg2l_matrix_param); + destroy_param_l2g(&ctx, bk_ctx, &l2tg_cw_param); + free_bmk_conv_param(bk_ctx, &bmk_conv_param, &conv_param); + free_s8_data(); + 
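+  // Note the destroy_* calls above ran in reverse of the setup order
+  // (l2l, g2l matrix, l2g, then conv): each one pops its bank-padding skip
+  // tensor, and local-memory frees must be LIFO for free_skip_tensor_lmem
+  // to hand back the right allocation.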
test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_conv_ps32.cpp b/cviruntime/test/1822/test_1822_conv_ps32.cpp new file mode 100644 index 000000000..e2013f87f --- /dev/null +++ b/cviruntime/test/1822/test_1822_conv_ps32.cpp @@ -0,0 +1,1517 @@ +#include "1822_test_util.h" + +typedef struct { + int random_seed; + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int stride_w; + int output_c; + int using_bias; + int bReLU_EN; + int r_shift_m; + int opd0_sign; + int opd1_sign; + int opd2_sign; +} conv_param_t; + +static int index_get(int h, int w1, int w2) +{ + return h * w1 + w2; +} + +static int matrix_dot_mult( + s8 *A, s8 *B, int dim_n, int dim_m, + int opd0_sign) +{ + int sum = 0; + for (int i=0; iinput_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + s8 *i_fmap_pad_ker = (s8 *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return BM_ERR_FAILURE; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = BM_SUCCESS; + + s8 *i_fmap_pad = NULL; + s8 *kernel_after = NULL; + u32 bstride = in * oc * oh * ow; + + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[i] = result[i]; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[bstride + i] = result[i] >> 8; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[2 * bstride + i] = result[i] >> 16; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[3 * bstride + i] = 
result[i] >> 24; + + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static int ps32_m1_conv_ref( + const conv_param_t *p_param, + const s8 *ifmap, + const s8 *weight, + const s16 *bias, + s8 *ofmap) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + int r_shift_bits = p_param->r_shift_m; + int do_relu = p_param->bReLU_EN; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + s8 *i_fmap_pad_ker = (s8 *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return BM_ERR_FAILURE; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = BM_SUCCESS; + + s8 *i_fmap_pad = NULL; + s8 *kernel_after = NULL; + + u32 bstride = in * oc * oh * ow; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] = (u8)ofmap[i]; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] |= (u8)ofmap[bstride + i] << 8; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] |= (u8)ofmap[2 * bstride + i] << 16; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] |= (u8)ofmap[3 * bstride + i] << 24; + + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c]; //bias+c ; + } + } + } + + if (do_relu) + relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + + // ofmap is s8, signed + ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow, + &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1, + /*sign_unsign=*/1); + + if (ret != BM_SUCCESS) + goto error_release; + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + +error_release: + free(i_fmap_pad); + free(kernel_after); + 
free(i_fmap_pad_ker); + free(result); + + return ret; +} + + +static int ps32_m3_conv_ref( + const conv_param_t *p_param, + const s8 *ifmap, + const s8 *weight, + s8 *ofmap) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + s8 *i_fmap_pad_ker = (s8 *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return BM_ERR_FAILURE; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = BM_SUCCESS; + + s8 *i_fmap_pad = NULL; + s8 *kernel_after = NULL; + + u32 bstride = in * oc * oh * ow; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] = (u8)ofmap[i]; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] |= (u8)ofmap[bstride + i] << 8; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] |= (u8)ofmap[2 * bstride + i] << 16; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] |= (u8)ofmap[3 * bstride + i] << 24; + + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[i] = result[i]; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[bstride + i] = result[i] >> 8; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[2 * bstride + i] = result[i] >> 16; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[3 * bstride + i] = result[i] >> 24; + + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static int conv_ref( + const conv_param_t *p_param, + const s8 *ifmap, + const s8 *weight, + const s16 *bias, + s8 *ofmap) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; 
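+  // Unlike the ps32_m* references above, which keep the 32-bit accumulator
+  // split across four byte planes spaced bstride apart, conv_ref is the
+  // plain single-pass reference: one accumulation pass, then bias, relu,
+  // right shift and s8 saturation.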
+ int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + int r_shift_bits = p_param->r_shift_m; + int do_relu = p_param->bReLU_EN; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + s8 *i_fmap_pad_ker = (s8 *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return BM_ERR_FAILURE; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = BM_SUCCESS; + + s8 *i_fmap_pad = NULL; + s8 *kernel_after = NULL; + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c]; //bias+c ; + } + } + } + + if (do_relu) + relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + + // ofmap is s8, signed + ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow, + &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1, + /*sign_unsign=*/1); + + if (ret != BM_SUCCESS) + goto error_release; + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + + neuron_dump ( + "test_code:conv_ref:pure result + bias", + (u32)in, + (u32)oc, + (u32)oh, + (u32)ow, + (s32 *)result); + + +error_release: + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static u8 * transform_weight(const tl_shape_t *s, u8 before[]) +{ + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + u32 size = ic * oc * kh * kw; + u8 *after = (u8 *)malloc(sizeof(u8) * size); + + /* + * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) + */ + for (u32 oci = 0; oci < oc; oci++) { + for (u32 ici = 0; ici < ic; ici++) { + for (u32 khi = 0; khi < kh; khi++) { + for (u32 kwi = 0; kwi < kw; kwi++) { + u32 src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + u32 dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + 
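+          // Hypothetical check of the index math: with oc = 2, ic = 3,
+          // kh = kw = 1, element (oci = 1, ici = 2) maps src_i = 5 to
+          // dst_i = 5; once kh*kw > 1, ic becomes the innermost
+          // (stride-1) axis of the destination layout.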
} + } + } + } + + return after; +} + +static void put_conv_weight( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + u8 *data) +{ + const tl_shape_t *s = &tl->shape; + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + u8 *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. bm_memcpy_s2d regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. + */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //bmmem_device_t ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = bm_memcpy_s2d(*ctx, ab_dev_mem, transformed_data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, transformed_data); + assert(ret == BM_SUCCESS); + + tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_I8; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1822_tensor_tgmem_default_stride(tdma_tg.shape, tdma_tg.fmt); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1822_tensor_lmem_default_stride(bk_ctx, tdma_shape, FMT_I8, 0); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1822_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + bmmem_device_free(*ctx, dev_mem); + free(transformed_data); +} + +static s8 * transform_bias(int oc, s16 before[]) +{ + s8 *after = (s8 *)malloc(sizeof(s8) * 2 * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = before[i] & 0xff; + after[i + oc] = (before[i] >> 8) & 0xff; + } + return after; +} + +static void put_conv_bias( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + s16 *data) +{ + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2, oc, 1, 1); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + s8 *transformed_data = transform_bias(oc, data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (u8 *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1822_tensor_tgmem_default_stride(tg.shape, tg.fmt); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, dev_mem); + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ 
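+  // Standard sliding-window count: ow = (iw_ext - kw_ext) / stride_w + 1.
+  // For example (hypothetical sizes), iw_ext = 10, kw_ext = 3, stride_w = 2
+  // gives ow = (10 - 3) / 2 + 1 = 4.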
+ return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static s8 * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + + s8 *buf = (s8 *)malloc(sizeof(s8) * size); + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static s8 * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + s8 *buf = (s8 *)malloc(sizeof(s8) * size); + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static s16 * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + s16 *bias = (s16 *)malloc(sizeof(s16) * oc); + for (int i = 0; i < oc; i++) + bias[i] = rand() % 65536 - 32768; + + return bias; +} + +static tl_t * conv_ifmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd0_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return bmk1822_lmem_alloc_tensor(ctx, s, fmt, 1); +} + +static uint32_t conv_ifmap_tensor_size(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd0_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return bmk1822_lmem_tensor_to_size(ctx, s, fmt, 1); +} + +static tl_t * conv_weight_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd1_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return bmk1822_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static uint32_t conv_weight_tensor_size(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd1_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return bmk1822_lmem_tensor_to_size(ctx, s, fmt, 0); +} + +static tl_t * conv_ofmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + tl_shape_t s; + s.n = p->input_n; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return bmk1822_lmem_alloc_ps32_tensor(ctx, s, FMT_I8, 1); +} + +static uint32_t conv_ofmap_tensor_size(bmk_ctx_t *ctx, const conv_param_t *p) +{ + tl_shape_t s; + s.n = p->input_n * sizeof(u32) / sizeof(u8); + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return bmk1822_lmem_tensor_to_size(ctx, s, FMT_I8, 1); +} + +static tl_t * conv_bias_tensor( + bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd2_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return bmk1822_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static uint32_t conv_bias_tensor_size( + bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd2_sign? 
FMT_I8: FMT_U8; + tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return bmk1822_lmem_tensor_to_size(ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const bmk1822_tiu_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + if(p->ps32_mode==1) + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param_ps32( + bmk_ctx_t *ctx, + bmk1822_tiu_convolution_param_t *dst, + const conv_param_t *p, u32 ps32_mode) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + + if(ps32_mode==2) + { + u32 ifmap_size = conv_ifmap_tensor_size(ctx, p); + u32 weight_size = conv_weight_tensor_size(ctx, p); + u32 ofmap_size = conv_ofmap_tensor_size(ctx, p); + u32 bias_size = p->using_bias ? conv_bias_tensor_size(ctx, p) : 0; + u32 total_size = ifmap_size + weight_size + ofmap_size + bias_size; + + // Allocation if size fit. + // Assertion check in bmk1822_lmem_alloc_ps32_tensor(). 
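+      // The ps32 ofmap needs 4x the s8 footprint (conv_ofmap_tensor_size
+      // scales shape.n by sizeof(u32)), so the check below sums all four
+      // tensors against chip_info.lmem_size before committing to allocate.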
+ bmk1822_chip_info_t chip_info = bmk1822_chip_info(); + if (total_size <= chip_info.lmem_size) { + dst->ifmap = conv_ifmap_tensor(ctx, p); + dst->weight = conv_weight_tensor(ctx, p); + dst->ofmap = conv_ofmap_tensor(ctx, p); + } else { + dst->ifmap = nullptr; + dst->weight = nullptr; + dst->ofmap = nullptr; + } + } + + dst->ps32_mode = ps32_mode; + dst->bias = NULL; + dst->relu_enable = 0; + dst->rshift_bits = 0; + if(ps32_mode==1) + { + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + // only mode=1 can use bias + if (p->using_bias) + dst->bias = conv_bias_tensor(ctx, p); + } + + return; +} + +static void make_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1822_tiu_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + dst->ifmap = conv_ifmap_tensor(ctx, p); + dst->weight = conv_weight_tensor(ctx, p); + dst->ofmap = conv_ofmap_tensor(ctx, p); + dst->bias = NULL; + dst->ps32_mode = 0; + if (p->using_bias) + dst->bias = conv_bias_tensor(ctx, p); +} + +static void free_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1822_tiu_convolution_param_t *r, + const conv_param_t *p) +{ + if (p->using_bias && r->bias) { + free_tl(ctx, r->bias); + r->bias = nullptr; + } + + if (r->ofmap) { + free_tl(ctx, r->ofmap); + r->ofmap = nullptr; + } + + if (r->weight) { + free_tl(ctx, r->weight); + r->weight = nullptr; + } + + if (r->ifmap) { + free_tl(ctx, r->ifmap); + r->ifmap = nullptr; + } + +} + +static void init_conv_param(conv_param_t &p) +{ + printf("init_conv_param\n"); + + memset(&p, 0, sizeof(p)); + + p.random_seed = clock(); + srand(p.random_seed); + +retry: + p.input_n = 1; + p.input_c = rand() % (10) + 2; + p.kh = rand() % 7 + 1; + p.kw = rand() % 7 + 1; + p.input_h = rand() % 10 + p.kh; + p.input_w = rand() % 10 + p.kw; + p.output_c = rand() % 10 + 3; + p.stride_h = rand() % (p.kh) + 1; + p.stride_w = rand() % (p.kw) + 1; + p.ins_h = rand() % p.kh; + p.ins_w = rand() % p.kw; + p.ins_h_last = rand() % p.kh;; + p.ins_w_last = rand() % p.kw;; + p.dh = rand() % 3 + 1; + p.dw = rand() % 3 + 1; + + int kh_ext = conv_kh_ext(&p); + int kw_ext = conv_kw_ext(&p); + p.pad_top = rand() % kh_ext; + p.pad_bot = rand() % kh_ext; + p.pad_left = rand() % kw_ext; + p.pad_right = rand() % kw_ext; + + if (!conv_param_is_ok(&p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p.using_bias = rand() % 2; + p.r_shift_m = rand() % 8; + p.bReLU_EN = rand() % 2; + + p.opd0_sign = rand() % 2; + p.opd1_sign = 1; + p.opd2_sign = 1; + + assert(p.opd1_sign == 1 && p.opd2_sign == 1); + + int ih_ext = conv_ih_ext(&p); + int iw_ext = conv_iw_ext(&p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); +} + +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", "p->input_w = ", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + 
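+  // Each field is printed as a "p->field = value;" statement, presumably so
+  // a failing random configuration can be pasted straight back into
+  // init_conv_param to reproduce it.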
printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} + +/* Calculate the right shift value, m + * Steps: + * 1. Get the abs() of each weight; + * 2. Summary all the abs() in one kernel; + * 3. Get Log2 of each sum; + * 4. Downward rounding; + * After every r_shift value got, sort and find the middle one. + */ + +static int calc_rshift_m(const conv_param_t *p, s8* weight) +{ + int kernel_cnt = p->output_c * p->input_c; + int kernel_size = p->kh * p->kw; + int *kernel_shifts = (int *)malloc(sizeof(int) * kernel_cnt); + + // Tscan does not recognize C++ zero-initialized. + memset(kernel_shifts, 0, sizeof(int) * kernel_cnt); + + // Part 1: + // Get right shift value for each kernel + int sum = 0; + for (int i = 0; i < kernel_cnt; i++) { + // Step 1 & 2: Get the sum of abs() + for (int j = 0; j < kernel_size; j++) { + sum += (int)(*weight < 0 ? 
-(*weight) : (*weight)); + weight++; + } + // Step 3 & 4: log2 and downward rounding + sum >>= 1; + while (sum) { + sum >>= 1; + kernel_shifts[i]++; + } + } + + // Part 2: + // Find the middle of all the values + int tag[32] = {0}; + for (int cnt = 0; cnt < kernel_cnt; cnt++) { + tag[kernel_shifts[cnt]]++; + } + + int rshift_m = 0; + int mid = 0; + do { + mid += tag[rshift_m++]; + } while(mid < (kernel_cnt - 1) >> 1); + + free(kernel_shifts); + + return rshift_m - 1; +} + +static int test_ps32_ut( + conv_param_t &p_param, bmctx_t ctx, bmk_ctx_t *bk_ctx) +{ + printf(" test_ps32_ut\n"); + s8 *input = alloc_input(&p_param); + s8 *weight = alloc_weight(&p_param); + s16 *bias = alloc_bias(&p_param); + p_param.r_shift_m = calc_rshift_m(&p_param, weight); + s8 *output_ref = (s8 *)malloc(sizeof(s8) * conv_output_size(&p_param) * sizeof(int)); + if (!output_ref) + return BM_ERR_FAILURE; + + bmerr_t ret = ps32_m2_conv_ref(&p_param, input, weight, output_ref); + assert(ret == BM_SUCCESS); + + bmk1822_tiu_convolution_param_t conv_param; + make_bmk_conv_param_ps32(bk_ctx, &conv_param, &p_param, 2); + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + + if (tl_alloc_success) { + + put_tensor_g2l(&ctx, bk_ctx, conv_param.ifmap, (u8 *)input); + put_conv_weight(&ctx, bk_ctx, conv_param.weight, (u8 *)weight); + bmk1822_tiu_convolution(bk_ctx, &conv_param); + + bmk1822_tensor_lmem_t ps32_ofmap; + ps32_ofmap = *conv_param.ofmap; + ps32_ofmap.shape.n = ps32_ofmap.shape.n * sizeof(int); + u8 *output = get_tensor_l2g(&ctx, bk_ctx, &ps32_ofmap); + + int has_error = array_cmp_int8( + " Comparing begin_mode results ...\n", + output_ref, (s8 *)output, conv_output_size(&p_param) * sizeof(int)); + + if (has_error) { + print_conv_param(&p_param); + printf(" Comparison FAILED\n"); + exit(-1); + } + free(output); + } + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + printf(" test_ps32_intermediate_mode\n"); + for (int i=0; i < conv_input_size(&p_param); i++) + input[i] = rand() % 256 - 128; + + for (int i=0; i < conv_weight_size(&p_param); i++) + weight[i] = rand() % 256 - 128; + + ret = ps32_m3_conv_ref(&p_param, input, weight, output_ref); + assert(ret == BM_SUCCESS); + + make_bmk_conv_param_ps32(bk_ctx, &conv_param, &p_param, 3); + tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + + if (tl_alloc_success) { + put_tensor_g2l(&ctx, bk_ctx, conv_param.ifmap, (u8 *)input); + put_conv_weight(&ctx, bk_ctx, conv_param.weight, (u8 *)weight); + + bmk1822_tiu_convolution(bk_ctx, &conv_param); + + bmk1822_tensor_lmem_t ps32_ofmap; + ps32_ofmap = *conv_param.ofmap; + ps32_ofmap.shape.n = ps32_ofmap.shape.n * sizeof(int); + + u8 *output = get_tensor_l2g(&ctx, bk_ctx, &ps32_ofmap); + + int has_error = array_cmp_int8( + " Comparing intermediate results ...\n", + output_ref, (s8 *)output, conv_output_size(&p_param) * sizeof(int)); + + if (has_error) { + print_conv_param(&p_param); + printf(" Comparison FAILED\n"); + exit(-1); + } + free(output); + } + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + printf(" test_ps32_end_mode\n"); + for (int i=0; i < conv_input_size(&p_param); i++) + input[i] = rand() % 256 - 128; + + for (int i=0; i < conv_weight_size(&p_param); i++) + weight[i] = rand() % 256 - 128; + + ret = ps32_m1_conv_ref(&p_param, input, weight, bias, output_ref); + assert(ret == BM_SUCCESS); + + make_bmk_conv_param_ps32(bk_ctx, &conv_param, &p_param, 1); + + tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + + if (tl_alloc_success) { + + 
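+    // ps32_mode == 1 is the "end" pass: bias, relu and the final right
+    // shift are applied here, so the ofmap read back below is plain s8,
+    // with no 4x shape.n scaling, unlike the mode-2/mode-3 read-backs above.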
put_tensor_g2l(&ctx, bk_ctx, conv_param.ifmap, (u8 *)input); + put_conv_weight(&ctx, bk_ctx, conv_param.weight, (u8 *)weight); + if (p_param.using_bias) { + put_conv_bias(&ctx, bk_ctx, conv_param.bias, bias); + } + bmk1822_tiu_convolution(bk_ctx, &conv_param); + u8 *output = get_tensor_l2g(&ctx, bk_ctx, conv_param.ofmap); + + int has_error = array_cmp_int8( + " Comparing end results ...\n", + output_ref, (s8 *)output, conv_output_size(&p_param)); + + if (has_error) { + print_conv_param(&p_param); + printf(" Comparison FAILED\n"); + exit(-1); + } + free(output); + } + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + free(input); + free(weight); + free(bias); + free(output_ref); + + return tl_alloc_success ? 1 : 0; +} + +static int test_ic_tiling_conv( + conv_param_t &p_param, bmctx_t ctx, bmk_ctx_t *bk_ctx) +{ + printf(" test tiled ps32 conv\n"); + s8 *input = alloc_input(&p_param); + s8 *weight = alloc_weight(&p_param); + s16 *bias = alloc_bias(&p_param); + p_param.r_shift_m = calc_rshift_m(&p_param, weight); + s8 *output_ref = (s8 *)malloc(sizeof(s8) * conv_output_size(&p_param)); + if (!output_ref) + return BM_ERR_FAILURE; + + bmerr_t ret = conv_ref(&p_param, input, weight, bias, output_ref); + assert(ret == BM_SUCCESS); + bmk1822_tiu_convolution_param_t conv_tmp_param; + bmk1822_tiu_convolution_param_t conv_param; + make_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + if (tl_alloc_success) { + if (p_param.using_bias) { + conv_tmp_param.bias = conv_param.bias; + put_conv_bias(&ctx, bk_ctx, conv_param.bias, bias); + neuron_dump ( + " test_ic_tiling_conv: bias", + 1, + conv_param.bias->shape.c, + conv_param.bias->shape.h, + conv_param.bias->shape.w, + (s16 *)bias); + } + // for ps32_md[1] = 1, relu_enable & rshift_bits need to set to 0 + // so we store those parameters to conv_tmp_para + conv_tmp_param.relu_enable = conv_param.relu_enable; + conv_tmp_param.rshift_bits = conv_param.rshift_bits; + conv_tmp_param.bias = conv_param.bias; + + u32 ic_step = 1; + u32 n_step = 1; + tl_t ifmap = *conv_param.ifmap; + tl_t ofmap = *conv_param.ofmap; + tg_shape_t s; + s.n = conv_param.ifmap->shape.n; + s.c = conv_param.ifmap->shape.c; + s.h = conv_param.ifmap->shape.h; + s.w = conv_param.ifmap->shape.w; + tg_t *tg_ifmap = alloc_tg_gmem(&ctx, s, FMT_I8); + put_tg_gmem(&ctx, tg_ifmap, (u8 *)input); + + s.n = conv_param.weight->shape.n; + s.c = conv_param.weight->shape.c; + s.h = conv_param.weight->shape.h; + s.w = conv_param.weight->shape.w; + u8 *transformed_weight = + transform_weight(&conv_param.weight->shape, (u8 *)weight); + tg_t *tg_weight = alloc_tg_gmem(&ctx, s, FMT_I8); + put_tg_gmem(&ctx, tg_weight, (u8 *)transformed_weight); + + neuron_dump ( + " test_ic_tiling_conv: input", + p_param.input_n, + p_param.input_c, + p_param.input_h, + p_param.input_w, + (s8 *)input); + + neuron_dump ( + " test_ic_tiling_conv: kernel", + 1, + conv_param.weight->shape.c, + conv_param.weight->shape.h * conv_param.weight->shape.w, + conv_param.weight->shape.n, + (s8 *)transformed_weight); + free(transformed_weight); + + tl_shape_t cur_tl_ifmap_shape = { + n_step, + ic_step, + ifmap.shape.h, + ifmap.shape.w + }; + + tg_shape_t cur_tg_ifmap_shape = { + cur_tl_ifmap_shape.n, + cur_tl_ifmap_shape.c, + cur_tl_ifmap_shape.h, + cur_tl_ifmap_shape.w + }; + + tg_stride_t cur_tg_ifmap_stride = { + tg_ifmap->stride.n, + tg_ifmap->stride.c, + tg_ifmap->stride.h, + }; + + tg_t cur_tg_ifmap; + cur_tg_ifmap.base_reg_index = 0; + 
cur_tg_ifmap.start_address = tg_ifmap->start_address; + cur_tg_ifmap.shape = cur_tg_ifmap_shape; + cur_tg_ifmap.stride = cur_tg_ifmap_stride; + cur_tg_ifmap.fmt = FMT_I8; + + tl_t cur_tl_ifmap; + cur_tl_ifmap.shape = cur_tl_ifmap_shape; + cur_tl_ifmap.stride = + bmk1822_tensor_lmem_default_stride(bk_ctx, cur_tl_ifmap_shape, FMT_I8, 1); + cur_tl_ifmap.start_address = ifmap.start_address; + cur_tl_ifmap.fmt = ifmap.fmt; + + tl_t cur_tl_ofmap; + cur_tl_ofmap.start_address = ofmap.start_address; + cur_tl_ofmap.shape = ofmap.shape; + cur_tl_ofmap.shape.n = n_step; + cur_tl_ofmap.stride = + bmk1822_tensor_lmem_default_stride(bk_ctx, cur_tl_ofmap.shape, FMT_I8, 1); + cur_tl_ofmap.fmt = ofmap.fmt; + + tl_t cur_tl_weight; + cur_tl_weight.start_address = conv_param.weight->start_address; + cur_tl_weight.shape = conv_param.weight->shape; + cur_tl_weight.shape.n = ic_step; + cur_tl_weight.stride = { + 1, + cur_tl_weight.shape.n * cur_tl_weight.shape.h * cur_tl_weight.shape.w, + cur_tl_weight.shape.n * cur_tl_weight.shape.w, + cur_tl_weight.shape.n + }; + cur_tl_weight.fmt = conv_param.weight->fmt; + + const tl_t *saved_tl_weight = conv_param.weight; + const tl_t *saved_tl_ifmap = conv_param.ifmap; + for (u32 ci = 0; ci < ifmap.shape.c; ci += ic_step) { + { + u32 ic = tg_weight->shape.n; + u32 oc = tg_weight->shape.c; + u32 kh = tg_weight->shape.h; + u32 kw = tg_weight->shape.w; + + tg_t cur_tdma_tg_weight; + cur_tdma_tg_weight.base_reg_index = tg_weight->base_reg_index; + cur_tdma_tg_weight.start_address = tg_weight->start_address + ci; + cur_tdma_tg_weight.fmt = tg_weight->fmt; + cur_tdma_tg_weight.shape = {1, oc, kh * kw, ic}; + cur_tdma_tg_weight.stride = + bmk1822_tensor_tgmem_default_stride(cur_tdma_tg_weight.shape, cur_tdma_tg_weight.fmt); + cur_tdma_tg_weight.shape = {1, oc, kh * kw, ic_step}; + + tl_t cur_tdma_tl_weight; + cur_tdma_tl_weight = cur_tl_weight; + cur_tdma_tl_weight.shape.n = cur_tdma_tg_weight.shape.n; + cur_tdma_tl_weight.shape.c = cur_tdma_tg_weight.shape.c; + cur_tdma_tl_weight.shape.h = cur_tdma_tg_weight.shape.h; + cur_tdma_tl_weight.shape.w = cur_tdma_tg_weight.shape.w; + cur_tdma_tl_weight.stride = bmk1822_tensor_lmem_default_stride( + bk_ctx, cur_tdma_tl_weight.shape, FMT_I8, 0); + + bmk1822_tdma_tg2l_tensor_copy_param_t p1; + memset(&p1, 0, sizeof(p1)); + p1.src = &cur_tdma_tg_weight; + p1.dst = &cur_tdma_tl_weight; + bmk1822_tdma_g2l_tensor_copy(bk_ctx, &p1); + test_submit(&ctx); + } + { + bmk1822_tdma_tg2l_tensor_copy_param_t p2; + memset(&p2, 0, sizeof(p2)); + cur_tg_ifmap.start_address = + tg_ifmap->start_address + ci * tg_ifmap->stride.c; + p2.src = &cur_tg_ifmap; + p2.dst = &cur_tl_ifmap; + bmk1822_tdma_g2l_tensor_copy(bk_ctx, &p2); + test_submit(&ctx); + } + + conv_param.ifmap = &cur_tl_ifmap; + conv_param.weight = &cur_tl_weight; + + // for ps32_md[1] = 1, relu_enable & rshift_bits need to set to 0 + // so we store those parameters to conv_tmp_para + if (ci == (ifmap.shape.c - 1)) + { + conv_param.relu_enable = conv_tmp_param.relu_enable; + conv_param.rshift_bits = conv_tmp_param.rshift_bits; + conv_param.bias = conv_tmp_param.bias; + conv_param.ps32_mode = 1; + } + else if (ci == 0) + { + conv_param.relu_enable = 0; + conv_param.rshift_bits = 0; + conv_param.bias = 0;; + conv_param.ps32_mode = 2; + } + else + { + conv_param.relu_enable = 0; + conv_param.rshift_bits = 0; + conv_param.bias = 0;; + conv_param.ps32_mode = 3; + } + bmk1822_tiu_convolution(bk_ctx, &conv_param); + conv_param.weight = saved_tl_weight; + conv_param.ifmap = saved_tl_ifmap; + } + + u8 
*output = get_tensor_l2g(&ctx, bk_ctx, conv_param.ofmap); + + free_tg_gmem(&ctx, tg_ifmap); + free_tg_gmem(&ctx, tg_weight); + int has_error = array_cmp_int8( + " Comparing results ...\n", + output_ref, (s8 *)output, conv_output_size(&p_param)); + + if (has_error) { + print_conv_param(&p_param); + printf(" Comparison FAILED\n"); + exit(-1); + } + free(output); + } + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + free(input); + free(weight); + free(output_ref); + free(bias); + + return tl_alloc_success ? 1 : 0; +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + int test_finished_num = 0; + for (int i = 0; i < 5; i++) { + printf("random_test_conv iteration: %d\n", i); + conv_param_t test_conv_param; + init_conv_param(test_conv_param); + test_finished_num += test_ic_tiling_conv(test_conv_param, ctx, bk_ctx); + test_finished_num += test_ps32_ut(test_conv_param, ctx, bk_ctx); + if (!test_conv_param.using_bias) + test_conv_param.using_bias = 1; + if (test_conv_param.output_c <= 9) + test_conv_param.output_c += 3; + test_finished_num += test_ic_tiling_conv(test_conv_param, ctx, bk_ctx); + test_finished_num += test_ps32_ut(test_conv_param, ctx, bk_ctx); + } + printf("test_finished_num: %d\n", test_finished_num); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_conv_qdm.cpp b/cviruntime/test/1822/test_1822_conv_qdm.cpp new file mode 100644 index 000000000..c6886af60 --- /dev/null +++ b/cviruntime/test/1822/test_1822_conv_qdm.cpp @@ -0,0 +1,1616 @@ +#include +#include "1822_test_util.h" +#include "test_tf_quant_util.h" + +// #define ENABLE_DEBUG_MSG +// #define ENABLE_FULL_REGRESSION +// #define ENABLE_TV_GEN_PATTERN + +#define MIN_EXEC_TESTS 20 + +typedef struct { + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int stride_w; + int output_c; + int output_h; + int output_w; + int has_bias; + int relu_enable; + s8 *input_data; + s8 *filter_data; + s8 *output_data; + s32 *bias_data; + u32 *multiplier_data; + s8 *shift_data; + float float_multiplier; + int retry_cnt; +} conv_test_param_t; + +inline int Offset(tl_shape_t shape, int n, int c, int h, int w) +{ + return n * (shape.c * shape.h * shape.w) + c * (shape.h * shape.w) + + h * shape.w + w; +} + +void conv_per_channel_ref(conv_test_param_t *p_param) +{ + const int stride_width = p_param->stride_w; + const int stride_height = p_param->stride_h; + const int dilation_width_factor = 1; + const int dilation_height_factor = 1; + const int pad_width = p_param->pad_left; + const int pad_height = p_param->pad_top; + + const s32 output_activation_min = -128; + const s32 output_activation_max = 127; + + const int batches = p_param->input_n; + const int input_depth = p_param->input_c; + const int output_depth = p_param->output_c; + + const int input_height = p_param->input_h; + const int input_width = p_param->input_w; + const int filter_height = p_param->kh; + const int filter_width = p_param->kw; + const int output_height = p_param->output_h; + const int output_width = p_param->output_w; + s8 *input_data = p_param->input_data; + s8 *filter_data = p_param->filter_data; + s8 *output_data = p_param->output_data; + s32 *bias_data = p_param->has_bias ? 
p_param->bias_data : nullptr; + u32 *output_multiplier = p_param->multiplier_data; + s8 *output_rshift = p_param->shift_data; + + tl_shape_t input_shape = { + static_cast(batches), static_cast(input_depth), + static_cast(input_height), static_cast(input_width)}; + tl_shape_t filter_shape = { + static_cast(output_depth), static_cast(filter_height), + static_cast(filter_width), static_cast(input_depth)}; + tl_shape_t output_shape = { + static_cast(batches), static_cast(output_depth), + static_cast(output_height), static_cast(output_width)}; + +#ifdef ENABLE_DEBUG_MSG + printf("conv_per_channel_ref: \n" + " input (n=%d, ic=%d, h=%d, w=%d)\n" + " kernel (oc=%d, kh=%d, kw=%d, ic=%d)\n", + batches, input_depth, input_height, input_width, output_depth, + filter_height, filter_width, input_depth); +#endif + + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + s32 acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + if (is_point_inside_image) { + s32 input_val = input_data[Offset(input_shape, batch, + in_channel, in_y, in_x)]; + // s32 filter_val = filter_data[Offset(filter_shape, + // out_channel, in_channel, + // filter_y, filter_x)]; + s32 filter_val = + filter_data[Offset(filter_shape, out_channel, filter_y, + filter_x, in_channel)]; + acc += filter_val * input_val; + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d]" + "[filter_y=%d][filter_x=%d][in_channel=%d] acc(%d) += " + "%d * %d = %d\n", + batch, out_channel, out_y, out_x, filter_y, filter_x, + in_channel, acc - filter_val * input_val, filter_val, + input_val, acc); +#endif + } + } + } + } + + if (bias_data) { + acc += bias_data[out_channel]; + } + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d] acc = %d, " + "bias %d\n", + batch, out_channel, out_y, out_x, acc, + bias_data ? 
bias_data[out_channel] : 0); +#endif + + acc = MultiplyByQuantizedMultiplier( + acc, output_multiplier[out_channel], output_rshift[out_channel]); + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d] acc = %d, " + "multiplier %d, shift %d\n", + batch, out_channel, out_y, out_x, acc, + output_multiplier[out_channel], output_rshift[out_channel]); +#endif + + acc = MAX(acc, output_activation_min); + acc = MIN(acc, output_activation_max); + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d] acc = %d\n", + batch, out_channel, out_y, out_x, acc); +#endif + + output_data[Offset(output_shape, batch, out_channel, out_y, out_x)] = + static_cast(acc); + } + } + } + } +} + +void calc_conv_float_multiplier(conv_test_param_t *p_param) +{ + const int stride_width = p_param->stride_w; + const int stride_height = p_param->stride_h; + const int dilation_width_factor = 1; + const int dilation_height_factor = 1; + const int pad_width = p_param->pad_left; + const int pad_height = p_param->pad_top; + + const int batches = p_param->input_n; + const int input_depth = p_param->input_c; + const int output_depth = p_param->output_c; + + const int input_height = p_param->input_h; + const int input_width = p_param->input_w; + const int filter_height = p_param->kh; + const int filter_width = p_param->kw; + const int output_height = p_param->output_h; + const int output_width = p_param->output_w; + s8 *input_data = p_param->input_data; + s8 *filter_data = p_param->filter_data; + s32 *bias_data = p_param->has_bias ? p_param->bias_data : nullptr; + + tl_shape_t input_shape = { + static_cast(batches), static_cast(input_depth), + static_cast(input_height), static_cast(input_width)}; + tl_shape_t filter_shape = { + static_cast(output_depth), static_cast(filter_height), + static_cast(filter_width), static_cast(input_depth)}; + + int output_accu_min = INT_MAX; + int output_accu_max = INT_MIN; + +#ifdef ENABLE_DEBUG_MSG + printf("calc_conv_float_multiplier =>\n"); +#endif + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + s32 acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + if (is_point_inside_image) { + s32 input_val = input_data[Offset(input_shape, batch, + in_channel, in_y, in_x)]; + // s32 filter_val = filter_data[Offset(filter_shape, + // out_channel, in_channel, + // filter_y, filter_x)]; + s32 filter_val = + filter_data[Offset(filter_shape, out_channel, filter_y, + filter_x, in_channel)]; + acc += filter_val * input_val; + + // printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d]" + // "[filter_y=%d][filter_x=%d][in_channel=%d] acc(%d) + // += %d * %d = %d\n", batch, out_channel, out_y, + // out_x, filter_y, filter_x, in_channel, acc - + // filter_val * input_val, filter_val, input_val, acc); + } + } + } + } + + if (bias_data) { + acc += 
bias_data[out_channel]; + } + + output_accu_max = MAX(acc, output_accu_max); + output_accu_min = MIN(acc, output_accu_min); + } + } + } + } + + // Since int8 ranges from -128 to 127, we need to squeeze the accumulator + // MIN/MAX fit in those ranges correspondingly as much as possible. + if (abs(output_accu_max) > abs(output_accu_min)) { + p_param->float_multiplier = 127.0f / abs(output_accu_max); + } else { + p_param->float_multiplier = 128.0f / abs(output_accu_min); + } + +#ifdef ENABLE_DEBUG_MSG + printf(" output_accu_min %d, output_accu_max %d, output_multiplier %f\n", + output_accu_min, output_accu_max, p_param->float_multiplier); +#endif + +#ifdef ENABLE_DEBUG_MSG + printf("<= calc_dw_conv_float_multiplier\n"); +#endif +} + +int simple_test(bmctx_t *ctx, bmk_ctx_t *bmk) +{ + int ret = 0; + + const int batches = 1; + const int input_depth = 2; + const int input_height = 2; + const int input_width = 3; + tl_shape_t input_shape = {batches, input_depth, input_height, input_width}; + s8 input_data[12] = { + 9, 1, -11, // ic = 0, h = 0 + 13, 5, -15, // ic = 0, h = 1 + 5, -7, -15, // ic = 1, h = 0 + 9, -11, -19 // ic = 1, h = 1 + }; + + const int output_depth = 2; + const int kernel_height = 2; + const int kernel_width = 2; + tl_shape_t filter_shape = {output_depth, input_depth, kernel_height, + kernel_width}; + + // TIU weight layout (1, oc, hw*kc, ic) + tl_shape_t filter_shape_for_dma = {1, output_depth, + kernel_height * kernel_width, input_depth}; + s8 filter_data_for_dma[16] = { + 2, 4, 6, 8, 6, 8, 10, 12, // oc = 0 + 28, 32, 20, 24, 12, 16, 4, 8 // oc = 1 + }; + + s32 bias_data[2] = {12, -16}; + + const int output_height = 1; + const int output_width = 2; + tl_shape_t output_shape = {1, output_depth, output_height, output_width}; + // zero_point = 0 + s8 ref_output_data[4] = { + 17, -128, // oc = 0 + 60, -128, // oc = 1 + }; + + u32 output_multiplier[] = {1073741824, 1073741824}; + s8 output_rshift[2] = {1, 2}; // changed to right shift + + s8 output_data[4]; + + conv_test_param_t params; + memset(¶ms, 0, sizeof(params)); + + params.input_n = batches; + params.input_c = input_depth; + params.input_h = input_height; + params.input_w = input_width; + params.kh = kernel_height; + params.kw = kernel_width; + params.output_c = output_depth; + params.output_h = output_height; + params.output_w = output_width; + params.stride_w = 1; + params.stride_h = 1; + params.input_data = input_data; + params.filter_data = filter_data_for_dma; + params.output_data = output_data; + params.has_bias = 1; + params.bias_data = bias_data; + params.multiplier_data = output_multiplier; + params.shift_data = output_rshift; + conv_per_channel_ref(¶ms); + + printf("Compare ref and golden\n"); + for (int i = 0; i < 4; i++) { + if (output_data[i] != ref_output_data[i]) { + printf("Error ! 
output[%d]=%d != ref_output_data[%d]=%d\n", i, + output_data[i], i, ref_output_data[i]); + ret = -1; + } + } + + // tl_shape_t per_channel_cal_shape = {1, /*oc=*/2, 1, 9}; + u8 per_channel_cal_data[18]; + pack_chl_quan_param(2, /*has_bias=*/true, bias_data, output_multiplier, + output_rshift, per_channel_cal_data); + + bmk1822_tensor_lmem_t *tl_per_channel_cal = + bmk1822_lmem_alloc_tensor(bmk, {1, 2, 1, 9}, FMT_U8, + /*eu_align*/ 0); + + bmk1822_tensor_lmem_t *tl_input = + bmk1822_lmem_alloc_tensor(bmk, input_shape, FMT_I8, /*eu_align=*/1); + + bmk1822_tensor_lmem_t *tl_filter = bmk1822_lmem_alloc_tensor( + bmk, filter_shape_for_dma, FMT_I8, /*eu_align=*/1); + + bmk1822_tensor_lmem_t *tl_output = + bmk1822_lmem_alloc_tensor(bmk, output_shape, FMT_I8, /*eu_align=*/1); + + put_tensor_g2l(ctx, bmk, tl_per_channel_cal, per_channel_cal_data); + put_tensor_g2l(ctx, bmk, tl_input, reinterpret_cast<u8 *>(input_data)); + put_tensor_g2l(ctx, bmk, tl_filter, + reinterpret_cast<u8 *>(filter_data_for_dma)); + + // Restore filter shape for tiu operation + tl_filter->shape = filter_shape; + tl_filter->stride = bmk1822_tensor_lmem_default_stride( + bmk, tl_filter->shape, FMT_I8, /*eu_align=*/1); + + { + // Reshape per channel quantization data + tl_per_channel_cal->shape = {1, 2, 1, 1}; + tl_per_channel_cal->stride = bmk1822_tensor_lmem_default_stride( + bmk, tl_per_channel_cal->shape, FMT_I8, /*eu_align=*/0); + + bmk1822_tiu_convolution_qdm_param_t param; + memset(&param, 0, sizeof(param)); + param.ofmap = tl_output; + param.ifmap = tl_input; + param.weight = tl_filter; + param.chl_quan_param = tl_per_channel_cal; + param.stride_h = 1; + param.stride_w = 1; + param.dilation_h = 1; + param.dilation_w = 1; + param.has_bias = 1; + bmk1822_tiu_convolution_qdm(bmk, &param); + } + + test_submit(ctx); + + printf("Compare tiu and golden\n"); + s8 *conv_output_data = + reinterpret_cast<s8 *>(get_tensor_l2g(ctx, bmk, tl_output)); + for (int i = 0; i < static_cast<int>(sizeof(ref_output_data)); i++) { + if (conv_output_data[i] != ref_output_data[i]) { + printf("output_data[%d] %d != %d\n", i, conv_output_data[i], + ref_output_data[i]); + ret = -1; + } + } + + free(conv_output_data); + + // Reverse order + bmk1822_lmem_free_tensor(bmk, tl_output); + bmk1822_lmem_free_tensor(bmk, tl_filter); + bmk1822_lmem_free_tensor(bmk, tl_input); + bmk1822_lmem_free_tensor(bmk, tl_per_channel_cal); + + return ret; +} + +void fill_random_data_s8(s8 *input_data, int size) +{ + for (int i = 0; i < size; ++i) { + int is_satured = ((rand() % 1000) == 1) ? 1 : 0; + int is_sign = rand() % 2 ? 1 : -1; + + // is_sign is +1/-1 (always truthy), so test the negative case explicitly + if (is_satured && is_sign < 0) { + input_data[i] = -128; + } else if (is_satured) { + input_data[i] = 127; + } else { + input_data[i] = is_sign * rand() % 128; + } + } +} + +void fill_random_data_s32(s32 *input_data, int size) +{ + for (int i = 0; i < size; ++i) { + int is_satured = ((rand() % 1000) == 1) ? 1 : 0; + int is_sign = rand() % 2 ? 
1 : -1; + + // is_sign is +1/-1 (always truthy), so test the negative case explicitly + if (is_satured && is_sign < 0) { + input_data[i] = INT_MIN; + } else if (is_satured) { + input_data[i] = INT_MAX; + } else { + input_data[i] = is_sign * rand() % 128; + } + } +} + +bool check_valid_test_param(bmk_ctx_t *bk_ctx, conv_test_param_t *p_param) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int oh = p_param->output_h; + int ow = p_param->output_w; + int kh = p_param->kh; + int kw = p_param->kw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int chl_quan_per_lane_data_size = + p_param->has_bias ? 9 : 5; // bias(4) + multiplier(4) + shift(1) + + // Skip invalid shape + if ((kh > ih) || (kw > iw) || (stride_h > ih) || (stride_w > iw)) { + return false; + } + + // the product of randomly chosen dimensions may exceed s32, so use u32 + u32 input_size = in * ic * ih * iw; + u32 kernel_size = oc * ic * kh * kw; + u32 output_size = in * oc * oh * ow; + + bmk1822_chip_info_t chip_info = bmk1822_chip_info(); + u32 lmem_size_per_lane = chip_info.lmem_size; + u32 total_lmem_size = chip_info.lmem_size * chip_info.npu_num; + + u32 total_needed_size = input_size + kernel_size + output_size + + chl_quan_per_lane_data_size * chip_info.npu_num; + if (total_needed_size > total_lmem_size) { + return false; + } + + tl_shape_t input_shape = {static_cast<u32>(in), static_cast<u32>(ic), + static_cast<u32>(ih), static_cast<u32>(iw)}; + tl_shape_t filter_shape = {1, static_cast<u32>(oc), + static_cast<u32>(kh) * static_cast<u32>(kw), + static_cast<u32>(ic)}; + tl_shape_t output_shape = {static_cast<u32>(in), static_cast<u32>(oc), + static_cast<u32>(oh), static_cast<u32>(ow)}; + tl_shape_t cal_shape = {1, static_cast<u32>(oc), 1, + static_cast<u32>(chl_quan_per_lane_data_size)}; + + u32 needed_size = + bmk1822_lmem_tensor_to_size(bk_ctx, input_shape, FMT_I8, /*eu_align=*/1) + + bmk1822_lmem_tensor_to_size(bk_ctx, filter_shape, FMT_I8, /*eu_align=*/0) + + bmk1822_lmem_tensor_to_size(bk_ctx, output_shape, FMT_I8, /*eu_align=*/1) + + bmk1822_lmem_tensor_to_size(bk_ctx, cal_shape, FMT_I8, /*eu_align=*/0); + + // Skip invalid shape + if (needed_size > lmem_size_per_lane) { + return false; + } + + return true; +} + +int choose_from_range(int table[], int size, int index) +{ + if (index >= size) { + return 0; + } + + int val = table[index]; + if (index < (size - 1)) { + int range = MAX(table[index + 1] - table[index] - 1, 1); + val += rand() % range; + } + + return val; +} + +void dump_test_param(conv_test_param_t *p_param, bool dump_content) +{ + printf("Dump test parameter:\n"); + printf(" input_n %d\n", p_param->input_n); + printf(" input_c %d\n", p_param->input_c); + printf(" input_h %d\n", p_param->input_h); + printf(" input_w %d\n", p_param->input_w); + printf(" kw %d\n", p_param->kw); + printf(" kh %d\n", p_param->kh); + printf(" dh %d\n", p_param->dh); + printf(" dw %d\n", p_param->dw); + printf(" pad_top %d\n", p_param->pad_top); + printf(" pad_bot %d\n", p_param->pad_bot); + printf(" pad_left %d\n", p_param->pad_left); + printf(" pad_right %d\n", p_param->pad_right); + printf(" ins_h %d\n", p_param->ins_h); + printf(" ins_h_last %d\n", p_param->ins_h_last); + printf(" ins_w %d\n", p_param->ins_w); + printf(" ins_w_last %d\n", p_param->ins_w_last); + printf(" stride_h %d\n", p_param->stride_h); + printf(" stride_w %d\n", p_param->stride_w); + printf(" output_c %d\n", p_param->output_c); + printf(" output_h %d\n", p_param->output_h); + printf(" output_w %d\n", p_param->output_w); + printf(" has_bias %d\n", p_param->has_bias); + printf(" relu_enable %d\n", 
p_param->relu_enable); + + if (dump_content) { + printf("input_data(%d, %d, %d, %d) :\n", p_param->input_n, p_param->input_c, + p_param->input_h, p_param->input_w); + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + for (int i = 0; i < in; ++i) { + for (int j = 0; j < ic; ++j) { + for (int k = 0; k < ih; ++k) { + for (int l = 0; l < iw; ++l) { + int offset = i * (ic * ih * iw) + j * (ih * iw) + k * iw + l; + printf("%d, ", p_param->input_data[offset]); + } + printf("\n"); + } + } + } + printf("\n\n"); + + printf("kernel_data (oc=%d, kh=%d, kw=%d, ic=%d)\n", p_param->output_c, + p_param->kh, p_param->kw, p_param->input_c); + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + for (int i = 0; i < oc; ++i) { + for (int j = 0; j < kh; ++j) { + for (int k = 0; k < kw; ++k) { + for (int l = 0; l < ic; ++l) { + int offset = i * (kh * kw * ic) + j * (kw * ic) + k * ic + l; + printf("%d, ", p_param->filter_data[offset]); + } + printf("\n"); + } + } + } + printf("\n\n"); + + if (p_param->has_bias) { + printf("bias_data:\n"); + for (int i = 0; i < oc; ++i) { + printf("%d, ", p_param->bias_data[i]); + } + printf("\n\n"); + } + + printf("multiplier_data:\n"); + for (int i = 0; i < oc; ++i) { + printf("%d, ", p_param->multiplier_data[i]); + } + printf("\n\n"); + + printf("shift_data:\n"); + for (int i = 0; i < oc; ++i) { + printf("%d, ", p_param->shift_data[i]); + } + printf("\n\n"); + } +} + + + +static conv_test_param_t keepFailParam; +static s8 *keep_input_data = NULL; + +static int keep_kernel_size = 0; +static s8 *keep_kernel_data = NULL; + +static int keep_output_size = 0; +static s8 *keep_output_data = NULL; + +static s32 *keep_bias_data = NULL; +static u32 *keep_multiplier_data = NULL; +static s8 *keep_shift_data = NULL; + + +int keep_fail_param(conv_test_param_t *p_param) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int oh = p_param->output_h; + int ow = p_param->output_w; + int kh = p_param->kh; + int kw = p_param->kw; + //int dh = p_param->dh; + //int dw = p_param->dw; + //int pad_top = p_param->pad_top; + //int pad_bot = p_param->pad_bot; + //int pad_left = p_param->pad_left; + //int pad_right = p_param->pad_right; + //int ins_h = p_param->ins_h; + //int ins_last_h = p_param->ins_h_last; + //int ins_w = p_param->ins_w; + //int ins_last_w = p_param->ins_w_last; + //int stride_h = p_param->stride_h; + //int stride_w = p_param->stride_w; + int has_bias = p_param->has_bias; + //int relu_enable = p_param->relu_enable; + + + memcpy(&keepFailParam, p_param, sizeof(conv_test_param_t)); + + int input_size = in * ic * iw * ih; + keep_input_data = (s8 *)malloc(input_size); + memcpy(keep_input_data, p_param->input_data, input_size); + + + keep_kernel_size = oc * ic * kh * kw; + keep_kernel_data = (s8 *)malloc(keep_kernel_size); + memcpy(keep_kernel_data, p_param->filter_data, keep_kernel_size); + + keep_output_size = in * oc * oh * ow; + keep_output_data = (s8 *)malloc(keep_output_size); + memcpy(keep_output_data, p_param->output_data, keep_output_size); + + keep_bias_data = (s32 *) malloc(sizeof(s32) * oc); + memcpy(keep_bias_data, p_param->bias_data, sizeof(s32) * oc); + + keep_multiplier_data = (u32 *) malloc(sizeof(u32) * oc); + memcpy(keep_multiplier_data, p_param->multiplier_data, sizeof(u32) * oc); + + keep_shift_data = (s8 *)malloc(oc); + memcpy(keep_shift_data, p_param->shift_data, oc); + + + + 
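// keep_fail_param() deep-copies the failing test configuration and all of + // its buffers into the static keep_* storage above, then repoints + // keepFailParam at those copies below, so run2_compare_conv() can replay + // the exact failing case after the original buffers have been freed. +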
keepFailParam.input_data = keep_input_data; + keepFailParam.filter_data = keep_kernel_data; + keepFailParam.output_data = keep_output_data; + keepFailParam.has_bias = has_bias; + keepFailParam.bias_data = keep_bias_data; + keepFailParam.multiplier_data = keep_multiplier_data; + keepFailParam.shift_data = keep_shift_data; + + return 0; +} + + +void dump2_test_param(conv_test_param_t *p_param) +{ + printf("dump2_test_param:\n"); + printf(" input_n %d\n", p_param->input_n); + printf(" input_c %d\n", p_param->input_c); + printf(" input_h %d\n", p_param->input_h); + printf(" input_w %d\n", p_param->input_w); + printf(" kw %d\n", p_param->kw); + printf(" kh %d\n", p_param->kh); + printf(" dh %d\n", p_param->dh); + printf(" dw %d\n", p_param->dw); + printf(" pad_top %d\n", p_param->pad_top); + printf(" pad_bot %d\n", p_param->pad_bot); + printf(" pad_left %d\n", p_param->pad_left); + printf(" pad_right %d\n", p_param->pad_right); + printf(" ins_h %d\n", p_param->ins_h); + printf(" ins_h_last %d\n", p_param->ins_h_last); + printf(" ins_w %d\n", p_param->ins_w); + printf(" ins_w_last %d\n", p_param->ins_w_last); + printf(" stride_h %d\n", p_param->stride_h); + printf(" stride_w %d\n", p_param->stride_w); + printf(" output_c %d\n", p_param->output_c); + printf(" output_h %d\n", p_param->output_h); + printf(" output_w %d\n", p_param->output_w); + printf(" has_bias %d\n", p_param->has_bias); + printf(" relu_enable %d\n", p_param->relu_enable); + + keep_fail_param(p_param); + printf("dump2_test_param\n\n"); + assert(0); +} + +int run_compare_conv(bmctx_t *ctx, bmk_ctx_t *bk_ctx, + conv_test_param_t *p_param) +{ + int ret = 0; + + if (ctx == nullptr || bk_ctx == nullptr) { + return -1; + } + + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int oh = p_param->output_h; + int ow = p_param->output_w; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_last_h = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_last_w = p_param->ins_w_last; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int has_bias = p_param->has_bias; + int relu_enable = p_param->relu_enable; + + int input_size = in * ic * iw * ih; + s8 *input_data = (s8 *)malloc(input_size); + + int kernel_size = oc * ic * kh * kw; + s8 *kernel_data = (s8 *)malloc(kernel_size); + + int output_size = in * oc * oh * ow; + s8 *output_data = (s8 *)malloc(output_size); + if (!input_data || !kernel_data || !output_data) { + free(input_data); + free(kernel_data); + free(output_data); + return -1; + } + + memset(output_data, 0, output_size); + + s32 *bias_data = (s32 *) malloc(sizeof(s32) * oc); + u32 *multiplier_data = (u32 *) malloc(sizeof(u32) * oc); + s8 *shift_data = (s8 *)malloc(oc); + + p_param->input_data = input_data; + p_param->filter_data = kernel_data; + p_param->output_data = output_data; + p_param->has_bias = has_bias; + p_param->bias_data = bias_data; + p_param->multiplier_data = multiplier_data; + p_param->shift_data = shift_data; + +#ifdef ENABLE_DEBUG_MSG + printf(" run_compare_conv =>\n"); + printf(" input (n=%d, ic=%d, h=%d, w=%d), kernel (oc=%d, ic=%d, h=%d, " + "w=%d), output (, c=%d, h=%d, w=%d), has_bias %d\n", + in, ic, ih, iw, oc, ic, kh, kw, oc, oh, ow, has_bias); +#endif + + int 
retry_cnt = p_param->retry_cnt; + do { + fill_random_data_s8(input_data, input_size); + fill_random_data_s8(kernel_data, kernel_size); + if (has_bias) { + fill_random_data_s32(bias_data, oc); + } + + p_param->float_multiplier = 100.0; // should be < 1.0 + calc_conv_float_multiplier(p_param); + + if (p_param->float_multiplier > 0.f && p_param->float_multiplier < 1.0) { + break; + } + + } while (--retry_cnt); + + if (p_param->float_multiplier >= 1.0) { + printf(" run_compare_dw_conv: unable to find valid multiplier\n"); + free(input_data); + free(kernel_data); + free(output_data); + free(bias_data); + free(multiplier_data); + free(shift_data); + + return -1; + } + + u32 base_multiplier = 0; + int base_shift = 0; + QuantizeMultiplierSmallerThanOne(p_param->float_multiplier, &base_multiplier, + &base_shift); + + for (int i = 0; i < oc; ++i) { + // multipliers typically range in [2^30 ; 2^31 - 1]. + // Values in [0, 2^30 - 1] are normally unused, but harmless. + // Thus a good way to randomize multipliers is to subtract from them + // a random value smaller than 2^30 but still significant compared to it. + p_param->multiplier_data[i] = base_multiplier - (rand() % (1 << 26)); + + int right_shift = base_shift - 1 + (rand() % 4); + p_param->shift_data[i] = + truncate_rshift((s8)right_shift, /*allow_lshift*/1); + +#ifdef ENABLE_DEBUG_MSG + printf(" [oc=%d] multiplier_data %d, shift_data %d\n", i, + p_param->multiplier_data[i], p_param->shift_data[i]); +#endif + } + + conv_per_channel_ref(p_param); + + const int chl_quan_per_lane_data_size = + p_param->has_bias ? 9 : 5; // bias(4) + multiplier(4) + shift(1) + const int cal_data_size = oc * chl_quan_per_lane_data_size; + u8 *chl_quan_data = (u8 *) malloc(cal_data_size); + pack_chl_quan_param(oc, p_param->has_bias, p_param->bias_data, + p_param->multiplier_data, p_param->shift_data, + chl_quan_data); + + tl_shape_t input_shape = {static_cast(in), static_cast(ic), + static_cast(ih), static_cast(iw)}; + tl_shape_t filter_shape = {1, static_cast(oc), + static_cast(kh) * static_cast(kw), + static_cast(ic)}; + tl_shape_t output_shape = {static_cast(in), static_cast(oc), + static_cast(oh), static_cast(ow)}; + tl_shape_t cal_shape = {1, static_cast(oc), 1, + static_cast(chl_quan_per_lane_data_size)}; + + bmk1822_tensor_lmem_t *tl_input = + bmk1822_lmem_alloc_tensor(bk_ctx, input_shape, FMT_I8, /*eu_aign=*/1); + + bmk1822_tensor_lmem_t *tl_filter = + bmk1822_lmem_alloc_tensor(bk_ctx, filter_shape, FMT_I8, /*eu_align=*/0); + + bmk1822_tensor_lmem_t *tl_output = + bmk1822_lmem_alloc_tensor(bk_ctx, output_shape, FMT_I8, /*eu_align=*/1); + + // Shape for TDMA load + bmk1822_tensor_lmem_t *tl_cal_data = + bmk1822_lmem_alloc_tensor(bk_ctx, cal_shape, FMT_U8, /*eu_align*/ 0); + + if (!tl_input || !tl_filter || !tl_output || !tl_cal_data) { + if (tl_input == nullptr) { + printf(" fail to alloc tl_input (%d, %d, %d, %d)\n", input_shape.n, + input_shape.c, input_shape.h, input_shape.w); + } + if (tl_filter == nullptr) { + printf(" fail to alloc tl_filter (%d, %d, %d, %d)\n", filter_shape.n, + filter_shape.c, filter_shape.h, filter_shape.w); + } + if (tl_output == nullptr) { + printf(" fail to alloc tl_output (%d, %d, %d, %d)\n", output_shape.n, + output_shape.c, output_shape.h, output_shape.w); + } + if (tl_cal_data == nullptr) { + printf(" fail to alloc tl_cal_data (%d, %d ,%d, %d)\n", cal_shape.n, + cal_shape.c, cal_shape.h, cal_shape.w); + } + + // Reverse order + if (tl_cal_data) + bmk1822_lmem_free_tensor(bk_ctx, tl_cal_data); + if (tl_output) + 
bmk1822_lmem_free_tensor(bk_ctx, tl_output); + if (tl_filter) + bmk1822_lmem_free_tensor(bk_ctx, tl_filter); + if (tl_input) + bmk1822_lmem_free_tensor(bk_ctx, tl_input); + + return -1; + } + + put_tensor_g2l(ctx, bk_ctx, tl_cal_data, chl_quan_data); + put_tensor_g2l(ctx, bk_ctx, tl_input, reinterpret_cast(input_data)); + put_tensor_g2l(ctx, bk_ctx, tl_filter, reinterpret_cast(kernel_data)); + + { + // Reshape per channel quantization data for TIU + tl_cal_data->shape = {1, static_cast(oc), 1, 1}; + tl_cal_data->stride = bmk1822_tensor_lmem_default_stride( + bk_ctx, tl_cal_data->shape, FMT_I8, /*eu_align=*/0); + + // Reshape weight for TIU + tl_filter->shape = {static_cast(ic), static_cast(oc), + static_cast(kh), static_cast(kw)}; + + bmk1822_tiu_convolution_qdm_param_t param; + memset(¶m, 0, sizeof(param)); + param.ofmap = tl_output; + param.ifmap = tl_input; + param.weight = tl_filter; + param.chl_quan_param = tl_cal_data; + param.ins_h = ins_h; + param.ins_last_h = ins_last_h; + param.ins_w = ins_w; + param.ins_last_w = ins_last_w; + param.stride_h = stride_h; + param.stride_w = stride_w; + param.dilation_h = dh; + param.dilation_w = dw; + param.pad_top = pad_top; + param.pad_bottom = pad_bot; + param.pad_left = pad_left; + param.pad_right = pad_right; + param.has_bias = has_bias; + param.relu_enable = relu_enable; + +#ifdef ENABLE_DEBUG_MSG + printf(" tiu_conv_qdm:\n"); + printf(" ifmap shape (%d, %d, %d, %d)\n", param.ifmap->shape.n, + param.ifmap->shape.c, param.ifmap->shape.h, param.ifmap->shape.w); + printf(" weight shape (%d, %d, %d, %d)\n", param.weight->shape.n, + param.weight->shape.c, param.weight->shape.h, param.weight->shape.w); + printf(" ofmap shape (%d, %d, %d, %d)\n", param.ofmap->shape.n, + param.ofmap->shape.c, param.ofmap->shape.h, param.ofmap->shape.w); +#endif + + bmk1822_tiu_convolution_qdm(bk_ctx, ¶m); + } + + test_submit(ctx); + +#ifdef ENABLE_DEBUG_MSG + printf(" compare result:\n"); +#endif + s8 *conv_output_data = + reinterpret_cast(get_tensor_l2g(ctx, bk_ctx, tl_output)); + for (int i = 0; i < in; ++i) { + for (int j = 0; j < oc; ++j) { + for (int k = 0; k < oh; ++k) { + for (int l = 0; l < ow; ++l) { + int offset = i * (oc * oh * ow) + j * (oh * ow) + k * ow + l; + if (conv_output_data[offset] != output_data[offset]) { + printf(" [ni=%d][oci=%d][ohi=%d][owi=%d] output %d(tiu) != " + "%d(ref)\n", + i, j, k, l, conv_output_data[offset], output_data[offset]); + ret = -1; + break; + } + } + } + } + } + + if (ret) { + //dump_test_param(p_param, /*dump_content=*/true); + dump2_test_param(p_param); + } + + // Reverse order + bmk1822_lmem_free_tensor(bk_ctx, tl_cal_data); + bmk1822_lmem_free_tensor(bk_ctx, tl_output); + bmk1822_lmem_free_tensor(bk_ctx, tl_filter); + bmk1822_lmem_free_tensor(bk_ctx, tl_input); + + free(conv_output_data); + + free(input_data); + free(kernel_data); + free(output_data); + free(bias_data); + free(multiplier_data); + free(shift_data); + free(chl_quan_data); + +#ifdef ENABLE_DEBUG_MSG + printf(" <= run_compare_conv\n"); +#endif + + return ret; +} + + + + +int run2_compare_conv(bmctx_t *ctx, bmk_ctx_t *bk_ctx) +{ + int ret = 0; + + if (ctx == nullptr || bk_ctx == nullptr) { + return -1; + } + + conv_test_param_t *p_param = &keepFailParam; + + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int oh = p_param->output_h; + int ow = p_param->output_w; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + 
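// This replays the parameters captured by keep_fail_param(): aside from + // reading from keepFailParam and clamping per-channel shifts to + // non-negative values below ("Our H/W only supports right shift"), it + // mirrors run_compare_conv() above. As a rough illustration (not from the + // source): a float_multiplier of 0.75 would quantize to about + // 0.75 * 2^31 = 1610612736 with zero extra shift, consistent with the + // [2^30, 2^31 - 1] range noted in the randomization comment below. +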
int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_last_h = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_last_w = p_param->ins_w_last; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int has_bias = p_param->has_bias; + int relu_enable = p_param->relu_enable; + + int input_size = in * ic * iw * ih; + s8 *input_data = (s8 *)malloc(input_size); + + int kernel_size = oc * ic * kh * kw; + s8 *kernel_data = (s8 *)malloc(kernel_size); + + int output_size = in * oc * oh * ow; + s8 *output_data = (s8 *)malloc(output_size); + if (!input_data || !kernel_data || !output_data) { + free(input_data); + free(kernel_data); + free(output_data); + return -1; + } + + memset(output_data, 0, output_size); + + s32 *bias_data = (s32 *) malloc(sizeof(s32) * oc); + u32 *multiplier_data = (u32 *) malloc(sizeof(u32) * oc); + s8 *shift_data = (s8 *)malloc(oc); + + //p_param->input_data = input_data; + //p_param->filter_data = kernel_data; + //p_param->output_data = output_data; + //p_param->has_bias = has_bias; + //p_param->bias_data = bias_data; + //p_param->multiplier_data = multiplier_data; + //p_param->shift_data = shift_data; + +#ifdef ENABLE_DEBUG_MSG + printf(" run_compare_conv =>\n"); + printf(" input (n=%d, ic=%d, h=%d, w=%d), kernel (oc=%d, ic=%d, h=%d, " + "w=%d), output (, c=%d, h=%d, w=%d), has_bias %d\n", + in, ic, ih, iw, oc, ic, kh, kw, oc, oh, ow, has_bias); +#endif + + int retry_cnt = p_param->retry_cnt; + do { + fill_random_data_s8(input_data, input_size); + fill_random_data_s8(kernel_data, kernel_size); + if (has_bias) { + fill_random_data_s32(bias_data, oc); + } + + p_param->float_multiplier = 100.0; // should be < 1.0 + calc_conv_float_multiplier(p_param); + + if (p_param->float_multiplier > 0.f && p_param->float_multiplier < 1.0) { + break; + } + + } while (--retry_cnt); + + if (p_param->float_multiplier >= 1.0) { + printf(" run_compare_dw_conv: unable to find valid multiplier\n"); + free(input_data); + free(kernel_data); + free(output_data); + free(bias_data); + free(multiplier_data); + free(shift_data); + + return -1; + } + + u32 base_multiplier = 0; + int base_shift = 0; + QuantizeMultiplierSmallerThanOne(p_param->float_multiplier, &base_multiplier, + &base_shift); + + for (int i = 0; i < oc; ++i) { + // multipliers typically range in [2^30 ; 2^31 - 1]. + // Values in [0, 2^30 - 1] are normally unused, but harmless. + // Thus a good way to randomize multipliers is to subtract from them + // a random value smaller than 2^30 but still significant compared to it. + p_param->multiplier_data[i] = base_multiplier - (rand() % (1 << 26)); + + // Our H/W only supports right shift + int right_shift = base_shift - 1 + (rand() % 4); + p_param->shift_data[i] = right_shift > 0 ? right_shift : 0; + +#ifdef ENABLE_DEBUG_MSG + printf(" [oc=%d] multiplier_data %d, shift_data %d\n", i, + p_param->multiplier_data[i], p_param->shift_data[i]); +#endif + } + + conv_per_channel_ref(p_param); + + const int chl_quan_per_lane_data_size = + p_param->has_bias ? 
9 : 5; // bias(4) + multiplier(4) + shift(1) + const int cal_data_size = oc * chl_quan_per_lane_data_size; + u8 *chl_quan_data = (u8 *) malloc(cal_data_size); + pack_chl_quan_param(oc, p_param->has_bias, p_param->bias_data, + p_param->multiplier_data, p_param->shift_data, + chl_quan_data); + + tl_shape_t input_shape = {static_cast(in), static_cast(ic), + static_cast(ih), static_cast(iw)}; + tl_shape_t filter_shape = {1, static_cast(oc), + static_cast(kh) * static_cast(kw), + static_cast(ic)}; + tl_shape_t output_shape = {static_cast(in), static_cast(oc), + static_cast(oh), static_cast(ow)}; + tl_shape_t cal_shape = {1, static_cast(oc), 1, + static_cast(chl_quan_per_lane_data_size)}; + + bmk1822_tensor_lmem_t *tl_input = + bmk1822_lmem_alloc_tensor(bk_ctx, input_shape, FMT_I8, /*eu_aign=*/1); + + bmk1822_tensor_lmem_t *tl_filter = + bmk1822_lmem_alloc_tensor(bk_ctx, filter_shape, FMT_I8, /*eu_align=*/0); + + bmk1822_tensor_lmem_t *tl_output = + bmk1822_lmem_alloc_tensor(bk_ctx, output_shape, FMT_I8, /*eu_align=*/1); + + // Shape for TDMA load + bmk1822_tensor_lmem_t *tl_cal_data = + bmk1822_lmem_alloc_tensor(bk_ctx, cal_shape, FMT_U8, /*eu_align*/ 0); + + if (!tl_input || !tl_filter || !tl_output || !tl_cal_data) { + if (tl_input == nullptr) { + printf(" fail to alloc tl_input (%d, %d, %d, %d)\n", input_shape.n, + input_shape.c, input_shape.h, input_shape.w); + } + if (tl_filter == nullptr) { + printf(" fail to alloc tl_filter (%d, %d, %d, %d)\n", filter_shape.n, + filter_shape.c, filter_shape.h, filter_shape.w); + } + if (tl_output == nullptr) { + printf(" fail to alloc tl_output (%d, %d, %d, %d)\n", output_shape.n, + output_shape.c, output_shape.h, output_shape.w); + } + if (tl_cal_data == nullptr) { + printf(" fail to alloc tl_cal_data (%d, %d ,%d, %d)\n", cal_shape.n, + cal_shape.c, cal_shape.h, cal_shape.w); + } + + // Reverse order + if (tl_cal_data) + bmk1822_lmem_free_tensor(bk_ctx, tl_cal_data); + if (tl_output) + bmk1822_lmem_free_tensor(bk_ctx, tl_output); + if (tl_filter) + bmk1822_lmem_free_tensor(bk_ctx, tl_filter); + if (tl_input) + bmk1822_lmem_free_tensor(bk_ctx, tl_input); + + return -1; + } + + put_tensor_g2l(ctx, bk_ctx, tl_cal_data, chl_quan_data); + put_tensor_g2l(ctx, bk_ctx, tl_input, reinterpret_cast(input_data)); + put_tensor_g2l(ctx, bk_ctx, tl_filter, reinterpret_cast(kernel_data)); + + { + // Reshape per channel quantization data for TIU + tl_cal_data->shape = {1, static_cast(oc), 1, 1}; + tl_cal_data->stride = bmk1822_tensor_lmem_default_stride( + bk_ctx, tl_cal_data->shape, FMT_I8, /*eu_align=*/0); + + // Reshape weight for TIU + tl_filter->shape = {static_cast(ic), static_cast(oc), + static_cast(kh), static_cast(kw)}; + + bmk1822_tiu_convolution_qdm_param_t param; + memset(¶m, 0, sizeof(param)); + param.ofmap = tl_output; + param.ifmap = tl_input; + param.weight = tl_filter; + param.chl_quan_param = tl_cal_data; + param.ins_h = ins_h; + param.ins_last_h = ins_last_h; + param.ins_w = ins_w; + param.ins_last_w = ins_last_w; + param.stride_h = stride_h; + param.stride_w = stride_w; + param.dilation_h = dh; + param.dilation_w = dw; + param.pad_top = pad_top; + param.pad_bottom = pad_bot; + param.pad_left = pad_left; + param.pad_right = pad_right; + param.has_bias = has_bias; + param.relu_enable = relu_enable; + +#ifdef ENABLE_DEBUG_MSG + printf(" tiu_conv_qdm:\n"); + printf(" ifmap shape (%d, %d, %d, %d)\n", param.ifmap->shape.n, + param.ifmap->shape.c, param.ifmap->shape.h, param.ifmap->shape.w); + printf(" weight shape (%d, %d, %d, %d)\n", 
param.weight->shape.n, + param.weight->shape.c, param.weight->shape.h, param.weight->shape.w); + printf(" ofmap shape (%d, %d, %d, %d)\n", param.ofmap->shape.n, + param.ofmap->shape.c, param.ofmap->shape.h, param.ofmap->shape.w); +#endif + + bmk1822_tiu_convolution_qdm(bk_ctx, ¶m); + } + + test_submit(ctx); + +#ifdef ENABLE_DEBUG_MSG + printf(" compare result:\n"); +#endif + s8 *conv_output_data = + reinterpret_cast(get_tensor_l2g(ctx, bk_ctx, tl_output)); + for (int i = 0; i < in; ++i) { + for (int j = 0; j < oc; ++j) { + for (int k = 0; k < oh; ++k) { + for (int l = 0; l < ow; ++l) { + int offset = i * (oc * oh * ow) + j * (oh * ow) + k * ow + l; + if (conv_output_data[offset] != output_data[offset]) { + printf(" [ni=%d][oci=%d][ohi=%d][owi=%d] output %d(tiu) != " + "%d(ref)\n", + i, j, k, l, conv_output_data[offset], output_data[offset]); + ret = -1; + break; + } + } + } + } + } + + if (ret) { + //dump_test_param(p_param, /*dump_content=*/true); + dump2_test_param(p_param); + } + + // Reverse order + bmk1822_lmem_free_tensor(bk_ctx, tl_cal_data); + bmk1822_lmem_free_tensor(bk_ctx, tl_output); + bmk1822_lmem_free_tensor(bk_ctx, tl_filter); + bmk1822_lmem_free_tensor(bk_ctx, tl_input); + + free(conv_output_data); + + free(input_data); + free(kernel_data); + free(output_data); + free(bias_data); + free(multiplier_data); + free(shift_data); + free(chl_quan_data); + +#ifdef ENABLE_DEBUG_MSG + printf(" <= run_compare_conv\n"); +#endif + + return ret; +} + +int random_test(bmctx_t *ctx, bmk_ctx_t *bk_ctx) +{ + int ret = 0; + +#ifndef ENABLE_FULL_REGRESSION +#ifndef ENABLE_TV_GEN_PATTERN + // n: 12b, c: 12b, h: 12b(4095-32), wb: 12b(4095-32) + int batch_range[] = {1, 1, 2, 4095 - 32}; + int input_height_range[] = {1, 512, 1024, 4095 - 32}; + int input_width_range[] = {1, 512, 1024, 4095 - 32}; + int input_depth_range[] = {1, 16, 32, 64, 102, 4095}; + int output_depth_range[] = {1, 16, 32, 64, 128, 4095}; + + // h: 12b, w: 12b + // stride_h: 4b, strid_w: 4b + int kernel_height_range[] = {1, 11, 2048, 4095}; + int kernel_width_range[] = {1, 11, 2048, 4095}; + int kernel_stride_height_range[] = {1, 5, 16, 31}; + int kernel_stride_width_range[] = {1, 5, 16, 31}; +#else + // TV_GEN pattern + // Random Test, total 19683, skipped 118066, executed 32, failed 0, ret 0 + + // n: 12b, c: 12b, h: 12b(4095-32), wb: 12b(4095-32) + int batch_range[] = {1, 1, 32}; + int input_height_range[] = {1, 512, 4095 - 32}; + int input_width_range[] = {1, 512, 4095 - 32}; + int input_depth_range[] = {1, 16, 32, 64, 102, 4095}; + int output_depth_range[] = {1, 16, 32, 64, 1024, 4095}; + + // h: 12b, w: 12b + // stride_h: 4b, strid_w: 4b + int kernel_height_range[] = {1, 11, 2048, 4095}; + int kernel_width_range[] = {1, 11, 2048, 4095}; + int kernel_stride_height_range[] = {1, 5, 16, 31}; + int kernel_stride_width_range[] = {1, 5, 16, 31}; + +#endif //ENABLE_TV_GEN_PATTERN +#else +#if 0 + // Input with same range size + int batch_range[] = {1}; + int input_height_range[] = {1}; + int input_width_range[] = {1}; + int input_depth_range[] = {1}; + const int input_range_size = sizeof(input_height_range)/sizeof(input_height_range[0]); + + // Kernel with same range size + int kernel_height_range[] = {1}; + int kernel_width_range[] = {1}; + int kernel_stride_height_range[] = {1}; + int kernel_stride_width_range[] = {1}; + int output_depth_range[] = {1}; + const int kernel_range_size = sizeof(kernel_height_range)/sizeof(kernel_height_range[0]); +#else + // 10/21/2019 overnight + // total 20480000, skipped 20301713, executed 
178287, failed 0 + + // n: 12b, c: 12b, h: 12b(4095-32), wb: 12b(4095-32) + int batch_range[] = {1, 2, 4, 8, 16, 32, 64, 4095 - 32}; + int input_height_range[] = {1, 3, 11, 128, 512, 1024, 2048, 4095 - 32}; + int input_width_range[] = {1, 3, 11, 128, 512, 1024, 2048, 4095 - 32}; + int input_depth_range[] = {1, 3, 11, 32, 64, 1024, 2048, 4095}; + int output_depth_range[] = {1, 3, 11, 32, 64, 1024, 2048, 4095}; + + // h: 12b, w: 12b + // stride_h: 4b, strid_w: 4b + int kernel_height_range[] = {1, 3, 11, 511, 4095}; + int kernel_width_range[] = {1, 3, 11, 511, 4095}; + int kernel_stride_height_range[] = {1, 3, 5, 7, 15, 16, 31}; + int kernel_stride_width_range[] = {1, 3, 5, 7, 15, 16, 31}; +#endif +#endif /* ENABLE_FULL_REGRESSION */ + + const int batch_range_size = sizeof(batch_range) / sizeof(batch_range[0]); + const int input_height_range_size = + sizeof(input_height_range) / sizeof(input_height_range[0]); + const int input_width_range_size = + sizeof(input_width_range) / sizeof(input_width_range[0]); + const int input_depth_range_size = + sizeof(input_depth_range) / sizeof(input_depth_range[0]); + const int output_depth_range_size = + sizeof(output_depth_range) / sizeof(output_depth_range[0]); + + const int kernel_height_range_size = + sizeof(kernel_height_range) / sizeof(kernel_height_range[0]); + const int kernel_width_range_size = + sizeof(kernel_width_range) / sizeof(kernel_width_range[0]); + const int kernel_stride_height_range_size = + sizeof(kernel_stride_height_range) / + sizeof(kernel_stride_height_range[0]); + const int kernel_stride_width_range_size = + sizeof(kernel_stride_width_range) / sizeof(kernel_stride_width_range[0]); + + int random_seed = clock(); + srand(random_seed); + + const int retry_test_count = 100; + + bool stop_at_first_error = true; + + int total_tests = batch_range_size * input_depth_range_size * + input_height_range_size * input_width_range_size * + output_depth_range_size * kernel_height_range_size * + kernel_width_range_size * kernel_stride_height_range_size * + kernel_stride_width_range_size; + int skipped_tests = 0; + int executed_tests = 0; + int failed_tests = 0; + int current_test = 0; + + printf("Random Test =>\n"); + for (int m = 0; m < retry_test_count; ++m) { + for (int i = 0; i < batch_range_size; ++i) { + // randomly chosen from [range[i] : range[i+1]] + int batch = choose_from_range(batch_range, batch_range_size, i); + + for (int j = 0; j < input_height_range_size; ++j) { + int input_height = + choose_from_range(input_height_range, input_height_range_size, j); + + for (int k = 0; k < input_width_range_size; ++k) { + int input_width = + choose_from_range(input_width_range, input_width_range_size, k); + + for (int l = 0; l < input_depth_range_size; ++l) { + int input_depth = + choose_from_range(input_depth_range, input_depth_range_size, l); + + for (int m = 0; m < kernel_height_range_size; ++m) { + int kernel_height = choose_from_range( + kernel_height_range, kernel_height_range_size, m); + + for (int n = 0; n < kernel_width_range_size; ++n) { + int kernel_width = choose_from_range( + kernel_width_range, kernel_width_range_size, n); + + for (int x = 0; x < kernel_stride_height_range_size; ++x) { + int kernel_stride_height = + choose_from_range(kernel_stride_height_range, + kernel_stride_height_range_size, x); + + for (int y = 0; y < kernel_stride_width_range_size; ++y) { + int kernel_stride_width = + choose_from_range(kernel_stride_width_range, + kernel_stride_width_range_size, y); + + for (int z = 0; z < output_depth_range_size; ++z) { + int 
output_depth = choose_from_range( + output_depth_range, output_depth_range_size, z); + + current_test++; + + int has_bias = rand() % 2; + int dh = 1; + int dw = 1; + int ins_h = 0; + int ins_h_last = 0; + int ins_w = 0; + int ins_w_last = 0; + int pad_top = 0; + int pad_bot = 0; + int pad_left = 0; + int pad_right = 0; + + int ih_ext = calc_dilute_hw(input_height, ins_h, + ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw( + input_width, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = + calc_dilute_hw(kernel_height, dh - 1, 0, 0, 0); + int kw_ext = + calc_dilute_hw(kernel_width, dw - 1, 0, 0, 0); + + int oh = + calc_output_hw(ih_ext, kh_ext, kernel_stride_height); + int ow = + calc_output_hw(iw_ext, kw_ext, kernel_stride_width); + + conv_test_param_t test_param; + memset(&test_param, 0, sizeof(test_param)); + test_param.input_n = batch; + test_param.input_c = input_depth; + test_param.input_h = input_height; + test_param.input_w = input_width; + test_param.kh = kernel_height; + test_param.kw = kernel_width; + test_param.dh = dh; + test_param.dw = dw; + test_param.pad_top = pad_top; + test_param.pad_bot = pad_bot; + test_param.pad_left = pad_left; + test_param.pad_right = pad_right; + test_param.ins_h = ins_h; + test_param.ins_h_last = ins_h_last; + test_param.ins_w = ins_w; + test_param.ins_w_last = ins_w_last; + test_param.stride_h = kernel_stride_height; + test_param.stride_w = kernel_stride_width; + test_param.output_c = output_depth; + test_param.output_h = oh; + test_param.output_w = ow; + test_param.has_bias = has_bias; + test_param.retry_cnt = 5; + + bool is_valid_param = + check_valid_test_param(bk_ctx, &test_param); + if (is_valid_param == false) { + skipped_tests++; + continue; + } + + int ret2 = run_compare_conv(ctx, bk_ctx, &test_param); + failed_tests = ret2 ? 
failed_tests + 1 : failed_tests; + ret |= ret2; + executed_tests++; + +#ifdef ENABLE_DEBUG_MSG + printf( + " [%d/%d] random test: input shape(%d, %d, %d, %d)", + current_test, total_tests, batch, input_depth, + input_height, input_width); + printf(", kernel shape (%d, %d, %d, %d), result %d\n", + output_depth, input_depth, kernel_height, + kernel_width, ret2); +#endif + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + + if (executed_tests >= MIN_EXEC_TESTS) { + break; + } + } + + printf( + "<= Random Test, total %d, skipped %d, executed %d, failed %d, ret %d\n", + total_tests, skipped_tests, executed_tests, failed_tests, ret); + + return ret; +} + +int main() +{ + int ret = 0; + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + ret |= simple_test(&ctx, bk_ctx); + ret |= random_test(&ctx, bk_ctx); + + test_exit(&ctx); + + return ret; +} diff --git a/cviruntime/test/1822/test_1822_conv_wtiling.cpp b/cviruntime/test/1822/test_1822_conv_wtiling.cpp new file mode 100644 index 000000000..6d75136a3 --- /dev/null +++ b/cviruntime/test/1822/test_1822_conv_wtiling.cpp @@ -0,0 +1,884 @@ +#include "1822_test_util.h" +#include +typedef struct { + int random_seed; + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int stride_w; + int output_c; + int using_bias; + int bReLU_EN; + int r_shift_m; + int opd0_sign; + int opd1_sign; + int opd2_sign; +} conv_param_t; + +typedef struct { + u32 n; + u32 c; + u32 h; + u32 w; +}slice_t; + +static int index_get(int h, int w1, int w2) +{ + return h * w1 + w2; +} + +static int matrix_dot_mult( + s8 *A, s8 *B, int dim_n, int dim_m, + int opd0_sign) +{ + int sum = 0; + for (int i=0; iinput_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + int r_shift_bits = p_param->r_shift_m; + int do_relu = p_param->bReLU_EN; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = 
calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + s8 *i_fmap_pad_ker = (s8 *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return BM_ERR_FAILURE; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = BM_SUCCESS; + + s8 *i_fmap_pad = NULL; + s8 *kernel_after = NULL; + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c]; //bias+c ; + } + } + } + + if (do_relu) + relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + + // ofmap is s8, signed + ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow, + &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1, + /*sign_unsign=*/1); + + if (ret != BM_SUCCESS) + goto error_release; + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + + neuron_dump ( + "test_w_tiling_code:conv_ref:pure result + bias", + (u32)in, + (u32)oc, + (u32)oh, + (u32)ow, + (s32 *)result); + + neuron_dump ( + "test_w_tiling_code:conv_ref:final result", + (u32)in, + (u32)oc, + (u32)oh, + (u32)ow, + (s8 *)ofmap); + +error_release: + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static u8 * transform_weight(const tl_shape_t *s, u8 before[]) +{ + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + u32 size = ic * oc * kh * kw; + u8 *after = (u8 *)malloc(sizeof(u8) * size); + + /* + * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) + */ + for (u32 oci = 0; oci < oc; oci++) { + for (u32 ici = 0; ici < ic; ici++) { + for (u32 khi = 0; khi < kh; khi++) { + for (u32 kwi = 0; kwi < kw; kwi++) { + u32 src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + u32 dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + u8 *data) +{ + const tl_shape_t *s = &tl->shape; + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + u8 *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. bm_memcpy_s2d regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. 
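+   *
+   * As a worked example of the (oc, ic, kh, kw) -> (1, oc, kh * kw, ic)
+   * reordering done by transform_weight() above: element (oci, ici, khi, kwi)
+   * moves to flat index oci * (kh * kw * ic) + (khi * kw + kwi) * ic + ici,
+   * so with oc = 2, ic = 3, kh = kw = 1, the weight at (oci = 1, ici = 2)
+   * lands at 1 * 3 + 0 + 2 = 5.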
+ */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //bmmem_device_t ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = bm_memcpy_s2d(*ctx, ab_dev_mem, transformed_data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, transformed_data); + assert(ret == BM_SUCCESS); + + tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_I8; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1822_tensor_tgmem_default_stride(tdma_tg.shape, tdma_tg.fmt); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1822_tensor_lmem_default_stride(bk_ctx, tdma_shape, FMT_I8, 0); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1822_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + neuron_dump ( + "test_ic_tiling_conv: kernel", + tdma_tg.shape.n, + tdma_tg.shape.c, + tdma_tg.shape.h, + tdma_tg.shape.w, + (s8 *)transformed_data); + + bmmem_device_free(*ctx, dev_mem); + free(transformed_data); +} + +static s8 * transform_bias(int oc, s16 before[]) +{ + s8 *after = (s8 *)malloc(sizeof(s8) * 2 * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = before[i] & 0xff; + after[i + oc] = (before[i] >> 8) & 0xff; + } + return after; +} + +static void put_conv_bias( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + s16 *data) +{ + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2, oc, 1, 1); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + s8 *transformed_data = transform_bias(oc, data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (u8 *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1822_tensor_tgmem_default_stride(tg.shape, tg.fmt); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, dev_mem); + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int 
oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static s8 * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + + s8 *buf = (s8 *)malloc(sizeof(s8) * size); + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static s8 * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + s8 *buf = (s8 *)malloc(sizeof(s8) * size); + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static s16 * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + s16 *bias = (s16 *)malloc(sizeof(s16) * oc); + for (int i = 0; i < oc; i++) + bias[i] = rand() % 65536 - 32768; + + return bias; +} + +static tl_t * conv_ifmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd0_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return bmk1822_lmem_alloc_tensor(ctx, s, fmt, 1); +} + +static tl_t * conv_weight_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd1_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return bmk1822_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static tl_t * conv_ofmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + tl_shape_t s; + s.n = p->input_n; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return bmk1822_lmem_alloc_ps32_tensor(ctx, s, FMT_I8, 1); +} + +static tl_t * conv_bias_tensor( + bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd2_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return bmk1822_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const bmk1822_tiu_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1822_tiu_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + dst->ifmap = conv_ifmap_tensor(ctx, p); + dst->weight = conv_weight_tensor(ctx, p); + dst->ofmap = conv_ofmap_tensor(ctx, p); + dst->bias = NULL; + dst->ps32_mode = 0; + if (p->using_bias) + dst->bias = conv_bias_tensor(ctx, p); + + dst->w_is_const = 0; +} + +static void free_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1822_tiu_convolution_param_t *r, + const conv_param_t *p) +{ + if (p->using_bias && r->bias) + free_tl(ctx, r->bias); + if 
(r->ofmap) + free_tl(ctx, r->ofmap); + if (r->weight) + free_tl(ctx, r->weight); + if (r->ifmap) + free_tl(ctx, r->ifmap); +} + +static void init_conv_param(conv_param_t &p) +{ + printf("init_conv_param\n"); + p.random_seed = clock(); + srand(p.random_seed); + +retry: + p.input_n = 1; + p.input_c = 1; + p.kh = 3; + p.kw = 3; + p.input_h = 4 + p.kh; + p.input_w = 4 + p.kw ; + p.output_c = 1; + p.stride_h = 1; + p.stride_w = 1; + p.ins_h = 0; + p.ins_w = 0; + p.ins_h_last = 0; + p.ins_w_last = 0; + p.dh = 1; + p.dw = 1; + + int kh_ext = conv_kh_ext(&p); + int kw_ext = conv_kw_ext(&p); + p.pad_top = 0; + p.pad_bot = 0; + p.pad_left = 0; + p.pad_right = 0; + + if (!conv_param_is_ok(&p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p.using_bias = 1; + p.r_shift_m = rand() % 8; + p.bReLU_EN = rand() % 2; + p.opd0_sign = rand() % 2; + + p.opd1_sign = 1; + p.opd2_sign = 1; + + assert(p.opd1_sign == 1 && p.opd2_sign == 1); + + int ih_ext = conv_ih_ext(&p); + int iw_ext = conv_iw_ext(&p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); +} + +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", "p->input_w = ", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} + +/* Calculate the right shift value, m + * Steps: + * 1. Get the abs() of each weight; + * 2. Summary all the abs() in one kernel; + * 3. Get Log2 of each sum; + * 4. Downward rounding; + * After every r_shift value got, sort and find the middle one. + */ + +static int calc_rshift_m(const conv_param_t *p, s8* weight) +{ + int kernel_cnt = p->output_c * p->input_c; + int kernel_size = p->kh * p->kw; + int *kernel_shifts = (int *)malloc(sizeof(int) * kernel_cnt); + + // Tscan does not recognize C++ zero-initialized. 
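+  //
+  // A worked example of the steps above (illustrative numbers, not from a
+  // real kernel): three kernels whose |weight| sums are {12, 40, 9} get
+  // per-kernel shifts floor(log2(sum)) = {3, 5, 3}; the median of those,
+  // 3, is what calc_rshift_m() returns as r_shift_m.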
+ memset(kernel_shifts, 0, sizeof(int) * kernel_cnt); + + // Part 1: + // Get right shift value for each kernel + int sum = 0; + for (int i = 0; i < kernel_cnt; i++) { + // Step 1 & 2: Get the sum of abs() + for (int j = 0; j < kernel_size; j++) { + sum += (int)(*weight < 0 ? -(*weight) : (*weight)); + weight++; + } + // Step 3 & 4: log2 and downward rounding + sum >>= 1; + while (sum) { + sum >>= 1; + kernel_shifts[i]++; + } + } + + // Part 2: + // Find the middle of all the values + int tag[32] = {0}; + for (int cnt = 0; cnt < kernel_cnt; cnt++) { + tag[kernel_shifts[cnt]]++; + } + + int rshift_m = 0; + int mid = 0; + do { + mid += tag[rshift_m++]; + } while(mid < (kernel_cnt - 1) >> 1); + + free(kernel_shifts); + + return rshift_m - 1; +} + +static int test_w_tiling_conv( + conv_param_t &p_param, bmctx_t ctx, bmk_ctx_t *bk_ctx) +{ + printf("test w tiled conv\n"); + s8 *input = alloc_input(&p_param); + s8 *weight = alloc_weight(&p_param); + s16 *bias = alloc_bias(&p_param); + p_param.r_shift_m = calc_rshift_m(&p_param, weight); + s8 *output_ref = (s8 *)malloc(sizeof(s8) * conv_output_size(&p_param)); + if (!output_ref) + return BM_ERR_FAILURE; + + bmerr_t ret = conv_ref(&p_param, input, weight, bias, output_ref); + assert(ret == BM_SUCCESS); + + bmk1822_tiu_convolution_param_t conv_param; + make_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + /*We tile the finest granule to test w tiling*/ + u32 ow_step = 1; + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + if (tl_alloc_success) { + put_conv_weight(&ctx, bk_ctx, conv_param.weight, (u8 *)weight); + if (p_param.using_bias) { + put_conv_bias(&ctx, bk_ctx, conv_param.bias, bias); + neuron_dump ( + "test_w_tiling_conv: bias", + 1, + conv_param.bias->shape.c, + conv_param.bias->shape.h, + conv_param.bias->shape.w, + (s16 *)bias); + } + + tl_t tl_ifmap = *conv_param.ifmap; + tl_t tl_ofmap = *conv_param.ofmap; + + tg_shape_t s; + s.n = tl_ifmap.shape.n; + s.c = tl_ifmap.shape.c; + s.h = tl_ifmap.shape.h; + s.w = tl_ifmap.shape.w; + tg_t *ts_ifmap = alloc_tg_gmem(&ctx, s, FMT_I8); + put_tg_gmem(&ctx, ts_ifmap, (u8 *)input); + + s.n = tl_ofmap.shape.n; + s.c = tl_ofmap.shape.c; + s.h = tl_ofmap.shape.h; + s.w = tl_ofmap.shape.w; + tg_t *ts_ofmap = alloc_tg_gmem(&ctx, s, FMT_I8); + + neuron_dump ( + "test_w_tiling_conv: input", + conv_param.ifmap->shape.n, + conv_param.ifmap->shape.c, + conv_param.ifmap->shape.h, + conv_param.ifmap->shape.w, + (s8 *)input); + + for (u32 ow_pos = 0; ow_pos < tl_ofmap.shape.w; ow_pos += ow_step) { + u32 cur_ow = math_min(tl_ofmap.shape.w - ow_pos, ow_step); + + tg_t ts_cur_ofmap; + ts_cur_ofmap.shape.n = ts_ofmap->shape.n; + ts_cur_ofmap.shape.c = ts_ofmap->shape.c; + ts_cur_ofmap.shape.h = ts_ofmap->shape.h; + ts_cur_ofmap.shape.w = cur_ow; + ts_cur_ofmap.stride = ts_ofmap->stride; + ts_cur_ofmap.start_address = ts_ofmap->start_address + ow_pos; + ts_cur_ofmap.fmt = ts_ofmap->fmt; + ts_cur_ofmap.base_reg_index = ts_ofmap->base_reg_index; + + tl_t tl_cur_ofmap; + tl_cur_ofmap.shape.n = tl_ofmap.shape.n; + tl_cur_ofmap.shape.c = tl_ofmap.shape.c; + tl_cur_ofmap.shape.h = tl_ofmap.shape.h; + tl_cur_ofmap.shape.w = cur_ow; + tl_cur_ofmap.stride = bmk1822_tensor_lmem_default_stride(bk_ctx, tl_cur_ofmap.shape, FMT_I8, 1); + tl_cur_ofmap.fmt = tl_ofmap.fmt; + tl_cur_ofmap.start_address = tl_ofmap.start_address; + + tg_t ts_cur_ifmap; + ts_cur_ifmap.shape.n = ts_ifmap->shape.n; + ts_cur_ifmap.shape.c = ts_ifmap->shape.c; + ts_cur_ifmap.shape.h = ts_ifmap->shape.h; + ts_cur_ifmap.shape.w = 
(cur_ow - 1) * conv_param.stride_w + conv_kw_ext(&p_param); + ts_cur_ifmap.stride = ts_ifmap->stride; + ts_cur_ifmap.start_address = ts_ifmap->start_address + ow_pos; + ts_cur_ifmap.fmt = ts_ifmap->fmt; + ts_cur_ifmap.base_reg_index = ts_ifmap->base_reg_index; + + tl_t tl_cur_ifmap; + tl_cur_ifmap.shape.n = tl_ifmap.shape.n; + tl_cur_ifmap.shape.c = tl_ifmap.shape.c; + tl_cur_ifmap.shape.h = tl_ifmap.shape.h; + tl_cur_ifmap.shape.w = (cur_ow - 1) * conv_param.stride_w + conv_kw_ext(&p_param); + tl_cur_ifmap.stride = bmk1822_tensor_lmem_default_stride(bk_ctx, tl_cur_ifmap.shape, FMT_I8, 1); + tl_cur_ifmap.fmt = tl_ifmap.fmt; + tl_cur_ifmap.start_address = tl_ifmap.start_address; + + { + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &ts_cur_ifmap; + p.dst = &tl_cur_ifmap; + bmk1822_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(&ctx); + } + { + bmk1822_tiu_convolution_param_t p; + memset(&p, 0, sizeof(p)); + p = conv_param; + p.ifmap = &tl_cur_ifmap; + p.ofmap = &tl_cur_ofmap; + if(p_param.ins_w_last == 1 && (ow_pos + ow_step) >= tl_ofmap.shape.w) + p.ins_last_w = 1; + else + p.ins_last_w = 0; + + bmk1822_tiu_convolution(bk_ctx, &p); + } + { + bmk1822_tdma_l2tg_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tl_cur_ofmap; + p.dst = &ts_cur_ofmap; + bmk1822_tdma_l2g_tensor_copy(bk_ctx, &p); + test_submit(&ctx); + } + } + u8 *output = get_tg_gmem(&ctx, ts_ofmap); + free_tg_gmem(&ctx, ts_ifmap); + free_tg_gmem(&ctx, ts_ofmap); + + neuron_dump ( + "test_w_tiling_conv: output", + conv_param.ofmap->shape.n, + conv_param.ofmap->shape.c, + conv_param.ofmap->shape.h, + conv_param.ofmap->shape.w, + (s8 *)output); + + int has_error = array_cmp_int8( + "Comparing results ...\n", + output_ref, (s8 *)output, conv_output_size(&p_param)); + + free(output); + + if (has_error) { + print_conv_param(&p_param); + printf("Comparison FAILED\n"); + exit(-1); + } + } + + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + free(input); + free(weight); + free(output_ref); + free(bias); + return tl_alloc_success ? 
1 : 0; +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + int test_finished_num = 0; + for (int i = 0; i < 1; i++) { + printf("random_test_conv iteration: %d\n", i); + conv_param_t test_conv_param; + init_conv_param(test_conv_param); + test_finished_num += test_w_tiling_conv(test_conv_param, ctx, bk_ctx); + if (!test_conv_param.using_bias) + test_conv_param.using_bias = 1; + if (test_conv_param.output_c <= 9) + test_conv_param.output_c += 3; + test_finished_num += test_w_tiling_conv(test_conv_param, ctx, bk_ctx); + } + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_depthwise.cpp b/cviruntime/test/1822/test_1822_depthwise.cpp new file mode 100644 index 000000000..c4a7de431 --- /dev/null +++ b/cviruntime/test/1822/test_1822_depthwise.cpp @@ -0,0 +1,333 @@ +#include "1822_test_util.h" + +#define INVALIDE_STRIDE (-1) +typedef bmk1822_tiu_depthwise_convolution_param_t param_t; + +static void print_pooling_param(param_t *p) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int kh = p->weight->shape.h; + int kw = p->weight->shape.w; + + printf(" Pooling parameters:\n"); + printf(" ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw); + printf(" opd0_sign = %d\n", p->ifmap->fmt == FMT_I8); + printf(" weight = (%d, %d)\n", kh, kw); + printf(" padding = (%d, %d, %d, %d)\n", + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right); + printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w); + printf(" ins0 = (%d, %d, %d, %d)\n", + p->ins_h, p->ins_last_h, p->ins_w, p->ins_last_w); + printf(" rshift_bits = %d\n", p->rshift_bits); + printf(" relu_enable = %d\n", p->relu_enable); + printf(" res0_sign = %d\n", p->ofmap->fmt == FMT_I8); +} + +static s8 *alloc_input(param_t *p) +{ + u64 size = tl_shape_size(&p->ifmap->shape); + s8 *data = (s8 *)xmalloc(size); + if (!data) + return NULL; + + for (u64 i = 0; i < size; i++) + data[i] = rand() % 256 - 128; + return data; +} + +static s8 *alloc_weight(param_t *p) +{ + int size = tl_shape_size(&p->weight->shape); + s8 *data = (s8 *)xmalloc(size); + if (!data) + return NULL; + + for (int i = 0; i < size; i++) + data[i] = rand() % 256 - 128; + return data; +} + +static s16 *alloc_bias(param_t *p) +{ + int c = p->bias->shape.c; + s16 *bias = (s16 *)malloc(sizeof(s16) * c); + if (!bias) + return NULL; + + for (int i = 0; i < c; i++) + bias[i] = rand() % 65536 - 32768; + return bias; +} + +static s8 *alloc_output(param_t *p) +{ + u64 size = tl_shape_size(&p->ofmap->shape); + return (s8 *)xmalloc(size); +} + +static inline void relu8(s8 *buf, u64 size) +{ + for (u64 i = 0; i < size; i++) + if (buf[i] < 0) + buf[i] = 0; +} + + +static void compare_results( + param_t *p, + s8 input[], + s8 weight[], + s16 bias[], + s8 output[]) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int kh = p->weight->shape.h; + int kw = p->weight->shape.w; + int opd0_sign = (p->ifmap->fmt == FMT_I8); + int res0_sign = (p->ofmap->fmt == FMT_I8); + s8 *output_ref = alloc_output(p); + bmerr_t ret = native_pooling_ave_int8( + input, weight, p->bias ? 
bias : NULL, output_ref,
+      in, ic, ih, iw, kh, kw,
+      p->pad_top, p->pad_bottom, p->pad_left, p->pad_right,
+      p->stride_h, p->stride_w,
+      p->ins_h, p->ins_w, p->ins_last_h, p->ins_last_w,
+      opd0_sign, res0_sign, p->rshift_bits, 0);
+  assert(ret == BM_SUCCESS);
+
+  if (p->relu_enable)
+    relu8(output_ref, tl_shape_size(&p->ofmap->shape));
+
+  int cmp_res = array_cmp_int8(
+      "Comparing results ...\n", output_ref, output,
+      tl_shape_size(&p->ofmap->shape));
+
+  if (cmp_res != 0) {
+    printf("Comparison FAILED!!!\n");
+    print_pooling_param(p);
+    exit(-1);
+  }
+
+  free(output_ref);
+}
+
+static int pooling_ih_ext(param_t *p, int ih)
+{
+  int ins = p->ins_h;
+  int ins_last = p->ins_last_h;
+  int pad = p->pad_top + p->pad_bottom;
+  return (ih - 1) * (ins + 1) + ins_last + 1 + pad;
+}
+
+static int pooling_iw_ext(param_t *p, int iw)
+{
+  int ins = p->ins_w;
+  int ins_last = p->ins_last_w;
+  int pad = p->pad_left + p->pad_right;
+  return (iw - 1) * (ins + 1) + ins_last + 1 + pad;
+}
+
+static int pooling_oh(param_t *p, int ih, int kh)
+{
+  int ih_ext = pooling_ih_ext(p, ih);
+  return (ih_ext - kh) / p->stride_h + 1;
+}
+
+static int pooling_ow(param_t *p, int iw, int kw)
+{
+  int iw_ext = pooling_iw_ext(p, iw);
+  return (iw_ext - kw) / p->stride_w + 1;
+}
+
+static void free_depthwise_param(
+    bmk_ctx_t *ctx,
+    param_t *p)
+{
+  if (p->bias)
+    free_tl(ctx, p->bias);
+
+  if (p->weight)
+    free_tl(ctx, p->weight);
+
+  if (p->ifmap)
+    free_tl(ctx, p->ifmap);
+
+  if (p->ofmap)
+    free_tl(ctx, p->ofmap);
+}
+
+static param_t random_depthwise_param(bmk_ctx_t *ctx, int stride_w, int stride_h)
+{
+  srand(clock());
+  param_t p;
+
+  memset(&p, 0, sizeof(p));
+
+retry:
+  int using_bias = rand() % 2;
+  int n = rand() % 5 + 1;
+  int c = rand() % (3 * BM1822_HW_NPU_NUM) + 1;
+  int ih = rand() % 30 + 3 + (INVALIDE_STRIDE == stride_h ? 0 : stride_h);
+  int iw = rand() % 30 + 6 + (INVALIDE_STRIDE == stride_w ? 0 : stride_w);
+  int kh = rand() % 7 + 1;
+  int kw = rand() % 7 + 1;
+  int opd0_sign = rand() % 2;
+
+  p.ins_h = rand() % kh;
+  p.ins_w = rand() % kw;
+  p.ins_last_h = rand() % kh;
+  p.ins_last_w = rand() % kw;
+  p.stride_h = INVALIDE_STRIDE == stride_h ? rand() % (kh) + 1 : stride_h;
+  p.stride_w = INVALIDE_STRIDE == stride_w ? rand() % (kw) + 1 : stride_w;
+  p.pad_top = rand() % kh;
+  p.pad_bottom = rand() % kh;
+  p.pad_left = rand() % kw;
+  p.pad_right = rand() % kw;
+  p.rshift_bits = rand() % 32;
+
+  int oh = pooling_oh(&p, ih, kh);
+  int ow = pooling_ow(&p, iw, kw);
+  tl_shape_t ofmap_shape;
+  ofmap_shape.n = n;
+  ofmap_shape.c = c;
+  ofmap_shape.h = oh;
+  ofmap_shape.w = ow;
+  tl_shape_t ifmap_shape;
+  ifmap_shape.n = n;
+  ifmap_shape.c = c;
+  ifmap_shape.h = ih;
+  ifmap_shape.w = iw;
+  tl_shape_t weight_shape;
+  weight_shape.n = 1;
+  weight_shape.c = c;
+  weight_shape.h = kh;
+  weight_shape.w = kw;
+  tl_shape_t bias_shape;
+  bias_shape.n = 2;
+  bias_shape.c = c;
+  bias_shape.h = 1;
+  bias_shape.w = 1;
+  p.relu_enable = rand() % 2;
+  /* the reference implementation does not support dilation != 1 */
+  p.dilation_h = 1;
+  p.dilation_w = 1;
+  fmt_t ifmt = opd0_sign ?
FMT_I8: FMT_U8; + + p.ofmap = bmk1822_lmem_alloc_tensor(ctx, ofmap_shape, FMT_I8, 1); + p.ifmap = bmk1822_lmem_alloc_tensor(ctx, ifmap_shape, ifmt, 1); + p.weight = bmk1822_lmem_alloc_tensor(ctx, weight_shape, FMT_I8, 1); + p.bias = NULL; + if (using_bias) + p.bias = bmk1822_lmem_alloc_tensor(ctx, bias_shape, FMT_I8, 0); + + if ((kh > pooling_ih_ext(&p, ih)) + || (kw > pooling_iw_ext(&p, iw)) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || !p.ofmap + || !p.ifmap + || !p.weight + || (using_bias && !p.bias)) { + printf("retry init_pooling_param\n"); + free_depthwise_param(ctx, &p); + goto retry; + } + return p; +} + +static void put_bias_tensor( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + s16 data[]) +{ + int c = tl->shape.c; + + u8 *lo_hi = (u8 *)xmalloc(2 * c); + if (!lo_hi) + return; + + for (int i = 0; i < c; i++) { + lo_hi[i] = data[i] & 0xff; + lo_hi[i + c] = (data[i] >> 8) & 0xff; + } + + put_tensor_g2l(ctx, bk_ctx, tl, (u8 *)lo_hi); + + free(lo_hi); +} + +static int _test_pooling(bmctx_t ctx, bmk_ctx_t *bk_ctx, int stride_w, int stride_h) +{ + param_t param = random_depthwise_param(bk_ctx, stride_w, stride_h); + + s8 *input = alloc_input(¶m); + s8 *weight = alloc_weight(¶m); + s16 *bias = NULL; + if (param.bias) + bias = alloc_bias(¶m); + + put_tensor_g2l(&ctx, bk_ctx, param.ifmap, (u8 *)input); + put_tensor_g2l(&ctx, bk_ctx, param.weight, (u8 *)weight); + if (param.bias) + put_bias_tensor(&ctx, bk_ctx, param.bias, bias); + + bmk1822_tiu_depthwise_convolution(bk_ctx, ¶m); + s8 *output = (s8 *)get_tensor_l2g(&ctx, bk_ctx, param.ofmap); + + compare_results(¶m, input, weight, bias, output); + + free_depthwise_param(bk_ctx, ¶m); + free(input); + free(weight); + free(bias); + free(output); + + return 1; +} + + +static int test_pooling(bmctx_t ctx, bmk_ctx_t *bk_ctx) { + return _test_pooling(ctx, bk_ctx, INVALIDE_STRIDE, INVALIDE_STRIDE); +} + +static void test_depthwise_pooling(bmctx_t *ctx, bmk_ctx_t *bk_ctx) +{ + int test_finished_num = 0; + for (u64 i = 0; i < 16; i++) + test_finished_num += test_pooling(*ctx, bk_ctx); + + // test stride extend (0, 31] + int stride_list[] = {15, 16, 31}; + int stride_list_len = sizeof(stride_list) / sizeof(stride_list[0]); + + for (int stride_w_idx = 0; stride_w_idx < stride_list_len; stride_w_idx++) { + for (int stride_h_idx = 0; stride_h_idx < stride_list_len; stride_h_idx++) { + int stride_w = stride_list[stride_w_idx]; + int stride_h = stride_list[stride_h_idx]; + + test_finished_num += _test_pooling(*ctx, bk_ctx, stride_w, stride_h); + } + } + printf("Test finished %d\n", test_finished_num); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_depthwise_pooling(&ctx, bk_ctx); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_depthwise_conv_qdm.cpp b/cviruntime/test/1822/test_1822_depthwise_conv_qdm.cpp new file mode 100644 index 000000000..959e7e888 --- /dev/null +++ b/cviruntime/test/1822/test_1822_depthwise_conv_qdm.cpp @@ -0,0 +1,1556 @@ +#include + +#include "1822_test_util.h" +#include "test_tf_quant_util.h" + +// #define ENABLE_DEBUG_MSG +// #define ENABLE_FULL_REGRESSION +// #define ENABLE_TV_GEN_PATTERN + +#define MIN_EXEC_TESTS 20 + +typedef struct { + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int 
stride_h;
+  int stride_w;
+  int output_c;
+  int output_h;
+  int output_w;
+  int has_bias;
+  int relu_enable;
+  s8 *input_data;
+  s8 *filter_data;
+  s8 *output_data;
+  s32 *bias_data;
+  u32 *multiplier_data;
+  s8 *shift_data;
+  float float_multiplier;
+  int retry_cnt;
+} dw_conv_test_param_t;
+
+static inline int Offset(tl_shape_t shape, int i0, int i1, int i2, int i3)
+{
+  // return n * (shape.c * shape.h * shape.w) + c * (shape.h * shape.w) + h *
+  // shape.w + w;
+  int dims_data[4] = {static_cast<int>(shape.n), static_cast<int>(shape.c),
+                      static_cast<int>(shape.h), static_cast<int>(shape.w)};
+  return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3;
+}
+
+void fill_random_data_s8(s8 *input_data, int size)
+{
+  for (int i = 0; i < size; ++i) {
+    int is_saturated = ((rand() % 1000) == 1) ? 1 : 0;
+    int is_sign = rand() % 2 ? 1 : -1;
+
+    // is_sign is either 1 or -1, so only a sign test distinguishes the
+    // two saturated cases.
+    if (is_saturated && is_sign < 0) {
+      input_data[i] = -128;
+    } else if (is_saturated) {
+      input_data[i] = 127;
+    } else {
+      input_data[i] = is_sign * (rand() % 128);
+    }
+  }
+}
+
+void fill_random_data_s32(s32 *input_data, int size)
+{
+  for (int i = 0; i < size; ++i) {
+    int is_saturated = ((rand() % 1000) == 1) ? 1 : 0;
+    int is_sign = rand() % 2 ? 1 : -1;
+
+    if (is_saturated && is_sign < 0) {
+      input_data[i] = INT_MIN;
+    } else if (is_saturated) {
+      input_data[i] = INT_MAX;
+    } else {
+      input_data[i] = is_sign * (rand() % 128);
+    }
+  }
+}
+
+void convert_nhwc_to_nchw(tl_shape_t tl_shape, s8 *src, s8 *dst)
+{
+  // NHWC
+  u32 src_shape_n = tl_shape.n;
+  u32 src_shape_h = tl_shape.c;
+  u32 src_shape_w = tl_shape.h;
+  u32 src_shape_c = tl_shape.w;
+  u32 src_stride_c = 1;
+  u32 src_stride_w = src_shape_c * src_stride_c;
+  u32 src_stride_h = src_shape_w * src_stride_w;
+  u32 src_stride_n = src_shape_h * src_stride_h;
+
+  // NCHW
+  // u32 dst_shape_n = src_shape_n;
+  u32 dst_shape_c = src_shape_c;
+  u32 dst_shape_h = src_shape_h;
+  u32 dst_shape_w = src_shape_w;
+  u32 dst_stride_w = 1;
+  u32 dst_stride_h = dst_shape_w * dst_stride_w;
+  u32 dst_stride_c = dst_shape_h * dst_stride_h;
+  u32 dst_stride_n = dst_shape_c * dst_stride_c;
+
+  printf("convert_nhwc_to_nchw:\n");
+  printf("  src shape (%d, %d, %d, %d), stride (%d, %d, %d, %d)\n", src_shape_n,
+         src_shape_c, src_shape_h, src_shape_w, src_stride_n, src_stride_c,
+         src_stride_h, src_stride_w);
+  printf("  dst shape (%d, %d, %d, %d), stride (%d, %d, %d, %d)\n", src_shape_n,
+         dst_shape_c, dst_shape_h, dst_shape_w, dst_stride_n, dst_stride_c,
+         dst_stride_h, dst_stride_w);
+
+  for (u32 i = 0; i < src_shape_n; ++i) {
+    for (u32 j = 0; j < src_shape_h; ++j) {
+      for (u32 k = 0; k < src_shape_w; ++k) {
+        for (u32 l = 0; l < src_shape_c; ++l) {
+          u32 src_offset = i * src_stride_n + j * src_stride_h +
+                           k * src_stride_w + l * src_stride_c;
+          u32 dst_offset = i * dst_stride_n + j * dst_stride_h +
+                           k * dst_stride_w + l * dst_stride_c;
+          dst[dst_offset] = src[src_offset];
+        }
+      }
+    }
+  }
+}
+
+int test_nhwc_to_nchw()
+{
+  int ret = 0;
+
+  tl_shape_t shape = {2, 2, 2, 2};
+  int size = shape.n * shape.c * shape.h * shape.w;
+
+  s8 src[2 * 2 * 2 * 2] = {1, 5, 2, 6, 3, 7, 4, 8,
+                           11, 15, 12, 16, 13, 17, 14, 18};
+
+  s8 dst[2 * 2 * 2 * 2] = {0};
+  s8 ref_dst[2 * 2 * 2 * 2] = {1, 2, 3, 4, 5, 6, 7, 8,
+                               11, 12, 13, 14, 15, 16, 17, 18};
+
+  convert_nhwc_to_nchw(shape, src, dst);
+  for (int i = 0; i < size; ++i) {
+    if (dst[i] != ref_dst[i]) {
+      printf("Error !
dst[%d] %d != %d(expected)\n", i, dst[i], ref_dst[i]); + ret = -1; + } + } + + tl_shape_t input_shape = {/*n=*/1, /*h=*/5, /*w=*/6, /*c=*/8}; + int input_size = + input_shape.n * input_shape.c * input_shape.h * input_shape.w; + s8 nhwc_input_data[240] = { + 103, 85, -96, 120, 105, -72, 33, -50, -104, 12, -57, -80, + 12, 126, 117, 127, 119, 119, -88, 57, 120, 123, 117, -100, + -4, 76, 76, -52, -92, -127, -21, -100, 106, 35, 74, 96, + 117, 0, 39, 76, -119, -36, 89, -74, 111, 46, 45, -26, + 65, 61, 62, -7, -28, -20, 39, -84, -85, -51, 52, 76, + -120, -47, -58, 95, -117, -90, -104, 126, 82, 82, 49, -96, + -47, 67, 115, -3, -120, 41, -16, -96, -31, -75, 67, -115, + 75, -119, -81, -24, -3, -11, -14, -4, 37, 75, 53, 107, + 65, 78, -58, 52, 46, -128, 39, 53, -87, 36, -98, -12, + -1, 70, 117, 18, -41, 96, 21, 78, -71, -124, 64, 82, + -63, 82, 1, 112, 50, -23, 100, -20, 117, 20, 12, -88, + -93, 67, -90, -70, -63, 79, 87, 125, -63, -43, 80, -52, + -66, -125, 109, -73, -39, 104, -78, 89, -64, 116, 29, 71, + -7, 124, -38, -111, 84, 75, 21, 24, 12, 59, 106, 49, + -55, 46, 65, -28, 64, 15, -31, -75, 17, 7, -109, -25, + -115, -38, 7, 23, 71, -37, 111, 119, -95, -89, 17, -27, + -8, -29, -125, 58, -42, -29, -87, 109, 75, -17, -49, 92, + 7, 30, -86, -98, 26, -8, -61, -41, 39, 7, 48, 55, + 63, 125, -13, 56, -107, 105, -70, 1, 105, 14, -89, 0, + 83, -10, 9, 11, 127, -14, -108, 90, -15, 26, -101, -1}; + s8 input_data[240]; + convert_nhwc_to_nchw(input_shape, nhwc_input_data, input_data); + printf("NCHW input_data[%d] = {\n", input_size); + for (int i = 0; i < input_size; ++i) { + printf("%d, ", input_data[i]); + if (i && ((i % 16) == 0)) { + printf("\n"); + } + } + printf("};\n\n"); + + tl_shape_t filter_shape = {1, 3, 3, 8}; + int filter_size = + filter_shape.n * filter_shape.c * filter_shape.h * filter_shape.w; + s8 nhwc_filter_data[72] = { + 103, 85, -96, 120, 105, -72, 33, -50, -104, 12, -57, -80, + 12, 126, 117, 127, 119, 119, -88, 57, 120, 123, 117, -100, + -4, 76, 76, -52, -92, -127, -21, -100, 106, 35, 74, 96, + 117, 0, 39, 76, -119, -36, 89, -74, 111, 46, 45, -26, + 65, 61, 62, -7, -28, -20, 39, -84, -85, -51, 52, 76, + -120, -47, -58, 95, -117, -90, -104, 126, 82, 82, 49, -96}; + s8 filter_data[72]; + convert_nhwc_to_nchw(filter_shape, nhwc_filter_data, filter_data); + printf("NCHW filter_data[%d] = {\n", filter_size); + for (int i = 0; i < filter_size; ++i) { + printf("%d, ", filter_data[i]); + if (i && ((i % 16) == 0)) { + printf("\n"); + } + } + printf("}\n\n"); + + tl_shape_t output_shape = {1, 3, 4, 8}; + int output_size = + output_shape.n * output_shape.c * output_shape.h * output_shape.w; + s8 nhwc_output_data[96] = { + 127, 127, 69, 34, 36, 127, 127, 127, -101, -65, 39, 13, + 26, 6, 127, -67, 60, 123, 31, 17, 3, -128, -58, -64, + -128, 26, -128, -21, 72, 55, 127, 94, -46, -128, -37, 1, + -6, 109, 98, -14, -11, 48, -128, -3, -50, 37, -20, 79, + -94, -36, 127, 19, 3, -18, -40, -115, 24, 124, -128, -1, + -52, -123, -54, -1, -62, 95, 127, 24, 10, -74, 127, -128, + -2, 111, 106, 4, 3, -128, 127, 127, -30, 98, -21, -1, + -11, -12, 58, -72, -128, 127, 30, 32, -85, -11, -35, 34}; + s8 output_data[96] = {0}; + convert_nhwc_to_nchw(output_shape, nhwc_output_data, output_data); + printf("NCHW output_data[%d] = {\n", output_size); + for (int i = 0; i < output_size; ++i) { + printf("%d, ", output_data[i]); + if (i && ((i % 16) == 0)) { + printf("\n"); + } + } + printf("};\n\n"); + + return ret; +} + +int simple_nhwc_dw_conv_test(bmctx_t *ctx, bmk_ctx_t *bmk) +{ + int ret = 0; + + const int stride_width = 
1; + const int stride_height = 1; + const int dilation_width_factor = 1; + const int dilation_height_factor = 1; + const int pad_width = 0; + const int pad_height = 0; + const int depth_multiplier = 1; + const int input_offset = 0; // symmetric + const int output_offset = 0; // symmetric + const int output_activation_min = -128; + const int output_activation_max = 127; + + if (ctx == nullptr) { + return -1; + } + if (bmk == nullptr) { + return -1; + } + + tl_shape_t input_shape = {/*n=*/1, /*h=*/5, /*w=*/6, /*c=*/8}; + s8 input_data[240] = { + 103, 85, -96, 120, 105, -72, 33, -50, -104, 12, -57, -80, + 12, 126, 117, 127, 119, 119, -88, 57, 120, 123, 117, -100, + -4, 76, 76, -52, -92, -127, -21, -100, 106, 35, 74, 96, + 117, 0, 39, 76, -119, -36, 89, -74, 111, 46, 45, -26, + 65, 61, 62, -7, -28, -20, 39, -84, -85, -51, 52, 76, + -120, -47, -58, 95, -117, -90, -104, 126, 82, 82, 49, -96, + -47, 67, 115, -3, -120, 41, -16, -96, -31, -75, 67, -115, + 75, -119, -81, -24, -3, -11, -14, -4, 37, 75, 53, 107, + 65, 78, -58, 52, 46, -128, 39, 53, -87, 36, -98, -12, + -1, 70, 117, 18, -41, 96, 21, 78, -71, -124, 64, 82, + -63, 82, 1, 112, 50, -23, 100, -20, 117, 20, 12, -88, + -93, 67, -90, -70, -63, 79, 87, 125, -63, -43, 80, -52, + -66, -125, 109, -73, -39, 104, -78, 89, -64, 116, 29, 71, + -7, 124, -38, -111, 84, 75, 21, 24, 12, 59, 106, 49, + -55, 46, 65, -28, 64, 15, -31, -75, 17, 7, -109, -25, + -115, -38, 7, 23, 71, -37, 111, 119, -95, -89, 17, -27, + -8, -29, -125, 58, -42, -29, -87, 109, 75, -17, -49, 92, + 7, 30, -86, -98, 26, -8, -61, -41, 39, 7, 48, 55, + 63, 125, -13, 56, -107, 105, -70, 1, 105, 14, -89, 0, + 83, -10, 9, 11, 127, -14, -108, 90, -15, 26, -101, -1}; + + tl_shape_t filter_shape = {1, 3, 3, 8}; + s8 filter_data[72] = { + 103, 85, -96, 120, 105, -72, 33, -50, -104, 12, -57, -80, + 12, 126, 117, 127, 119, 119, -88, 57, 120, 123, 117, -100, + -4, 76, 76, -52, -92, -127, -21, -100, 106, 35, 74, 96, + 117, 0, 39, 76, -119, -36, 89, -74, 111, 46, 45, -26, + 65, 61, 62, -7, -28, -20, 39, -84, -85, -51, 52, 76, + -120, -47, -58, 95, -117, -90, -104, 126, 82, 82, 49, -96}; + + s32 bias_data[8] = {812, 670, -746, 938, 827, -558, 265, -384}; + + u32 output_multiplier[8] = {1155460505, 1210948247, 1203328687, 1166122678, + 1155273687, 1196350022, 1169748238, 1183287581}; + + s8 output_rshift[8] = {-7, -6, -6, -9, -8, -6, -6, -7}; + + tl_shape_t output_shape = {1, 3, 4, 8}; + s8 output_data[96] = {0}; + s8 ref_output_data[96] = { + 127, 127, 69, 34, 36, 127, 127, 127, -101, -65, 39, 13, + 26, 6, 127, -67, 60, 123, 31, 17, 3, -128, -58, -64, + -128, 26, -128, -21, 72, 55, 127, 94, -46, -128, -37, 1, + -6, 109, 98, -14, -11, 48, -128, -3, -50, 37, -20, 79, + -94, -36, 127, 19, 3, -18, -40, -115, 24, 124, -128, -1, + -52, -123, -54, -1, -62, 95, 127, 24, 10, -74, 127, -128, + -2, 111, 106, 4, 3, -128, 127, 127, -30, 98, -21, -1, + -11, -12, 58, -72, -128, 127, 30, 32, -85, -11, -35, 34}; + + const int batches = input_shape.n; + // const int output_depth = 8; + const int input_height = input_shape.c; + const int input_width = input_shape.h; + const int input_depth = input_shape.w; + const int filter_height = filter_shape.c; + const int filter_width = filter_shape.h; + const int output_height = output_shape.c; + const int output_width = output_shape.h; + + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) { + for (int m 
= 0; m < depth_multiplier; ++m) { + const int output_channel = m + in_channel * depth_multiplier; + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + s32 acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; + // Zero padding by omitting the areas outside the image. + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + if (is_point_inside_image) { + s32 input_val = input_data[Offset(input_shape, batch, in_y, + in_x, in_channel)]; + s32 filter_val = filter_data[Offset( + filter_shape, 0, filter_y, filter_x, output_channel)]; + acc += filter_val * (input_val + input_offset); + + printf(" [batch=%d][out_y=%d][out_x=%d][in_channel=%d][m=%d]" + "[filter_y=%d][filter_x=%d] acc(%d) += %d * (%d + %d) " + "= %d\n", + batch, out_y, out_x, in_channel, m, filter_y, filter_x, + acc - filter_val * (input_val + input_offset), + filter_val, input_val, input_offset, acc); + } + } + } + if (1 /*bias_data*/) { + acc += bias_data[output_channel]; + } + + printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = " + "%d, bias %d\n", + batch, out_y, out_x, output_channel, acc, + bias_data[output_channel]); + + acc = MultiplyByQuantizedMultiplier( + acc, output_multiplier[output_channel], + output_rshift[output_channel]); + + printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = " + "%d, multiplier %d, shift %d\n", + batch, out_y, out_x, output_channel, acc, + output_multiplier[output_channel], + output_rshift[output_channel]); + + acc += output_offset; + acc = MAX(acc, output_activation_min); + acc = MIN(acc, output_activation_max); + + printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = " + "%d\n", + batch, out_y, out_x, output_channel, acc); + + { + int x = Offset(output_shape, batch, out_y, out_x, output_channel); + if (x >= 96) { + printf("Error ! 
shape=(%d, %d, %d, %d), batch %d, out_y %d, "
+                     "out_x %d, output_channel %d, offset %d\n",
+                     output_shape.n, output_shape.c, output_shape.h,
+                     output_shape.w, batch, out_y, out_x, output_channel, x);
+            }
+          }
+
+          output_data[Offset(output_shape, batch, out_y, out_x,
+                             output_channel)] = static_cast<s8>(acc);
+          }
+        }
+      }
+    }
+  }
+
+  int output_size =
+      output_shape.n * output_shape.c * output_shape.h * output_shape.w;
+  for (int i = 0; i < output_size; ++i) {
+    if (output_data[i] != ref_output_data[i]) {
+      printf("  output_data[%d] = %d != %d\n", i, output_data[i],
+             ref_output_data[i]);
+      ret = -1;
+    }
+  }
+
+  return ret;
+}
+
+typedef struct {
+  int stride_width;
+  int stride_height;
+  int dilation_width_factor;
+  int dilation_height_factor;
+  int padding_width;
+  int padding_height;
+  int depth_multiplier;
+} DwConvParams;
+
+void dw_conv_per_channel_ref(const dw_conv_test_param_t *p_param)
+{
+  const int input_offset = 0;  // symmetric
+  const int output_offset = 0; // symmetric
+  const int output_activation_min = -128;
+  const int output_activation_max = 127;
+
+  const int stride_width = p_param->stride_w;
+  const int stride_height = p_param->stride_h;
+  const int dilation_width_factor = 1;  // params.dilation_width_factor;
+  const int dilation_height_factor = 1; // params.dilation_height_factor;
+  const int pad_width = p_param->pad_left;
+  const int pad_height = p_param->pad_top;
+  const int depth_multiplier = 1; // params.depth_multiplier;
+
+  const int batches = p_param->input_n;
+  const int input_height = p_param->input_h;
+  const int input_width = p_param->input_w;
+  const int input_depth = p_param->input_c;
+  const int filter_height = p_param->kh;
+  const int filter_width = p_param->kw;
+  const int output_depth = p_param->output_c;
+  const int output_height = p_param->output_h;
+  const int output_width = p_param->output_w;
+  s8 *input_data = p_param->input_data;
+  s8 *filter_data = p_param->filter_data;
+  s8 *output_data = p_param->output_data;
+  s32 *bias_data = p_param->has_bias ? p_param->bias_data : nullptr;
+  u32 *output_multiplier = p_param->multiplier_data;
+  s8 *output_rshift = p_param->shift_data;
+
+  tl_shape_t input_shape = {
+      static_cast<u32>(batches), static_cast<u32>(input_depth),
+      static_cast<u32>(input_height), static_cast<u32>(input_width)};
+  tl_shape_t filter_shape = {
+      static_cast<u32>(output_depth), static_cast<u32>(input_depth),
+      static_cast<u32>(filter_height), static_cast<u32>(filter_width)};
+  tl_shape_t output_shape = {
+      static_cast<u32>(batches), static_cast<u32>(output_depth),
+      static_cast<u32>(output_height), static_cast<u32>(output_width)};
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("dw_conv_per_channel_ref =>\n");
+  printf("  input shape (n=%d, c=%d, h=%d, w=%d)\n", batches, input_depth,
+         input_height, input_width);
+  // printf("  filter shape (oc=%d, kh=%d, kw=%d\n",
+  //        );
+  printf("  output shape (n=%d, c=%d, h=%d, w=%d)\n", batches, output_depth,
+         output_height, output_width);
+  printf("  stride_h %d, stride_w %d\n", stride_height, stride_width);
+#endif
+
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+          for (int m = 0; m < depth_multiplier; ++m) {
+            const int output_channel = m + in_channel * depth_multiplier;
+            const int in_x_origin = (out_x * stride_width) - pad_width;
+            const int in_y_origin = (out_y * stride_height) - pad_height;
+            s32 acc = 0;
+            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
+                // Zero padding by omitting the areas outside the image.
+                const bool is_point_inside_image =
+                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height);
+                if (is_point_inside_image) {
+                  s32 input_val = input_data[Offset(input_shape, batch,
+                                                    in_channel, in_y, in_x)];
+                  s32 filter_val = filter_data[Offset(
+                      filter_shape, 0, output_channel, filter_y, filter_x)];
+                  acc += filter_val * (input_val + input_offset);
+
+#ifdef ENABLE_DEBUG_MSG
+                  printf("  [batch=%d][out_y=%d][out_x=%d][in_channel=%d][m=%d]"
+                         "[filter_y=%d][filter_x=%d] acc(%d) += %d * (%d + %d) "
+                         "= %d, in_x_origin %d, in_x %d\n",
+                         batch, out_y, out_x, in_channel, m, filter_y, filter_x,
+                         acc - filter_val * (input_val + input_offset),
+                         filter_val, input_val, input_offset, acc, in_x_origin,
+                         in_x);
+#endif
+                }
+              }
+            }
+            if (bias_data) {
+              acc += bias_data[output_channel];
+            }
+
+#ifdef ENABLE_DEBUG_MSG
+            printf("  [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = "
+                   "%d, bias %d\n",
+                   batch, out_y, out_x, output_channel, acc,
+                   bias_data ? bias_data[output_channel] : 0);
+#endif
+
+            acc = MultiplyByQuantizedMultiplier(
+                acc, output_multiplier[output_channel],
+                static_cast<int>(output_rshift[output_channel]));
+
+#ifdef ENABLE_DEBUG_MSG
+            printf("  [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = "
+                   "%d, multiplier %d, shift %d\n",
+                   batch, out_y, out_x, output_channel, acc,
+                   output_multiplier[output_channel],
+                   output_rshift[output_channel]);
+#endif
+
+            acc += output_offset;
+            acc = MAX(acc, output_activation_min);
+            acc = MIN(acc, output_activation_max);
+
+#ifdef ENABLE_DEBUG_MSG
+            printf("  [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = "
+                   "%d\n",
+                   batch, out_y, out_x, output_channel, acc);
+#endif
+
+            output_data[Offset(output_shape, batch, output_channel, out_y,
+                               out_x)] = static_cast<s8>(acc);
+          }
+        }
+      }
+    }
+  }
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("<= dw_conv_per_channel_ref\n");
+#endif
+}
+
+void calc_dw_conv_float_multiplier(dw_conv_test_param_t *p_param)
+{
+  const int input_offset = 0; // symmetric
+
+  const int stride_width = p_param->stride_w;
+  const int stride_height = p_param->stride_h;
+  const int dilation_width_factor = 1;  // params.dilation_width_factor;
+  const int dilation_height_factor = 1; // params.dilation_height_factor;
+  const int pad_width = p_param->pad_left;
+  const int pad_height = p_param->pad_top;
+  const int depth_multiplier = 1; // params.depth_multiplier;
+
+  const int batches = p_param->input_n;
+  const int input_height = p_param->input_h;
+  const int input_width = p_param->input_w;
+  const int input_depth = p_param->input_c;
+  const int filter_height = p_param->kh;
+  const int filter_width = p_param->kw;
+  const int output_depth = p_param->output_c;
+  const int output_height = p_param->output_h;
+  const int output_width = p_param->output_w;
+  s8 *input_data = p_param->input_data;
+  s8 *filter_data = p_param->filter_data;
+  s32 *bias_data = p_param->has_bias ? p_param->bias_data : nullptr;
+
+  tl_shape_t input_shape = {
+      static_cast<u32>(batches), static_cast<u32>(input_depth),
+      static_cast<u32>(input_height), static_cast<u32>(input_width)};
+  tl_shape_t filter_shape = {
+      static_cast<u32>(output_depth), static_cast<u32>(input_depth),
+      static_cast<u32>(filter_height), static_cast<u32>(filter_width)};
+
+  int output_accu_min = INT_MAX;
+  int output_accu_max = INT_MIN;
+
+  // printf("calc_dw_conv_float_multiplier =>\n");
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+          for (int m = 0; m < depth_multiplier; ++m) {
+            const int output_channel = m + in_channel * depth_multiplier;
+            const int in_x_origin = (out_x * stride_width) - pad_width;
+            const int in_y_origin = (out_y * stride_height) - pad_height;
+            s32 acc = 0;
+            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
+                // Zero padding by omitting the areas outside the image.
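+                // With symmetric quantization the input offset is zero, so
+                // skipping out-of-image taps is exactly equivalent to padding
+                // the input with zeros: each skipped tap would contribute
+                // filter_val * (0 + input_offset) == 0 to the accumulator.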
+ const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + if (is_point_inside_image) { + s32 input_val = input_data[Offset(input_shape, batch, + in_channel, in_y, in_x)]; + s32 filter_val = filter_data[Offset( + filter_shape, 0, output_channel, filter_y, filter_x)]; + acc += filter_val * (input_val + input_offset); + + // printf(" + // [batch=%d][out_y=%d][out_x=%d][in_channel=%d][m=%d]" + // "[filter_y=%d][filter_x=%d] acc(%d) += %d * (%d + + // %d) = %d\n", + // batch, out_y, out_x, in_channel, m, filter_y, + // filter_x, acc - filter_val * (input_val + + // input_offset), filter_val, input_val, input_offset, + // acc); + } + } + } + if (bias_data) { + acc += bias_data[output_channel]; + } + + output_accu_max = MAX(acc, output_accu_max); + output_accu_min = MIN(acc, output_accu_min); + + // printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = + // %d, MIN = %d, MAX = %d\n", + // batch, out_y, out_x, output_channel, acc, + // output_accu_min, output_accu_max); + } + } + } + } + } + + // Since int8 ranges from -128 to 127, we need to squeeze the accumulator + // MIN/MAX fit in those ranges correspondingly as much as possible. + if (abs(output_accu_max) > abs(output_accu_min)) { + p_param->float_multiplier = 127.0f / abs(output_accu_max); + } else { + p_param->float_multiplier = 128.0f / abs(output_accu_min); + } + +#ifdef ENABLE_DEBUG_MSG + printf(" output_accu_min %d, output_accu_max %d, output_multiplier %f\n", + output_accu_min, output_accu_max, p_param->float_multiplier); +#endif + + // printf("<= calc_dw_conv_float_multiplier\n"); +} + +int simple_dw_conv_test(bmctx_t *ctx, bmk_ctx_t *bmk) +{ + int ret = 0; + + if (ctx == nullptr) { + return -1; + } + if (bmk == nullptr) { + return -1; + } + + const int batches = 1; + const int input_depth = 8; + const int input_height = 5; + const int input_width = 6; + tl_shape_t input_shape = {batches, input_depth, input_height, input_width}; + s8 input_data[240] = { + /* ic = 0 */ + 103, -104, 119, -4, 106, -119, 65, -85, -117, -47, -31, -3, 65, -87, -41, + -63, 117, -63, -66, -64, 84, -55, 17, 71, -8, 75, 26, 63, 105, 127, + + /* ic = 1 */ + 85, 12, 119, 76, 35, -36, 61, -51, -90, 67, -75, -11, 78, 36, 96, 82, 20, + 79, -125, 116, 75, 46, 7, -37, -29, -17, -8, 125, 14, -14, + + /* ic = 2 */ + -96, -57, -88, 76, 74, 89, 62, 52, -104, 115, 67, -14, -58, -98, 21, 1, + 12, 87, 109, 29, 21, 65, -109, 111, -125, -49, -61, -13, -89, -108, + + /* ic = 3 */ + 120, -80, 57, -52, 96, -74, -7, 76, 126, -3, -115, -4, 52, -12, 78, 112, + -88, 125, -73, 71, 24, -28, -25, 119, 58, 92, -41, 56, 0, 90, + + /* ic = 4 */ + 105, 12, 120, -92, 117, 111, -28, -120, 82, -120, 75, 37, 46, -1, -71, 50, + -93, -63, -39, -7, 12, 64, -115, -95, -42, 7, 39, -107, 83, -15, + + /* ic = 5 */ + -72, 126, 123, -127, 0, 46, -20, -47, 82, 41, -119, 75, -128, 70, -124, + -23, 67, -43, 104, 124, 59, 15, -38, -89, -29, 30, 7, 105, -10, 26, + + /* ic = 6 */ + 33, 117, 117, -21, 39, 45, 39, -58, 49, -16, -81, 53, 39, 117, 64, 100, + -90, 80, -78, -38, 106, -31, 7, 17, -87, -86, 48, -70, 9, -101, + + /* ic = 7 */ + -50, 127, -100, -100, 76, -26, -84, 95, -96, -96, -24, 107, 53, 18, 82, + -20, -70, -52, 89, -111, 49, -75, 23, -27, 109, -98, 55, 1, 11, -1}; + + const int kernel_height = 3; + const int kernel_width = 3; + tl_shape_t filter_shape = {1, input_depth, kernel_height, kernel_width}; + // Global memory layout: OcKhKw + s8 filter_data[72] = { + 103, -104, 119, -4, 106, -119, 65, -85, -117, 85, 12, 119, + 76, 
35, -36, 61, -51, -90, -96, -57, -88, 76, 74, 89, + 62, 52, -104, 120, -80, 57, -52, 96, -74, -7, 76, 126, + 105, 12, 120, -92, 117, 111, -28, -120, 82, -72, 126, 123, + -127, 0, 46, -20, -47, 82, 33, 117, 117, -21, 39, 45, + 39, -58, 49, -50, 127, -100, -100, 76, -26, -84, 95, -96}; + + s32 bias_data[8] = {812, 670, -746, 938, 827, -558, 265, -384}; + + u32 output_multiplier[8] = {1155460505, 1210948247, 1203328687, 1166122678, + 1155273687, 1196350022, 1169748238, 1183287581}; + + // Change to right shift + s8 output_rshift[8] = {7, 6, 6, 9, 8, 6, 6, 7}; + + u8 per_channel_cal_data[8 * 4 + 8 * 4 + 8]; + pack_chl_quan_param(8, /*has_bias=*/true, bias_data, output_multiplier, + output_rshift, per_channel_cal_data); + + const int output_height = 3; + const int output_width = 4; + tl_shape_t output_shape = {batches, input_depth, output_height, output_width}; + s8 ref_output_data[96] = { + /* oc = 0 */ + 127, -101, 60, -128, -46, -11, -94, 24, -62, -2, -30, -128, + + /* oc = 1 */ + 127, -65, 123, 26, -128, 48, -36, 124, 95, 111, 98, 127, + + /* oc = 2 */ + 69, 39, 31, -128, -37, -128, 127, -128, 127, 106, -21, 30, + + /* oc = 3 */ + 34, 13, 17, -21, 1, -3, 19, -1, 24, 4, -1, 32, + + /* oc = 4 */ + 36, 26, 3, 72, -6, -50, 3, -52, 10, 3, -11, -85, + + /* oc = 5 */ + 127, 6, -128, 55, 109, 37, -18, -123, -74, -128, -12, -11, + + /* oc = 6 */ + 127, 127, -58, 127, 98, -20, -40, -54, 127, 127, 58, -35, + + /* oc = 7 */ + 127, -67, -64, 94, -14, 79, -115, -1, -128, 127, -72, 34}; + + bmk1822_tensor_lmem_t *tl_input = + bmk1822_lmem_alloc_tensor(bmk, input_shape, FMT_I8, /*eu_aign=*/1); + + bmk1822_tensor_lmem_t *tl_filter = + bmk1822_lmem_alloc_tensor(bmk, filter_shape, FMT_I8, /*eu_align=*/1); + + bmk1822_tensor_lmem_t *tl_output = + bmk1822_lmem_alloc_tensor(bmk, output_shape, FMT_I8, /*eu_align=*/1); + + bmk1822_tensor_lmem_t *tl_per_channel_cal = + bmk1822_lmem_alloc_tensor(bmk, {1, 8, 1, 9}, FMT_U8, + /*eu_align*/ 0); + + put_tensor_g2l(ctx, bmk, tl_per_channel_cal, per_channel_cal_data); + put_tensor_g2l(ctx, bmk, tl_input, reinterpret_cast(input_data)); + put_tensor_g2l(ctx, bmk, tl_filter, reinterpret_cast(filter_data)); + + { + // Reshape per channel quantization data + tl_per_channel_cal->shape = {1, 8, 1, 1}; + tl_per_channel_cal->stride = bmk1822_tensor_lmem_default_stride( + bmk, tl_per_channel_cal->shape, FMT_I8, /*eu_align=*/0); + + bmk1822_tiu_depthwise_convolution_qdm_param_t param; + memset(¶m, 0, sizeof(param)); + param.ofmap = tl_output; + param.ifmap = tl_input; + param.weight = tl_filter; + param.chl_quan_param = tl_per_channel_cal; + param.dilation_h = 1; + param.dilation_w = 1; + param.stride_h = 1; + param.stride_w = 1; + param.has_bias = 1; + bmk1822_tiu_depthwise_convolution_qdm(bmk, ¶m); + } + + test_submit(ctx); + + printf("Compare tiu and golden\n"); + s8 *conv_output_data = + reinterpret_cast(get_tensor_l2g(ctx, bmk, tl_output)); + for (int i = 0; i < static_cast(sizeof(ref_output_data)); i++) { + if (conv_output_data[i] != ref_output_data[i]) { + printf("output_data[%d] %d != %d\n", i, conv_output_data[i], + ref_output_data[i]); + ret = -1; + } + } + + free(conv_output_data); + + s8 output_data[96] = {0}; + memset(output_data, 0, sizeof(output_data)); + + dw_conv_test_param_t params; + memset(¶ms, 0, sizeof(params)); + params.input_n = batches; + params.input_c = input_depth; + params.input_h = input_height; + params.input_w = input_width; + params.kh = kernel_height; + params.kw = kernel_width; + params.output_c = input_depth; + params.output_h = output_height; + 
+
+  const int output_height = 3;
+  const int output_width = 4;
+  tl_shape_t output_shape = {batches, input_depth, output_height, output_width};
+  s8 ref_output_data[96] = {
+      /* oc = 0 */
+      127, -101, 60, -128, -46, -11, -94, 24, -62, -2, -30, -128,
+
+      /* oc = 1 */
+      127, -65, 123, 26, -128, 48, -36, 124, 95, 111, 98, 127,
+
+      /* oc = 2 */
+      69, 39, 31, -128, -37, -128, 127, -128, 127, 106, -21, 30,
+
+      /* oc = 3 */
+      34, 13, 17, -21, 1, -3, 19, -1, 24, 4, -1, 32,
+
+      /* oc = 4 */
+      36, 26, 3, 72, -6, -50, 3, -52, 10, 3, -11, -85,
+
+      /* oc = 5 */
+      127, 6, -128, 55, 109, 37, -18, -123, -74, -128, -12, -11,
+
+      /* oc = 6 */
+      127, 127, -58, 127, 98, -20, -40, -54, 127, 127, 58, -35,
+
+      /* oc = 7 */
+      127, -67, -64, 94, -14, 79, -115, -1, -128, 127, -72, 34};
+
+  bmk1822_tensor_lmem_t *tl_input =
+      bmk1822_lmem_alloc_tensor(bmk, input_shape, FMT_I8, /*eu_align=*/1);
+
+  bmk1822_tensor_lmem_t *tl_filter =
+      bmk1822_lmem_alloc_tensor(bmk, filter_shape, FMT_I8, /*eu_align=*/1);
+
+  bmk1822_tensor_lmem_t *tl_output =
+      bmk1822_lmem_alloc_tensor(bmk, output_shape, FMT_I8, /*eu_align=*/1);
+
+  bmk1822_tensor_lmem_t *tl_per_channel_cal =
+      bmk1822_lmem_alloc_tensor(bmk, {1, 8, 1, 9}, FMT_U8,
+                                /*eu_align*/ 0);
+
+  put_tensor_g2l(ctx, bmk, tl_per_channel_cal, per_channel_cal_data);
+  put_tensor_g2l(ctx, bmk, tl_input, reinterpret_cast<u8 *>(input_data));
+  put_tensor_g2l(ctx, bmk, tl_filter, reinterpret_cast<u8 *>(filter_data));
+
+  {
+    // Reshape per channel quantization data
+    tl_per_channel_cal->shape = {1, 8, 1, 1};
+    tl_per_channel_cal->stride = bmk1822_tensor_lmem_default_stride(
+        bmk, tl_per_channel_cal->shape, FMT_I8, /*eu_align=*/0);
+
+    bmk1822_tiu_depthwise_convolution_qdm_param_t param;
+    memset(&param, 0, sizeof(param));
+    param.ofmap = tl_output;
+    param.ifmap = tl_input;
+    param.weight = tl_filter;
+    param.chl_quan_param = tl_per_channel_cal;
+    param.dilation_h = 1;
+    param.dilation_w = 1;
+    param.stride_h = 1;
+    param.stride_w = 1;
+    param.has_bias = 1;
+    bmk1822_tiu_depthwise_convolution_qdm(bmk, &param);
+  }
+
+  test_submit(ctx);
+
+  printf("Compare tiu and golden\n");
+  s8 *conv_output_data =
+      reinterpret_cast<s8 *>(get_tensor_l2g(ctx, bmk, tl_output));
+  for (int i = 0; i < static_cast<int>(sizeof(ref_output_data)); i++) {
+    if (conv_output_data[i] != ref_output_data[i]) {
+      printf("output_data[%d] %d != %d\n", i, conv_output_data[i],
+             ref_output_data[i]);
+      ret = -1;
+    }
+  }
+
+  free(conv_output_data);
+
+  s8 output_data[96] = {0};
+  memset(output_data, 0, sizeof(output_data));
+
+  dw_conv_test_param_t params;
+  memset(&params, 0, sizeof(params));
+  params.input_n = batches;
+  params.input_c = input_depth;
+  params.input_h = input_height;
+  params.input_w = input_width;
+  params.kh = kernel_height;
+  params.kw = kernel_width;
+  params.output_c = input_depth;
+  params.output_h = output_height;
+  params.output_w = output_width;
+  params.stride_w = 1;
+  params.stride_h = 1;
+  params.input_data = input_data;
+  params.filter_data = filter_data;
+  params.output_data = output_data;
+  params.has_bias = 1;
+  params.bias_data = bias_data;
+  params.multiplier_data = output_multiplier;
+  params.shift_data = output_rshift;
+
+  dw_conv_per_channel_ref(&params);
+
+  printf("Compare ref and golden\n");
+  int output_size =
+      output_shape.n * output_shape.c * output_shape.h * output_shape.w;
+  for (int i = 0; i < output_size; ++i) {
+    if (output_data[i] != ref_output_data[i]) {
+      printf("  output_data[%d] = %d != %d\n", i, output_data[i],
+             ref_output_data[i]);
+      ret = -1;
+    }
+  }
+
+  // Free in reverse allocation order
+  bmk1822_lmem_free_tensor(bmk, tl_per_channel_cal);
+  bmk1822_lmem_free_tensor(bmk, tl_output);
+  bmk1822_lmem_free_tensor(bmk, tl_filter);
+  bmk1822_lmem_free_tensor(bmk, tl_input);
+
+  return ret;
+}
+
+int choose_from_range(int table[], int size, int index)
+{
+  if (index >= size) {
+    return 0;
+  }
+
+  int val = table[index];
+  if (index < (size - 1)) {
+    int range = MAX(table[index + 1] - table[index] - 1, 1);
+    val += rand() % range;
+  }
+
+  return val;
+}
+
+void dump_test_param(dw_conv_test_param_t *p_param, bool dump_content)
+{
+  printf("Dump test parameter:\n");
+  printf("  input_n %d\n", p_param->input_n);
+  printf("  input_c %d\n", p_param->input_c);
+  printf("  input_h %d\n", p_param->input_h);
+  printf("  input_w %d\n", p_param->input_w);
+  printf("  kw %d\n", p_param->kw);
+  printf("  kh %d\n", p_param->kh);
+  printf("  dh %d\n", p_param->dh);
+  printf("  dw %d\n", p_param->dw);
+  printf("  pad_top %d\n", p_param->pad_top);
+  printf("  pad_bot %d\n", p_param->pad_bot);
+  printf("  pad_left %d\n", p_param->pad_left);
+  printf("  pad_right %d\n", p_param->pad_right);
+  printf("  ins_h %d\n", p_param->ins_h);
+  printf("  ins_h_last %d\n", p_param->ins_h_last);
+  printf("  ins_w %d\n", p_param->ins_w);
+  printf("  ins_w_last %d\n", p_param->ins_w_last);
+  printf("  stride_h %d\n", p_param->stride_h);
+  printf("  stride_w %d\n", p_param->stride_w);
+  printf("  output_c %d\n", p_param->output_c);
+  printf("  output_h %d\n", p_param->output_h);
+  printf("  output_w %d\n", p_param->output_w);
+  printf("  has_bias %d\n", p_param->has_bias);
+  printf("  relu_enable %d\n", p_param->relu_enable);
+
+  if (dump_content) {
+    printf("input_data(%d, %d, %d, %d):\n", p_param->input_n, p_param->input_c,
+           p_param->input_h, p_param->input_w);
+    int in = p_param->input_n;
+    int ic = p_param->input_c;
+    int ih = p_param->input_h;
+    int iw = p_param->input_w;
+    for (int i = 0; i < in; ++i) {
+      for (int j = 0; j < ic; ++j) {
+        for (int k = 0; k < ih; ++k) {
+          for (int l = 0; l < iw; ++l) {
+            int offset = i * (ic * ih * iw) + j * (ih * iw) + k * iw + l;
+            printf("%d, ", p_param->input_data[offset]);
+          }
+          printf("\n");
+        }
+      }
+    }
+    printf("\n\n");
+
+    printf("kernel_data (%d, %d, %d)\n", p_param->output_c, p_param->kh,
+           p_param->kw);
+    int oc = p_param->output_c;
+    int kh = p_param->kh;
+    int kw = p_param->kw;
+    for (int i = 0; i < oc; ++i) {
+      for (int j = 0; j < kh; ++j) {
+        for (int k = 0; k < kw; ++k) {
+          int offset = i * (kh * kw) + j * kw + k;
+          printf("%d, ", p_param->filter_data[offset]);
+        }
+      }
+      printf("\n");
+    }
+    printf("\n\n");
+
+    if (p_param->has_bias) {
+      printf("bias_data:\n");
+      for (int i = 0; i < oc; ++i) {
+        printf("%d, ", p_param->bias_data[i]);
+      }
+      printf("\n\n");
+    }
+
+    printf("multiplier_data:\n");
+    for (int i = 0; i < oc; ++i) {
+      printf("%d, ", p_param->multiplier_data[i]);
+    }
+    printf("\n\n");
+
+    printf("shift_data:\n");
+    for (int i = 0; i < oc; ++i) {
+      printf("%d, ", p_param->shift_data[i]);
+    }
+    printf("\n\n");
+  }
+}
+
+int run_compare_dw_conv(bmctx_t *ctx, bmk_ctx_t *bk_ctx,
+                        dw_conv_test_param_t *p_param)
+{
+  int ret = 0;
+
+  if (ctx == nullptr || bk_ctx == nullptr) {
+    return -1;
+  }
+
+  int in = p_param->input_n;
+  int ic = p_param->input_c;
+  int ih = p_param->input_h;
+  int iw = p_param->input_w;
+  int oc = p_param->output_c;
+  int oh = p_param->output_h;
+  int ow = p_param->output_w;
+  int kh = p_param->kh;
+  int kw = p_param->kw;
+  int dh = p_param->dh;
+  int dw = p_param->dw;
+  int pad_top = p_param->pad_top;
+  int pad_bot = p_param->pad_bot;
+  int pad_left = p_param->pad_left;
+  int pad_right = p_param->pad_right;
+  int ins_h = p_param->ins_h;
+  int ins_last_h = p_param->ins_h_last;
+  int ins_w = p_param->ins_w;
+  int ins_last_w = p_param->ins_w_last;
+  int stride_h = p_param->stride_h;
+  int stride_w = p_param->stride_w;
+  int has_bias = p_param->has_bias;
+  int relu_enable = p_param->relu_enable;
+
+  int input_size = in * ic * iw * ih;
+  s8 *input_data = (s8 *)malloc(input_size);
+
+  int kernel_size = oc * ic * kh * kw;
+  s8 *kernel_data = (s8 *)malloc(kernel_size);
+
+  int output_size = in * oc * oh * ow;
+  s8 *output_data = (s8 *)malloc(output_size);
+  if (!input_data || !kernel_data || !output_data) {
+    free(input_data);
+    free(kernel_data);
+    free(output_data);
+    return -1;
+  }
+
+  memset(output_data, 0, output_size);
+
+  s32 *bias_data = (s32 *)malloc(sizeof(s32) * oc);
+  u32 *multiplier_data = (u32 *)malloc(sizeof(u32) * oc);
+  s8 *shift_data = (s8 *)malloc(oc);
+
+  p_param->input_data = input_data;
+  p_param->filter_data = kernel_data;
+  p_param->output_data = output_data;
+  p_param->has_bias = has_bias;
+  p_param->bias_data = bias_data;
+  p_param->multiplier_data = multiplier_data;
+  p_param->shift_data = shift_data;
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("  run_compare_dw_conv =>\n");
+  printf("    input (n=%d, ic=%d, h=%d, w=%d), kernel (oc=%d, ic=%d, h=%d, "
+         "w=%d), output (c=%d, h=%d, w=%d), has_bias %d\n",
+         in, ic, ih, iw, oc, ic, kh, kw, oc, oh, ow, has_bias);
+#endif
+
+  int retry_cnt = p_param->retry_cnt;
+  do {
+    fill_random_data_s8(input_data, input_size);
+    fill_random_data_s8(kernel_data, kernel_size);
+    if (has_bias) {
+      fill_random_data_s32(bias_data, oc);
+    }
+
+    // Sentinel value; calc_dw_conv_float_multiplier() must pull it below 1.0.
+    p_param->float_multiplier = 100.0;
+    calc_dw_conv_float_multiplier(p_param);
+
+    if (p_param->float_multiplier > 0.f && p_param->float_multiplier < 1.0) {
+      break;
+    }
+
+  } while (--retry_cnt);
+
+  if (p_param->float_multiplier >= 1.0) {
+    printf("  run_compare_dw_conv: unable to find valid multiplier\n");
+    free(input_data);
+    free(kernel_data);
+    free(output_data);
+    free(bias_data);
+    free(multiplier_data);
+    free(shift_data);
+    return -1;
+  }
+
+  u32 base_multiplier = 0;
+  int base_shift = 0;
+  QuantizeMultiplierSmallerThanOne(p_param->float_multiplier, &base_multiplier,
+                                   &base_shift);
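+
+  // QuantizeMultiplierSmallerThanOne() is provided by the test utilities; a
+  // sketch assuming the usual gemmlowp recipe, which expresses a real
+  // multiplier in (0, 1) as q * 2^-31 * 2^-right_shift with q in
+  // [2^30, 2^31):
+#if 0
+  {
+    double m = (double)p_param->float_multiplier;
+    int shift = 0;
+    while (m < 0.5) {  // normalize into [0.5, 1)
+      m *= 2.0;
+      shift++;
+    }
+    long long q = (long long)(m * (1ll << 31) + 0.5);
+    if (q == (1ll << 31)) {  // rounding overshot; renormalize
+      q /= 2;
+      shift--;
+    }
+    base_multiplier = (u32)q;
+    base_shift = shift;
+  }
+#endif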
+
+  for (int i = 0; i < oc; ++i) {
+    // multipliers typically range in [2^30 ; 2^31 - 1].
+    // Values in [0, 2^30 - 1] are normally unused, but harmless.
+    // Thus a good way to randomize multipliers is to subtract from them
+    // a random value smaller than 2^30 but still significant compared to it.
+    p_param->multiplier_data[i] = base_multiplier - (rand() % (1 << 26));
+
+    int right_shift = base_shift - 1 + (rand() % 4);
+    p_param->shift_data[i] =
+        truncate_rshift((s8)right_shift, /*allow_lshift*/ 1);
+
+#ifdef ENABLE_DEBUG_MSG
+    printf("  [oc=%d] multiplier_data %d, shift_data %d\n", i,
+           p_param->multiplier_data[i], p_param->shift_data[i]);
+#endif
+  }
+
+  dw_conv_per_channel_ref(p_param);
+
+  const int per_chan_cal_data_size =
+      p_param->has_bias ? 9 : 5;  // bias(4) + multiplier(4) + shift(1)
+  const int cal_data_size = oc * per_chan_cal_data_size;
+  u8 *cal_data = (u8 *)malloc(cal_data_size);
+  pack_chl_quan_param(oc, p_param->has_bias, p_param->bias_data,
+                      p_param->multiplier_data, p_param->shift_data,
+                      cal_data);
+
+  tl_shape_t input_shape = {static_cast<u32>(in), static_cast<u32>(ic),
+                            static_cast<u32>(ih), static_cast<u32>(iw)};
+  tl_shape_t filter_shape = {1, static_cast<u32>(oc), static_cast<u32>(kh),
+                             static_cast<u32>(kw)};
+  tl_shape_t output_shape = {static_cast<u32>(in), static_cast<u32>(oc),
+                             static_cast<u32>(oh), static_cast<u32>(ow)};
+  tl_shape_t cal_shape = {1, static_cast<u32>(oc), 1,
+                          static_cast<u32>(per_chan_cal_data_size)};
+
+  bmk1822_tensor_lmem_t *tl_input =
+      bmk1822_lmem_alloc_tensor(bk_ctx, input_shape, FMT_I8, /*eu_align=*/1);
+
+  bmk1822_tensor_lmem_t *tl_filter =
+      bmk1822_lmem_alloc_tensor(bk_ctx, filter_shape, FMT_I8, /*eu_align=*/1);
+
+  bmk1822_tensor_lmem_t *tl_output =
+      bmk1822_lmem_alloc_tensor(bk_ctx, output_shape, FMT_I8, /*eu_align=*/1);
+
+  // Shape for TDMA load
+  bmk1822_tensor_lmem_t *tl_cal_data =
+      bmk1822_lmem_alloc_tensor(bk_ctx, cal_shape, FMT_U8, /*eu_align*/ 0);
+
+  if (tl_input == nullptr) {
+    printf("    fail to alloc tl_input (%d, %d, %d, %d)\n", input_shape.n,
+           input_shape.c, input_shape.h, input_shape.w);
+    return -1;
+  }
+  if (tl_filter == nullptr) {
+    printf("    fail to alloc tl_filter (%d, %d, %d, %d)\n", filter_shape.n,
+           filter_shape.c, filter_shape.h, filter_shape.w);
+    return -1;
+  }
+  if (tl_output == nullptr) {
+    printf("    fail to alloc tl_output (%d, %d, %d, %d)\n", output_shape.n,
+           output_shape.c, output_shape.h, output_shape.w);
+    return -1;
+  }
+  if (tl_cal_data == nullptr) {
+    printf("    fail to alloc tl_cal_data (%d, %d, %d, %d)\n", cal_shape.n,
+           cal_shape.c, cal_shape.h, cal_shape.w);
+    return -1;
+  }
+
+  put_tensor_g2l(ctx, bk_ctx, tl_cal_data, cal_data);
+  put_tensor_g2l(ctx, bk_ctx, tl_input, reinterpret_cast<u8 *>(input_data));
+  put_tensor_g2l(ctx, bk_ctx, tl_filter, reinterpret_cast<u8 *>(kernel_data));
+
+  {
+    // Reshape per channel quantization data for TIU
+    tl_cal_data->shape = {1, static_cast<u32>(oc), 1, 1};
+    tl_cal_data->stride = bmk1822_tensor_lmem_default_stride(
+        bk_ctx, tl_cal_data->shape, FMT_I8, /*eu_align=*/0);
+
+    bmk1822_tiu_depthwise_convolution_qdm_param_t param;
+    memset(&param, 0, sizeof(param));
+    param.ofmap = tl_output;
+    param.ifmap = tl_input;
+    param.weight = tl_filter;
+    param.chl_quan_param = tl_cal_data;
+    param.ins_h = ins_h;
+    param.ins_last_h = ins_last_h;
+    param.ins_w = ins_w;
+    param.ins_last_w = ins_last_w;
+    param.stride_h = stride_h;
+    param.stride_w = stride_w;
+    param.dilation_h = dh;
+    param.dilation_w = dw;
+    param.pad_top = pad_top;
+    param.pad_bottom = pad_bot;
+    param.pad_left = pad_left;
+    param.pad_right = pad_right;
+    param.has_bias = has_bias;
+    param.relu_enable = relu_enable;
+
+#ifdef ENABLE_DEBUG_MSG
+    printf("    tiu_dw_conv_qdm:\n");
+    printf("    ifmap shape (%d, %d, %d, %d)\n", param.ifmap->shape.n,
+           param.ifmap->shape.c, param.ifmap->shape.h, param.ifmap->shape.w);
+    printf("    weight shape (%d, %d, %d, %d)\n", param.weight->shape.n,
+           param.weight->shape.c, param.weight->shape.h, param.weight->shape.w);
+    printf("    ofmap shape (%d, %d, %d, %d)\n", param.ofmap->shape.n,
+           param.ofmap->shape.c, param.ofmap->shape.h, param.ofmap->shape.w);
+#endif
+
+    bmk1822_tiu_depthwise_convolution_qdm(bk_ctx, &param);
+  }
+
+  test_submit(ctx);
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("    compare result:\n");
+#endif
+  s8 *conv_output_data =
+      reinterpret_cast<s8 *>(get_tensor_l2g(ctx, bk_ctx, tl_output));
+  for (int i = 0; i < output_size; i++) {
+    if (conv_output_data[i] != output_data[i]) {
+      printf("    output_data[%d] %d(tiu) != %d(ref)\n", i,
+             conv_output_data[i], output_data[i]);
+      ret = -1;
+      break;
+    }
+  }
+
+  if (ret) {
+    dump_test_param(p_param, /*dump_content=*/true);
+  }
+
+  // Free in reverse allocation order
+  bmk1822_lmem_free_tensor(bk_ctx, tl_cal_data);
+  bmk1822_lmem_free_tensor(bk_ctx, tl_output);
+  bmk1822_lmem_free_tensor(bk_ctx, tl_filter);
+  bmk1822_lmem_free_tensor(bk_ctx, tl_input);
+
+  free(conv_output_data);
+
+  free(input_data);
+  free(kernel_data);
+  free(output_data);
+  free(bias_data);
+  free(multiplier_data);
+  free(shift_data);
+  free(cal_data);
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("  <= run_compare_dw_conv\n");
+#endif
+
+  return ret;
+}
+
+bool check_valid_test_param(bmk_ctx_t *bk_ctx, dw_conv_test_param_t *p_param)
+{
+  int in = p_param->input_n;
+  int ic = p_param->input_c;
+  int ih = p_param->input_h;
+  int iw = p_param->input_w;
+  int oc = p_param->output_c;
+  int oh = p_param->output_h;
+  int ow = p_param->output_w;
+  int kh = p_param->kh;
+  int kw = p_param->kw;
+  int stride_h = p_param->stride_h;
+  int stride_w = p_param->stride_w;
+  int per_chan_cal_data_size =
+      p_param->has_bias ? 9 : 5;  // bias(4) + multiplier(4) + shift(1)
+
+  // Skip invalid shape
+  if ((kh > ih) || (kw > iw) || (stride_h > ih) || (stride_w > iw)) {
+    return false;
+  }
+
+  // Products of the randomly chosen dimensions may exceed the s32 range,
+  // so use u32 here.
+  u32 input_size = in * ic * ih * iw;
+  u32 kernel_size = ic * kh * kw;  // no oc
+  u32 output_size = in * oc * oh * ow;
+
+  bmk1822_chip_info_t chip_info = bmk1822_chip_info();
+  u32 lmem_size_per_lane = chip_info.lmem_size;
+  u32 total_lmem_size = chip_info.lmem_size * chip_info.npu_num;
+
+  u32 total_needed_size = input_size + kernel_size + output_size +
+                          per_chan_cal_data_size * chip_info.npu_num;
+  if (total_needed_size > total_lmem_size) {
+    return false;
+  }
+
+  tl_shape_t input_shape = {static_cast<u32>(in), static_cast<u32>(ic),
+                            static_cast<u32>(ih), static_cast<u32>(iw)};
+  tl_shape_t filter_shape = {1, static_cast<u32>(oc), static_cast<u32>(kh),
+                             static_cast<u32>(kw)};
+  tl_shape_t output_shape = {static_cast<u32>(in), static_cast<u32>(oc),
+                             static_cast<u32>(oh), static_cast<u32>(ow)};
+  tl_shape_t cal_shape = {1, static_cast<u32>(oc), 1,
+                          static_cast<u32>(per_chan_cal_data_size)};
+
+  u32 needed_size =
+      bmk1822_lmem_tensor_to_size(bk_ctx, input_shape, FMT_I8, /*eu_align=*/1) +
+      bmk1822_lmem_tensor_to_size(bk_ctx, filter_shape, FMT_I8, /*eu_align=*/1) +
+      bmk1822_lmem_tensor_to_size(bk_ctx, output_shape, FMT_I8, /*eu_align=*/1) +
+      bmk1822_lmem_tensor_to_size(bk_ctx, cal_shape, FMT_I8, /*eu_align=*/0);
+
+  // Skip invalid shape
+  if (needed_size > lmem_size_per_lane) {
+    return false;
+  }
+
+  return true;
+}
+
+int random_test(bmctx_t *ctx, bmk_ctx_t *bk_ctx)
+{
+  int ret = 0;
+
+  if (ctx == nullptr || bk_ctx == nullptr) {
+    return -1;
+  }
+
+#ifndef ENABLE_FULL_REGRESSION
+#ifndef ENABLE_TV_GEN_PATTERN
+  // Input with same range size
+  // n: 12b, c: 12b, h: 12b(4095-32), w: 12b(4095-32)
+  int batch_range[] = {1, 1, 2, 4095 - 32};
+  int input_height_range[] = {1, 512, 1024, 4095 - 32};
+  int input_width_range[] = {1, 512, 1024, 4095 - 32};
+  int input_depth_range[] = {1, 16, 32, 4095 - 32};
+
+  // Kernel with same range size
+  // h: 12b, w: 12b
+  // stride_h: 4b, stride_w: 4b
+  int kernel_height_range[] = {1, 11, 2048, 4095};
+  int kernel_width_range[] = {1, 11, 2048, 4095};
+  int kernel_stride_height_range[] = {1, 5, 16, 31};
+  int kernel_stride_width_range[] = {1, 5, 16, 31};
+#else
+  // TV_GEN pattern
+  // Random Test, total 2187, skipped 13095, executed 27, failed 0, ret 0
+
+  // Input with same range size
+  // n: 12b, c: 12b, h: 12b(4095-32), w: 12b(4095-32)
+  int batch_range[] = {1, 1, 3232};
+  int input_height_range[] = {1, 512, 4095 - 32};
+  int input_width_range[] = {1, 512, 4095 - 32};
+  int input_depth_range[] = {1, 16, 4095 - 32};
+
+  // Kernel with same range size
+  // h: 12b, w: 12b
+  // stride_h: 4b, stride_w: 4b
+  int kernel_height_range[] = {1, 11, 2048, 4095};
+  int kernel_width_range[] = {1, 11, 2048, 4095};
+  int kernel_stride_height_range[] = {1, 5, 16, 31};
+  int kernel_stride_width_range[] = {1, 5, 16, 31};
+#endif  // ENABLE_TV_GEN_PATTERN
+#else
+#if 0
+  // Input with same range size
+  int batch_range[] = {1};
+  int input_height_range[] = {1};
+  int input_width_range[] = {1};
+  int input_depth_range[] = {1};
+
+  // Kernel with same range size
+  int kernel_height_range[] = {1};
+  int kernel_width_range[] = {1};
+  int kernel_stride_height_range[] = {1};
+  int kernel_stride_width_range[] = {1};
+  int output_depth_range[] = {1};
+#else
+  // 10/21/2019
+  // Random Test, total 512000, skipped 2535629, executed 24371
+
+  // Input with same range size
+  // n: 12b, c: 12b, h: 12b(4095-32), w: 12b(4095-32)
+  int batch_range[] = {1, 2, 4, 8, 16, 32, 64, 4095 - 32};
+  int input_height_range[] = {1, 3, 11, 128, 512, 1024, 2048, 4095 - 32};
+  int input_width_range[] = {1, 3, 11, 128, 512, 1024, 2048, 4095 - 32};
+  int input_depth_range[] = {1, 3, 11, 32, 64, 1024, 2048, 4095 - 32};
+
+  // Kernel with same range size
+  // h: 12b, w: 12b
+  // stride_h: 4b, stride_w: 4b
+  int kernel_height_range[] = {1, 3, 11, 511, 4095};
+  int kernel_width_range[] = {1, 3, 11, 511, 4095};
+  int kernel_stride_height_range[] = {1, 3, 15, 16, 31};
+  int kernel_stride_width_range[] = {1, 3, 15, 16, 31};
+#endif
+#endif /* ENABLE_FULL_REGRESSION */
+
+  const int input_range_size =
+      sizeof(input_height_range) / sizeof(input_height_range[0]);
+  const int kernel_range_size =
+      sizeof(kernel_height_range) / sizeof(kernel_height_range[0]);
+
+  int random_seed = clock();
+  srand(random_seed);
+
+  const int retry_test_count = 100;
+  bool stop_at_first_error = true;
+
+  int executed_tests = 0;
+  int failed_tests = 0;
+
+  printf("1822-dw-conv-qm: random test =>\n");
+  for (int m = 0; m < retry_test_count; ++m) {
+    for (int i = 0; i < input_range_size; ++i) {
+      // randomly chosen from [range[i] : range[i+1]]
+      int batch = choose_from_range(batch_range, input_range_size, i);
+
+      for (int j = 0; j < input_range_size; ++j) {
+        int input_height =
+            choose_from_range(input_height_range, input_range_size, j);
+
+        for (int k = 0; k < input_range_size; ++k) {
+          int input_width =
+              choose_from_range(input_width_range, input_range_size, k);
+
+          for (int l = 0; l < input_range_size; ++l) {
+            int input_depth =
+                choose_from_range(input_depth_range, input_range_size, l);
+
+            for (int km = 0; km < kernel_range_size; ++km) {
+              int kernel_height =
+                  choose_from_range(kernel_height_range, kernel_range_size, km);
+
+              for (int n = 0; n < kernel_range_size; ++n) {
+                int kernel_width =
+                    choose_from_range(kernel_width_range, kernel_range_size, n);
+
+                for (int x = 0; x < kernel_range_size; ++x) {
+                  int kernel_stride_height = choose_from_range(
+                      kernel_stride_height_range, kernel_range_size, x);
+
+                  for (int y = 0; y < kernel_range_size; ++y) {
+                    int kernel_stride_width = choose_from_range(
+                        kernel_stride_width_range, kernel_range_size, y);
+
+                    int has_bias = rand() % 2;
+                    int dh = 1;
+                    int dw = 1;
+                    int ins_h = 0;
+                    int ins_h_last = 0;
+                    int ins_w = 0;
+                    int ins_w_last = 0;
+                    int pad_top = 0;
+                    int pad_bot = 0;
+                    int pad_left = 0;
+                    int pad_right = 0;
+
+                    int ih_ext = calc_dilute_hw(input_height, ins_h, ins_h_last,
+                                                pad_top, pad_bot);
+                    int iw_ext = calc_dilute_hw(input_width, ins_w, ins_w_last,
+                                                pad_left, pad_right);
+                    int kh_ext = calc_dilute_hw(kernel_height, dh - 1, 0, 0, 0);
+                    int kw_ext = calc_dilute_hw(kernel_width, dw - 1, 0, 0, 0);
+
+                    int oh =
+                        calc_output_hw(ih_ext, kh_ext, kernel_stride_height);
+                    int ow =
+                        calc_output_hw(iw_ext, kw_ext, kernel_stride_width);
+
+                    // depthwise, input depth == output depth
+                    int output_depth = input_depth;
+
+                    dw_conv_test_param_t test_param;
+                    memset(&test_param, 0, sizeof(test_param));
+                    test_param.input_n = batch;
+                    test_param.input_c = input_depth;
+                    test_param.input_h = input_height;
+                    test_param.input_w = input_width;
+                    test_param.kh = kernel_height;
+                    test_param.kw = kernel_width;
+                    test_param.dh = dh;
+                    test_param.dw = dw;
+                    test_param.pad_top = pad_top;
+                    test_param.pad_bot = pad_bot;
+                    test_param.pad_left = pad_left;
+                    test_param.pad_right = pad_right;
+                    test_param.ins_h = ins_h;
+                    test_param.ins_h_last = ins_h_last;
+                    test_param.ins_w = ins_w;
+                    test_param.ins_w_last = ins_w_last;
+                    test_param.stride_h = kernel_stride_height;
+                    test_param.stride_w = kernel_stride_width;
+                    test_param.output_c = output_depth;
+                    test_param.output_h = oh;
+                    test_param.output_w = ow;
+                    test_param.has_bias = has_bias;
+                    test_param.retry_cnt = 5;
+
+                    bool is_valid_param =
+                        check_valid_test_param(bk_ctx, &test_param);
+                    if (is_valid_param == false) {
+                      continue;
+                    }
+
+                    int ret2 = run_compare_dw_conv(ctx, bk_ctx, &test_param);
+                    failed_tests = ret2 ? failed_tests + 1 : failed_tests;
+                    ret |= ret2;
+                    executed_tests++;
+
+#ifdef ENABLE_DEBUG_MSG
+                    printf("  [%d] random test: input shape(%d, %d, %d, %d)",
+                           executed_tests, batch, input_depth,
+                           input_height, input_width);
+                    printf(", kernel shape (%d, %d, %d, %d), result %d\n",
+                           output_depth, input_depth, kernel_height,
+                           kernel_width, ret);
+#endif
+
+                    // Stop at first error
+                    if (ret && stop_at_first_error) {
+                      break;
+                    }
+                  }
+
+                  // Stop at first error
+                  if (ret && stop_at_first_error) {
+                    break;
+                  }
+                }
+
+                // Stop at first error
+                if (ret && stop_at_first_error) {
+                  break;
+                }
+              }
+
+              // Stop at first error
+              if (ret && stop_at_first_error) {
+                break;
+              }
+            }
+
+            // Stop at first error
+            if (ret && stop_at_first_error) {
+              break;
+            }
+          }
+
+          // Stop at first error
+          if (ret && stop_at_first_error) {
+            break;
+          }
+        }
+
+        // Stop at first error
+        if (ret && stop_at_first_error) {
+          break;
+        }
+      }
+
+      // Stop at first error
+      if (ret && stop_at_first_error) {
+        break;
+      }
+    }
+
+    // Stop at first error
+    if (ret && stop_at_first_error) {
+      break;
+    }
+
+    if (executed_tests >= MIN_EXEC_TESTS) {
+      break;
+    }
+  }
+
+  printf("<= 1822-dw-conv-qm: random test, total %d, failed %d, ret %d\n",
+         executed_tests, failed_tests, ret);
+
+  return ret;
+}
+
+int main()
+{
+  int ret = 0;
+  bmctx_t ctx;
+  bmk_ctx_t *bk_ctx;
+  test_init(&ctx, &bk_ctx);
+
+  // ret = simple_nhwc_dw_conv_test(&ctx, bk_ctx);
+  // ret |= test_nhwc_to_nchw();
+  ret |= simple_dw_conv_test(&ctx, bk_ctx);
+  ret |= random_test(&ctx, bk_ctx);
+
+  test_exit(&ctx);
+  return ret;
+}
diff --git a/cviruntime/test/1822/test_1822_depthwise_max_power.cpp b/cviruntime/test/1822/test_1822_depthwise_max_power.cpp
new file mode 100644
index 000000000..a7e2c3aee
--- /dev/null
+++ b/cviruntime/test/1822/test_1822_depthwise_max_power.cpp
@@ -0,0 +1,600 @@
+#include "1822_test_util.h"
+
+typedef bmk1822_tiu_depthwise_convolution_param_t depthwise_conv_param_t;
+
+typedef bmk1822_tdma_l2tg_tensor_copy_cw_transposed_param_t l2tg_cw_param_t;
+typedef bmk1822_tdma_tg2l_matrix_copy_row_col_transposed_param_t tg2l_matrix_param_t;
+typedef bmk1822_tdma_l2l_tensor_copy_param_t l2l_tensor_copy_param_t;
+
+typedef struct {
+  s8 *depthwise_conv_input;
+  s8 *depthwise_conv_weight;
+  s16 *depthwise_conv_bias;
+  u8 *depthwise_conv_output;
+  s8 *depthwise_conv_output_ref;
+  u8 *l2g_cw_src;
+  u8 *l2g_cw_output;
+  u8 *l2g_cw_output_ref;
+  u8 *g2l_matrix_src;
+  u8 *g2l_matrix_output;
+  u8 *g2l_matrix_output_ref;
+  u8 *l2l_tensor_src;
+  u8 *l2l_tensor_output;
+  u8 *l2l_tensor_output_ref;
+} s_test_data;
+
+depthwise_conv_param_t depthwise_conv_param;
+l2tg_cw_param_t l2tg_cw_param;
+tg2l_matrix_param_t tg2l_matrix_param;
+l2l_tensor_copy_param_t l2l_tensor_copy_param;
+s_test_data s8_test_data;
+
+bmk1822_tensor_lmem_t *skip_tensor_lmem[10];
+u32 skip_tensor_num = 0;
+
+void skip_tensor_lmem_size(bmk_ctx_t *bmk, const bmk1822_tensor_lmem_t *p)
+{
+  if (!p)
+    return;
+
+  u32 needed = align_up(p->shape.n * p->stride.n, BM1822_HW_EU_NUM);
+  u32 start_addr = p->start_address + needed;
+  u32 remain_size = start_addr % BM1822_HW_LMEM_BANK_SIZE ?
+      (BM1822_HW_LMEM_BANK_SIZE - start_addr % BM1822_HW_LMEM_BANK_SIZE) :
+      0;  // remaining bytes in the current bank, per lane
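+  // Worked example with illustrative numbers: if a tensor ends at lane
+  // offset 0x1234 and the bank size is 0x1000, then remain_size is
+  // 0x1000 - 0x234 = 0xdcc, and the dummy allocation below pushes the next
+  // tensor to the 0x2000 bank boundary.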
+  if (remain_size) {
+    tl_shape_t src_shape2 = {1, BM1822_HW_NPU_NUM, 1, remain_size};
+    // pad out the rest of the bank so the next tl allocation starts
+    // bank-aligned
+    skip_tensor_lmem[skip_tensor_num] = alloc_tl(bmk, src_shape2, FMT_I8, 1);
+  }
+  skip_tensor_num++;
+}
+
+void skip_matrix_lmem_size(bmk_ctx_t *bmk, const bmk1822_matrix_lmem_t *p)
+{
+  u32 needed = align_up(p->shape.n * p->stride.n, BM1822_HW_EU_NUM);
+
+  u32 start_addr = p->start_address + needed;  //src_shape.n*src_shape.c*src_shape.h*src_shape.w/32;
+  u32 remain_size = start_addr % BM1822_HW_LMEM_BANK_SIZE ?
+      (BM1822_HW_LMEM_BANK_SIZE - start_addr % BM1822_HW_LMEM_BANK_SIZE) :
+      0;  // remaining bytes in the current bank, per lane
+  if (remain_size) {
+    tl_shape_t src_shape2 = {1, BM1822_HW_NPU_NUM, 1, remain_size};
+    // pad out the rest of the bank so the next tl allocation starts
+    // bank-aligned
+    skip_tensor_lmem[skip_tensor_num] = alloc_tl(bmk, src_shape2, FMT_I8, 1);
+  }
+  skip_tensor_num++;
+}
+
+void free_skip_tensor_lmem(bmk_ctx_t *ctx)
+{
+  if (skip_tensor_lmem[--skip_tensor_num] != NULL)
+    free_tl(ctx, skip_tensor_lmem[skip_tensor_num]);
+}
+
+static s8 * alloc_input(const depthwise_conv_param_t *p)
+{
+  u64 size = tl_shape_size(&p->ifmap->shape);
+  s8 *buf = (s8 *)malloc(sizeof(s8) * size);
+
+  for (u64 i = 0; i < size; i++)
+    buf[i] = rand() % 256 - 128;
+
+  return buf;
+}
+
+static s8 * alloc_weight(const depthwise_conv_param_t *p)
+{
+  int size = tl_shape_size(&p->weight->shape);
+  s8 *buf = (s8 *)malloc(sizeof(s8) * size);
+  for (int i = 0; i < size; i++)
+    buf[i] = rand() % 256 - 128;
+
+  return buf;
+}
+
+static s16 * alloc_bias(const depthwise_conv_param_t *p)
+{
+  int c = p->bias->shape.c;
+  s16 *bias = (s16 *)malloc(sizeof(s16) * c);
+
+  for (int i = 0; i < c; i++)
+    bias[i] = rand() % 65536 - 32768;
+
+  return bias;
+}
+
+static s8 *alloc_output(depthwise_conv_param_t *p)
+{
+  u64 size = tl_shape_size(&p->ofmap->shape);
+  s8 *output = (s8 *)malloc(sizeof(s8) * size);
+  return output;
+}
+
+static inline void relu8(s8 *buf, u64 size)
+{
+  for (u64 i = 0; i < size; i++)
+    if (buf[i] < 0)
+      buf[i] = 0;
+}
+
+static void generate_results(
+    depthwise_conv_param_t *p,
+    s8 input[],
+    s8 weight[],
+    s16 bias[])
+{
+  int in = p->ifmap->shape.n;
+  int ic = p->ifmap->shape.c;
+  int ih = p->ifmap->shape.h;
+  int iw = p->ifmap->shape.w;
+  int kh = p->weight->shape.h;
+  int kw = p->weight->shape.w;
+  int opd0_sign = (p->ifmap->fmt == FMT_I8);
+  int res0_sign = (p->ofmap->fmt == FMT_I8);
+  s8_test_data.depthwise_conv_output_ref = alloc_output(p);
+
+  bmerr_t ret = native_pooling_ave_int8(
+      input, weight, p->bias ? bias : NULL,
+      s8_test_data.depthwise_conv_output_ref,
+      in, ic, ih, iw, kh, kw,
+      p->pad_top, p->pad_bottom, p->pad_left, p->pad_right,
+      p->stride_h, p->stride_w,
+      p->ins_h, p->ins_w, p->ins_last_h, p->ins_last_w,
+      opd0_sign, res0_sign, p->rshift_bits, 0);
+  assert(ret == BM_SUCCESS);
+
+  if (p->relu_enable)
+    relu8(s8_test_data.depthwise_conv_output_ref,
+          tl_shape_size(&p->ofmap->shape));
+}
+
+static int pooling_ih_ext(depthwise_conv_param_t *p, int ih)
+{
+  int ins = p->ins_h;
+  int ins_last = p->ins_last_h;
+  int pad = p->pad_top + p->pad_bottom;
+  return (ih - 1) * (ins + 1) + ins_last + 1 + pad;
+}
+
+static int pooling_iw_ext(depthwise_conv_param_t *p, int iw)
+{
+  int ins = p->ins_w;
+  int ins_last = p->ins_last_w;
+  int pad = p->pad_left + p->pad_right;
+  return (iw - 1) * (ins + 1) + ins_last + 1 + pad;
+}
+
+static int pooling_oh(depthwise_conv_param_t *p, int ih, int kh)
+{
+  int ih_ext = pooling_ih_ext(p, ih);
+  return (ih_ext - kh) / p->stride_h + 1;
+}
+
+static int pooling_ow(depthwise_conv_param_t *p, int iw, int kw)
+{
+  int iw_ext = pooling_iw_ext(p, iw);
+  return (iw_ext - kw) / p->stride_w + 1;
+}
+
+static void free_depthwise_param(
+    bmk_ctx_t *ctx,
+    depthwise_conv_param_t *p)
+{
+  if (p->bias) {
+    free_skip_tensor_lmem(ctx);
+    free_tl(ctx, p->bias);
+  }
+  if (p->weight) {
+    free_skip_tensor_lmem(ctx);
+    free_tl(ctx, p->weight);
+  }
+  if (p->ifmap) {
+    free_skip_tensor_lmem(ctx);
+    free_tl(ctx, p->ifmap);
+  }
+  if (p->ofmap) {
+    free_skip_tensor_lmem(ctx);
+    free_tl(ctx, p->ofmap);
+  }
+}
+
+static void put_bias_tensor(
+    bmctx_t *ctx,
+    bmk_ctx_t *bk_ctx,
+    const tl_t *tl,
+    s16 data[])
+{
+  int c = tl->shape.c;
+
+  u8 *lo_hi = (u8 *)xmalloc(2 * c);
+  if (!lo_hi)
+    return;
+
+  for (int i = 0; i < c; i++) {
+    lo_hi[i] = data[i] & 0xff;
+    lo_hi[i + c] = (data[i] >> 8) & 0xff;
+  }
+
+  put_tensor_g2l(ctx, bk_ctx, tl, (u8 *)lo_hi);
+
+  free(lo_hi);
+}
+
+static depthwise_conv_param_t random_depthwise_param(bmk_ctx_t *ctx)
+{
+  srand(clock());
+  depthwise_conv_param_t p;
+
+  memset(&p, 0, sizeof(p));
+
+retry:
+  int using_bias = 0;
+  int n = 1;
+  int c = 1000;
+  int ih = 2;
+  int iw = 8;
+  int kh = 1;
+  int kw = 1;
+  int opd0_sign = 0;
+
+  p.ins_h = rand() % kh;
+  p.ins_w = rand() % kw;
+  p.ins_last_h = rand() % kh;
+  p.ins_last_w = rand() % kw;
+  p.stride_h = rand() % kh + 1;
+  p.stride_w = rand() % kw + 1;
+  p.pad_top = 0;
+  p.pad_bottom = 0;
+  p.pad_left = 0;
+  p.pad_right = 0;
+  p.rshift_bits = 2;
+  int oh = pooling_oh(&p, ih, kh);
+  int ow = pooling_ow(&p, iw, kw);
+  tl_shape_t ofmap_shape;
+  ofmap_shape.n = n;
+  ofmap_shape.c = c;
+  ofmap_shape.h = oh;
+  ofmap_shape.w = ow;
+  tl_shape_t ifmap_shape;
+  ifmap_shape.n = n;
+  ifmap_shape.c = c;
+  ifmap_shape.h = ih;
+  ifmap_shape.w = iw;
+  tl_shape_t weight_shape;
+  weight_shape.n = 1;
+  weight_shape.c = c;
+  weight_shape.h = kh;
+  weight_shape.w = kw;
+  tl_shape_t bias_shape;
+  bias_shape.n = 2;
+  bias_shape.c = c;
+  bias_shape.h = 1;
+  bias_shape.w = 1;
+  p.relu_enable = 1;
+  /* test case ref does not support dilation != 1 */
+  p.dilation_w = 1;
+  p.dilation_h = 1;
+  fmt_t ifmt = opd0_sign ? FMT_I8 : FMT_U8;
+
+  p.ofmap = bmk1822_lmem_alloc_tensor(ctx, ofmap_shape, FMT_I8, 1);
+  skip_tensor_lmem_size(ctx, p.ofmap);
+  p.ifmap = bmk1822_lmem_alloc_tensor(ctx, ifmap_shape, ifmt, 1);
+  skip_tensor_lmem_size(ctx, p.ifmap);
+  p.weight = bmk1822_lmem_alloc_tensor(ctx, weight_shape, FMT_I8, 1);
+  skip_tensor_lmem_size(ctx, p.weight);
+  p.bias = NULL;
+  if (using_bias) {
+    p.bias = bmk1822_lmem_alloc_tensor(ctx, bias_shape, FMT_I8, 0);
+    skip_tensor_lmem_size(ctx, p.bias);
+  }
+  if ((kh > pooling_ih_ext(&p, ih))
+      || (kw > pooling_iw_ext(&p, iw))
+      || (p.pad_top >= (1 << 4))
+      || (p.pad_bottom >= (1 << 4))
+      || (p.pad_left >= (1 << 4))
+      || (p.pad_right >= (1 << 4))
+      || !p.ofmap
+      || !p.ifmap
+      || !p.weight
+      || (using_bias && !p.bias)) {
+    printf("retry random_depthwise_param\n");
+    free_depthwise_param(ctx, &p);
+    goto retry;
+  }
+  return p;
+}
+
+
+static int test_pooling(bmctx_t ctx, bmk_ctx_t *bk_ctx)
+{
+  depthwise_conv_param = random_depthwise_param(bk_ctx);
+
+  s8 *input = alloc_input(&depthwise_conv_param);
+  s8 *weight = alloc_weight(&depthwise_conv_param);
+  s16 *bias = NULL;
+  if (depthwise_conv_param.bias)
+    bias = alloc_bias(&depthwise_conv_param);
+
+  put_tensor_g2l(&ctx, bk_ctx, depthwise_conv_param.ifmap, (u8 *)input);
+  put_tensor_g2l(&ctx, bk_ctx, depthwise_conv_param.weight, (u8 *)weight);
+  if (depthwise_conv_param.bias)
+    put_bias_tensor(&ctx, bk_ctx, depthwise_conv_param.bias, bias);
+
+  generate_results(&depthwise_conv_param, input, weight, bias);
+
+  free(input);
+  free(weight);
+  free(bias);
+
+  return 1;
+}
+
+static void l2tg_tensor_copy_cw_transposed_ref(
+    l2tg_cw_param_t *p, u8 ref_data[], u8 src_data[])
+{
+  tl_shape_t s = p->src->shape;
+  u32 n = s.n;
+  u32 c = s.c;
+  u32 h = s.h;
+  u32 w = s.w;
+
+  for (u32 ni = 0; ni < n; ni++) {
+    for (u32 ci = 0; ci < c; ci++) {
+      for (u32 hi = 0; hi < h; hi++) {
+        for (u32 wi = 0; wi < w; wi++) {
+          u32 src_i = ni * c * h * w + ci * h * w + hi * w + wi;
+          u32 dst_i = ni * c * h * w + wi * h * c + hi * c + ci;
+          ref_data[dst_i] = src_data[src_i];
+        }
+      }
+    }
+  }
+}
+
+static void test_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, l2tg_cw_param_t *p)
+{
+  u64 size = tl_shape_size(&p->src->shape);
+
+  s8_test_data.l2g_cw_src = (u8 *)malloc(sizeof(u8) * size);
+  if (!s8_test_data.l2g_cw_src)
+    return;
+
+  for (u64 i = 0; i < size; i++)
+    s8_test_data.l2g_cw_src[i] = rand() % 0x100;
+
+  s8_test_data.l2g_cw_output_ref = (u8 *)malloc(sizeof(u8) * size);
+  if (!s8_test_data.l2g_cw_output_ref)
+    return;
+
+  l2tg_tensor_copy_cw_transposed_ref(p, s8_test_data.l2g_cw_output_ref,
+                                     s8_test_data.l2g_cw_src);
+
+  put_tensor_g2l(ctx, bmk, p->src, s8_test_data.l2g_cw_src);
+}
+
+static void destroy_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, l2tg_cw_param_t *p)
+{
+  free_tg_gmem(ctx, p->dst);
+  free_skip_tensor_lmem(bmk);
+  free_tl(bmk, p->src);
+}
+
+static void test_l2tg_cw_transpose(bmctx_t *ctx, bmk_ctx_t *bmk, l2tg_cw_param_t *p)
+{
+  tl_shape_t src_shape = {1, 0x100, 1, 0x020};
+  tg_shape_t dst_shape = {1, 0x020, 1, 0x100};
+
+  p->src = alloc_tl(bmk, src_shape, FMT_I8, 1);
+  p->dst = alloc_tg_gmem(ctx, dst_shape, FMT_I8);
+  skip_tensor_lmem_size(bmk, p->src);
+  test_param_l2g(ctx, bmk, p);
+}
+
+static void tg2l_matrix_copy_row_col_transposed_ref(
+    tg2l_matrix_param_t *p, u8 ref_data[], u8 src_data[])
+{
+  u64 row = p->src->shape.row;
+  u64 col = p->src->shape.col;
+
+  for (u64 ri = 0; ri < row; ri++) {
+    for (u64 ci = 0; ci < col; ci++) {
+      u64 src_i = ri * col + ci;
+      u64 dst_i = ci * row + ri;
+      ref_data[dst_i] = src_data[src_i];
+    }
+  }
+}
+
+static void test_param_g2l(bmctx_t *ctx, tg2l_matrix_param_t *p)
+{
+  u64 size = ml_shape_size(&p->dst->shape);
+
+  s8_test_data.g2l_matrix_src = (u8 *)malloc(sizeof(u8) * size);
+  if (!s8_test_data.g2l_matrix_src)
+    return;
+
+  for (u64 i = 0; i < size; i++)
+    s8_test_data.g2l_matrix_src[i] = rand() % 0x100;
+
+  s8_test_data.g2l_matrix_output_ref = (u8 *)malloc(sizeof(u8) * size);
+  if (!s8_test_data.g2l_matrix_output_ref)
+    return;
+
+  tg2l_matrix_copy_row_col_transposed_ref(p, s8_test_data.g2l_matrix_output_ref,
+                                          s8_test_data.g2l_matrix_src);
+
+  put_mg_gmem(ctx, p->src, s8_test_data.g2l_matrix_src);
+}
+
+static void destroy_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, tg2l_matrix_param_t *p)
+{
+  free_mg_gmem(ctx, p->src);
+  free_skip_tensor_lmem(bmk);
+  free_ml(bmk, p->dst);
+}
+
+
+static void test_tg2l_matrix_transpose(bmctx_t *ctx, bmk_ctx_t *bmk, tg2l_matrix_param_t *p)
+{
+  //tg2l_matrix_param_t p;
+  /*
+   * The matrix transpose requires n/c-stride alignment
+   * due to a TDMA limitation.
+   */
+  mg_shape_t src_shape = {0x100, 0x20};
+  ml_shape_t dst_shape = {0x20, 0x10, 0x10, 0x100};
+
+  int dst_align = 1;
+
+  p->src = alloc_mg_gmem(ctx, src_shape);
+  p->dst = alloc_ml(bmk, dst_shape, dst_align);
+  skip_matrix_lmem_size(bmk, p->dst);
+  test_param_g2l(ctx, p);
+}
+
+static void l2l_tensor_copy_ref(l2l_tensor_copy_param_t *p, u8 ref_data[], u8 src_data[])
+{
+  u64 size = tl_shape_size(&p->src->shape);
+
+  for (u64 i = 0; i < size; i++)
+    ref_data[i] = src_data[i];
+}
+
+static void test_l2l_param(bmctx_t *ctx, bmk_ctx_t *bmk, l2l_tensor_copy_param_t *p)
+{
+  u64 size = tl_shape_size(&p->src->shape);
+
+  s8_test_data.l2l_tensor_src = (u8 *)malloc(sizeof(u8) * size);
+  if (!s8_test_data.l2l_tensor_src)
+    return;
+
+  for (u64 i = 0; i < size; i++)
+    s8_test_data.l2l_tensor_src[i] = rand() % 0x100;
+
+  s8_test_data.l2l_tensor_output_ref = (u8 *)malloc(sizeof(u8) * size);
+  if (!s8_test_data.l2l_tensor_output_ref)
+    return;
+
+  l2l_tensor_copy_ref(p, s8_test_data.l2l_tensor_output_ref,
+                      s8_test_data.l2l_tensor_src);
+
+  put_tensor_g2l(ctx, bmk, p->src, s8_test_data.l2l_tensor_src);
+}
+
+static void destroy_param_l2l(bmk_ctx_t *bmk, l2l_tensor_copy_param_t *p)
+{
+  free_skip_tensor_lmem(bmk);
+  free_tl(bmk, p->dst);
+  free_skip_tensor_lmem(bmk);
+  free_tl(bmk, p->src);
+}
+
+static void test_l2l_tensor_copy(bmctx_t *ctx, bmk_ctx_t *bmk, l2l_tensor_copy_param_t *p)
+{
+  tl_shape_t src_shape = {1, 0x10, 0x1, 0x100};
+  tl_shape_t dst_shape = {1, 0x10, 0x1, 0x100};
+
+  p->src = alloc_tl(bmk, src_shape, FMT_I8, 1);
+  skip_tensor_lmem_size(bmk, p->src);
+  p->dst = alloc_tl(bmk, dst_shape, FMT_I8, 1);
+  skip_tensor_lmem_size(bmk, p->dst);
+  test_l2l_param(ctx, bmk, p);
+}
+
+void get_result(bmctx_t *ctx, bmk_ctx_t *bmk)
+{
+  s8_test_data.depthwise_conv_output =
+      get_tensor_l2g(ctx, bmk, depthwise_conv_param.ofmap);
+  s8_test_data.l2g_cw_output = get_tg_gmem(ctx, l2tg_cw_param.dst);
+  s8_test_data.g2l_matrix_output =
+      get_matrix_l2g(ctx, bmk, tg2l_matrix_param.dst);
+  s8_test_data.l2l_tensor_output =
+      get_tensor_l2g(ctx, bmk, l2l_tensor_copy_param.dst);
+}
+
+void check_result()
+{
+  int cmp_res = array_cmp_int8(
+      "Comparing results ...\n", s8_test_data.depthwise_conv_output_ref,
+      (s8 *)s8_test_data.depthwise_conv_output,
+      tl_shape_size(&depthwise_conv_param.ofmap->shape));
+  if (cmp_res != 0) {
+    printf("Comparison FAILED!!!\n");
+    exit(-1);
+  }
+
+  for (u64 i = 0; i < tl_shape_size(&l2tg_cw_param.src->shape); i++) {
+    if (s8_test_data.l2g_cw_output[i] != s8_test_data.l2g_cw_output_ref[i]) {
+      fprintf(stderr,
+              "l2g_cw comparing failed at dst[%" PRIu64 "], got %d, exp %d\n",
+              i, s8_test_data.l2g_cw_output[i],
+              s8_test_data.l2g_cw_output_ref[i]);
+      exit(-1);
+    }
+  }
"l2g_cw comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, s8_test_data.l2g_cw_output[i], s8_test_data.l2g_cw_output_ref[i]); + exit(-1); + } + } + for (u64 i = 0; i < ml_shape_size(&tg2l_matrix_param.dst->shape); i++) { + if (s8_test_data.g2l_matrix_output[i] != s8_test_data.g2l_matrix_output_ref[i]) { + fprintf(stderr, "g2l_matrix comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, s8_test_data.g2l_matrix_output[i], s8_test_data.g2l_matrix_output_ref[i]); + exit(-1); + } + } + + for (u64 i = 0; i < tl_shape_size(&l2l_tensor_copy_param.src->shape); i++) { + if (s8_test_data.l2l_tensor_output[i] != s8_test_data.l2l_tensor_output_ref[i]) { + fprintf(stderr, "l2l_tensor comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, s8_test_data.l2l_tensor_output[i], s8_test_data.l2l_tensor_output_ref[i]); + exit(-1); + } + } + + +} + +void trigger_max_power(bmctx_t *ctx, bmk_ctx_t *bmk) +{ + bmk1822_parallel_enable(bmk); + bmk1822_tdma_l2g_tensor_copy_cw_transposed(bmk, &l2tg_cw_param); + bmk1822_tdma_g2l_matrix_copy_row_col_transposed(bmk, &tg2l_matrix_param); + bmk1822_tdma_l2l_tensor_copy(bmk, &l2l_tensor_copy_param); + bmk1822_tiu_depthwise_convolution(bmk, &depthwise_conv_param); + bmk1822_parallel_disable(bmk); + test_submit(ctx); +} + +void free_s8_data() +{ + free(s8_test_data.depthwise_conv_input); + free(s8_test_data.depthwise_conv_weight); + free(s8_test_data.depthwise_conv_bias); + free(s8_test_data.depthwise_conv_output); + free(s8_test_data.depthwise_conv_output_ref); + free(s8_test_data.l2g_cw_src); + free(s8_test_data.l2g_cw_output); + free(s8_test_data.l2g_cw_output_ref); + free(s8_test_data.g2l_matrix_src); + free(s8_test_data.g2l_matrix_output); + free(s8_test_data.g2l_matrix_output_ref); + free(s8_test_data.l2l_tensor_src); + free(s8_test_data.l2l_tensor_output); + free(s8_test_data.l2l_tensor_output_ref); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + printf("depthwise max_power test\n"); + + test_pooling(ctx, bk_ctx); + test_l2tg_cw_transpose(&ctx, bk_ctx, &l2tg_cw_param); + test_tg2l_matrix_transpose(&ctx, bk_ctx, &tg2l_matrix_param); + test_l2l_tensor_copy(&ctx, bk_ctx, &l2l_tensor_copy_param); + + trigger_max_power(&ctx, bk_ctx); + get_result(&ctx, bk_ctx); + check_result(); + + destroy_param_l2l(bk_ctx,&l2l_tensor_copy_param); + destroy_param_g2l(&ctx, bk_ctx, &tg2l_matrix_param); + destroy_param_l2g(&ctx, bk_ctx, &l2tg_cw_param); + free_depthwise_param(bk_ctx, &depthwise_conv_param); + free_s8_data(); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_double_conv.cpp b/cviruntime/test/1822/test_1822_double_conv.cpp new file mode 100644 index 000000000..ea5607de8 --- /dev/null +++ b/cviruntime/test/1822/test_1822_double_conv.cpp @@ -0,0 +1,737 @@ +#include "1822_test_util.h" + +typedef struct { + int random_seed; + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int stride_w; + int output_c; + int using_bias; + int bReLU_EN; + int r_shift_m; + int opd0_sign; + int opd1_sign; + int opd2_sign; +} conv_param_t; + +static int index_get(int h, int w1, int w2) +{ + return h * w1 + w2; +} + +static int matrix_dot_mult( + s8 *A, s8 *B, int dim_n, int dim_m, + int opd0_sign) +{ + int sum = 0; + for (int i=0; iinput_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = 
+
+static int conv_ref(
+    const conv_param_t *p_param,
+    const s8 *ifmap,
+    const s8 *weight,
+    const s16 *bias,
+    s8 *ofmap)
+{
+  int in = p_param->input_n;
+  int ic = p_param->input_c;
+  int ih = p_param->input_h;
+  int iw = p_param->input_w;
+  int oc = p_param->output_c;
+  int kh = p_param->kh;
+  int kw = p_param->kw;
+  int dh = p_param->dh;
+  int dw = p_param->dw;
+  int stride_h = p_param->stride_h;
+  int stride_w = p_param->stride_w;
+  int pad_top = p_param->pad_top;
+  int pad_bot = p_param->pad_bot;
+  int pad_left = p_param->pad_left;
+  int pad_right = p_param->pad_right;
+  int ins_h = p_param->ins_h;
+  int ins_h_last = p_param->ins_h_last;
+  int ins_w = p_param->ins_w;
+  int ins_w_last = p_param->ins_w_last;
+  int input_sign = p_param->opd0_sign;
+  int r_shift_bits = p_param->r_shift_m;
+  int do_relu = p_param->bReLU_EN;
+
+  int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot);
+  int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right);
+  int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0);
+  int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0);
+
+  int oh = calc_output_hw(ih_ext, kh_ext, stride_h);
+  int ow = calc_output_hw(iw_ext, kw_ext, stride_w);
+
+  int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow);
+  s8 *i_fmap_pad_ker = (s8 *)malloc(kh_ext * kw_ext);
+  if (!result || !i_fmap_pad_ker) {
+    free(result);
+    free(i_fmap_pad_ker);
+    return BM_ERR_FAILURE;
+  }
+
+  memset(result, 0, sizeof(int) * in * oc * oh * ow);
+
+  int ret = BM_SUCCESS;
+
+  s8 *i_fmap_pad = NULL;
+  s8 *kernel_after = NULL;
+  for (int n = 0; n < in; ++n) {
+    for (int c = 0; c < oc; ++c) {
+      for (int cc = 0; cc < ic; ++cc) {
+        fill_pad_fmap_int8(
+            (int8_t *)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0,
+            pad_left, pad_right, pad_top, pad_bot,
+            ins_h, ins_w, ins_h_last, ins_w_last, ih, iw);
+
+        //kernel_dilation(
+        fill_pad_fmap_int8(
+            (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0,
+            0, 0, 0, 0,  // no padding
+            dh - 1, dw - 1, 0, 0,
+            kh, kw);
+
+        for (int ph = 0; ph < oh; ++ph) {
+          for (int pw = 0; pw < ow; ++pw) {
+            for (int idxh = 0; idxh < kh_ext; ++idxh)
+              for (int idxw = 0; idxw < kw_ext; ++idxw) {
+                i_fmap_pad_ker[idxh * kw_ext + idxw] =
+                    i_fmap_pad[(idxh + ph*stride_h) * iw_ext +
+                               idxw + pw*stride_w];
+              }
+            result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] +=
+                matrix_dot_mult(i_fmap_pad_ker, kernel_after,
+                                kh_ext, kw_ext, input_sign);
+          }
+        }
+      }
+
+      if (p_param->using_bias) {
+        for (int ph = 0; ph < oh; ++ph) {
+          for (int pw = 0; pw < ow; ++pw) {
+            result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c];  // bias[c]
+          }
+        }
+      }
+
+      if (do_relu)
+        relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow);
+
+      // ofmap is s8, signed
+      ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow,
+                        &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits,
+                        /*round_floor=*/1, /*sign_unsign=*/1);
+
+      if (ret != BM_SUCCESS)
+        goto error_release;
+    }  //end for (int c = 0; c < oc; ++c)
+  }  //end for (int n = 0; n < in; n++)
+
+error_release:
+  free(i_fmap_pad);
+  free(kernel_after);
+  free(i_fmap_pad_ker);
+  free(result);
+
+  return ret;
+}
+
+static u8 * transform_weight(const tl_shape_t *s, u8 before[])
+{
+  u32 ic = s->n;
+  u32 oc = s->c;
+  u32 kh = s->h;
+  u32 kw = s->w;
+
+  u32 size = ic * oc * kh * kw;
+  u8 *after = (u8 *)malloc(sizeof(u8) * size);
+
+  /*
+   * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic)
+   */
+  for (u32 oci = 0; oci < oc; oci++) {
+    for (u32 ici = 0; ici < ic; ici++) {
+      for (u32 khi = 0; khi < kh; khi++) {
+        for (u32 kwi = 0; kwi < kw; kwi++) {
+          u32 src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi;
+          u32 dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici;
+          after[dst_i] = before[src_i];
+        }
+      }
+    }
+  }
+
+  return after;
+}
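+
+// Index check for transform_weight() above (illustrative numbers): with
+// oc = 2, ic = 2, kh = 1, kw = 2, the element (oci=1, ici=0, kwi=1) moves
+// from src_i = 1*2*1*2 + 0 + 0 + 1 = 5 to dst_i = 1*1*2*2 + 0 + 1*2 + 0 = 6,
+// i.e. all input channels of one kernel position end up contiguous, which is
+// the (1, oc, kh * kw, ic) layout the TDMA load in put_conv_weight() expects.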
+
+static void put_conv_weight(
+    bmctx_t *ctx,
+    bmk_ctx_t *bk_ctx,
+    const tl_t *tl,
+    u8 *data)
+{
+  const tl_shape_t *s = &tl->shape;
+  u32 ic = s->n;
+  u32 oc = s->c;
+  u32 kh = s->h;
+  u32 kw = s->w;
+
+  bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw);
+  bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms));
+  gaddr_t gaddr = bmmem_device_addr(dev_mem);
+  u8 *transformed_data = transform_weight(s, data);
+
+  /* We put the weight in region 1. bm_memcpy_s2d regards dev_mem as an
+   * absolute address, so we would have to pass an absolute address to copy
+   * the weight to the right place.
+   */
+
+  //u64 ab_addr = bm_device_read_base_reg(*ctx, 1);
+  //bmmem_device_t ab_dev_mem =
+  //    bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms);
+
+  //int ret = bm_memcpy_s2d(*ctx, ab_dev_mem, transformed_data);
+  int ret = bm_memcpy_s2d(*ctx, dev_mem, transformed_data);
+  assert(ret == BM_SUCCESS);
+
+  tl_shape_t tdma_shape = {1, oc, kh * kw, ic};
+
+  tg_t tdma_tg;
+  tdma_tg.base_reg_index = 0;
+  tdma_tg.start_address = gaddr;
+  tdma_tg.fmt = FMT_I8;
+  tdma_tg.shape.n = tdma_shape.n;
+  tdma_tg.shape.c = tdma_shape.c;
+  tdma_tg.shape.h = tdma_shape.h;
+  tdma_tg.shape.w = tdma_shape.w;
+  tdma_tg.stride = bmk1822_tensor_tgmem_default_stride(tdma_tg.shape,
+                                                       tdma_tg.fmt);
+  tdma_tg.base_reg_index = 1;
+
+  tl_t tdma_tl = *tl;
+  tdma_tl.shape = tdma_shape;
+  tdma_tl.stride = bmk1822_tensor_lmem_default_stride(bk_ctx, tdma_shape,
+                                                      FMT_I8, 0);
+
+  bmk1822_tdma_tg2l_tensor_copy_param_t p;
+  memset(&p, 0, sizeof(p));
+  p.src = &tdma_tg;
+  p.dst = &tdma_tl;
+
+  bmk1822_tdma_g2l_tensor_copy(bk_ctx, &p);
+  test_submit(ctx);
+  bmmem_device_free(*ctx, dev_mem);
+  free(transformed_data);
+}
+
+static s8 * transform_bias(int oc, s16 before[])
+{
+  s8 *after = (s8 *)malloc(sizeof(s8) * 2 * oc);
+  if (!after)
+    return NULL;
+
+  for (int i = 0; i < oc; i++) {
+    after[i] = before[i] & 0xff;
+    after[i + oc] = (before[i] >> 8) & 0xff;
+  }
+  return after;
+}
+
+static void put_conv_bias(
+    bmctx_t *ctx,
+    bmk_ctx_t *bk_ctx,
+    const tl_t *tl,
+    s16 *data)
+{
+  int oc = tl->shape.c;
+
+  bmshape_t bms = BM_TENSOR_INT8(2, oc, 1, 1);
+  bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms));
+  gaddr_t gaddr = bmmem_device_addr(dev_mem);
+  s8 *transformed_data = transform_bias(oc, data);
+  int ret = bm_memcpy_s2d(*ctx, dev_mem, (u8 *)transformed_data);
+  assert(ret == BM_SUCCESS);
+
+  tg_t tg;
+  tg.base_reg_index = 0;
+  tg.start_address = gaddr;
+  tg.fmt = FMT_I8;
+  tg.shape.n = 2;
+  tg.shape.c = oc;
+  tg.shape.h = 1;
+  tg.shape.w = 1;
+  tg.stride = bmk1822_tensor_tgmem_default_stride(tg.shape, tg.fmt);
+
+  bmk1822_tdma_tg2l_tensor_copy_param_t p;
+  memset(&p, 0, sizeof(p));
+  p.src = &tg;
+  p.dst = tl;
+
+  bmk1822_tdma_g2l_tensor_copy(bk_ctx, &p);
+  test_submit(ctx);
+
+  bmmem_device_free(*ctx, dev_mem);
+  free(transformed_data);
+}
+
+static int conv_kh_ext(const conv_param_t *p)
+{
+  return (p->kh - 1) * p->dh + 1;
+}
+
+static int conv_kw_ext(const conv_param_t *p)
+{
+  return (p->kw - 1) * p->dw + 1;
+}
+
+static int conv_ih_ext(const conv_param_t *p)
+{
+  return (p->input_h - 1) * (p->ins_h + 1) +
+      p->ins_h_last + 1 + p->pad_top + p->pad_bot;
+}
+
+static int conv_iw_ext(const conv_param_t *p)
+{
+  return (p->input_w - 1) * (p->ins_w + 1) +
+      p->ins_w_last + 1 + p->pad_left + p->pad_right;
+}
+
+static int conv_oh(const conv_param_t *p)
+{
+  return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1;
+}
+
+static int conv_ow(const conv_param_t *p)
+{
+  return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1;
+}
+
+static int conv_input_size(const conv_param_t *p)
+{
+  int in = p->input_n;
+  int ic = p->input_c;
+  int ih = p->input_h;
+  int iw = p->input_w;
+  return in * ic * ih * iw;
+}
+
+static int conv_output_size(const conv_param_t *p)
+{
+  int in = p->input_n;
+  int oc = p->output_c;
+  int oh = conv_oh(p);
+  int ow = conv_ow(p);
+  return in * oc * oh * ow;
+}
+
+static int conv_weight_size(const conv_param_t *p)
+{
+  int oc = p->output_c;
+  int ic = p->input_c;
+  int kh = p->kh;
+  int kw = p->kw;
+  return oc * ic * kh * kw;
+}
+
+static s8 * alloc_input(const conv_param_t *p)
+{
+  int size = conv_input_size(p);
+
+  s8 *buf = (s8 *)malloc(sizeof(s8) * size);
+  for (int i = 0; i < size; i++)
+    buf[i] = rand() % 256 - 128;
+
+  return buf;
+}
+
+static s8 * alloc_weight(const conv_param_t *p)
+{
+  int size = conv_weight_size(p);
+
+  s8 *buf = (s8 *)malloc(sizeof(s8) * size);
+  for (int i = 0; i < size; i++)
+    buf[i] = rand() % 256 - 128;
+
+  return buf;
+}
+
+static s16 * alloc_bias(const conv_param_t *p)
+{
+  int oc = p->output_c;
+
+  s16 *bias = (s16 *)malloc(sizeof(s16) * oc);
+  for (int i = 0; i < oc; i++)
+    bias[i] = rand() % 65536 - 32768;
+
+  return bias;
+}
+
+static tl_t * conv_ifmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p)
+{
+  fmt_t fmt = p->opd0_sign ? FMT_I8 : FMT_U8;
+  tl_shape_t s;
+  s.n = p->input_n;
+  s.c = p->input_c;
+  s.h = p->input_h;
+  s.w = p->input_w;
+  return bmk1822_lmem_alloc_tensor(ctx, s, fmt, 1);
+}
+
+static tl_t * conv_weight_tensor(bmk_ctx_t *ctx, const conv_param_t *p)
+{
+  fmt_t fmt = p->opd1_sign ? FMT_I8 : FMT_U8;
+  tl_shape_t s;
+  s.n = p->input_c;
+  s.c = p->output_c;
+  s.h = p->kh;
+  s.w = p->kw;
+  return bmk1822_lmem_alloc_tensor(ctx, s, fmt, 0);
+}
+
+static tl_t * conv_ofmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p)
+{
+  tl_shape_t s;
+  s.n = p->input_n;
+  s.c = p->output_c;
+  s.h = conv_oh(p);
+  s.w = conv_ow(p);
+  return bmk1822_lmem_alloc_tensor(ctx, s, FMT_I8, 1);
+}
+
+static tl_t * conv_bias_tensor(
+    bmk_ctx_t *ctx, const conv_param_t *p)
+{
+  fmt_t fmt = p->opd2_sign ? FMT_I8 : FMT_U8;
+  tl_shape_t s;
+  s.n = 2;
+  s.c = p->output_c;
+  s.h = 1;
+  s.w = 1;
+  return bmk1822_lmem_alloc_tensor(ctx, s, fmt, 0);
+}
+
+static int conv_param_is_ok(const conv_param_t *p)
+{
+  int kh_ext = conv_kh_ext(p);
+  int kw_ext = conv_kw_ext(p);
+  int ih_ext = conv_ih_ext(p);
+  int iw_ext = conv_iw_ext(p);
+
+  if ((kh_ext > ih_ext)
+      || (kw_ext > iw_ext)
+      || (kh_ext <= p->pad_top)
+      || (kh_ext <= p->pad_bot)
+      || (kw_ext <= p->pad_left)
+      || (kw_ext <= p->pad_right)
+      || (p->pad_top >= (1 << 4))
+      || (p->pad_bot >= (1 << 4))
+      || (p->pad_left >= (1 << 4))
+      || (p->pad_right >= (1 << 4))) {
+    return 0;
+  }
+
+  return 1;
+}
+
+static int bmk_conv_param_alloc_ok(
+    const bmk1822_tiu_convolution_param_t *p,
+    const conv_param_t *param)
+{
+  if (!p->ifmap || !p->ofmap || !p->weight)
+    return 0;
+
+  if (param->using_bias)
+    if (!p->bias)
+      return 0;
+
+  return 1;
+}
+
+static void make_bmk_conv_param(
+    bmk_ctx_t *ctx,
+    bmk1822_tiu_convolution_param_t *dst,
+    const conv_param_t *p)
+{
+  memset(dst, 0, sizeof(*dst));
+
+  dst->ins_h = p->ins_h;
+  dst->ins_last_h = p->ins_h_last;
+  dst->ins_w = p->ins_w;
+  dst->ins_last_w = p->ins_w_last;
+  dst->pad_top = p->pad_top;
+  dst->pad_bottom = p->pad_bot;
+  dst->pad_left = p->pad_left;
+  dst->pad_right = p->pad_right;
+  dst->stride_h = p->stride_h;
+  dst->stride_w = p->stride_w;
+  dst->dilation_h = p->dh;
+  dst->dilation_w = p->dw;
+  dst->relu_enable = p->bReLU_EN;
+  dst->rshift_bits = p->r_shift_m;
+
+  dst->ifmap = conv_ifmap_tensor(ctx, p);
+  dst->weight = conv_weight_tensor(ctx, p);
+  dst->ofmap = conv_ofmap_tensor(ctx, p);
+  dst->bias = NULL;
+  dst->ps32_mode = 0;
+  if (p->using_bias)
+    dst->bias = conv_bias_tensor(ctx, p);
+
+  dst->w_is_const = 0;
+}
+
+static void free_bmk_conv_param(
+    bmk_ctx_t *ctx,
+    bmk1822_tiu_convolution_param_t *r,
+    const conv_param_t *p)
+{
+  if (p->using_bias && r->bias)
+    free_tl(ctx, r->bias);
+  if (r->ofmap)
+    free_tl(ctx, r->ofmap);
+  if (r->weight)
+    free_tl(ctx, r->weight);
+  if (r->ifmap)
+    free_tl(ctx, r->ifmap);
+}
+
+static void init_conv_param(conv_param_t &p)
+{
+  printf("init_conv_param\n");
+  p.random_seed = clock();
+  srand(p.random_seed);
+
+retry:
+  p.input_n = rand() % 5 + 1;
+  p.input_c = (rand() % (5 * 32) / 2) * 2 + 8;
+  p.kh = rand() % 7 + 1;
+  p.kw = rand() % 7 + 1;
+  p.input_h = rand() % 40 + p.kh;
+  p.input_w = rand() % 40 + p.kw;
+  p.output_c = rand() % 10 + 3;
+  p.stride_h = rand() % (p.kh) + 1;
+  p.stride_w = rand() % (p.kw) + 1;
+  p.ins_h = rand() % p.kh;
+  p.ins_w = rand() % p.kw;
+  p.ins_h_last = rand() % p.kh;
+  p.ins_w_last = rand() % p.kw;
+  p.dh = rand() % 3 + 1;
+  p.dw = rand() % 3 + 1;
+
+  int kh_ext = conv_kh_ext(&p);
+  int kw_ext = conv_kw_ext(&p);
+  p.pad_top = rand() % kh_ext;
+  p.pad_bot = rand() % kh_ext;
+  p.pad_left = rand() % kw_ext;
+  p.pad_right = rand() % kw_ext;
+
+  if (!conv_param_is_ok(&p)) {
+    printf("retry init_conv_param\n");
+    goto retry;
+  }
+
+  p.using_bias = rand() % 2;
+  p.r_shift_m = rand() % 8;
+  p.bReLU_EN = rand() % 2;
+
+  p.opd0_sign = rand() % 2;
+  p.opd1_sign = 1;
+  p.opd2_sign = 1;
+
+  assert(p.opd1_sign == 1 && p.opd2_sign == 1);
+
+  int ih_ext = conv_ih_ext(&p);
+  int iw_ext = conv_iw_ext(&p);
+  assert(ih_ext >= kh_ext);
+  assert(iw_ext >= kw_ext);
+}
= ", p->input_h); + printf(" %s%d;\n", "p->input_w = ", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} + +// Calculate the right shift value, m +// Steps: +// 1. Get the abs() of each weight; +// 2. Summary all the abs() in one kernel; +// 3. Get Log2 of each sum; +// 4. Downward rounding; +// After every r_shift value got, sort and find the middle one. +static int calc_rshift_m(const conv_param_t *p, s8* weight) +{ + int kernel_cnt = p->output_c * p->input_c; + int kernel_size = p->kh * p->kw; + int *kernel_shifts = (int *)malloc(sizeof(int) * kernel_cnt); + + // Tscan does not recognize C++ zero-initialized. + memset(kernel_shifts, 0, sizeof(int) * kernel_cnt); + + // Part 1: + // Get right shift value for each kernel + int sum = 0; + for (int i = 0; i < kernel_cnt; i++) { + // Step 1 & 2: Get the sum of abs() + for (int j = 0; j < kernel_size; j++) { + sum += (int)(*weight < 0 ? 
+static int calc_rshift_m(const conv_param_t *p, s8 *weight)
+{
+  int kernel_cnt = p->output_c * p->input_c;
+  int kernel_size = p->kh * p->kw;
+  int *kernel_shifts = (int *)malloc(sizeof(int) * kernel_cnt);
+
+  // Tscan does not recognize C++ zero-initialization.
+  memset(kernel_shifts, 0, sizeof(int) * kernel_cnt);
+
+  // Part 1:
+  // Get right shift value for each kernel
+  int sum = 0;
+  for (int i = 0; i < kernel_cnt; i++) {
+    // Step 1 & 2: Get the sum of abs()
+    for (int j = 0; j < kernel_size; j++) {
+      sum += (int)(*weight < 0 ? -(*weight) : (*weight));
+      weight++;
+    }
+    // Step 3 & 4: log2 and downward rounding
+    sum >>= 1;
+    while (sum) {
+      sum >>= 1;
+      kernel_shifts[i]++;
+    }
+  }
+
+  // Part 2:
+  // Find the middle of all the values
+  int tag[32] = {0};
+  for (int cnt = 0; cnt < kernel_cnt; cnt++) {
+    tag[kernel_shifts[cnt]]++;
+  }
+
+  int rshift_m = 0;
+  int mid = 0;
+  do {
+    mid += tag[rshift_m++];
+  } while (mid < (kernel_cnt - 1) >> 1);
+
+  free(kernel_shifts);
+
+  return rshift_m - 1;
+}
+
+static int test_conv(
+    conv_param_t &p_param, bmctx_t ctx, bmk_ctx_t *bk_ctx)
+{
+  s8 *input = alloc_input(&p_param);
+  s8 *weight = alloc_weight(&p_param);
+  s16 *bias = alloc_bias(&p_param);
+  p_param.r_shift_m = calc_rshift_m(&p_param, weight);
+
+  s8 *output_ref = (s8 *)malloc(sizeof(s8) * conv_output_size(&p_param));
+  if (!output_ref) {
+    free(input);
+    free(weight);
+    free(bias);
+    return BM_ERR_FAILURE;
+  }
+
+  bmerr_t ret = conv_ref(&p_param, input, weight, bias, output_ref);
+  assert(ret == BM_SUCCESS);
+
+  bmk1822_tiu_convolution_param_t conv_param;
+  make_bmk_conv_param(bk_ctx, &conv_param, &p_param);
+
+  int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param);
+  if (tl_alloc_success) {
+    put_tensor_g2l(&ctx, bk_ctx, conv_param.ifmap, (u8 *)input);
+    put_conv_weight(&ctx, bk_ctx, conv_param.weight, (u8 *)weight);
+    if (p_param.using_bias)
+      put_conv_bias(&ctx, bk_ctx, conv_param.bias, bias);
+    bmk1822_tiu_convolution(bk_ctx, &conv_param);
+    u8 *output = get_tensor_l2g(&ctx, bk_ctx, conv_param.ofmap);
+
+    int has_error = array_cmp_int8(
+        "Comparing results ...\n",
+        output_ref, (s8 *)output, conv_output_size(&p_param));
+
+    if (has_error) {
+      print_conv_param(&p_param);
+      printf("Comparison FAILED\n");
+      exit(-1);
+    }
+    free(output);
+  }
+
+  free_bmk_conv_param(bk_ctx, &conv_param, &p_param);
+
+  free(input);
+  free(weight);
+  free(output_ref);
+  free(bias);
+
+  return tl_alloc_success ? 1 : 0;
+}
+
+int main()
+{
+  bmctx_t ctx;
+  bmk_ctx_t *bk_ctx;
+  test_init(&ctx, &bk_ctx);
+
+  int test_finished_num = 0;
+  for (int i = 0; i < 5; i++) {
+    printf("random_test_conv iteration: %d\n", i);
+    conv_param_t test_conv_param;
+    init_conv_param(test_conv_param);
+    test_finished_num += test_conv(test_conv_param, ctx, bk_ctx);
+    if (!test_conv_param.using_bias)
+      test_conv_param.using_bias = 1;
+    if (test_conv_param.output_c <= 32)
+      test_conv_param.output_c += 32;
+    test_finished_num += test_conv(test_conv_param, ctx, bk_ctx);
+  }
+
+  test_exit(&ctx);
+  return 0;
+}
diff --git a/cviruntime/test/1822/test_1822_double_conv_ps32.cpp b/cviruntime/test/1822/test_1822_double_conv_ps32.cpp
new file mode 100644
index 000000000..69e1fe34a
--- /dev/null
+++ b/cviruntime/test/1822/test_1822_double_conv_ps32.cpp
@@ -0,0 +1,1467 @@
+#include "1822_test_util.h"
+
+typedef struct {
+  int random_seed;
+  int input_n;
+  int input_c;
+  int input_h;
+  int input_w;
+  int kw;
+  int kh;
+  int dh;
+  int dw;
+  int pad_top;
+  int pad_bot;
+  int pad_left;
+  int pad_right;
+  int ins_h;
+  int ins_h_last;
+  int ins_w;
+  int ins_w_last;
+  int stride_h;
+  int stride_w;
+  int output_c;
+  int using_bias;
+  int bReLU_EN;
+  int r_shift_m;
+  int opd0_sign;
+  int opd1_sign;
+  int opd2_sign;
+} conv_param_t;
+
+static int index_get(int h, int w1, int w2)
+{
+  return h * w1 + w2;
+}
+
+static int matrix_dot_mult(
+    s8 *A, s8 *B, int dim_n, int dim_m,
+    int opd0_sign)
+{
+  // Elementwise multiply-accumulate over a dim_n x dim_m window; operand 0
+  // is treated as unsigned when opd0_sign == 0.
+  int sum = 0;
+  for (int i = 0; i < dim_n; i++) {
+    for (int j = 0; j < dim_m; j++) {
+      int index = index_get(i, dim_m, j);
+      int a = opd0_sign ? (int)A[index] : (int)(u8)A[index];
+      int b = (int)B[index];
+      sum += a * b;
+    }
+  }
+  return sum;
+}
+
+static int ps32_m2_conv_ref(
+    const conv_param_t *p_param,
+    const s8 *ifmap,
+    const s8 *weight,
+    const s16 *bias,
+    s8 *ofmap)
+{
+  int in = p_param->input_n;
+  int ic = p_param->input_c;
+  int ih = p_param->input_h;
+  int iw = p_param->input_w;
+  int oc = p_param->output_c;
+  int kh = p_param->kh;
+  int kw = p_param->kw;
+  int dh = p_param->dh;
+  int dw = p_param->dw;
+  int stride_h = p_param->stride_h;
+  int stride_w = p_param->stride_w;
+  int pad_top = p_param->pad_top;
+  int pad_bot = p_param->pad_bot;
+  int pad_left = p_param->pad_left;
+  int pad_right = p_param->pad_right;
+  int ins_h = p_param->ins_h;
+  int ins_h_last = p_param->ins_h_last;
+  int ins_w = p_param->ins_w;
+  int ins_w_last = p_param->ins_w_last;
+  int input_sign = p_param->opd0_sign;
+
+  int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot);
+  int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right);
+  int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0);
+  int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0);
+
+  int oh = calc_output_hw(ih_ext, kh_ext, stride_h);
+  int ow = calc_output_hw(iw_ext, kw_ext, stride_w);
+
+  int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow);
+  s8 *i_fmap_pad_ker = (s8 *)malloc(kh_ext * kw_ext);
+  if (!result || !i_fmap_pad_ker) {
+    free(result);
+    free(i_fmap_pad_ker);
+    return BM_ERR_FAILURE;
+  }
+
+  memset(result, 0, sizeof(int) * in * oc * oh * ow);
+
+  int ret = BM_SUCCESS;
+
+  s8 *i_fmap_pad = NULL;
+  s8 *kernel_after = NULL;
+  u32 bstride = in * oc * oh * ow;
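+
+  // The 32-bit partial sums are emitted as four s8 byte planes spaced
+  // bstride elements apart: plane k holds bits [8k, 8k+8) of each
+  // accumulator. Worked example: an accumulator of 0x12345678 is written as
+  // 0x78 (plane 0), 0x56 (plane 1), 0x34 (plane 2), 0x12 (plane 3).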
idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[i] = result[i]; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[bstride + i] = result[i] >> 8; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[2 * bstride + i] = result[i] >> 16; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[3 * bstride + i] = result[i] >> 24; + + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static int ps32_m1_conv_ref( + const conv_param_t *p_param, + const s8 *ifmap, + const s8 *weight, + const s16 *bias, + s8 *ofmap) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + int r_shift_bits = p_param->r_shift_m; + int do_relu = p_param->bReLU_EN; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + s8 *i_fmap_pad_ker = (s8 *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return BM_ERR_FAILURE; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = BM_SUCCESS; + + s8 *i_fmap_pad = NULL; + s8 *kernel_after = NULL; + + u32 bstride = in * oc * oh * ow; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] = (u8)ofmap[i]; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] |= (u8)ofmap[bstride + i] << 8; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] |= (u8)ofmap[2 * bstride + i] << 16; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] |= (u8)ofmap[3 * bstride + i] << 24; + + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + + if (p_param->using_bias) { + for (int 
ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c]; //bias+c ; + } + } + } + + if (do_relu) + relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + + // ofmap is s8, signed + ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow, + &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1, + /*sign_unsign=*/1); + + if (ret != BM_SUCCESS) + goto error_release; + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + +error_release: + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + + +static int ps32_m3_conv_ref( + const conv_param_t *p_param, + const s8 *ifmap, + const s8 *weight, + s8 *ofmap) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + s8 *i_fmap_pad_ker = (s8 *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return BM_ERR_FAILURE; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = BM_SUCCESS; + + s8 *i_fmap_pad = NULL; + s8 *kernel_after = NULL; + + u32 bstride = in * oc * oh * ow; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] = (u8)ofmap[i]; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] |= (u8)ofmap[bstride + i] << 8; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] |= (u8)ofmap[2 * bstride + i] << 16; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] |= (u8)ofmap[3 * bstride + i] << 24; + + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[i] = result[i]; + + for (int i = 0; i < in * oc * oh * ow; i ++) + 
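+    /* store the upper byte planes of each 32-bit partial sum as well:
+     * bits 15..8, 23..16 and 31..24, each plane bstride bytes apart
+     * (the ps32 layout that the mode-3/mode-1 passes read back in) */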
ofmap[bstride + i] = result[i] >> 8; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[2 * bstride + i] = result[i] >> 16; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[3 * bstride + i] = result[i] >> 24; + + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static int conv_ref( + const conv_param_t *p_param, + const s8 *ifmap, + const s8 *weight, + const s16 *bias, + s8 *ofmap) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + int r_shift_bits = p_param->r_shift_m; + int do_relu = p_param->bReLU_EN; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + s8 *i_fmap_pad_ker = (s8 *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return BM_ERR_FAILURE; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = BM_SUCCESS; + + s8 *i_fmap_pad = NULL; + s8 *kernel_after = NULL; + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c]; //bias+c ; + } + } + } + + if (do_relu) + relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + + // ofmap is s8, signed + ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow, + &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1, + /*sign_unsign=*/1); + + if (ret != BM_SUCCESS) + goto error_release; + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + + neuron_dump ( + "test_code:conv_ref:pure result + bias", + (u32)in, + (u32)oc, + (u32)oh, + (u32)ow, + (s32 *)result); + + +error_release: + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static u8 * 
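+/* Reorders weights from the caller's (oc, ic, kh, kw) layout into the
+ * (1, oc, kh * kw, ic) layout the TIU consumes; returns a malloc'ed
+ * buffer that the caller frees. */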
transform_weight(const tl_shape_t *s, u8 before[]) +{ + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + u32 size = ic * oc * kh * kw; + u8 *after = (u8 *)malloc(sizeof(u8) * size); + + /* + * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) + */ + for (u32 oci = 0; oci < oc; oci++) { + for (u32 ici = 0; ici < ic; ici++) { + for (u32 khi = 0; khi < kh; khi++) { + for (u32 kwi = 0; kwi < kw; kwi++) { + u32 src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + u32 dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + u8 *data) +{ + const tl_shape_t *s = &tl->shape; + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + u8 *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. bm_memcpy_s2d regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. + */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //bmmem_device_t ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = bm_memcpy_s2d(*ctx, ab_dev_mem, transformed_data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, transformed_data); + assert(ret == BM_SUCCESS); + + tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_I8; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1822_tensor_tgmem_default_stride(tdma_tg.shape, tdma_tg.fmt); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1822_tensor_lmem_default_stride(bk_ctx, tdma_shape, FMT_I8, 0); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1822_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + bmmem_device_free(*ctx, dev_mem); + free(transformed_data); +} + +static s8 * transform_bias(int oc, s16 before[]) +{ + s8 *after = (s8 *)malloc(sizeof(s8) * 2 * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = before[i] & 0xff; + after[i + oc] = (before[i] >> 8) & 0xff; + } + return after; +} + +static void put_conv_bias( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + s16 *data) +{ + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2, oc, 1, 1); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + s8 *transformed_data = transform_bias(oc, data); + int ret = bm_memcpy_s2d(*ctx, dev_mem, (u8 *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1822_tensor_tgmem_default_stride(tg.shape, tg.fmt); + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, dev_mem); + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ 
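+  /* Effective kernel height: dilation dh inserts (dh - 1) zeros between
+   * taps, so kh taps span (kh - 1) * dh + 1 rows. */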
+ return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static s8 * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + + s8 *buf = (s8 *)malloc(sizeof(s8) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static s8 * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + s8 *buf = (s8 *)malloc(sizeof(s8) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static s16 * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + s16 *bias = (s16 *)malloc(sizeof(s16) * oc); + if (!bias) + return NULL; + + for (int i = 0; i < oc; i++) + bias[i] = rand() % 65536 - 32768; + + return bias; +} + +static tl_t * conv_ifmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd0_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return bmk1822_lmem_alloc_tensor(ctx, s, fmt, 1); +} + +static tl_t * conv_weight_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd1_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return bmk1822_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static tl_t * conv_ofmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + tl_shape_t s; + s.n = p->input_n; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return bmk1822_lmem_alloc_ps32_tensor(ctx, s, FMT_I8, 1); +} + +static tl_t * conv_bias_tensor( + bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd2_sign? 
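+  /* The 16-bit bias lives in local memory as two 8-bit planes (hence
+   * n = 2): low bytes first, high bytes second, as laid out by
+   * transform_bias(). */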
FMT_I8: FMT_U8; + tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return bmk1822_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const bmk1822_tiu_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + if(p->ps32_mode==1) + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param_ps32( + bmk_ctx_t *ctx, + bmk1822_tiu_convolution_param_t *dst, + const conv_param_t *p, u32 ps32_mode) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + if(ps32_mode==2) + { + dst->ifmap = conv_ifmap_tensor(ctx, p); + dst->weight = conv_weight_tensor(ctx, p); + dst->ofmap = conv_ofmap_tensor(ctx, p); + } + + dst->ps32_mode = ps32_mode; + + dst->bias = NULL; + dst->relu_enable = 0; + dst->rshift_bits = 0; + if(ps32_mode==1) + { + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + // only mode=1 can use bias + if (p->using_bias) + dst->bias = conv_bias_tensor(ctx, p); + } + + dst->w_is_const = 0; +} + +static void make_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1822_tiu_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(bmk1822_tiu_convolution_param_t)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + dst->ifmap = conv_ifmap_tensor(ctx, p); + dst->weight = conv_weight_tensor(ctx, p); + dst->ofmap = conv_ofmap_tensor(ctx, p); + dst->bias = NULL; + dst->ps32_mode = 0; + if (p->using_bias) + dst->bias = conv_bias_tensor(ctx, p); + + dst->w_is_const = 0; +} + +static void free_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1822_tiu_convolution_param_t *r, + const conv_param_t *p) +{ + if (p->using_bias && r->bias) + free_tl(ctx, r->bias); + + if (r->ofmap) + free_tl(ctx, r->ofmap); + + if (r->weight) + free_tl(ctx, r->weight); + + if (r->ifmap) + free_tl(ctx, r->ifmap); +} + +static void init_conv_param(conv_param_t &p) +{ + printf("init_conv_param\n"); + p.random_seed = clock(); + srand(p.random_seed); + +retry: + p.input_n = 1; + p.input_c = rand() % (10) + 2; + p.kh = rand() % 7 + 1; + p.kw = rand() % 7 + 1; + p.input_h = rand() % 10 + p.kh; + p.input_w = rand() % 10 + p.kw; + p.output_c = rand() % 10 + 3; + p.stride_h = rand() % (p.kh) + 1; + p.stride_w = rand() % (p.kw) + 1; + p.ins_h = rand() % p.kh; + p.ins_w 
= rand() % p.kw; + p.ins_h_last = rand() % p.kh;; + p.ins_w_last = rand() % p.kw;; + p.dh = rand() % 3 + 1; + p.dw = rand() % 3 + 1; + + int kh_ext = conv_kh_ext(&p); + int kw_ext = conv_kw_ext(&p); + p.pad_top = rand() % kh_ext; + p.pad_bot = rand() % kh_ext; + p.pad_left = rand() % kw_ext; + p.pad_right = rand() % kw_ext; + + if (!conv_param_is_ok(&p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p.using_bias = rand() % 2; + p.r_shift_m = rand() % 8; + p.bReLU_EN = rand() % 2; + + p.opd0_sign = rand() % 2; + p.opd1_sign = 1; + p.opd2_sign = 1; + + assert(p.opd1_sign == 1 && p.opd2_sign == 1); + + int ih_ext = conv_ih_ext(&p); + int iw_ext = conv_iw_ext(&p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); +} + +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", "p->input_w = ", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} + +/* Calculate the right shift value, m + * Steps: + * 1. Get the abs() of each weight; + * 2. Summary all the abs() in one kernel; + * 3. Get Log2 of each sum; + * 4. Downward rounding; + * After every r_shift value got, sort and find the middle one. + */ + +static int calc_rshift_m(const conv_param_t *p, s8* weight) +{ + int kernel_cnt = p->output_c * p->input_c; + int kernel_size = p->kh * p->kw; + int *kernel_shifts = (int *)malloc(sizeof(int) * kernel_cnt); + + // Tscan does not recognize C++ zero-initialized. + memset(kernel_shifts, 0, sizeof(int) * kernel_cnt); + + // Part 1: + // Get right shift value for each kernel + int sum = 0; + for (int i = 0; i < kernel_cnt; i++) { + // Step 1 & 2: Get the sum of abs() + for (int j = 0; j < kernel_size; j++) { + sum += (int)(*weight < 0 ? 
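+        /* open-coded abs(); the log2 loop below also happens to drain
+         * sum back to 0, so it needs no per-kernel reset */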
-(*weight) : (*weight)); + weight++; + } + // Step 3 & 4: log2 and downward rounding + sum >>= 1; + while (sum) { + sum >>= 1; + kernel_shifts[i]++; + } + } + + // Part 2: + // Find the middle of all the values + int tag[32] = {0}; + for (int cnt = 0; cnt < kernel_cnt; cnt++) { + tag[kernel_shifts[cnt]]++; + } + + int rshift_m = 0; + int mid = 0; + do { + mid += tag[rshift_m++]; + } while(mid < (kernel_cnt - 1) >> 1); + + free(kernel_shifts); + + return rshift_m - 1; +} + +static int test_ps32_ut( + conv_param_t &p_param, bmctx_t ctx, bmk_ctx_t *bk_ctx) +{ + printf("test_ps32_ut\n"); + s8 *input = alloc_input(&p_param); + s8 *weight = alloc_weight(&p_param); + s16 *bias = alloc_bias(&p_param); + p_param.r_shift_m = calc_rshift_m(&p_param, weight); + s8 *output_ref = (s8 *)malloc(sizeof(s8) * conv_output_size(&p_param) * sizeof(int)); + if (!input || !weight || !bias || !output_ref) { + free(input); + free(weight); + free(bias); + free(output_ref); + return BM_ERR_FAILURE; + } + + bmerr_t ret = ps32_m2_conv_ref(&p_param, input, weight, output_ref); + assert(ret == BM_SUCCESS); + + bmk1822_tiu_convolution_param_t conv_param; + make_bmk_conv_param_ps32(bk_ctx, &conv_param, &p_param, 2); + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + + if (tl_alloc_success) { + + put_tensor_g2l(&ctx, bk_ctx, conv_param.ifmap, (u8 *)input); + put_conv_weight(&ctx, bk_ctx, conv_param.weight, (u8 *)weight); + bmk1822_tiu_convolution(bk_ctx, &conv_param); + + bmk1822_tensor_lmem_t ps32_ofmap; + ps32_ofmap = *conv_param.ofmap; + ps32_ofmap.shape.n = ps32_ofmap.shape.n * sizeof(int); + u8 *output = get_tensor_l2g(&ctx, bk_ctx, &ps32_ofmap); + + int has_error = array_cmp_int8( + "Comparing begin_mode results ...\n", + output_ref, (s8 *)output, conv_output_size(&p_param) * sizeof(int)); + + if (has_error) { + print_conv_param(&p_param); + printf("Comparison FAILED\n"); + exit(-1); + } + + free(output); + } + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + printf("test_ps32_intermediate_mode\n"); + for (int i=0; i < conv_input_size(&p_param); i++) + input[i] = rand() % 256 - 128; + + for (int i=0; i < conv_weight_size(&p_param); i++) + weight[i] = rand() % 256 - 128; + + ret = ps32_m3_conv_ref(&p_param, input, weight, output_ref); + assert(ret == BM_SUCCESS); + + make_bmk_conv_param_ps32(bk_ctx, &conv_param, &p_param, 3); + tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + + if (tl_alloc_success) { + put_tensor_g2l(&ctx, bk_ctx, conv_param.ifmap, (u8 *)input); + put_conv_weight(&ctx, bk_ctx, conv_param.weight, (u8 *)weight); + + bmk1822_tiu_convolution(bk_ctx, &conv_param); + + bmk1822_tensor_lmem_t ps32_ofmap; + ps32_ofmap = *conv_param.ofmap; + ps32_ofmap.shape.n = ps32_ofmap.shape.n * sizeof(int); + + u8 *output = get_tensor_l2g(&ctx, bk_ctx, &ps32_ofmap); + + int has_error = array_cmp_int8( + "Comparing intermediate results ...\n", + output_ref, (s8 *)output, conv_output_size(&p_param) * sizeof(int)); + + if (has_error) { + print_conv_param(&p_param); + printf("Comparison FAILED\n"); + exit(-1); + } + + free(output); + } + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + printf("test_ps32_end_mode\n"); + for (int i=0; i < conv_input_size(&p_param); i++) + input[i] = rand() % 256 - 128; + + for (int i=0; i < conv_weight_size(&p_param); i++) + weight[i] = rand() % 256 - 128; + + ret = ps32_m1_conv_ref(&p_param, input, weight, bias, output_ref); + assert(ret == BM_SUCCESS); + + make_bmk_conv_param_ps32(bk_ctx, &conv_param, &p_param, 1); + + 
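+  /* ps32_mode 1 ("end" mode): the TIU reuses the 32-bit partial sums
+   * left in local memory by the earlier passes and only now applies
+   * bias, ReLU and the right shift to produce the final s8 ofmap. */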
tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + + if (tl_alloc_success) { + + put_tensor_g2l(&ctx, bk_ctx, conv_param.ifmap, (u8 *)input); + put_conv_weight(&ctx, bk_ctx, conv_param.weight, (u8 *)weight); + if (p_param.using_bias) { + put_conv_bias(&ctx, bk_ctx, conv_param.bias, bias); + } + bmk1822_tiu_convolution(bk_ctx, &conv_param); + u8 *output = get_tensor_l2g(&ctx, bk_ctx, conv_param.ofmap); + + int has_error = array_cmp_int8( + "Comparing end results ...\n", + output_ref, (s8 *)output, conv_output_size(&p_param)); + + if (has_error) { + print_conv_param(&p_param); + printf("Comparison FAILED\n"); + exit(-1); + } + + free(output); + } + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + free(input); + free(weight); + free(bias); + free(output_ref); + + return tl_alloc_success ? 1 : 0; +} + +static int test_ic_tiling_conv( + conv_param_t &p_param, bmctx_t ctx, bmk_ctx_t *bk_ctx) +{ + printf("test tiled ps32 conv\n"); + s8 *input = alloc_input(&p_param); + s8 *weight = alloc_weight(&p_param); + s16 *bias = alloc_bias(&p_param); + p_param.r_shift_m = calc_rshift_m(&p_param, weight); + s8 *output_ref = (s8 *)malloc(sizeof(s8) * conv_output_size(&p_param)); + if (!output_ref) + return BM_ERR_FAILURE; + + bmerr_t ret = conv_ref(&p_param, input, weight, bias, output_ref); + assert(ret == BM_SUCCESS); + bmk1822_tiu_convolution_param_t conv_tmp_param; + bmk1822_tiu_convolution_param_t conv_param; + make_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + if (tl_alloc_success) { + if (p_param.using_bias) { + conv_tmp_param.bias = conv_param.bias; + put_conv_bias(&ctx, bk_ctx, conv_param.bias, bias); + neuron_dump ( + "test_ic_tiling_conv: bias", + 1, + conv_param.bias->shape.c, + conv_param.bias->shape.h, + conv_param.bias->shape.w, + (s16 *)bias); + } + // for ps32_md[1] = 1, relu_enable & rshift_bits need to set to 0 + // so we store those parameters to conv_tmp_para + conv_tmp_param.relu_enable = conv_param.relu_enable; + conv_tmp_param.rshift_bits = conv_param.rshift_bits; + conv_tmp_param.bias = conv_param.bias; + + u32 ic_step = 1; + u32 n_step = 1; + tl_t ifmap = *conv_param.ifmap; + tl_t ofmap = *conv_param.ofmap; + tg_shape_t s; + s.n = conv_param.ifmap->shape.n; + s.c = conv_param.ifmap->shape.c; + s.h = conv_param.ifmap->shape.h; + s.w = conv_param.ifmap->shape.w; + tg_t *tg_ifmap = alloc_tg_gmem(&ctx, s, FMT_I8); + put_tg_gmem(&ctx, tg_ifmap, (u8 *)input); + + s.n = conv_param.weight->shape.n; + s.c = conv_param.weight->shape.c; + s.h = conv_param.weight->shape.h; + s.w = conv_param.weight->shape.w; + u8 *transformed_weight = + transform_weight(&conv_param.weight->shape, (u8 *)weight); + tg_t *tg_weight = alloc_tg_gmem(&ctx, s, FMT_I8); + put_tg_gmem(&ctx, tg_weight, (u8 *)transformed_weight); + + neuron_dump ( + "test_ic_tiling_conv: input", + p_param.input_n, + p_param.input_c, + p_param.input_h, + p_param.input_w, + (s8 *)input); + + neuron_dump ( + "test_ic_tiling_conv: kernel", + 1, + conv_param.weight->shape.c, + conv_param.weight->shape.h * conv_param.weight->shape.w, + conv_param.weight->shape.n, + (s8 *)transformed_weight); + free(transformed_weight); + + tl_shape_t cur_tl_ifmap_shape = { + n_step, + ic_step, + ifmap.shape.h, + ifmap.shape.w + }; + + tg_shape_t cur_tg_ifmap_shape = { + cur_tl_ifmap_shape.n, + cur_tl_ifmap_shape.c, + cur_tl_ifmap_shape.h, + cur_tl_ifmap_shape.w + }; + + tg_stride_t cur_tg_ifmap_stride = { + tg_ifmap->stride.n, + tg_ifmap->stride.c, + 
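+    /* reuse the full-tensor global strides so each ic slice can be
+     * fetched just by offsetting start_address by ci * stride.c in the
+     * loop below */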
tg_ifmap->stride.h, + }; + + tg_t cur_tg_ifmap; + cur_tg_ifmap.base_reg_index = 0; + cur_tg_ifmap.start_address = tg_ifmap->start_address; + cur_tg_ifmap.shape = cur_tg_ifmap_shape; + cur_tg_ifmap.stride = cur_tg_ifmap_stride; + cur_tg_ifmap.fmt = FMT_I8; + + tl_t cur_tl_ifmap; + cur_tl_ifmap.shape = cur_tl_ifmap_shape; + cur_tl_ifmap.stride = + bmk1822_tensor_lmem_default_stride(bk_ctx, cur_tl_ifmap_shape, FMT_I8, 1); + cur_tl_ifmap.start_address = ifmap.start_address; + cur_tl_ifmap.fmt = ifmap.fmt; + + tl_t cur_tl_ofmap; + cur_tl_ofmap.start_address = ofmap.start_address; + cur_tl_ofmap.shape = ofmap.shape; + cur_tl_ofmap.shape.n = n_step; + cur_tl_ofmap.stride = + bmk1822_tensor_lmem_default_stride(bk_ctx, cur_tl_ofmap.shape, FMT_I8, 1); + cur_tl_ofmap.fmt = ofmap.fmt; + + tl_t cur_tl_weight; + memset(&cur_tl_weight, 0, sizeof(cur_tl_weight)); + cur_tl_weight.start_address = conv_param.weight->start_address; + cur_tl_weight.shape = conv_param.weight->shape; + cur_tl_weight.shape.n = ic_step; + cur_tl_weight.stride = { + 1, + cur_tl_weight.shape.n * cur_tl_weight.shape.h * cur_tl_weight.shape.w, + cur_tl_weight.shape.n * cur_tl_weight.shape.w, + cur_tl_weight.shape.n + }; + cur_tl_weight.fmt = conv_param.weight->fmt; + + const tl_t *saved_tl_weight = conv_param.weight; + const tl_t *saved_tl_ifmap = conv_param.ifmap; + for (u32 ci = 0; ci < ifmap.shape.c; ci += ic_step) { + { + u32 ic = tg_weight->shape.n; + u32 oc = tg_weight->shape.c; + u32 kh = tg_weight->shape.h; + u32 kw = tg_weight->shape.w; + + tg_t cur_tdma_tg_weight; + cur_tdma_tg_weight.base_reg_index = tg_weight->base_reg_index; + cur_tdma_tg_weight.start_address = tg_weight->start_address + ci; + cur_tdma_tg_weight.fmt = tg_weight->fmt; + cur_tdma_tg_weight.shape = {1, oc, kh * kw, ic}; + cur_tdma_tg_weight.stride = + bmk1822_tensor_tgmem_default_stride(cur_tdma_tg_weight.shape, cur_tdma_tg_weight.fmt); + cur_tdma_tg_weight.shape = {1, oc, kh * kw, ic_step}; + + tl_t cur_tdma_tl_weight; + cur_tdma_tl_weight = cur_tl_weight; + cur_tdma_tl_weight.shape.n = cur_tdma_tg_weight.shape.n; + cur_tdma_tl_weight.shape.c = cur_tdma_tg_weight.shape.c; + cur_tdma_tl_weight.shape.h = cur_tdma_tg_weight.shape.h; + cur_tdma_tl_weight.shape.w = cur_tdma_tg_weight.shape.w; + cur_tdma_tl_weight.stride = bmk1822_tensor_lmem_default_stride( + bk_ctx, cur_tdma_tl_weight.shape, FMT_I8, 0); + + bmk1822_tdma_tg2l_tensor_copy_param_t p1; + memset(&p1, 0, sizeof(p1)); + p1.src = &cur_tdma_tg_weight; + p1.dst = &cur_tdma_tl_weight; + bmk1822_tdma_g2l_tensor_copy(bk_ctx, &p1); + test_submit(&ctx); + } + { + bmk1822_tdma_tg2l_tensor_copy_param_t p2; + memset(&p2, 0, sizeof(p2)); + cur_tg_ifmap.start_address = + tg_ifmap->start_address + ci * tg_ifmap->stride.c; + p2.src = &cur_tg_ifmap; + p2.dst = &cur_tl_ifmap; + bmk1822_tdma_g2l_tensor_copy(bk_ctx, &p2); + test_submit(&ctx); + } + + conv_param.ifmap = &cur_tl_ifmap; + conv_param.weight = &cur_tl_weight; + + // for ps32_md[1] = 1, relu_enable & rshift_bits need to set to 0 + // so we store those parameters to conv_tmp_para + if (ci == (ifmap.shape.c - 1)) + { + conv_param.relu_enable = conv_tmp_param.relu_enable; + conv_param.rshift_bits = conv_tmp_param.rshift_bits; + conv_param.bias = conv_tmp_param.bias; + conv_param.ps32_mode = 1; + } + else if (ci == 0) + { + conv_param.relu_enable = 0; + conv_param.rshift_bits = 0; + conv_param.bias = 0;; + conv_param.ps32_mode = 2; + } + else + { + conv_param.relu_enable = 0; + conv_param.rshift_bits = 0; + conv_param.bias = 0;; + conv_param.ps32_mode = 3; + } + 
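+      /* Per-slice ps32 sequencing: ci == 0 opens the 32-bit accumulation
+       * (mode 2), middle slices read-modify-write it (mode 3), and the
+       * last slice closes it with bias/ReLU/rshift restored (mode 1). */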
bmk1822_tiu_convolution(bk_ctx, &conv_param); + conv_param.weight = saved_tl_weight; + conv_param.ifmap = saved_tl_ifmap; + } + + u8 *output = get_tensor_l2g(&ctx, bk_ctx, conv_param.ofmap); + + free_tg_gmem(&ctx, tg_ifmap); + free_tg_gmem(&ctx, tg_weight); + int has_error = array_cmp_int8( + "Comparing results ...\n", + output_ref, (s8 *)output, conv_output_size(&p_param)); + + if (has_error) { + print_conv_param(&p_param); + printf("Comparison FAILED\n"); + exit(-1); + } + free(output); + } + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + free(input); + free(weight); + free(output_ref); + free(bias); + + return tl_alloc_success ? 1 : 0; +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + int test_finished_num = 0; + for (int i = 0; i < 5; i++) { + printf("random_test_conv iteration: %d\n", i); + conv_param_t test_conv_param; + init_conv_param(test_conv_param); + test_finished_num += test_ic_tiling_conv(test_conv_param, ctx, bk_ctx); + test_finished_num += test_ps32_ut(test_conv_param, ctx, bk_ctx); + if (!test_conv_param.using_bias) + test_conv_param.using_bias = 1; + if (test_conv_param.output_c <= 9) + test_conv_param.output_c += 3; + test_finished_num += test_ic_tiling_conv(test_conv_param, ctx, bk_ctx); + test_finished_num += test_ps32_ut(test_conv_param, ctx, bk_ctx); + } + printf("test_finished_num: %d\n", test_finished_num); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_g2g_general_copy.cpp b/cviruntime/test/1822/test_1822_g2g_general_copy.cpp new file mode 100644 index 000000000..e71e147dc --- /dev/null +++ b/cviruntime/test/1822/test_1822_g2g_general_copy.cpp @@ -0,0 +1,107 @@ +#include "1822_test_util.h" + +typedef bmk1822_tdma_tg2tg_tensor_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tg_shape_t src_shape; + tg_stride_t src_stride; + tg_shape_t dst_shape; + tg_stride_t dst_stride; +} case_t; + +static case_t g_cases[] = { + { + {1, 3, 3, 3}, {27, 9, 3}, + {1, 3, 3, 3}, {27, 9, 3}, + }, + { + // YOLOv2 concat layer + {1, 256, 19, 19}, {92416, 361, 19}, + {1, 256, 19, 19}, {462080, 361, 19}, + } +}; + + +static void test_param_g2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + + u64 size = p->src->shape.c * p->src->shape.h * p->src->shape.w; + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + + put_tg_gmem(ctx, p->src, src_data); + + bmk1822_tdma_tg2tg_general_copy(bmk, p); + + test_submit(ctx); + + u8 *dst_data = get_tg_gmem(ctx, p->dst); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != src_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], src_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); +} + +static void destroy_param_g2g(bmctx_t *ctx, param_t *p) +{ + free_tg_gmem(ctx, p->src); + free_tg_gmem(ctx, p->dst); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + param_t p; + bmk1822_tensor_tgmem_t *src, *dst; + + memset(&p, 0, sizeof(p)); + + src = alloc_tg_gmem(ctx, c->src_shape, FMT_I8); + src->stride.n = c->src_stride.n; + src->stride.c = c->src_stride.c; + src->stride.h = 
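+  /* override the default strides with the per-case ones, e.g. the padded
+   * destination layout of the YOLOv2 concat case in g_cases[1] */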
c->src_stride.h; + + dst = alloc_tg_gmem(ctx, c->dst_shape, FMT_I8); + dst->stride.n = c->dst_stride.n; + dst->stride.c = c->dst_stride.c; + dst->stride.h = c->dst_stride.h; + + p.src = src; + p.dst = dst; + test_param_g2g(ctx, bmk, &p); + + destroy_param_g2g(ctx, &p); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_g2g_tensor_copy.cpp b/cviruntime/test/1822/test_1822_g2g_tensor_copy.cpp new file mode 100644 index 000000000..4a96c4755 --- /dev/null +++ b/cviruntime/test/1822/test_1822_g2g_tensor_copy.cpp @@ -0,0 +1,107 @@ +#include "1822_test_util.h" + +typedef bmk1822_tdma_tg2tg_tensor_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tg_shape_t src_shape; + tg_stride_t src_stride; + tg_shape_t dst_shape; + tg_stride_t dst_stride; +} case_t; + +static case_t g_cases[] = { + { + {1, 3, 3, 3}, {27, 9, 3}, + {1, 3, 3, 3}, {27, 9, 3}, + }, + { + // YOLOv2 concat layer + {1, 256, 19, 19}, {92416, 361, 19}, + {1, 256, 19, 19}, {462080, 361, 19}, + } +}; + + +static void test_param_g2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + + u64 size = p->src->shape.c * p->src->shape.h * p->src->shape.w; + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + + put_tg_gmem(ctx, p->src, src_data); + + bmk1822_tdma_tg2tg_tensor_copy(bmk, p); + + test_submit(ctx); + + u8 *dst_data = get_tg_gmem(ctx, p->dst); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != src_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], src_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); +} + +static void destroy_param_g2g(bmctx_t *ctx, param_t *p) +{ + free_tg_gmem(ctx, p->src); + free_tg_gmem(ctx, p->dst); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + param_t p; + bmk1822_tensor_tgmem_t *src, *dst; + + memset(&p, 0, sizeof(p)); + + src = alloc_tg_gmem(ctx, c->src_shape, FMT_I8); + src->stride.n = c->src_stride.n; + src->stride.c = c->src_stride.c; + src->stride.h = c->src_stride.h; + + dst = alloc_tg_gmem(ctx, c->dst_shape, FMT_I8); + dst->stride.n = c->dst_stride.n; + dst->stride.c = c->dst_stride.c; + dst->stride.h = c->dst_stride.h; + + p.src = src; + p.dst = dst; + test_param_g2g(ctx, bmk, &p); + + destroy_param_g2g(ctx, &p); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} \ No newline at end of file diff --git a/cviruntime/test/1822/test_1822_get_matrix_stride.cpp b/cviruntime/test/1822/test_1822_get_matrix_stride.cpp new file mode 100644 index 000000000..7a7ae55fa --- /dev/null +++ b/cviruntime/test/1822/test_1822_get_matrix_stride.cpp @@ -0,0 +1,128 @@ +#include "1822_test_util.h" + +static void get_matrix_l2g_stride_ref( + u8 *ref, u8 *a, + ml_shape_t ml_shape, + 
bmk1822_matrix_tgmem_stride_t gmem_stride) +{ + int row = ml_shape.n; + int col = ml_shape.col; + int row_stride = gmem_stride.row; + + /* + * Same as in get_matrix_l2g_stride(). + */ + int stride_size = row * row_stride; + for (int i = 0; i < stride_size; i++) + ref[i] = 0xaf; + + for (int ri = 0; ri < row; ri++) { + for (int ci = 0; ci < col; ci++) { + int src_i = ri * col + ci; + int dst_i = ri * row_stride + ci; + ref[dst_i] = a[src_i]; + } + } +} + +static u8 * get_matrix_l2g_stride( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + ml_t *ml, + bmk1822_matrix_tgmem_stride_t mg_stride) +{ + int row = ml->shape.n; + int row_stride = mg_stride.row; + int col = ml->shape.col; + int stride_size = row * row_stride; + + u8 *data = (u8 *)malloc(sizeof(u8) * stride_size); + if (!data) + return NULL; + + for (int i = 0; i < stride_size; i++) + data[i] = 0xaf; + + bmshape_t bms = BM_TENSOR_WITH_FMT(row, row_stride, 1, 1, BM_FMT_INT8); + bmmem_device_t devmem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + int ret = bm_memcpy_s2d(*ctx, devmem, data); + assert(ret == BM_SUCCESS); + + mg_t mg; + mg.base_reg_index = 0; + mg.start_address = bmmem_device_addr(devmem); + mg.shape.row = row; + mg.shape.col = col; + mg.stride = mg_stride; + + bmk1822_tdma_l2tg_matrix_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = ml; + p.dst = &mg; + + bmk1822_tdma_l2g_matrix_copy(bk_ctx, &p); + test_submit(ctx); + + ret = bm_memcpy_d2s(*ctx, data, devmem); + assert(ret == BM_SUCCESS); + + bmmem_device_free(*ctx, devmem); + return data; +} + +static void test_get_matrix_l2g_stride( + bmctx_t *ctx, bmk_ctx_t *bk_ctx) +{ + int row = 80; + int col = 70; + int size = row * col; + int row_stride = col * 2; + + ml_shape_t ml_shape = + bmk1822_matrix_lmem_default_shape(bk_ctx, row, col, FMT_I8); + bmk1822_matrix_tgmem_stride_t gmem_stride; + gmem_stride.row = row_stride; + int stride_size = row * row_stride; + + u8 *data_x = (u8 *)xmalloc(size); + if (!data_x) + return; + + for (int i = 0; i < size; i++) + data_x[i] = i; + + ml_t *ml_x = + bmk1822_lmem_alloc_matrix(bk_ctx,ml_shape, FMT_I8, 1); + put_matrix_g2l(ctx, bk_ctx, ml_x, data_x); + u8 *result_x = get_matrix_l2g_stride(ctx, bk_ctx, ml_x, gmem_stride); + u8 *ref_x = (u8 *)xmalloc(stride_size); + if (!result_x || !ref_x) + goto fail_exit; + + get_matrix_l2g_stride_ref(ref_x, data_x, ml_shape, gmem_stride); + + for (int i = 0; i < stride_size; i++) { + if (result_x[i] != ref_x[i]) { + printf("compare failed at result_x[%d], got %d, exp %d\n", + i, result_x[i], ref_x[i]); + exit(-1); + } + } + + bmk1822_lmem_free_matrix(bk_ctx, ml_x); + +fail_exit: + free(data_x); + free(result_x); + free(ref_x); +} + +int main (void) +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + test_get_matrix_l2g_stride(&ctx, bk_ctx); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_get_tensor_gl_stride.cpp b/cviruntime/test/1822/test_1822_get_tensor_gl_stride.cpp new file mode 100644 index 000000000..907f3ace9 --- /dev/null +++ b/cviruntime/test/1822/test_1822_get_tensor_gl_stride.cpp @@ -0,0 +1,161 @@ +#include "1822_test_util.h" + +static void get_tensor_l2g_stride_ref( + u8 *ref, u8 *a, + int n, int c, int h, int w, + bmk1822_tensor_lmem_stride_t tl_stride, + bmk1822_tensor_tgmem_stride_t tg_stride) +{ + /* + * Same as in get_tensor_l2g_stride(). 
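+ * The whole destination window is pre-filled with 0xcf so bytes that the
+ * strided copy must not touch can be verified afterwards.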
+ */ + int stride_size = n * tg_stride.n; + for (int i = 0; i < stride_size; i++) + ref[i] = 0xcf; + + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < c; ci++) { + for (int hi = 0; hi < h; hi++) { + for (int wi = 0; wi < w; wi++) { + u64 src_i = + (ni * c + ci) * tl_stride.c + + hi * tl_stride.h + + wi * 1; + u64 dst_i = + ni * tg_stride.n + + ci * tg_stride.c + + hi * tg_stride.h + + wi * 1; + ref[dst_i] = a[src_i]; + } + } + } + } +} + +static inline u8 * get_tensor_l2g_stride( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + tl_t *tl, + bmk1822_tensor_tgmem_stride_t tg_stride) +{ + int n = tl->shape.n; + int n_stride = tg_stride.n; + + int stride_size = n * n_stride; + u8 *data = (u8 *)malloc(sizeof(u8) * stride_size); + if (!data) + return NULL; + + for (int i = 0; i < stride_size; i++) + data[i] = 0xcf; + + bmshape_t bms = BM_TENSOR_WITH_FMT(n, n_stride, 1, 1, BM_FMT_INT8); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + int ret = bm_memcpy_s2d(*ctx, dev_mem, data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = tl->shape.n; + tg.shape.c = tl->shape.c; + tg.shape.h = tl->shape.h; + tg.shape.w = tl->shape.w; + tg.stride = tg_stride; + tg.base_reg_index = 0; + + bmk1822_tdma_l2tg_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = tl; + p.dst = &tg; + bmk1822_tdma_l2g_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + ret = bm_memcpy_d2s(*ctx, data, dev_mem); + assert(ret == BM_SUCCESS); + + bmmem_device_free(*ctx, dev_mem); + return data; +} + +static void test_get_tensor_l2g_gl_stride( + bmctx_t *ctx, bmk_ctx_t *bk_ctx) +{ + int n = 2; + int c = 35; + int h = 2; + int w = 3; + + tg_shape_t tg_shape; + tg_shape.n = n; + tg_shape.c = c; + tg_shape.h = h; + tg_shape.w = w; + + bmk1822_tensor_tgmem_stride_t tg_stride = + bmk1822_tensor_tgmem_default_stride(tg_shape, FMT_I8); + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h * w; + tl_shape.w = 1; + + int size = tl_shape.n * tl_shape.c * tl_shape.h * tl_shape.w; + u8 *data_x = (u8 *)xmalloc(size); + if (!data_x) + return; + + for (int i = 0; i < size; i++) + data_x[i] = i; + + tl_t *tl_x = + alloc_tl(bk_ctx, tl_shape, FMT_I8, 0); + put_tensor_g2l(ctx, bk_ctx, tl_x, data_x); + + tl_x->shape.n = n; + tl_x->shape.c = c; + tl_x->shape.h = h; + tl_x->shape.w = w; + + tl_x->stride = bmk1822_tensor_lmem_default_stride(bk_ctx, tl_x->shape, FMT_I8, 0); + + u8 *result_x = get_tensor_l2g_stride(ctx, bk_ctx, tl_x, tg_stride); + int stride_size = tg_shape.n * tg_stride.n; + u8 *ref_x = (u8 *)xmalloc(stride_size); + if (!result_x || !ref_x) + goto fail_exit; + + get_tensor_l2g_stride_ref(ref_x, + data_x, tg_shape.n, + tg_shape.c, tg_shape.h, + tg_shape.w, tl_x->stride, tg_stride); + + for (int i = 0; i < stride_size; i++) { + if (result_x[i] != ref_x[i]) { + printf("compare failed at result_x[%d], got %d, exp %d\n", + i, result_x[i], ref_x[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_x); + +fail_exit: + free(data_x); + free(result_x); + free(ref_x); +} + +int main (void) +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + test_get_tensor_l2g_gl_stride(&ctx, bk_ctx); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_get_tensor_stride.cpp b/cviruntime/test/1822/test_1822_get_tensor_stride.cpp new file mode 100644 index 000000000..91df9c9c6 --- /dev/null +++ b/cviruntime/test/1822/test_1822_get_tensor_stride.cpp @@ 
-0,0 +1,145 @@ +#include "1822_test_util.h" + +static void get_tensor_l2g_stride_ref( + u8 *ref, u8 *a, + tl_shape_t tl_shape, + bmk1822_tensor_tgmem_stride_t tg_stride) +{ + uint32_t n = tl_shape.n; + uint32_t c = tl_shape.c; + uint32_t h = tl_shape.h; + uint32_t w = tl_shape.w; + + uint32_t n_str = tg_stride.n; + uint32_t c_str = tg_stride.c; + uint32_t h_str = tg_stride.h; + uint32_t w_str = 1; + + /* + * Same as in get_tensor_l2g_stride(). + */ + int stride_size = n * tg_stride.n; + for (int i = 0; i < stride_size; i++) + ref[i] = 0xcf; + + for (uint32_t ni = 0; ni < n; ni++) { + for (uint32_t ci = 0; ci < c; ci++) { + for (uint32_t hi = 0; hi < h; hi++) { + for (uint32_t wi = 0; wi < w; wi++) { + uint32_t src_i = ni * c * h * w + ci * h * w + hi * w + wi; + uint32_t dst_i = ni * n_str + ci * c_str + hi * h_str + wi * w_str; + ref[dst_i] = a[src_i]; + } + } + } + } +} + +static inline u8 * get_tensor_l2g_stride( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + tl_t *tl, + bmk1822_tensor_tgmem_stride_t tg_stride) +{ + int n = tl->shape.n; + int n_stride = tg_stride.n; + + int stride_size = n * n_stride; + u8 *data = (u8 *)malloc(sizeof(u8) * stride_size); + if (!data) + return NULL; + + for (int i = 0; i < stride_size; i++) + data[i] = 0xcf; + + bmshape_t bms = BM_TENSOR_WITH_FMT(n, n_stride, 1, 1, BM_FMT_INT8); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + int ret = bm_memcpy_s2d(*ctx, dev_mem, data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = tl->shape.n; + tg.shape.c = tl->shape.c; + tg.shape.h = tl->shape.h; + tg.shape.w = tl->shape.w; + tg.stride = tg_stride; + + bmk1822_tdma_l2tg_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = tl; + p.dst = &tg; + bmk1822_tdma_l2g_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + ret = bm_memcpy_d2s(*ctx, data, dev_mem); + assert(ret == BM_SUCCESS); + + bmmem_device_free(*ctx, dev_mem); + return data; +} + +static void test_get_tensor_l2g_stride( + bmctx_t *ctx, bmk_ctx_t *bk_ctx) +{ + int n = 2; + int c = 15; + int h = 10; + int w = 8; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + int size = n * c * h * w; + u8 *data_x = (u8 *)xmalloc(size); + for (int i = 0; i < size; i++) + data_x[i] = i; + + bmk1822_tensor_tgmem_stride_t tg_stride; + tg_stride.h = w * 2; + tg_stride.c = tg_stride.h * h * 2; + tg_stride.n = tg_stride.c * c * 2; + tl_t *tl_x = + alloc_tl(bk_ctx, tl_shape, FMT_I8, 1); + + put_tensor_g2l(ctx, bk_ctx, tl_x, data_x); + u8 *result_x = get_tensor_l2g_stride(ctx, bk_ctx ,tl_x, tg_stride); + + int stride_size = n * tg_stride.n; + u8 *ref_x = (u8 *)xmalloc(stride_size); + if (!result_x || !ref_x) + goto fail_exit; + + get_tensor_l2g_stride_ref(ref_x, data_x, tl_shape, tg_stride); + + for (int i = 0; i < stride_size; i++) { + if (result_x[i] != ref_x[i]) { + printf("compare failed at result_x[%d], got %d, exp %d\n", + i, result_x[i], ref_x[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_x); + +fail_exit: + free(data_x); + free(result_x); + free(ref_x); +} + +int main (void) +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + test_get_tensor_l2g_stride(&ctx, bk_ctx); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_get_tensor_stride_unalign.cpp b/cviruntime/test/1822/test_1822_get_tensor_stride_unalign.cpp new file mode 100644 index 000000000..aad8a3067 --- /dev/null +++ 
b/cviruntime/test/1822/test_1822_get_tensor_stride_unalign.cpp @@ -0,0 +1,167 @@ +#include "1822_test_util.h" + +static void get_tensor_l2g_stride_unalign_ref( + u8 *ref, u8 *a, + tl_shape_t tl_shape, + bmk1822_tensor_tgmem_stride_t gmem_stride) +{ + int n = tl_shape.n; + int c = tl_shape.c; + int h = tl_shape.h; + int w = tl_shape.w; + int n_str = gmem_stride.n; + int c_str = gmem_stride.c; + int h_str = gmem_stride.h; + int new_n = n * 2; + int new_h = h / 2; + + /* + * Same as in get_tensor_l2g_stride_unalign(). + */ + int stride_size = new_n * gmem_stride.n; + for (int i = 0; i < stride_size; i++) + ref[i] = 0xcf; + + /* + * (n, c, h, w) => (n * 2, c, h / 2, w) + */ + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < c; ci++) { + for (int hi = 0; hi < h; hi++) { + for (int wi = 0; wi < w; wi++) { + u64 src_i = ni * c * h * w + ci * h * w + hi * w + wi; + u64 dst_i = (ni * 2 + hi / new_h) * n_str + + ci * c_str + (hi % new_h) * h_str + wi; + ref[dst_i] = a[src_i]; + } + } + } + } +} + +static inline u8 * get_tensor_l2g_stride( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + tl_t *tl, + bmk1822_tensor_tgmem_stride_t tg_stride) +{ + int n = tl->shape.n; + int n_stride = tg_stride.n; + int stride_size = n * n_stride; + u8 *data = (u8 *)malloc(sizeof(u8) * stride_size); + if (!data) + return NULL; + + for (int i = 0; i < stride_size; i++) + data[i] = 0xcf; + + bmshape_t bms = BM_TENSOR_WITH_FMT(n, n_stride, 1, 1, BM_FMT_INT8); + bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = bmmem_device_addr(dev_mem); + int ret = bm_memcpy_s2d(*ctx, dev_mem, data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = tl->shape.n; + tg.shape.c = tl->shape.c; + tg.shape.h = tl->shape.h; + tg.shape.w = tl->shape.w; + tg.stride = tg_stride; + tg.base_reg_index = 0; + + bmk1822_tdma_l2tg_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = tl; + p.dst = &tg; + bmk1822_tdma_l2g_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + ret = bm_memcpy_d2s(*ctx, data, dev_mem); + assert(ret == BM_SUCCESS); + + bmmem_device_free(*ctx, dev_mem); + return data; +} + +static void test_get_tensor_l2g_stride_unalign( + bmctx_t *ctx, bmk_ctx_t *bk_ctx) +{ + /* + * Make sure (h / 2 * w) is not eu-aligned. 
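+ * Reshaping (n, c, h, w) to (n * 2, c, h / 2, w) then yields a local
+ * stride that is not EU-aligned, which is the copy path under test.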
+ */
+  int n = 1;
+  int c = 5;
+  int h = 18;
+  int w = 7;
+
+  tl_shape_t tl_shape;
+  tl_shape.n = n;
+  tl_shape.c = c;
+  tl_shape.h = h;
+  tl_shape.w = w;
+
+  int size = n * c * h * w;
+  u8 *data_x = (u8 *)xmalloc(size);
+  if (!data_x)
+    return;
+
+  for (int i = 0; i < size; i++)
+    data_x[i] = i;
+
+  int new_n = n * 2;
+  int new_h = h / 2;
+
+  bmk1822_tensor_tgmem_stride_t tg_stride;
+  tg_stride.h = w * 2;
+  tg_stride.c = w * 2 * new_h * 2;
+  tg_stride.n = w * 2 * new_h * 2 * c * 2;
+
+  tl_t *tl_x =
+      alloc_tl(bk_ctx, tl_shape, FMT_I8, 1);
+  put_tensor_g2l(ctx, bk_ctx, tl_x, data_x);
+
+  tl_x->shape.n = new_n;
+  tl_x->shape.c = c;
+  tl_x->shape.h = new_h;
+  tl_x->shape.w = w;
+
+  tl_x->stride = bmk1822_tensor_lmem_default_stride(bk_ctx, tl_x->shape, FMT_I8, 0);
+  u8 *result_x = get_tensor_l2g_stride(ctx, bk_ctx, tl_x, tg_stride);
+  tl_x->shape = tl_shape;
+  tl_x->stride = bmk1822_tensor_lmem_default_stride(bk_ctx, tl_x->shape, FMT_I8, 1);
+
+  int stride_size = new_n * tg_stride.n;
+  u8 *ref_x = (u8 *)xmalloc(stride_size);
+  if (!result_x || !ref_x)
+    goto fail_exit;
+
+  get_tensor_l2g_stride_unalign_ref(ref_x, data_x, tl_shape, tg_stride);
+
+  for (int i = 0; i < stride_size; i++) {
+    if (result_x[i] != ref_x[i]) {
+      printf("compare failed at result_x[%d], got %d, exp %d\n",
+             i, result_x[i], ref_x[i]);
+      exit(-1);
+    }
+  }
+
+  free_tl(bk_ctx, tl_x);
+
+fail_exit:
+  free(data_x);
+  free(result_x);
+  free(ref_x);
+}
+
+int main (void)
+{
+  bmctx_t ctx;
+  bmk_ctx_t *bk_ctx;
+  test_init(&ctx, &bk_ctx);
+  test_get_tensor_l2g_stride_unalign(&ctx, bk_ctx);
+  test_exit(&ctx);
+  return 0;
+}
diff --git a/cviruntime/test/1822/test_1822_lut.cpp b/cviruntime/test/1822/test_1822_lut.cpp
new file mode 100644
index 000000000..83cf9f0c6
--- /dev/null
+++ b/cviruntime/test/1822/test_1822_lut.cpp
@@ -0,0 +1,101 @@
+#include "1822_test_util.h"
+static u32 channel = -1; //
+
+static u64 matrix_size(const ml_t *ml)
+{
+  u64 row = ml->shape.n;
+  u64 col = ml->shape.col;
+  return row * col;
+}
+
+static u64 res_size(param_t *p)
+{
+  if (p->res_is_int8 && !p->add_result)
+    return matrix_size(p->res);
+  else
+    return matrix_size(p->res) / 2;
+}
+
+static u8 * alloc_left(param_t *p)
+{
+  u64 size = matrix_size(p->left);
+
+  u8 *buf = (u8 *)malloc(sizeof(u8) * size);
+  for (u64 i = 0; i < size; i++)
+    buf[i] = i % 17 - 9;
+
+  return buf;
+}
+
+static u8 * alloc_right(param_t *p)
+{
+  u64 size = matrix_size(p->right);
+
+  u8 *buf = (u8 *)malloc(sizeof(u8) * size);
+  for (u64 i = 0; i < size; i++)
+    buf[i] = i % 13 - 6;
+
+  return buf;
+}
+
+static u16 * alloc_bias(param_t *p)
+{
+  if (!p->bias)
+    return NULL;
+
+  u64 size = matrix_size(p->bias) / 2;
+
+  u16 *buf = (u16 *)malloc(sizeof(u16) * size);
+  for (u64 i = 0; i < size; i++)
+    buf[i] = 5 - (i % 7);
+
+  return buf;
+}
+
+static u16 * alloc_res(param_t *p)
+{
+  u64 size = res_size(p);
+
+  u16 *buf = (u16 *)malloc(sizeof(u16) * size);
+  for (u64 i = 0; i < size; i++)
+    buf[i] = 17 - (i % 35);
+
+  return buf;
+}
+
+static void right_shift(param_t *p, s32 *buf, u64 size)
+{
+  int shift_bits = p->rshift_bits;
+  int round_up = 1;
+  if (1)
+    arith_right_shift(buf, size, shift_bits, round_up);
+  else
+    logic_right_shift(buf, size, shift_bits, round_up);
+}
+
+static void matrix_mac_ref(
+    param_t *p, u8 left[], u8 right[], u16 bias[], u16 res[])
+{
+  u64 size = res_size(p);
+  u32 left_col = p->left->shape.col;
+  u32 right_col = p->right->shape.col;
+  u32 res_row = p->left->shape.n;
+  u32 res_col = p->res->shape.col;
+  int left_sign = (p->left->fmt == FMT_I8);
+  int right_sign = (p->right->fmt == FMT_I8);
+  int res_sign =
(p->res->fmt == FMT_I8); + + s32 *tmp_res = (s32 *)malloc(sizeof(s32) * size); + if (p->add_result) { + for (u32 i = 0; i < res_row * res_col; i++) { + tmp_res[i] = res_sign? (s16)res[i]: res[i]; + tmp_res[i] <<= p->lshift_bits; + } + } else { + for (u32 i = 0; i < res_row * res_col; i++) + tmp_res[i] = 0; + } + + for (u32 row = 0; row < res_row; row++) { + for (u32 col = 0; col < res_col; col++) { + for (u32 i = 0; i < left_col; i++) { + u32 li = row * left_col + i; + u32 ri = i * right_col + col; + s32 l = left_sign? (s8)left[li]: left[li]; + s32 r = right_sign? (s8)right[ri]: right[ri]; + tmp_res[row * res_col + col] += l * r; + } + } + } + + if (p->bias && bias) { + for (u32 row = 0; row < res_row; row++) { + for (u32 col = 0; col < res_col; col++) { + int bias_sign = (p->bias->fmt == FMT_I8); + s32 b = bias_sign? (s16)bias[col]: bias[col]; + tmp_res[row * res_col + col] += b; + } + } + } + + if (p->relu_enable) + relu(tmp_res, size); + + right_shift(p, tmp_res, size); + + if (p->res_is_int8) + saturate_to_int8(tmp_res, size, res_sign); + else + saturate_to_int16(tmp_res, size, res_sign); + + for (u64 i = 0; i < size; i++) + res[i] = tmp_res[i]; + + free(tmp_res); +} + +static void put_bias( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + const ml_t *ml, + u16 data[]) +{ + u64 size = ml->shape.col; + + u8 *tmp = (u8 *)malloc(sizeof(u8) * size * 2); + if (!tmp) + return; + + for (u64 i = 0; i < size; i++) { + tmp[i] = data[i] & 0xff; + tmp[i + size] = (data[i] >> 8) & 0xff; + } + + put_matrix_g2l(ctx, bk_ctx, ml, tmp); + + free(tmp); +} + +static void put_res( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + const ml_t *ml, + u16 data[]) +{ + u64 size = ml->shape.n / 2 * ml->shape.col; + + u8 *tmp = (u8 *)malloc(sizeof(u8) * size * 2); + for (u64 i = 0; i < size; i++) { + tmp[i] = data[i] & 0xff; + tmp[i + size] = (data[i] >> 8) & 0xff; + } + + put_matrix_g2l(ctx, bk_ctx, ml, tmp); + + free(tmp); +} + +static u16 * get_res( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + param_t *p) +{ + u64 size = res_size(p); + u16 *res = (u16 *)malloc(sizeof(u16) * size); + + u8 *tmp = get_matrix_l2g(ctx, bk_ctx, p->res); + if (p->res_is_int8) { + int res_sign = (p->res->fmt == FMT_I8); + for (u64 i = 0; i < size; i++) + res[i] = res_sign? 
(s8)tmp[i]: tmp[i]; + } else { + for (u64 i = 0; i < size; i++) + res[i] = tmp[i] + (tmp[i + size] << 8); + } + + free(tmp); + return res; +} + +static void test_param(bmctx_t *ctx, bmk_ctx_t *bk_ctx, param_t *p) +{ + u8 *left = alloc_left(p); + u8 *right = alloc_right(p); + u16 *bias = alloc_bias(p); + u16 *ref = alloc_res(p); + + put_matrix_g2l(ctx, bk_ctx, p->left, left); + put_matrix_g2l(ctx, bk_ctx, p->right, right); + if (bias) + put_bias(ctx, bk_ctx, p->bias, bias); + if (p->add_result) + put_res(ctx, bk_ctx, p->res, ref); + + bmk1822_tiu_matrix_multiplication(bk_ctx, p); + u16 *res = get_res(ctx, bk_ctx, p); + + matrix_mac_ref(p, left, right, bias, ref); + + u64 size = res_size(p); + for (u64 i = 0; i < size; i++) { + if (res[i] != ref[i]) { + fprintf(stderr, "comparing failed at out[%" PRIu64 "], got %d, exp %d\n", + i, (s16)res[i], (s16)ref[i]); + exit(-1); + } + } + + free(left); + free(right); + free(bias); + free(ref); + free(res); +} + +static void destroy_param(bmk_ctx_t *bk_ctx, param_t *p) +{ + bmk1822_lmem_free_matrix(bk_ctx, p->res); + if (p->bias) + bmk1822_lmem_free_matrix(bk_ctx, p->bias); + bmk1822_lmem_free_matrix(bk_ctx, p->right); + bmk1822_lmem_free_matrix(bk_ctx, p->left); +} + +static ml_t *alloc_param_res( + bmk_ctx_t *bk_ctx, param_t *p) +{ + ml_shape_t s; + s.n = p->left->shape.n; + if (p->add_result || !p->res_is_int8) + s.n *= 2; + s.c = p->right->shape.c; + s.w = p->right->shape.w; + s.col = p->right->shape.col; + + fmt_t fmt = FMT_U8; + if (p->left->fmt == FMT_I8) + fmt = FMT_I8; + if (p->right->fmt == FMT_I8) + fmt = FMT_I8; + if (p->bias) + if (p->bias->fmt == FMT_I8) + fmt = FMT_I8; + + if (p->relu_enable) + fmt = FMT_U8; + + return bmk1822_lmem_alloc_matrix(bk_ctx, s, fmt, 1); +} + +static param_t param_0(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + p.ps32_mode = 0; + + u32 left_row = 1; + u32 left_col = 1; + u32 left_c = 1; + u32 left_w = 1; + + u32 right_row = 1; + u32 right_col = 1; + u32 right_c = 1; + u32 right_w = 1; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_1(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 6; + u32 left_col = 1; + u32 left_c = 1; + u32 left_w = 1; + + u32 right_row = 1; + u32 right_col = 1; + u32 right_c = 1; + u32 right_w = 1; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_2(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, 
sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 6; + u32 left_col = 25; + u32 left_c = 1; + u32 left_w = 25; + + u32 right_row = 25; + u32 right_col = 1; + u32 right_c = 1; + u32 right_w = 1; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_3(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 6; + u32 left_col = 25; + u32 left_c = 2; + u32 left_w = 18; + + u32 right_row = 25; + u32 right_col = 1; + u32 right_c = 1; + u32 right_w = 1; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_4(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 6; + u32 left_col = 39; + u32 left_c = 4; + u32 left_w = 10; + + u32 right_row = 39; + u32 right_col = 1; + u32 right_c = 1; + u32 right_w = 1; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_5(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 1; + u32 left_col = 1; + u32 left_c = 1; + u32 left_w = 1; + + u32 right_row = 1; + u32 right_col = 2; + u32 right_c = 1; + u32 right_w = 2; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_6(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + 
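+  // param_6: same 1x1 left as param_5, but the 2-column right matrix is
+  // split across two channels (c = 2, w = 1) instead of one.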
+ u32 left_row = 1; + u32 left_col = 1; + u32 left_c = 1; + u32 left_w = 1; + + u32 right_row = 1; + u32 right_col = 2; + u32 right_c = 2; + u32 right_w = 1; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_7(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 1; + u32 left_col = 1; + u32 left_c = 1; + u32 left_w = 1; + + u32 right_row = 1; + u32 right_col = 10; + u32 right_c = 3; + u32 right_w = 4; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_8(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 7; + u32 left_col = 1; + u32 left_c = 1; + u32 left_w = 1; + + u32 right_row = 1; + u32 right_col = 10; + u32 right_c = 3; + u32 right_w = 4; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_9(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 7; + u32 left_col = 23; + u32 left_c = 3; + u32 left_w = 8; + + u32 right_row = 23; + u32 right_col = 10; + u32 right_c = 3; + u32 right_w = 4; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_10(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 7; + u32 left_col = 477; + u32 left_c = 60; + u32 left_w = 8; + + u32 right_row = 477; + u32 right_col = 10; + u32 
right_c = 3; + u32 right_w = 4; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_11(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 7; + u32 left_col = 23; + u32 left_c = 3; + u32 left_w = 8; + + u32 right_row = 23; + u32 right_col = 477; + u32 right_c = 60; + u32 right_w = 8; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_12(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 1; + u32 left_col = 1; + u32 left_c = 1; + u32 left_w = 1; + + u32 right_row = 1; + u32 right_col = 1; + u32 right_c = 1; + u32 right_w = 1; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1822_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_I8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_13(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 1; + u32 left_col = 1; + u32 left_c = 1; + u32 left_w = 1; + + u32 right_row = 1; + u32 right_col = 2; + u32 right_c = 1; + u32 right_w = 2; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1822_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_I8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_14(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 7; + u32 
left_col = 477; + u32 left_c = 60; + u32 left_w = 8; + + u32 right_row = 477; + u32 right_col = 10; + u32 right_c = 3; + u32 right_w = 4; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1822_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_I8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_15(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 3; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 7; + u32 left_col = 477; + u32 left_c = 60; + u32 left_w = 8; + + u32 right_row = 477; + u32 right_col = 10; + u32 right_c = 3; + u32 right_w = 4; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1822_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_I8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_16(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 3; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 7; + u32 left_col = 477; + u32 left_c = 60; + u32 left_w = 8; + + u32 right_row = 477; + u32 right_col = 10; + u32 right_c = 3; + u32 right_w = 4; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1822_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_17(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 3; + p.res_is_int8 = true; + p.relu_enable = true; + p.add_result = false; + + u32 left_row = 7; + u32 left_col = 477; + u32 left_c = 60; + u32 left_w = 8; + + u32 right_row = 477; + u32 right_col = 10; + u32 right_c = 3; + u32 right_w = 4; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = 
bmk1822_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_18(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 7; + u32 left_col = 23; + u32 left_c = 3; + u32 left_w = 8; + + u32 right_row = 23; + u32 right_col = 477; + u32 right_c = 60; + u32 right_w = 8; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1822_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_19(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 1; + u32 left_col = 1; + u32 left_c = 1; + u32 left_w = 1; + + u32 right_row = 1; + u32 right_col = 1; + u32 right_c = 1; + u32 right_w = 1; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1822_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_I8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_20(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 4; + u32 left_col = 1; + u32 left_c = 1; + u32 left_w = 1; + + u32 right_row = 1; + u32 right_col = 1; + u32 right_c = 1; + u32 right_w = 1; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1822_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_I8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_21(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 3; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 7; + u32 left_col = 477; + u32 left_c = 60; + u32 left_w = 8; + + u32 right_row = 477; + u32 right_col = 10; + u32 right_c = 3; + u32 right_w = 4; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t 
right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1822_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_22(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 2; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 7; + u32 left_col = 23; + u32 left_c = 3; + u32 left_w = 8; + + u32 right_row = 23; + u32 right_col = 477; + u32 right_c = 60; + u32 right_w = 8; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1822_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_23(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 2; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 7; + u32 left_col = 23; + u32 left_c = 3; + u32 left_w = 8; + + u32 right_row = 23; + u32 right_col = 477; + u32 right_c = 60; + u32 right_w = 8; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1822_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_24(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = true; + + u32 left_row = 1; + u32 left_col = 1; + u32 left_c = 1; + u32 left_w = 1; + + u32 right_row = 1; + u32 right_col = 1; + u32 right_c = 1; + u32 right_w = 1; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1822_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_I8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_25(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + 
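+  // add_result (set below): the TIU accumulates onto the 16-bit values that
+  // test_param() preloads via put_res(); matrix_mac_ref() mirrors this by
+  // seeding tmp_res from res << lshift_bits.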
p.add_result = true; + + u32 left_row = 4; + u32 left_col = 1; + u32 left_c = 1; + u32 left_w = 1; + + u32 right_row = 1; + u32 right_col = 1; + u32 right_c = 1; + u32 right_w = 1; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1822_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_I8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_26(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = true; + + u32 left_row = 4; + u32 left_col = 1; + u32 left_c = 1; + u32 left_w = 1; + + u32 right_row = 1; + u32 right_col = 1; + u32 right_c = 1; + u32 right_w = 1; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1822_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_I8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_27(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 3; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = true; + + u32 left_row = 7; + u32 left_col = 477; + u32 left_c = 60; + u32 left_w = 8; + + u32 right_row = 477; + u32 right_col = 10; + u32 right_c = 3; + u32 right_w = 4; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1822_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_28(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 2; + p.rshift_bits = 3; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = true; + + u32 left_row = 7; + u32 left_col = 477; + u32 left_c = 60; + u32 left_w = 8; + + u32 right_row = 477; + u32 right_col = 10; + u32 right_c = 3; + u32 right_w = 4; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, 
right_shape, FMT_I8, 1); + p.bias = bmk1822_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_29(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 2; + p.rshift_bits = 3; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = true; + + u32 left_row = 7; + u32 left_col = 477; + u32 left_c = 60; + u32 left_w = 8; + + u32 right_row = 477; + u32 right_col = 10; + u32 right_c = 3; + u32 right_w = 4; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1822_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_30(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 2; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = true; + + u32 left_row = 7; + u32 left_col = 23; + u32 left_c = 3; + u32 left_w = 8; + + u32 right_row = 23; + u32 right_col = 477; + u32 right_c = 60; + u32 right_w = 8; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1822_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_31(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 3; + p.rshift_bits = 2; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = true; + + u32 left_row = 7; + u32 left_col = 23; + u32 left_c = 3; + u32 left_w = 8; + + u32 right_row = 23; + u32 right_col = 477; + u32 right_c = 60; + u32 right_w = 8; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1822_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_32(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 6; + p.rshift_bits = 2; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = true; + + u32 left_row = 7; + u32 left_col = 23; + u32 left_c = 3; + u32 left_w = 8; + + u32 right_row = 23; + u32 right_col = 477; + u32 right_c = 60; + u32 right_w = 8; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + 
left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1822_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_33(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 6; + p.rshift_bits = 2; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = true; + + u32 left_row = 7; + u32 left_col = 23; + u32 left_c = 3; + u32 left_w = 8; + + u32 right_row = 23; + u32 right_col = 477; + u32 right_c = 60; + u32 right_w = 8; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1822_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_34(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 1; + p.rshift_bits = 13; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = true; + + u32 left_row = 7; + u32 left_col = 23; + u32 left_c = 3; + u32 left_w = 8; + + u32 right_row = 23; + u32 right_col = 477; + u32 right_c = 60; + u32 right_w = 8; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_U8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_U8, 1); + p.bias = bmk1822_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_35(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 1; + p.rshift_bits = 10; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = true; + + u32 left_row = 7; + u32 left_col = 23; + u32 left_c = 3; + u32 left_w = 8; + + u32 right_row = 23; + u32 right_col = 477; + u32 right_c = 60; + u32 right_w = 8; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_U8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_U8, 1); + p.bias = bmk1822_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_36(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 1; + p.rshift_bits = 10; + 
p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 7; + u32 left_col = 23; + u32 left_c = 3; + u32 left_w = 8; + + u32 right_row = 23; + u32 right_col = 477; + u32 right_c = 60; + u32 right_w = 8; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_U8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_U8, 1); + p.bias = bmk1822_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_37(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 1; + p.rshift_bits = 10; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 7; + u32 left_col = 23; + u32 left_c = 3; + u32 left_w = 8; + + u32 right_row = 23; + u32 right_col = 477; + u32 right_c = 60; + u32 right_w = 8; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_U8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_U8, 1); + p.bias = NULL; + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_38(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 1; + p.rshift_bits = 6; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 7; + u32 left_col = 23; + u32 left_c = 3; + u32 left_w = 8; + + u32 right_row = 23; + u32 right_col = 477; + u32 right_c = 60; + u32 right_w = 8; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_U8, 1); + p.right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +#define test_one_param(n) \ + do { \ + param_t p = param_##n(bk_ctx); \ + test_param(&ctx, bk_ctx, &p); \ + destroy_param(bk_ctx, &p); \ + } while (0) + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_one_param(0); + test_one_param(1); + test_one_param(2); + test_one_param(3); + test_one_param(4); + test_one_param(5); + test_one_param(6); + test_one_param(7); + test_one_param(8); + test_one_param(9); + test_one_param(10); + test_one_param(11); + test_one_param(12); + test_one_param(13); + test_one_param(14); + test_one_param(15); + test_one_param(16); + test_one_param(17); + test_one_param(18); + test_one_param(19); + test_one_param(20); + test_one_param(21); + test_one_param(22); + test_one_param(23); + test_one_param(24); + test_one_param(25); + test_one_param(26); + test_one_param(27); + test_one_param(28); + test_one_param(29); + test_one_param(30); + test_one_param(31); + test_one_param(32); + test_one_param(33); + 
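+  // params 34-38 switch to unsigned (FMT_U8) operands; param_38 mixes an
+  // unsigned left with a signed right matrix.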
test_one_param(34); + test_one_param(35); + test_one_param(36); + test_one_param(37); + test_one_param(38); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_matrix_mac_ps32.cpp b/cviruntime/test/1822/test_1822_matrix_mac_ps32.cpp new file mode 100644 index 000000000..1cd2a4107 --- /dev/null +++ b/cviruntime/test/1822/test_1822_matrix_mac_ps32.cpp @@ -0,0 +1,559 @@ +#include "1822_test_util.h" + +typedef bmk1822_tiu_matrix_multiplication_param_t param_t; + +typedef struct{ + fmt_t left_sign; + u32 left_row ; + u32 left_col ; + u32 left_c ; + u32 left_w ; + fmt_t right_sign; + u32 right_row ; + u32 right_col ; + u32 right_c ; + u32 right_w ; + u32 lshift_bits ; + u32 rshift_bits ; + u32 relu_enable ; + u32 using_bias; + fmt_t bias_sign; +} matrix_init_para_t; + +matrix_init_para_t matrix_para_t; + +static void make_bmk_matrix_param_ps32(bmk_ctx_t *bk_ctx, param_t *p, int ps32_mode); +static param_t param_init(); + +void print_param(param_t *p) +{ + printf("ps32_mode =%d\n",p->ps32_mode); + printf("left_shape.n =%d\n",p->left->shape.n); + printf("left_shape.col =%d\n",p->left->shape.col); + printf("left_shape.c =%d\n",p->left->shape.c); + printf("left_shape.w =%d\n",p->left->shape.w); + printf("left_fmt =%d\n",p->left->fmt); + printf("right_shape.n =%d\n",p->right->shape.n); + printf("right_shape.col =%d\n",p->right->shape.col); + printf("right_shape.c =%d\n",p->right->shape.c); + printf("right_shape.w =%d\n",p->right->shape.w); + printf("right_fmt =%d\n",p->right->fmt); + if(p->bias) + { + printf("bias_shape.n =%d\n",p->bias->shape.n); + printf("bias_shape.col =%d\n",p->bias->shape.col); + printf("bias_shape.c =%d\n",p->bias->shape.c); + printf("bias_shape.w =%d\n",p->bias->shape.w); + printf("bias_fmt =%d\n",p->bias->fmt); + } + printf("result_shape.n =%d\n",p->res->shape.n); + printf("result_shape.col =%d\n",p->res->shape.col); + printf("result_shape.c =%d\n",p->res->shape.c); + printf("result_shape.w =%d\n",p->res->shape.w); + printf("result_fmt =%d\n",p->res->fmt); + printf("relu_enable=%d\n",p->relu_enable); + printf("rshift_bits=%d\n",p->rshift_bits); +} + + +static u64 matrix_size(const ml_t *ml) +{ + u64 row = ml->shape.n; + u64 col = ml->shape.col; + return row * col; +} + +static u64 res_ps32_size(param_t *p) +{ + return matrix_size(p->res); +} + +static u64 res_size(param_t *p) +{ + if (p->res_is_int8 && !p->add_result) + return matrix_size(p->res); + else + return matrix_size(p->res) *2 ; +} + +static u8 * alloc_left(param_t *p) +{ + u64 size = matrix_size(p->left); + u8 *buf = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + buf[i] = i % 17 - 9; + + return buf; +} + +static u8 * alloc_right(param_t *p) +{ + u64 size = matrix_size(p->right); + + u8 *buf = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + buf[i] = i % 13 - 6; + + return buf; +} +static u16 * alloc_bias(param_t *p) +{ + if (!p->bias) + return NULL; + + u64 size = matrix_size(p->bias) / 2; + + u16 *buf = (u16 *)malloc(sizeof(u16) * size); + for (u64 i = 0; i < size; i++) + buf[i] = 5 - (i % 7); + + return buf; +} + +static u8 * alloc_ps32_res(param_t *p) +{ + u64 size = res_ps32_size(p)*4; + u8 *buf = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + buf[i] = 17 - (i % 35); + + return buf; +} + +static void right_shift(param_t *p, s32 *buf, u64 size) +{ + int shift_bits = p->rshift_bits; + int round_up = 1; + if (1) + arith_right_shift(buf, size, shift_bits, round_up); + else + logic_right_shift(buf, size, shift_bits, round_up); +} 
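+
+/*
+ * ps32 reference flow (mode numbering as consumed by
+ * make_bmk_matrix_param_ps32() below):
+ *   mode 2 - first pass: zero the accumulator, multiply-accumulate, store
+ *            the raw 32-bit partial sums as four byte planes;
+ *   mode 3 - middle pass: reload the four byte planes, accumulate more
+ *            products on top, store them back;
+ *   mode 1 - final pass: accumulate once more, add bias, apply relu and the
+ *            arithmetic right shift, saturate, and keep the low two byte
+ *            planes as the 16-bit result.
+ */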
+ +static int ps32_m2_matrix_mac_ref( + param_t *p, + u8 *left, + u8 *right, + u8 *res) +{ + u64 size = res_ps32_size(p); + u32 left_col = p->left->shape.col; + u32 right_col = p->right->shape.col; + u32 res_row = p->left->shape.n; + u32 res_col = p->res->shape.col; + int left_sign = (p->left->fmt == FMT_I8); + int right_sign = (p->right->fmt == FMT_I8); + int ret = BM_SUCCESS; + int bstride = res_row * res_col; + + s32 *tmp_res = (s32 *)malloc(sizeof(s32) * size); + for (u32 i = 0; i < res_row * res_col; i++) + tmp_res[i] = 0; + for (u32 row = 0; row < res_row; row++) { + for (u32 col = 0; col < res_col; col++) { + for (u32 i = 0; i < left_col; i++) { + u32 li = row * left_col + i; + u32 ri = i * right_col + col; + s32 l = left_sign? (s8)left[li]: left[li]; + s32 r = right_sign? (s8)right[ri]: right[ri]; + tmp_res[row * res_col + col] += l * r; + } + } + } + + for (u64 i = 0; i < size; i++) + res[i + bstride*0] = tmp_res[i]>>0; + for (u64 i = 0; i < size; i++) + res[i + bstride*1] = tmp_res[i]>>8; + for (u64 i = 0; i < size; i++) + res[i + bstride*2] = tmp_res[i]>>16; + for (u64 i = 0; i < size; i++) + res[i + bstride*3] = tmp_res[i]>>24; + + free(tmp_res); + + return ret; +} + +static int ps32_m3_matrix_mac_ref( + param_t *p, + u8 *left, + u8 *right, + u8 *res) +{ + u64 size = res_ps32_size(p); + u32 left_col = p->left->shape.col; + u32 right_col = p->right->shape.col; + u32 res_row = p->left->shape.n; + u32 res_col = p->res->shape.col; + int left_sign = (p->left->fmt == FMT_I8); + int right_sign = (p->right->fmt == FMT_I8); + int ret = BM_SUCCESS; + int bstride = res_row * res_col; + + u32 *tmp_res = (u32 *)malloc(sizeof(u32) * size); + + for (u64 i = 0; i < size; i++) + tmp_res[i] = res[i + bstride*0]; + for (u64 i = 0; i < size; i++) + tmp_res[i] |= res[i + bstride*1]<<8; + for (u64 i = 0; i < size; i++) + tmp_res[i] |= res[i + bstride*2]<<16; + for (u64 i = 0; i < size; i++) + tmp_res[i] |= res[i + bstride*3]<<24; + + for (u32 row = 0; row < res_row; row++) { + for (u32 col = 0; col < res_col; col++) { + for (u32 i = 0; i < left_col; i++) { + u32 li = row * left_col + i; + u32 ri = i * right_col + col; + s32 l = left_sign? (s8)left[li]: left[li]; + s32 r = right_sign? 
(s8)right[ri]: right[ri]; + tmp_res[row * res_col + col] += l * r; + } + } + } + + for (u64 i = 0; i < size; i++) + res[i + bstride*0] = tmp_res[i]>>0; + for (u64 i = 0; i < size; i++) + res[i + bstride*1] = tmp_res[i]>>8; + for (u64 i = 0; i < size; i++) + res[i + bstride*2] = tmp_res[i]>>16; + for (u64 i = 0; i < size; i++) + res[i + bstride*3] = tmp_res[i]>>24; + + free(tmp_res); + + return ret; +} + +static int ps32_m1_matrix_mac_ref( + param_t *p, + u8 *left, + u8 *right, + u16 * bias, + u8 *res) +{ + u64 size = res_ps32_size(p); + u32 left_col = p->left->shape.col; + u32 right_col = p->right->shape.col; + u32 res_row = p->left->shape.n; + u32 res_col = p->res->shape.col; + int left_sign = (p->left->fmt == FMT_I8); + int right_sign = (p->right->fmt == FMT_I8); + int res_sign = (p->res->fmt == FMT_I8); + int ret = BM_SUCCESS; + int bstride = res_row * res_col; + + s32 *tmp_res = (s32 *)malloc(sizeof(s32) * size); + + for (u64 i = 0; i < size; i++) + tmp_res[i] = res[i + bstride*0]; + for (u64 i = 0; i < size; i++) + tmp_res[i] |= res[i + bstride*1]<<8; + for (u64 i = 0; i < size; i++) + tmp_res[i] |= res[i + bstride*2]<<16; + for (u64 i = 0; i < size; i++) + tmp_res[i] |= res[i + bstride*3]<<24; + + for (u32 row = 0; row < res_row; row++) { + for (u32 col = 0; col < res_col; col++) { + for (u32 i = 0; i < left_col; i++) { + u32 li = row * left_col + i; + u32 ri = i * right_col + col; + s32 l = left_sign? (s8)left[li]: left[li]; + s32 r = right_sign? (s8)right[ri]: right[ri]; + tmp_res[row * res_col + col] += l * r; + } + } + } + + if (p->bias && bias) { + for (u32 row = 0; row < res_row; row++) { + for (u32 col = 0; col < res_col; col++) { + int bias_sign = (p->bias->fmt == FMT_I8); + s32 b = bias_sign? (s16)bias[col]: bias[col]; + tmp_res[row * res_col + col] += b; + } + } + } + + if (p->relu_enable) + relu(tmp_res, size); + right_shift(p, tmp_res, size); + if (p->res_is_int8) + saturate_to_int8(tmp_res, size, res_sign); + else + saturate_to_int16(tmp_res, size, res_sign); + + for (u64 i = 0; i < size; i++) + res[i + bstride*0] = tmp_res[i]>>0; + for (u64 i = 0; i < size; i++) + res[i + bstride*1] = tmp_res[i]>>8; + + free(tmp_res); + + return ret; +} + +static void put_bias( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + const ml_t *ml, + u16 data[]) +{ + u64 size = ml->shape.col; + + u8 *tmp = (u8 *)malloc(sizeof(u8) * size * 2); + if (!tmp) + return; + + for (u64 i = 0; i < size; i++) { + tmp[i] = data[i] & 0xff; + tmp[i + size] = (data[i] >> 8) & 0xff; + } + put_matrix_g2l(ctx, bk_ctx, ml, tmp); + + free(tmp); +} + + +static int test_matrix_ps32_ut(bmctx_t *ctx, bmk_ctx_t *bk_ctx, param_t *p) +{ + make_bmk_matrix_param_ps32(bk_ctx, p, 2); + u8 *left = alloc_left(p); + u8 *right = alloc_right(p); + u8 *ref = alloc_ps32_res(p); + + { + bmerr_t ret = ps32_m2_matrix_mac_ref(p, left, right, ref); + assert(ret == BM_SUCCESS); + + put_matrix_g2l(ctx, bk_ctx, p->left, left); + put_matrix_g2l(ctx, bk_ctx, p->right, right); + bmk1822_tiu_matrix_multiplication(bk_ctx, p); + bmk1822_matrix_lmem_t ps32_res; + ps32_res = *p->res; + ps32_res.shape.n *= sizeof(int); + u8 *res = get_matrix_l2g(ctx, bk_ctx, &ps32_res); + + int has_error = array_cmp_int8( + "Comparing begin_mode results ...\n", + (s8 *)ref, (s8 *)res ,(int)res_ps32_size(p)*sizeof(int)); + if (has_error) { + printf("Comparison M2 FAILED\n"); + print_param(p); + exit(-1); + }else + printf("Comparison M2 PASS\n"); + free(res); + } + + { + make_bmk_matrix_param_ps32(bk_ctx, p, 3); + + bmerr_t ret = ps32_m3_matrix_mac_ref(p, left, right, ref); + 
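+    // no new g2l transfers in this pass: mode 3 accumulates onto the operands
+    // and 32-bit partial sums still resident in local memory from mode 2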
assert(ret == BM_SUCCESS); + + bmk1822_tiu_matrix_multiplication(bk_ctx, p); + bmk1822_matrix_lmem_t ps32_res; + ps32_res = *p->res; + ps32_res.shape.n *= sizeof(int); + u8 *res = get_matrix_l2g(ctx, bk_ctx, &ps32_res); + + int has_error = array_cmp_int8( + "Comparing m3 results ...\n", + (s8 *)ref, (s8 *)res ,(int)res_ps32_size(p)*sizeof(int)); + if (has_error) { + printf("Comparison M3 FAILED\n"); + print_param(p); + exit(-1); + }else + printf("Comparison M3 PASS\n"); + + free(res); + } + { + make_bmk_matrix_param_ps32(bk_ctx, p, 1); + u16 *bias = alloc_bias(p); + + bmerr_t ret = ps32_m1_matrix_mac_ref(p, left, right, bias, ref); + assert(ret == BM_SUCCESS); + + if(p->bias) + put_bias(ctx, bk_ctx, p->bias, bias); + + bmk1822_tiu_matrix_multiplication(bk_ctx, p); + bmk1822_matrix_lmem_t ps32_res; + ps32_res = *p->res; + ps32_res.shape.n *= 2; + + u8 *res = get_matrix_l2g(ctx, bk_ctx, &ps32_res); + int has_error = array_cmp_int8( + "Comparing m1 results ...\n", + (s8 *)ref, (s8 *)res ,(int)res_size(p)); + if (has_error) { + printf("Comparison M1 FAILED\n"); + print_param(p); + exit(-1); + }else + printf("Comparison M1 PASS\n"); + + free(res); + free(bias); + } + free(left); + free(right); + free(ref); + return 1; +} + +static void destroy_param(bmk_ctx_t *bk_ctx, param_t *p) +{ + if (p->bias) + bmk1822_lmem_free_matrix(bk_ctx, p->bias); + bmk1822_lmem_free_matrix(bk_ctx, p->res); + bmk1822_lmem_free_matrix(bk_ctx, p->right); + bmk1822_lmem_free_matrix(bk_ctx, p->left); +} + +static fmt_t modify_res_fmt() +{ + fmt_t fmt = FMT_U8; + if (matrix_para_t.left_sign == FMT_I8) + fmt = FMT_I8; + if (matrix_para_t.right_sign == FMT_I8) + fmt = FMT_I8; + if (matrix_para_t.using_bias) + if (matrix_para_t.bias_sign == FMT_I8) + fmt = FMT_I8; + +// if (matrix_para_t.relu_enable) +// fmt = FMT_U8; + + return fmt; +} + +static ml_t *alloc_param_res( + bmk_ctx_t *bk_ctx, param_t *p) +{ + ml_shape_t s; + s.n = p->left->shape.n; + s.c = p->right->shape.c; + s.w = p->right->shape.w; + s.col = p->right->shape.col; + + fmt_t fmt = FMT_U8; + fmt = modify_res_fmt(); + return bmk1822_lmem_alloc_ps32_matrix(bk_ctx, s, fmt, 1); +} + + +static void make_bmk_matrix_param_ps32(bmk_ctx_t *bk_ctx, param_t *p, int ps32_mode) +{ + + ml_shape_t left_shape; + ml_shape_t right_shape; + + p->ps32_mode = ps32_mode; + p->relu_enable = 0; + p->lshift_bits = 0; + p->rshift_bits = 0; + + if(ps32_mode==2) + { + left_shape.n = matrix_para_t.left_row; + left_shape.c = matrix_para_t.left_c; + left_shape.w = matrix_para_t.left_w; + left_shape.col = matrix_para_t.left_col; + + right_shape.n = matrix_para_t.right_row; + right_shape.c = matrix_para_t.right_c; + right_shape.w = matrix_para_t.right_w; + right_shape.col = matrix_para_t.right_col; + p->left = bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, matrix_para_t.left_sign , 1); + p->right = bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, matrix_para_t.right_sign, 1); + p->bias = NULL; + p->res = alloc_param_res(bk_ctx, p); + }else if(ps32_mode==3) + { + + }else if(ps32_mode==1) + { + p->relu_enable = matrix_para_t.relu_enable; + p->rshift_bits = matrix_para_t.rshift_bits; + if(matrix_para_t.using_bias) + { + right_shape.n = matrix_para_t.right_row; + right_shape.c = matrix_para_t.right_c; + right_shape.w = matrix_para_t.right_w; + right_shape.col = matrix_para_t.right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + p->bias = bmk1822_lmem_alloc_matrix(bk_ctx, bias_shape, matrix_para_t.bias_sign, 1); + assert(p->bias); + } + } + +} +static param_t param_init(void) +{ + 
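+  // Shapes and signs are randomized; left_c/right_c are computed as
+  // ceil(col / w) so each matrix's columns are spread over whole channels.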
param_t p;
+
+  //srand(clock());
+
+  memset(&p, 0, sizeof(param_t));
+  memset(&matrix_para_t, 0, sizeof(matrix_init_para_t));
+
+  matrix_para_t.rshift_bits = rand() % 4 + 2;
+  matrix_para_t.using_bias = rand() % 2;
+  matrix_para_t.relu_enable = rand() % 2;
+  matrix_para_t.right_sign = rand() % 2 ? FMT_I8 : FMT_U8;
+  matrix_para_t.left_sign = rand() % 2 ? FMT_I8 : FMT_U8;
+
+  if (matrix_para_t.using_bias)
+    matrix_para_t.bias_sign = rand() % 2 ? FMT_I8 : FMT_U8;
+
+  /* relu on an all-unsigned multiply would be a no-op, so skip it */
+  if (matrix_para_t.right_sign != FMT_I8 && matrix_para_t.left_sign != FMT_I8)
+    matrix_para_t.relu_enable = 0;
+
+  matrix_para_t.left_row = rand() % 60 + 1;
+  matrix_para_t.left_col = rand() % 40 + 1;
+  matrix_para_t.left_w = matrix_para_t.left_col / 0x10 ? rand() % 8 + 8 : matrix_para_t.left_col;
+  //matrix_para_t.left_w = rand() % 16 + 1;
+  matrix_para_t.left_c =
+      matrix_para_t.left_col % matrix_para_t.left_w ?
+      matrix_para_t.left_col / matrix_para_t.left_w + 1 : matrix_para_t.left_col / matrix_para_t.left_w;
+
+  matrix_para_t.right_row = matrix_para_t.left_col;
+  matrix_para_t.right_col = rand() % 50 + 1;
+  //matrix_para_t.right_w = 16;
+  matrix_para_t.right_w = rand() % 16 + 1;
+  matrix_para_t.right_c =
+      matrix_para_t.right_col % matrix_para_t.right_w ?
+      matrix_para_t.right_col / matrix_para_t.right_w + 1 : matrix_para_t.right_col / matrix_para_t.right_w;
+
+  return p;
+}
+
+int main()
+{
+  bmctx_t ctx;
+  bmk_ctx_t *bk_ctx;
+  test_init(&ctx, &bk_ctx);
+
+  int test_finished_num = 0;
+  for (int i = 0; i < 20; i++) {
+    printf("random_test_matrix_ps32 iteration: %d\n", i);
+    param_t p = param_init();
+
+    test_finished_num += test_matrix_ps32_ut(&ctx, bk_ctx, &p);
+    destroy_param(bk_ctx, &p);
+  }
+  printf("test_finished_num: %d\n", test_finished_num);
+
+  test_exit(&ctx);
+  return 0;
+}
diff --git a/cviruntime/test/1822/test_1822_matrix_mac_qdm.cpp b/cviruntime/test/1822/test_1822_matrix_mac_qdm.cpp
new file mode 100644
index 000000000..5f9b22917
--- /dev/null
+++ b/cviruntime/test/1822/test_1822_matrix_mac_qdm.cpp
@@ -0,0 +1,821 @@
+#include <limits.h>   /* INT_MIN / INT_MAX */
+#include "1822_test_util.h"
+#include "test_tf_quant_util.h"
+
+// #define ENABLE_DEBUG_MSG
+// #define ENABLE_TV_GEN_PATTERN
+
+#define MIN_EXEC_TESTS 20
+
+using param_t = bmk1822_tiu_matrix_multiplication_qdm_param_t;
+
+typedef struct {
+  int left_row;
+  int left_col;
+  int right_col;
+  int has_bias;
+  int relu_enable;
+  s8 *input_data;
+  s8 *filter_data;
+  s8 *output_data;
+  s32 *bias_data;
+  u32 multiplier;
+  s8 right_shift;
+  float float_multiplier;
+  int retry_cnt;
+} fc_test_param_t;
+
+void fully_connected_ref(fc_test_param_t *p_param)
+{
+  const s32 input_offset = 0;
+  const s32 filter_offset = 0;
+  const s32 output_offset = 0;
+  const s32 output_multiplier = p_param->multiplier;
+  const int output_rshift = p_param->right_shift;
+  const int batches = p_param->left_row;
+  const int output_depth = p_param->right_col;
+  const int accum_depth = p_param->left_col;
+  s8 *input_data = p_param->input_data;
+  s8 *filter_data = p_param->filter_data;
+  s8 *output_data = p_param->output_data;
+  s32 *bias_data = p_param->has_bias ?
+
+  const s32 output_activation_min = -128;
+  const s32 output_activation_max = 127;
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("fully_connected_ref:\n");
+  printf("  batches %d, output_depth %d, accum_depth %d, filter_offset %d, "
+         "input_offset %d\n",
+         batches, output_depth, accum_depth, filter_offset, input_offset);
+  printf("  output_multiplier %d, output_rshift %d\n", output_multiplier,
+         output_rshift);
+#endif
+
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      s32 acc = 0;
+      for (int d = 0; d < accum_depth; ++d) {
+        s32 input_val = input_data[b * accum_depth + d];
+        // s32 filter_val = filter_data[out_c * accum_depth + d];
+        s32 filter_val = filter_data[output_depth * d + out_c];
+        acc += (filter_val + filter_offset) * (input_val + input_offset);
+
+#ifdef ENABLE_DEBUG_MSG
+        printf("  [%d][%d][%d] acc(%d) += (%d + %d) * (%d + %d) = %d\n", b,
+               out_c, d,
+               acc - (filter_val + filter_offset) * (input_val + input_offset),
+               filter_val, filter_offset, input_val, input_offset, acc);
+#endif
+      }
+      if (bias_data) {
+        acc += bias_data[out_c];
+
+#ifdef ENABLE_DEBUG_MSG
+        printf("  [%d][%d] acc %d, bias %d\n", b, out_c, acc,
+               bias_data ? bias_data[out_c] : 0);
+#endif
+      }
+      acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_rshift);
+
+#ifdef ENABLE_DEBUG_MSG
+      printf("  [%d][%d] acc %d, output_multiplier %d, output_rshift %d\n", b,
+             out_c, acc, output_multiplier, output_rshift);
+#endif
+
+      acc += output_offset;
+
+#ifdef ENABLE_DEBUG_MSG
+      printf("  [%d][%d] acc %d, output_offset %d\n", b, out_c, acc,
+             output_offset);
+#endif
+
+      acc = MAX(acc, output_activation_min);
+      acc = MIN(acc, output_activation_max);
+
+#ifdef ENABLE_DEBUG_MSG
+      printf("  [%d][%d] acc %d, output_activation_min %d, "
+             "output_activation_max %d\n",
+             b, out_c, acc, output_activation_min, output_activation_max);
+#endif
+
+      output_data[out_c + output_depth * b] = static_cast<s8>(acc);
+    }
+  }
+}
+
+void calc_fc_float_multiplier(fc_test_param_t *p_param)
+{
+  const s32 input_offset = 0;
+  const s32 filter_offset = 0;
+  const int batches = p_param->left_row;
+  const int output_depth = p_param->right_col;
+  const int accum_depth = p_param->left_col;
+  s8 *input_data = p_param->input_data;
+  s8 *filter_data = p_param->filter_data;
+  s32 *bias_data = p_param->has_bias ? p_param->bias_data : nullptr;
+
+  // Start from the opposite extremes so the MAX/MIN updates below can
+  // narrow in on the real accumulator range.
+  int output_accu_min = INT_MAX;
+  int output_accu_max = INT_MIN;
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("calc_fc_float_multiplier:\n");
+#endif
+
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      s32 acc = 0;
+      for (int d = 0; d < accum_depth; ++d) {
+        s32 input_val = input_data[b * accum_depth + d];
+        // s32 filter_val = filter_data[out_c * accum_depth + d];
+        s32 filter_val = filter_data[output_depth * d + out_c];
+        acc += (filter_val + filter_offset) * (input_val + input_offset);
+      }
+      if (bias_data) {
+        acc += bias_data[out_c];
+      }
+
+      output_accu_max = MAX(acc, output_accu_max);
+      output_accu_min = MIN(acc, output_accu_min);
+    }
+  }
+
+  // Since int8 ranges from -128 to 127, squeeze the accumulator MIN/MAX
+  // into those ranges correspondingly as much as possible.
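+  // For instance (hypothetical values): if the accumulators span
+  // [-1100, 2540], then |max| > |min| and float_multiplier = 127 / 2540
+  // = 0.05, so the largest accumulator lands exactly on the int8 maximum.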
+  if (abs(output_accu_max) > abs(output_accu_min)) {
+    p_param->float_multiplier = 127.0f / abs(output_accu_max);
+  } else {
+    p_param->float_multiplier = 128.0f / abs(output_accu_min);
+  }
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("  output_accu_min %d, output_accu_max %d, output_multiplier %f\n",
+         output_accu_min, output_accu_max, p_param->float_multiplier);
+#endif
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("<= calc_fc_float_multiplier\n");
+#endif
+}
+
+static void put_bias32(bmctx_t *ctx, bmk_ctx_t *bk_ctx, const ml_t *ml,
+                       s32 data[])
+{
+  u64 size = ml->shape.col;
+
+  u8 *tmp = (u8 *)malloc(size * 4);
+  if (!tmp)
+    return;
+
+  // Split each 32-bit bias into four bytes, one per lmem row (little-endian).
+  for (u64 i = 0; i < size; i++) {
+    u32 val = static_cast<u32>(data[i]);
+    tmp[i] = val & 0xff;
+    tmp[i + size] = (val >> 8) & 0xff;
+    tmp[i + 2 * size] = (val >> 16) & 0xff;
+    tmp[i + 3 * size] = (val >> 24) & 0xff;
+  }
+
+  put_matrix_g2l(ctx, bk_ctx, ml, tmp);
+
+  free(tmp);
+}
+
+#if 0
+typedef struct {
+  s32 input_offset;
+  s32 weights_offset;
+  s32 output_offset;
+  s32 output_multiplier;
+  int output_rshift;
+} FullyConnectedParams;
+
+int tfl_original_test()
+{
+  int ret = 0;
+
+  // 2x10
+  s8 input_data[20] = {
+      1, 3, 5, 7, 9, 11, 13, 15, -19, -21,
+      1, 3, 5, 7, 9, 11, 13, -17, 17, -21};
+
+  // 3x10
+  s8 filter_data[30] = {
+      1, 3, 5, 7, 9, 11, 13, 15, 17, 19,
+      1, 3, 5, 7, 9, 11, 13, 15, 17, 19,
+      1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
+
+  // 1x3
+  s32 bias_data[3] = {4, 8, 12};
+
+  // 2x3
+  s8 ref_output_data[6] = {
+      23, 24, 25,
+      57, 58, 59};
+
+  s8 output_rshift = 1;  // change to right shift
+  u32 output_multiplier = 1073741824;
+
+  s32 input_offset = 1;
+  s32 filter_offset = 1;
+  s32 output_offset = 1;  // change to right shift
+
+  FullyConnectedParams params;
+  params.input_offset = input_offset;
+  params.weights_offset = filter_offset;
+  params.output_offset = output_offset;
+  params.output_multiplier = output_multiplier;
+  params.output_rshift = output_rshift;
+
+  tl_shape_t input_shape = {2, 10, 1, 1};
+  tl_shape_t filter_shape = {3, 10, 1, 1};
+  tl_shape_t bias_shape = {1, 3, 1, 1};
+  tl_shape_t output_shape = {2, 3, 1, 1};
+
+  s8 output_data[6];
+  fully_connected_ref(params, input_shape,
+                      input_data, filter_shape,
+                      filter_data, bias_shape,
+                      bias_data, output_shape,
+                      output_data);
+  for (int i = 0; i < 6; i++) {
+    if (output_data[i] != ref_output_data[i]) {
+      printf("  output_data[%d] %d != %d\n",
+             i, output_data[i], ref_output_data[i]);
+      ret = -1;
+    }
+  }
+
+  return ret;
+}
+#endif
+
+int simple_test(bmctx_t *ctx, bmk_ctx_t *bk_ctx)
+{
+  int ret = 0;
+
+  // 2x10
+  s8 input_data[20] = {1, 3, 5, 7, 9, 11, 13, 15, -19, -21,
+                       1, 3, 5, 7, 9, 11, 13, -17, 17, -21};
+
+#if 0
+  // 3x10
+  // tfl use transposed filter
+  s8 filter_data_tp[30] = {
+      1, 3, 5, 7, 9, 11, 13, 15, 17, 19,
+      1, 3, 5, 7, 9, 11, 13, 15, 17, 19,
+      1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
+#endif
+
+  // 10x3
+  s8 filter_data[30] = {1, 1, 1, 3, 3, 3, 5, 5, 5, 7,
+                        7, 7, 9, 9, 9, 11, 11, 11, 13, 13,
+                        13, 15, 15, 15, 17, 17, 17, 19, 19, 19};
+
+  // 1x3
+  s32 bias_data[3] = {4, 8, 12};
+
+  // 2x3, input/kernel/output zero_point = 0
+  s8 ref_output_data[6] = {-10, -9, -8, 24, 25, 26};
+  s8 output_data[6];
+
+  s8 output_rshift = 1;  // change to right shift
+  u32 output_multiplier = 1073741824;
+
+  int left_row = 2;
+  int left_col = 10;
+  int right_col = 3;
+
+  fc_test_param_t params;
+  memset(&params, 0, sizeof(params));
+  params.left_row = left_row;
+  params.left_col = left_col;
+  params.right_col = right_col;
+  params.has_bias = 1;
+  params.relu_enable = 0;
+  params.input_data = input_data;
+  params.filter_data = filter_data;
+  params.output_data = output_data;
+  params.bias_data = bias_data;
+  params.multiplier = output_multiplier;
+  params.right_shift = output_rshift;
+  fully_connected_ref(&params);
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("Compare ref and golden\n");
+#endif
+  for (int i = 0; i < 6; i++) {
+    if (output_data[i] != ref_output_data[i]) {
+      printf("  output_data[%d] %d(ref) != %d(golden)\n", i, output_data[i],
+             ref_output_data[i]);
+      ret = -1;
+    }
+  }
+
+  ml_shape_t left_shape =
+      bmk1822_matrix_lmem_default_shape(bk_ctx, left_row, left_col, FMT_I8);
+
+  ml_shape_t right_shape =
+      bmk1822_matrix_lmem_default_shape(bk_ctx, left_col, right_col, FMT_I8);
+
+  ml_shape_t b_shape =
+      bmk1822_matrix_lmem_default_shape(bk_ctx, 4, right_col, FMT_I8);  // 32bit
+
+  ml_shape_t y_shape =
+      bmk1822_matrix_lmem_default_shape(bk_ctx, left_row, right_col, FMT_I8);
+
+  bmk1822_matrix_lmem_t *tl_left =
+      bmk1822_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1);
+  bmk1822_matrix_lmem_t *tl_right =
+      bmk1822_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1);
+  bmk1822_matrix_lmem_t *tl_b =
+      bmk1822_lmem_alloc_matrix(bk_ctx, b_shape, FMT_I8, 1);
+  bmk1822_matrix_lmem_t *tl_y =
+      bmk1822_lmem_alloc_matrix(bk_ctx, y_shape, FMT_I8, 1);
+
+  put_matrix_g2l(ctx, bk_ctx, tl_left, reinterpret_cast<u8 *>(input_data));
+  put_matrix_g2l(ctx, bk_ctx, tl_right, reinterpret_cast<u8 *>(filter_data));
+  put_bias32(ctx, bk_ctx, tl_b, bias_data);
+
+  {
+    param_t p;
+    memset(&p, 0, sizeof(p));
+    p.left = tl_left;
+    p.right = tl_right;
+    p.bias = tl_b;
+    p.res = tl_y;
+    p.rshift_bits = output_rshift;
+    p.res_is_int8 = 1;
+    p.ps32_mode = 0;
+    p.quan_m = output_multiplier;
+    bmk1822_tiu_matrix_multiplication_qdm(bk_ctx, &p);
+  }
+
+  s8 *tiu_output_data =
+      reinterpret_cast<s8 *>(get_matrix_l2g(ctx, bk_ctx, tl_y));
+#ifdef ENABLE_DEBUG_MSG
+  printf("Compare tiu and ref\n");
+#endif
+  for (int i = 0; i < 6; i++) {
+    if (tiu_output_data[i] != ref_output_data[i]) {
+      printf("  output_data[%d] %d(tiu) != %d(ref)\n", i, tiu_output_data[i],
+             ref_output_data[i]);
+      ret = -1;
+    }
+  }
+
+  free(tiu_output_data);
+
+  bmk1822_lmem_free_matrix(bk_ctx, tl_y);
+  bmk1822_lmem_free_matrix(bk_ctx, tl_b);
+  bmk1822_lmem_free_matrix(bk_ctx, tl_right);
+  bmk1822_lmem_free_matrix(bk_ctx, tl_left);
+
+  return ret;
+}
+
+int choose_from_range(int table[], int size, int index)
+{
+  if (index >= size) {
+    return 0;
+  }
+
+  int val = table[index];
+  if (index < (size - 1)) {
+    int range = MAX(table[index + 1] - table[index] - 1, 1);
+    val += rand() % range;
+  }
+
+  return val;
+}
+
+bool check_valid_test_param(bmk_ctx_t *bk_ctx, fc_test_param_t *p_param)
+{
+  int left_row = p_param->left_row;
+  int left_col = p_param->left_col;
+  int right_col = p_param->right_col;
+  int has_bias = p_param->has_bias;
+
+  bmk1822_matrix_lmem_shape_t tl_input_shape =
+      bmk1822_matrix_lmem_default_shape(bk_ctx, left_row, left_col, FMT_I8);
+  bmk1822_matrix_lmem_stride_t tl_input_stride =
+      bmk1822_matrix_lmem_default_stride(bk_ctx, tl_input_shape, FMT_I8, 1);
+
+  bmk1822_matrix_lmem_shape_t tl_filter_shape =
+      bmk1822_matrix_lmem_default_shape(bk_ctx, left_col, right_col, FMT_I8);
+  bmk1822_matrix_lmem_stride_t tl_filter_stride =
+      bmk1822_matrix_lmem_default_stride(bk_ctx, tl_filter_shape, FMT_I8, 1);
+
+  bmk1822_matrix_lmem_shape_t tl_output_shape =
+      bmk1822_matrix_lmem_default_shape(bk_ctx, left_row, right_col, FMT_I8);
+  bmk1822_matrix_lmem_stride_t tl_output_stride =
+      bmk1822_matrix_lmem_default_stride(bk_ctx, tl_output_shape, FMT_I8, 1);
+
+  u32 bias_size = 0;
+  if (has_bias) {
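+    // The 32-bit bias occupies four int8 rows in lmem (one row per byte,
+    // see put_bias32() above), hence the n = 4 shape below.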
+    bmk1822_matrix_lmem_shape_t tl_bias_shape =
+        bmk1822_matrix_lmem_default_shape(bk_ctx, 4, right_col, FMT_I8);  // 32bit
+    bmk1822_matrix_lmem_stride_t tl_bias_stride =
+        bmk1822_matrix_lmem_default_stride(bk_ctx, tl_bias_shape, FMT_I8, 1);
+    bias_size = tl_bias_shape.n * tl_bias_stride.n;
+  }
+
+  bmk1822_chip_info_t chip_info = bmk1822_chip_info();
+  u32 lmem_size_per_lane = chip_info.lmem_size;
+  // u32 total_lmem_size = chip_info.lmem_size * chip_info.npu_num;
+
+  u32 needed_size = tl_input_shape.n * tl_input_stride.n +
+                    tl_filter_shape.n * tl_filter_stride.n +
+                    tl_output_shape.n * tl_output_stride.n + bias_size;
+
+  if (needed_size > lmem_size_per_lane) {
+    return false;
+  }
+
+  return true;
+}
+
+void fill_random_data_s8(s8 *input_data, int size)
+{
+  for (int i = 0; i < size; ++i) {
+    int is_satured = ((rand() % 1000) == 1) ? 1 : 0;
+    int is_sign = rand() % 2 ? 1 : -1;
+
+    // is_sign is 1 or -1 (always truthy), so compare against -1 to pick
+    // the saturated pole.
+    if (is_satured && is_sign == -1) {
+      input_data[i] = -128;
+    } else if (is_satured) {
+      input_data[i] = 127;
+    } else {
+      input_data[i] = is_sign * rand() % 128;
+    }
+  }
+}
+
+void fill_random_data_s32(s32 *input_data, int size)
+{
+  for (int i = 0; i < size; ++i) {
+    int is_satured = ((rand() % 1000) == 1) ? 1 : 0;
+    int is_sign = rand() % 2 ? 1 : -1;
+
+    if (is_satured && is_sign == -1) {
+      input_data[i] = INT_MIN;
+    } else if (is_satured) {
+      input_data[i] = INT_MAX;
+    } else {
+      input_data[i] = is_sign * rand() % 128;
+    }
+  }
+}
+
+void dump_test_param(fc_test_param_t *p_param, bool dump_content)
+{
+  printf("Dump test parameter:\n");
+  printf("  left_row %d\n", p_param->left_row);
+  printf("  left_col %d\n", p_param->left_col);
+  printf("  right_col %d\n", p_param->right_col);
+  printf("  has_bias %d\n", p_param->has_bias);
+  printf("  multiplier %d\n", p_param->multiplier);
+  printf("  right_shift %d\n", p_param->right_shift);
+
+  if (dump_content) {
+    printf("input_data(%d, %d)\n", p_param->left_row, p_param->left_col);
+    int left_row = p_param->left_row;
+    int left_col = p_param->left_col;
+    for (int i = 0; i < left_row; ++i) {
+      for (int j = 0; j < left_col; ++j) {
+        int offset = i * left_col + j;
+        printf("%d, ", p_param->input_data[offset]);
+      }
+      printf("\n");
+    }
+    printf("\n\n");
+
+    int right_col = p_param->right_col;
+    printf("kernel_data (%d, %d)\n", left_col, right_col);
+    for (int i = 0; i < left_col; ++i) {
+      for (int j = 0; j < right_col; ++j) {
+        int offset = i * right_col + j;
+        printf("%d, ", p_param->filter_data[offset]);
+      }
+      printf("\n");
+    }
+    printf("\n\n");
+
+    if (p_param->has_bias) {
+      for (int i = 0; i < right_col; ++i) {
+        printf("%d, ", p_param->bias_data[i]);
+      }
+      printf("\n\n");
+    }
+  }
+}
+
+int run_compare_fc(bmctx_t *ctx, bmk_ctx_t *bk_ctx, fc_test_param_t *p_param)
+{
+  int ret = 0;
+
+  int left_row = p_param->left_row;
+  int left_col = p_param->left_col;
+  int right_col = p_param->right_col;
+  int has_bias = p_param->has_bias;
+
+  int input_size = left_row * left_col;
+  s8 *input_data = (s8 *)malloc(input_size);
+
+  int kernel_size = left_col * right_col;
+  s8 *kernel_data = (s8 *)malloc(kernel_size);
+
+  int output_size = left_row * right_col;
+  s8 *output_data = (s8 *)malloc(output_size);
+
+  s32 *bias_data = (s32 *)malloc(sizeof(s32) * right_col);
+
+  p_param->input_data = input_data;
+  p_param->filter_data = kernel_data;
+  p_param->output_data = output_data;
+  p_param->bias_data = bias_data;
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("  run_compare_fc =>\n");
+  printf("    left (%d, %d), right (%d, %d), has_bias %d\n", left_row,
+         left_col, left_col, right_col, has_bias);
+#endif
+
+  int retry_cnt = p_param->retry_cnt;
+  do {
+    fill_random_data_s8(input_data, input_size);
+    fill_random_data_s8(kernel_data, kernel_size);
+    if (has_bias) {
+      fill_random_data_s32(bias_data, right_col);
+    }
+
+    p_param->float_multiplier = 100.0;  // should be < 1.0
+    calc_fc_float_multiplier(p_param);
+
+    if (p_param->float_multiplier > 0.f && p_param->float_multiplier < 1.0) {
+      break;
+    }
+
+  } while (--retry_cnt);
+
+  if (p_param->float_multiplier >= 1.0) {
+    printf("  run_compare_fc: unable to find valid multiplier\n");
+    free(input_data);
+    free(kernel_data);
+    free(output_data);
+    free(bias_data);
+    return -1;
+  }
+
+  u32 base_multiplier = 0;
+  int base_shift = 0;
+  QuantizeMultiplierSmallerThanOne(p_param->float_multiplier, &base_multiplier,
+                                   &base_shift);
+
+  // multipliers typically range in [2^30 ; 2^31 - 1].
+  // Values in [0, 2^30 - 1] are normally unused, but harmless.
+  // Thus a good way to randomize multipliers is to subtract from them
+  // a random value smaller than 2^30 but still significant compared to it.
+  u32 output_multiplier = base_multiplier - (rand() % (1 << 26));
+
+  int right_shift = base_shift - 1 + (rand() % 4);
+  s8 output_rshift = truncate_rshift((s8)right_shift, /*allow_lshift*/1);
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("  multiplier_data %d, shift_data %d\n", output_multiplier,
+         output_rshift);
+#endif
+
+  p_param->multiplier = output_multiplier;
+  p_param->right_shift = output_rshift;
+  fully_connected_ref(p_param);
+
+  bmk1822_matrix_lmem_shape_t tl_input_shape =
+      bmk1822_matrix_lmem_default_shape(bk_ctx, left_row, left_col, FMT_I8);
+
+  bmk1822_matrix_lmem_shape_t tl_filter_shape =
+      bmk1822_matrix_lmem_default_shape(bk_ctx, left_col, right_col, FMT_I8);
+
+  bmk1822_matrix_lmem_shape_t tl_output_shape =
+      bmk1822_matrix_lmem_default_shape(bk_ctx, left_row, right_col, FMT_I8);
+
+  bmk1822_matrix_lmem_shape_t tl_bias_shape =
+      bmk1822_matrix_lmem_default_shape(bk_ctx, 4, right_col, FMT_I8);  // 32bit
+
+  bmk1822_matrix_lmem_t *tl_input = bmk1822_lmem_alloc_matrix(
+      bk_ctx, tl_input_shape, FMT_I8, /*eu_align=*/1);
+  bmk1822_matrix_lmem_t *tl_filter = bmk1822_lmem_alloc_matrix(
+      bk_ctx, tl_filter_shape, FMT_I8, /*eu_align=*/1);
+  bmk1822_matrix_lmem_t *tl_output = bmk1822_lmem_alloc_matrix(
+      bk_ctx, tl_output_shape, FMT_I8, /*eu_align=*/1);
+
+  bmk1822_matrix_lmem_t *tl_bias = nullptr;
+  if (has_bias) {
+    tl_bias = bmk1822_lmem_alloc_matrix(bk_ctx, tl_bias_shape, FMT_I8,
+                                        /*eu_align=*/1);
+  }
+
+  if (tl_input == nullptr) {
+    printf("    fail to alloc tl_input (%d, %d)\n", left_row, left_col);
+    return -1;
+  }
+  if (tl_filter == nullptr) {
+    printf("    fail to alloc tl_filter (%d, %d)\n", left_col, right_col);
+    return -1;
+  }
+  if (tl_output == nullptr) {
+    printf("    fail to alloc tl_output (%d, %d)\n", left_row, right_col);
+    return -1;
+  }
+  if (has_bias && (tl_bias == nullptr)) {
+    printf("    fail to alloc bias (%d, %d)\n", 4, right_col);
+    return -1;
+  }
+
+  put_matrix_g2l(ctx, bk_ctx, tl_input, reinterpret_cast<u8 *>(input_data));
+  put_matrix_g2l(ctx, bk_ctx, tl_filter, reinterpret_cast<u8 *>(kernel_data));
+  if (tl_bias) {
+    put_bias32(ctx, bk_ctx, tl_bias, bias_data);
+  }
+
+  {
+    param_t p;
+    memset(&p, 0, sizeof(p));
+    p.left = tl_input;
+    p.right = tl_filter;
+    p.bias = tl_bias;
+    p.res = tl_output;
+    p.rshift_bits = (u8)output_rshift;
+    p.res_is_int8 = 1;
+    p.ps32_mode = 0;
+    p.quan_m = output_multiplier;
+    bmk1822_tiu_matrix_multiplication_qdm(bk_ctx, &p);
+  }
+
+  s8 *tiu_output_data =
+      reinterpret_cast<s8 *>(get_matrix_l2g(ctx, bk_ctx, tl_output));
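+
+  // Rough sketch of what the requantize step in both paths does
+  // (illustrative numbers): with multiplier 2^30 (i.e. 0.5 in Q31) and
+  // rshift 3, an accumulator of 1000 becomes round(1000 * 0.5) >> 3
+  // ~= 63 before saturating to int8.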
+#ifdef ENABLE_DEBUG_MSG
+  printf("Compare tiu and ref\n");
+#endif
+  for (int i = 0; i < left_row; ++i) {
+    for (int j = 0; j < right_col; ++j) {
+      int offset = i * right_col + j;
+      if (tiu_output_data[offset] != output_data[offset]) {
+        printf("  output_data[%d][%d] %d(tiu) != %d(ref)\n", i, j,
+               tiu_output_data[offset], output_data[offset]);
+        ret = -1;
+      }
+    }
+  }
+
+  if (ret) {
+    dump_test_param(p_param, /*dump_content=*/true);
+  }
+
+  // Reverse order
+  if (tl_bias) {
+    bmk1822_lmem_free_matrix(bk_ctx, tl_bias);
+  }
+
+  bmk1822_lmem_free_matrix(bk_ctx, tl_output);
+  bmk1822_lmem_free_matrix(bk_ctx, tl_filter);
+  bmk1822_lmem_free_matrix(bk_ctx, tl_input);
+
+  free(tiu_output_data);
+
+  free(input_data);
+  free(kernel_data);
+  free(output_data);
+  free(bias_data);
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("  <= run_compare_fc, ret %d\n", ret);
+#endif
+
+  return ret;
+}
+
+int random_test(bmctx_t *ctx, bmk_ctx_t *bk_ctx)
+{
+  int ret = 0;
+
+#if 0
+  int left_row_range[] = {1};
+  int left_col_range[] = {1};
+  int right_col_range[] = {1};
+#else
+#ifndef ENABLE_TV_GEN_PATTERN
+  int left_row_range[] = {1, 16, 32, 64, 128, 256, 1024, 2048, 4095};
+  int left_col_range[] = {1, 16, 32, 64, 128, 256, 1024, 2048, 4095};
+  int right_col_range[] = {1, 16, 32, 64, 128, 256, 1024, 2048, 4095};
+#else
+  // TV_GEN pattern
+  // Random Test, total 27, skipped 86, executed 22, failed 0, ret 0
+
+  int left_row_range[] = {1, 16, 4095};
+  int left_col_range[] = {1, 16, 4095};
+  int right_col_range[] = {1, 16, 4095};
+#endif
+#endif
+
+  const int left_row_range_size =
+      sizeof(left_row_range) / sizeof(left_row_range[0]);
+  const int left_col_range_size =
+      sizeof(left_col_range) / sizeof(left_col_range[0]);
+  const int right_col_range_size =
+      sizeof(right_col_range) / sizeof(right_col_range[0]);
+
+  int random_seed = clock();
+  srand(random_seed);
+
+  const int retry_test_count = 100;
+  bool stop_at_first_error = true;
+
+  int executed_tests = 0;
+  int failed_tests = 0;
+
+  printf("1822-fc-qm: random test =>\n");
+  for (int m = 0; m < retry_test_count; ++m) {
+    for (int i = 0; i < left_row_range_size; ++i) {
+      // randomly chosen from [range[i] : range[i+1]]
+      int left_row = choose_from_range(left_row_range, left_row_range_size, i);
+
+      for (int j = 0; j < left_col_range_size; ++j) {
+        int left_col =
+            choose_from_range(left_col_range, left_col_range_size, j);
+
+        for (int k = 0; k < right_col_range_size; ++k) {
+          int right_col =
+              choose_from_range(right_col_range, right_col_range_size, k);
+
+          int has_bias = rand() % 2;
+
+          fc_test_param_t test_param;
+          memset(&test_param, 0, sizeof(test_param));
+          test_param.left_row = left_row;
+          test_param.left_col = left_col;
+          test_param.right_col = right_col;
+          test_param.has_bias = has_bias;
+          test_param.retry_cnt = 5;
+
+          bool is_valid_param = check_valid_test_param(bk_ctx, &test_param);
+          if (is_valid_param == false) {
+            continue;
+          }
+
+          int ret2 = run_compare_fc(ctx, bk_ctx, &test_param);
+          failed_tests = ret2 ? failed_tests + 1 : failed_tests;
+          ret |= ret2;
+          executed_tests++;
+
+#ifdef ENABLE_DEBUG_MSG
+          printf("  [%d] random test: left(%d, %d), right (%d, %d), result "
+                 "%d\n",
+                 executed_tests, left_row, left_col, left_col,
+                 right_col, ret2);
+#endif
+        }
+
+        // Stop at first error
+        if (ret && stop_at_first_error) {
+          break;
+        }
+      }
+
+      // Stop at first error
+      if (ret && stop_at_first_error) {
+        break;
+      }
+    }
+
+    // Stop at first error
+    if (ret && stop_at_first_error) {
+      break;
+    }
+
+    if (executed_tests >= MIN_EXEC_TESTS) {
+      break;
+    }
+  }
+
+  printf("<= 1822-fc-qm: random test, total %d, failed %d, ret %d\n",
+         executed_tests, failed_tests, ret);
+
+  // Propagate failures to the caller; main() ORs this into its exit code.
+  return ret;
+}
+
+int main()
+{
+  int ret = 0;
+  bmctx_t ctx;
+  bmk_ctx_t *bk_ctx;
+  test_init(&ctx, &bk_ctx);
+
+  // ret |= tfl_original_test();
+  ret |= simple_test(&ctx, bk_ctx);
+  ret |= random_test(&ctx, bk_ctx);
+
+  test_exit(&ctx);
+
+  return ret;
+}
diff --git a/cviruntime/test/1822/test_1822_matrix_transfer.cpp b/cviruntime/test/1822/test_1822_matrix_transfer.cpp
new file mode 100644
index 000000000..a753c5c1a
--- /dev/null
+++ b/cviruntime/test/1822/test_1822_matrix_transfer.cpp
@@ -0,0 +1,82 @@
+#include "1822_test_util.h"
+
+static void test_put_and_get_matrix_l2g(
+    bmctx_t *ctx, bmk_ctx_t *bk_ctx)
+{
+  int row = 5;
+  int col = 16 * 5 + 2;
+  int size = row * col;
+
+  ml_shape_t s =
+      bmk1822_matrix_lmem_default_shape(bk_ctx, row, col, FMT_I8);
+
+  u8 *data_x = (u8 *)xmalloc(size);
+  u8 *data_y = (u8 *)xmalloc(size);
+
+  for (int i = 0; i < size; i++)
+    data_x[i] = i - 100;
+
+  for (int i = 0; i < size; i++)
+    data_y[i] = -i;
+
+  ml_t *ml_x =
+      bmk1822_lmem_alloc_matrix(bk_ctx, s, FMT_I8, 1);
+  ml_t *ml_y =
+      bmk1822_lmem_alloc_matrix(bk_ctx, s, FMT_I8, 1);
+
+  /*
+   * Interleave two matrices in case the same devmem is reused between
+   * put_matrix_g2l() and get_matrix_l2g(), in which case the content of
+   * devmem is already what is expected before bmk1822_gdma_store_matrix().
+   */
+  put_matrix_g2l(ctx, bk_ctx, ml_x, data_x);
+  put_matrix_g2l(ctx, bk_ctx, ml_y, data_y);
+
+  u8 *result_x = get_matrix_l2g(ctx, bk_ctx, ml_x);
+  u8 *result_y = get_matrix_l2g(ctx, bk_ctx, ml_y);
+  for (int i = 0; i < size; i++) {
+    if (result_x[i] != data_x[i]) {
+      printf("compare failed at result_x[%d]\n", i);
+      exit(-1);
+    }
+    if (result_y[i] != data_y[i]) {
+      printf("compare failed at result_y[%d]\n", i);
+      exit(-1);
+    }
+  }
+  free(result_x);
+  free(result_y);
+
+  /*
+   * Get result_y before result_x.
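+   * Fetching in the reverse order exercises the same devmem-reuse hazard
+   * described above from the other direction.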
+ */ + result_y = get_matrix_l2g(ctx, bk_ctx, ml_y); + result_x = get_matrix_l2g(ctx, bk_ctx, ml_x); + for (int i = 0; i < size; i++) { + if (result_x[i] != data_x[i]) { + printf("compare failed at result_x[%d]\n", i); + exit(-1); + } + if (result_y[i] != data_y[i]) { + printf("compare failed at result_y[%d]\n", i); + exit(-1); + } + } + free(result_x); + free(result_y); + + bmk1822_lmem_free_matrix(bk_ctx, ml_y); + bmk1822_lmem_free_matrix(bk_ctx, ml_x); + free(data_x); + free(data_y); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + test_put_and_get_matrix_l2g(&ctx, bk_ctx); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_max_pooling.cpp b/cviruntime/test/1822/test_1822_max_pooling.cpp new file mode 100644 index 000000000..9741bc8ad --- /dev/null +++ b/cviruntime/test/1822/test_1822_max_pooling.cpp @@ -0,0 +1,214 @@ +#include "1822_test_util.h" + +#define INVALIDE_STRIDE (-1) +typedef bmk1822_tiu_max_pooling_param_t param_t; + +static void print_pooling_param(param_t *p) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + + printf(" Pooling parameters:\n"); + printf(" ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw); + printf(" opd0_sign = %d\n", p->ifmap->fmt == FMT_I8); + printf(" weight = (%d, %d)\n", p->kh, p->kw); + printf(" padding = (%d, %d, %d, %d)\n", + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right); + printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w); +} + +static int pooling_ih_ext(param_t *p, int ih) +{ + int pad = p->pad_top + p->pad_bottom; + return ih + pad; +} + +static int pooling_iw_ext(param_t *p, int iw) +{ + int pad = p->pad_left + p->pad_right; + return iw + pad; +} + +static int pooling_oh(param_t *p, int ih) +{ + int ih_ext = pooling_ih_ext(p, ih); + return (ih_ext - p->kh) / p->stride_h + 1; +} + +static int pooling_ow(param_t *p, int iw) +{ + int iw_ext = pooling_iw_ext(p, iw); + return (iw_ext - p->kw) / p->stride_w + 1; +} + +static s8 *alloc_input(param_t *p) +{ + u64 size = tl_shape_size(&p->ifmap->shape); + s8 *data = (s8 *)xmalloc(size); + if (!data) + return NULL; + + for (u64 i = 0; i < size; i++) + data[i] = rand() % 256 - 128; + return data; +} + +static s8 *alloc_output(param_t *p) +{ + u64 size = tl_shape_size(&p->ofmap->shape); + return (s8 *)xmalloc(size); +} + +static void free_pooling_param( + bmk_ctx_t *ctx, + param_t *r) +{ + if (r->ifmap) + free_tl(ctx, r->ifmap); + if (r->ofmap) + free_tl(ctx, r->ofmap); +} + +static param_t random_pooling_param(bmk_ctx_t *ctx, int stride_w, int stride_h) +{ + srand(clock()); + param_t p; + + memset(&p, 0, sizeof(p)); + +retry: + int in = rand() % 5 + 1; + int ic = rand() % (3 * BM1822_HW_NPU_NUM) + 1; + int ih = rand() % 30 + 3 + (INVALIDE_STRIDE == stride_h ? 0 : stride_h); + int iw = rand() % 30 + 6 + (INVALIDE_STRIDE == stride_w ? 0 : stride_w); + int opd0_sign = rand() % 2; + + p.kh = rand() % 7 + 1; + p.kw = rand() % 7 + 1; + p.stride_h = INVALIDE_STRIDE == stride_h ? rand() % (p.kh) + 1 : stride_h; + p.stride_w = INVALIDE_STRIDE == stride_w ? 
rand() % (p.kw) + 1 : stride_w;
+  p.pad_top = rand() % p.kh;
+  p.pad_bottom = rand() % p.kh;
+  p.pad_left = rand() % p.kw;
+  p.pad_right = rand() % p.kw;
+
+  tl_shape_t ifmap_shape;
+  ifmap_shape.n = in;
+  ifmap_shape.c = ic;
+  ifmap_shape.h = ih;
+  ifmap_shape.w = iw;
+  tl_shape_t ofmap_shape;
+  ofmap_shape.n = in;
+  ofmap_shape.c = ic;
+  ofmap_shape.h = pooling_oh(&p, ih);
+  ofmap_shape.w = pooling_ow(&p, iw);
+
+  fmt_t fmt = opd0_sign ? FMT_I8 : FMT_U8;
+  p.ofmap = bmk1822_lmem_alloc_tensor(ctx, ofmap_shape, FMT_I8, 1);
+  p.ifmap = bmk1822_lmem_alloc_tensor(ctx, ifmap_shape, fmt, 1);
+
+  if ((p.kh > pooling_ih_ext(&p, ih))
+      || (p.kw > pooling_iw_ext(&p, iw))
+      || (p.pad_top >= (1 << 4))
+      || (p.pad_bottom >= (1 << 4))
+      || (p.pad_left >= (1 << 4))
+      || (p.pad_right >= (1 << 4))
+      || (p.kh * p.kw == 1)
+      || !p.ofmap || !p.ifmap) {
+    printf("retry init_pooling_param\n");
+    free_pooling_param(ctx, &p);
+    goto retry;
+  }
+
+  return p;
+}
+
+static void compare_results(
+    param_t *p,
+    s8 input[],
+    s8 output[])
+{
+  int in = p->ifmap->shape.n;
+  int ic = p->ifmap->shape.c;
+  int ih = p->ifmap->shape.h;
+  int iw = p->ifmap->shape.w;
+  int sign = (p->ifmap->fmt == FMT_I8);
+
+  s8 *output_ref = alloc_output(p);
+  bmerr_t ret = native_pooling_max_int8(
+      input, output_ref, in, ic, ih, iw, p->kh, p->kw,
+      p->pad_top, p->pad_bottom, p->pad_left, p->pad_right,
+      p->stride_h, p->stride_w, 0, 0, 0, 0, sign);
+  assert(ret == BM_SUCCESS);
+
+  int cmp_res = array_cmp_int8(
+      "Comparing results ...\n", output_ref, output,
+      tl_shape_size(&p->ofmap->shape));
+
+  if (cmp_res != 0) {
+    printf("Comparison FAILED!!!\n");
+    print_pooling_param(p);
+    exit(-1);
+  }
+
+  free(output_ref);
+}
+
+static int _test_pooling(bmctx_t ctx, bmk_ctx_t *bk_ctx, int stride_w, int stride_h)
+{
+  param_t param = random_pooling_param(bk_ctx, stride_w, stride_h);
+  s8 *input = alloc_input(&param);
+
+  put_tensor_g2l(&ctx, bk_ctx, param.ifmap, (u8 *)input);
+  bmk1822_tiu_max_pooling(bk_ctx, &param);
+  s8 *output = (s8 *)get_tensor_l2g(&ctx, bk_ctx, param.ofmap);
+
+  compare_results(&param, input, output);
+
+  free_pooling_param(bk_ctx, &param);
+  free(output);
+  free(input);
+
+  return 1;
+}
+
+static int test_pooling(bmctx_t ctx, bmk_ctx_t *bk_ctx) {
+  return _test_pooling(ctx, bk_ctx, INVALIDE_STRIDE, INVALIDE_STRIDE);
+}
+
+static void test_max_pooling(bmctx_t *ctx, bmk_ctx_t *bk_ctx)
+{
+  int test_finished_num = 0;
+  for (u64 i = 0; i < 16; i++)
+    test_finished_num += test_pooling(*ctx, bk_ctx);
+
+  // test stride extend (0, 31]
+  int stride_list[] = {15, 16, 31};
+  int stride_list_len = sizeof(stride_list) / sizeof(stride_list[0]);
+
+  for (int stride_w_idx = 0; stride_w_idx < stride_list_len; stride_w_idx++) {
+    for (int stride_h_idx = 0; stride_h_idx < stride_list_len; stride_h_idx++) {
+      int stride_w = stride_list[stride_w_idx];
+      int stride_h = stride_list[stride_h_idx];
+
+      test_finished_num += _test_pooling(*ctx, bk_ctx, stride_w, stride_h);
+    }
+  }
+
+  printf("Test finished %d\n", test_finished_num);
+}
+
+int main()
+{
+  bmctx_t ctx;
+  bmk_ctx_t *bk_ctx;
+  test_init(&ctx, &bk_ctx);
+
+  test_max_pooling(&ctx, bk_ctx);
+
+  test_exit(&ctx);
+  return 0;
+}
diff --git a/cviruntime/test/1822/test_1822_min_pooling.cpp b/cviruntime/test/1822/test_1822_min_pooling.cpp
new file mode 100644
index 000000000..2d28ae22b
--- /dev/null
+++ b/cviruntime/test/1822/test_1822_min_pooling.cpp
@@ -0,0 +1,194 @@
+#include "1822_test_util.h"
+
+typedef bmk1822_tiu_min_pooling_param_t param_t;
+
+static void print_pooling_param(param_t *p)
+{
+  int in =
p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + + printf(" Pooling parameters:\n"); + printf(" ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw); + printf(" opd0_sign = %d\n", p->ifmap->fmt == FMT_I8); + printf(" weight = (%d, %d)\n", p->kh, p->kw); + printf(" padding = (%d, %d, %d, %d)\n", + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right); + printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w); +} + +static int pooling_ih_ext(param_t *p, int ih) +{ + int pad = p->pad_top + p->pad_bottom; + return ih + pad; +} + +static int pooling_iw_ext(param_t *p, int iw) +{ + int pad = p->pad_left + p->pad_right; + return iw + pad; +} + +static int pooling_oh(param_t *p, int ih) +{ + int ih_ext = pooling_ih_ext(p, ih); + return (ih_ext - p->kh) / p->stride_h + 1; +} + +static int pooling_ow(param_t *p, int iw) +{ + int iw_ext = pooling_iw_ext(p, iw); + return (iw_ext - p->kw) / p->stride_w + 1; +} + +static s8 *alloc_input(param_t *p) +{ + u64 size = tl_shape_size(&p->ifmap->shape); + s8 *data = (s8 *)xmalloc(size); + if (!data) + return NULL; + + for (u64 i = 0; i < size; i++) + data[i] = rand() % 256 - 128; + return data; +} + +static s8 *alloc_output(param_t *p) +{ + u64 size = tl_shape_size(&p->ofmap->shape); + return (s8 *)xmalloc(size); +} + +static void free_pooling_param( + bmk_ctx_t *ctx, + param_t *r) +{ + if (r->ifmap) + free_tl(ctx, r->ifmap); + if (r->ofmap) + free_tl(ctx, r->ofmap); +} + +static param_t random_pooling_param(bmk_ctx_t *ctx) +{ + srand(clock()); + param_t p; + + memset(&p, 0, sizeof(p)); + +retry: + int in = rand() % 5 + 1; + int ic = rand() % (3 * BM1822_HW_NPU_NUM) + 1; + int ih = rand() % 30 + 3; + int iw = rand() % 30 + 6; + int opd0_sign = rand() % 2; + + p.kh = rand() % 7 + 1; + p.kw = rand() % 7 + 1; + p.stride_h = rand() % (p.kh) + 1; + p.stride_w = rand() % (p.kw) + 1; + p.pad_top = rand() % p.kh; + p.pad_bottom = rand() % p.kh; + p.pad_left = rand() % p.kw; + p.pad_right = rand() % p.kw; + + tl_shape_t ifmap_shape; + ifmap_shape.n = in; + ifmap_shape.c = ic; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + tl_shape_t ofmap_shape; + ofmap_shape.n = in; + ofmap_shape.c = ic; + ofmap_shape.h = pooling_oh(&p, ih); + ofmap_shape.w = pooling_ow(&p, iw); + + fmt_t fmt = opd0_sign? 
FMT_I8: FMT_U8; + p.ofmap = bmk1822_lmem_alloc_tensor(ctx, ofmap_shape, FMT_I8, 1); + p.ifmap = bmk1822_lmem_alloc_tensor(ctx, ifmap_shape, fmt, 1); + + if ((p.kh > pooling_ih_ext(&p, ih)) + || (p.kw > pooling_iw_ext(&p, iw)) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || (p.kh * p.kw == 1) + || !p.ofmap || !p.ifmap) { + printf("retry init_pooling_param\n"); + free_pooling_param(ctx, &p); + goto retry; + } + + return p; +} + +static void compare_results( + param_t *p, + s8 input[], + s8 output[]) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int sign = (p->ifmap->fmt == FMT_I8); + + s8 *output_ref = alloc_output(p); + bmerr_t ret = native_pooling_min_int8( + input, output_ref, in, ic, ih, iw, p->kh, p->kw, + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right, + p->stride_h, p->stride_w, 0, 0, 0, 0, sign); + assert(ret == BM_SUCCESS); + + int cmp_res = array_cmp_int8( + "Comparing results ...\n", output_ref, output, + tl_shape_size(&p->ofmap->shape)); + + if (cmp_res != 0) { + printf("Comparison FAILED!!!\n"); + print_pooling_param(p); + exit(-1); + } + + free(output_ref); +} + +static int test_pooling(bmctx_t ctx, bmk_ctx_t *bk_ctx) +{ + param_t param = random_pooling_param(bk_ctx); + s8 *input = alloc_input(¶m); + + put_tensor_g2l(&ctx, bk_ctx, param.ifmap, (u8 *)input); + bmk1822_tiu_min_pooling(bk_ctx, ¶m); + s8 *output = (s8 *)get_tensor_l2g(&ctx, bk_ctx, param.ofmap); + + compare_results(¶m, input, output); + + free_pooling_param(bk_ctx, ¶m); + free(output); + free(input); + + return 1; +} + +static void test_min_pooling(bmctx_t *ctx, bmk_ctx_t *bk_ctx) +{ + int test_finished_num = 0; + for (u64 i = 0; i < 16; i++) + test_finished_num += test_pooling(*ctx, bk_ctx); + printf("Test finished %d\n", test_finished_num); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_min_pooling(&ctx, bk_ctx); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_put_matrix_stride.cpp b/cviruntime/test/1822/test_1822_put_matrix_stride.cpp new file mode 100644 index 000000000..1db5d9420 --- /dev/null +++ b/cviruntime/test/1822/test_1822_put_matrix_stride.cpp @@ -0,0 +1,107 @@ +#include "1822_test_util.h" + +static void put_matrix_g2l_stride_ref(u8 *ref, + u8 *a, + ml_shape_t lmem_shape, + bmk1822_matrix_tgmem_stride_t gmem_stride) +{ + int row = lmem_shape.n; + int col = lmem_shape.col; + int row_stride = gmem_stride.row; + + for (int ri = 0; ri < row; ri++) + for (int ci = 0; ci < col; ci++) + ref[ri * col + ci] = a[ri * row_stride + ci]; +} + +static void put_matrix_g2l_stride( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + ml_t *ml, + bmk1822_matrix_tgmem_stride_t gmem_stride, + u8 *data) +{ + int row = ml->shape.n; + int col = ml->shape.col; + int row_stride = gmem_stride.row; + + bmshape_t bms = BM_MATRIX_INT8(row, row_stride); + bmmem_device_t devmem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + int ret = bm_memcpy_s2d(*ctx, devmem, data); + assert(ret == BM_SUCCESS); + + gaddr_t gaddr = bmmem_device_addr(devmem); + mg_t mg; + mg.base_reg_index = 0; + mg.start_address = gaddr; + mg.shape.row = row; + mg.shape.col = col; + mg.stride = gmem_stride; + mg.base_reg_index = 0; + + bmk1822_tdma_tg2l_matrix_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.dst = ml; + p.src = &mg; + + bmk1822_tdma_g2l_matrix_copy(bk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, 
devmem); + return ; +} + +static void test_put_matrix_g2l_stride( + bmctx_t *ctx, bmk_ctx_t *bk_ctx) +{ + int row = 80; + int col = 70; + int size = row * col; + ml_shape_t mls = + bmk1822_matrix_lmem_default_shape(bk_ctx, row, col, FMT_I8); + ml_t *ml = + bmk1822_lmem_alloc_matrix(bk_ctx,mls, FMT_I8, 0); + + int row_stride = col * 2; + bmk1822_matrix_tgmem_stride_t gmem_stride; + gmem_stride.row = row_stride; + int stride_size = row * row_stride; + + u8 *data_x = (u8 *)xmalloc(stride_size); + for (int i = 0; i < stride_size; i++) + data_x[i] = i; + + put_matrix_g2l_stride(ctx, bk_ctx, ml, gmem_stride, data_x); + u8 *result_x = get_matrix_l2g(ctx, bk_ctx, ml); + + u8 *ref_x = (u8 *)xmalloc(size); + if (!result_x || !ref_x) + goto fail_exit; + + put_matrix_g2l_stride_ref(ref_x, data_x, mls, gmem_stride); + + for (int i = 0; i < size; i++) { + if (result_x[i] != ref_x[i]) { + printf("compare failed at result_x[%d], got %d, exp %d\n", + i, result_x[i], ref_x[i]); + exit(-1); + } + } + + bmk1822_lmem_free_matrix(bk_ctx, ml); + +fail_exit: + free(data_x); + free(result_x); + free(ref_x); +} + +int main () +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + test_put_matrix_g2l_stride(&ctx, bk_ctx); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_put_matrix_tp.cpp b/cviruntime/test/1822/test_1822_put_matrix_tp.cpp new file mode 100644 index 000000000..01321a59d --- /dev/null +++ b/cviruntime/test/1822/test_1822_put_matrix_tp.cpp @@ -0,0 +1,107 @@ +#include "1822_test_util.h" + +static void matrix_tp_ref( + u8 *ref, u8 *a, ml_shape_t s) +{ + /* + * ref[] is transposed matrix in lmem. + * row/col are shape in DDR + */ + int row = s.col; + int col = s.n; + + for (int ri = 0; ri < row; ri++) { + for (int ci = 0; ci < col; ci++) { + ref[ci * row + ri] = a[ri * col + ci]; + } + } +} + +static void put_matrix_g2l_tp( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + ml_t *ml, + u8 *data) +{ + /* + * raw_row = row of src, raw_col = col of dst. 
+   * row and col of ml.shape are the transposed row and col.
+   */
+
+  int raw_row = ml->shape.col;
+  int raw_col = ml->shape.n;
+
+  bmshape_t bms = BM_MATRIX_INT8(raw_row, raw_col);
+  bmmem_device_t dev_mem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms));
+  gaddr_t gaddr = bmmem_device_addr(dev_mem);
+  int ret = bm_memcpy_s2d(*ctx, dev_mem, data);
+  assert(ret == BM_SUCCESS);
+
+  mg_t mg;
+  mg.base_reg_index = 0;
+  mg.start_address = gaddr;
+  mg.shape.row = raw_row;
+  mg.shape.col = raw_col;
+  mg.stride.row = raw_col;
+
+  bmk1822_tdma_tg2l_matrix_copy_row_col_transposed_param_t g2lp;
+  g2lp.src = &mg;
+  g2lp.dst = ml;
+
+  bmk1822_tdma_g2l_matrix_copy_row_col_transposed(bk_ctx, &g2lp);
+  test_submit(ctx);
+
+  bmmem_device_free(*ctx, dev_mem);
+  return;
+}
+
+static void test_put_matrix_g2l_tp(
+    bmctx_t *ctx, bmk_ctx_t *bk_ctx)
+{
+  int row = 80;
+  int col = 70;
+  int size = row * col;
+
+  u8 *data_x = (u8 *)xmalloc(size);
+  for (int i = 0; i < size; i++)
+    data_x[i] = i;
+
+  ml_shape_t mls =
+      bmk1822_matrix_lmem_default_shape(bk_ctx, col, row, FMT_I8);
+  ml_t *ml =
+      bmk1822_lmem_alloc_matrix(bk_ctx, mls, FMT_I8, 1);
+
+  put_matrix_g2l_tp(ctx, bk_ctx, ml, data_x);
+  u8 *result_x = get_matrix_l2g(ctx, bk_ctx, ml);
+  u8 *ref_x = (u8 *)xmalloc(size);
+  if (!result_x || !ref_x)
+    goto fail_exit;
+
+  matrix_tp_ref(ref_x, data_x, mls);
+
+  for (int i = 0; i < size; i++) {
+    if (result_x[i] != ref_x[i]) {
+      printf("compare failed at result_x[%d], got %d, exp %d\n",
+             i, result_x[i], ref_x[i]);
+      exit(-1);
+    }
+  }
+
+  bmk1822_lmem_free_matrix(bk_ctx, ml);
+
+fail_exit:
+  free(data_x);
+  free(result_x);
+  free(ref_x);
+}
+
+int main(void)
+{
+  bmctx_t ctx;
+  bmk_ctx_t *bk_ctx;
+  test_init(&ctx, &bk_ctx);
+  test_put_matrix_g2l_tp(&ctx, bk_ctx);
+  test_exit(&ctx);
+  return 0;
+}
diff --git a/cviruntime/test/1822/test_1822_put_tensor_stride.cpp b/cviruntime/test/1822/test_1822_put_tensor_stride.cpp
new file mode 100644
index 000000000..bbb43cbb7
--- /dev/null
+++ b/cviruntime/test/1822/test_1822_put_tensor_stride.cpp
@@ -0,0 +1,133 @@
+#include "1822_test_util.h"
+
+static void put_tensor_g2l_stride_ref(
+    u8 *ref, u8 *a,
+    tl_shape_t lmem_shape,
+    bmk1822_tensor_tgmem_stride_t gmem_stride)
+{
+  int n = lmem_shape.n;
+  int c = lmem_shape.c;
+  int h = lmem_shape.h;
+  int w = lmem_shape.w;
+
+  int n_str = gmem_stride.n;
+  int c_str = gmem_stride.c;
+  int h_str = gmem_stride.h;
+  int w_str = 1;
+
+  /*
+   * Put a strided DDR tensor into local memory with the default stride.
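+   * For example, with the strides used in this test, element
+   * (ni, ci, hi, wi) is read from a[ni * n_str + ci * c_str + hi * h_str
+   * + wi] and written densely at ((ni * c + ci) * h + hi) * w + wi.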
+ */ + + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < c; ci++) { + for (int hi = 0; hi < h; hi++) { + for (int wi = 0; wi < w; wi++) { + u64 src_i = ni * n_str + ci * c_str + hi * h_str + wi * w_str; + u64 dst_i = ni * c * h * w + ci * h * w + hi * w + wi; + ref[dst_i] = a[src_i]; + } + } + } + } +} + +static inline void put_tensor_g2l_stride( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + tl_t *tl, + bmk1822_tensor_tgmem_stride_t tg_stride, + u8 *data) +{ + int n = tl->shape.n; + int n_stride = tg_stride.n; + bmshape_t bms = BM_TENSOR_WITH_FMT(n, n_stride, 1, 1, BM_FMT_INT8); + bmmem_device_t devmem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + int ret = bm_memcpy_s2d(*ctx, devmem, data); + assert(ret == BM_SUCCESS); + + gaddr_t gaddr = bmmem_device_addr(devmem); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = tl->shape.n; + tg.shape.c = tl->shape.c; + tg.shape.h = tl->shape.h; + tg.shape.w = tl->shape.w; + tg.stride = tg_stride; + tg.base_reg_index = 0; + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, devmem); +} + +static void test_put_tensor_g2l_stride( + bmctx_t *ctx, bmk_ctx_t *bk_ctx) +{ + int n = 2; + int c = 15; + int h = 10; + int w = 8; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + bmk1822_tensor_tgmem_stride_t gmem_stride; + gmem_stride.h = w * 2; + gmem_stride.c = gmem_stride.h * h * 2; + gmem_stride.n = gmem_stride.c * c * 2; + + int size = n * c * h * w; + int stride_size = gmem_stride.n * n; + + u8 *data_x = (u8 *)xmalloc(stride_size); + for (int i = 0; i < stride_size; i++) + data_x[i] = i; + + tl_t *tl_x = + alloc_tl(bk_ctx, tl_shape, FMT_I8, 1); + put_tensor_g2l_stride(ctx, bk_ctx, tl_x, gmem_stride, data_x); + u8 *result_x = get_tensor_l2g(ctx, bk_ctx, tl_x); + u8 *ref_x = (u8 *)xmalloc(size); + if (!result_x || !ref_x) + goto fail_exit; + + put_tensor_g2l_stride_ref(ref_x, data_x, tl_shape, gmem_stride); + + for (int i = 0; i < size; i++) { + if (result_x[i] != ref_x[i]) { + printf("compare failed at result_x[%d], got %d, exp %d\n", + i, result_x[i], ref_x[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_x); + +fail_exit: + free(data_x); + free(result_x); + free(ref_x); +} + +int main (void) +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + test_put_tensor_g2l_stride(&ctx, bk_ctx); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_put_tensor_stride_unalign.cpp b/cviruntime/test/1822/test_1822_put_tensor_stride_unalign.cpp new file mode 100644 index 000000000..62b07d7ee --- /dev/null +++ b/cviruntime/test/1822/test_1822_put_tensor_stride_unalign.cpp @@ -0,0 +1,135 @@ +#include "1822_test_util.h" + +static void put_tensor_g2l_stride_unalign_ref( + u8 *ref, u8 *a, tl_shape_t tl_shape, + bmk1822_tensor_tgmem_stride_t gmem_stride) +{ + int n = tl_shape.n; + int c = tl_shape.c; + int h = tl_shape.h; + int w = tl_shape.w; + + int n_str = gmem_stride.n; + int c_str = gmem_stride.c; + int h_str = gmem_stride.h; + int w_str = 1; + + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < c; ci++) { + for (int hi = 0; hi < h; hi++) { + for (int wi = 0; wi < w; wi++) { + u64 src_i = ni * n_str + ci * c_str + hi * h_str + wi * w_str; + u64 dst_i = ci * n * h * w + ni * h * w + hi * w + wi; + ref[dst_i] = a[src_i]; + } + } + } + } +} + +static inline void 
put_tensor_g2l_stride( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + tl_t *tl, + bmk1822_tensor_tgmem_stride_t tg_stride, + u8 *data) +{ + int n = tl->shape.n; + int n_stride = tg_stride.n; + bmshape_t bms = BM_TENSOR_WITH_FMT(n, n_stride, 1, 1, BM_FMT_INT8); + bmmem_device_t devmem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + int ret = bm_memcpy_s2d(*ctx, devmem, data); + assert(ret == BM_SUCCESS); + + gaddr_t gaddr = bmmem_device_addr(devmem); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = tl->shape.n; + tg.shape.c = tl->shape.c; + tg.shape.h = tl->shape.h; + tg.shape.w = tl->shape.w; + tg.stride = tg_stride; + tg.base_reg_index = 0; + + bmk1822_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, devmem); +} + +static void test_put_tensor_g2l_stride_unalign( + bmctx_t *ctx, bmk_ctx_t *bk_ctx) +{ + int n = 6; + int c = (BM1822_HW_NPU_NUM / 2 + 1); //just larger than (npu_num/2) + int h = 1; + int w = 8; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + bmk1822_tensor_tgmem_stride_t gmem_stride; + gmem_stride.h = w * 2; + gmem_stride.c = gmem_stride.h * h * 2; + gmem_stride.n = gmem_stride.c * c * 2; + + int size = n * c * h * w; + int stride_size = gmem_stride.n * n; + + u8 *data_x = (u8 *)xmalloc(stride_size); + for (int i = 0; i < stride_size; i++) + data_x[i] = i; + + + tl_t *tl_x = + alloc_tl(bk_ctx, tl_shape, FMT_I8, 0); + + put_tensor_g2l_stride(ctx, bk_ctx, tl_x, gmem_stride, data_x); + + tl_x->shape.n = 1; + tl_x->shape.c = c; + tl_x->shape.h = n * h; + tl_x->shape.w = w; + + u8 *result_x = get_tensor_l2g(ctx, bk_ctx, tl_x); + u8 *ref_x = (u8 *)xmalloc(size); + if (!result_x || !ref_x) + goto fail_exit; + + put_tensor_g2l_stride_unalign_ref(ref_x, data_x, tl_shape, gmem_stride); + + for (int i = 0; i < size; i++) { + if (result_x[i] != ref_x[i]) { + printf("compare failed at result_x[%d], got %d, exp %d\n", + i, result_x[i], ref_x[i]); + exit(-1); + } + } + free_tl(bk_ctx, tl_x); + +fail_exit: + free(data_x); + free(result_x); + free(ref_x); +} + +int main (void) +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + test_put_tensor_g2l_stride_unalign(&ctx, bk_ctx); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_put_tensor_tp_unalign.cpp b/cviruntime/test/1822/test_1822_put_tensor_tp_unalign.cpp new file mode 100644 index 000000000..2c3c88f3a --- /dev/null +++ b/cviruntime/test/1822/test_1822_put_tensor_tp_unalign.cpp @@ -0,0 +1,122 @@ +#include "1822_test_util.h" + +static void put_tensor_g2l_tp_unalign_ref( + u8 *ref, u8 *a, tl_shape_t tl_shape) +{ + /* + * (c, n, h, w) => (n, c, h, w) => (1, c, n * h, w) + */ + + int n = tl_shape.n; + int c = tl_shape.c; + int h = tl_shape.h; + int w = tl_shape.w; + + int size = n * c * h * w; + for (int i = 0; i < size; i++) + ref[i] = a[i]; +} + + +static void put_tensor_g2l_tp( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx, + tl_t *tl, + u8 *data) +{ + int n = tl->shape.n; + int c = tl->shape.c; + int h = tl->shape.h; + int w = tl->shape.w; + + bmshape_t bms = BM_TENSOR_WITH_FMT(n, c, h, w, BM_FMT_INT8); + bmmem_device_t devmem = bmmem_device_alloc_raw(*ctx, bmshape_get_size(&bms)); + int ret = bm_memcpy_s2d(*ctx, devmem, data); + assert(ret == BM_SUCCESS); + + gaddr_t gaddr = bmmem_device_addr(devmem); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + 
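// nc-transposed copy (illustrative note): the DDR tensor is described as
+  // (c, n, h, w) relative to the lmem tensor, so n and c are swapped in
+  // the tg shape below.
+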
tg.fmt = FMT_I8; + tg.shape.n = tl->shape.c; + tg.shape.c = tl->shape.n; + tg.shape.h = tl->shape.h; + tg.shape.w = tl->shape.w; + tg.stride = bmk1822_tensor_tgmem_default_stride(tg.shape, tg.fmt); + tg.base_reg_index = 0 ; + + bmk1822_tdma_tg2l_tensor_copy_nc_transposed_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1822_tdma_g2l_tensor_copy_nc_transposed(bk_ctx, &p); + test_submit(ctx); + + bmmem_device_free(*ctx, devmem); +} + +static void test_put_tensor_g2l_tp_unalign( + bmctx_t *ctx, bmk_ctx_t *bk_ctx) +{ + int n = 2; + int c = (BM1822_HW_NPU_NUM - 1); + int h = 1; + int w = 8; + int size = n * c * h * w; + + u8 *data_x = (u8 *)xmalloc(size); + for (int i = 0; i < size; i++) + data_x[i] = i; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + tl_t *tl_x = + alloc_tl(bk_ctx, tl_shape, FMT_I8, 0); + + put_tensor_g2l_tp(ctx, bk_ctx, tl_x, data_x); + + tl_x->shape.n = 1; + tl_x->shape.c = c; + tl_x->shape.h = n * h; + tl_x->shape.w = w; + + u8 *result_x = get_tensor_l2g(ctx, bk_ctx, tl_x); + tl_x->shape = tl_shape; + u8 *ref_x = (u8 *)xmalloc(size); + if (!result_x || !ref_x) + goto fail_exit; + + put_tensor_g2l_tp_unalign_ref(ref_x, data_x, tl_shape); + + for (int i = 0; i < size; i++) { + if (result_x[i] != ref_x[i]) { + printf("compare failed at result_x[%d], got %d, exp %d\n", + i, result_x[i], ref_x[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_x); + +fail_exit: + free(data_x); + free(result_x); + free(ref_x); +} + +int main (void) +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + test_put_tensor_g2l_tp_unalign(&ctx, bk_ctx); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_put_tensor_unalign.cpp b/cviruntime/test/1822/test_1822_put_tensor_unalign.cpp new file mode 100644 index 000000000..26ba90cd9 --- /dev/null +++ b/cviruntime/test/1822/test_1822_put_tensor_unalign.cpp @@ -0,0 +1,86 @@ +#include "1822_test_util.h" + +static void put_tensor_g2l_unalign_ref( + u8 *ref, u8 *a, tl_shape_t tl_shape) +{ + int n = tl_shape.n; + int c = tl_shape.c; + int h = tl_shape.h; + int w = tl_shape.w; + + /* + * (n, c, h, w) => (1, c, n * h, w) + */ + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < c; ci++) { + for (int hi = 0; hi < h; hi++) { + for (int wi = 0; wi < w; wi++) { + u64 src_i = ni * c * h * w + ci * h * w + hi * w + wi; + u64 dst_i = ci * n * h * w + ni * h * w + hi * w + wi; + ref[dst_i] = a[src_i]; + } + } + } + } +} + +static void test_put_tensor_g2l_unalign( + bmctx_t *ctx, bmk_ctx_t *bk_ctx) +{ + int n = 4; + int c = (BM1822_HW_NPU_NUM / 2 + 1); //just larger than (npu_num/2) + int h = 1; + int w = 8; + int size = n * c * h * w; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u8 *data_x = (u8 *)xmalloc(size); + for (int i = 0; i < size; i++) + data_x[i] = i; + + tl_t *tl_x = + alloc_tl(bk_ctx, tl_shape, FMT_I8, 0); + put_tensor_g2l(ctx, bk_ctx, tl_x, data_x); + + tl_x->shape.n = 1; + tl_x->shape.c = c; + tl_x->shape.h = n * h; + tl_x->shape.w = w; + + u8 *result_x = get_tensor_l2g(ctx, bk_ctx, tl_x); + u8 *ref_x = (u8 *)xmalloc(size); + if (!result_x || !ref_x) + goto fail_exit; + + put_tensor_g2l_unalign_ref(ref_x, data_x, tl_shape); + + for (int i = 0; i < size; i++) { + if (result_x[i] != ref_x[i]) { + printf("compare failed at result_x[%d], got %d, exp %d\n", + i, result_x[i], ref_x[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_x); + +fail_exit: + free(data_x); + free(result_x); + 
free(ref_x); +} + +int main (void) +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + test_put_tensor_g2l_unalign(&ctx, bk_ctx); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tdma_l2l_tensor_copy.cpp b/cviruntime/test/1822/test_1822_tdma_l2l_tensor_copy.cpp new file mode 100644 index 000000000..d54d33fef --- /dev/null +++ b/cviruntime/test/1822/test_1822_tdma_l2l_tensor_copy.cpp @@ -0,0 +1,133 @@ +#include "1822_test_util.h" + +typedef bmk1822_tdma_l2l_tensor_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tl_shape_t src_shape; + tl_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 2, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 2, 7 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 13, 17 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 120, 5 }, + }, { + { 1, 2, 1, 1 }, + { 1, 1, 1, 2 }, + }, { + { 2, 17, 1, 4 }, + { 2, 1, 4, 17 }, + }, { + { 2, 17, 1, 4 }, + { 2, 1, 17, 4 }, + }, { + { 3, 16, 1, 1 }, + { 3, 1, 2, 8 }, + }, { + { 3, 39, 17, 23 }, + { 3, 17, 39, 23 }, + }, { + { 3, 36, 16, 20 }, + { 3, 18, 1, 640 }, + }, { + { 5, 39, 17, 23 }, + { 5, 17, 39, 23 }, + }, { + { 20, 35, 2, 2 }, + { 20, 7, 10, 2 }, + } +}; + +static void destroy_param(bmk_ctx_t *bmk, param_t *p) +{ + free_tl(bmk, p->dst); + free_tl(bmk, p->src); +} + +static void l2l_tensor_copy_ref(param_t *p, u8 ref_data[], u8 src_data[]) +{ + u64 size = tl_shape_size(&p->src->shape); + + for (u64 i = 0; i < size; i++) + ref_data[i] = src_data[i]; +} + +static void test_param(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->src->shape); + + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + + put_tensor_g2l(ctx, bmk, p->src, src_data); + bmk1822_tdma_l2l_tensor_copy(bmk, p); + u8 *dst_data = get_tensor_l2g(ctx, bmk, p->dst); + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + l2l_tensor_copy_ref(p, ref_data, src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); + free(ref_data); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + for (int src_align = 0; src_align < 2; src_align++) { + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + p.src = alloc_tl(bmk, c->src_shape, FMT_I8, src_align); + p.dst = alloc_tl(bmk, c->dst_shape, FMT_I8, dst_align); + test_param(ctx, bmk, &p); + destroy_param(bmk, &p); + } + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tdma_l2l_tensor_lrn_shift.cpp b/cviruntime/test/1822/test_1822_tdma_l2l_tensor_lrn_shift.cpp new file mode 100644 index 000000000..5b748aa8c --- /dev/null +++ 
b/cviruntime/test/1822/test_1822_tdma_l2l_tensor_lrn_shift.cpp @@ -0,0 +1,197 @@ +#include "1822_test_util.h" + +typedef bmk1822_tdma_l2l_tensor_lrn_shift_param_t param_t; + +#define ENABLE_TV_GEN_PATTERN // to reduce rom code size for RTL_SIM (rom size 128KB) + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) %s%u%s (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + (p->right_shift? "": "<-"), + p->lrn_step, + (p->right_shift? "->": ""), + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + u32 n; + u32 c; + u32 src_h; + u32 src_w; + u32 dst_h; + u32 dst_w; +} case_t; + +static case_t g_cases[] = { +#ifdef ENABLE_TV_GEN_PATTERN + { 0, 0, 3, 7, 7, 3 }, + { 0, 0, 4, 4, 2, 8 }, + { 0, 0, 14, 6, 12, 7 }, +#else + { 0, 0, 1, 1, 1, 1 }, + { 0, 0, 3, 7, 7, 3 }, + { 0, 0, 4, 4, 2, 8 }, + { 0, 0, 7, 7, 1, 49 }, + { 0, 0, 7, 8, 14, 4 }, + { 0, 0, 14, 6, 12, 7 }, +#endif +}; + +static void destroy_param(bmk_ctx_t *bmk, param_t *p) +{ + free_tl(bmk, p->dst); + free_tl(bmk, p->src); +} + +static void lrn_left_shift_ref(param_t *p, u8 ref_data[], u8 src_data[]) +{ + u32 n = p->src->shape.n; + u32 c = p->src->shape.c; + u32 hw = p->src->shape.h * p->src->shape.w; + u64 size = tl_shape_size(&p->src->shape); + + for (u64 i = 0; i < size; i++) + ref_data[i] = 0; + + for (u32 ni = 0; ni < n; ni++) { + for (u32 ci = p->lrn_step; ci < c; ci++) { + for (u32 hwi = 0; hwi < hw; hwi++) { + u32 src_i = (ni * c + ci) * hw + hwi; + u32 dst_i = src_i - p->lrn_step * hw; + ref_data[dst_i] = src_data[src_i]; + } + } + } +} + +static void lrn_right_shift_ref(param_t *p, u8 ref_data[], u8 src_data[]) +{ + u32 n = p->src->shape.n; + u32 c = p->src->shape.c; + u32 hw = p->src->shape.h * p->src->shape.w; + u64 size = tl_shape_size(&p->src->shape); + + for (u64 i = 0; i < size; i++) + ref_data[i] = 0; + + for (u32 ni = 0; ni < n; ni++) { + for (u32 ci = 0; ci < c - p->lrn_step; ci++) { + for (u32 hwi = 0; hwi < hw; hwi++) { + u32 src_i = (ni * c + ci) * hw + hwi; + u32 dst_i = src_i + p->lrn_step * hw; + ref_data[dst_i] = src_data[src_i]; + } + } + } +} + +static void l2l_tensor_lrn_shift_ref( + param_t *p, u8 ref_data[], u8 src_data[]) +{ + if (p->right_shift) + return lrn_right_shift_ref(p, ref_data, src_data); + else + return lrn_left_shift_ref(p, ref_data, src_data); +} + +static void test_param(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->src->shape); + + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + + put_tensor_g2l(ctx, bmk, p->src, src_data); + bmk1822_tdma_l2l_tensor_lrn_shift(bmk, p); + u8 *dst_data = get_tensor_l2g(ctx, bmk, p->dst); + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + l2l_tensor_lrn_shift_ref(p, ref_data, src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); + free(ref_data); +} + +static void execute_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + static const u32 steps[] = { 1, 3 }; // less than npu_num/2 + u32 nr_steps = sizeof(steps) / sizeof(steps[0]); + + for (int src_align = 0; src_align < 2; src_align++) { + for (int dst_align = 0; dst_align < 2; dst_align++) { + 
tl_shape_t src_shape, dst_shape; + src_shape.n = c->n; + src_shape.c = c->c; + src_shape.h = c->src_h; + src_shape.w = c->src_w; + dst_shape.n = c->n; + dst_shape.c = c->c; + dst_shape.h = c->dst_h; + dst_shape.w = c->dst_w; + + param_t p; + memset(&p, 0, sizeof(p)); + p.src = alloc_tl(bmk, src_shape, FMT_I8, src_align); + p.dst = alloc_tl(bmk, dst_shape, FMT_I8, dst_align); + + for (u32 i = 0; i < nr_steps; i++) { + if (steps[i] >= p.src->shape.c) + break; + p.lrn_step = steps[i]; + + p.right_shift = 0; + test_param(ctx, bmk, &p); + + p.right_shift = 1; + test_param(ctx, bmk, &p); + } + + destroy_param(bmk, &p); + } + } +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *ca) +{ + for (u32 n = 1; n < 8; n += 2) { + ca->n = n; + for (u32 c = 1; c < BM1822_HW_NPU_NUM + 1; c += 3) { + ca->c = c; + execute_case(ctx, bmk, ca); + } + for (u32 c = BM1822_HW_NPU_NUM + 1; c < BM1822_HW_NPU_NUM * 2; c += 7) { + ca->c = c; + execute_case(ctx, bmk, ca); + } + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tdma_l2tg_general_copy.cpp b/cviruntime/test/1822/test_1822_tdma_l2tg_general_copy.cpp new file mode 100644 index 000000000..0e2c6009e --- /dev/null +++ b/cviruntime/test/1822/test_1822_tdma_l2tg_general_copy.cpp @@ -0,0 +1,90 @@ +#include "1822_test_util.h" + +typedef bmk1822_tdma_l2tg_general_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: %u bytes from %" PRIx32 " to %u:%" PRIx64 "\n", tag, + p->bytes, p->src_address, p->dst_base_reg_index, p->dst_address); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef param_t case_t; + +static case_t g_cases[] = { + { 0, 0, 0, 1 }, + { 0, 0, 0, 39 }, + { 0, 0, 0, 4096 }, + { 0, 0, 100, 1 }, + { 0, 0, 200, 39 }, + { 0, 0, 1024, 4096 }, + { 39, 0, 100, 1 }, + { 47, 0, 200, 39 }, + { 2048, 0, 1024, 4096 }, +}; + +static void l2tg_general_copy_ref(param_t *p, u8 ref_data[], u8 src_data[]) +{ + for (u32 i = 0; i < p->bytes; i++) + ref_data[i] = src_data[i]; +} + +static void test_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = p->bytes; + + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + + put_bytes_g2l(ctx, bmk, p->src_address, size, src_data); + + #if 1 + u8 *dst_data = get_bytes_l2g(ctx, bmk, p->src_address, size); + + #else + bmk1822_tdma_l2g_general_copy(bmk, p); + test_submit(ctx); + u8 *dst_data = get_bytes_gmem(ctx, p->dst_address, size); + #endif + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + l2tg_general_copy_ref(p, ref_data, src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); + free(ref_data); +} + + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + param_t *p = c; + + test_param_l2g(ctx, bmk, p); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git 
a/cviruntime/test/1822/test_1822_tdma_l2tg_matrix_copy.cpp b/cviruntime/test/1822/test_1822_tdma_l2tg_matrix_copy.cpp new file mode 100644 index 000000000..ea84e9c7b --- /dev/null +++ b/cviruntime/test/1822/test_1822_tdma_l2tg_matrix_copy.cpp @@ -0,0 +1,136 @@ +#include "1822_test_util.h" + +typedef bmk1822_tdma_l2tg_matrix_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.w, p->src->shape.col, + p->dst->shape.row, p->dst->shape.col); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + ml_shape_t src_shape; + mg_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 1, 1, 1 }, + { 0, 1 }, + }, { + { 0, 1, 2, 2 }, + { 1, 2 }, + }, { + { 0, 2, 1, 2 }, + { 1, 2 }, + }, { + { 0, 1, 7, 7 }, + { 0, 7 }, + }, { + { 0, 2, 4, 7 }, + { 0, 7 }, + }, { + { 0, 7, 1, 7 }, + { 0, 7 }, + }, { + { 0, 1, 17, 17 }, + { 0, 17 }, + }, { + { 0, 3, 7, 17 }, + { 0, 17 }, + }, { + { 0, 17, 1, 17 }, + { 0, 17 }, + }, { + { 0, 1, 60, 60 }, + { 0, 60 }, + }, { + { 0, 30, 2, 60 }, + { 0, 60 }, + }, { + { 0, 60, 1, 60 }, + { 0, 60 }, + } +}; + +static void l2tg_matrix_copy_ref(param_t *p, u8 ref_data[], u8 src_data[]) +{ + u64 size = ml_shape_size(&p->src->shape); + + for (u64 i = 0; i < size; i++) + ref_data[i] = src_data[i]; +} + +static void test_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = ml_shape_size(&p->src->shape); + + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + + put_matrix_g2l(ctx, bmk, p->src, src_data); + bmk1822_tdma_l2g_matrix_copy(bmk, p); + test_submit(ctx); + u8 *dst_data = get_mg_gmem(ctx, p->dst); + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + l2tg_matrix_copy_ref(p, ref_data, src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); + free(ref_data); +} + + +static void destroy_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_ml(bmk, p->src); + free_mg_gmem(ctx, p->dst); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + for (u32 row = 1; row < 13; row += 2) { + c->src_shape.n = row; + c->dst_shape.row = row; + for (int src_align = 0; src_align < 2; src_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_ml(bmk, c->src_shape, src_align); + p.dst = alloc_mg_gmem(ctx, c->dst_shape); + test_param_l2g(ctx, bmk, &p); + destroy_param_l2g(ctx, bmk, &p); + + } + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tdma_l2tg_matrix_vlc_copy_compressed.cpp b/cviruntime/test/1822/test_1822_tdma_l2tg_matrix_vlc_copy_compressed.cpp new file mode 100644 index 000000000..64620ec74 --- /dev/null +++ b/cviruntime/test/1822/test_1822_tdma_l2tg_matrix_vlc_copy_compressed.cpp @@ -0,0 +1,163 @@ +#include "1822_test_util.h" + +typedef bmk1822_tdma_l2tg_matrix_copy_compressed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u)\n", + tag, + 
p->src->shape.n, p->src->shape.c, p->src->shape.w, p->src->shape.col, + p->dst->m.shape.row, p->dst->m.shape.col); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + ml_shape_t src_shape; + mg_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 2, 4, 7 }, + { 0, 7 }, + }, + { + { 0, 1, 2, 2 }, + { 1, 2 }, + }, + { + { 0, 3, 7, 17 }, + { 0, 17 }, + }, + { + { 0, 17, 1, 17 }, + { 0, 17 }, + }, + { + { 0, 60, 1, 60 }, + { 0, 60 }, + }, +#ifndef ENABEL_SIMPLE_BMK1822_VLC_TEST + { + { 0, 1, 1, 1 }, + { 0, 1 }, + }, + { + { 0, 2, 1, 2 }, + { 1, 2 }, + }, + { + { 0, 1, 7, 7 }, + { 0, 7 }, + }, + { + { 0, 7, 1, 7 }, + { 0, 7 }, + }, + { + { 0, 1, 17, 17 }, + { 0, 17 }, + }, + { + { 0, 1, 60, 60 }, + { 0, 60 }, + }, + { + { 0, 30, 2, 60 }, + { 0, 60 }, + }, +#endif /* ifndef ENABEL_SIMPLE_BMK1822_VLC_TEST*/ +}; + +static void test_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p, u8* src_data, CommandInfo * cmd_info) +{ + print_param(stderr, p); + u64 size = ml_shape_size(&p->src->shape); + + put_matrix_g2l(ctx, bmk, p->src, src_data); + bmk1822_tdma_l2g_matrix_copy_compressed(bmk, p); + test_submit(ctx); + + int is_signed = (p->src->fmt == FMT_I8); + int data_type = (p->src->fmt == FMT_BF16) ? 1 : 0; + size_t bs_size; + + size_t bs_buf_size = get_out_bs_buf_size(size, data_type); + u8 *ref_data = vlc_compress(src_data, size, is_signed, data_type, &bs_size, cmd_info, NULL); + u8 *dst_data = get_compressed_mg_gmem(ctx, p->dst, bs_buf_size); + + for (u64 i = 0; i < bs_size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(dst_data); + free(ref_data); +} + + +static void destroy_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_ml(bmk, p->src); + free_compressed_mg_gmem(ctx, p->dst); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + fmt_t fmts[] = { FMT_I8, FMT_U8 }; + u8 fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + + for (u32 row = 1; row < 13; row += 2) { + c->src_shape.n = row; + c->dst_shape.row = row; + for (int src_align = 0; src_align < 2; src_align++) { + for (u8 fmt_i = 0; fmt_i < fmts_sz; fmt_i++) { + fmt_t fmt = fmts[fmt_i]; + param_t p; + memset(&p, 0, sizeof(p)); + p.src = alloc_ml(bmk, c->src_shape, fmt, src_align); + + u64 size = ml_shape_size(&p.src->shape); + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + vlc_init_testdata(src_data, size, fmt == FMT_I8, fmt == FMT_BF16); + + //size_t bs_size; + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + + // fmt, &bs_size, &cmd_info); + + int is_signed = (p.src->fmt == FMT_I8); + cmd_info.signedness = is_signed; + + // dst_shape, p.src->fmt, &cmd_info); + + //printf ("row %u is_align %d fmt %d\n", row, src_align, fmt); + test_param_l2g(ctx, bmk, &p, src_data, &cmd_info); + destroy_param_l2g(ctx, bmk, &p); + free(src_data); + } + } + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tdma_l2tg_tensor_copy.cpp b/cviruntime/test/1822/test_1822_tdma_l2tg_tensor_copy.cpp new file mode 100644 index 000000000..750495002 --- /dev/null +++ b/cviruntime/test/1822/test_1822_tdma_l2tg_tensor_copy.cpp @@ -0,0 +1,135 @@ +#include "1822_test_util.h" + +typedef 
bmk1822_tdma_l2tg_tensor_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tl_shape_t src_shape; + tg_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 2, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 2, 7 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 13, 17 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 120, 5 }, + }, { + { 1, 2, 1, 1 }, + { 1, 1, 1, 2 }, + }, { + { 2, 17, 1, 4 }, + { 2, 1, 4, 17 }, + }, { + { 2, 17, 1, 4 }, + { 2, 1, 17, 4 }, + }, { + { 3, 16, 1, 1 }, + { 3, 1, 2, 8 }, + }, { + { 3, 39, 17, 23 }, + { 3, 17, 39, 23 }, + }, { + { 3, 36, 16, 20 }, + { 3, 18, 1, 640 }, + }, { + { 5, 39, 17, 23 }, + { 5, 17, 39, 23 }, + }, { + { 20, 35, 2, 2 }, + { 20, 7, 10, 2 }, + } +}; + +static void l2tg_tensor_copy_ref(param_t *p, u8 ref_data[], u8 src_data[]) +{ + u64 size = tl_shape_size(&p->src->shape); + + for (u64 i = 0; i < size; i++) + ref_data[i] = src_data[i]; +} + +static void test_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->src->shape); + + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + + put_tensor_g2l(ctx, bmk, p->src, src_data); + bmk1822_tdma_l2g_tensor_copy(bmk, p); + test_submit(ctx); + u8 *dst_data = get_tg_gmem(ctx, p->dst); + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + l2tg_tensor_copy_ref(p, ref_data, src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); + free(ref_data); +} + + +static void destroy_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_tg_gmem(ctx, p->dst); + free_tl(bmk, p->src); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + for (int src_align = 0; src_align < 2; src_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_tl(bmk, c->src_shape, FMT_I8, src_align); + p.dst = alloc_tg_gmem(ctx, c->dst_shape, FMT_I8); + test_param_l2g(ctx, bmk, &p); + destroy_param_l2g(ctx, bmk, &p); + + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tdma_l2tg_tensor_copy_cw_transposed.cpp b/cviruntime/test/1822/test_1822_tdma_l2tg_tensor_copy_cw_transposed.cpp new file mode 100644 index 000000000..728830760 --- /dev/null +++ b/cviruntime/test/1822/test_1822_tdma_l2tg_tensor_copy_cw_transposed.cpp @@ -0,0 +1,152 @@ +#include "1822_test_util.h" + +typedef bmk1822_tdma_l2tg_tensor_copy_cw_transposed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + 
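+// each case maps an lmem source (n, c, h, w) to a gmem destination with c and w
+// swapped, matching the cw-transpose performed by the TDMA path under test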
+typedef struct { + tl_shape_t src_shape; + tg_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 2, 1, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 2, 7, 1 }, + }, { + { 1, 1, 17, 13 }, + { 1, 13, 17, 1 }, + }, { + { 1, 1, 10, 60 }, + { 1, 60, 10, 1 }, + }, { + { 1, 2, 1, 1 }, + { 1, 1, 1, 2 }, + }, { + { 2, 17, 1, 4 }, + { 2, 4, 1, 17 }, + }, { + { 2, 17, 3, 4 }, + { 2, 4, 3, 17 }, + }, { + { 3, 16, 7, 1 }, + { 3, 1, 7, 16 }, + }, { + { 3, 39, 17, 23 }, + { 3, 23, 17, 39 }, + }, { + { 3, 36, 16, 20 }, + { 3, 20, 16, 36 }, + }, { + { 5, 39, 17, 23 }, + { 5, 23, 17, 39 }, + }, { + { 20, 35, 2, 2 }, + { 20, 2, 2, 35 }, + }, { + { 20, 35, 3, 2 }, + { 20, 2, 3, 35 }, + } +}; + +static void l2tg_tensor_copy_cw_transposed_ref( + param_t *p, u8 ref_data[], u8 src_data[]) +{ + tl_shape_t s = p->src->shape; + u32 n = s.n; + u32 c = s.c; + u32 h = s.h; + u32 w = s.w; + + for (u32 ni = 0; ni < n; ni++) { + for (u32 ci = 0; ci < c; ci++) { + for (u32 hi = 0; hi < h; hi++) { + for (u32 wi = 0; wi < w; wi++) { + u32 src_i = ni * c * h * w + ci * h * w + hi * w + wi; + u32 dst_i = ni * c * h * w + wi * h * c + hi * c + ci; + ref_data[dst_i] = src_data[src_i]; + } + } + } + } +} + +static void test_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->src->shape); + + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + + put_tensor_g2l(ctx, bmk, p->src, src_data); + bmk1822_tdma_l2g_tensor_copy_cw_transposed(bmk, p); + test_submit(ctx); + u8 *dst_data = get_tg_gmem(ctx, p->dst); + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + l2tg_tensor_copy_cw_transposed_ref(p, ref_data, src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); + free(ref_data); +} + + +static void destroy_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_tg_gmem(ctx, p->dst); + free_tl(bmk, p->src); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + for (int src_align = 0; src_align < 2; src_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_tl(bmk, c->src_shape, FMT_I8, src_align); + p.dst = alloc_tg_gmem(ctx, c->dst_shape, FMT_I8); + test_param_l2g(ctx, bmk, &p); + destroy_param_l2g(ctx, bmk, &p); + + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tdma_l2tg_tensor_copy_nc_transposed.cpp b/cviruntime/test/1822/test_1822_tdma_l2tg_tensor_copy_nc_transposed.cpp new file mode 100644 index 000000000..3b1eb2fe5 --- /dev/null +++ b/cviruntime/test/1822/test_1822_tdma_l2tg_tensor_copy_nc_transposed.cpp @@ -0,0 +1,226 @@ +#include "1822_test_util.h" + +typedef bmk1822_tdma_l2tg_tensor_copy_nc_transposed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + 
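+// each case maps an lmem source (n, c, h, w) to a gmem destination with n and c
+// swapped; h and w may be reshaped freely as long as h * w is preserved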
+typedef struct { + tl_shape_t src_shape; + tg_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 1, 2 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 2, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 7, 2 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 2, 7 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 17, 13 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 13, 17 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 10, 60 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 2, 300 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 3, 200 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 4, 150 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 5, 120 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 60, 10 }, + }, { + { 1, 1, 120, 5 }, + { 1, 1, 120, 5 }, + }, { + { 1, 2, 1, 1 }, + { 2, 1, 1, 1 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 1, 4 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 2, 2 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 4, 1 }, + }, { + { 17, 2, 2, 2 }, + { 2, 17, 2, 2 }, + }, { + { 17, 2, 4, 1 }, + { 2, 17, 4, 1 }, + }, { + { 3, 16, 1, 1 }, + { 16, 3, 1, 1 }, + }, { + { 3, 39, 23, 17 }, + { 39, 3, 23, 17 }, + }, { + { 3, 39, 17, 23 }, + { 39, 3, 17, 23 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 16, 20 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 2, 160 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 4, 80 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 8, 40 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 20, 16 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 32, 10 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 64, 5 }, + }, { + { 5, 39, 17, 23 }, + { 39, 5, 17, 23 }, + }, { + { 20, 35, 2, 2 }, + { 35, 20, 2, 2 }, + }, { + { 35, 20, 2, 2 }, + { 20, 35, 2, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 160, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 2, 160 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 4, 80 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 8, 40 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 10, 32 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 20, 16 }, + }, { + { 39, 5, 23, 17 }, + { 5, 39, 23, 17 }, + } +}; + +static void l2tg_tensor_copy_nc_transposed_ref( + param_t *p, u8 ref_data[], u8 src_data[]) +{ + tl_shape_t s = p->src->shape; + u32 n = s.n; + u32 c = s.c; + u32 hw = s.h * s.w; + + for (u32 ni = 0; ni < n; ni++) { + for (u32 ci = 0; ci < c; ci++) { + for (u32 hwi = 0; hwi < hw; hwi++) { + u32 src_i = ni * c * hw + ci * hw + hwi; + u32 dst_i = ci * n * hw + ni * hw + hwi; + ref_data[dst_i] = src_data[src_i]; + } + } + } +} + +static void test_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->src->shape); + + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + + put_tensor_g2l(ctx, bmk, p->src, src_data); + bmk1822_tdma_l2g_tensor_copy_nc_transposed(bmk, p); + test_submit(ctx); + u8 *dst_data = get_tg_gmem(ctx, p->dst); + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + l2tg_tensor_copy_nc_transposed_ref(p, ref_data, src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); + free(ref_data); +} + +static void destroy_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_tg_gmem(ctx, p->dst); + free_tl(bmk, p->src); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + for (int src_align = 0; src_align < 2; src_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_tl(bmk, c->src_shape, 
FMT_I8, src_align); + p.dst = alloc_tg_gmem(ctx, c->dst_shape, FMT_I8); + test_param_l2g(ctx, bmk, &p); + destroy_param_l2g(ctx, bmk, &p); + + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tdma_l2tg_tensor_fill_constant.cpp b/cviruntime/test/1822/test_1822_tdma_l2tg_tensor_fill_constant.cpp new file mode 100644 index 000000000..200aff0ae --- /dev/null +++ b/cviruntime/test/1822/test_1822_tdma_l2tg_tensor_fill_constant.cpp @@ -0,0 +1,136 @@ +#include "1822_test_util.h" + +typedef bmk1822_tdma_l2tg_tensor_fill_constant_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: %u => (%u, %u, %u, %u)\n", + tag, p->constant, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + u8 constant; + tg_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + 37, { 1, 1, 1, 1 } + }, { + 39, { 1, 1, 1, 2 } + }, { + 23, { 1, 1, 2, 1 } + }, { + 19, { 1, 1, 7, 2 } + }, { + 17, { 1, 1, 2, 7 } + }, { + 13, { 1, 1, 17, 13 } + }, { + 11, { 1, 1, 13, 17 } + }, { + 7, { 1, 1, 10, 60 } + }, { + 9, { 1, 1, 120, 5 } + }, { + 2, { 1, 2, 1, 1 } + }, { + 3, { 1, 1, 1, 2 } + }, { + 5, { 2, 17, 1, 4 } + }, { + 41, { 2, 1, 4, 17 } + }, { + 5, { 2, 17, 1, 4 } + }, { + 9, { 2, 1, 17, 4 } + }, { + 17, { 3, 16, 1, 1 } + }, { + 26, { 3, 1, 2, 8 } + }, { + 103, { 3, 39, 17, 23 } + }, { + 255, { 3, 17, 39, 23 } + }, { + 254, { 3, 36, 16, 20 } + }, { + 127, { 3, 18, 1, 640 } + }, { + 128, { 5, 39, 17, 23 } + }, { + 129, { 5, 17, 39, 23 } + }, { + 55, { 20, 35, 2, 2 } + }, { + 1, { 20, 7, 10, 2 } + } +}; + +static void l2tg_tensor_fill_constant_ref(param_t *p, u8 ref_data[]) +{ + u64 size = tg_shape_size(&p->dst->shape); + + for (u64 i = 0; i < size; i++) + ref_data[i] = p->constant; +} + +static void test_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tg_shape_size(&p->dst->shape); + + bmk1822_tdma_l2g_tensor_fill_constant(bmk, p); + test_submit(ctx); + u8 *dst_data = get_tg_gmem(ctx, p->dst); + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + l2tg_tensor_fill_constant_ref(p, ref_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(dst_data); + free(ref_data); +} + + +static void destroy_param_l2g(bmctx_t *ctx, param_t *p) +{ + free_tg_gmem(ctx, p->dst); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.constant = c->constant; + + p.dst = alloc_tg_gmem(ctx, c->dst_shape, FMT_I8); + test_param_l2g(ctx, bmk, &p); + destroy_param_l2g(ctx, &p); + +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tdma_l2tg_tensor_vlc_copy_compressed.cpp b/cviruntime/test/1822/test_1822_tdma_l2tg_tensor_vlc_copy_compressed.cpp new file mode 100644 index 000000000..4ae449575 --- /dev/null +++ 
b/cviruntime/test/1822/test_1822_tdma_l2tg_tensor_vlc_copy_compressed.cpp
@@ -0,0 +1,169 @@
+#include "1822_test_util.h"
+
+typedef bmk1822_tdma_l2tg_tensor_copy_compressed_param_t param_t;
+
+static void __print_param(const char *tag, FILE *f, param_t *p)
+{
+  fprintf(
+      f, "%s: (%u, %u, %u, %u) => %d-bit %s\n",
+      tag,
+      p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w,
+      p->dst->bit_length,
+      (p->src->fmt == FMT_I8)? "signed": "unsigned");
+}
+
+#define print_param(f, p) __print_param(__func__, f, p)
+
+typedef struct {
+  tl_shape_t lmem_shape;
+} case_t;
+
+static case_t g_cases[] = {
+  { { 1, 1, 17, 13 } },
+  { { 3, 39, 17, 23 } },
+  { { 5, 39, 17, 23 } },
+  { { 20, 35, 2, 2 } },
+#ifndef ENABEL_SIMPLE_BMK1822_VLC_TEST
+  { { 1, 1, 1, 1 } },
+  { { 1, 1, 1, 2 } },
+  { { 1, 1, 7, 2 } },
+  { { 1, 1, 10, 60 } },
+  { { 1, 2, 1, 1 } },
+  { { 2, 17, 1, 4 } },
+  { { 2, 17, 1, 4 } },
+  { { 3, 16, 1, 1 } },
+  { { 3, 36, 16, 20 } },
+#endif /* ifndef ENABEL_SIMPLE_BMK1822_VLC_TEST*/
+};
+
+static u64 l2tg_tensor_copy_vlc_compressed_ref(
+    param_t *p, u8 ref_data[], u8 src_data[], CommandInfo *cmd_info)
+{
+  u64 in_size = tl_shape_size(&p->src->shape);
+  size_t bs_size = 0;
+
+  bm_vlc_enc_int8(src_data, in_size, ref_data, &bs_size, cmd_info);
+  return bs_size;
+}
+
+static int test_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p,
+                          CommandInfo *cmd_info_est, u8 *src_data)
+{
+  print_param(stderr, p);
+  int ret = 0;
+
+  put_tensor_g2l(ctx, bmk, p->src, src_data);
+  bmk1822_tdma_l2g_tensor_copy_compressed(bmk, p);
+  test_submit(ctx);
+
+  u8 *dst_data = get_compressed_tg_gmem(ctx, p->dst);
+  u8 *ref_data = (u8 *)malloc(sizeof(u8) * p->dst->reserved_size);
+
+  // reference bitstream comes from the host-side VLC encoder
+  u64 bs_size = l2tg_tensor_copy_vlc_compressed_ref(p, ref_data, src_data, cmd_info_est);
+
+  for (u64 i = 0; i < bs_size; i++) {
+    if (dst_data[i] != ref_data[i]) {
+      fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n",
+              i, dst_data[i], ref_data[i]);
+      ret = -1;
+      break;
+    }
+  }
+
+  free(dst_data);
+  free(ref_data);
+  return ret;
+}
+
+static void destroy_param_l2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p)
+{
+  free_compressed_tg_gmem(ctx, p->dst);
+  free_tl(bmk, p->src);
+}
+
+static int test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c)
+{
+  int ret = 0;
+  fmt_t fmts[] = { FMT_I8, FMT_U8 };
+
+  for (int src_align = 0; src_align < 2; src_align++) {
+    for (u8 fmt_i = 0; fmt_i < 2; fmt_i++) {
+      fmt_t fmt = fmts[fmt_i];
+      param_t p;
+      memset(&p, 0, sizeof(p));
+
+      p.src = alloc_tl(bmk, c->lmem_shape, fmt, src_align);
+      assert(p.src);
+
+      CommandInfo cmd_info;
+      memset(&cmd_info, 0, sizeof(CommandInfo));
+      u64 in_size = tl_shape_size(&p.src->shape);
+
+      u8 *src_data = (u8 *)malloc(sizeof(u8) * in_size);
+      vlc_init_testdata(src_data, in_size, fmt == FMT_I8, fmt == FMT_BF16);
+
+      int is_signed = (p.src->fmt == FMT_I8);
+      cmd_info.signedness = is_signed;
+
+      // compressed destination lives in global memory
+      p.dst = alloc_vlc_compressed_tg_gmem(ctx, &c->lmem_shape, fmt);
+
+      ret |= test_param_l2g(ctx, bmk, &p, &cmd_info, src_data);
+      destroy_param_l2g(ctx, bmk, &p);
+
+      free(src_data);
+    }
+  }
+
+  return ret;
+}
+
+int main()
+{
+  int ret = 0;
+  bmctx_t ctx;
+  bmk_ctx_t *bmk;
+  test_init(&ctx, &bmk);
+
+  u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]);
+  for (u32 i = 0; i < nr_cases; i++)
+    ret |= test_one_case(&ctx, bmk, &g_cases[i]);
+
+  test_exit(&ctx);
+  return ret;
+}
diff --git a/cviruntime/test/1822/test_1822_tdma_matrix_vlc_decompress_compress.cpp b/cviruntime/test/1822/test_1822_tdma_matrix_vlc_decompress_compress.cpp
new file mode 100644
index 000000000..01c7b46c5
--- /dev/null
+++ b/cviruntime/test/1822/test_1822_tdma_matrix_vlc_decompress_compress.cpp
@@ -0,0 +1,189 @@
+#include "1822_test_util.h"
+
+typedef bmk1822_tdma_tg2l_matrix_copy_decompressed_param_t decompress_param_t;
+typedef bmk1822_tdma_l2tg_matrix_copy_compressed_param_t compress_param_t;
+
+typedef struct{
+  decompress_param_t dec_p;
+  compress_param_t com_p;
+} param_t;
+
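+// round-trip check: a host-compressed bitstream is decompressed into local memory,
+// recompressed back to global memory, and must match the original byte for byte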
+static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => %s\n", + tag, + p->dec_p.dst->shape.n, p->dec_p.dst->shape.c, p->dec_p.dst->shape.w, p->dec_p.dst->shape.col, + (p->dec_p.dst->fmt == FMT_I8)? "signed": "unsigned"); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + mg_shape_t src_shape; + ml_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 1 }, + { 0, 1, 1, 1 }, + }, + { + { 0, 2 }, + { 0, 1, 2, 2 }, + }, + { + { 0, 2 }, + { 0, 2, 1, 2 }, + }, + { + { 0, 7 }, + { 0, 1, 7, 7 }, + }, +#ifndef ENABEL_SIMPLE_BMK1822_VLC_TEST + { + { 0, 7 }, + { 0, 2, 4, 7 }, + }, + { + { 0, 7 }, + { 0, 7, 1, 7 }, + }, + { + { 0, 17 }, + { 0, 1, 17, 17 }, + }, + { + { 0, 17 }, + { 0, 3, 7, 17 }, + }, + { + { 0, 17 }, + { 0, 17, 1, 17 }, + }, + { + { 0, 60 }, + { 0, 1, 60, 60 }, + }, + { + { 0, 60 }, + { 0, 30, 2, 60 }, + }, + { + { 0, 60 }, + { 0, 60, 1, 60 }, + } +#endif /* ifndef ENABEL_SIMPLE_BMK1822_VLC_TEST*/ +}; + +static void test_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p, u8 *src_data, + CommandInfo* cmd_info) +{ + print_param(stderr, p); + u64 size = ml_shape_size(&p->dec_p.dst->shape); + int is_signed = (p->dec_p.dst->fmt == FMT_I8); + + u8 *gmem_data; + size_t bs_size; + size_t data_type = (p->dec_p.dst->fmt == FMT_BF16) ? 1 : 0; + + // command info + gmem_data = vlc_compress(src_data, size, is_signed, data_type, &bs_size, cmd_info, NULL); + + //1. send compressed one to gaddr and decompress from gaddr to local + put_compressed_mg_gmem(ctx, p->dec_p.src, gmem_data, bs_size); + bmk1822_tdma_g2l_matrix_copy_decompressed(bmk, &p->dec_p); + test_submit(ctx); + + //2. decompress from sram + bmk1822_tdma_l2g_matrix_copy_compressed(bmk, &p->com_p); + test_submit(ctx); + + //3. get final data + size_t bs_buf_size = get_out_bs_buf_size(size, data_type); + u8 *dst_data = get_compressed_mg_gmem(ctx, p->com_p.dst, bs_buf_size); + + for (u64 i = 0; i < bs_size ; i++) { + if (dst_data[i] != gmem_data[i]) { + fprintf(stderr, "vlc compress comparing failed at dst[%" PRIx64 "], got %d, exp %d\n", + i, dst_data[i], gmem_data[i]); + exit(-1); + } + } + + free(dst_data); + free(gmem_data); +} + +static void destroy_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_compressed_mg_gmem(ctx, p->dec_p.src); + free_compressed_mg_gmem(ctx, p->com_p.dst); + free_ml(bmk, p->dec_p.dst); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + fmt_t fmts[] = { FMT_I8, FMT_U8 }; + u8 fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + + for (u32 row = 1; row < 13; row += 2) { + c->src_shape.row = row; + c->dst_shape.n = row; + for (int dst_align = 0; dst_align < 2; dst_align++) { + for (u8 fmt_i = 0; fmt_i < fmts_sz; fmt_i++) { + fmt_t fmt = fmts[fmt_i]; + param_t p; + memset(&p, 0, sizeof(p)); + //put compressed data to gaddr ->decompress to local -> compress to gaddr + + int is_signed = (fmt == FMT_I8); + + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + cmd_info.signedness = is_signed; + + // src_shape, fmt, &cmd_info); + p.dec_p.dst = alloc_ml(bmk, c->dst_shape, fmt, dst_align); + + u64 size = ml_shape_size(&p.dec_p.dst->shape); + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + vlc_init_testdata(src_data, size, fmt == FMT_I8, fmt == FMT_BF16); + + assert(p.dec_p.dst); + + //2. alloc compress + p.com_p.src = p.dec_p.dst; //alloc_tl(bmk, c->lmem_shape, fmt, align); + p.com_p.dst = alloc_vlc_compressed_mg_gmem(ctx, c->src_shape, fmt, &cmd_info); + + //3. 
test: the seqence like below: + //3.1 put compressed data to gaddr + //3.2 decompress to local + //3.3 compress to gaddr + //printf ("row %u is_align %d fmt %d\n", row, dst_align, fmt); + test_param_g2l(ctx, bmk, &p, src_data, &cmd_info); + destroy_param_g2l(ctx, bmk, &p); + free(src_data); + } + } + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tdma_tensor_vlc_decompress_compress.cpp b/cviruntime/test/1822/test_1822_tdma_tensor_vlc_decompress_compress.cpp new file mode 100644 index 000000000..4d704747c --- /dev/null +++ b/cviruntime/test/1822/test_1822_tdma_tensor_vlc_decompress_compress.cpp @@ -0,0 +1,166 @@ +#include "1822_test_util.h" + +typedef bmk1822_tdma_tg2l_tensor_copy_decompressed_param_t decompress_param_t; +typedef bmk1822_tdma_l2tg_tensor_copy_compressed_param_t compress_param_t; + +typedef struct{ + decompress_param_t dec_p; + compress_param_t com_p; +} param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => %d-bit %s\n", + tag, + p->dec_p.dst->shape.n, p->dec_p.dst->shape.c, p->dec_p.dst->shape.h, p->dec_p.dst->shape.w, + p->dec_p.src->bit_length, + (p->dec_p.dst->fmt == FMT_I8)? "signed": "unsigned"); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tl_shape_t lmem_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 17, 13 } + }, + { + { 3, 39, 17, 23 } + }, + { + { 5, 39, 17, 23 } + }, + { + { 20, 35, 2, 2 } + }, +#ifndef ENABEL_SIMPLE_BMK1822_VLC_TEST + { + { 1, 1, 1, 1 } + }, + { + { 1, 1, 1, 2 } + }, + { + { 1, 1, 7, 2 } + }, + { + { 1, 1, 10, 60 } + }, + { + { 1, 2, 1, 1 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 3, 16, 1, 1 } + }, + { + { 3, 36, 16, 20 } + }, +#endif /* ifndef ENABEL_SIMPLE_BMK1822_VLC_TEST*/ +}; + +static void test_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p, compressed_tg_t* dst) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->dec_p.dst->shape); + int is_signed = (p->dec_p.dst->fmt == FMT_I8); + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + vlc_init_testdata(src_data, size, p->dec_p.dst->fmt == FMT_I8, p->dec_p.dst->fmt == FMT_BF16); + + u8 *gmem_data; + size_t total_size; + size_t data_type = (p->dec_p.dst->fmt == FMT_BF16) ? 
1 : 0; + size_t in_size = size; + size_t bs_buf_size = get_out_bs_buf_size(size, data_type); + gmem_data = (uint8_t *) malloc(bs_buf_size * sizeof(uint8_t)); + + // command info + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + cmd_info.signedness = is_signed; + + // dec_p.src, gmem_data, total_size); + bmk1822_tdma_g2l_tensor_copy_decompressed(bmk, &p->dec_p); + test_submit(ctx); + + dst->zero_guard_en = cmd_info.zero_guard_en; + dst->bias0 = cmd_info.bias0; + dst->bias1 = cmd_info.bias1; + p->com_p.dst = dst; + bmk1822_tdma_l2g_tensor_copy_compressed(bmk, &p->com_p); + test_submit(ctx); + + u8 *dst_data = get_compressed_tg_gmem(ctx, p->com_p.dst); + + for (u64 i = 0; i < total_size ; i++) { + if (dst_data[i] != gmem_data[i]) { + fprintf(stderr, "vlc compress comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], gmem_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); + free(gmem_data); +} + +static void destroy_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_compressed_tg_gmem(ctx, p->dec_p.src); + free_compressed_tg_gmem(ctx, p->com_p.dst); + free_tl(bmk, p->dec_p.dst); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + fmt_t fmts[2] = { FMT_I8, FMT_U8 }; + + for (int align = 0; align < 2; align++) { + for (u8 fmt_i = 0; fmt_i < 2; fmt_i++) { + fmt_t fmt = fmts[fmt_i]; + + param_t p; + memset(&p, 0, sizeof(p)); + p.dec_p.src = alloc_vlc_compressed_tg_gmem(ctx, + &c->lmem_shape, fmt); + p.dec_p.dst = alloc_tl(bmk, c->lmem_shape, fmt, align); + assert(p.dec_p.dst); + + p.com_p.src = p.dec_p.dst; //alloc_tl(bmk, c->lmem_shape, fmt, align); + assert(p.com_p.src); + compressed_tg_t* dst = alloc_vlc_compressed_tg_gmem(ctx, + &c->lmem_shape, fmt); + + test_param_g2l(ctx, bmk, &p, dst); + destroy_param_g2l(ctx, bmk, &p); + } + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tdma_tg2l_general_copy.cpp b/cviruntime/test/1822/test_1822_tdma_tg2l_general_copy.cpp new file mode 100644 index 000000000..6f6b34127 --- /dev/null +++ b/cviruntime/test/1822/test_1822_tdma_tg2l_general_copy.cpp @@ -0,0 +1,92 @@ +#include "1822_test_util.h" + +typedef bmk1822_tdma_tg2l_general_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: %u bytes from %u:%" PRIu64 " to %" PRIu32 "\n", tag, + p->bytes, p->src_base_reg_index, p->src_address, p->dst_address); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef param_t case_t; + +static case_t g_cases[] = { + { 0, 0, 0, 1 }, + { 0, 0, 0, 39 }, + { 0, 0, 0, 4096 }, + { 0, 1, 0, 1 }, + { 0, 1, 0, 39 }, + { 0, 1, 0, 4096 }, + { 0, 1, 100, 1 }, + { 0, 1, 200, 39 }, + { 0, 1, 4096, 4096 }, + { 0, 257, 100, 1 }, + { 0, 349, 200, 39 }, + { 0, 3356, 4096, 4096 }, +}; + +static void tg2l_general_copy_ref(param_t *p, u8 ref_data[], u8 src_data[]) +{ + for (u32 i = 0; i < p->bytes; i++) + ref_data[i] = src_data[i]; +} + +static void test_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = p->bytes; + + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + +#if 1 + put_bytes_g2l(ctx, bmk, p->dst_address, size, src_data); + +#else + put_bytes_gmem(ctx, p->src_address, 
size, src_data); + bmk1822_tdma_g2l_general_copy(bmk, p); + test_submit(ctx); +#endif + + u8 *dst_data = get_bytes_l2g(ctx, bmk, p->dst_address, size); + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + tg2l_general_copy_ref(p, ref_data, src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); + free(ref_data); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + param_t *p = c; + + test_param_g2l(ctx, bmk, p); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tdma_tg2l_matrix_copy.cpp b/cviruntime/test/1822/test_1822_tdma_tg2l_matrix_copy.cpp new file mode 100644 index 000000000..d278e0345 --- /dev/null +++ b/cviruntime/test/1822/test_1822_tdma_tg2l_matrix_copy.cpp @@ -0,0 +1,135 @@ +#include "1822_test_util.h" + +typedef bmk1822_tdma_tg2l_matrix_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.row, p->src->shape.col, + p->dst->shape.n, p->dst->shape.c, + p->dst->shape.w, p->dst->shape.col); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + mg_shape_t src_shape; + ml_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 1 }, + { 0, 1, 1, 1 }, + }, { + { 0, 2 }, + { 0, 1, 2, 2 }, + }, { + { 0, 2 }, + { 0, 2, 1, 2 }, + }, { + { 0, 7 }, + { 0, 1, 7, 7 }, + }, { + { 0, 7 }, + { 0, 2, 4, 7 }, + }, { + { 0, 7 }, + { 0, 7, 1, 7 }, + }, { + { 0, 17 }, + { 0, 1, 17, 17 }, + }, { + { 0, 17 }, + { 0, 3, 7, 17 }, + }, { + { 0, 17 }, + { 0, 17, 1, 17 }, + }, { + { 0, 60 }, + { 0, 1, 60, 60 }, + }, { + { 0, 60 }, + { 0, 30, 2, 60 }, + }, { + { 0, 60 }, + { 0, 60, 1, 60 }, + } +}; + +static void tg2l_matrix_copy_ref(param_t *p, u8 ref_data[], u8 src_data[]) +{ + u64 size = ml_shape_size(&p->dst->shape); + + for (u64 i = 0; i < size; i++) + ref_data[i] = src_data[i]; +} + +static void test_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = ml_shape_size(&p->dst->shape); + + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + + put_mg_gmem(ctx, p->src, src_data); + bmk1822_tdma_g2l_matrix_copy(bmk, p); + test_submit(ctx); + u8 *dst_data = get_matrix_l2g(ctx, bmk, p->dst); + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + tg2l_matrix_copy_ref(p, ref_data, src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); + free(ref_data); +} + +static void destroy_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_mg_gmem(ctx, p->src); + free_ml(bmk, p->dst); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + for (u32 row = 1; row < 13; row += 2) { + c->src_shape.row = row; + c->dst_shape.n = row; + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_mg_gmem(ctx, c->src_shape); + p.dst = alloc_ml(bmk, 
c->dst_shape, dst_align); + test_param_g2l(ctx, bmk, &p); + destroy_param_g2l(ctx, bmk, &p); + } + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tdma_tg2l_matrix_copy_row_col_transposed.cpp b/cviruntime/test/1822/test_1822_tdma_tg2l_matrix_copy_row_col_transposed.cpp new file mode 100644 index 000000000..2bcf2aee3 --- /dev/null +++ b/cviruntime/test/1822/test_1822_tdma_tg2l_matrix_copy_row_col_transposed.cpp @@ -0,0 +1,386 @@ +#include "1822_test_util.h" + +typedef bmk1822_tdma_tg2l_matrix_copy_row_col_transposed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.row, p->src->shape.col, + p->dst->shape.n, p->dst->shape.c, + p->dst->shape.w, p->dst->shape.col); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + mg_shape_t src_shape; + ml_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 2 }, + { 2, 1, 1, 1 }, + }, { + { 1, 7 }, + { 7, 1, 1, 1 }, + }, { + { 1, 17 }, + { 17, 1, 1, 1 }, + }, { + { 1, 60 }, + { 60, 1, 1, 1 }, + }, { + { 1, 139 }, + { 139, 1, 1, 1 }, + }, { + { 2, 1 }, + { 1, 1, 2, 2 }, + }, { + { 2, 1 }, + { 1, 2, 1, 2 }, + }, { + { 2, 2 }, + { 2, 1, 2, 2 }, + }, { + { 2, 2 }, + { 2, 2, 1, 2 }, + }, { + { 2, 7 }, + { 7, 1, 2, 2 }, + }, { + { 2, 7 }, + { 7, 2, 1, 2 }, + }, { + { 2, 17 }, + { 17, 1, 2, 2 }, + }, { + { 2, 17 }, + { 17, 2, 1, 2 }, + }, { + { 2, 60 }, + { 60, 1, 2, 2 }, + }, { + { 2, 60 }, + { 60, 2, 1, 2 }, + }, { + { 2, 139 }, + { 139, 1, 2, 2 }, + }, { + { 2, 139 }, + { 139, 2, 1, 2 }, + }, { + { 7, 1 }, + { 1, 1, 7, 7 }, + }, { + { 7, 1 }, + { 1, 2, 4, 7 }, + }, { + { 7, 1 }, + { 1, 2, 5, 7 }, + }, { + { 7, 1 }, + { 1, 2, 6, 7 }, + }, { + { 7, 1 }, + { 1, 3, 3, 7 }, + }, { + { 7, 1 }, + { 1, 4, 2, 7 }, + }, { + { 7, 1 }, + { 1, 7, 1, 7 }, + }, { + { 7, 2 }, + { 2, 1, 7, 7 }, + }, { + { 7, 2 }, + { 2, 2, 4, 7 }, + }, { + { 7, 2 }, + { 2, 2, 5, 7 }, + }, { + { 7, 2 }, + { 2, 2, 6, 7 }, + }, { + { 7, 2 }, + { 2, 3, 3, 7 }, + }, { + { 7, 2 }, + { 2, 4, 2, 7 }, + }, { + { 7, 2 }, + { 2, 7, 1, 7 }, + }, { + { 7, 7 }, + { 7, 1, 7, 7 }, + }, { + { 7, 7 }, + { 7, 3, 3, 7 }, + }, { + { 7, 7 }, + { 7, 4, 2, 7 }, + }, { + { 7, 7 }, + { 7, 7, 1, 7 }, + }, { + { 7, 17 }, + { 17, 1, 7, 7 }, + }, { + { 7, 17 }, + { 17, 4, 2, 7 }, + }, { + { 7, 17 }, + { 17, 7, 1, 7 }, + }, { + { 7, 60 }, + { 60, 1, 7, 7 }, + }, { + { 7, 60 }, + { 60, 3, 3, 7 }, + }, { + { 7, 60 }, + { 60, 7, 1, 7 }, + }, { + { 7, 139 }, + { 139, 1, 7, 7 }, + }, { + { 7, 139 }, + { 139, 3, 3, 7 }, + }, { + { 7, 139 }, + { 139, 7, 1, 7 }, + }, { + { 43, 1 }, + { 1, 1, 43, 43 }, + }, { + { 43, 1 }, + { 1, 2, 22, 43 }, + }, { + { 43, 1 }, + { 1, 2, 25, 43 }, + }, { + { 43, 1 }, + { 1, 2, 37, 43 }, + }, { + { 43, 1 }, + { 1, 2, 41, 43 }, + }, { + { 43, 1 }, + { 1, 5, 9, 43 }, + }, { + { 43, 1 }, + { 1, 5, 10, 43 }, + }, { + { 43, 1 }, + { 1, 9, 5, 43 }, + }, { + { 43, 1 }, + { 1, 22, 2, 43 }, + }, { + { 43, 1 }, + { 1, 43, 1, 43 }, + }, { + { 43, 2 }, + { 2, 1, 43, 43 }, + }, { + { 43, 2 }, + { 2, 2, 27, 43 }, + }, { + { 43, 2 }, + { 2, 22, 2, 43 }, + }, { + { 43, 2 }, + { 2, 43, 1, 43 }, + }, { + { 57, 7 }, + { 7, 1, 57, 57 }, + }, { + { 57, 7 }, + { 7, 2, 37, 57 }, + 
}, { + { 57, 7 }, + { 7, 2, 43, 57 }, + }, { + { 57, 7 }, + { 7, 2, 55, 57 }, + }, { + { 57, 7 }, + { 7, 2, 56, 57 }, + }, { + { 57, 7 }, + { 7, 7, 9, 57 }, + }, { + { 57, 7 }, + { 7, 8, 8, 57 }, + }, { + { 57, 7 }, + { 7, 29, 2, 57 }, + }, { + { 57, 7 }, + { 7, 57, 1, 57 }, + }, { + { 67, 17 }, + { 17, 1, 67, 67 }, + }, { + { 67, 17 }, + { 17, 2, 34, 67 }, + }, { + { 67, 17 }, + { 17, 2, 49, 67 }, + }, { + { 67, 17 }, + { 17, 2, 66, 67 }, + }, { + { 67, 17 }, + { 17, 6, 12, 67 }, + }, { + { 67, 17 }, + { 17, 6, 13, 67 }, + }, { + { 67, 17 }, + { 17, 17, 4, 67 }, + }, { + { 67, 17 }, + { 17, 34, 2, 67 }, + }, { + { 67, 17 }, + { 17, 67, 1, 67 }, + }, { + { 129, 139 }, + { 139, 1, 129, 129 }, + }, { + { 129, 139 }, + { 139, 2, 65, 129 }, + }, { + { 129, 139 }, + { 139, 2, 80, 129 }, + }, { + { 129, 139 }, + { 139, 2, 120, 129 }, + }, { + { 129, 139 }, + { 139, 2, 128, 129 }, + }, { + { 129, 139 }, + { 139, 3, 43, 129 }, + }, { + { 129, 139 }, + { 139, 3, 47, 129 }, + }, { + { 129, 139 }, + { 139, 3, 59, 129 }, + }, { + { 129, 139 }, + { 139, 3, 64, 129 }, + }, { + { 129, 139 }, + { 139, 7, 19, 129 }, + }, { + { 129, 139 }, + { 139, 7, 20, 129 }, + }, { + { 129, 139 }, + { 139, 7, 21, 129 }, + }, { + { 129, 139 }, + { 139, 43, 3, 129 }, + }, { + { 129, 139 }, + { 139, 65, 2, 129 }, + } +// out of lmem size +// , { +// { 129, 139 }, +// { 139, 129, 1, 129 }, +// } +}; + +static void tg2l_matrix_copy_row_col_transposed_ref( + param_t *p, u8 ref_data[], u8 src_data[]) +{ + u64 row = p->src->shape.row; + u64 col = p->src->shape.col; + + for (u64 ri = 0; ri < row; ri++) { + for (u64 ci = 0; ci < col; ci++) { + u64 src_i = ri * col + ci; + u64 dst_i = ci * row + ri; + ref_data[dst_i] = src_data[src_i]; + } + } +} + +static void test_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = ml_shape_size(&p->dst->shape); + + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + + put_mg_gmem(ctx, p->src, src_data); + bmk1822_tdma_g2l_matrix_copy_row_col_transposed(bmk, p); + test_submit(ctx); + u8 *dst_data = get_matrix_l2g(ctx, bmk, p->dst); + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + tg2l_matrix_copy_row_col_transposed_ref(p, ref_data, src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); + free(ref_data); +} + + +static void destroy_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_mg_gmem(ctx, p->src); + free_ml(bmk, p->dst); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + param_t p; + /* + * Matrix transpose must be n/c stride alignment + * for TDMA limitation + */ + int dst_align = 1; + + memset(&p, 0, sizeof(p)); + + p.src = alloc_mg_gmem(ctx, c->src_shape); + p.dst = alloc_ml(bmk, c->dst_shape, dst_align); + test_param_g2l(ctx, bmk, &p); + destroy_param_g2l(ctx, bmk, &p); + +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tdma_tg2l_matrix_vlc_copy_decompressed.cpp b/cviruntime/test/1822/test_1822_tdma_tg2l_matrix_vlc_copy_decompressed.cpp new file mode 100644 index 000000000..4d2947e4d --- /dev/null +++ 
b/cviruntime/test/1822/test_1822_tdma_tg2l_matrix_vlc_copy_decompressed.cpp @@ -0,0 +1,182 @@ +#include "1822_test_util.h" + +typedef bmk1822_tdma_tg2l_matrix_copy_decompressed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->m.shape.row, p->src->m.shape.col, + p->dst->shape.n, p->dst->shape.c, + p->dst->shape.w, p->dst->shape.col); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + mg_shape_t src_shape; + ml_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 17 }, + { 0, 3, 7, 17 }, + }, + { + { 0, 7 }, + { 0, 7, 1, 7 }, + }, + { + { 0, 60 }, + { 0, 30, 2, 60 }, + }, +#ifndef ENABEL_SIMPLE_BMK1822_VLC_TEST + { + { 0, 1 }, + { 0, 1, 1, 1 }, + }, + { + { 0, 2 }, + { 0, 1, 2, 2 }, + }, + { + { 0, 2 }, + { 0, 2, 1, 2 }, + }, + { + { 0, 7 }, + { 0, 1, 7, 7 }, + }, + { + { 0, 7 }, + { 0, 2, 4, 7 }, + }, + { + { 0, 17 }, + { 0, 1, 17, 17 }, + }, + { + { 0, 17 }, + { 0, 17, 1, 17 }, + }, + { + { 0, 60 }, + { 0, 1, 60, 60 }, + }, + { + { 0, 60 }, + { 0, 60, 1, 60 }, + } +#endif /* ifndef ENABEL_SIMPLE_BMK1822_VLC_TEST*/ +}; + +static void tg2l_matrix_copy_ref(param_t *p, u8 ref_data[], u8 src_data[]) +{ + u64 size = ml_shape_size(&p->dst->shape); + + for (u64 i = 0; i < size; i++) + ref_data[i] = src_data[i]; +} + +static void test_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p, u8 *src_data, CommandInfo* cmd_info) +{ + print_param(stderr, p); + + u64 in_size = ml_shape_size(&p->dst->shape); + size_t bs_size = 0; + int is_signed = (p->dst->fmt == FMT_I8); + size_t data_type = (p->dst->fmt == FMT_BF16) ? 1 : 0; + + u8 *bsbuf = vlc_compress(src_data, in_size, is_signed, data_type, &bs_size, cmd_info, NULL); + + put_compressed_mg_gmem(ctx, p->src, bsbuf, bs_size); + bmk1822_tdma_g2l_matrix_copy_decompressed(bmk, p); + test_submit(ctx); + u8 *dst_data = get_matrix_l2g(ctx, bmk, p->dst); + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * in_size); + tg2l_matrix_copy_ref(p, ref_data, src_data); + + for (u64 i = 0; i < in_size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(dst_data); + free(ref_data); + free(bsbuf); +} + +static void destroy_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_compressed_mg_gmem(ctx, p->src); + free_ml(bmk, p->dst); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + fmt_t fmts[] = { FMT_I8, FMT_U8 }; + u8 fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + + for (u32 row = 1; row < 13; row += 2) { + c->src_shape.row = row; + c->dst_shape.n = row; + for (int mode = 0; mode < VLC_CMP_MODE_MAX; mode++) { + for (int dst_align = 0; dst_align < 2; dst_align++) { + for (u8 fmt_i = 0; fmt_i < fmts_sz; fmt_i++) { + fmt_t fmt = fmts[fmt_i]; + param_t p; + + memset(&p, 0, sizeof(p)); + + int is_signed = (fmt == FMT_I8); + size_t data_type = (fmt == FMT_BF16) ? 
1 : 0;
+          CommandInfo cmd_info;
+
+          memset(&cmd_info, 0, sizeof(CommandInfo));
+          cmd_info.signedness = is_signed;
+
+          // decompressed destination in local memory
+          p.dst = alloc_ml(bmk, c->dst_shape, fmt, dst_align);
+          u64 in_size = ml_shape_size(&p.dst->shape);
+
+          u8 *src_data = (u8 *)malloc(sizeof(u8) * in_size);
+          vlc_init_testdata(src_data, in_size, fmt == FMT_I8, fmt == FMT_BF16);
+
+          // compressed source in global memory
+          p.src = alloc_vlc_compressed_mg_gmem(ctx, c->src_shape, fmt, &cmd_info);
+
+          //printf ("row %u mode %d is_align %d fmt %d\n", row, mode, dst_align, fmt);
+          test_param_g2l(ctx, bmk, &p, src_data, &cmd_info);
+
+          free(src_data);
+          destroy_param_g2l(ctx, bmk, &p);
+        }
+      }
+    }
+  }
+}
+
+int main()
+{
+  bmctx_t ctx;
+  bmk_ctx_t *bmk;
+  test_init(&ctx, &bmk);
+
+  u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]);
+  for (u32 i = 0; i < nr_cases; i++)
+    test_one_case(&ctx, bmk, &g_cases[i]);
+
+  test_exit(&ctx);
+  return 0;
+}
diff --git a/cviruntime/test/1822/test_1822_tdma_tg2l_tensor_copy.cpp b/cviruntime/test/1822/test_1822_tdma_tg2l_tensor_copy.cpp
new file mode 100644
index 000000000..1a2b908be
--- /dev/null
+++ b/cviruntime/test/1822/test_1822_tdma_tg2l_tensor_copy.cpp
@@ -0,0 +1,133 @@
+#include "1822_test_util.h"
+
+typedef bmk1822_tdma_tg2l_tensor_copy_param_t param_t;
+
+static void __print_param(const char *tag, FILE *f, param_t *p)
+{
+  fprintf(
+      f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n",
+      tag,
+      p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w,
+      p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w);
+}
+
+#define print_param(f, p) __print_param(__func__, f, p)
+
+typedef struct {
+  tg_shape_t src_shape;
+  tl_shape_t dst_shape;
+} case_t;
+
+static case_t g_cases[] = {
+  { { 1, 1, 1, 1 },    { 1, 1, 1, 1 } },
+  { { 1, 1, 1, 2 },    { 1, 1, 2, 1 } },
+  { { 1, 1, 7, 2 },    { 1, 1, 2, 7 } },
+  { { 1, 1, 17, 13 },  { 1, 1, 13, 17 } },
+  { { 1, 1, 10, 60 },  { 1, 1, 120, 5 } },
+  { { 1, 2, 1, 1 },    { 1, 1, 1, 2 } },
+  { { 2, 17, 1, 4 },   { 2, 1, 4, 17 } },
+  { { 2, 17, 1, 4 },   { 2, 1, 17, 4 } },
+  { { 3, 16, 1, 1 },   { 3, 1, 2, 8 } },
+  { { 3, 39, 17, 23 }, { 3, 17, 39, 23 } },
+  { { 3, 36, 16, 20 }, { 3, 18, 1, 640 } },
+  { { 5, 39, 17, 23 }, { 5, 17, 39, 23 } },
+  { { 20, 35, 2, 2 },  { 20, 7, 10, 2 } },
+};
+
+static void tg2l_tensor_copy_ref(param_t *p, u8 ref_data[], u8 src_data[])
+{
+  u64 size = tl_shape_size(&p->dst->shape);
+
+  for (u64 i = 0; i < size; i++)
+    ref_data[i] = src_data[i];
+}
+
+static void test_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p)
+{
+  print_param(stderr, p);
+  u64 size = tl_shape_size(&p->dst->shape);
+
+  u8 *src_data = (u8 *)malloc(sizeof(u8) * size);
+  for (u64 i = 0; i < size; i++)
+    src_data[i] = 200 + i;
+
+  put_tg_gmem(ctx, p->src, src_data);
+  bmk1822_tdma_g2l_tensor_copy(bmk, p);
+  test_submit(ctx);
+  u8 *dst_data = get_tensor_l2g(ctx, bmk, p->dst);
+
+  u8 *ref_data = (u8 *)malloc(sizeof(u8) * size);
+  tg2l_tensor_copy_ref(p, ref_data, src_data);
+
+  for (u64 i = 0; i < size; i++) {
+    if (dst_data[i] != ref_data[i]) {
+      fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n",
+              i, dst_data[i], ref_data[i]);
+      exit(-1);
+    }
+  }
+
+  free(src_data);
+  free(dst_data);
+  free(ref_data);
+}
+
+static void destroy_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p)
+{
+  free_tg_gmem(ctx, p->src);
+  free_tl(bmk, p->dst);
+}
+
+static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c)
+{
+  for (int dst_align = 0; dst_align < 2; dst_align++) {
+    param_t p;
+    memset(&p, 0, sizeof(p));
+
+    p.src = alloc_tg_gmem(ctx, c->src_shape, FMT_I8);
+    p.dst = alloc_tl(bmk, c->dst_shape, FMT_I8, dst_align);
+    test_param_g2l(ctx, bmk, &p);
+    destroy_param_g2l(ctx, bmk, &p);
+  }
+}
+
+int main()
+{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tdma_tg2l_tensor_copy_chw_rotated.cpp b/cviruntime/test/1822/test_1822_tdma_tg2l_tensor_copy_chw_rotated.cpp new file mode 100644 index 000000000..a13253879 --- /dev/null +++ b/cviruntime/test/1822/test_1822_tdma_tg2l_tensor_copy_chw_rotated.cpp @@ -0,0 +1,179 @@ +#include "1822_test_util.h" + +typedef bmk1822_tdma_tg2l_tensor_copy_chw_rotated_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.h, p->src->shape.w, p->src->shape.c, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tg_shape_t src_shape; + tl_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 3, 1, 1 }, // nchw for neuron + { 1, 3, 1, 1 }, // nchw for neuron + }, { + { 1, 4, 1, 1 }, + { 1, 4, 1, 1 }, + }, { + { 1, 3, 1, 7 }, + { 1, 3, 1, 7 }, + }, { + { 1, 4, 1, 7 }, + { 1, 4, 1, 7 }, + }, { + { 1, 3, 1, 17 }, + { 1, 3, 1, 17 }, + }, { + { 1, 4, 1, 17 }, + { 1, 4, 1, 17 }, + }, { + { 1, 3, 2, 1 }, + { 1, 3, 2, 1 }, + }, { + { 1, 4, 2, 1 }, + { 1, 4, 2, 1 }, + }, { + { 2, 3, 17, 1 }, + { 2, 3, 17, 1 }, + }, { + { 2, 4, 17, 1 }, + { 2, 4, 17, 1 }, + }, { + { 2, 3, 17, 3 }, + { 2, 3, 17, 3 }, + }, { + { 2, 4, 17, 3 }, + { 2, 4, 17, 3 }, + }, { + { 3, 3, 16, 7 }, + { 3, 3, 16, 7 }, + }, { + { 3, 4, 16, 7 }, + { 3, 4, 16, 7 }, + }, { + { 3, 3, 39, 17 }, + { 3, 3, 39, 17 }, + }, { + { 3, 4, 39, 17 }, + { 3, 4, 39, 17 }, + }, { + { 3, 3, 36, 16 }, + { 3, 3, 36, 16 }, + }, { + { 3, 4, 36, 16 }, + { 3, 4, 36, 16 }, + }, { + { 5, 3, 39, 17 }, + { 5, 3, 39, 17 }, + }, { + { 5, 4, 39, 17 }, + { 5, 4, 39, 17 }, + }, { + { 20, 3, 35, 2 }, + { 20, 3, 35, 2 }, + }, { + { 20, 4, 35, 2 }, + { 20, 4, 35, 2 }, + }, { + { 20, 3, 35, 3 }, + { 20, 3, 35, 3 }, + }, { + { 20, 4, 35, 3 }, + { 20, 4, 35, 3 }, + } +}; + +static void tg2l_tensor_copy_chw_rotated_ref( + param_t *p, u8 ref_data[], u8 src_data[]) +{ + tg_shape_t s = p->src->shape; + // change nhwc -> nchw by HW design automatically + u32 n = s.n; + u32 c = s.h; + u32 h = s.w; + u32 w = s.c; + for (u32 ni = 0; ni < n; ni++) { + for (u32 ci = 0; ci < c; ci++) { + for (u32 hi = 0; hi < h; hi++) { + for (u32 wi = 0; wi < w; wi++) { + u64 src_i = ni * c * h * w + ci * h * w + hi * w + wi; + u64 dst_i = ni * w * c * h + wi * c * h + ci * h + hi; + ref_data[dst_i] = src_data[src_i]; + } + } + } + } +} + +static void test_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tg_shape_size(&p->src->shape); + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + + put_tg_gmem(ctx, p->src, src_data); + bmk1822_tdma_g2l_tensor_copy_chw_rotated(bmk, p); + test_submit(ctx); + u8 *dst_data = get_tensor_l2g(ctx, bmk, p->dst); + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + tg2l_tensor_copy_chw_rotated_ref(p, ref_data, src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); + 
free(ref_data); +} + +static void destroy_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_tg_gmem(ctx, p->src); + free_tl(bmk, p->dst); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_tg_gmem(ctx, c->src_shape, FMT_I8); + p.dst = alloc_tl(bmk, c->dst_shape, FMT_I8, 1); + test_param_g2l(ctx, bmk, &p); + destroy_param_g2l(ctx, bmk, &p); + +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tdma_tg2l_tensor_copy_nc_transposed.cpp b/cviruntime/test/1822/test_1822_tdma_tg2l_tensor_copy_nc_transposed.cpp new file mode 100644 index 000000000..d9546a76a --- /dev/null +++ b/cviruntime/test/1822/test_1822_tdma_tg2l_tensor_copy_nc_transposed.cpp @@ -0,0 +1,227 @@ +#include "1822_test_util.h" + +typedef bmk1822_tdma_tg2l_tensor_copy_nc_transposed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tg_shape_t src_shape; + tl_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 1, 2 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 2, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 7, 2 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 2, 7 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 17, 13 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 13, 17 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 10, 60 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 2, 300 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 3, 200 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 4, 150 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 5, 120 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 60, 10 }, + }, { + { 1, 1, 120, 5 }, + { 1, 1, 120, 5 }, + }, { + { 1, 2, 1, 1 }, + { 2, 1, 1, 1 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 1, 4 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 2, 2 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 4, 1 }, + }, { + { 17, 2, 2, 2 }, + { 2, 17, 2, 2 }, + }, { + { 17, 2, 4, 1 }, + { 2, 17, 4, 1 }, + }, { + { 3, 16, 1, 1 }, + { 16, 3, 1, 1 }, + }, { + { 3, 39, 23, 17 }, + { 39, 3, 23, 17 }, + }, { + { 3, 39, 17, 23 }, + { 39, 3, 17, 23 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 16, 20 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 2, 160 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 4, 80 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 8, 40 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 20, 16 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 32, 10 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 64, 5 }, + }, { + { 5, 39, 17, 23 }, + { 39, 5, 17, 23 }, + }, { + { 20, 35, 2, 2 }, + { 35, 20, 2, 2 }, + }, { + { 35, 20, 2, 2 }, + { 20, 35, 2, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 160, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 2, 160 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 4, 80 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 8, 40 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 10, 32 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 20, 16 }, + }, { + { 39, 5, 23, 17 }, + { 5, 39, 23, 17 }, + } +}; + +static void tg2l_tensor_copy_nc_transposed_ref( + param_t *p, u8 ref_data[], u8 src_data[]) +{ + tg_shape_t s = 
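+      /* Editorial note: the NC transpose swaps the two outer dimensions,
+       * so src element (ni, ci, h, w) lands at dst (ci, ni, h, w); the
+       * index arithmetic below is dst_i = (ci * n + ni) * hw + hwi. */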
p->src->shape; + u32 n = s.n; + u32 c = s.c; + u32 hw = s.h * s.w; + + for (u32 ni = 0; ni < n; ni++) { + for (u32 ci = 0; ci < c; ci++) { + for (u32 hwi = 0; hwi < hw; hwi++) { + u32 src_i = ni * c * hw + ci * hw + hwi; + u32 dst_i = ci * n * hw + ni * hw + hwi; + ref_data[dst_i] = src_data[src_i]; + } + } + } +} + +static void test_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->dst->shape); + + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + + put_tg_gmem(ctx, p->src, src_data); + bmk1822_tdma_g2l_tensor_copy_nc_transposed(bmk, p); + test_submit(ctx); + u8 *dst_data = get_tensor_l2g(ctx, bmk, p->dst); + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + tg2l_tensor_copy_nc_transposed_ref(p, ref_data, src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); + free(ref_data); +} + + +static void destroy_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_tl(bmk, p->dst); + free_tg_gmem(ctx, p->src); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_tg_gmem(ctx, c->src_shape, FMT_I8); + p.dst = alloc_tl(bmk, c->dst_shape, FMT_I8, dst_align); + test_param_g2l(ctx, bmk, &p); + destroy_param_g2l(ctx, bmk, &p); + + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tdma_tg2l_tensor_fill_constant.cpp b/cviruntime/test/1822/test_1822_tdma_tg2l_tensor_fill_constant.cpp new file mode 100644 index 000000000..3484c6e77 --- /dev/null +++ b/cviruntime/test/1822/test_1822_tdma_tg2l_tensor_fill_constant.cpp @@ -0,0 +1,136 @@ +#include "1822_test_util.h" + +typedef bmk1822_tdma_tg2l_tensor_fill_constant_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: %u => (%u, %u, %u, %u)\n", + tag, p->constant, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + u8 constant; + tl_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + 37, { 1, 1, 1, 1 } + }, { + 39, { 1, 1, 1, 2 } + }, { + 23, { 1, 1, 2, 1 } + }, { + 19, { 1, 1, 7, 2 } + }, { + 17, { 1, 1, 2, 7 } + }, { + 13, { 1, 1, 17, 13 } + }, { + 11, { 1, 1, 13, 17 } + }, { + 7, { 1, 1, 10, 60 } + }, { + 9, { 1, 1, 120, 5 } + }, { + 2, { 1, 2, 1, 1 } + }, { + 3, { 1, 1, 1, 2 } + }, { + 5, { 2, 17, 1, 4 } + }, { + 41, { 2, 1, 4, 17 } + }, { + 5, { 2, 17, 1, 4 } + }, { + 9, { 2, 1, 17, 4 } + }, { + 17, { 3, 16, 1, 1 } + }, { + 26, { 3, 1, 2, 8 } + }, { + 103, { 3, 39, 17, 23 } + }, { + 255, { 3, 17, 39, 23 } + }, { + 254, { 3, 36, 16, 20 } + }, { + 127, { 3, 18, 1, 640 } + }, { + 128, { 5, 39, 17, 23 } + }, { + 129, { 5, 17, 39, 23 } + }, { + 55, { 20, 35, 2, 2 } + }, { + 1, { 20, 7, 10, 2 } + } +}; + +static void tg2l_tensor_fill_constant_ref(param_t *p, u8 ref_data[]) +{ + u64 size = tl_shape_size(&p->dst->shape); + + for (u64 i = 0; i < size; i++) + ref_data[i] = p->constant; +} + +static void 
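+/* Editorial note: the fill-constant path needs no host-side source
+ * upload -- the TDMA broadcasts a single byte into local memory, so the
+ * reference model above is effectively a memset of the destination. */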
test_param_tg2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->dst->shape); + + bmk1822_tdma_tg2l_tensor_fill_constant(bmk, p); + test_submit(ctx); + u8 *dst_data = get_tensor_l2g(ctx, bmk, p->dst); + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + tg2l_tensor_fill_constant_ref(p, ref_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(dst_data); + free(ref_data); +} + +static void destroy_param_tg2l(bmk_ctx_t *bmk, param_t *p) +{ + free_tl(bmk, p->dst); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + p.constant = c->constant; + p.dst = alloc_tl(bmk, c->dst_shape, FMT_I8, dst_align); + + test_param_tg2l(ctx, bmk, &p); + destroy_param_tg2l(bmk, &p); + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tdma_tg2l_tensor_vlc_copy_decompressed.cpp b/cviruntime/test/1822/test_1822_tdma_tg2l_tensor_vlc_copy_decompressed.cpp new file mode 100644 index 000000000..e5be09d93 --- /dev/null +++ b/cviruntime/test/1822/test_1822_tdma_tg2l_tensor_vlc_copy_decompressed.cpp @@ -0,0 +1,159 @@ +#include "1822_test_util.h" +#include "bm_vlc_compress.h" + +typedef bmk1822_tdma_tg2l_tensor_copy_decompressed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => fmt(%d) bias0/1/zero is (%u/%u/%u) %s\n", + tag, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w, + p->dst->fmt, + p->src->bias0, p->src->bias1, p->src->zero_guard_en, + (p->dst->fmt == FMT_I8)? "signed": "unsigned"); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tl_shape_t lmem_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 17, 13 } + }, + { + { 3, 39, 17, 23 } + }, + { + { 5, 39, 17, 23 } + }, + { + { 20, 35, 2, 2 } + }, +#ifndef ENABEL_SIMPLE_BMK1822_VLC_TEST + { + { 1, 1, 1, 1 } + }, + { + { 1, 1, 1, 2 } + }, + { + { 1, 1, 7, 2 } + }, + { + { 1, 1, 10, 60 } + }, + { + { 1, 2, 1, 1 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 3, 16, 1, 1 } + }, + { + { 3, 36, 16, 20 } + }, +#endif /* ifndef ENABEL_SIMPLE_BMK1822_VLC_TEST*/ +}; + +static void tg2l_tensor_copy_vlc_decompressed_ref( + u8 ref_data[], u64 ref_size, u8 src_data[]) +{ + bm_vlc_dec_int8(src_data, ref_size, ref_data); +} + +static void test_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p, u8 *src_data, CommandInfo* cmd_info) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->dst->shape); + size_t bs_size = 0; + int is_signed = (p->dst->fmt == FMT_I8); + u8 data_type = (p->dst->fmt == FMT_BF16) ? 
1 : 0; + + u8 *bsbuf = vlc_compress(src_data, size, is_signed, data_type, &bs_size, cmd_info, NULL); + + put_compressed_tg_gmem(ctx, p->src, bsbuf, bs_size); + bmk1822_tdma_g2l_tensor_copy_decompressed(bmk, p); + test_submit(ctx); + + u8 *dst_data = get_tensor_l2g(ctx, bmk, p->dst); + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + tg2l_tensor_copy_vlc_decompressed_ref(ref_data, size, bsbuf); + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "vlc decompress comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + free(bsbuf); + free(dst_data); + free(ref_data); +} + +static void destroy_param_g2l(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_compressed_tg_gmem(ctx, p->src); + free_tl(bmk, p->dst); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + fmt_t fmts[] = { FMT_I8, FMT_U8 }; + + for (int dst_align = 0; dst_align < 2; dst_align++) { + for (int mode = 0; mode < VLC_CMP_MODE_MAX; mode++) { + for (u8 fmt_i = 0; fmt_i < 2; fmt_i++) { + fmt_t fmt = fmts[fmt_i]; + param_t p; + memset(&p, 0, sizeof(p)); + p.dst = alloc_tl(bmk, c->lmem_shape, fmt, dst_align); + assert(p.dst); + + u64 size = tl_shape_size(&p.dst->shape); + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + vlc_init_testdata(src_data, size, fmt == FMT_I8, fmt == FMT_BF16); + + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + int is_signed = (fmt == FMT_I8); + u8 data_type = (fmt == FMT_BF16) ? 1 : 0; + + cmd_info.signedness = is_signed; + + if (mode == VLC_CMP_MODE_COMPILER) { + bm_vlc_est_weight_bias(src_data, size, (bool)is_signed, (bool)data_type, &cmd_info); + } + + p.src = _alloc_vlc_compressed_tg_gmem(ctx, &c->lmem_shape, fmt, &cmd_info); + + test_param_g2l(ctx, bmk, &p, src_data, &cmd_info); + + free(src_data); + destroy_param_g2l(ctx, bmk, &p); + } + } + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tensor_add.cpp b/cviruntime/test/1822/test_1822_tensor_add.cpp new file mode 100644 index 000000000..f37c38b4e --- /dev/null +++ b/cviruntime/test/1822/test_1822_tensor_add.cpp @@ -0,0 +1,147 @@ +#include "1822_test_util.h" + +static void tl_add_ref( + u8 *ref_high, u8 *ref_low, + u8 *a_high, u8 *a_low, + u8 *b_high, u8 *b_low, + int rshift_bits, + u64 size, int relu_enable) +{ + for (u64 i = 0; i < size; i++) { + s32 ta = ((s8)a_high[i] << 8) + a_low[i]; + s32 tb = ((s8)b_high[i] << 8) + b_low[i]; + s32 res = ta + tb; + + res += 1 << (rshift_bits - 1); + res >>= rshift_bits; + + if(relu_enable) + { + if (res > 127) + res = 127; + else if (res < -128) + res = -128; + + if(relu_enable) + if(res<0) + res=0; + ref_high[i] = 0; + ref_low[i] = res & 0xff; + + }else{ + if (res > 32767) + res = 32767; + else if (res < -32768) + res = -32768; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } + } +} + +static void test_tl_add(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + int rshift_bits; + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + u64 size = n * c * h * w; + + u8 *a_high_data = (u8 *)xmalloc(size); + u8 *a_low_data = (u8 *)xmalloc(size); + u8 *b_high_data = 
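+  /*
+   * Editorial note on tl_add_ref above: the statement pair
+   *   res += 1 << (rshift_bits - 1);  res >>= rshift_bits;
+   * is the usual rounding right shift ("add half, then arithmetic shift"),
+   * which rounds ties toward +infinity in two's complement. It assumes
+   * rshift_bits >= 1 -- this test only ever uses 1 or 7.
+   */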
(u8 *)xmalloc(size); + u8 *b_low_data = (u8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) { + a_high_data[i] = rand() % 64+ i ; + a_low_data[i] = i; + b_high_data[i] = (i + 250) / 20; + b_low_data[i] = 100 - i; + } + if(relu_enable) + rshift_bits = 7; + else + rshift_bits = 1; + + u8 *ref_high_data = (u8 *)xmalloc(size); + u8 *ref_low_data = (u8 *)xmalloc(size); + tl_add_ref(ref_high_data, ref_low_data, + a_high_data, a_low_data, + b_high_data, b_low_data, + rshift_bits, + size, relu_enable); + + tl_t *tl_a_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_a_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a_low, a_low_data); + put_tensor_g2l(ctx, bk_ctx, tl_a_high, a_high_data); + put_tensor_g2l(ctx, bk_ctx, tl_b_low, b_low_data); + put_tensor_g2l(ctx, bk_ctx, tl_b_high, b_high_data); + bmk1822_tiu_element_wise_add_param_t p4; + p4.res_high = relu_enable ? 0 : tl_res_high; + p4.res_low = tl_res_low; + p4.a_high = tl_a_high; + p4.a_low = tl_a_low; + p4.b_is_const = 0; + p4.b_high = tl_b_high; + p4.b_low = tl_b_low; + p4.rshift_bits = rshift_bits; + p4.relu_enable = relu_enable; + bmk1822_tiu_element_wise_add(bk_ctx, &p4); + u8 *res_high_data = get_tensor_l2g(ctx, bk_ctx, tl_res_high); + u8 *res_low_data = get_tensor_l2g(ctx, bk_ctx, tl_res_low); + for (u64 i = 0; i < size; i++) { + if(!relu_enable) + if (res_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at res_high_data[%" PRIu64 "], got %d, exp %d\n", + i, res_high_data[i], ref_high_data[i]); + exit(-1); + } + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res_high); + free_tl(bk_ctx, tl_res_low); + free_tl(bk_ctx, tl_b_high); + free_tl(bk_ctx, tl_b_low); + free_tl(bk_ctx, tl_a_high); + free_tl(bk_ctx, tl_a_low); + + free(a_high_data); + free(a_low_data); + free(b_high_data); + free(b_low_data); + free(ref_high_data); + free(ref_low_data); + free(res_high_data); + free(res_low_data); + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_add(&ctx, bk_ctx, 0); + test_tl_add(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tensor_add_const.cpp b/cviruntime/test/1822/test_1822_tensor_add_const.cpp new file mode 100644 index 000000000..32d7f65bd --- /dev/null +++ b/cviruntime/test/1822/test_1822_tensor_add_const.cpp @@ -0,0 +1,142 @@ +#include "1822_test_util.h" + +static void tl_add_const_ref( + u8 *ref_high, u8 *ref_low, + u8 *a_high, u8 *a_low, + s16 b, int b_is_signed, + int rshift_bits, + u64 size, int relu_enable) +{ + for (u64 i = 0; i < size; i++) { + s32 ta = ((s8)a_high[i] << 8) + a_low[i]; + s32 tb = b_is_signed? 
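+    /* When b is flagged unsigned, the 16-bit immediate is reinterpreted
+     * via (u16)b, i.e. zero-extended; this test always passes a signed b,
+     * so only the first arm of the conditional is exercised. */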
b: (u16)b; + s32 res = ta + tb; + res += 1 << (rshift_bits - 1); + res >>= rshift_bits; + + if(relu_enable) + { + if (res > 127) + res = 127; + else if (res < -128) + res = -128; + + if(relu_enable) + if(res<0) + res=0; + ref_high[i] = 0; + ref_low[i] = res & 0xff; + }else{ + if (res > 32767) + res = 32767; + else if (res < -32768) + res = -32768; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } + } +} + +static void test_tl_add_const(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + int rshift_bits; + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + u64 size = n * c * h * w; + + u8 *a_high_data = (u8 *)xmalloc(size); + u8 *a_low_data = (u8 *)xmalloc(size); + s16 b; + int b_is_signed = 1; + for (u64 i = 0; i < size; i++) { + a_high_data[i] = rand() % 64+ i; + a_low_data[i] = i; + } + + if(relu_enable) + { + b=-64; + rshift_bits = 7; + } + else + { + b=-278; + rshift_bits = 1; + } + u8 *ref_high_data = (u8 *)xmalloc(size); + u8 *ref_low_data = (u8 *)xmalloc(size); + tl_add_const_ref(ref_high_data, ref_low_data, + a_high_data, a_low_data, + b, b_is_signed, rshift_bits, size,relu_enable); + + tl_t *tl_a_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_a_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a_low, a_low_data); + put_tensor_g2l(ctx, bk_ctx, tl_a_high, a_high_data); + + bmk1822_tiu_element_wise_add_param_t p4; + p4.res_high = relu_enable ? 0 : tl_res_high; + p4.res_low = tl_res_low; + p4.a_high = tl_a_high; + p4.a_low = tl_a_low; + p4.b_is_const = 1; + p4.b_const.val = b; + p4.b_const.is_signed = b_is_signed; + p4.rshift_bits = rshift_bits; + p4.relu_enable = relu_enable; + bmk1822_tiu_element_wise_add(bk_ctx, &p4); + + u8 *res_high_data = get_tensor_l2g(ctx, bk_ctx, tl_res_high); + u8 *res_low_data = get_tensor_l2g(ctx, bk_ctx, tl_res_low); + for (u64 i = 0; i < size; i++) { + if(!relu_enable) + if (res_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at res_high_data[%" PRIu64 "], got %d, exp %d\n", + i, res_high_data[i], ref_high_data[i]); + exit(-1); + } + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res_high); + free_tl(bk_ctx, tl_res_low); + free_tl(bk_ctx, tl_a_high); + free_tl(bk_ctx, tl_a_low); + + free(a_high_data); + free(a_low_data); + free(ref_high_data); + free(ref_low_data); + free(res_high_data); + free(res_low_data); + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_add_const(&ctx, bk_ctx, 0); + test_tl_add_const(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tensor_and.cpp b/cviruntime/test/1822/test_1822_tensor_and.cpp new file mode 100644 index 000000000..87d64ed5b --- /dev/null +++ b/cviruntime/test/1822/test_1822_tensor_and.cpp @@ -0,0 +1,180 @@ +#include "1822_test_util.h" + +static void tl_and_int8_ref(s8 *a, s8 *b, s8 *res, u64 size) +{ + for (u64 i = 0; i < size; i++) + res[i] = a[i] & b[i]; +} + +static void tl_and_int16_ref( + u8 *ref_high, u8 *ref_low, + u8 *a_high, u8 *a_low, + u8 *b_high, u8 *b_low, + u64 
size) +{ + for (u64 i = 0; i < size; i++) { + s32 ta = ((s8)a_high[i] << 8) + a_low[i]; + s32 tb = ((s8)b_high[i] << 8) + b_low[i]; + s32 res = ta & tb; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } +} + +static void test_tl_and_int8(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + + s8 *a_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) + a_data[i] = (s8)(i % 256); + + s8 *b_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) + b_data[i] = (s8)(100 - i % 256); + + s8 *ref_data = (s8 *)xmalloc(size); + tl_and_int8_ref(a_data, b_data, ref_data, size); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a, (u8 *)a_data); + put_tensor_g2l(ctx, bk_ctx, tl_b, (u8 *)b_data); + bmk1822_tiu_element_wise_and_int8_param_t p9; + p9.res = tl_res; + p9.a = tl_a; + p9.b = tl_b; + bmk1822_tiu_element_wise_and_int8(bk_ctx, &p9); + u8 *res_data = get_tensor_l2g(ctx, bk_ctx, tl_res); + + for (u64 i = 0; i < size; i++) { + if ((s8)res_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, res_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res); + free_tl(bk_ctx, tl_b); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(b_data); + free(ref_data); + free(res_data); +} + +static void test_tl_and_int16(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + + u8 *a_high_data = (u8 *)xmalloc(size); + u8 *a_low_data = (u8 *)xmalloc(size); + u8 *b_high_data = (u8 *)xmalloc(size); + u8 *b_low_data = (u8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) { + a_high_data[i] = i / 10; + a_low_data[i] = i; + b_high_data[i] = (i + 250) / 20; + b_low_data[i] = 100 - i; + } + + u8 *ref_high_data = (u8 *)xmalloc(size); + u8 *ref_low_data = (u8 *)xmalloc(size); + tl_and_int16_ref( + ref_high_data, ref_low_data, + a_high_data, a_low_data, + b_high_data, b_low_data, + size); + + tl_t *tl_a_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_a_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a_low, a_low_data); + put_tensor_g2l(ctx, bk_ctx, tl_a_high, a_high_data); + put_tensor_g2l(ctx, bk_ctx, tl_b_low, b_low_data); + put_tensor_g2l(ctx, bk_ctx, tl_b_high, b_high_data); + bmk1822_tiu_element_wise_and_int16_param_t p8; + p8.res_high = tl_res_high; + p8.res_low = tl_res_low; + p8.a_high = tl_a_high; + p8.a_low = tl_a_low; + p8.b_high = tl_b_high; + p8.b_low = tl_b_low; + bmk1822_tiu_element_wise_and_int16(bk_ctx, &p8); + u8 *res_high_data = get_tensor_l2g(ctx, bk_ctx, tl_res_high); + u8 *res_low_data = get_tensor_l2g(ctx, bk_ctx, tl_res_low); + + for (u64 i = 0; i < size; i++) { + if (res_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at res_high_data[%" 
PRIu64 "], got %d, exp %d\n", + i, res_high_data[i], ref_high_data[i]); + exit(-1); + } + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res_high); + free_tl(bk_ctx, tl_res_low); + free_tl(bk_ctx, tl_b_high); + free_tl(bk_ctx, tl_b_low); + free_tl(bk_ctx, tl_a_high); + free_tl(bk_ctx, tl_a_low); + + free(a_high_data); + free(a_low_data); + free(b_high_data); + free(b_low_data); + free(ref_high_data); + free(ref_low_data); + free(res_high_data); + free(res_low_data); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_and_int8(&ctx, bk_ctx, 0); + test_tl_and_int8(&ctx, bk_ctx, 1); + test_tl_and_int16(&ctx, bk_ctx, 0); + test_tl_and_int16(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tensor_arith_shift.cpp b/cviruntime/test/1822/test_1822_tensor_arith_shift.cpp new file mode 100644 index 000000000..ba025de22 --- /dev/null +++ b/cviruntime/test/1822/test_1822_tensor_arith_shift.cpp @@ -0,0 +1,117 @@ +#include "1822_test_util.h" + +static void tl_arith_shift_ref( + u8 *ref_high, u8 *ref_low, + u8 *a_high, u8 *a_low, + u8 *bits, u64 size) +{ + for (u64 i = 0; i < size; i++) { + s32 ta = ((s8)a_high[i] << 8) + a_low[i]; + s32 tbits = (s8)bits[i]; + + /* + * Yes, a @tbits bigger than zero means shifting LEFT, + * no matter whether the shift type is arithmetic + * RIGHT shift or logic RIGHT shift. + */ + s32 res; + if (tbits >= 0) + res = ta << tbits; + else + res = ta >> -tbits; + + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } +} + +static void test_tl_arith_shift(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + + u8 *a_high_data = (u8 *)xmalloc(size); + u8 *a_low_data = (u8 *)xmalloc(size); + u8 *bits_data = (u8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) { + a_high_data[i] = 240 + i; + a_low_data[i] = 200 + i; + bits_data[i] = (i % 33) - 16; + } + + u8 *ref_high_data = (u8 *)xmalloc(size); + u8 *ref_low_data = (u8 *)xmalloc(size); + tl_arith_shift_ref( + ref_high_data, ref_low_data, + a_high_data, a_low_data, + bits_data, size); + + tl_t *tl_a_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_a_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_bits = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a_low, a_low_data); + put_tensor_g2l(ctx, bk_ctx, tl_a_high, a_high_data); + put_tensor_g2l(ctx, bk_ctx, tl_bits, bits_data); + bmk1822_tiu_element_wise_arith_shift_param_t p8; + p8.res_high = tl_res_high; + p8.res_low = tl_res_low; + p8.a_high = tl_a_high; + p8.a_low = tl_a_low; + p8.bits = tl_bits; + bmk1822_tiu_element_wise_arith_shift(bk_ctx, &p8); + u8 *res_high_data = get_tensor_l2g(ctx, bk_ctx, tl_res_high); + u8 *res_low_data = get_tensor_l2g(ctx, bk_ctx, tl_res_low); + + for (u64 i = 0; i < size; i++) { + if (res_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at res_high_data[%" PRIu64 "], got %d, exp %d\n", + i, res_high_data[i], ref_high_data[i]); + exit(-1); + } + if (res_low_data[i] != ref_low_data[i]) { + 
fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res_high); + free_tl(bk_ctx, tl_res_low); + free_tl(bk_ctx, tl_bits); + free_tl(bk_ctx, tl_a_high); + free_tl(bk_ctx, tl_a_low); + + free(a_high_data); + free(a_low_data); + free(bits_data); + free(ref_high_data); + free(ref_low_data); + free(res_high_data); + free(res_low_data); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_arith_shift(&ctx, bk_ctx, 0); + test_tl_arith_shift(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tensor_copy.cpp b/cviruntime/test/1822/test_1822_tensor_copy.cpp new file mode 100644 index 000000000..892b95f45 --- /dev/null +++ b/cviruntime/test/1822/test_1822_tensor_copy.cpp @@ -0,0 +1,67 @@ +#include "1822_test_util.h" + +static void tl_copy_ref(s8 *a, s8 *res, u64 size) +{ + for (u64 i = 0; i < size; i++) + res[i] = a[i]; +} + +static void test_tl_copy(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + s8 *a_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) + a_data[i] = (s8)(i % 256); + + s8 *ref_data = (s8 *)xmalloc(size); + tl_copy_ref(a_data, ref_data, size); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a, (u8 *)a_data); + bmk1822_tiu_element_wise_copy_param_t p10; + p10.dst = tl_res; + p10.src = tl_a; + bmk1822_tiu_element_wise_copy(bk_ctx, &p10); + u8 *res_data = get_tensor_l2g(ctx, bk_ctx, tl_res); + + for (u64 i = 0; i < size; i++) { + if ((s8)res_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, res_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(ref_data); + free(res_data); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_copy(&ctx, bk_ctx, 0); + test_tl_copy(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tensor_copy_with_stride.cpp b/cviruntime/test/1822/test_1822_tensor_copy_with_stride.cpp new file mode 100644 index 000000000..2cadd9b3a --- /dev/null +++ b/cviruntime/test/1822/test_1822_tensor_copy_with_stride.cpp @@ -0,0 +1,162 @@ +#include "1822_test_util.h" + +static int npu_num = BM1822_HW_NPU_NUM; + +static u64 shape_size(tl_shape_t s) +{ + return s.n * s.c * s.h * s.w; +} + +static tl_shape_t shape_of_stride( + tl_shape_t tl_shape, + bmk1822_tensor_lmem_stride_t tl_stride) +{ + tl_shape_t shape; + shape.n = tl_shape.n; + shape.c = npu_num; + shape.h = tl_stride.n; + shape.w = 1; + + return shape; +} + +static void tl_copy_with_stride_ref( + s8 *src, + s8 *dst, + tl_shape_t shape, + bmk1822_tensor_lmem_stride_t src_stride, + bmk1822_tensor_lmem_stride_t dst_stride) +{ + int n = shape.n; + int c = shape.c; + int h = shape.h; + int w = shape.w; + + tl_shape_t dst_stride_shape = shape_of_stride(shape, dst_stride); + + u64 dst_size = + dst_stride_shape.n * + dst_stride_shape.c * + dst_stride_shape.h * + dst_stride_shape.w; + + for (u64 i = 0; i < dst_size; i++) + dst[i] = 0; + + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < c; ci++) { + for 
(int hi = 0; hi < h; hi++) { + for (int wi = 0; wi < w; wi++) { + int src_i = (ni * npu_num + ci % npu_num) * src_stride.n + + ci / npu_num * src_stride.c + + hi * src_stride.h + + wi; + int dst_i = (ni * npu_num + ci % npu_num) * dst_stride.n + + ci / npu_num * dst_stride.c + + hi * dst_stride.h + + wi; + dst[dst_i] = src[src_i]; + } + } + } + } +} + +static void test_tl_copy_with_stride( + bmctx_t *ctx, + bmk_ctx_t *bk_ctx) +{ + int n = 3; + int c = BM1822_HW_NPU_NUM + 2; // larger than npu_num + int h = 2; + int w = 3; + int c_layers = ALIGN(c, npu_num) / npu_num; + + bmk1822_tensor_lmem_stride_t src_stride; + src_stride.w = 1; + src_stride.h = w + 3; + src_stride.c = h * src_stride.h + 13; + src_stride.n = c_layers * src_stride.c + 7; + + bmk1822_tensor_lmem_stride_t dst_stride; + dst_stride.w = 1; + dst_stride.h = w + 1; + dst_stride.c = h * dst_stride.h + 5; + dst_stride.n = c_layers * dst_stride.c + 19; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + tl_shape_t src_stride_shape = + shape_of_stride(tl_shape, src_stride); + + tl_shape_t dst_stride_shape = + shape_of_stride(tl_shape, dst_stride); + + u64 src_size = shape_size(src_stride_shape); + u64 dst_size = shape_size(dst_stride_shape); + + s8 *src_data = (s8 *)xmalloc(src_size); + for (u64 i = 0; i < src_size; i++) + src_data[i] = i; + + s8 *dst_init_data = (s8 *)xmalloc(dst_size); + for (u64 i = 0; i < dst_size; i++) + dst_init_data[i] = 0; + + tl_t *tl_src = alloc_tl( + bk_ctx, src_stride_shape, FMT_I8, /*eu_align*/0); + + tl_t *tl_dst = alloc_tl( + bk_ctx, dst_stride_shape, FMT_I8, /*eu_align*/0); + + put_tensor_g2l(ctx, bk_ctx, tl_src, (u8 *)src_data); + put_tensor_g2l(ctx, bk_ctx, tl_dst, (u8 *)dst_init_data); + + { + tl_t src = *tl_src; + tl_t dst = *tl_dst; + src.shape = dst.shape = tl_shape; + src.stride = src_stride; + dst.stride = dst_stride; + bmk1822_tiu_element_wise_copy_param_t p11; + p11.dst = &dst; + p11.src = &src; + bmk1822_tiu_element_wise_copy(bk_ctx, &p11); + } + + u8 *dst_data = get_tensor_l2g(ctx, bk_ctx, tl_dst); + + s8 *ref_data = (s8 *)xmalloc(dst_size); + tl_copy_with_stride_ref(src_data, ref_data, + tl_shape, src_stride, dst_stride); + + for (u64 i = 0; i < dst_size; i++) { + if ((s8)dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst_data[%" PRIu64 "], got %x, exp %x\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_dst); + free_tl(bk_ctx, tl_src); + + free(src_data); + free(dst_init_data); + free(dst_data); + free(ref_data); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + test_tl_copy_with_stride(&ctx, bk_ctx); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tensor_ge.cpp b/cviruntime/test/1822/test_1822_tensor_ge.cpp new file mode 100644 index 000000000..ce29640ce --- /dev/null +++ b/cviruntime/test/1822/test_1822_tensor_ge.cpp @@ -0,0 +1,90 @@ +#include "1822_test_util.h" + +static void tl_ge_ref(s8 *a, s8 *b, s8 *result, u64 size, fmt_t fmt) +{ + for (u64 i = 0; i < size; i++) { + s32 a32 = (fmt == FMT_I8) ? (s8)a[i] : (u8)a[i]; + s32 b32 = (fmt == FMT_I8) ? 
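+    /* Operands widen to s32 with sign- or zero-extension chosen by fmt;
+     * the GE result written to the output tensor is a 0/1 mask. */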
(s8)b[i] : (u8)b[i]; + if (a32 >= b32) + result[i] = 1; + else + result[i] = 0; + } +} + +static void test_tl_ge(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + + for (int i = 0; i < 2; i++) { + s8 *a_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) + a_data[i] = (s8)(i % 256); + + s8 *b_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) + b_data[i] = (s8)(100 - i % 256); + + s8 *ref_data = (s8 *)xmalloc(size); + + fmt_t fmt = (i == 0) ? FMT_I8 : FMT_U8; + tl_ge_ref(a_data, b_data, ref_data, size, fmt); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, fmt, eu_align); + tl_t *tl_b = alloc_tl(bk_ctx, tl_shape, fmt, eu_align); + tl_t *tl_ge = alloc_tl(bk_ctx, tl_shape, fmt, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a, (u8 *)a_data); + put_tensor_g2l(ctx, bk_ctx, tl_b, (u8 *)b_data); + + bmk1822_tiu_element_wise_ge_param_t p; + memset(&p, 0, sizeof(p)); + p.a = tl_a; + p.b_is_const = 0; + p.b = tl_b; + p.ge = tl_ge; + bmk1822_tiu_element_wise_ge(bk_ctx, &p); + u8 *ge_data = get_tensor_l2g(ctx, bk_ctx, tl_ge); + + for (u64 i = 0; i < size; i++) { + if ((s8)ge_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, ge_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_ge); + free_tl(bk_ctx, tl_b); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(b_data); + free(ref_data); + free(ge_data); + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_ge(&ctx, bk_ctx, 0); + test_tl_ge(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tensor_ge_const.cpp b/cviruntime/test/1822/test_1822_tensor_ge_const.cpp new file mode 100644 index 000000000..1d3753a1a --- /dev/null +++ b/cviruntime/test/1822/test_1822_tensor_ge_const.cpp @@ -0,0 +1,83 @@ +#include "1822_test_util.h" + +static void tl_ge_const_ref(s8 *a, s8 b, s8 *result, u64 size, fmt_t fmt) +{ + for (u64 i = 0; i < size; i++) { + s32 a32 = (fmt == FMT_I8) ? (s8)a[i] : (u8)a[i]; + s32 b32 = (fmt == FMT_I8) ? (s8)b : (u8)b; + if (a32 >= b32) + result[i] = 1; + else + result[i] = 0; + } +} + +static void test_tl_ge_const(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + + for (int i = 0; i < 2; i++) { + s8 *a_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) + a_data[i] = (s8)(i % 256); + + s8 b = 47; + s8 *ref_data = (s8 *)xmalloc(size); + + fmt_t fmt = (i == 1) ? 
FMT_I8 : FMT_U8; + tl_ge_const_ref(a_data, b, ref_data, size, fmt); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, fmt, eu_align); + tl_t *tl_ge = alloc_tl(bk_ctx, tl_shape, fmt, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a, (u8 *)a_data); + bmk1822_tiu_element_wise_ge_param_t p; + memset(&p, 0, sizeof(p)); + p.ge = tl_ge; + p.a = tl_a; + p.b_is_const = 1; + p.b_const.val = b; + p.b_const.is_signed = i; + bmk1822_tiu_element_wise_ge(bk_ctx, &p); + u8 *ge_data = get_tensor_l2g(ctx, bk_ctx, tl_ge); + + for (u64 i = 0; i < size; i++) { + if ((s8)ge_data[i] != (s8)ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, ge_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_ge); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(ref_data); + free(ge_data); + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_ge_const(&ctx, bk_ctx, 0); + test_tl_ge_const(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tensor_mac.cpp b/cviruntime/test/1822/test_1822_tensor_mac.cpp new file mode 100644 index 000000000..0b6223483 --- /dev/null +++ b/cviruntime/test/1822/test_1822_tensor_mac.cpp @@ -0,0 +1,149 @@ +#include "1822_test_util.h" + +static void tl_mac_ref( + u8 *ref_high, u8 *ref_low, + u8 *a, u8 *b, u8 *c_high, u8 *c_low, + int lshift_bits, int rshift_bits, u64 size, int relu_enable) +{ + for (u64 i = 0; i < size; i++) { + s32 ta = (s8)a[i]; + s32 tb = (s8)b[i]; + s32 tc = ((s8)c_high[i] << 8) + c_low[i]; + tc <<= lshift_bits; + s32 res = ta * tb + tc; + + res += 1 << (rshift_bits - 1); + res >>= rshift_bits; + if(relu_enable) + { + if (res > 127) + res = 127; + else if (res < -128) + res = -128; + + if(relu_enable) + if(res<0) + res=0; + ref_high[i] = 0; + ref_low[i] = res & 0xff; + + }else{ + if (res > 32767) + res = 32767; + else if (res < -32768) + res = -32768; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } + } +} + +static void test_tl_mac(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int lshift_bits; + int rshift_bits; + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + u64 size = n * c * h * w; + u8 *a_data = (u8 *)xmalloc(size); + u8 *b_data = (u8 *)xmalloc(size); + u8 *c_high_data = (u8 *)xmalloc(size); + u8 *c_low_data = (u8 *)xmalloc(size); + + for (u64 i = 0; i < size; i++) { + a_data[i] = rand() % 128; + b_data[i] = 100 - i; + c_high_data[i] = rand() % 64; + c_low_data[i] = 200 + 2 * i; + } + + if(relu_enable) { + lshift_bits= 1; + rshift_bits = 7; + }else { + lshift_bits = 1; + rshift_bits = 3; + } + + u8 *ref_high_data = (u8 *)xmalloc(size); + u8 *ref_low_data = (u8 *)xmalloc(size); + + tl_mac_ref(ref_high_data, ref_low_data, + a_data, b_data, c_high_data, c_low_data, + lshift_bits, rshift_bits, size, relu_enable); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_c_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_c_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a, a_data); + put_tensor_g2l(ctx, bk_ctx, tl_b, b_data); + put_tensor_g2l(ctx, bk_ctx, tl_c_low, c_low_data); + put_tensor_g2l(ctx, bk_ctx, tl_c_high, c_high_data); + bmk1822_tiu_element_wise_mac_param_t p2; + p2.res_high = tl_c_high; + p2.res_low = 
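+    /* Editorial note: MAC accumulates in place -- res_high/res_low point
+     * at the same local tensors that were preloaded with c_high/c_low,
+     * and res_is_int8 selects the 8-bit saturated output path whenever
+     * relu_enable is set. */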
tl_c_low; + p2.res_is_int8 = relu_enable; + p2.a = tl_a; + p2.b_is_const = 0; + p2.b = tl_b; + p2.lshift_bits = lshift_bits; + p2.rshift_bits = rshift_bits; + p2.relu_enable = relu_enable; + bmk1822_tiu_element_wise_mac(bk_ctx, &p2); + u8 *mac_high_data = get_tensor_l2g(ctx, bk_ctx, tl_c_high); + u8 *mac_low_data = get_tensor_l2g(ctx, bk_ctx, tl_c_low); + + for (u64 i = 0; i < size; i++) { + if(!relu_enable) + if (mac_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at mac_high_data[%" PRIu64 "], got %d, exp %d\n", + i, mac_high_data[i], ref_high_data[i]); + exit(-1); + } + if (mac_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at mac_low_data[%" PRIu64 "], got %d, exp %d\n", + i, mac_low_data[i], ref_low_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_c_high); + free_tl(bk_ctx, tl_c_low); + free_tl(bk_ctx, tl_b); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(b_data); + free(c_high_data); + free(c_low_data); + free(ref_high_data); + free(ref_low_data); + free(mac_high_data); + free(mac_low_data); + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_mac(&ctx, bk_ctx, 0); + test_tl_mac(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tensor_mac_const.cpp b/cviruntime/test/1822/test_1822_tensor_mac_const.cpp new file mode 100644 index 000000000..a52fcbdb6 --- /dev/null +++ b/cviruntime/test/1822/test_1822_tensor_mac_const.cpp @@ -0,0 +1,146 @@ +#include "1822_test_util.h" + +static void tl_mac_const_ref( + u8 *ref_high, u8 *ref_low, + u8 *a, u8 b_const, int b_is_signed, + u8 *c_high, u8 *c_low, + int lshift_bits, int rshift_bits, u64 size, int relu_enable) +{ + for (u64 i = 0; i < size; i++) { + s32 ta = (s8)a[i]; + s32 tb = b_is_signed? 
(s8)b_const: (u8)b_const; + s32 tc = ((s8)c_high[i] << 8) + c_low[i]; + tc <<= lshift_bits; + s32 res = ta * tb + tc; + + res += 1 << (rshift_bits - 1); + res >>= rshift_bits; + + if(relu_enable) + { + if (res > 127) + res = 127; + else if (res < -128) + res = -128; + + if(relu_enable) + if(res<0) + res=0; + ref_high[i] = 0; + ref_low[i] = res & 0xff; + + }else{ + if (res > 32767) + res = 32767; + else if (res < -32768) + res = -32768; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } + } +} + +static void test_tl_mac_const(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int lshift_bits; + int rshift_bits; + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + u64 size = n * c * h * w; + + u8 *a_data = (u8 *)xmalloc(size); + u8 *c_high_data = (u8 *)xmalloc(size); + u8 *c_low_data = (u8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) { + a_data[i] = rand() % 256; + c_high_data[i] = rand() % 64; + c_low_data[i] = 200 + 2 * i; + } + + u8 b_const = 37; + int b_is_signed = 1; + if(relu_enable) { + lshift_bits = 1; + rshift_bits = 8; + }else { + lshift_bits = 1; + rshift_bits = 3; + } + + u8 *ref_high_data = (u8 *)xmalloc(size); + u8 *ref_low_data = (u8 *)xmalloc(size); + tl_mac_const_ref(ref_high_data, ref_low_data, + a_data, b_const, b_is_signed, c_high_data, c_low_data, + lshift_bits, rshift_bits, size, relu_enable); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_c_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_c_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a, a_data); + put_tensor_g2l(ctx, bk_ctx, tl_c_low, c_low_data); + put_tensor_g2l(ctx, bk_ctx, tl_c_high, c_high_data); + bmk1822_tiu_element_wise_mac_param_t p3; + p3.res_high = tl_c_high; + p3.res_low = tl_c_low; + p3.res_is_int8 = relu_enable; + p3.a = tl_a; + p3.b_is_const = 1; + p3.b_const.val = b_const; + p3.b_const.is_signed = b_is_signed; + p3.lshift_bits = lshift_bits; + p3.rshift_bits = rshift_bits; + p3.relu_enable = relu_enable; + bmk1822_tiu_element_wise_mac(bk_ctx, &p3); + u8 *mac_high_data = get_tensor_l2g(ctx, bk_ctx, tl_c_high); + u8 *mac_low_data = get_tensor_l2g(ctx, bk_ctx, tl_c_low); + for (u64 i = 0; i < size; i++) { + if(!relu_enable) + if (mac_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at mac_high_data[%" PRIu64 "], got %d, exp %d\n", + i, mac_high_data[i], ref_high_data[i]); + exit(-1); + } + if (mac_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at mac_low_data[%" PRIu64 "], got %d, exp %d\n", + i, mac_low_data[i], ref_low_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_c_high); + free_tl(bk_ctx, tl_c_low); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(c_high_data); + free(c_low_data); + free(ref_high_data); + free(ref_low_data); + free(mac_high_data); + free(mac_low_data); + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_mac_const(&ctx, bk_ctx, 0); + test_tl_mac_const(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tensor_max.cpp b/cviruntime/test/1822/test_1822_tensor_max.cpp new file mode 100644 index 000000000..12143651d --- /dev/null +++ b/cviruntime/test/1822/test_1822_tensor_max.cpp @@ -0,0 +1,83 @@ +#include "1822_test_util.h" + +static void tl_max_ref(s8 *a, s8 *b, s8 *max, u64 
size) +{ + for (u64 i = 0; i < size; i++) { + if (a[i] > b[i]) + max[i] = a[i]; + else + max[i] = b[i]; + } +} + +static void test_tl_max(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + s8 *a_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) + a_data[i] = (s8)(i % 256); + + s8 *b_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) + b_data[i] = (s8)(100 - i % 256); + + s8 *ref_data = (s8 *)xmalloc(size); + tl_max_ref(a_data, b_data, ref_data, size); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_max = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a, (u8 *)a_data); + put_tensor_g2l(ctx, bk_ctx, tl_b, (u8 *)b_data); + + bmk1822_tiu_element_wise_max_param_t p; + memset(&p, 0, sizeof(p)); + p.max = tl_max; + p.a = tl_a; + p.b_is_const = 0; + p.b = tl_b; + bmk1822_tiu_element_wise_max(bk_ctx, &p); + u8 *max_data = get_tensor_l2g(ctx, bk_ctx, tl_max); + + for (u64 i = 0; i < size; i++) { + if ((s8)max_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, max_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_max); + free_tl(bk_ctx, tl_b); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(b_data); + free(ref_data); + free(max_data); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_max(&ctx, bk_ctx, 0); + test_tl_max(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tensor_max_const.cpp b/cviruntime/test/1822/test_1822_tensor_max_const.cpp new file mode 100644 index 000000000..ce5334be7 --- /dev/null +++ b/cviruntime/test/1822/test_1822_tensor_max_const.cpp @@ -0,0 +1,76 @@ +#include "1822_test_util.h" + +static void tl_max_const_ref(s8 *a, s8 b, s8 *max, u64 size) +{ + for (u64 i = 0; i < size; i++) { + if (a[i] > b) + max[i] = a[i]; + else + max[i] = b; + } +} + +static void test_tl_max_const(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + s8 *a_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) + a_data[i] = (s8)(i % 256); + + s8 b = 47; + s8 *ref_data = (s8 *)xmalloc(size); + tl_max_const_ref(a_data, b, ref_data, size); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_max = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a, (u8 *)a_data); + bmk1822_tiu_element_wise_max_param_t p; + memset(&p, 0, sizeof(p)); + p.max = tl_max; + p.a = tl_a; + p.b_is_const = 1; + p.b_const.val = b; + p.b_const.is_signed = 1; + bmk1822_tiu_element_wise_max(bk_ctx, &p); + u8 *max_data = get_tensor_l2g(ctx, bk_ctx, tl_max); + + for (u64 i = 0; i < size; i++) { + if ((s8)max_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, max_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_max); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(ref_data); + free(max_data); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_max_const(&ctx, bk_ctx, 0); + 
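+  /* Editorial note: as in the other element-wise tests, each case runs
+   * twice -- eu_align 0 and 1 -- presumably to cover both the compact and
+   * the EU-aligned local-memory layouts. */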
test_tl_max_const(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tensor_min.cpp b/cviruntime/test/1822/test_1822_tensor_min.cpp new file mode 100644 index 000000000..af0b99855 --- /dev/null +++ b/cviruntime/test/1822/test_1822_tensor_min.cpp @@ -0,0 +1,81 @@ +#include "1822_test_util.h" + +static void tl_min_ref(s8 *a, s8 *b, s8 *max, u64 size) +{ + for (u64 i = 0; i < size; i++) { + if (a[i] > b[i]) + max[i] = b[i]; + else + max[i] = a[i]; + } +} + +static void test_tl_min(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + s8 *a_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) + a_data[i] = (s8)(i % 256); + + s8 *b_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) + b_data[i] = (s8)(100 - i % 256); + + s8 *ref_data = (s8 *)xmalloc(size); + tl_min_ref(a_data, b_data, ref_data, size); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_min = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a, (u8 *)a_data); + put_tensor_g2l(ctx, bk_ctx, tl_b, (u8 *)b_data); + bmk1822_tiu_element_wise_min_param_t p6; + p6.min = tl_min; + p6.a = tl_a; + p6.b_is_const = 0; + p6.b = tl_b; + bmk1822_tiu_element_wise_min(bk_ctx, &p6); + u8 *min_data = get_tensor_l2g(ctx, bk_ctx, tl_min); + + for (u64 i = 0; i < size; i++) { + if ((s8)min_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, min_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_min); + free_tl(bk_ctx, tl_b); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(b_data); + free(ref_data); + free(min_data); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_min(&ctx, bk_ctx, 0); + test_tl_min(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tensor_min_const.cpp b/cviruntime/test/1822/test_1822_tensor_min_const.cpp new file mode 100644 index 000000000..276d55fff --- /dev/null +++ b/cviruntime/test/1822/test_1822_tensor_min_const.cpp @@ -0,0 +1,76 @@ +#include "1822_test_util.h" + +static void tl_min_const_ref(s8 *a, s8 b, s8 *max, u64 size) +{ + for (u64 i = 0; i < size; i++) { + if (a[i] > b) + max[i] = b; + else + max[i] = a[i]; + } +} + +static void test_tl_min_const(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + s8 *a_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) + a_data[i] = (s8)(i % 256); + + s8 b = 47; + + s8 *ref_data = (s8 *)xmalloc(size); + tl_min_const_ref(a_data, b, ref_data, size); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_min = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a, (u8 *)a_data); + bmk1822_tiu_element_wise_min_param_t p7; + p7.min = tl_min; + p7.a = tl_a; + p7.b_is_const = 1; + p7.b_const.val = b; + p7.b_const.is_signed = 1; + bmk1822_tiu_element_wise_min(bk_ctx, &p7); + u8 *min_data = get_tensor_l2g(ctx, bk_ctx, tl_min); + + for (u64 i = 0; i < size; i++) { + if ((s8)min_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at 
ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, min_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_min); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(ref_data); + free(min_data); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_min_const(&ctx, bk_ctx, 0); + test_tl_min_const(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tensor_mul.cpp b/cviruntime/test/1822/test_1822_tensor_mul.cpp new file mode 100644 index 000000000..80754f195 --- /dev/null +++ b/cviruntime/test/1822/test_1822_tensor_mul.cpp @@ -0,0 +1,98 @@ +#include "1822_test_util.h" + +static void tl_mul_ref(s8 *ofmap, s8 *a, s8 *b, u64 size, int shift_bits, int relu_enable) +{ + for (u64 i = 0; i < size; i++) { + s32 tmp = a[i] * b[i]; + tmp += 1 << (shift_bits - 1); + tmp >>= shift_bits; + if (tmp > 127) + tmp = 127; + else if (tmp < -128) + tmp = -128; + if(relu_enable) + if(tmp<0) + tmp=0; + ofmap[i] = tmp; + + } +} + +static void test_tl_mul(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + int shift_bits = 1; + + for (u32 relu_enable = 0; relu_enable < 2; relu_enable++) + { + s8 *a_data = (s8 *)xmalloc(size); + s8 *b_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) { + a_data[i] = random()%0x10; + b_data[i] = 128 - i; + } + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a, (u8 *)a_data); + put_tensor_g2l(ctx, bk_ctx, tl_b, (u8 *)b_data); + + bmk1822_tiu_element_wise_mul_param_t p1; + p1.res_high = NULL; + p1.res_low = tl_res_low; + p1.a = tl_a; + p1.b_is_const = 0; + p1.b = tl_b; + p1.rshift_bits = shift_bits; + p1.relu_enable = relu_enable; + bmk1822_tiu_element_wise_mul(bk_ctx, &p1); + + u8 *res_low_data = get_tensor_l2g(ctx, bk_ctx, tl_res_low); + + s8 *ref_data = (s8 *)xmalloc(size); + tl_mul_ref(ref_data, a_data, b_data, size, shift_bits, relu_enable); + + for (u64 i = 0; i < size; i++) { + if ((s8)res_low_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %x, exp %x\n", + i, res_low_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res_low); + free_tl(bk_ctx, tl_b); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(b_data); + free(ref_data); + free(res_low_data); + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_mul(&ctx, bk_ctx, 0); + test_tl_mul(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tensor_mul_const.cpp b/cviruntime/test/1822/test_1822_tensor_mul_const.cpp new file mode 100644 index 000000000..f6a3655cd --- /dev/null +++ b/cviruntime/test/1822/test_1822_tensor_mul_const.cpp @@ -0,0 +1,97 @@ +#include "1822_test_util.h" + +static void tl_mul_const_ref( + s8 *ofmap, s8 *ifmap, u64 size, s8 mul_const, int shift_bits, int relu_enable) +{ + for (u64 i = 0; i < size; i++) { + s32 tmp = ifmap[i] * mul_const; + tmp += 1 << (shift_bits - 1); + tmp >>= shift_bits; + if (tmp > 127) + tmp = 127; + else if (tmp < -128) + tmp = -128; + if(relu_enable) + if(tmp<0) + tmp=0; + + ofmap[i] = tmp; + } +} + +static void test_tl_mul_const(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ 
+ int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + + for (u32 relu_enable = 0; relu_enable < 2; relu_enable++) + { + s8 *ifmap_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) + ifmap_data[i] = (u8)(random() % 256); + + s8 mul_const = 20; + int shift_bits = 1; + + s8 *ref_data = (s8 *)xmalloc(size); + tl_mul_const_ref(ref_data, ifmap_data, size, mul_const, shift_bits, relu_enable); + + tl_t *tl_ifmap = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_ofmap = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_ifmap, (u8 *)ifmap_data); + + bmk1822_tiu_element_wise_mul_param_t p; + memset(&p, 0, sizeof(p)); + p.res_high = NULL; + p.res_low = tl_ofmap; + p.a = tl_ifmap; + p.b_is_const = 1; + p.b_const.val = mul_const; + p.b_const.is_signed = 1; + p.rshift_bits = shift_bits; + p.relu_enable = relu_enable; + + bmk1822_tiu_element_wise_mul(bk_ctx, &p); + + u8 *ofmap_data = get_tensor_l2g(ctx, bk_ctx, tl_ofmap); + + for (u64 i = 0; i < size; i++) { + if ((s8)ofmap_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, ofmap_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_ofmap); + free_tl(bk_ctx, tl_ifmap); + + free(ifmap_data); + free(ref_data); + free(ofmap_data); + } +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_mul_const(&ctx, bk_ctx, 0); + test_tl_mul_const(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tensor_mul_qdm.cpp b/cviruntime/test/1822/test_1822_tensor_mul_qdm.cpp new file mode 100644 index 000000000..a9f22f1a2 --- /dev/null +++ b/cviruntime/test/1822/test_1822_tensor_mul_qdm.cpp @@ -0,0 +1,583 @@ +#include +#include "1822_test_util.h" +#include "test_tf_quant_util.h" + +// #define ENABLE_DEBUG_MSG +// #define ENABLE_TV_GEN_PATTERN + +#define MIN_EXEC_TESTS 20 + +typedef struct { + int input_n; + int input_c; + int input_h; + int input_w; + int relu_enable; + s8 *input1_data; + s8 *input2_data; + s8 *output_data; + u32 multiplier; + s8 right_shift; + float float_multiplier; + int retry_cnt; +} elt_mul_test_param_t; + +void elt_mul_ref(elt_mul_test_param_t *p_param) +{ + int input_n = p_param->input_n; + int input_c = p_param->input_c; + int input_h = p_param->input_h; + int input_w = p_param->input_w; + s32 output_multiplier = p_param->multiplier; + s8 output_rshift = p_param->right_shift; + s8 *input1_data = p_param->input1_data; + s8 *input2_data = p_param->input2_data; + s8 *output_data = p_param->output_data; + + s32 quantized_activation_min = -128; + s32 quantized_activation_max = 127; + + int size = input_n * input_c * input_h * input_w; +#ifdef ENABLE_DEBUG_MSG + printf("elt_mul_ref:\n"); + printf(" shape (%d, %d, %d, %d)\n", input_n, input_c, input_h, input_w); +#endif + for (int i = 0; i < size; ++i) { + const s32 input1_val = input1_data[i]; + const s32 input2_val = input2_data[i]; + const s32 unclamped_result = MultiplyByQuantizedMultiplier( + input1_val * input2_val, output_multiplier, output_rshift); + const s32 clamped_output = + MIN(quantized_activation_max, + MAX(quantized_activation_min, unclamped_result)); + +#ifdef ENABLE_DEBUG_MSG + printf(" [%d] unclamped_result %d, clamped_output %d\n", i, + unclamped_result, clamped_output); +#endif + + output_data[i] = static_cast(clamped_output); + } +} + +void 
calc_elt_mul_float_multiplier(elt_mul_test_param_t *p_param)
+{
+  int input_n = p_param->input_n;
+  int input_c = p_param->input_c;
+  int input_h = p_param->input_h;
+  int input_w = p_param->input_w;
+  s8 *input1_data = p_param->input1_data;
+  s8 *input2_data = p_param->input2_data;
+
+  int output_min = INT_MAX;
+  int output_max = INT_MIN;
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("calc_elt_mul_float_multiplier =>\n");
+#endif
+
+  int size = input_n * input_c * input_h * input_w;
+  for (int i = 0; i < size; ++i) {
+    const s32 input1_val = input1_data[i];
+    const s32 input2_val = input2_data[i];
+
+    const s32 val = input1_val * input2_val;
+
+    output_max = MAX(val, output_max);
+    output_min = MIN(val, output_min);
+  }
+
+  // Since int8 ranges from -128 to 127, we need to squeeze the accumulator
+  // MIN/MAX to fit into those ranges as much as possible.
+  if (abs(output_max) > abs(output_min)) {
+    p_param->float_multiplier = 127.0f / abs(output_max);
+  } else {
+    p_param->float_multiplier = 128.0f / abs(output_min);
+  }
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("  output_accu_min %d, output_accu_max %d, output_multiplier %f\n",
+         output_min, output_max, p_param->float_multiplier);
+#endif
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("<= calc_elt_mul_float_multiplier\n");
+#endif
+}
+
+int simple_test(bmctx_t *ctx, bmk_ctx_t *bk_ctx)
+{
+  int ret = 0;
+
+  // TFL: QuantizedMulOpTest.NoActivationInt8
+  int size = 4;
+  s8 input1_data[4] = {-102, 25, 115, 89};
+  s8 input2_data[4] = {77, 51, 115, 102};
+  s8 ref_output_data[4] = {-62, 10, 104, 71};
+  s8 output_data[4];
+  u32 output_multiplier = 1077952640;
+  s8 output_rshift = 6;  // change to right shift
+
+  elt_mul_test_param_t test_param;
+  memset(&test_param, 0, sizeof(test_param));
+
+  test_param.input_n = 1;
+  test_param.input_c = 1;
+  test_param.input_h = 1;
+  test_param.input_w = 4;
+  test_param.input1_data = input1_data;
+  test_param.input2_data = input2_data;
+  test_param.output_data = output_data;
+  test_param.multiplier = output_multiplier;
+  test_param.right_shift = output_rshift;
+  elt_mul_ref(&test_param);
+
+  for (int i = 0; i < size; ++i) {
+    if (output_data[i] != ref_output_data[i]) {
+      printf("  Error ! output_data[%d] = %d != %d\n", i, output_data[i],
+             ref_output_data[i]);
+      ret = -1;
+    }
+  }
+
+  tl_shape_t tl_shape = {1, 1, 1, static_cast<u32>(size)};
+  tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, FMT_I8, /*align=*/1);
+  tl_t *tl_b = alloc_tl(bk_ctx, tl_shape, FMT_I8, /*align=*/1);
+  tl_t *tl_res = alloc_tl(bk_ctx, tl_shape, FMT_I8, /*align=*/1);
+
+  put_tensor_g2l(ctx, bk_ctx, tl_a, reinterpret_cast<u8 *>(input1_data));
+  put_tensor_g2l(ctx, bk_ctx, tl_b, reinterpret_cast<u8 *>(input2_data));
+
+  {
+    bmk1822_tiu_element_wise_mul_qdm_param_t p1;
+    p1.res_high = nullptr;
+    p1.res_low = tl_res;
+    p1.a = tl_a;
+    p1.b_is_const = 0;
+    p1.b = tl_b;
+    p1.rshift_bits = output_rshift;
+    p1.relu_enable = 0;
+    p1.multiplier = output_multiplier;
+    bmk1822_tiu_element_wise_mul_qdm(bk_ctx, &p1);
+  }
+
+  s8 *res_tiu_data =
+      reinterpret_cast<s8 *>(get_tensor_l2g(ctx, bk_ctx, tl_res));
+  for (int i = 0; i < size; ++i) {
+    if (res_tiu_data[i] != ref_output_data[i]) {
+      printf("  Error ! result[%d] %d != %d\n", i, res_tiu_data[i],
+             ref_output_data[i]);
+      ret = -1;
+    }
+  }
+
+  free(res_tiu_data);
+
+  // Reverse order
+  free_tl(bk_ctx, tl_res);
+  free_tl(bk_ctx, tl_b);
+  free_tl(bk_ctx, tl_a);
+
+  return ret;
+}
+
+int choose_from_range(int table[], int size, int index)
+{
+  if (index >= size) {
+    return 0;
+  }
+
+  int val = table[index];
+  if (index < (size - 1)) {
+    int range = MAX(table[index + 1] - table[index] - 1, 1);
+    val += rand() % range;
+  }
+
+  return val;
+}
+
+bool check_valid_test_param(bmk_ctx_t *bk_ctx, elt_mul_test_param_t *p_param)
+{
+  u32 input_n = p_param->input_n;
+  u32 input_c = p_param->input_c;
+  u32 input_h = p_param->input_h;
+  u32 input_w = p_param->input_w;
+
+  // input1, input2, output
+  u32 total_needed_size = 3 * input_n * input_c * input_h * input_w;
+
+  bmk1822_chip_info_t chip_info = bmk1822_chip_info();
+  u32 lmem_size_per_lane = chip_info.lmem_size;
+  u32 total_lmem_size = chip_info.lmem_size * chip_info.npu_num;
+
+  if (total_needed_size > total_lmem_size) {
+    return false;
+  }
+
+  tl_shape_t input_shape = {input_n, input_c, input_h, input_w};
+
+  u32 needed_size =
+      3 * bmk1822_lmem_tensor_to_size(bk_ctx, input_shape, FMT_I8, /*eu_align=*/1);
+
+  // Skip invalid shape
+  if (needed_size > lmem_size_per_lane) {
+    return false;
+  }
+
+  return true;
+}
+
+void fill_random_data_s8(s8 *input_data, int size)
+{
+  for (int i = 0; i < size; ++i) {
+    int is_satured = ((rand() % 1000) == 1) ? 1 : 0;
+    int is_sign = rand() % 2 ? 1 : -1;
+
+    if (is_satured && (is_sign < 0)) {  // is_sign is 1 or -1; pick the negative rail
+      input_data[i] = -128;
+    } else if (is_satured) {
+      input_data[i] = 127;
+    } else {
+      input_data[i] = is_sign * rand() % 128;
+    }
+  }
+}
+
+void dump_test_param(elt_mul_test_param_t *p_param, bool dump_content)
+{
+  printf("Dump test parameter:\n");
+  printf("  input_n %d\n", p_param->input_n);
+  printf("  input_c %d\n", p_param->input_c);
+  printf("  input_h %d\n", p_param->input_h);
+  printf("  input_w %d\n", p_param->input_w);
+  printf("  multiplier %d\n", p_param->multiplier);
+  printf("  right_shift %d\n", p_param->right_shift);
+
+  if (dump_content) {
+    printf("input1_data(%d, %d, %d, %d) :\n", p_param->input_n,
+           p_param->input_c, p_param->input_h, p_param->input_w);
+    int in = p_param->input_n;
+    int ic = p_param->input_c;
+    int ih = p_param->input_h;
+    int iw = p_param->input_w;
+    for (int i = 0; i < in; ++i) {
+      for (int j = 0; j < ic; ++j) {
+        for (int k = 0; k < ih; ++k) {
+          for (int l = 0; l < iw; ++l) {
+            int offset = i * (ic * ih * iw) + j * (ih * iw) + k * iw + l;
+            printf("%d, ", p_param->input1_data[offset]);
+          }
+          printf("\n");
+        }
+      }
+    }
+    printf("\n\n");
+
+    printf("input2_data(%d, %d, %d, %d) :\n", p_param->input_n,
+           p_param->input_c, p_param->input_h, p_param->input_w);
+    for (int i = 0; i < in; ++i) {
+      for (int j = 0; j < ic; ++j) {
+        for (int k = 0; k < ih; ++k) {
+          for (int l = 0; l < iw; ++l) {
+            int offset = i * (ic * ih * iw) + j * (ih * iw) + k * iw + l;
+            printf("%d, ", p_param->input2_data[offset]);
+          }
+          printf("\n");
+        }
+      }
+    }
+    printf("\n\n");
+  }
+}
+
+int run_compare_elt_mul(bmctx_t *ctx, bmk_ctx_t *bk_ctx,
+                        elt_mul_test_param_t *p_param)
+{
+  int ret = 0;
+
+  int input_n = p_param->input_n;
+  int input_c = p_param->input_c;
+  int input_h = p_param->input_h;
+  int input_w = p_param->input_w;
+
+  int input_size = input_n * input_c * input_h * input_w;
+  s8 *input1_data = (s8 *)malloc(input_size);
+  s8 *input2_data = (s8 *)malloc(input_size);
+  s8 *output_data = (s8 *)malloc(input_size);
+
+  p_param->input1_data = input1_data;
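+  // What follows mirrors the TFLite QDM scheme: calc_elt_mul_float_multiplier
+  // scans the random inputs for the extreme product and derives a float scale
+  // in (0, 1); QuantizeMultiplierSmallerThanOne then decomposes that scale
+  // into a Q31 fixed-point multiplier M (in [2^30, 2^31)) plus a right shift,
+  // so the result is roughly round(x * M / 2^31) >> rshift. Sanity check
+  // against simple_test above: M = 1077952640, rshift = 6 gives a scale of
+  // about 0.00784, and -102 * 77 = -7854 maps to -62, matching
+  // ref_output_data[0].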
+  p_param->input2_data = input2_data;
+  p_param->output_data = output_data;
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("  run_compare_elt_mul => \n");
+#endif
+
+  int retry_cnt = p_param->retry_cnt;
+  do {
+    fill_random_data_s8(input1_data, input_size);
+    fill_random_data_s8(input2_data, input_size);
+
+    p_param->float_multiplier = 100.0;  // should be < 1.0
+    calc_elt_mul_float_multiplier(p_param);
+
+    if (p_param->float_multiplier > 0.f && p_param->float_multiplier < 1.0) {
+      break;
+    }
+
+  } while (--retry_cnt);
+
+  if (p_param->float_multiplier >= 1.0) {
+    printf("  run_compare_elt_mul: unable to find valid multiplier\n");
+    free(input1_data);
+    free(input2_data);
+    free(output_data);
+    return -1;
+  }
+
+  u32 base_multiplier = 0;
+  int base_shift = 0;
+  QuantizeMultiplierSmallerThanOne(p_param->float_multiplier, &base_multiplier,
+                                   &base_shift);
+
+  // multipliers typically range in [2^30 ; 2^31 - 1].
+  // Values in [0, 2^30 - 1] are normally unused, but harmless.
+  // Thus a good way to randomize multipliers is to subtract from them
+  // a random value smaller than 2^30 but still significant compared to it.
+  u32 output_multiplier = base_multiplier - (rand() % (1 << 26));
+
+  int right_shift = base_shift - 1 + (rand() % 4);
+  s8 output_right_shift = truncate_rshift((s8)right_shift, /*allow_lshift*/1);
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("  multiplier_data %d, shift_data %d\n", output_multiplier,
+         output_right_shift);
+#endif
+
+  p_param->multiplier = output_multiplier;
+  p_param->right_shift = output_right_shift;
+
+  elt_mul_ref(p_param);
+
+  tl_shape_t input_shape = {
+      static_cast<u32>(input_n), static_cast<u32>(input_c),
+      static_cast<u32>(input_h), static_cast<u32>(input_w)};
+
+  bmk1822_tensor_lmem_t *tl_input1 =
+      bmk1822_lmem_alloc_tensor(bk_ctx, input_shape, FMT_I8, /*eu_align=*/1);
+
+  bmk1822_tensor_lmem_t *tl_input2 =
+      bmk1822_lmem_alloc_tensor(bk_ctx, input_shape, FMT_I8, /*eu_align=*/1);
+
+  bmk1822_tensor_lmem_t *tl_output =
+      bmk1822_lmem_alloc_tensor(bk_ctx, input_shape, FMT_I8, /*eu_align=*/1);
+
+  if (tl_input1 == nullptr) {
+    printf("  fail to alloc tl_input1 (%d, %d, %d, %d)\n", input_n, input_c,
+           input_h, input_w);
+    return -1;
+  }
+  if (tl_input2 == nullptr) {
+    printf("  fail to alloc tl_input2 (%d, %d, %d, %d)\n", input_n, input_c,
+           input_h, input_w);
+    return -1;
+  }
+  if (tl_output == nullptr) {
+    printf("  fail to alloc tl_output (%d, %d, %d, %d)\n", input_n, input_c,
+           input_h, input_w);
+    return -1;
+  }
+
+  put_tensor_g2l(ctx, bk_ctx, tl_input1, reinterpret_cast<u8 *>(input1_data));
+  put_tensor_g2l(ctx, bk_ctx, tl_input2, reinterpret_cast<u8 *>(input2_data));
+
+  {
+    bmk1822_tiu_element_wise_mul_qdm_param_t p1;
+    p1.res_high = nullptr;
+    p1.res_low = tl_output;
+    p1.a = tl_input1;
+    p1.b_is_const = 0;
+    p1.b = tl_input2;
+    p1.rshift_bits = (u8)output_right_shift;
+    p1.relu_enable = 0;
+    p1.multiplier = output_multiplier;
+    bmk1822_tiu_element_wise_mul_qdm(bk_ctx, &p1);
+  }
+
+  test_submit(ctx);
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("  compare result:\n");
+#endif
+  s8 *tiu_output_data =
+      reinterpret_cast<s8 *>(get_tensor_l2g(ctx, bk_ctx, tl_output));
+  for (int i = 0; i < input_n; ++i) {
+    for (int j = 0; j < input_c; ++j) {
+      for (int k = 0; k < input_h; ++k) {
+        for (int l = 0; l < input_w; ++l) {
+          int offset = i * (input_c * input_h * input_w) +
+                       j * (input_h * input_w) + k * input_w + l;
+          if (tiu_output_data[offset] != output_data[offset]) {
+            printf("  [ni=%d][oci=%d][ohi=%d][owi=%d] output %d(tiu) != "
+                   "%d(ref)\n",
+                   i, j, k, l, tiu_output_data[offset], output_data[offset]);
+            ret = -1;
+            break;
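+            // Note: this break only exits the innermost width loop; the outer
+            // loops keep comparing, but ret stays -1 and triggers the
+            // parameter dump below.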
} + } + } + } + } + + if (ret) { + dump_test_param(p_param, /*dump_content=*/true); + } + + // Reverse order + bmk1822_lmem_free_tensor(bk_ctx, tl_output); + bmk1822_lmem_free_tensor(bk_ctx, tl_input2); + bmk1822_lmem_free_tensor(bk_ctx, tl_input1); + + free(input1_data); + free(input2_data); + free(output_data); + free(tiu_output_data); + +#ifdef ENABLE_DEBUG_MSG + printf(" <= run_compare_elt_mul, ret %d\n", ret); +#endif + + return ret; +} + +int random_test(bmctx_t *ctx, bmk_ctx_t *bk_ctx) +{ + int ret = 0; + +#if 0 + int input_n_range[] = {1}; + int input_c_range[] = {1}; + int input_h_range[] = {1}; + int input_w_range[] = {1}; +#else +#ifndef ENABLE_TV_GEN_PATTERN + int input_n_range[] = {1, 2, 4, 8, 16, 32, 64, 4095 - 32}; + int input_c_range[] = {1, 3, 11, 128, 512, 1024, 2048, 4095 - 32}; + int input_h_range[] = {1, 3, 11, 128, 512, 1024, 2048, 4095 - 32}; + int input_w_range[] = {1, 3, 11, 128, 512, 1024, 2048, 4095 - 32}; +#else + // TV_GEN + // Random Test, total 81, skipped 8095, executed 5, failed 0, ret 0 + + int input_n_range[] = {1, 2, 4095 - 32}; + int input_c_range[] = {1, 512, 4095 - 32}; + int input_h_range[] = {1, 512, 4095 - 32}; + int input_w_range[] = {1, 512, 4095 - 32}; +#endif +#endif + + const int input_n_range_size = + sizeof(input_n_range) / sizeof(input_n_range[0]); + const int input_c_range_size = + sizeof(input_c_range) / sizeof(input_c_range[0]); + const int input_h_range_size = + sizeof(input_h_range) / sizeof(input_h_range[0]); + const int input_w_range_size = + sizeof(input_w_range) / sizeof(input_w_range[0]); + + int random_seed = clock(); + srand(random_seed); + + const int retry_test_count = 100; + bool stop_at_first_error = true; + + int executed_tests = 0; + int failed_tests = 0; + + printf("1822-mul-qm: random Test =>\n"); + for (int m = 0; m < retry_test_count; ++m) { + for (int i = 0; i < input_n_range_size; ++i) { + int input_n = choose_from_range(input_n_range, input_n_range_size, i); + + for (int j = 0; j < input_c_range_size; ++j) { + int input_c = choose_from_range(input_c_range, input_c_range_size, j); + + for (int k = 0; k < input_h_range_size; ++k) { + int input_h = choose_from_range(input_h_range, input_h_range_size, k); + + for (int l = 0; l < input_w_range_size; ++l) { + int input_w = + choose_from_range(input_w_range, input_w_range_size, l); + + elt_mul_test_param_t test_param; + memset(&test_param, 0, sizeof(test_param)); + test_param.input_n = input_n; + test_param.input_c = input_c; + test_param.input_h = input_h; + test_param.input_w = input_w; + test_param.retry_cnt = 5; + + bool is_valid_param = check_valid_test_param(bk_ctx, &test_param); + if (is_valid_param == false) + continue; + + int ret2 = run_compare_elt_mul(ctx, bk_ctx, &test_param); + failed_tests = ret2 ? 
failed_tests + 1 : failed_tests;
+            ret |= ret2;
+            executed_tests++;
+
+#ifdef ENABLE_DEBUG_MSG
+            printf("  [%d] random test: input shape (%d, %d, %d, %d), ret %d\n",
+                   executed_tests, input_n, input_c, input_h, input_w, ret2);
+#endif
+          }
+
+          // Stop at first error
+          if (ret && stop_at_first_error) {
+            break;
+          }
+        }
+
+        // Stop at first error
+        if (ret && stop_at_first_error) {
+          break;
+        }
+      }
+
+      // Stop at first error
+      if (ret && stop_at_first_error) {
+        break;
+      }
+    }
+
+    // Stop at first error
+    if (ret && stop_at_first_error) {
+      break;
+    }
+
+    if (executed_tests >= MIN_EXEC_TESTS) {
+      break;
+    }
+  }
+
+  printf("<= 1822-mul-qm: random test, total %d, failed %d, ret %d\n",
+         executed_tests, failed_tests, ret);
+
+  return ret;
+}
+
+int main()
+{
+  int ret = 0;
+  bmctx_t ctx;
+  bmk_ctx_t *bk_ctx;
+  test_init(&ctx, &bk_ctx);
+
+  ret = simple_test(&ctx, bk_ctx);
+  ret |= random_test(&ctx, bk_ctx);
+
+  test_exit(&ctx);
+
+  return ret;
+}
diff --git a/cviruntime/test/1822/test_1822_tensor_or.cpp b/cviruntime/test/1822/test_1822_tensor_or.cpp
new file mode 100644
index 000000000..a2057e879
--- /dev/null
+++ b/cviruntime/test/1822/test_1822_tensor_or.cpp
@@ -0,0 +1,179 @@
+#include "1822_test_util.h"
+
+static void tl_or_int8_ref(s8 *a, s8 *b, s8 *res, u64 size)
+{
+  for (u64 i = 0; i < size; i++)
+    res[i] = a[i] | b[i];
+}
+
+static void tl_or_int16_ref(
+    u8 *ref_high, u8 *ref_low,
+    u8 *a_high, u8 *a_low,
+    u8 *b_high, u8 *b_low,
+    u64 size)
+{
+  for (u64 i = 0; i < size; i++) {
+    s32 ta = ((s8)a_high[i] << 8) + a_low[i];
+    s32 tb = ((s8)b_high[i] << 8) + b_low[i];
+    s32 res = ta | tb;
+    ref_high[i] = (res >> 8) & 0xff;
+    ref_low[i] = res & 0xff;
+  }
+}
+
+static void test_tl_or_int8(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align)
+{
+  int n = 3;
+  int c = 39;
+  int h = 7;
+  int w = 37;
+
+  tl_shape_t tl_shape;
+  tl_shape.n = n;
+  tl_shape.c = c;
+  tl_shape.h = h;
+  tl_shape.w = w;
+
+  u64 size = n * c * h * w;
+  s8 *a_data = (s8 *)xmalloc(size);
+  for (u64 i = 0; i < size; i++)
+    a_data[i] = (s8)(i % 256);
+
+  s8 *b_data = (s8 *)xmalloc(size);
+  for (u64 i = 0; i < size; i++)
+    b_data[i] = (s8)(100 - i % 256);
+
+  s8 *ref_data = (s8 *)xmalloc(size);
+  tl_or_int8_ref(a_data, b_data, ref_data, size);
+
+  tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align);
+  tl_t *tl_b = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align);
+  tl_t *tl_res = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align);
+
+  put_tensor_g2l(ctx, bk_ctx, tl_a, (u8 *)a_data);
+  put_tensor_g2l(ctx, bk_ctx, tl_b, (u8 *)b_data);
+
+  bmk1822_tiu_element_wise_or_int8_param_t p9;
+  p9.res = tl_res;
+  p9.a = tl_a;
+  p9.b = tl_b;
+  bmk1822_tiu_element_wise_or_int8(bk_ctx, &p9);
+  u8 *res_data = get_tensor_l2g(ctx, bk_ctx, tl_res);
+
+  for (u64 i = 0; i < size; i++) {
+    if ((s8)res_data[i] != ref_data[i]) {
+      fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n",
+              i, res_data[i], ref_data[i]);
+      exit(-1);
+    }
+  }
+
+  free_tl(bk_ctx, tl_res);
+  free_tl(bk_ctx, tl_b);
+  free_tl(bk_ctx, tl_a);
+
+  free(a_data);
+  free(b_data);
+  free(ref_data);
+  free(res_data);
+}
+
+static void test_tl_or_int16(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align)
+{
+  int n = 3;
+  int c = 39;
+  int h = 7;
+  int w = 37;
+
+  tl_shape_t tl_shape;
+  tl_shape.n = n;
+  tl_shape.c = c;
+  tl_shape.h = h;
+  tl_shape.w = w;
+
+  u64 size = n * c * h * w;
+  u8 *a_high_data = (u8 *)xmalloc(size);
+  u8 *a_low_data = (u8 *)xmalloc(size);
+  u8 *b_high_data = (u8 *)xmalloc(size);
+  u8 *b_low_data = (u8 *)xmalloc(size);
+  for (u64 i = 0; i <
size; i++) { + a_high_data[i] = i / 10; + a_low_data[i] = i; + b_high_data[i] = (i + 250) / 20; + b_low_data[i] = 100 - i; + } + + u8 *ref_high_data = (u8 *)xmalloc(size); + u8 *ref_low_data = (u8 *)xmalloc(size); + tl_or_int16_ref( + ref_high_data, ref_low_data, + a_high_data, a_low_data, + b_high_data, b_low_data, + size); + + tl_t *tl_a_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_a_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a_low, a_low_data); + put_tensor_g2l(ctx, bk_ctx, tl_a_high, a_high_data); + put_tensor_g2l(ctx, bk_ctx, tl_b_low, b_low_data); + put_tensor_g2l(ctx, bk_ctx, tl_b_high, b_high_data); + bmk1822_tiu_element_wise_or_int16_param_t p9; + p9.res_high = tl_res_high; + p9.res_low = tl_res_low; + p9.a_high = tl_a_high; + p9.a_low = tl_a_low; + p9.b_high = tl_b_high; + p9.b_low = tl_b_low; + bmk1822_tiu_element_wise_or_int16(bk_ctx, &p9); + u8 *res_high_data = get_tensor_l2g(ctx, bk_ctx, tl_res_high); + u8 *res_low_data = get_tensor_l2g(ctx, bk_ctx, tl_res_low); + + for (u64 i = 0; i < size; i++) { + if (res_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at res_high_data[%" PRIu64 "], got %d, exp %d\n", + i, res_high_data[i], ref_high_data[i]); + exit(-1); + } + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res_high); + free_tl(bk_ctx, tl_res_low); + free_tl(bk_ctx, tl_b_high); + free_tl(bk_ctx, tl_b_low); + free_tl(bk_ctx, tl_a_high); + free_tl(bk_ctx, tl_a_low); + + free(a_high_data); + free(a_low_data); + free(b_high_data); + free(b_low_data); + free(ref_high_data); + free(ref_low_data); + free(res_high_data); + free(res_low_data); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_or_int8(&ctx, bk_ctx, 0); + test_tl_or_int8(&ctx, bk_ctx, 1); + test_tl_or_int16(&ctx, bk_ctx, 0); + test_tl_or_int16(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tensor_sub.cpp b/cviruntime/test/1822/test_1822_tensor_sub.cpp new file mode 100644 index 000000000..d77726a7c --- /dev/null +++ b/cviruntime/test/1822/test_1822_tensor_sub.cpp @@ -0,0 +1,117 @@ +#include "1822_test_util.h" + +static void tl_sub_ref( + u8 *ref_high, u8 *ref_low, + u8 *a_high, u8 *a_low, + u8 *b_high, u8 *b_low, + u64 size) +{ + for (u64 i = 0; i < size; i++) { + s32 ta = ((s8)a_high[i] << 8) + a_low[i]; + s32 tb = ((s8)b_high[i] << 8) + b_low[i]; + s32 res = ta - tb; + if (res > 32767) + res = 32767; + else if (res < -32768) + res = -32768; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } +} + +static void test_tl_sub(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + u8 *a_high_data = (u8 *)xmalloc(size); + u8 *a_low_data = (u8 *)xmalloc(size); + u8 *b_high_data = (u8 *)xmalloc(size); + u8 *b_low_data = (u8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) { + a_high_data[i] = i / 10; + a_low_data[i] = i; + 
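+    // Each 16-bit operand is split across two int8 tensors: a signed high
+    // byte and an unsigned low byte, i.e. value = ((s8)high << 8) + low.
+    // For example high = 0x01, low = 0xF4 encodes 0x01F4 = 500.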
b_high_data[i] = (i + 250) / 20; + b_low_data[i] = 100 - i; + } + + u8 *ref_high_data = (u8 *)xmalloc(size); + u8 *ref_low_data = (u8 *)xmalloc(size); + tl_sub_ref(ref_high_data, ref_low_data, + a_high_data, a_low_data, + b_high_data, b_low_data, + size); + + tl_t *tl_a_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_a_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a_low, a_low_data); + put_tensor_g2l(ctx, bk_ctx, tl_a_high, a_high_data); + put_tensor_g2l(ctx, bk_ctx, tl_b_low, b_low_data); + put_tensor_g2l(ctx, bk_ctx, tl_b_high, b_high_data); + bmk1822_tiu_element_wise_sub_param_t p5; + p5.res_high = tl_res_high; + p5.res_low = tl_res_low; + p5.a_high = tl_a_high; + p5.a_low = tl_a_low; + p5.b_high = tl_b_high; + p5.b_low = tl_b_low; + p5.rshift_bits = 0; + bmk1822_tiu_element_wise_sub(bk_ctx, &p5); + u8 *res_high_data = get_tensor_l2g(ctx, bk_ctx, tl_res_high); + u8 *res_low_data = get_tensor_l2g(ctx, bk_ctx, tl_res_low); + + for (u64 i = 0; i < size; i++) { + if (res_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at res_high_data[%" PRIu64 "], got %d, exp %d\n", + i, res_high_data[i], ref_high_data[i]); + exit(-1); + } + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res_high); + free_tl(bk_ctx, tl_res_low); + free_tl(bk_ctx, tl_b_high); + free_tl(bk_ctx, tl_b_low); + free_tl(bk_ctx, tl_a_high); + free_tl(bk_ctx, tl_a_low); + + free(a_high_data); + free(a_low_data); + free(b_high_data); + free(b_low_data); + free(ref_high_data); + free(ref_low_data); + free(res_high_data); + free(res_low_data); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_sub(&ctx, bk_ctx, 0); + test_tl_sub(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tensor_transfer.cpp b/cviruntime/test/1822/test_1822_tensor_transfer.cpp new file mode 100644 index 000000000..36b4b5d8d --- /dev/null +++ b/cviruntime/test/1822/test_1822_tensor_transfer.cpp @@ -0,0 +1,103 @@ +#include "1822_test_util.h" + +static void test_put_and_get_tensor_l2g( + bmctx_t *ctx, bmk_ctx_t *bk_ctx) +{ + int n = 2; + int c = 66; + int h = 3; + int w = 15; + int size = n * c * h * w; + u8 *data_x = (u8 *)xmalloc(size); + u8 *data_y = (u8 *)xmalloc(size); + + for (int i = 0; i < size; i++) + data_x[i] = i - 100; + + for (int i = 0; i < size; i++) + data_y[i] = -i; + + /* + * Interleave two tensors in case the same devmem is reused between + * put_tensor_g2l() and get_tensor_l2g(), in which case the content of + * devmem is already what is expected before bmk1822_gdma_store(bk_ctx, ). 
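+ * Because data_x and data_y are filled with different patterns (i - 100 vs.
+ * -i), a transfer that silently reused stale device memory would make one of
+ * the comparisons below fail.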
+ */ + + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + tg_shape_t ts_shape; + ts_shape.n = n; + ts_shape.c = c; + ts_shape.h = h; + ts_shape.w = w; + + tl_t *tl_x = + alloc_tl(bk_ctx, tl_shape, FMT_I8, 1); + tl_t *tl_y = + alloc_tl(bk_ctx, tl_shape, FMT_I8, 1); + + tg_t ts_x; + ts_x.base_reg_index = 0; + ts_x.start_address = 0; + ts_x.shape = ts_shape; + ts_x.stride = bmk1822_tensor_tgmem_default_stride(ts_shape, FMT_I8); + + put_tensor_g2l(ctx, bk_ctx, tl_x, data_x); + put_tensor_g2l(ctx, bk_ctx, tl_y, data_y); + + u8 *result_x = get_tensor_l2g(ctx, bk_ctx, tl_x); + u8 *result_y = get_tensor_l2g(ctx, bk_ctx, tl_y); + + for (int i = 0; i < size; i++) { + if (result_x[i] != data_x[i]) { + printf("compare failed at result_x[%d]\n", i); + exit(-1); + } + if (result_y[i] != data_y[i]) { + printf("compare failed at result_y[%d]\n", i); + exit(-1); + } + } + free(result_x); + free(result_y); + + /* + * Get result_y before result_x. + */ + + + result_y = get_tensor_l2g(ctx, bk_ctx, tl_y); + result_x = get_tensor_l2g(ctx, bk_ctx, tl_x); + for (int i = 0; i < size; i++) { + if (result_x[i] != data_x[i]) { + printf("compare failed at result_x[%d]\n", i); + exit(-1); + } + if (result_y[i] != data_y[i]) { + printf("compare failed at result_y[%d]\n", i); + exit(-1); + } + } + free(result_x); + free(result_y); + + free_tl(bk_ctx, tl_y); + free_tl(bk_ctx, tl_x); + free(data_x); + free(data_y); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + test_put_and_get_tensor_l2g(&ctx, bk_ctx); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tensor_xor.cpp b/cviruntime/test/1822/test_1822_tensor_xor.cpp new file mode 100644 index 000000000..5ce1ccc74 --- /dev/null +++ b/cviruntime/test/1822/test_1822_tensor_xor.cpp @@ -0,0 +1,182 @@ +#include "1822_test_util.h" + +static void tl_xor_int8_ref(s8 *a, s8 *b, s8 *res, u64 size) +{ + for (u64 i = 0; i < size; i++) + res[i] = a[i] ^ b[i]; +} + +static void tl_xor_int16_ref( + u8 *ref_high, u8 *ref_low, + u8 *a_high, u8 *a_low, + u8 *b_high, u8 *b_low, + u64 size) +{ + for (u64 i = 0; i < size; i++) { + s32 ta = ((s8)a_high[i] << 8) + a_low[i]; + s32 tb = ((s8)b_high[i] << 8) + b_low[i]; + s32 res = ta ^ tb; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } +} + +static void test_tl_xor_int8(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + s8 *a_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) + a_data[i] = (s8)(i % 256); + + s8 *b_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) + b_data[i] = (s8)(100 - i % 256); + + s8 *ref_data = (s8 *)xmalloc(size); + tl_xor_int8_ref(a_data, b_data, ref_data, size); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a, (u8 *)a_data); + put_tensor_g2l(ctx, bk_ctx, tl_b, (u8 *)b_data); + + bmk1822_tiu_element_wise_xor_int8_param_t p; + memset(&p, 0, sizeof(p)); + p.res = tl_res; + p.a = tl_a; + p.b = tl_b; + bmk1822_tiu_element_wise_xor_int8(bk_ctx, &p); + u8 *res_data = get_tensor_l2g(ctx, bk_ctx, tl_res); + + for (u64 i = 0; i < size; i++) { + if ((s8)res_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at 
ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, res_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res); + free_tl(bk_ctx, tl_b); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(b_data); + free(ref_data); + free(res_data); +} + +static void test_tl_xor_int16(bmctx_t *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + u8 *a_high_data = (u8 *)xmalloc(size); + u8 *a_low_data = (u8 *)xmalloc(size); + u8 *b_high_data = (u8 *)xmalloc(size); + u8 *b_low_data = (u8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) { + a_high_data[i] = i / 10; + a_low_data[i] = i; + b_high_data[i] = (i + 250) / 20; + b_low_data[i] = 100 - i; + } + + u8 *ref_high_data = (u8 *)xmalloc(size); + u8 *ref_low_data = (u8 *)xmalloc(size); + tl_xor_int16_ref( + ref_high_data, ref_low_data, + a_high_data, a_low_data, + b_high_data, b_low_data, + size); + + tl_t *tl_a_low = alloc_tl( bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_a_high = alloc_tl( bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b_low = alloc_tl( bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b_high = alloc_tl( bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_low = alloc_tl( bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_high = alloc_tl( bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a_low, a_low_data); + put_tensor_g2l(ctx, bk_ctx, tl_a_high, a_high_data); + put_tensor_g2l(ctx, bk_ctx, tl_b_low, b_low_data); + put_tensor_g2l(ctx, bk_ctx, tl_b_high, b_high_data); + + bmk1822_tiu_element_wise_xor_int16_param_t p; + memset(&p, 0, sizeof(p)); + p.res_high = tl_res_high; + p.res_low = tl_res_low; + p.a_high = tl_a_high; + p.a_low = tl_a_low; + p.b_high = tl_b_high; + p.b_low = tl_b_low; + bmk1822_tiu_element_wise_xor_int16(bk_ctx, &p); + u8 *res_high_data = get_tensor_l2g(ctx, bk_ctx, tl_res_high); + u8 *res_low_data = get_tensor_l2g(ctx, bk_ctx, tl_res_low); + + for (u64 i = 0; i < size; i++) { + if (res_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at res_high_data[%" PRIu64 "], got %d, exp %d\n", + i, res_high_data[i], ref_high_data[i]); + exit(-1); + } + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res_high); + free_tl(bk_ctx, tl_res_low); + free_tl(bk_ctx, tl_b_high); + free_tl(bk_ctx, tl_b_low); + free_tl(bk_ctx, tl_a_high); + free_tl(bk_ctx, tl_a_low); + + free(a_high_data); + free(a_low_data); + free(b_high_data); + free(b_low_data); + free(ref_high_data); + free(ref_low_data); + free(res_high_data); + free(res_low_data); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_xor_int8(&ctx, bk_ctx, 0); + test_tl_xor_int8(&ctx, bk_ctx, 1); + test_tl_xor_int16(&ctx, bk_ctx, 0); + test_tl_xor_int16(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_1822_tg_copy_tensor.cpp b/cviruntime/test/1822/test_1822_tg_copy_tensor.cpp new file mode 100644 index 000000000..5c322112b --- /dev/null +++ b/cviruntime/test/1822/test_1822_tg_copy_tensor.cpp @@ -0,0 +1,105 @@ +#include "1822_test_util.h" + +typedef bmk1822_tdma_tg2tg_tensor_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, 
%u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tg_shape_t src_shape; + tg_stride_t src_stride; + tg_shape_t dst_shape; + tg_stride_t dst_stride; +} case_t; + +static case_t g_cases[] = { + { + {1, 3, 3, 3}, {27, 9, 3}, + {1, 3, 3, 3}, {27, 9, 3}, + }, + { + // YOLOv2 concat layer + {1, 256, 19, 19}, {92416, 361, 19}, + {1, 256, 19, 19}, {462080, 361, 19}, + } +}; + +static void test_param_g2g(bmctx_t *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + + u64 size = p->src->shape.c * p->src->shape.h * p->src->shape.w; + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + + put_tg_gmem(ctx, p->src, src_data); + + bmk1822_tdma_tg2tg_tensor_copy(bmk, p); + test_submit(ctx); + + u8 *dst_data = get_tg_gmem(ctx, p->dst); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != src_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], src_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); +} + +static void destroy_param_g2g(bmctx_t *ctx, param_t *p) +{ + free_tg_gmem(ctx, p->src); + free_tg_gmem(ctx, p->dst); +} + +static void test_one_case(bmctx_t *ctx, bmk_ctx_t *bmk, case_t *c) +{ + param_t p; + bmk1822_tensor_tgmem_t *src, *dst; + + memset(&p, 0, sizeof(p)); + + src = alloc_tg_gmem(ctx, c->src_shape, FMT_I8); + src->stride.n = c->src_stride.n; + src->stride.c = c->src_stride.c; + src->stride.h = c->src_stride.h; + + dst = alloc_tg_gmem(ctx, c->dst_shape, FMT_I8); + dst->stride.n = c->dst_stride.n; + dst->stride.c = c->dst_stride.c; + dst->stride.h = c->dst_stride.h; + + p.src = src; + p.dst = dst; + test_param_g2g(ctx, bmk, &p); + + destroy_param_g2g(ctx, &p); +} + +int main() +{ + bmctx_t ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1822/test_cv1822_conv.c b/cviruntime/test/1822/test_cv1822_conv.c new file mode 100644 index 000000000..c41446323 --- /dev/null +++ b/cviruntime/test/1822/test_cv1822_conv.c @@ -0,0 +1,1428 @@ +#include +#include +#include +#include +#include +#include +#include +#include "test_cvikernel_util.h" +#include "test_tf_quant_util.h" +#include "test_native_ref.h" + +// #define ENABLE_DEBUG_MSG +// #define ENABLE_FULL_REGRESSION +// #define ENABLE_TV_GEN_PATTERN + +#define TEST_CASE_NAME "test_cv1822_conv" +#define MIN_EXEC_TESTS 20 + +typedef struct { + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int stride_w; + int output_c; + int output_h; + int output_w; + int has_bias; + int relu_enable; + int8_t *input_data; + int8_t *filter_data; + int8_t *output_data; + int32_t *bias_data; + uint32_t *multiplier_data; + int8_t *shift_data; + uint8_t *chl_quan_data; + uint32_t chl_quan_data_size; + float float_multiplier; + int retry_cnt; +} conv_test_param_t; + +static inline int Offset(cvk_tl_shape_t shape, int n, int c, int h, int w) +{ + return n * (shape.c * shape.h * shape.w) + c * (shape.h * shape.w) + + h * shape.w + w; +} + +void 
conv_per_channel_ref(conv_test_param_t *p_param) +{ + const int stride_width = p_param->stride_w; + const int stride_height = p_param->stride_h; + const int dilation_width_factor = 1; + const int dilation_height_factor = 1; + const int pad_width = p_param->pad_left; + const int pad_height = p_param->pad_top; + + const int32_t output_activation_min = -128; + const int32_t output_activation_max = 127; + + const int batches = p_param->input_n; + const int input_depth = p_param->input_c; + const int output_depth = p_param->output_c; + + const int input_height = p_param->input_h; + const int input_width = p_param->input_w; + const int filter_height = p_param->kh; + const int filter_width = p_param->kw; + const int output_height = p_param->output_h; + const int output_width = p_param->output_w; + int8_t *input_data = p_param->input_data; + int8_t *filter_data = p_param->filter_data; + int8_t *output_data = p_param->output_data; + int32_t *bias_data = p_param->has_bias ? p_param->bias_data : NULL; + uint32_t *output_multiplier = p_param->multiplier_data; + int8_t *output_rshift = p_param->shift_data; + + cvk_tl_shape_t input_shape = { + batches, input_depth, + input_height, input_width}; + cvk_tl_shape_t filter_shape = { + output_depth, filter_height, + filter_width, input_depth}; + cvk_tl_shape_t output_shape = { + batches, output_depth, + output_height, output_width}; + +#ifdef ENABLE_DEBUG_MSG + printf("conv_per_channel_ref: \n" + " input (n=%d, ic=%d, h=%d, w=%d)\n" + " kernel (oc=%d, kh=%d, kw=%d, ic=%d)\n", + batches, input_depth, input_height, input_width, output_depth, + filter_height, filter_width, input_depth); +#endif + + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + if (is_point_inside_image) { + int32_t input_val = input_data[Offset(input_shape, batch, + in_channel, in_y, in_x)]; + // int32_t filter_val = filter_data[Offset(filter_shape, + // out_channel, in_channel, + // filter_y, filter_x)]; + int32_t filter_val = + filter_data[Offset(filter_shape, out_channel, filter_y, + filter_x, in_channel)]; + acc += filter_val * input_val; + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d]" + "[filter_y=%d][filter_x=%d][in_channel=%d] acc(%d) += " + "%d * %d = %d\n", + batch, out_channel, out_y, out_x, filter_y, filter_x, + in_channel, acc - filter_val * input_val, filter_val, + input_val, acc); +#endif + } + } + } + } + + if (bias_data) { + acc += bias_data[out_channel]; + } + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d] acc = %d, " + "bias %d\n", + batch, out_channel, out_y, out_x, acc, + bias_data ? 
bias_data[out_channel] : 0); +#endif + + acc = MultiplyByQuantizedMultiplier( + acc, output_multiplier[out_channel], output_rshift[out_channel]); + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d] acc = %d, " + "multiplier %d, shift %d\n", + batch, out_channel, out_y, out_x, acc, + output_multiplier[out_channel], output_rshift[out_channel]); +#endif + + acc = MAX(acc, output_activation_min); + acc = MIN(acc, output_activation_max); + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d] acc = %d\n", + batch, out_channel, out_y, out_x, acc); +#endif + + output_data[Offset(output_shape, batch, out_channel, out_y, out_x)] = + acc; + } + } + } + } +} + +void calc_conv_float_multiplier(conv_test_param_t *p_param) +{ + const int stride_width = p_param->stride_w; + const int stride_height = p_param->stride_h; + const int dilation_width_factor = 1; + const int dilation_height_factor = 1; + const int pad_width = p_param->pad_left; + const int pad_height = p_param->pad_top; + + const int batches = p_param->input_n; + const int input_depth = p_param->input_c; + const int output_depth = p_param->output_c; + + const int input_height = p_param->input_h; + const int input_width = p_param->input_w; + const int filter_height = p_param->kh; + const int filter_width = p_param->kw; + const int output_height = p_param->output_h; + const int output_width = p_param->output_w; + int8_t *input_data = p_param->input_data; + int8_t *filter_data = p_param->filter_data; + int32_t *bias_data = p_param->has_bias ? p_param->bias_data : NULL; + + cvk_tl_shape_t input_shape = { + batches, input_depth, + input_height, input_width}; + cvk_tl_shape_t filter_shape = { + output_depth, filter_height, + filter_width, input_depth}; + + int output_accu_min = INT_MAX; + int output_accu_max = INT_MIN; + +#ifdef ENABLE_DEBUG_MSG + printf("calc_conv_float_multiplier =>\n"); +#endif + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + if (is_point_inside_image) { + int32_t input_val = input_data[Offset(input_shape, batch, + in_channel, in_y, in_x)]; + // int32_t filter_val = filter_data[Offset(filter_shape, + // out_channel, in_channel, + // filter_y, filter_x)]; + int32_t filter_val = + filter_data[Offset(filter_shape, out_channel, filter_y, + filter_x, in_channel)]; + acc += filter_val * input_val; + + // printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d]" + // "[filter_y=%d][filter_x=%d][in_channel=%d] acc(%d) + // += %d * %d = %d\n", batch, out_channel, out_y, + // out_x, filter_y, filter_x, in_channel, acc - + // filter_val * input_val, filter_val, input_val, acc); + } + } + } + } + + if (bias_data) { + acc += bias_data[out_channel]; + } + + output_accu_max = MAX(acc, output_accu_max); + 
output_accu_min = MIN(acc, output_accu_min);
+        }
+      }
+    }
+  }
+
+  // Since int8 ranges from -128 to 127, we need to squeeze the accumulator
+  // min/max to fit into those ranges as much as possible.
+  if (abs(output_accu_max) > abs(output_accu_min)) {
+    p_param->float_multiplier = 127.0f / abs(output_accu_max);
+  } else {
+    p_param->float_multiplier = 128.0f / abs(output_accu_min);
+  }
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("  output_accu_min %d, output_accu_max %d, output_multiplier %f\n",
+         output_accu_min, output_accu_max, p_param->float_multiplier);
+#endif
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("<= calc_conv_float_multiplier\n");
+#endif
+}
+
+static void fill_random_data_s8(int8_t *input_data, int size)
+{
+  for (int i = 0; i < size; ++i) {
+    int is_satured = ((rand() % 1000) == 1) ? 1 : 0;
+    int is_sign = rand() % 2 ? 1 : -1;
+
+    if (is_satured && (is_sign < 0)) {  // is_sign is 1 or -1; pick the negative rail
+      input_data[i] = -128;
+    } else if (is_satured) {
+      input_data[i] = 127;
+    } else {
+      input_data[i] = is_sign * rand() % 128;
+    }
+  }
+}
+
+static void fill_random_data_s32(int32_t *input_data, int size)
+{
+  for (int i = 0; i < size; ++i) {
+    int is_satured = ((rand() % 1000) == 1) ? 1 : 0;
+    int is_sign = rand() % 2 ? 1 : -1;
+
+    if (is_satured && (is_sign < 0)) {
+      input_data[i] = INT_MIN;
+    } else if (is_satured) {
+      input_data[i] = INT_MAX;
+    } else {
+      input_data[i] = is_sign * rand() % 128;
+    }
+  }
+}
+
+static int check_valid_test_param(cvk_context_t *cvk_ctx,
+                                  conv_test_param_t *p_param)
+{
+  int in = p_param->input_n;
+  int ic = p_param->input_c;
+  int ih = p_param->input_h;
+  int iw = p_param->input_w;
+  int oc = p_param->output_c;
+  int oh = p_param->output_h;
+  int ow = p_param->output_w;
+  int kh = p_param->kh;
+  int kw = p_param->kw;
+  int stride_h = p_param->stride_h;
+  int stride_w = p_param->stride_w;
+  int chl_quan_per_lane_data_size =
+      p_param->has_bias ? 9 : 5;  // bias(4) + multiplier(4) + shift(1)
+
+  // Skip invalid shape
+  if ((kh > ih) || (kw > iw) || (stride_h > ih) || (stride_w > iw)) {
+    return false;
+  }
+
+  // Products of randomly-chosen dimensions may exceed int32_t, so use uint32_t.
+  uint32_t input_size = in * ic * ih * iw;
+  uint32_t kernel_size = oc * ic * kh * kw;
+  uint32_t output_size = in * oc * oh * ow;
+
+  uint32_t lmem_size_per_lane = cvk_ctx->info.lmem_size;
+  uint32_t total_lmem_size = cvk_ctx->info.lmem_size * cvk_ctx->info.npu_num;
+
+  uint32_t total_needed_size =
+      input_size + kernel_size + output_size +
+      chl_quan_per_lane_data_size * cvk_ctx->info.npu_num;
+  if (total_needed_size > total_lmem_size) {
+    return false;
+  }
+
+  cvk_tl_shape_t input_shape = {in, ic, ih, iw};
+  cvk_tl_shape_t filter_shape = {1, oc, kh * kw, ic};
+  cvk_tl_shape_t output_shape = {in, oc, oh, ow};
+  cvk_tl_shape_t chl_quan_shape = {1, oc, 1, chl_quan_per_lane_data_size};
+
+  uint32_t needed_size =
+      cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, input_shape, CVK_FMT_I8, /*eu_align=*/1) +
+      cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, filter_shape, CVK_FMT_I8, /*eu_align=*/0) +
+      cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, output_shape, CVK_FMT_I8, /*eu_align=*/1) +
+      cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, chl_quan_shape, CVK_FMT_I8, /*eu_align=*/0);
+
+  // Skip invalid shape
+  if (needed_size > lmem_size_per_lane) {
+    return false;
+  }
+
+  return true;
+}
+
+static int choose_from_range(int table[], int size, int index)
+{
+  if (index >= size) {
+    return 0;
+  }
+
+  int val = table[index];
+  if (index < (size - 1)) {
+    int range = MAX(table[index + 1] - table[index] - 1, 1);
+    val += rand() % range;
+  }
+
+  return val;
+}
+
+static void save_input_data(conv_test_param_t *p_param)
+{
+  int in = p_param->input_n;
+  int ic = p_param->input_c;
+  int ih = p_param->input_h;
+  int iw = p_param->input_w;
+  uint32_t input_size = in * ic * ih * iw;
+  char name[64];
+  FILE *fp = NULL;
+  snprintf(name, sizeof(name), "%s_input_%d_%d_%d_%d.bin",
+           TEST_CASE_NAME, in, ic, ih, iw);
+
+  fp = fopen(name, "wb");
+  if (fp) {
+    printf("Write %d bytes to %s\n", input_size, name);
+    fwrite(p_param->input_data, input_size, 1, fp);
+    fclose(fp);
+  } else {
+    printf("Fail to open %s\n", name);
+    return;
+  }
+}
+
+static void save_output_data(conv_test_param_t *p_param)
+{
+  int in = p_param->input_n;
+  int oc = p_param->output_c;
+  int oh = p_param->output_h;
+  int ow = p_param->output_w;
+  uint32_t output_size = in * oc * oh * ow;
+  char name[64];
+  FILE *fp = NULL;
+
+  snprintf(name, sizeof(name), "%s_%d_%d_%d_%d.bin",
+           TEST_CASE_NAME, in, oc, oh, ow);
+
+  fp = fopen(name, "wb");
+  if (fp) {
+    printf("Write %d bytes to %s\n", output_size, name);
+    fwrite(p_param->output_data, output_size, 1, fp);
+    fclose(fp);
+  } else {
+    printf("Fail to open %s\n", name);
+    return;
+  }
+}
+
+static void save_kernel_data(conv_test_param_t *p_param)
+{
+  int ic = p_param->input_c;
+  int oc = p_param->output_c;
+  int kh = p_param->kh;
+  int kw = p_param->kw;
+  uint32_t kernel_size = oc * kh * kw * ic;
+  char name[64];
+  FILE *fp = NULL;
+
+  snprintf(name, sizeof(name), "%s_filter_oc%d_kh%d_kw%d_ic%d.bin",
+           TEST_CASE_NAME, oc, kh, kw, ic);
+  fp = fopen(name, "wb");
+  if (fp) {
+    printf("Write %d bytes to %s\n", kernel_size, name);
+    fwrite(p_param->filter_data, kernel_size, 1, fp);
+    fclose(fp);
+  } else {
+    printf("Fail to open %s\n", name);
+    return;
+  }
+
+  snprintf(name, sizeof(name), "%s_bias_oc%d.bin",
+           TEST_CASE_NAME, oc);
+  fp = fopen(name, "wb");
+  if (fp) {
+    printf("Write %" PRIu32 "
bytes to %s\n", (uint32_t)(sizeof(int32_t) * oc), name); + fwrite(p_param->bias_data, sizeof(int32_t) * oc, 1, fp); + fclose(fp); + } else { + printf("Fail to open %s\n", name); + return; + } + + snprintf(name, sizeof(name), "%s_multiplier_oc%d.bin", + TEST_CASE_NAME, oc); + fp = fopen(name, "wb"); + if (fp) { + printf("Write %" PRIu32 " bytes to %s\n", (uint32_t)(sizeof(int32_t) * oc), name); + fwrite(p_param->multiplier_data, sizeof(int32_t) * oc, 1, fp); + fclose(fp); + } else { + printf("Fail to open %s\n", name); + return; + } + + snprintf(name, sizeof(name), "%s_rshift_oc%d.bin", + TEST_CASE_NAME, oc); + fp = fopen(name, "wb"); + if (fp) { + printf("Write %d bytes to %s\n", oc, name); + fwrite(p_param->shift_data, oc, 1, fp); + fclose(fp); + } else { + printf("Fail to open %s\n", name); + return; + } +} + +static void save_test_param(conv_test_param_t *p_param) +{ + printf("Save test parameter:\n"); + printf(" input (%d, %d, %d, %d)\n", + p_param->input_n, p_param->input_c, p_param->input_h, + p_param->input_w); + printf(" filter (oc=%d, kh=%d, kw=%d, ic=%d), dh=%d, dw=%d\n", + p_param->output_c, p_param->kh, p_param->kw, p_param->input_c, + p_param->dh, p_param->dw); + printf("output (%d, %d, %d, %d)\n", + p_param->input_n, p_param->output_c, p_param->output_h, + p_param->output_w); + printf(" pad_top %d, pad_bot %d, pad_left %d, pad_right %d\n", + p_param->pad_top, p_param->pad_bot, p_param->pad_left, + p_param->pad_right); + printf(" ins_h %d, ins_h_last %d, ins_w %d, ins_w_last %d\n", + p_param->ins_h, p_param->ins_h_last, p_param->ins_w, + p_param->ins_w_last); + printf(" stride_h %d, stride_w %d\n", p_param->stride_h, p_param->stride_w); + printf(" has_bias %d, relu_enable %d\n", + p_param->has_bias, p_param->relu_enable); + + save_input_data(p_param); + save_output_data(p_param); + save_kernel_data(p_param); +} + +int run_compare_conv(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, + conv_test_param_t *p_param) +{ + int ret = 0; + + if (rt_handle == NULL || cvk_ctx == NULL) { + return -1; + } + + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int oh = p_param->output_h; + int ow = p_param->output_w; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_last_h = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_last_w = p_param->ins_w_last; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int has_bias = p_param->has_bias; + int relu_enable = p_param->relu_enable; + + int input_size = in * ic * iw * ih; + int8_t *input_data = (int8_t *)malloc(input_size); + + int kernel_size = oc * ic * kh * kw; + int8_t *kernel_data = (int8_t *)malloc(kernel_size); + + int output_size = in * oc * oh * ow; + int8_t *output_data = (int8_t *)malloc(output_size); + memset(output_data, 0, output_size); + + int32_t *bias_data = (int32_t *) malloc(sizeof(int32_t) * oc); + uint32_t *multiplier_data = (uint32_t *) malloc(sizeof(uint32_t) * oc); + int8_t *shift_data = (int8_t *)malloc(oc); + + p_param->input_data = input_data; + p_param->filter_data = kernel_data; + p_param->output_data = output_data; + p_param->has_bias = has_bias; + p_param->bias_data = bias_data; + p_param->multiplier_data = multiplier_data; + p_param->shift_data = shift_data; + 
+#ifdef ENABLE_DEBUG_MSG + printf(" run_compare_conv =>\n"); + printf(" input (n=%d, ic=%d, h=%d, w=%d), kernel (oc=%d, ic=%d, h=%d, " + "w=%d), output (, c=%d, h=%d, w=%d), has_bias %d\n", + in, ic, ih, iw, oc, ic, kh, kw, oc, oh, ow, has_bias); +#endif + + int retry_cnt = p_param->retry_cnt; + do { + fill_random_data_s8(input_data, input_size); + fill_random_data_s8(kernel_data, kernel_size); + if (has_bias) { + fill_random_data_s32(bias_data, oc); + } + + p_param->float_multiplier = 100.0; // should be < 1.0 + calc_conv_float_multiplier(p_param); + + if (p_param->float_multiplier > 0.f && p_param->float_multiplier < 1.0) { + break; + } + + } while (--retry_cnt); + + if (p_param->float_multiplier >= 1.0) { + printf(" run_compare_dw_conv: unable to find valid multiplier\n"); + free(input_data); + free(kernel_data); + free(output_data); + free(bias_data); + free(multiplier_data); + free(shift_data); + + return -1; + } + + uint32_t base_multiplier = 0; + int base_shift = 0; + QuantizeMultiplierSmallerThanOne(p_param->float_multiplier, &base_multiplier, + &base_shift); + + for (int i = 0; i < oc; ++i) { + // multipliers typically range in [2^30 ; 2^31 - 1]. + // Values in [0, 2^30 - 1] are normally unused, but harmless. + // Thus a good way to randomize multipliers is to subtract from them + // a random value smaller than 2^30 but still significant compared to it. + p_param->multiplier_data[i] = base_multiplier - (rand() % (1 << 26)); + + // Our H/W only supports right shift + int right_shift = base_shift - 1 + (rand() % 4); + p_param->shift_data[i] = right_shift > 0 ? right_shift : 0; + +#ifdef ENABLE_DEBUG_MSG + printf(" [oc=%d] multiplier_data %d, shift_data %d\n", i, + p_param->multiplier_data[i], p_param->shift_data[i]); +#endif + } + + conv_per_channel_ref(p_param); + + // w/ bias: bias(4) + multiplier(4) + shift(1) + // w/o bias: multiplier(4) + shift(1) + const int chl_quan_per_lane_data_size = + p_param->has_bias ? 
9 : 5; + const int chl_quan_data_size = chl_quan_per_lane_data_size * oc; + uint8_t *chl_quan_data = (uint8_t *) malloc(chl_quan_data_size); + pack_chl_quan_param(oc, has_bias, bias_data, multiplier_data, shift_data, + chl_quan_data); + + p_param->chl_quan_data = chl_quan_data; + p_param->chl_quan_data_size = chl_quan_data_size; + + cvk_tl_shape_t input_shape = {in, ic, ih, iw}; + cvk_tl_shape_t filter_shape = {1, oc, kh * kw, ic}; + cvk_tl_shape_t output_shape = {in, oc, oh, ow}; + cvk_tl_shape_t chl_quan_shape = {1, oc, 1, chl_quan_per_lane_data_size}; + + cvk_tl_t *tl_input = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, input_shape, CVK_FMT_I8, + /*eu_aign=*/1); + + cvk_tl_t *tl_filter = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, filter_shape, CVK_FMT_I8, + /*eu_align=*/0); + + cvk_tl_t *tl_output = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, output_shape, CVK_FMT_I8, + /*eu_align=*/1); + + // Shape for TDMA load + cvk_tl_t *tl_quan_data = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, chl_quan_shape, CVK_FMT_U8, + /*eu_align*/ 0); + + if (tl_input == NULL) { + printf(" fail to alloc tl_input (%d, %d, %d, %d)\n", + input_shape.n, input_shape.c, input_shape.h, input_shape.w); + return -1; + } + if (tl_filter == NULL) { + printf(" fail to alloc tl_filter (%d, %d, %d, %d)\n", + filter_shape.n, filter_shape.c, filter_shape.h, filter_shape.w); + return -1; + } + if (tl_output == NULL) { + printf(" fail to alloc tl_output (%d, %d, %d, %d)\n", output_shape.n, + output_shape.c, output_shape.h, output_shape.w); + return -1; + } + if (tl_quan_data == NULL) { + printf(" fail to alloc tl_quan_data (%d, %d ,%d, %d)\n", + chl_quan_shape.n, chl_quan_shape.c, chl_quan_shape.h, + chl_quan_shape.w); + return -1; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_quan_data, chl_quan_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_input, (uint8_t *)input_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_filter, (uint8_t *)kernel_data); + + { + // Reshape per channel quantization data for TIU + tl_quan_data->shape.n = 1; + tl_quan_data->shape.c = oc; + tl_quan_data->shape.h = 1; + tl_quan_data->shape.w = 1; + tl_quan_data->stride = cvk_ctx->ops->tl_default_stride( + cvk_ctx, tl_quan_data->shape, CVK_FMT_I8, /*eu_align=*/0); + + // Reshape weight for TIU + tl_filter->shape.n = ic; + tl_filter->shape.c = oc; + tl_filter->shape.h = kh; + tl_filter->shape.w = kw; + + cvk_tiu_convolution_param_t param; + memset(¶m, 0, sizeof(param)); + param.ofmap = tl_output; + param.ifmap = tl_input; + param.weight = tl_filter; + param.chl_quan_param = tl_quan_data; + param.ins_h = ins_h; + param.ins_last_h = ins_last_h; + param.ins_w = ins_w; + param.ins_last_w = ins_last_w; + param.stride_h = stride_h; + param.stride_w = stride_w; + param.dilation_h = dh; + param.dilation_w = dw; + param.pad_top = pad_top; + param.pad_bottom = pad_bot; + param.pad_left = pad_left; + param.pad_right = pad_right; + param.has_bias = has_bias; + param.relu_enable = relu_enable; + +#ifdef ENABLE_DEBUG_MSG + printf(" tiu_conv:\n"); + printf(" ifmap shape (%d, %d, %d, %d)\n", + param.ifmap->shape.n, param.ifmap->shape.c, param.ifmap->shape.h, + param.ifmap->shape.w); + printf(" weight shape (%d, %d, %d, %d)\n", + param.weight->shape.n, param.weight->shape.c, param.weight->shape.h, + param.weight->shape.w); + printf(" ofmap shape (%d, %d, %d, %d)\n", + param.ofmap->shape.n, param.ofmap->shape.c, param.ofmap->shape.h, + param.ofmap->shape.w); +#endif + + cvk_ctx->ops->tiu_convolution(cvk_ctx, ¶m); + } + + CVI_RT_Submit(cvk_ctx); + +#ifdef 
ENABLE_DEBUG_MSG + printf(" compare result:\n"); +#endif + int8_t *conv_output_data = + (int8_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_output); + for (int i = 0; i < in; ++i) { + for (int j = 0; j < oc; ++j) { + for (int k = 0; k < oh; ++k) { + for (int l = 0; l < ow; ++l) { + int offset = i * (oc * oh * ow) + j * (oh * ow) + k * ow + l; + if (conv_output_data[offset] != output_data[offset]) { + printf(" [ni=%d][oci=%d][ohi=%d][owi=%d] output %d(tiu) != " + "%d(ref)\n", + i, j, k, l, conv_output_data[offset], output_data[offset]); + ret = -1; + break; + } + } + } + } + } + + if (ret) { + save_test_param(p_param); + } + + // Reverse order + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_quan_data); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_filter); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input); + + free(conv_output_data); + + free(input_data); + free(kernel_data); + free(output_data); + free(bias_data); + free(multiplier_data); + free(shift_data); + free(chl_quan_data); + +#ifdef ENABLE_DEBUG_MSG + printf(" <= run_compare_conv\n"); +#endif + + return ret; +} + +static int simple_test(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + + const int batches = 1; + const int input_depth = 2; + const int input_height = 2; + const int input_width = 3; + cvk_tl_shape_t input_shape = {batches, input_depth, input_height, input_width}; + int8_t input_data[12] = { + 9, 1, -11, // ic = 0, h = 0 + 13, 5, -15, // ic = 0, h = 1 + 5, -7, -15, // ic = 1, h = 0 + 9, -11, -19 // ic = 1, h = 1 + }; + + const int output_depth = 2; + const int kernel_height = 2; + const int kernel_width = 2; + cvk_tl_shape_t filter_shape_tiu = + {output_depth, input_depth, kernel_height, kernel_width}; + + cvk_tl_shape_t quan_param_shape = {1, output_depth, 1, 9}; + + // TIU weight layout (1, oc, hw*kc, ic) + cvk_tl_shape_t filter_shape_dma = { + 1, output_depth, kernel_height * kernel_width, input_depth}; + int8_t filter_data_dma[16] = { + 2, 4, 6, 8, 6, 8, 10, 12, // oc = 0 + 28, 32, 20, 24, 12, 16, 4, 8 // oc = 1 + }; + + int32_t bias_data[2] = {12, -16}; + + const int output_height = 1; + const int output_width = 2; + cvk_tl_shape_t output_shape = {1, output_depth, output_height, output_width}; + // zero_point = 0 + int8_t ref_output_data[4] = { + 17, -128, // oc = 0 + 60, -128, // oc = 1 + }; + + uint32_t output_multiplier[] = {1073741824, 1073741824}; + int8_t output_rshift[2] = {1, 2}; // changed to right shift + + int8_t output_data[4]; + + conv_test_param_t params; + memset(¶ms, 0, sizeof(params)); + + params.input_n = batches; + params.input_c = input_depth; + params.input_h = input_height; + params.input_w = input_width; + params.kh = kernel_height; + params.kw = kernel_width; + params.output_c = output_depth; + params.output_h = output_height; + params.output_w = output_width; + params.stride_w = 1; + params.stride_h = 1; + params.input_data = input_data; + params.filter_data = filter_data_dma; + params.output_data = output_data; + params.has_bias = 1; + params.bias_data = bias_data; + params.multiplier_data = output_multiplier; + params.shift_data = output_rshift; + conv_per_channel_ref(¶ms); + + printf("Simple rshift compare ref and golden\n"); + for (int i = 0; i < 4; i++) { + if (output_data[i] != ref_output_data[i]) { + printf("Error ! 
output[%d]=%d != ref_output_data[%d]=%d\n", i, + output_data[i], i, ref_output_data[i]); + ret = -1; + } + } + + // cvk_tl_shape_t per_channel_cal_shape = {1, /*oc=*/2, 1, 9}; + uint8_t per_channel_quan_data[18]; + pack_chl_quan_param(2, /*has_bias=*/true, bias_data, output_multiplier, + output_rshift, per_channel_quan_data); + + cvk_tl_t *tl_per_channel_cal = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, quan_param_shape, CVK_FMT_U8, + /*eu_align*/ 0); + + cvk_tl_t *tl_input = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, input_shape, CVK_FMT_I8, + /*eu_align=*/1); + + cvk_tl_t *tl_filter_dma = cvk_ctx->ops->lmem_alloc_tensor( + cvk_ctx, filter_shape_dma, CVK_FMT_I8, /*eu_align=*/1); + + cvk_tl_t *tl_output = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, output_shape, CVK_FMT_I8, + /*eu_align=*/1); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_per_channel_cal, + per_channel_quan_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_input, (uint8_t *)input_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_filter_dma, + (uint8_t *)filter_data_dma); + + { + cvk_tl_t tl_filter_tiu; + memset(&tl_filter_tiu, 0, sizeof(tl_filter_tiu)); + tl_filter_tiu.start_address = tl_filter_dma->start_address; + tl_filter_tiu.fmt = tl_filter_dma->fmt; + tl_filter_tiu.shape.n = filter_shape_tiu.n; + tl_filter_tiu.shape.c = filter_shape_tiu.c; + tl_filter_tiu.shape.h = filter_shape_tiu.h; + tl_filter_tiu.shape.w = filter_shape_tiu.w; + tl_filter_tiu.stride = + cvk_ctx->ops->tl_default_stride(cvk_ctx, tl_filter_tiu.shape, + CVK_FMT_I8, /*eu_align=*/1); + + // Reshape per channel quantization data + tl_per_channel_cal->shape.n = 1; + tl_per_channel_cal->shape.c = 2; + tl_per_channel_cal->shape.h = 1; + tl_per_channel_cal->shape.w = 1; + tl_per_channel_cal->stride = cvk_ctx->ops->tl_default_stride( + cvk_ctx, tl_per_channel_cal->shape, CVK_FMT_I8, /*eu_align=*/0); + + cvk_tiu_convolution_param_t param; + memset(&param, 0, sizeof(param)); + param.ofmap = tl_output; + param.ifmap = tl_input; + param.weight = &tl_filter_tiu; + param.chl_quan_param = tl_per_channel_cal; + param.stride_h = 1; + param.stride_w = 1; + param.dilation_h = 1; + param.dilation_w = 1; + param.has_bias = 1; + cvk_ctx->ops->tiu_convolution(cvk_ctx, &param); + } + + CVI_RT_Submit(cvk_ctx); + + printf("Simple rshift: compare tiu and golden\n"); + int8_t *conv_output_data = + (int8_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_output); + for (uint64_t i = 0; i < sizeof(ref_output_data); i++) { + if (conv_output_data[i] != ref_output_data[i]) { + printf(" output_data[%" PRIu64 "] %d != %d\n", i, conv_output_data[i], + ref_output_data[i]); + ret = -1; + } + } + + free(conv_output_data); + + // Reverse order + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_filter_dma); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_per_channel_cal); + + return ret; +} + +// SimplePerChannelSymmetricTest from tensorflow's conv_test.cc is used +// as the test pattern. 
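+/*
+ * Requantization sketch (illustration only; nothing below is referenced by
+ * the tests). Both simple tests push each int32 accumulator through a
+ * TFLite-style per-channel requantization, cf. MultiplyByQuantizedMultiplier
+ * in test_tf_quant_util.h. The sign convention of the packed shift byte is:
+ * positive = right shift (simple_test above), negative = left shift
+ * (simple_lshift_test below). The helper name requant_sketch is hypothetical
+ * and the exact tie-breaking of the real rounding may differ, but this model
+ * reproduces the golden values of both simple tests on hand-checked outputs.
+ * It is static inline so the unused sketch stays quiet under -Werror.
+ */
+static inline int8_t requant_sketch(int32_t acc, uint32_t multiplier,
+                                    int8_t shift)
+{
+  int64_t x = acc;
+  if (shift < 0)
+    x <<= -shift; // a left shift is applied before the multiply
+  // rounded high 32 bits of x * multiplier; multiplier is roughly Q31,
+  // typically in [2^30, 2^31)
+  int64_t prod = x * (int64_t)multiplier;
+  int32_t out = (int32_t)((prod + ((int64_t)1 << 30)) >> 31);
+  if (shift > 0) // a right shift, with rounding, is applied after it
+    out = (int32_t)(((int64_t)out + ((int64_t)1 << (shift - 1))) >> shift);
+  if (out < -128) // saturate to the int8 output range
+    out = -128;
+  if (out > 127)
+    out = 127;
+  return (int8_t)out;
+}
+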
+int simple_lshift_test(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + + const int batches = 1; + const int input_depth = 2; + const int input_height = 2; + const int input_width = 3; + cvk_tl_shape_t input_shape = + {batches, input_depth, input_height, input_width}; + int8_t input_data[12] = { + 5, 1, -5, // ic = 0, h = 0 + 7, 3, -7, // ic = 0, h = 1 + 3, -3, -7, // ic = 1, h = 0 + 5, -5, -9 // ic = 1, h = 1 + }; + + const int output_depth = 2; + const int kernel_height = 2; + const int kernel_width = 2; + cvk_tl_shape_t filter_shape_tiu = + {output_depth, input_depth, kernel_height, kernel_width}; + + // TIU weight layout (1, oc, kh*kw, ic) + cvk_tl_shape_t filter_shape_dma = + {1, output_depth, kernel_height * kernel_width, input_depth}; + int8_t filter_data_dma[16] = { + 1, 2, 3, 4, 3, 4, 5, 6, // oc = 0 + 4, 4, 3, 3, 2, 2, 1, 1 // oc = 1 + }; + + int32_t bias_data[2] = {3, -1}; + + const int32_t output_height = 1; + const int32_t output_width = 2; + cvk_tl_shape_t output_shape = + {batches, output_depth, output_height, output_width}; + + int8_t output_rshift[2] = {-1, -2}; // negative value means left shift + uint32_t output_multiplier[2] = {1073741824, 1073741824}; + + int8_t ref_output_data[4] = { + 31, -128, // oc = 0, + 94, -128 // oc = 1 + }; + + int8_t output_data[4]; + + conv_test_param_t params; + memset(&params, 0, sizeof(params)); + + params.input_n = batches; + params.input_c = input_depth; + params.input_h = input_height; + params.input_w = input_width; + params.kh = kernel_height; + params.kw = kernel_width; + params.output_c = output_depth; + params.output_h = output_height; + params.output_w = output_width; + params.stride_w = 1; + params.stride_h = 1; + params.input_data = input_data; + params.filter_data = filter_data_dma; + params.output_data = output_data; + params.has_bias = 1; + params.bias_data = bias_data; + params.multiplier_data = output_multiplier; + params.shift_data = output_rshift; + conv_per_channel_ref(&params); + + printf("Simple lshift: compare ref and golden\n"); + for (int i = 0; i < 4; i++) { + if (output_data[i] != ref_output_data[i]) { + printf(" output[%d]=%d != ref_output_data[%d]=%d\n", i, + output_data[i], i, ref_output_data[i]); + ret = -1; + } + } + + // cvk_tl_shape_t per_channel_cal_shape = {1, /*oc=*/2, 1, 9}; + uint8_t per_channel_quan_data[18]; + pack_chl_quan_param(2, /*has_bias=*/true, bias_data, output_multiplier, + output_rshift, per_channel_quan_data); + + cvk_tl_shape_t quan_param_shape = {1, output_depth, 1, 9}; + cvk_tl_t *tl_per_channel_cal = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, quan_param_shape, CVK_FMT_U8, + /*eu_align*/ 0); + + cvk_tl_t *tl_input = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, input_shape, CVK_FMT_I8, + /*eu_align=*/1); + + cvk_tl_t *tl_filter_dma = cvk_ctx->ops->lmem_alloc_tensor( + cvk_ctx, filter_shape_dma, CVK_FMT_I8, /*eu_align=*/1); + + cvk_tl_t *tl_output = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, output_shape, CVK_FMT_I8, + /*eu_align=*/1); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_per_channel_cal, + per_channel_quan_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_input, (uint8_t *)input_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_filter_dma, + (uint8_t *)filter_data_dma); + + { + cvk_tl_t tl_filter_tiu; + memset(&tl_filter_tiu, 0, sizeof(tl_filter_tiu)); + tl_filter_tiu.start_address = tl_filter_dma->start_address; + tl_filter_tiu.fmt = tl_filter_dma->fmt; + tl_filter_tiu.shape.n = filter_shape_tiu.n; + tl_filter_tiu.shape.c = filter_shape_tiu.c; + tl_filter_tiu.shape.h = 
filter_shape_tiu.h; + tl_filter_tiu.shape.w = filter_shape_tiu.w; + tl_filter_tiu.stride = + cvk_ctx->ops->tl_default_stride(cvk_ctx, tl_filter_tiu.shape, + CVK_FMT_I8, /*eu_align=*/1); + + // Reshape per channel quantization data + tl_per_channel_cal->shape.n = 1; + tl_per_channel_cal->shape.c = 2; + tl_per_channel_cal->shape.h = 1; + tl_per_channel_cal->shape.w = 1; + tl_per_channel_cal->stride = cvk_ctx->ops->tl_default_stride( + cvk_ctx, tl_per_channel_cal->shape, CVK_FMT_I8, /*eu_align=*/0); + + cvk_tiu_convolution_param_t param; + memset(&param, 0, sizeof(param)); + param.ofmap = tl_output; + param.ifmap = tl_input; + param.weight = &tl_filter_tiu; + param.chl_quan_param = tl_per_channel_cal; + param.stride_h = 1; + param.stride_w = 1; + param.dilation_h = 1; + param.dilation_w = 1; + param.has_bias = 1; + cvk_ctx->ops->tiu_convolution(cvk_ctx, &param); + } + + CVI_RT_Submit(cvk_ctx); + + printf("Simple lshift: compare tiu and golden\n"); + int8_t *conv_output_data = + (int8_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_output); + for (uint64_t i = 0; i < sizeof(ref_output_data); i++) { + if (conv_output_data[i] != ref_output_data[i]) { + printf(" output_data[%" PRIu64 "] %d != %d\n", i, conv_output_data[i], + ref_output_data[i]); + ret = -1; + } + } + + free(conv_output_data); + + // Reverse order + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_filter_dma); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_per_channel_cal); + + return ret; +} + +static int random_test(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + + +#ifndef ENABLE_FULL_REGRESSION + // TV_GEN pattern + // Random Test, total 19683, skipped 118066, executed 32, failed 0, ret 0 + + // n: 12b, c: 12b, h: 12b(4095-32), w: 12b(4095-32) + int batch_range[] = {1, 1, 32}; + int input_height_range[] = {1, 512, 4095 - 32}; + int input_width_range[] = {1, 512, 4095 - 32}; + int input_depth_range[] = {1, 16, 4095}; + int output_depth_range[] = {1, 16, 4095}; + + // h: 12b, w: 12b + // stride_h: 4b, stride_w: 4b + int kernel_height_range[] = {1, 11, 4095}; + int kernel_width_range[] = {1, 11, 4095}; + int kernel_stride_height_range[] = {1, 5, 15}; + int kernel_stride_width_range[] = {1, 5, 15}; + +#else + // n: 12b, c: 12b, h: 12b(4095-32), w: 12b(4095-32) + int batch_range[] = {1, 2, 4, 8, 16, 32, 64, 4095 - 32}; + int input_height_range[] = {1, 3, 11, 128, 512, 1024, 2048, 4095 - 32}; + int input_width_range[] = {1, 3, 11, 128, 512, 1024, 2048, 4095 - 32}; + int input_depth_range[] = {1, 3, 11, 32, 64, 1024, 2048, 4095}; + int output_depth_range[] = {1, 3, 11, 32, 64, 1024, 2048, 4095}; + + // h: 12b, w: 12b + // stride_h: 4b, stride_w: 4b + int kernel_height_range[] = {1, 3, 11, 511, 4095}; + int kernel_width_range[] = {1, 3, 11, 511, 4095}; + int kernel_stride_height_range[] = {1, 3, 5, 7, 15}; + int kernel_stride_width_range[] = {1, 3, 5, 7, 15}; +#endif /* ENABLE_FULL_REGRESSION */ + + const int batch_range_size = sizeof(batch_range) / sizeof(batch_range[0]); + const int input_height_range_size = + sizeof(input_height_range) / sizeof(input_height_range[0]); + const int input_width_range_size = + sizeof(input_width_range) / sizeof(input_width_range[0]); + const int input_depth_range_size = + sizeof(input_depth_range) / sizeof(input_depth_range[0]); + const int output_depth_range_size = + sizeof(output_depth_range) / sizeof(output_depth_range[0]); + + const int kernel_height_range_size = + 
sizeof(kernel_height_range) / sizeof(kernel_height_range[0]); + const int kernel_width_range_size = + sizeof(kernel_width_range) / sizeof(kernel_width_range[0]); + const int kernel_stride_height_range_size = + sizeof(kernel_stride_height_range) / + sizeof(kernel_stride_height_range[0]); + const int kernel_stride_width_range_size = + sizeof(kernel_stride_width_range) / sizeof(kernel_stride_width_range[0]); + + int random_seed = clock(); + srand(random_seed); + + const int retry_test_count = 100; + + bool stop_at_first_error = true; + + int total_tests = batch_range_size * input_depth_range_size * + input_height_range_size * input_width_range_size * + output_depth_range_size * kernel_height_range_size * + kernel_width_range_size * kernel_stride_height_range_size * + kernel_stride_width_range_size; + int skipped_tests = 0; + int executed_tests = 0; + int failed_tests = 0; + int current_test = 0; + + printf("Random Test =>\n"); + for (int retry = 0; retry < retry_test_count; ++retry) { + for (int i = 0; i < batch_range_size; ++i) { + // randomly chosen from [range[i], range[i+1]) + int batch = choose_from_range(batch_range, batch_range_size, i); + + for (int j = 0; j < input_height_range_size; ++j) { + int input_height = + choose_from_range(input_height_range, input_height_range_size, j); + + for (int k = 0; k < input_width_range_size; ++k) { + int input_width = + choose_from_range(input_width_range, input_width_range_size, k); + + for (int l = 0; l < input_depth_range_size; ++l) { + int input_depth = + choose_from_range(input_depth_range, input_depth_range_size, l); + + for (int m = 0; m < kernel_height_range_size; ++m) { + int kernel_height = choose_from_range( + kernel_height_range, kernel_height_range_size, m); + + for (int n = 0; n < kernel_width_range_size; ++n) { + int kernel_width = choose_from_range( + kernel_width_range, kernel_width_range_size, n); + + for (int x = 0; x < kernel_stride_height_range_size; ++x) { + int kernel_stride_height = + choose_from_range(kernel_stride_height_range, + kernel_stride_height_range_size, x); + + for (int y = 0; y < kernel_stride_width_range_size; ++y) { + int kernel_stride_width = + choose_from_range(kernel_stride_width_range, + kernel_stride_width_range_size, y); + + for (int z = 0; z < output_depth_range_size; ++z) { + int output_depth = choose_from_range( + output_depth_range, output_depth_range_size, z); + + current_test++; + + int has_bias = rand() % 2; + int dh = 1; + int dw = 1; + int ins_h = 0; + int ins_h_last = 0; + int ins_w = 0; + int ins_w_last = 0; + int pad_top = 0; + int pad_bot = 0; + int pad_left = 0; + int pad_right = 0; + + int ih_ext = calc_dilute_hw(input_height, ins_h, + ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw( + input_width, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = + calc_dilute_hw(kernel_height, dh - 1, 0, 0, 0); + int kw_ext = + calc_dilute_hw(kernel_width, dw - 1, 0, 0, 0); + + int oh = + calc_output_hw(ih_ext, kh_ext, kernel_stride_height); + int ow = + calc_output_hw(iw_ext, kw_ext, kernel_stride_width); + + conv_test_param_t test_param; + memset(&test_param, 0, sizeof(test_param)); + test_param.input_n = batch; + test_param.input_c = input_depth; + test_param.input_h = input_height; + test_param.input_w = input_width; + test_param.kh = kernel_height; + test_param.kw = kernel_width; + test_param.dh = dh; + test_param.dw = dw; + test_param.pad_top = pad_top; + test_param.pad_bot = pad_bot; + test_param.pad_left = pad_left; + test_param.pad_right = pad_right; + test_param.ins_h = ins_h; + 
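// NOTE: the ins_* fields describe input dilation (zeros inserted between input rows/columns, and ins_*_last after the last one); calc_dilute_hw() above sizes the dilated input, and this random test keeps all ins_* at zero. +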
test_param.ins_h_last = ins_h_last; + test_param.ins_w = ins_w; + test_param.ins_w_last = ins_w_last; + test_param.stride_h = kernel_stride_height; + test_param.stride_w = kernel_stride_width; + test_param.output_c = output_depth; + test_param.output_h = oh; + test_param.output_w = ow; + test_param.has_bias = has_bias; + test_param.retry_cnt = 5; + + bool is_valid_param = + check_valid_test_param(cvk_ctx, &test_param); + if (is_valid_param == false) { + skipped_tests++; + continue; + } + + int ret2 = run_compare_conv(rt_handle, cvk_ctx, &test_param); + failed_tests = ret2 ? failed_tests + 1 : failed_tests; + ret |= ret2; + executed_tests++; + +#ifdef ENABLE_DEBUG_MSG + printf(" [%d] random test: input shape(%d, %d, %d, %d)", + executed_tests, batch, input_depth, + input_height, input_width); + printf(", kernel shape (%d, %d, %d, %d), result %d\n", + output_depth, input_depth, kernel_height, + kernel_width, ret2); +#endif + + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + + if (executed_tests >= MIN_EXEC_TESTS) { + break; + } + } + + printf( + "<= Random Test, total %d, skipped %d, executed %d, failed %d, ret %d\n", + total_tests, skipped_tests, executed_tests, failed_tests, ret); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_HANDLE rt_handle; + cvk_context_t *cvk_ctx = NULL; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + + if (!ret) + ret |= simple_test(rt_handle, cvk_ctx); + if (!ret) + ret |= simple_lshift_test(rt_handle, cvk_ctx); + if (!ret) + ret |= random_test(rt_handle, cvk_ctx); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + if (!ret) + printf("%s pass\n", __FILENAME__); + else + printf("%s fail\n", __FILENAME__); + + return ret; +} diff --git a/cviruntime/test/1822/test_cv1822_depthwise_conv.c b/cviruntime/test/1822/test_cv1822_depthwise_conv.c new file mode 100644 index 000000000..e05e3af6c --- /dev/null +++ b/cviruntime/test/1822/test_cv1822_depthwise_conv.c @@ -0,0 +1,1656 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stdint.h> +#include <stdbool.h> +#include <limits.h> +#include <time.h> +#include "test_cvikernel_util.h" +#include "test_tf_quant_util.h" +#include "test_native_ref.h" + +#define TEST_CASE_NAME "test_cv1822_dw_conv" + +// #define ENABLE_DEBUG_MSG +// #define ENABLE_FULL_REGRESSION +// #define ENABLE_TV_GEN_PATTERN + +#define MIN_EXEC_TESTS 20 + +typedef struct { + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int stride_w; + int output_c; + int output_h; + int output_w; + int has_bias; + int relu_enable; + int8_t *input_data; + 
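// These data pointers are borrowed, not owned: simple_test() aims them at stack arrays, while run_compare_dw_conv() mallocs and frees its own buffers. +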
int8_t *filter_data; + int8_t *output_data; + int32_t *bias_data; + uint32_t *multiplier_data; + int8_t *shift_data; + float float_multiplier; + int retry_cnt; +} dw_conv_test_param_t; + + +static inline int Offset(cvk_tl_shape_t shape, int i0, int i1, int i2, int i3) +{ + // return n * (shape.c * shape.h * shape.w) + c * (shape.h * shape.w) + h * + // shape.w + w; + int dims_data[4] = {shape.n, shape.c, shape.h, shape.w}; + return ((i0 * dims_data[1] + i1) * dims_data[2] + i2) * dims_data[3] + i3; +} + +void fill_random_data_s8(int8_t *input_data, int size) +{ + for (int i = 0; i < size; ++i) { + int is_saturated = ((rand() % 1000) == 1) ? 1 : 0; + int is_sign = rand() % 2 ? 1 : -1; + + if (is_saturated && is_sign < 0) { + input_data[i] = -128; + } else if (is_saturated) { + input_data[i] = 127; + } else { + input_data[i] = is_sign * rand() % 128; + } + } +} + +void fill_random_data_s32(int32_t *input_data, int size) +{ + for (int i = 0; i < size; ++i) { + int is_saturated = ((rand() % 1000) == 1) ? 1 : 0; + int is_sign = rand() % 2 ? 1 : -1; + + if (is_saturated && is_sign < 0) { + input_data[i] = INT_MIN; + } else if (is_saturated) { + input_data[i] = INT_MAX; + } else { + input_data[i] = is_sign * rand() % 128; + } + } +} + +void convert_nhwc_to_nchw(cvk_tl_shape_t tl_shape, int8_t *src, int8_t *dst) +{ + // NHWC + uint32_t src_shape_n = tl_shape.n; + uint32_t src_shape_h = tl_shape.c; + uint32_t src_shape_w = tl_shape.h; + uint32_t src_shape_c = tl_shape.w; + uint32_t src_stride_c = 1; + uint32_t src_stride_w = src_shape_c * src_stride_c; + uint32_t src_stride_h = src_shape_w * src_stride_w; + uint32_t src_stride_n = src_shape_h * src_stride_h; + + // NCHW + // uint32_t dst_shape_n = src_shape_n; + uint32_t dst_shape_c = src_shape_c; + uint32_t dst_shape_h = src_shape_h; + uint32_t dst_shape_w = src_shape_w; + uint32_t dst_stride_w = 1; + uint32_t dst_stride_h = dst_shape_w * dst_stride_w; + uint32_t dst_stride_c = dst_shape_h * dst_stride_h; + uint32_t dst_stride_n = dst_shape_c * dst_stride_c; + + printf("convert_nhwc_to_nchw:\n"); + printf(" src shape (%d, %d, %d, %d), stride (%d, %d, %d, %d)\n", src_shape_n, + src_shape_c, src_shape_h, src_shape_w, src_stride_n, src_stride_c, + src_stride_h, src_stride_w); + printf(" dst shape (%d, %d, %d, %d), stride (%d, %d, %d, %d)\n", src_shape_n, + dst_shape_c, dst_shape_h, dst_shape_w, dst_stride_n, dst_stride_c, + dst_stride_h, dst_stride_w); + + for (uint32_t i = 0; i < src_shape_n; ++i) { + for (uint32_t j = 0; j < src_shape_h; ++j) { + for (uint32_t k = 0; k < src_shape_w; ++k) { + for (uint32_t l = 0; l < src_shape_c; ++l) { + uint32_t src_offset = i * src_stride_n + j * src_stride_h + + k * src_stride_w + l * src_stride_c; + uint32_t dst_offset = i * dst_stride_n + j * dst_stride_h + + k * dst_stride_w + l * dst_stride_c; + dst[dst_offset] = src[src_offset]; + } + } + } + } +} + +int test_nhwc_to_nchw(void) +{ + int ret = 0; + + cvk_tl_shape_t shape = {2, 2, 2, 2}; + int size = shape.n * shape.c * shape.h * shape.w; + + int8_t src[2 * 2 * 2 * 2] = {1, 5, 2, 6, 3, 7, 4, 8, + 11, 15, 12, 16, 13, 17, 14, 18}; + + int8_t dst[2 * 2 * 2 * 2] = {0}; + int8_t ref_dst[2 * 2 * 2 * 2] = {1, 2, 3, 4, 5, 6, 7, 8, + 11, 12, 13, 14, 15, 16, 17, 18}; + + convert_nhwc_to_nchw(shape, src, dst); + for (int i = 0; i < size; ++i) { + if (dst[i] != ref_dst[i]) { + printf("Error! 
dst[%d] %d != %d(expected)\n", i, dst[i], ref_dst[i]); + ret = -1; + } + } + + cvk_tl_shape_t input_shape = {/*n=*/1, /*h=*/5, /*w=*/6, /*c=*/8}; + int input_size = + input_shape.n * input_shape.c * input_shape.h * input_shape.w; + int8_t nhwc_input_data[240] = { + 103, 85, -96, 120, 105, -72, 33, -50, -104, 12, -57, -80, + 12, 126, 117, 127, 119, 119, -88, 57, 120, 123, 117, -100, + -4, 76, 76, -52, -92, -127, -21, -100, 106, 35, 74, 96, + 117, 0, 39, 76, -119, -36, 89, -74, 111, 46, 45, -26, + 65, 61, 62, -7, -28, -20, 39, -84, -85, -51, 52, 76, + -120, -47, -58, 95, -117, -90, -104, 126, 82, 82, 49, -96, + -47, 67, 115, -3, -120, 41, -16, -96, -31, -75, 67, -115, + 75, -119, -81, -24, -3, -11, -14, -4, 37, 75, 53, 107, + 65, 78, -58, 52, 46, -128, 39, 53, -87, 36, -98, -12, + -1, 70, 117, 18, -41, 96, 21, 78, -71, -124, 64, 82, + -63, 82, 1, 112, 50, -23, 100, -20, 117, 20, 12, -88, + -93, 67, -90, -70, -63, 79, 87, 125, -63, -43, 80, -52, + -66, -125, 109, -73, -39, 104, -78, 89, -64, 116, 29, 71, + -7, 124, -38, -111, 84, 75, 21, 24, 12, 59, 106, 49, + -55, 46, 65, -28, 64, 15, -31, -75, 17, 7, -109, -25, + -115, -38, 7, 23, 71, -37, 111, 119, -95, -89, 17, -27, + -8, -29, -125, 58, -42, -29, -87, 109, 75, -17, -49, 92, + 7, 30, -86, -98, 26, -8, -61, -41, 39, 7, 48, 55, + 63, 125, -13, 56, -107, 105, -70, 1, 105, 14, -89, 0, + 83, -10, 9, 11, 127, -14, -108, 90, -15, 26, -101, -1}; + int8_t input_data[240]; + convert_nhwc_to_nchw(input_shape, nhwc_input_data, input_data); + printf("NCHW input_data[%d] = {\n", input_size); + for (int i = 0; i < input_size; ++i) { + printf("%d, ", input_data[i]); + if (i && ((i % 16) == 0)) { + printf("\n"); + } + } + printf("};\n\n"); + + cvk_tl_shape_t filter_shape = {1, 3, 3, 8}; + int filter_size = + filter_shape.n * filter_shape.c * filter_shape.h * filter_shape.w; + int8_t nhwc_filter_data[72] = { + 103, 85, -96, 120, 105, -72, 33, -50, -104, 12, -57, -80, + 12, 126, 117, 127, 119, 119, -88, 57, 120, 123, 117, -100, + -4, 76, 76, -52, -92, -127, -21, -100, 106, 35, 74, 96, + 117, 0, 39, 76, -119, -36, 89, -74, 111, 46, 45, -26, + 65, 61, 62, -7, -28, -20, 39, -84, -85, -51, 52, 76, + -120, -47, -58, 95, -117, -90, -104, 126, 82, 82, 49, -96}; + int8_t filter_data[72]; + convert_nhwc_to_nchw(filter_shape, nhwc_filter_data, filter_data); + printf("NCHW filter_data[%d] = {\n", filter_size); + for (int i = 0; i < filter_size; ++i) { + printf("%d, ", filter_data[i]); + if (i && ((i % 16) == 0)) { + printf("\n"); + } + } + printf("}\n\n"); + + cvk_tl_shape_t output_shape = {1, 3, 4, 8}; + int output_size = + output_shape.n * output_shape.c * output_shape.h * output_shape.w; + int8_t nhwc_output_data[96] = { + 127, 127, 69, 34, 36, 127, 127, 127, -101, -65, 39, 13, + 26, 6, 127, -67, 60, 123, 31, 17, 3, -128, -58, -64, + -128, 26, -128, -21, 72, 55, 127, 94, -46, -128, -37, 1, + -6, 109, 98, -14, -11, 48, -128, -3, -50, 37, -20, 79, + -94, -36, 127, 19, 3, -18, -40, -115, 24, 124, -128, -1, + -52, -123, -54, -1, -62, 95, 127, 24, 10, -74, 127, -128, + -2, 111, 106, 4, 3, -128, 127, 127, -30, 98, -21, -1, + -11, -12, 58, -72, -128, 127, 30, 32, -85, -11, -35, 34}; + int8_t output_data[96] = {0}; + convert_nhwc_to_nchw(output_shape, nhwc_output_data, output_data); + printf("NCHW output_data[%d] = {\n", output_size); + for (int i = 0; i < output_size; ++i) { + printf("%d, ", output_data[i]); + if (i && ((i % 16) == 0)) { + printf("\n"); + } + } + printf("};\n\n"); + + return ret; +} + +int simple_nhwc_dw_conv_test(CVI_RT_HANDLE rt_handle, cvk_context_t 
*cvk_ctx) +{ + int ret = 0; + + const int stride_width = 1; + const int stride_height = 1; + const int dilation_width_factor = 1; + const int dilation_height_factor = 1; + const int pad_width = 0; + const int pad_height = 0; + const int depth_multiplier = 1; + const int input_offset = 0; // symmetric + const int output_offset = 0; // symmetric + const int output_activation_min = -128; + const int output_activation_max = 127; + + if (rt_handle == NULL) { + return -1; + } + if (cvk_ctx == NULL) { + return -1; + } + + cvk_tl_shape_t input_shape = {/*n=*/1, /*h=*/5, /*w=*/6, /*c=*/8}; + int8_t input_data[240] = { + 103, 85, -96, 120, 105, -72, 33, -50, -104, 12, -57, -80, + 12, 126, 117, 127, 119, 119, -88, 57, 120, 123, 117, -100, + -4, 76, 76, -52, -92, -127, -21, -100, 106, 35, 74, 96, + 117, 0, 39, 76, -119, -36, 89, -74, 111, 46, 45, -26, + 65, 61, 62, -7, -28, -20, 39, -84, -85, -51, 52, 76, + -120, -47, -58, 95, -117, -90, -104, 126, 82, 82, 49, -96, + -47, 67, 115, -3, -120, 41, -16, -96, -31, -75, 67, -115, + 75, -119, -81, -24, -3, -11, -14, -4, 37, 75, 53, 107, + 65, 78, -58, 52, 46, -128, 39, 53, -87, 36, -98, -12, + -1, 70, 117, 18, -41, 96, 21, 78, -71, -124, 64, 82, + -63, 82, 1, 112, 50, -23, 100, -20, 117, 20, 12, -88, + -93, 67, -90, -70, -63, 79, 87, 125, -63, -43, 80, -52, + -66, -125, 109, -73, -39, 104, -78, 89, -64, 116, 29, 71, + -7, 124, -38, -111, 84, 75, 21, 24, 12, 59, 106, 49, + -55, 46, 65, -28, 64, 15, -31, -75, 17, 7, -109, -25, + -115, -38, 7, 23, 71, -37, 111, 119, -95, -89, 17, -27, + -8, -29, -125, 58, -42, -29, -87, 109, 75, -17, -49, 92, + 7, 30, -86, -98, 26, -8, -61, -41, 39, 7, 48, 55, + 63, 125, -13, 56, -107, 105, -70, 1, 105, 14, -89, 0, + 83, -10, 9, 11, 127, -14, -108, 90, -15, 26, -101, -1}; + + cvk_tl_shape_t filter_shape = {1, 3, 3, 8}; + int8_t filter_data[72] = { + 103, 85, -96, 120, 105, -72, 33, -50, -104, 12, -57, -80, + 12, 126, 117, 127, 119, 119, -88, 57, 120, 123, 117, -100, + -4, 76, 76, -52, -92, -127, -21, -100, 106, 35, 74, 96, + 117, 0, 39, 76, -119, -36, 89, -74, 111, 46, 45, -26, + 65, 61, 62, -7, -28, -20, 39, -84, -85, -51, 52, 76, + -120, -47, -58, 95, -117, -90, -104, 126, 82, 82, 49, -96}; + + int32_t bias_data[8] = {812, 670, -746, 938, 827, -558, 265, -384}; + + uint32_t output_multiplier[8] = {1155460505, 1210948247, 1203328687, 1166122678, + 1155273687, 1196350022, 1169748238, 1183287581}; + + int8_t output_rshift[8] = {-7, -6, -6, -9, -8, -6, -6, -7}; + + cvk_tl_shape_t output_shape = {1, 3, 4, 8}; + int8_t output_data[96] = {0}; + int8_t ref_output_data[96] = { + 127, 127, 69, 34, 36, 127, 127, 127, -101, -65, 39, 13, + 26, 6, 127, -67, 60, 123, 31, 17, 3, -128, -58, -64, + -128, 26, -128, -21, 72, 55, 127, 94, -46, -128, -37, 1, + -6, 109, 98, -14, -11, 48, -128, -3, -50, 37, -20, 79, + -94, -36, 127, 19, 3, -18, -40, -115, 24, 124, -128, -1, + -52, -123, -54, -1, -62, 95, 127, 24, 10, -74, 127, -128, + -2, 111, 106, 4, 3, -128, 127, 127, -30, 98, -21, -1, + -11, -12, 58, -72, -128, 127, 30, 32, -85, -11, -35, 34}; + + const int batches = input_shape.n; + // const int output_depth = 8; + const int input_height = input_shape.c; + const int input_width = input_shape.h; + const int input_depth = input_shape.w; + const int filter_height = filter_shape.c; + const int filter_width = filter_shape.h; + const int output_height = output_shape.c; + const int output_width = output_shape.h; + + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < 
output_width; ++out_x) { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) { + for (int m = 0; m < depth_multiplier; ++m) { + const int output_channel = m + in_channel * depth_multiplier; + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; + // Zero padding by omitting the areas outside the image. + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + if (is_point_inside_image) { + int32_t input_val = input_data[Offset(input_shape, batch, in_y, + in_x, in_channel)]; + int32_t filter_val = filter_data[Offset( + filter_shape, 0, filter_y, filter_x, output_channel)]; + acc += filter_val * (input_val + input_offset); + + printf(" [batch=%d][out_y=%d][out_x=%d][in_channel=%d][m=%d]" + "[filter_y=%d][filter_x=%d] acc(%d) += %d * (%d + %d) " + "= %d\n", + batch, out_y, out_x, in_channel, m, filter_y, filter_x, + acc - filter_val * (input_val + input_offset), + filter_val, input_val, input_offset, acc); + } + } + } + if (1 /*bias_data*/) { + acc += bias_data[output_channel]; + } + + printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = " + "%d, bias %d\n", + batch, out_y, out_x, output_channel, acc, + bias_data[output_channel]); + + acc = MultiplyByQuantizedMultiplier( + acc, output_multiplier[output_channel], + output_rshift[output_channel]); + + printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = " + "%d, multiplier %d, shift %d\n", + batch, out_y, out_x, output_channel, acc, + output_multiplier[output_channel], + output_rshift[output_channel]); + + acc += output_offset; + acc = MAX(acc, output_activation_min); + acc = MIN(acc, output_activation_max); + + printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = " + "%d\n", + batch, out_y, out_x, output_channel, acc); + + { + int x = Offset(output_shape, batch, out_y, out_x, output_channel); + if (x >= 96) { + printf("Error ! 
shape=(%d, %d, %d, %d), batch %d, out_y %d, " + "out_x %d, output_channel %d, offset %d\n", + output_shape.n, output_shape.c, output_shape.h, + output_shape.w, batch, out_y, out_x, output_channel, x); + } + } + + output_data[Offset(output_shape, batch, out_y, out_x, + output_channel)] = acc; + } + } + } + } + } + + int output_size = + output_shape.n * output_shape.c * output_shape.h * output_shape.w; + for (int i = 0; i < output_size; ++i) { + if (output_data[i] != ref_output_data[i]) { + printf(" output_data[%d] = %d != %d\n", i, output_data[i], + ref_output_data[i]); + ret = -1; + } + } + + return ret; +} + +typedef struct { + int stride_width; + int stride_height; + int dilation_width_factor; + int dilation_height_factor; + int padding_width; + int padding_height; + int depth_multiplier; +} DwConvParams; + +void dw_conv_per_channel_ref(const dw_conv_test_param_t *p_param) +{ + const int input_offset = 0; // symmetric + const int output_offset = 0; // symmetric + const int output_activation_min = -128; + const int output_activation_max = 127; + + const int stride_width = p_param->stride_w; + const int stride_height = p_param->stride_h; + const int dilation_width_factor = 1; // params.dilation_width_factor; + const int dilation_height_factor = 1; // params.dilation_height_factor; + const int pad_width = p_param->pad_left; + const int pad_height = p_param->pad_top; + const int depth_multiplier = 1; // params.depth_multiplier; + + const int batches = p_param->input_n; + const int input_height = p_param->input_h; + const int input_width = p_param->input_w; + const int input_depth = p_param->input_c; + const int filter_height = p_param->kh; + const int filter_width = p_param->kw; + const int output_depth = p_param->output_c; + const int output_height = p_param->output_h; + const int output_width = p_param->output_w; + int8_t *input_data = p_param->input_data; + int8_t *filter_data = p_param->filter_data; + int8_t *output_data = p_param->output_data; + int32_t *bias_data = p_param->has_bias ? 
p_param->bias_data : NULL; + uint32_t *output_multiplier = p_param->multiplier_data; + int8_t *output_rshift = p_param->shift_data; + + cvk_tl_shape_t input_shape = { + batches, input_depth, input_height, input_width}; + cvk_tl_shape_t filter_shape = { + output_depth, input_depth, filter_height, filter_width}; + cvk_tl_shape_t output_shape = { + batches, output_depth, output_height, output_width}; + +#ifdef ENABLE_DEBUG_MSG + printf("dw_conv_per_channel_ref =>\n"); + printf(" input shape (n=%d, c=%d, h=%d, w=%d)\n", batches, input_depth, + input_height, input_width); + // printf(" filter shape (oc=%d, kh=%d, kw=%d\n", + // ); + printf(" output shape (n=%d, c=%d, h=%d, w=%d)\n", batches, output_depth, + output_height, output_width); + printf(" stride_h %d, stride_w %d\n", stride_height, stride_width); +#endif + + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) { + for (int m = 0; m < depth_multiplier; ++m) { + const int output_channel = m + in_channel * depth_multiplier; + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; + // Zero padding by omitting the areas outside the image. + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + if (is_point_inside_image) { + int32_t input_val = input_data[Offset(input_shape, batch, + in_channel, in_y, in_x)]; + int32_t filter_val = filter_data[Offset( + filter_shape, 0, output_channel, filter_y, filter_x)]; + acc += filter_val * (input_val + input_offset); + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_y=%d][out_x=%d][in_channel=%d][m=%d]" + "[filter_y=%d][filter_x=%d] acc(%d) += %d * (%d + %d) " + "= %d, in_x_origin %d, in_x %d\n", + batch, out_y, out_x, in_channel, m, filter_y, filter_x, + acc - filter_val * (input_val + input_offset), + filter_val, input_val, input_offset, acc, in_x_origin, + in_x); +#endif + } + } + } + if (bias_data) { + acc += bias_data[output_channel]; + } + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = " + "%d, bias %d\n", + batch, out_y, out_x, output_channel, acc, + bias_data ? 
bias_data[output_channel] : 0); +#endif + + acc = MultiplyByQuantizedMultiplier( + acc, output_multiplier[output_channel], + output_rshift[output_channel]); + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = " + "%d, multiplier %d, shift %d\n", + batch, out_y, out_x, output_channel, acc, + output_multiplier[output_channel], + output_rshift[output_channel]); +#endif + + acc += output_offset; + acc = MAX(acc, output_activation_min); + acc = MIN(acc, output_activation_max); + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = " + "%d\n", + batch, out_y, out_x, output_channel, acc); +#endif + + output_data[Offset(output_shape, batch, output_channel, out_y, + out_x)] = acc; + } + } + } + } + } + +#ifdef ENABLE_DEBUG_MSG + printf("<= dw_conv_per_channel_ref\n"); +#endif +} + +void calc_dw_conv_float_multiplier(dw_conv_test_param_t *p_param) +{ + const int input_offset = 0; // symmetric + + const int stride_width = p_param->stride_w; + const int stride_height = p_param->stride_h; + const int dilation_width_factor = 1; // params.dilation_width_factor; + const int dilation_height_factor = 1; // params.dilation_height_factor; + const int pad_width = p_param->pad_left; + ; + const int pad_height = p_param->pad_top; + const int depth_multiplier = 1; // params.depth_multiplier; + + const int batches = p_param->input_n; + const int input_height = p_param->input_h; + const int input_width = p_param->input_w; + const int input_depth = p_param->input_c; + const int filter_height = p_param->kh; + const int filter_width = p_param->kw; + const int output_depth = p_param->output_c; + const int output_height = p_param->output_h; + const int output_width = p_param->output_w; + int8_t *input_data = p_param->input_data; + int8_t *filter_data = p_param->filter_data; + int32_t *bias_data = p_param->has_bias ? p_param->bias_data : NULL; + + cvk_tl_shape_t input_shape = { + batches, input_depth, input_height, input_width}; + cvk_tl_shape_t filter_shape = { + output_depth, input_depth, filter_height, filter_width}; + + int output_accu_min = INT_MAX; + int output_accu_max = INT_MIN; + + // printf("calc_dw_conv_float_multiplier =>\n"); + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) { + for (int m = 0; m < depth_multiplier; ++m) { + const int output_channel = m + in_channel * depth_multiplier; + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; + // Zero padding by omitting the areas outside the image. 
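+ // (With input_offset == 0, skipping out-of-range taps is equivalent to reading a zero-padded input.)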
+ const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + if (is_point_inside_image) { + int32_t input_val = input_data[Offset(input_shape, batch, + in_channel, in_y, in_x)]; + int32_t filter_val = filter_data[Offset( + filter_shape, 0, output_channel, filter_y, filter_x)]; + acc += filter_val * (input_val + input_offset); + + // printf(" + // [batch=%d][out_y=%d][out_x=%d][in_channel=%d][m=%d]" + // "[filter_y=%d][filter_x=%d] acc(%d) += %d * (%d + + // %d) = %d\n", + // batch, out_y, out_x, in_channel, m, filter_y, + // filter_x, acc - filter_val * (input_val + + // input_offset), filter_val, input_val, input_offset, + // acc); + } + } + } + if (bias_data) { + acc += bias_data[output_channel]; + } + + output_accu_max = MAX(acc, output_accu_max); + output_accu_min = MIN(acc, output_accu_min); + + // printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = + // %d, MIN = %d, MAX = %d\n", + // batch, out_y, out_x, output_channel, acc, + // output_accu_min, output_accu_max); + } + } + } + } + } + + // Since int8 ranges from -128 to 127, we need to squeeze the accumulator + // MIN/MAX fit in those ranges correspondingly as much as possible. + if (abs(output_accu_max) > abs(output_accu_min)) { + p_param->float_multiplier = 127.0f / abs(output_accu_max); + } else { + p_param->float_multiplier = 128.0f / abs(output_accu_min); + } + +#ifdef ENABLE_DEBUG_MSG + printf(" output_accu_min %d, output_accu_max %d, output_multiplier %f\n", + output_accu_min, output_accu_max, p_param->float_multiplier); +#endif + + // printf("<= calc_dw_conv_float_multiplier\n"); +} + +static int simple_test(CVI_RT_HANDLE *rt_handler, cvk_context_t *cvk_ctx) +{ + int ret = 0; + + const int batches = 1; + const int input_depth = 8; + const int input_height = 5; + const int input_width = 6; + cvk_tl_shape_t input_shape = + {batches, input_depth, input_height, input_width}; + int8_t input_data[240] = { + /* ic = 0 */ + 103, -104, 119, -4, 106, -119, 65, -85, -117, -47, -31, -3, 65, -87, -41, + -63, 117, -63, -66, -64, 84, -55, 17, 71, -8, 75, 26, 63, 105, 127, + + /* ic = 1 */ + 85, 12, 119, 76, 35, -36, 61, -51, -90, 67, -75, -11, 78, 36, 96, 82, 20, + 79, -125, 116, 75, 46, 7, -37, -29, -17, -8, 125, 14, -14, + + /* ic = 2 */ + -96, -57, -88, 76, 74, 89, 62, 52, -104, 115, 67, -14, -58, -98, 21, 1, + 12, 87, 109, 29, 21, 65, -109, 111, -125, -49, -61, -13, -89, -108, + + /* ic = 3 */ + 120, -80, 57, -52, 96, -74, -7, 76, 126, -3, -115, -4, 52, -12, 78, 112, + -88, 125, -73, 71, 24, -28, -25, 119, 58, 92, -41, 56, 0, 90, + + /* ic = 4 */ + 105, 12, 120, -92, 117, 111, -28, -120, 82, -120, 75, 37, 46, -1, -71, 50, + -93, -63, -39, -7, 12, 64, -115, -95, -42, 7, 39, -107, 83, -15, + + /* ic = 5 */ + -72, 126, 123, -127, 0, 46, -20, -47, 82, 41, -119, 75, -128, 70, -124, + -23, 67, -43, 104, 124, 59, 15, -38, -89, -29, 30, 7, 105, -10, 26, + + /* ic = 6 */ + 33, 117, 117, -21, 39, 45, 39, -58, 49, -16, -81, 53, 39, 117, 64, 100, + -90, 80, -78, -38, 106, -31, 7, 17, -87, -86, 48, -70, 9, -101, + + /* ic = 7 */ + -50, 127, -100, -100, 76, -26, -84, 95, -96, -96, -24, 107, 53, 18, 82, + -20, -70, -52, 89, -111, 49, -75, 23, -27, 109, -98, 55, 1, 11, -1}; + + const int kernel_height = 3; + const int kernel_width = 3; + cvk_tl_shape_t filter_shape_tiu = + {1, input_depth, kernel_height, kernel_width}; + cvk_tl_shape_t filter_shape_dma = + {1, input_depth, kernel_height, kernel_width}; + // Global memory layout: OcKhKw + int8_t filter_data[72] = { + 103, -104, 
119, -4, 106, -119, 65, -85, -117, 85, 12, 119, + 76, 35, -36, 61, -51, -90, -96, -57, -88, 76, 74, 89, + 62, 52, -104, 120, -80, 57, -52, 96, -74, -7, 76, 126, + 105, 12, 120, -92, 117, 111, -28, -120, 82, -72, 126, 123, + -127, 0, 46, -20, -47, 82, 33, 117, 117, -21, 39, 45, + 39, -58, 49, -50, 127, -100, -100, 76, -26, -84, 95, -96}; + + int32_t bias_data[8] = {812, 670, -746, 938, 827, -558, 265, -384}; + + uint32_t output_multiplier[8] = { + 1155460505, 1210948247, 1203328687, 1166122678, + 1155273687, 1196350022, 1169748238, 1183287581}; + + // Change to right shift + int8_t output_rshift[8] = {7, 6, 6, 9, 8, 6, 6, 7}; + + uint8_t chl_quan_data[8 * 4 + 8 * 4 + 8]; + pack_chl_quan_param(8, /*has_bias=*/true, bias_data, output_multiplier, + output_rshift, chl_quan_data); + + const int output_height = 3; + const int output_width = 4; + cvk_tl_shape_t output_shape = + {batches, input_depth, output_height, output_width}; + int8_t ref_output_data[96] = { + /* oc = 0 */ + 127, -101, 60, -128, -46, -11, -94, 24, -62, -2, -30, -128, + + /* oc = 1 */ + 127, -65, 123, 26, -128, 48, -36, 124, 95, 111, 98, 127, + + /* oc = 2 */ + 69, 39, 31, -128, -37, -128, 127, -128, 127, 106, -21, 30, + + /* oc = 3 */ + 34, 13, 17, -21, 1, -3, 19, -1, 24, 4, -1, 32, + + /* oc = 4 */ + 36, 26, 3, 72, -6, -50, 3, -52, 10, 3, -11, -85, + + /* oc = 5 */ + 127, 6, -128, 55, 109, 37, -18, -123, -74, -128, -12, -11, + + /* oc = 6 */ + 127, 127, -58, 127, 98, -20, -40, -54, 127, 127, 58, -35, + + /* oc = 7 */ + 127, -67, -64, 94, -14, 79, -115, -1, -128, 127, -72, 34}; + + cvk_tl_t *tl_input = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, input_shape, CVK_FMT_I8, + /*eu_aign=*/1); + + cvk_tl_t *tl_filter_dma = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, filter_shape_dma, CVK_FMT_I8, + /*eu_align=*/1); + + cvk_tl_t *tl_output = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, output_shape, CVK_FMT_I8, + /*eu_align=*/1); + + cvk_tl_shape_t chl_quan_shape_dma = {1, 8, 1, 9}; + cvk_tl_shape_t chl_quan_shape_tiu = {1, 8, 1, 1}; + cvk_tl_t *tl_chl_quan_dma = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, chl_quan_shape_dma, CVK_FMT_U8, + /*eu_align*/ 0); + + tensor_copy_s2d_g2l(rt_handler, cvk_ctx, tl_chl_quan_dma, chl_quan_data); + tensor_copy_s2d_g2l(rt_handler, cvk_ctx, tl_input, (uint8_t *)input_data); + tensor_copy_s2d_g2l(rt_handler, cvk_ctx, tl_filter_dma, + (uint8_t *)filter_data); + + { + cvk_tl_t tl_filter_tiu; + memset(&tl_filter_tiu, 0, sizeof(tl_filter_tiu)); + tl_filter_tiu.start_address = tl_filter_dma->start_address; + tl_filter_tiu.fmt = tl_filter_dma->fmt; + tl_filter_tiu.shape.n = filter_shape_tiu.n; + tl_filter_tiu.shape.c = filter_shape_tiu.c; + tl_filter_tiu.shape.h = filter_shape_tiu.h; + tl_filter_tiu.shape.w = filter_shape_tiu.w; + tl_filter_tiu.stride = + cvk_ctx->ops->tl_default_stride(cvk_ctx, tl_filter_tiu.shape, + CVK_FMT_I8, /*eu_align=*/1); + + cvk_tl_t tl_chl_quan_tiu; + memset(&tl_chl_quan_tiu, 0, sizeof(tl_chl_quan_tiu)); + tl_chl_quan_tiu.start_address = tl_chl_quan_dma->start_address; + tl_chl_quan_tiu.fmt = tl_chl_quan_dma->fmt; + tl_chl_quan_tiu.shape.n = chl_quan_shape_tiu.n; + tl_chl_quan_tiu.shape.c = chl_quan_shape_tiu.c; + tl_chl_quan_tiu.shape.h = chl_quan_shape_tiu.h; + tl_chl_quan_tiu.shape.w = chl_quan_shape_tiu.w; + tl_chl_quan_tiu.stride = cvk_ctx->ops->tl_default_stride( + cvk_ctx, tl_chl_quan_tiu.shape, CVK_FMT_I8, /*eu_align=*/0); + + cvk_tiu_depthwise_convolution_param_t param; + memset(¶m, 0, sizeof(param)); + param.ofmap = tl_output; + param.ifmap = tl_input; + param.weight = 
&tl_filter_tiu; + param.chl_quan_param = &tl_chl_quan_tiu; + param.dilation_h = 1; + param.dilation_w = 1; + param.stride_h = 1; + param.stride_w = 1; + param.has_bias = 1; + cvk_ctx->ops->tiu_depthwise_convolution(cvk_ctx, ¶m); + } + + CVI_RT_Submit(cvk_ctx); + + printf("dw-conv simple :compare tiu and golden\n"); + int8_t *conv_output_data = + (int8_t *)tensor_copy_l2g_d2s(rt_handler, cvk_ctx, tl_output); + for (int i = 0; i < (int)sizeof(ref_output_data); i++) { + if (conv_output_data[i] != ref_output_data[i]) { + printf(" output_data[%d] %d != %d\n", i, conv_output_data[i], + ref_output_data[i]); + ret = -1; + } + } + + free(conv_output_data); + + int8_t output_data[96] = {0}; + memset(output_data, 0, sizeof(output_data)); + + dw_conv_test_param_t params; + memset(¶ms, 0, sizeof(params)); + params.input_n = batches; + params.input_c = input_depth; + params.input_h = input_height; + params.input_w = input_width; + params.kh = kernel_height; + params.kw = kernel_width; + params.output_c = input_depth; + params.output_h = output_height; + params.output_w = output_width; + params.stride_w = 1; + params.stride_h = 1; + params.input_data = input_data; + params.filter_data = filter_data; + params.output_data = output_data; + params.has_bias = 1; + params.bias_data = bias_data; + params.multiplier_data = output_multiplier; + params.shift_data = output_rshift; + + dw_conv_per_channel_ref(¶ms); + + printf("dw-conv simple: compare ref and golden\n"); + int output_size = + output_shape.n * output_shape.c * output_shape.h * output_shape.w; + for (int i = 0; i < output_size; ++i) { + if (output_data[i] != ref_output_data[i]) { + printf(" output_data[%d] = %d != %d\n", i, output_data[i], + ref_output_data[i]); + ret = -1; + } + } + + // Reverse order + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_chl_quan_dma); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_filter_dma); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input); + + return ret; +} + +#if 0 +static int simple_lshift_test(CVI_RT_HANDLE *bm_ctx, cvk_context_t *cvk_ctx) +{ + int ret = 0; + + (void)bm_ctx; + (void)cvk_ctx; + +DepthwiseConvPerChannelTest: + input_shape (1, 6, 5, 8) + filter_shape (1, 3, 3, 8) + output_shape (1, 6, 5, 8) + input offset 0, output_offset 0, output_multiplier 0x55663ec39de0, output_activation_min -128, output_activation_max 127 +input_data[240] = { +105, -72, 33, -50, -104, 12, -57, -80, 12, 126, 117, 127, 119, 119, -88, 57, 120, +123, 117, -100, -4, 76, 76, -52, -92, -127, -21, -100, 106, 35, 74, 96, 117, +0, 39, 76, -119, -36, 89, -74, 111, 46, 45, -26, 65, 61, 62, -7, -28, +-20, 39, -84, -85, -51, 52, 76, -120, -47, -58, 95, -117, -90, -104, 126, 82, +82, 49, -96, -47, 67, 115, -3, -120, 41, -16, -96, -31, -75, 67, -115, 75, +-119, -81, -24, -3, -11, -14, -4, 37, 75, 53, 107, 65, 78, -58, 52, 46, +-128, 39, 53, -87, 36, -98, -12, -1, 70, 117, 18, -41, 96, 21, 78, -71, +-124, 64, 82, -63, 82, 1, 112, 50, -23, 100, -20, 117, 20, 12, -88, -93, +67, -90, -70, -63, 79, 87, 125, -63, -43, 80, -52, -66, -125, 109, -73, -39, +104, -78, 89, -64, 116, 29, 71, -7, 124, -38, -111, 84, 75, 21, 24, 12, +59, 106, 49, -55, 46, 65, -28, 64, 15, -31, -75, 17, 7, -109, -25, -115, +-38, 7, 23, 71, -37, 111, 119, -95, -89, 17, -27, -8, -29, -125, 58, -42, +-29, -87, 109, 75, -17, -49, 92, 7, 30, -86, -98, 26, -8, -61, -41, 39, +7, 48, 55, 63, 125, -13, 56, -107, 105, -70, 1, 105, 14, -89, 0, 83, +-10, 9, 11, 127, -14, -108, 90, -15, 26, -101, -1, 118, 122, -127, -120, }; 
+filter_data[72] = { +105, -72, 33, -50, -104, 12, -57, -80, 12, 126, 117, 127, 119, 119, -88, 57, 120, +123, 117, -100, -4, 76, 76, -52, -92, -127, -21, -100, 106, 35, 74, 96, 117, +0, 39, 76, -119, -36, 89, -74, 111, 46, 45, -26, 65, 61, 62, -7, -28, +-20, 39, -84, -85, -51, 52, 76, -120, -47, -58, 95, -117, -90, -104, 126, 82, +82, 49, -96, -47, 67, 115, -3, }; +bias_data[8] = { +827, -558, 265, -384, -805, 94, -443, -624, }; +output_multiplier[8] = { +1610079514, 1623689331, 1597316687, 1626449885, 1593210715, 1596467067, 1646211112, 1593791522, }; +output_shift[8] = { +9, -2, 7, -7, 4, 10, 8, -5, }; + + return ret; +} +#endif + +int choose_from_range(int table[], int size, int index) +{ + if (index >= size) { + return 0; + } + + int val = table[index]; + if (index < (size - 1)) { + int range = MAX(table[index + 1] - table[index] - 1, 1); + val += rand() % range; + } + + return val; +} + +void dump_test_param(dw_conv_test_param_t *p_param, bool dump_content) +{ + printf("Dump test parameter:\n"); + printf(" input_n %d\n", p_param->input_n); + printf(" input_c %d\n", p_param->input_c); + printf(" input_h %d\n", p_param->input_h); + printf(" input_w %d\n", p_param->input_w); + printf(" kw %d\n", p_param->kw); + printf(" kh %d\n", p_param->kh); + printf(" dh %d\n", p_param->dh); + printf(" dw %d\n", p_param->dw); + printf(" pad_top %d\n", p_param->pad_top); + printf(" pad_bot %d\n", p_param->pad_bot); + printf(" pad_left %d\n", p_param->pad_left); + printf(" pad_right %d\n", p_param->pad_right); + printf(" ins_h %d\n", p_param->ins_h); + printf(" ins_h_last %d\n", p_param->ins_h_last); + printf(" ins_w %d\n", p_param->ins_w); + printf(" ins_w_last %d\n", p_param->ins_w_last); + printf(" stride_h %d\n", p_param->stride_h); + printf(" stride_w %d\n", p_param->stride_w); + printf(" output_c %d\n", p_param->output_c); + printf(" output_h %d\n", p_param->output_h); + printf(" output_w %d\n", p_param->output_w); + printf(" has_bias %d\n", p_param->has_bias); + printf(" relu_enable %d\n", p_param->relu_enable); + + if (dump_content) { + printf("input_data(%d, %d, %d, %d) :\n", p_param->input_n, p_param->input_c, + p_param->input_h, p_param->input_w); + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + for (int i = 0; i < in; ++i) { + for (int j = 0; j < ic; ++j) { + for (int k = 0; k < ih; ++k) { + for (int l = 0; l < iw; ++l) { + int offset = i * (ic * ih * iw) + j * (ih * iw) + k * iw + l; + printf("%d, ", p_param->input_data[offset]); + } + printf("\n"); + } + } + } + printf("\n\n"); + + printf("kener_data (%d, %d, %d)\n", p_param->output_c, p_param->kh, + p_param->kw); + int kh = p_param->kh; + int kw = p_param->kw; + for (int i = 0; i < ic; ++i) { + for (int j = 0; j < kh; ++j) { + for (int k = 0; k < kw; ++k) { + int offset = i * (kh * kw) + j * kw + k; + printf("%d, ", p_param->filter_data[offset]); + } + } + printf("\n"); + } + printf("\n\n"); + + if (p_param->has_bias) { + printf("bias_data:\n"); + for (int i = 0; i < ic; ++i) { + printf("%d, ", p_param->bias_data[i]); + } + printf("\n\n"); + } + + printf("multiplier_data:\n"); + for (int i = 0; i < ic; ++i) { + printf("%d, ", p_param->multiplier_data[i]); + } + printf("\n\n"); + + printf("shift_data:\n"); + for (int i = 0; i < ic; ++i) { + printf("%d, ", p_param->shift_data[i]); + } + printf("\n\n"); + } +} + +int run_compare_dw_conv(CVI_RT_HANDLE *rt_handler, cvk_context_t *cvk_ctx, + dw_conv_test_param_t *p_param) +{ + int ret = 0; + + if (rt_handler == NULL || cvk_ctx 
== NULL) { + return -1; + } + + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int oh = p_param->output_h; + int ow = p_param->output_w; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_last_h = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_last_w = p_param->ins_w_last; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int has_bias = p_param->has_bias; + int relu_enable = p_param->relu_enable; + + int input_size = in * ic * iw * ih; + int8_t *input_data = (int8_t *)malloc(input_size); + + int kernel_size = oc * ic * kh * kw; + int8_t *kernel_data = (int8_t *)malloc(kernel_size); + + int output_size = in * oc * oh * ow; + int8_t *output_data = (int8_t *)malloc(output_size); + if (!input_data || !kernel_data || !output_data) { + free(input_data); + free(kernel_data); + free(output_data); + return -1; + } + + memset(output_data, 0, output_size); + + int32_t *bias_data = (int32_t *)malloc(sizeof(int32_t) * oc); + uint32_t *multiplier_data = (uint32_t *)malloc(sizeof(uint32_t) * oc); + int8_t *shift_data = (int8_t *)malloc(oc); + + p_param->input_data = input_data; + p_param->filter_data = kernel_data; + p_param->output_data = output_data; + p_param->has_bias = has_bias; + p_param->bias_data = bias_data; + p_param->multiplier_data = multiplier_data; + p_param->shift_data = shift_data; + +#ifdef ENABLE_DEBUG_MSG + printf(" run_compare_dw_conv =>\n"); + printf(" input (n=%d, ic=%d, h=%d, w=%d), kernel (oc=%d, ic=%d, h=%d, " + "w=%d), output (, c=%d, h=%d, w=%d), has_bias %d\n", + in, ic, ih, iw, oc, ic, kh, kw, oc, oh, ow, has_bias); +#endif + + int retry_cnt = p_param->retry_cnt; + do { + fill_random_data_s8(input_data, input_size); + fill_random_data_s8(kernel_data, kernel_size); + if (has_bias) { + fill_random_data_s32(bias_data, oc); + } + + p_param->float_multiplier = 100.0; // should be < 1.0 + calc_dw_conv_float_multiplier(p_param); + + if (p_param->float_multiplier > 0.f && p_param->float_multiplier < 1.0) { + break; + } + + } while (--retry_cnt); + + if (p_param->float_multiplier >= 1.0) { + printf(" run_compare_dw_conv: unable to find valid multiplier\n"); + free(input_data); + free(kernel_data); + free(output_data); + free(bias_data); + free(multiplier_data); + free(shift_data); + return -1; + } + + uint32_t base_multiplier = 0; + int base_shift = 0; + QuantizeMultiplierSmallerThanOne(p_param->float_multiplier, &base_multiplier, + &base_shift); + + for (int i = 0; i < oc; ++i) { + // multipliers typically range in [2^30 ; 2^31 - 1]. + // Values in [0, 2^30 - 1] are normally unused, but harmless. + // Thus a good way to randomize multipliers is to subtract from them + // a random value smaller than 2^30 but still significant compared to it. + p_param->multiplier_data[i] = base_multiplier - (rand() % (1 << 26)); + + int right_shift = base_shift - 1 + (rand() % 4); + p_param->shift_data[i] = + truncate_rshift((int8_t)right_shift, /*allow_lshift*/1); + +#ifdef ENABLE_DEBUG_MSG + printf(" [oc=%d] multiplier_data %d, shift_data %d\n", i, + p_param->multiplier_data[i], p_param->shift_data[i]); +#endif + } + + dw_conv_per_channel_ref(p_param); + + const int chl_quan_size_dma = + p_param->has_bias ? 
9 : 5; // bias(4) + multiplier(4) + shift(1) + const int cal_data_size = oc * chl_quan_size_dma; + uint8_t *chl_quan_data = (uint8_t *)malloc(cal_data_size); + pack_chl_quan_param(oc, p_param->has_bias, p_param->bias_data, + p_param->multiplier_data, p_param->shift_data, + chl_quan_data); + + cvk_tl_shape_t input_shape = {in, ic, ih, iw}; + cvk_tl_shape_t filter_shape = {1, oc, kh, kw}; + cvk_tl_shape_t output_shape = {in, oc, oh, ow}; + cvk_tl_shape_t chl_quan_shape_dma = {1, oc, 1, chl_quan_size_dma}; + cvk_tl_shape_t chl_quan_shape_tiu = {1, oc, 1, 1}; + + cvk_tl_t *tl_input = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, input_shape, CVK_FMT_I8, + /*eu_align=*/1); + + cvk_tl_t *tl_filter = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, filter_shape, CVK_FMT_I8, + /*eu_align=*/1); + + cvk_tl_t *tl_output = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, output_shape, CVK_FMT_I8, + /*eu_align=*/1); + + // Shape for TDMA load + cvk_tl_t *tl_chl_quan_dma = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, chl_quan_shape_dma, CVK_FMT_U8, + /*eu_align*/ 0); + + if (!tl_input || !tl_filter || !tl_output || !tl_chl_quan_dma) { + if (tl_input == NULL) { + printf(" fail to alloc tl_input (%d, %d, %d, %d)\n", + input_shape.n, input_shape.c, input_shape.h, input_shape.w); + } + if (tl_filter == NULL) { + printf(" fail to alloc tl_filter (%d, %d, %d, %d)\n", + filter_shape.n, filter_shape.c, filter_shape.h, filter_shape.w); + } + if (tl_output == NULL) { + printf(" fail to alloc tl_output (%d, %d, %d, %d)\n", + output_shape.n, output_shape.c, output_shape.h, output_shape.w); + } + if (tl_chl_quan_dma == NULL) { + printf(" fail to alloc tl_chl_quan_dma (%d, %d, %d, %d)\n", + chl_quan_shape_dma.n, chl_quan_shape_dma.c, chl_quan_shape_dma.h, + chl_quan_shape_dma.w); + } + + // Reverse order + if (tl_chl_quan_dma) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_chl_quan_dma); + if (tl_output) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output); + if (tl_filter) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_filter); + if (tl_input) + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input); + + return -1; + } + + + tensor_copy_s2d_g2l(rt_handler, cvk_ctx, tl_chl_quan_dma, chl_quan_data); + tensor_copy_s2d_g2l(rt_handler, cvk_ctx, tl_input, (uint8_t *)input_data); + tensor_copy_s2d_g2l(rt_handler, cvk_ctx, tl_filter, (uint8_t *)kernel_data); + + { + cvk_tl_t tl_chl_quan_tiu; + memset(&tl_chl_quan_tiu, 0, sizeof(tl_chl_quan_tiu)); + tl_chl_quan_tiu.start_address = tl_chl_quan_dma->start_address; + tl_chl_quan_tiu.fmt = tl_chl_quan_dma->fmt; + tl_chl_quan_tiu.shape.n = chl_quan_shape_tiu.n; + tl_chl_quan_tiu.shape.c = chl_quan_shape_tiu.c; + tl_chl_quan_tiu.shape.h = chl_quan_shape_tiu.h; + tl_chl_quan_tiu.shape.w = chl_quan_shape_tiu.w; + tl_chl_quan_tiu.stride = cvk_ctx->ops->tl_default_stride( + cvk_ctx, tl_chl_quan_tiu.shape, CVK_FMT_I8, /*eu_align=*/0); + + cvk_tiu_depthwise_convolution_param_t param; + memset(&param, 0, sizeof(param)); + param.ofmap = tl_output; + param.ifmap = tl_input; + param.weight = tl_filter; + param.chl_quan_param = &tl_chl_quan_tiu; + param.ins_h = ins_h; + param.ins_last_h = ins_last_h; + param.ins_w = ins_w; + param.ins_last_w = ins_last_w; + param.stride_h = stride_h; + param.stride_w = stride_w; + param.dilation_h = dh; + param.dilation_w = dw; + param.pad_top = pad_top; + param.pad_bottom = pad_bot; + param.pad_left = pad_left; + param.pad_right = pad_right; + param.has_bias = has_bias; + param.relu_enable = relu_enable; + +#ifdef ENABLE_DEBUG_MSG + printf(" tiu_dw_conv_qdm:\n"); + printf(" ifmap shape 
(%d, %d, %d, %d)\n", param.ifmap->shape.n, + param.ifmap->shape.c, param.ifmap->shape.h, param.ifmap->shape.w); + printf(" weight shape (%d, %d, %d, %d)\n", param.weight->shape.n, + param.weight->shape.c, param.weight->shape.h, param.weight->shape.w); + printf(" ofmap shape (%d, %d, %d, %d)\n", param.ofmap->shape.n, + param.ofmap->shape.c, param.ofmap->shape.h, param.ofmap->shape.w); +#endif + + cvk_ctx->ops->tiu_depthwise_convolution(cvk_ctx, ¶m); + } + + CVI_RT_Submit(cvk_ctx); + +#ifdef ENABLE_DEBUG_MSG + printf(" compare result:\n"); +#endif + int8_t *conv_output_data = + (int8_t *)tensor_copy_l2g_d2s(rt_handler, cvk_ctx, tl_output); + for (int i = 0; i < output_size; i++) { + if (conv_output_data[i] != output_data[i]) { + printf(" output_data[%d] %d(tiu) != %d(ref)\n", i, + conv_output_data[i], output_data[i]); + ret = -1; + break; + } + } + + if (ret) { + dump_test_param(p_param, /*dump_content=*/true); + } + + // Reverse order + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_chl_quan_dma); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_filter); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input); + + free(conv_output_data); + + free(input_data); + free(kernel_data); + free(output_data); + free(bias_data); + free(multiplier_data); + free(shift_data); + free(chl_quan_data); + +#ifdef ENABLE_DEBUG_MSG + printf(" <= run_compare_dw_conv\n"); +#endif + + return ret; +} + +bool check_valid_test_param(cvk_context_t *cvk_ctx, + dw_conv_test_param_t *p_param) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int oh = p_param->output_h; + int ow = p_param->output_w; + int kh = p_param->kh; + int kw = p_param->kw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int chl_quan_size_dma = + p_param->has_bias ? 
9 : 5; // bias(4) + multiplier(4) + shift(1)
+
+  // Skip invalid shapes
+  if ((kh > ih) || (kw > iw) || (stride_h > ih) || (stride_w > iw)) {
+    return false;
+  }
+
+  // products of randomly-chosen dimensions may exceed int32_t, so use uint32_t
+  uint32_t input_size = in * ic * ih * iw;
+  uint32_t kernel_size = ic * kh * kw; // no oc
+  uint32_t output_size = in * oc * oh * ow;
+
+  uint32_t lmem_size_per_lane = cvk_ctx->info.lmem_size;
+  uint32_t total_lmem_size = cvk_ctx->info.lmem_size * cvk_ctx->info.npu_num;
+
+  uint32_t total_needed_size = input_size + kernel_size + output_size +
+                               chl_quan_size_dma * cvk_ctx->info.npu_num;
+  if (total_needed_size > total_lmem_size) {
+    return false;
+  }
+
+  cvk_tl_shape_t input_shape = {in, ic, ih, iw};
+  cvk_tl_shape_t filter_shape = {1, oc, kh, kw};
+  cvk_tl_shape_t output_shape = {in, oc, oh, ow};
+  cvk_tl_shape_t chl_quan_shape_dma = {1, oc, 1, chl_quan_size_dma};
+
+  uint32_t needed_size =
+      cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, input_shape, CVK_FMT_I8, /*eu_align=*/1) +
+      cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, filter_shape, CVK_FMT_I8, /*eu_align=*/1) +
+      cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, output_shape, CVK_FMT_I8, /*eu_align=*/1) +
+      cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, chl_quan_shape_dma, CVK_FMT_I8, /*eu_align=*/0);
+
+  // Skip shapes that do not fit in one lane's local memory
+  if (needed_size > lmem_size_per_lane) {
+    return false;
+  }
+
+  return true;
+}
+
+static int random_test(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx)
+{
+  int ret = 0;
+
+  if (rt_handle == NULL || cvk_ctx == NULL) {
+    return -1;
+  }
+
+#ifndef ENABLE_FULL_REGRESSION
+#ifndef ENABLE_TV_GEN_PATTERN
+  // Input ranges (all arrays share the same size)
+  // n: 12b, c: 12b, h: 12b(4095-32), w: 12b(4095-32)
+  int batch_range[] = {1, 1, 2, 4095 - 32};
+  int input_height_range[] = {1, 512, 1024, 4095 - 32};
+  int input_width_range[] = {1, 512, 1024, 4095 - 32};
+  int input_depth_range[] = {1, 16, 32, 4095 - 32};
+
+  // Kernel ranges (all arrays share the same size)
+  // h: 12b, w: 12b
+  // stride_h: 4b, stride_w: 4b
+  int kernel_height_range[] = {1, 11, 4095};
+  int kernel_width_range[] = {1, 11, 4095};
+  int kernel_stride_height_range[] = {1, 5, 15};
+  int kernel_stride_width_range[] = {1, 5, 15};
+#else
+  // TV_GEN pattern
+  // Random Test, total 2187, skipped 13095, executed 27, failed 0, ret 0
+
+  // Input ranges (all arrays share the same size)
+  // n: 12b, c: 12b, h: 12b(4095-32), w: 12b(4095-32)
+  int batch_range[] = {1, 1, 3232};
+  int input_height_range[] = {1, 512, 4095 - 32};
+  int input_width_range[] = {1, 512, 4095 - 32};
+  int input_depth_range[] = {1, 16, 4095 - 32};
+
+  // Kernel ranges (all arrays share the same size)
+  // h: 12b, w: 12b
+  // stride_h: 4b, stride_w: 4b
+  int kernel_height_range[] = {1, 11, 4095};
+  int kernel_width_range[] = {1, 11, 4095};
+  int kernel_stride_height_range[] = {1, 5, 15};
+  int kernel_stride_width_range[] = {1, 5, 15};
+#endif // ENABLE_TV_GEN_PATTERN
+#else
+#if 0
+  // Input ranges (all arrays share the same size)
+  int batch_range[] = {1};
+  int input_height_range[] = {1};
+  int input_width_range[] = {1};
+  int input_depth_range[] = {1};
+
+  // Kernel ranges (all arrays share the same size)
+  int kernel_height_range[] = {1};
+  int kernel_width_range[] = {1};
+  int kernel_stride_height_range[] = {1};
+  int kernel_stride_width_range[] = {1};
+  int output_depth_range[] = {1};
+#else
+  // 10/21/2019
+  // Random Test, total 512000, skipped 2535629, executed 24371
+
+  // Input ranges (all arrays share the same size)
+  // n: 12b, c: 12b, h: 12b(4095-32), w: 12b(4095-32)
+  int batch_range[] = {1, 2, 4, 8, 16, 32, 64, 4095 - 32};
+  int input_height_range[] = {1, 3, 11, 128, 512, 1024, 2048, 4095 - 32};
+  int input_width_range[] = {1, 3, 11, 128, 512, 1024, 2048, 4095 - 32};
+  int input_depth_range[] = {1, 3, 11, 32, 64, 1024, 2048, 4095 - 32};
+
+  // Kernel ranges (all arrays share the same size)
+  // h: 12b, w: 12b
+  // stride_h: 4b, stride_w: 4b
+  int kernel_height_range[] = {1, 3, 11, 511, 4095};
+  int kernel_width_range[] = {1, 3, 11, 511, 4095};
+  int kernel_stride_height_range[] = {1, 3, 5, 7, 15};
+  int kernel_stride_width_range[] = {1, 3, 5, 7, 15};
+#endif
+#endif /* ENABLE_FULL_REGRESSION */
+
+  const int input_range_size =
+      sizeof(input_height_range) / sizeof(input_height_range[0]);
+  const int kernel_range_size =
+      sizeof(kernel_height_range) / sizeof(kernel_height_range[0]);
+
+  int random_seed = clock();
+  srand(random_seed);
+
+  const int retry_test_count = 100;
+  bool stop_at_first_error = true;
+
+  int executed_tests = 0;
+  int failed_tests = 0;
+
+  printf("dw-conv random Test =>\n");
+  for (int m = 0; m < retry_test_count; ++m) {
+    for (int i = 0; i < input_range_size; ++i) {
+      // randomly chosen from [range[i] : range[i+1]]
+      int batch = choose_from_range(batch_range, input_range_size, i);
+
+      for (int j = 0; j < input_range_size; ++j) {
+        int input_height =
+            choose_from_range(input_height_range, input_range_size, j);
+
+        for (int k = 0; k < input_range_size; ++k) {
+          int input_width =
+              choose_from_range(input_width_range, input_range_size, k);
+
+          for (int l = 0; l < input_range_size; ++l) {
+            int input_depth =
+                choose_from_range(input_depth_range, input_range_size, l);
+
+            for (int p = 0; p < kernel_range_size; ++p) {
+              int kernel_height =
+                  choose_from_range(kernel_height_range, kernel_range_size, p);
+
+              for (int n = 0; n < kernel_range_size; ++n) {
+                int kernel_width =
+                    choose_from_range(kernel_width_range, kernel_range_size, n);
+
+                for (int x = 0; x < kernel_range_size; ++x) {
+                  int kernel_stride_height = choose_from_range(
+                      kernel_stride_height_range, kernel_range_size, x);
+
+                  for (int y = 0; y < kernel_range_size; ++y) {
+                    int kernel_stride_width = choose_from_range(
+                        kernel_stride_width_range, kernel_range_size, y);
+                    int has_bias = rand() % 2;
+                    int dh = 1;
+                    int dw = 1;
+                    int ins_h = 0;
+                    int ins_h_last = 0;
+                    int ins_w = 0;
+                    int ins_w_last = 0;
+                    int pad_top = 0;
+                    int pad_bot = 0;
+                    int pad_left = 0;
+                    int pad_right = 0;
+
+                    int ih_ext = calc_dilute_hw(input_height, ins_h, ins_h_last,
+                                                pad_top, pad_bot);
+                    int iw_ext = calc_dilute_hw(input_width, ins_w, ins_w_last,
+                                                pad_left, pad_right);
+                    int kh_ext = calc_dilute_hw(kernel_height, dh - 1, 0, 0, 0);
+                    int kw_ext = calc_dilute_hw(kernel_width, dw - 1, 0, 0, 0);
+
+                    int oh =
+                        calc_output_hw(ih_ext, kh_ext, kernel_stride_height);
+                    int ow =
+                        calc_output_hw(iw_ext, kw_ext, kernel_stride_width);
+
+                    // depthwise, input depth == output depth
+                    int output_depth = input_depth;
+
+                    dw_conv_test_param_t test_param;
+                    memset(&test_param, 0, sizeof(test_param));
+                    test_param.input_n = batch;
+                    test_param.input_c = input_depth;
+                    test_param.input_h = input_height;
+                    test_param.input_w = input_width;
+                    test_param.kh = kernel_height;
+                    test_param.kw = kernel_width;
+                    test_param.dh = dh;
+                    test_param.dw = dw;
+                    test_param.pad_top = pad_top;
+                    test_param.pad_bot = pad_bot;
+                    test_param.pad_left = pad_left;
+                    test_param.pad_right = pad_right;
+                    test_param.ins_h = ins_h;
+                    test_param.ins_h_last = ins_h_last;
+                    test_param.ins_w = ins_w;
+                    test_param.ins_w_last = ins_w_last;
+                    test_param.stride_h = kernel_stride_height;
+                    test_param.stride_w = kernel_stride_width;
+                    test_param.output_c = output_depth;
+
test_param.output_h = oh; + test_param.output_w = ow; + test_param.has_bias = has_bias; + test_param.retry_cnt = 5; + + bool is_valid_param = + check_valid_test_param(cvk_ctx, &test_param); + if (is_valid_param == false) + continue; + + int ret2 = run_compare_dw_conv(rt_handle, cvk_ctx, &test_param); + failed_tests = ret2 ? failed_tests + 1 : failed_tests; + ret |= ret2; + executed_tests++; + +#ifdef ENABLE_DEBUG_MSG + printf(" [%d] random test: input shape(%d, %d, %d, %d)", + executed_tests, batch, input_depth, + input_height, input_width); + printf(", kernel shape (%d, %d, %d, %d), result %d\n", + output_depth, input_depth, kernel_height, + kernel_width, ret); +#endif + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + + if (executed_tests >= MIN_EXEC_TESTS) { + break; + } + } + + printf("<= dw-conv: random test, total %d, failed %d, ret %d\n", + executed_tests, failed_tests, ret); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_HANDLE rt_handle; + cvk_context_t *cvk_ctx = NULL; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + + if (!ret) + ret |= simple_test(rt_handle, cvk_ctx); + if (!ret) + ret |= random_test(rt_handle, cvk_ctx); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + if (!ret) + printf("%s pass\n", __FILENAME__); + else + printf("%s fail\n", __FILENAME__); + + return ret; +} diff --git a/cviruntime/test/1822/test_cv1822_test_copy.c b/cviruntime/test/1822/test_cv1822_test_copy.c new file mode 100644 index 000000000..d4f35722e --- /dev/null +++ b/cviruntime/test/1822/test_cv1822_test_copy.c @@ -0,0 +1,96 @@ +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +static void tl_copy_ref(int8_t *a, int8_t *res, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) + res[i] = a[i]; +} + +static int test_tl_copy(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + int8_t *a_data = (int8_t *)malloc(size); + assert(a_data && "Expect allocated a_data"); + for (uint32_t i = 0; i < size; i++) + a_data[i] = (int8_t)(i % 256); + + int8_t *ref_data = (int8_t *)malloc(size); + assert(ref_data && "Expect allocated ref_data"); + tl_copy_ref(a_data, ref_data, size); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + + cvk_tiu_copy_param_t p10; + p10.dst = tl_res; + p10.src = tl_a; + cvk_ctx->ops->tiu_copy(cvk_ctx, &p10); + uint8_t 
*res_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res); + + for (uint64_t i = 0; i < size; i++) { + if ((int8_t)res_data[i] != ref_data[i]) { + printf("comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, res_data[i], ref_data[i]); + ret = -1; + } + } + + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + + free(a_data); + free(ref_data); + free(res_data); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_HANDLE rt_handle; + cvk_context_t *cvk_ctx = NULL; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + + ret = test_tl_copy(rt_handle, cvk_ctx, 0); + ret |= test_tl_copy(rt_handle, cvk_ctx, 1); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + if (!ret) + printf("%s pass\n", __FILENAME__); + else + printf("%s fail\n", __FILENAME__); + + return ret; +} diff --git a/cviruntime/test/1880v2/1880v2_test_util.h b/cviruntime/test/1880v2/1880v2_test_util.h new file mode 100644 index 000000000..37701dace --- /dev/null +++ b/cviruntime/test/1880v2/1880v2_test_util.h @@ -0,0 +1,1294 @@ +#ifndef INC_1880v2_TEST_UTIL_H +#define INC_1880v2_TEST_UTIL_H + +#include +#include "cviruntime_context.h" +#include +#include "test_native_ref.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "compression.h" +#include "bm_vlc_compress.h" +#include "1880v2_vlc_random_gen_nn_data.h" +#include +#include +#include +#include "test_neuron_dump.hpp" + + +#define math_min(x, y) ((x) < (y) ? (x) : (y)) +#define math_max(x, y) ((x) > (y) ? (x) : (y)) +#define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask)) +#define ALIGN(x,a) __ALIGN_MASK(x,(__typeof__(x))(a)-1) + +typedef uint32_t laddr_t; +typedef uint64_t gaddr_t; + +//#define ENABEL_SIMPLE_BMK1880V2_VLC_TEST +#define ENABEL_GAUSSIANRANDOM_BMK1880V2_VLC_TEST + +typedef bmk1880v2_context_t bmk_ctx_t; + +typedef bmk1880v2_tensor_lmem_shape_t tl_shape_t; +typedef bmk1880v2_matrix_lmem_shape_t ml_shape_t; +typedef bmk1880v2_tensor_tgmem_shape_t tg_shape_t; +typedef bmk1880v2_matrix_tgmem_shape_t mg_shape_t; + +typedef bmk1880v2_tensor_lmem_t tl_t; +typedef bmk1880v2_matrix_lmem_t ml_t; +typedef bmk1880v2_tensor_tgmem_t tg_t; +typedef bmk1880v2_matrix_tgmem_t mg_t; +typedef bmk1880v2_compressed_tensor_tgmem_t compressed_tg_t; +typedef bmk1880v2_compressed_matrix_tgmem_t compressed_mg_t; + +typedef bmk1880v2_tensor_tgmem_stride_t tg_stride_t; +typedef bmk1880v2_matrix_tgmem_stride_t mg_stride_t; + +typedef struct { + tg_t tg; + CVI_RT_MEM mem; +} tg_wrapper_t; + +typedef struct { + mg_t mg; + CVI_RT_MEM mem; +} mg_wrapper_t; + +typedef struct { + compressed_tg_t tg; + CVI_RT_MEM mem; +} compressed_tg_wrapper_t; + +typedef struct { + compressed_mg_t mg; + CVI_RT_MEM mem; +} compressed_mg_wrapper_t; + +typedef enum { + VLC_CMP_MODE_HW = 0, // dim_size <= BM_SHAPE_MAX_DIM, NULL); + size_t size = BM_FMT_BPP(s->fmt) / 8; + + for (int i = 0; i < s->dim_size; i++) { + TPU_ASSERT(s->dim[i] > 0, NULL); + size *= s->dim[i]; + } + return size; +} + +static inline int dim_size(const dim_t *dim) +{ + return dim->n * dim->c * dim->h * dim->w; +} + +static inline u64 tl_shape_size(const tl_shape_t *s) +{ + return (u64)s->n * s->c * s->h * s->w; +} + +static inline u64 ml_shape_size(const ml_shape_t *s) +{ + return (u64)s->n * s->col; +} + +static inline u64 mg_shape_size(const mg_shape_t *s) +{ + return (u64)s->row * s->col; +} + 
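+/* Note: the *_shape_size helpers above and below return element counts, not
+ * byte sizes; callers scale by the element width (e.g. 2 bytes for FMT_BF16)
+ * when sizing host or device buffers. */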
+static inline u64 tg_shape_size(const tg_shape_t *s) +{ + return (u64)s->n * s->c * s->h * s->w; +} + +static inline dim_t dim_of_ith_element(int i, dim_t *dim, int transpose) +{ + int channel_offset = i % (dim->h * dim->w); + int hidx = channel_offset / dim->w; + int widx = channel_offset % dim->w; + int channel_index = i / (dim->h * dim->w); + int nidx = channel_index / dim->c; + int cidx = channel_index % dim->c; + if (transpose) { + nidx = channel_index % dim->n; + cidx = channel_index / dim->n; + } + dim_t r = { nidx, cidx, hidx, widx }; + return r; +} + +static inline void * xmalloc(size_t size) +{ + void *p = malloc(size); + assert(p); + return p; +} + +static inline void test_init(CVI_RT_HANDLE *ctx, bmk_ctx_t **bmk) +{ +#if 1 + //int ret = bm_init_chip(0, ctx, "cv1880v2"); + //(void)bmk; + int ret = CVI_RT_InitWithKernelBK(ctx, 0x10000); + if (ret != BM_SUCCESS) { + fprintf(stderr, "bm_init failed, err %d\n", ret); + exit(-1); + } + *bmk = (bmk_context_t*)CVI_RT_GetKHandleBK(*ctx); +#else + CVI_RT_Init(ctx); + *bmk = (bmk_context_t*)CVI_RT_GetKHandleBK(*ctx); +#endif +} + +static inline void test_submit(CVI_RT_HANDLE *ctx) +{ + CVI_RT_SubmitBK(*ctx); +} + +static inline void test_exit(CVI_RT_HANDLE *ctx) +{ + //cviruntime_cvikernel_destroy(*ctx); + //bm_exit(*ctx); + CVI_RT_DeInitBK(*ctx); +} + +static inline tl_t * alloc_tl(bmk_ctx_t *bmk, tl_shape_t s, fmt_t f, int align) +{ + tl_t *t = bmk1880v2_lmem_alloc_tensor(bmk, s, f, align); + t->cmprs_fmt = f; + assert(t); + return t; +} + +static inline ml_t * alloc_ml(bmk_ctx_t *bmk, ml_shape_t s, int align) +{ + ml_t *m = bmk1880v2_lmem_alloc_matrix(bmk, s, FMT_I8, align); + assert(m); + return m; +} + +static inline ml_t * alloc_ml(bmk_ctx_t *bmk, ml_shape_t s, fmt_t f, int align) +{ + ml_t *m = bmk1880v2_lmem_alloc_matrix(bmk, s, f, align); + assert(m); + return m; +} + +static inline ml_t * alloc_ml_bf16(bmk_ctx_t *bmk, ml_shape_t s, fmt_t f,int align) +{ + ml_t *m = bmk1880v2_lmem_alloc_matrix(bmk, s, f, align); + assert(m); + return m; +} + +static inline tg_t * alloc_tg_gmem(CVI_RT_HANDLE *ctx, tg_shape_t s, fmt_t fmt) +{ + bmshape_t bms = BM_TENSOR_INT8( + (int)s.n, + (int)s.c, + (int)s.h, + (int)s.w); + + tg_wrapper_t *w = (tg_wrapper_t *)malloc(sizeof(tg_wrapper_t)); + //w->mem = bmmem_device_alloc(*ctx, &bms); + //w->mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + w->mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + w->tg.base_reg_index = 0; + w->tg.start_address = CVI_RT_MemGetPAddr(w->mem); + w->tg.fmt = fmt; + w->tg.shape = s; + w->tg.stride = bmk1880v2_tensor_tgmem_default_stride(s, fmt); + + return &w->tg; +} + +static inline tg_t * _alloc_tg_bf16_gmem(CVI_RT_HANDLE *ctx, tg_shape_t s, fmt_t fmt, + bmk1880v2_tensor_tgmem_stride_t* tg_stride) +{ + u32 val = (fmt == FMT_BF16) ? 
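+      /* bf16 elements occupy 2 bytes in global memory, int8 elements 1 */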
2 : 1; + bmshape_t bms = BM_TENSOR_INT8( + (int)s.n, + (int)s.c, + (int)s.h, + (int)s.w * (int)val); + + tg_wrapper_t *w = (tg_wrapper_t *)malloc(sizeof(tg_wrapper_t)); + //w->mem = bmmem_device_alloc(*ctx, &bms); + //w->mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + w->mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + w->tg.base_reg_index = 0; + w->tg.start_address = CVI_RT_MemGetPAddr(w->mem); + w->tg.fmt = fmt; + w->tg.shape = s; + if (tg_stride) { + w->tg.stride = *tg_stride; + } + else { + w->tg.stride = bmk1880v2_tensor_tgmem_default_stride(s, fmt); + } + return &w->tg; +} + +static inline tg_t * alloc_tg_bf16_gmem(CVI_RT_HANDLE *ctx, tg_shape_t s, fmt_t fmt) +{ + return _alloc_tg_bf16_gmem(ctx, s, fmt, NULL); +} + +static inline mg_t * alloc_mg_gmem(CVI_RT_HANDLE *ctx, mg_shape_t s) +{ + bmshape_t bms = BM_MATRIX_INT8((int)s.row, (int)s.col); + mg_wrapper_t *w = (mg_wrapper_t *)malloc(sizeof(mg_wrapper_t)); + //w->mem = bmmem_device_alloc(*ctx, &bms); + w->mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + + w->mg.base_reg_index = 0; + w->mg.start_address = CVI_RT_MemGetPAddr(w->mem); + w->mg.shape = s; + w->mg.stride.row = s.col; + + return &w->mg; +} + +static inline compressed_mg_t* alloc_compressed_mg_gmem(CVI_RT_HANDLE *ctx, mg_shape_t s) +{ + bmshape_t bms = BM_MATRIX_INT8((int)s.row, (int)s.col); + compressed_mg_wrapper_t *w = (compressed_mg_wrapper_t *)malloc(sizeof(compressed_mg_wrapper_t)); + //w->mem = bmmem_device_alloc(*ctx, &bms); + w->mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + + w->mg.m.base_reg_index = 0; + w->mg.m.start_address = CVI_RT_MemGetPAddr(w->mem); + w->mg.m.shape = s; + w->mg.m.stride.row = s.col; + + return &w->mg; +} + +static inline mg_t * alloc_mg_bf16_gmem(CVI_RT_HANDLE *ctx, mg_shape_t s, fmt_t fmt) +{ + + u32 val = (fmt == FMT_BF16) ? 
2 : 1; + bmshape_t bms = BM_MATRIX_INT8((int)s.row, (int)s.col * (int)val); + mg_wrapper_t *w = (mg_wrapper_t *)malloc(sizeof(mg_wrapper_t)); + //w->mem = bmmem_device_alloc(*ctx, &bms); + w->mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + + w->mg.base_reg_index = 0; + w->mg.start_address = CVI_RT_MemGetPAddr(w->mem); + w->mg.shape = s; + w->mg.fmt = fmt; + w->mg.stride.row = s.col * val; + return &w->mg; +} + +static inline compressed_tg_t * alloc_compressed_tg_gmem( + CVI_RT_HANDLE *ctx, tl_shape_t *s, u8 bit_length) +{ + u64 size = tl_shape_size(s); + u64 header_bytes = 16; + u64 map_bytes = compression_map_bytes(size); + u64 data_bytes = compression_data_bytes(size, bit_length); + u64 total_bytes = header_bytes + map_bytes + data_bytes; + compressed_tg_wrapper_t *w = (compressed_tg_wrapper_t *)malloc(sizeof(compressed_tg_wrapper_t)); + w->mem = CVI_RT_MemAlloc(*ctx, total_bytes); + w->tg.t.base_reg_index = 0; + w->tg.t.start_address = CVI_RT_MemGetPAddr(w->mem); + w->tg.reserved_size = total_bytes; + w->tg.bit_length = bit_length; + w->tg.t.shape.n = s->n; + w->tg.t.shape.c = s->c; + w->tg.t.shape.h = s->h; + w->tg.t.shape.w = s->w; + w->tg.t.stride = bmk1880v2_tensor_tgmem_default_stride(w->tg.t.shape, FMT_I8); + return &w->tg; +} +// mem = CVI_RT_MemAlloc(*ctx, bs_buf_size); + w->tg.t.base_reg_index = 0; + w->tg.t.start_address = CVI_RT_MemGetPAddr(w->mem); + w->tg.reserved_size = bs_buf_size; + w->tg.t.fmt = fmt; + + if (cmd_info) { + w->tg.bias0 = cmd_info->bias0; + w->tg.bias1 = cmd_info->bias1; + w->tg.zero_guard_en = cmd_info->zero_guard_en; + } + else { + if (fmt == FMT_BF16) { + w->tg.bias0 = 127; + } + else if (fmt == FMT_I8 || fmt == FMT_U8) { + w->tg.bias0 = 0; + } + else { + printf("only accept fmt for FMT_BF16/FMT_I8/FMT_U8/, your format is %d\n", fmt); + assert(0); + } + + w->tg.bias1 = 0; + // tg.zero_guard_en = 0; + } + w->tg.t.shape.n = s->n; + w->tg.t.shape.c = s->c; + w->tg.t.shape.h = s->h; + w->tg.t.shape.w = s->w; + w->tg.t.stride = bmk1880v2_tensor_tgmem_default_stride(w->tg.t.shape, fmt); + + return &w->tg; +} + +static inline compressed_tg_t * alloc_vlc_compressed_tg_gmem( + CVI_RT_HANDLE *ctx, tl_shape_t *s, fmt_t fmt) +{ + return _alloc_vlc_compressed_tg_gmem(ctx, s, fmt, NULL); +} + +/** + * \shape_size shape size + * \signedness 0 means ungiend 1 means signed + * \data_type 0 means 8bit 1 means bf16 + */ +static inline void vlc_init_testdata(u16 *src_data, u64 shape_size, bool signedness, bool data_type) { +#ifdef ENABEL_GAUSSIANRANDOM_BMK1880V2_VLC_TEST + float zero_ratio = 0; + assert(signedness == 0); // zero_range) { + for (u64 i = 0; i < shape_size - zero_range; i++) { + src_data[i] = 0; + } + } +#endif /* ifdef ENABEL_GAUSSIANRANDOM_BMK1880V2_VLC_TEST */ +} + +static inline void vlc_init_testdata(u8 *src_data, u64 shape_size, bool signedness, bool data_type) { + memset(src_data, 0x00, shape_size); +#ifdef ENABEL_GAUSSIANRANDOM_BMK1880V2_VLC_TEST + float zero_ratio = 0; + assert(data_type == 0); // zero_range) { + for (u64 i = 0; i < shape_size - zero_range; i++) { + src_data[i] = 0; + } + } +#endif /* ifdef ENABEL_GAUSSIANRANDOM_BMK1880V2_VLC_TEST */ +} + +static inline compressed_mg_t * alloc_vlc_compressed_mg_gmem( + CVI_RT_HANDLE *ctx, mg_shape_t s, fmt_t fmt, CommandInfo* cmd_info) +{ + u64 in_size = mg_shape_size(&s); + u8 data_type = (fmt == FMT_BF16) ? 
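+  /* VLC data_type selector: 1 = bf16 stream, 0 = 8-bit stream */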
1 : 0; + in_size *= bytesize_of_fmt(fmt); + + size_t bs_buf_size = get_out_bs_buf_size(in_size, data_type); + + compressed_mg_wrapper_t *w = (compressed_mg_wrapper_t *)malloc(sizeof(compressed_mg_wrapper_t)); + + w->mem = CVI_RT_MemAlloc(*ctx, bs_buf_size); + w->mg.m.shape = s; + w->mg.m.stride.row = s.col * bytesize_of_fmt(fmt); + w->mg.m.base_reg_index = 0; + w->mg.m.fmt = fmt; + w->mg.m.start_address = CVI_RT_MemGetPAddr(w->mem); + + if (cmd_info) { + w->mg.bias0 = cmd_info->bias0; + w->mg.bias1 = cmd_info->bias1; + w->mg.zero_guard_en = cmd_info->zero_guard_en; + } + else { + w->mg.bias0 = 0; + + if (fmt == FMT_BF16) { + w->mg.bias0 = 127; + } + else if (fmt == FMT_I8 || fmt == FMT_U8) { + w->mg.bias0 = 0; + } + else { + printf("only accept fmt for FMT_BF16/FMT_I8/FMT_U8/, your format is %d\n", fmt); + assert(0); + } + + w->mg.bias1 = 0; + // mg.zero_guard_en = 0; + } + + return &w->mg; +} + +/** + * \cmd_info_est_in that manual set compress parameters, the possible input as below + 1. NULL, it could call \bm_vlc_est_weight_bias + 2. not NULL that directly send to \bm_vlc_enc_int8 + * \cmd_info_est_out output est result, the passble value as following + 1. \cmd_info_est_out = \cmd_info_est_in once cmd_info_est_in != NULL + 2. \cmd_info_est_out = est result once cmd_info_est_in == NULL + 3. NULL if you dont care + */ +static inline u8 *vlc_compress ( + u8 *src_data, u64 size, int is_signed, int data_type, size_t* bs_size, const CommandInfo* cmd_info_est_in, CommandInfo* cmd_info_est_out) +{ + CommandInfo cmd_info; + size_t bs_buf_size = get_out_bs_buf_size(size, data_type); + + u8 *bsbuf = (u8 *)malloc(sizeof(u8) * bs_buf_size); + memset(&cmd_info, 0x00, sizeof(CommandInfo)); + + /* generate comparess data (bsbuf)*/ + if (cmd_info_est_in) { + memcpy(&cmd_info, cmd_info_est_in, sizeof(CommandInfo)); + } + else { + bm_vlc_est_weight_bias(src_data, size, (bool)is_signed, (bool)data_type, &cmd_info); + } + + if (cmd_info_est_out) { + memcpy(cmd_info_est_out, &cmd_info, sizeof(CommandInfo)); + } + + if (data_type) { + bm_vlc_enc_bf16((u16*)src_data, size, bsbuf, bs_size, &cmd_info); + } + else { + bm_vlc_enc_int8(src_data, size, bsbuf, bs_size, &cmd_info); + } + + return bsbuf; +} + +static inline int get_vlc_compressed_meta( + u8 *src_data, u64 in_size, fmt_t fmt, size_t* bs_size, CommandInfo* cmd_info) +{ + int is_signed = (fmt == FMT_I8); + int data_type = (fmt == FMT_BF16) ? 
1 : 0; + //bm_vlc_est_weight_bias(src_data, in_size, (bool)is_signed, (bool)data_type, cmd_info); + + u8 *ref_data = vlc_compress(src_data, in_size, is_signed, data_type, bs_size, cmd_info, NULL); + free(ref_data); + return 0; +} + +static inline void free_tl(bmk_ctx_t *bmk, const tl_t *t) +{ + return bmk1880v2_lmem_free_tensor(bmk, t); +} + +static inline void free_ml(bmk_ctx_t *bmk, const ml_t *m) +{ + return bmk1880v2_lmem_free_matrix(bmk, m); +} + +static inline void free_tg_gmem(CVI_RT_HANDLE *ctx, const tg_t *tg) +{ + tg_wrapper_t *w = (typeof(w))tg; + CVI_RT_MemFree(*ctx, w->mem); + free(w); +} + +static inline void free_mg_gmem(CVI_RT_HANDLE *ctx, const mg_t *mg) +{ + mg_wrapper_t *w = (typeof(w))mg; + CVI_RT_MemFree(*ctx, w->mem); + free(w); +} + +static inline void free_compressed_tg_gmem( + CVI_RT_HANDLE *ctx, const compressed_tg_t *t) +{ + compressed_tg_wrapper_t *w = (typeof(w))t; + CVI_RT_MemFree(*ctx, w->mem); + free(w); +} + +static inline void free_compressed_mg_gmem( + CVI_RT_HANDLE *ctx, const compressed_mg_t *t) +{ + compressed_mg_wrapper_t *w = (typeof(w))t; + CVI_RT_MemFree(*ctx, w->mem); + free(w); +} + +static inline u8 * get_tg_gmem(CVI_RT_HANDLE *ctx, const tg_t *tg) +{ + tg_shape_t s = tg->shape; + u32 size = s.n * s.c * s.h * s.w; + u8 *data = (u8 *)malloc(sizeof(u8) * size); + + tg_wrapper_t *w = (typeof(w))tg; + int ret = CVI_RT_MemCopyD2S(*ctx, data, w->mem); + assert(ret == BM_SUCCESS); + + return data; +} + +static inline u8 * get_tg_bf16_gmem(CVI_RT_HANDLE *ctx, const tg_t *tg) +{ + tg_shape_t s = tg->shape; + u32 size = s.n * s.c * s.h * s.w * (tg->fmt == FMT_BF16 ? 2 : 1); + u8 *data = (u8 *)malloc(sizeof(u8) * size); + tg_wrapper_t *w = (typeof(w))tg; + int ret = CVI_RT_MemCopyD2S(*ctx, data, w->mem); + assert(ret == BM_SUCCESS); + + return data; +} + +static inline u8 * get_mg_gmem(CVI_RT_HANDLE *ctx, const mg_t *mg) +{ + mg_shape_t s = mg->shape; + u32 size = s.row * s.col; + u8 *data = (u8 *)malloc(sizeof(u8) * size); + + mg_wrapper_t *w = (typeof(w))mg; + int ret = CVI_RT_MemCopyD2S(*ctx, data, w->mem); + assert(ret == BM_SUCCESS); + + return data; +} + +static inline u8 * get_compressed_mg_gmem(CVI_RT_HANDLE *ctx, const compressed_mg_t *mg, size_t bs_size) +{ + //mg_shape_t s = mg->m.shape; + //u32 size = s.row * s.col; + u8 *data = (u8 *)malloc(sizeof(u8) * bs_size); + + compressed_mg_wrapper_t *w = (typeof(w))mg; + int ret = CVI_RT_MemCopyD2S(*ctx, data, w->mem); + assert(ret == BM_SUCCESS); + + return data; +} + +static inline u8 * get_mg_bf16_gmem(CVI_RT_HANDLE *ctx, const mg_t *mg) +{ + mg_shape_t s = mg->shape; + u32 size = s.row * s.col * (mg->fmt == FMT_BF16 ? 
2 : 1); + u8 *data = (u8 *)malloc(sizeof(u8) * size); + + mg_wrapper_t *w = (typeof(w))mg; + int ret = CVI_RT_MemCopyD2S(*ctx, data, w->mem); + assert(ret == BM_SUCCESS); + + return data; +} + +static inline u8 * get_compressed_tg_gmem( + CVI_RT_HANDLE *ctx, const compressed_tg_t *t) +{ + compressed_tg_wrapper_t *w = (typeof(w))t; + + u8 *data = (u8 *)malloc(sizeof(u8) * t->reserved_size); + int ret = CVI_RT_MemCopyD2S(*ctx, data, w->mem); + assert(ret == BM_SUCCESS); + + return data; +} + +static inline u8 * get_bytes_gmem(CVI_RT_HANDLE *ctx, CVI_RT_MEM mem, u64 size) +{ + //CVI_RT_MEM mem = bmmem_device_prealloc_raw(*ctx, NULL, addr, size); + + u8 *data = (u8 *)malloc(sizeof(u8) * size); + int ret = CVI_RT_MemCopyD2S(*ctx, data, mem); + assert(ret == BM_SUCCESS); + + return data; +} + +static inline void put_tg_gmem(CVI_RT_HANDLE *ctx, const tg_t *tg, u8 data[]) +{ + tg_wrapper_t *w = (typeof(w))tg; + int ret = CVI_RT_MemCopyS2D(*ctx, w->mem, data); + assert(ret == BM_SUCCESS); +} + +static inline void put_tg_bf16_gmem(CVI_RT_HANDLE *ctx, const tg_t *tg, u8 data[]) +{ + tg_wrapper_t *w = (typeof(w))tg; + int ret = CVI_RT_MemCopyS2D(*ctx, w->mem, data); + assert(ret == BM_SUCCESS); +} + +static inline void put_mg_gmem(CVI_RT_HANDLE *ctx, const mg_t *mg, u8 data[]) +{ + mg_wrapper_t *w = (typeof(w))mg; + int ret = CVI_RT_MemCopyS2D(*ctx, w->mem, data); + assert(ret == BM_SUCCESS); +} + +static inline void put_mg_bf16_gmem(CVI_RT_HANDLE *ctx, const mg_t *mg, u8 data[]) +{ + mg_wrapper_t *w = (typeof(w))mg; + int ret = CVI_RT_MemCopyS2D(*ctx, w->mem, data); + assert(ret == BM_SUCCESS); +} + +#if 0 +static inline void put_bytes_gmem(CVI_RT_HANDLE *ctx, u64 addr, u64 size, u8 data[]) +{ + CVI_RT_MEM mem = bmmem_device_prealloc_raw(*ctx, NULL, addr, size); + + int ret = CVI_RT_MemCopyS2D(*ctx, mem, data); + assert(ret == BM_SUCCESS); +} +#else +static inline void put_bytes_gmem(CVI_RT_HANDLE *ctx, CVI_RT_MEM mem, u8 data[]) +{ + //CVI_RT_MEM mem = bmmem_device_prealloc_raw(*ctx, NULL, addr, size); + + int ret = CVI_RT_MemCopyS2D(*ctx, mem, data); + assert(ret == BM_SUCCESS); +} +#endif + +static inline void put_compressed_tg_gmem( + CVI_RT_HANDLE *ctx, const compressed_tg_t *t, u8 buf[], u64 size) +{ + assert(size <= t->reserved_size); + + compressed_tg_wrapper_t *w = (typeof(w))t; + //u64 addr = CVI_RT_MemGetPAddr(w->mem); + + //put_bytes_gmem(ctx, addr, size, buf); + put_bytes_gmem(ctx, w->mem, buf); +} + +static inline void put_compressed_mg_gmem( + CVI_RT_HANDLE *ctx, const compressed_mg_t *t, u8 buf[], u64 size) +{ + assert(size != 0); + + compressed_mg_wrapper_t *w = (typeof(w))t; + //u64 addr = CVI_RT_MemGetPAddr(w->mem); + + put_bytes_gmem(ctx, w->mem, buf); +} + +static inline void put_tensor_g2l( + CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, const tl_t *tl, u8 data[]) +{ + tg_shape_t s; + s.n = tl->shape.n; + s.c = tl->shape.c; + s.h = tl->shape.h; + s.w = tl->shape.w; + tg_t *tg = alloc_tg_gmem(ctx, s, FMT_I8); + + bmk1880v2_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = tg; + p.dst = tl; + + put_tg_gmem(ctx, tg, data); + bmk1880v2_tdma_g2l_tensor_copy(bmk, &p); + test_submit(ctx); + + free_tg_gmem(ctx, tg); +} + +/** + * prepard mean you alloc address but not submit it + * once submit it could re-assign from head + */ +static inline tg_t* prepare_put_bf16_tensor_g2l( + CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, const tl_t *tl, u16 data[], fmt_t tg_data_format, +bmk1880v2_tdma_tg2l_tensor_copy_param_t* p) +{ + tg_shape_t s; + s.n = tl->shape.n; + s.c = tl->shape.c; + s.h = 
tl->shape.h; + s.w = tl->shape.w; + tg_t *tg = alloc_tg_bf16_gmem(ctx, s, tg_data_format); + + p->src = tg; + p->dst = tl; + + assert(bmk); + + put_tg_bf16_gmem(ctx, tg, (u8 *)data); + return tg; +} + +/** + * issue prepared one + */ +static inline void launch_put_bf16_tensor_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, +const tg_t *tg, bmk1880v2_tdma_tg2l_tensor_copy_param_t* p) { + bmk1880v2_tdma_g2l_bf16_tensor_copy(bmk, p); + test_submit(ctx); + free_tg_gmem(ctx, tg); +} + +static inline void put_bf16_tensor_g2l( + CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, const tl_t *tl, u16 data[], fmt_t tg_data_format) +{ + tg_shape_t s; + s.n = tl->shape.n; + s.c = tl->shape.c; + s.h = tl->shape.h; + s.w = tl->shape.w; + tg_t *tg = alloc_tg_bf16_gmem(ctx, s, tg_data_format); + + bmk1880v2_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = tg; + p.dst = tl; + + put_tg_bf16_gmem(ctx, tg, (u8 *)data); + bmk1880v2_tdma_g2l_bf16_tensor_copy(bmk, &p); + test_submit(ctx); + free_tg_gmem(ctx, tg); +} + +static inline void put_matrix_g2l( + CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, const ml_t *ml, u8 data[]) +{ + mg_shape_t s; + s.row = ml->shape.n; + s.col = ml->shape.col; + mg_t *mg = alloc_mg_gmem(ctx, s); + + bmk1880v2_tdma_tg2l_matrix_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = mg; + p.dst = ml; + + put_mg_gmem(ctx, mg, data); + bmk1880v2_tdma_g2l_matrix_copy(bmk, &p); + test_submit(ctx); + + free_mg_gmem(ctx, mg); +} + + +static inline void put_bf16_matrix_g2l( + CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, const ml_t *ml, u8 data[], fmt_t mg_data_format) +{ + mg_shape_t s; + s.row = ml->shape.n; + s.col = ml->shape.col; + mg_t *mg = alloc_mg_bf16_gmem(ctx, s, mg_data_format); + + bmk1880v2_tdma_tg2l_matrix_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = mg; + p.dst = ml; + + put_mg_bf16_gmem(ctx, mg, data); + bmk1880v2_tdma_g2l_bf16_matrix_copy(bmk, &p); + test_submit(ctx); + + free_mg_gmem(ctx, mg); +} + +static inline void put_bytes_g2l( + CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, u32 lmem_addr, u64 size, u8 data[]) +{ + CVI_RT_MEM mem = CVI_RT_MemAlloc(*ctx, size); + u64 gmem_addr = CVI_RT_MemGetPAddr(mem); + + bmk1880v2_tdma_tg2l_general_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src_base_reg_index = 0; + p.src_address = gmem_addr; + p.dst_address = lmem_addr; + p.bytes = size; + + //put_bytes_gmem(ctx, gmem_addr, size, data); + put_bytes_gmem(ctx, mem, data); + + bmk1880v2_tdma_g2l_general_copy(bmk, &p); + test_submit(ctx); + + CVI_RT_MemFree(*ctx, mem); +} + +static inline u8 * get_tensor_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, const tl_t *tl) +{ + tg_shape_t s; + s.n = tl->shape.n; + s.c = tl->shape.h; + s.h = tl->shape.w; + s.w = tl->shape.c; + tg_t *tg = alloc_tg_gmem(ctx, s, FMT_I8); + + bmk1880v2_tdma_l2tg_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = tl; + p.dst = tg; + + bmk1880v2_tdma_l2g_tensor_copy(bmk, &p); + test_submit(ctx); + u8 *data = get_tg_gmem(ctx, tg); + + free_tg_gmem(ctx, tg); + return data; +} + +static inline u8 * get_bf16_tensor_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, const tl_t *tl, fmt_t tg_data_format) +{ + tg_shape_t s; + s.n = tl->shape.n; + s.c = tl->shape.h; + s.h = tl->shape.w; + s.w = tl->shape.c; + + tg_t *tg = alloc_tg_bf16_gmem(ctx, s, tg_data_format); // alloc tg to bf16 or int8 mode + + bmk1880v2_tdma_l2tg_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = tl; + p.dst = tg; + + bmk1880v2_tdma_l2g_bf16_tensor_copy(bmk, &p); + test_submit(ctx); + u8 *data = get_tg_bf16_gmem(ctx, tg); + + free_tg_gmem(ctx, tg); + return data; 
+} + + +static inline u8 * get_matrix_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, const ml_t *ml) +{ + mg_shape_t s; + s.row = ml->shape.n; + s.col = ml->shape.col; + mg_t *mg = alloc_mg_gmem(ctx, s); + + bmk1880v2_tdma_l2tg_matrix_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = ml; + p.dst = mg; + + bmk1880v2_tdma_l2g_matrix_copy(bmk, &p); + test_submit(ctx); + u8 *data = get_mg_gmem(ctx, mg); + + free_mg_gmem(ctx, mg); + return data; +} + +static inline u8 * get_bf16_matrix_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, const ml_t *ml, fmt_t mg_data_format) +{ + mg_shape_t s; + s.row = ml->shape.n; + s.col = ml->shape.col; + mg_t *mg = alloc_mg_bf16_gmem(ctx, s, mg_data_format); + + bmk1880v2_tdma_l2tg_matrix_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = ml; + p.dst = mg; + + bmk1880v2_tdma_l2g_bf16_matrix_copy(bmk, &p); + test_submit(ctx); + u8 *data = get_mg_bf16_gmem(ctx, mg); + + free_mg_gmem(ctx, mg); + return data; +} + +static inline u8 * get_bytes_l2g( + CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, u32 lmem_addr, u64 size) +{ + CVI_RT_MEM mem = CVI_RT_MemAlloc(*ctx, size); + u64 gmem_addr = CVI_RT_MemGetPAddr(mem); + + bmk1880v2_tdma_l2tg_general_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src_address = lmem_addr; + p.dst_base_reg_index = 0; + p.dst_address = gmem_addr; + p.bytes = size; + + bmk1880v2_tdma_l2g_general_copy(bmk, &p); + test_submit(ctx); + //u8 *data = get_bytes_gmem(ctx, gmem_addr, size); + u8 *data = get_bytes_gmem(ctx, mem, size); + + CVI_RT_MemFree(*ctx, mem); + return data; +} + +/* + * tensor dump utility + * detail = 1, dump all tensor and indicate N and C number + * detail = 0, only dump 3 byte closing to begin and end point. + */ +static inline void dump_tensor(u8 src[], u32 n, u32 c, u32 h, u32 w, u8 detail) +{ + if (detail) { + for (u32 ni = 0; ni < n; ni++) { + for (u32 ci = 0; ci < c; ci++) { + for (u32 hi = 0; hi < h; hi++) { + for (u32 wi = 0; wi < w; wi++) { + u32 i = ni * c * h * w + ci * h * w + hi * w + wi; + printf("%4d ", src[i]); + + if (hi == 0 && wi == w-1) + printf("| <= C: %d ", ci); + + if (ci == 0 && hi == 0 && wi == w-1) + printf("@ <= N: %d ", ni); + } + printf("\n"); + } + } + } + } else { + u64 end = (n-1) * c * h * w + (c-1) * h * w + (h-1) * w + (w-1); + printf("["); + printf("%4d", src[0]); + printf("%4d", src[1]); + printf("%4d", src[2]); + printf(" ... 
"); + printf("%4d", src[end - 2]); + printf("%4d", src[end - 1]); + printf("%4d", src[end]); + printf(" ]\n"); + } +} + +static inline void saturate_to_int8(s32 *buf, u64 size, int res_sign) +{ + s32 max, min; + if (res_sign) { + max = 127; + min = -128; + } else { + max = 255; + min = 0; + } + + for (u64 i = 0; i < size; i++) { + if (buf[i] > max) + buf[i] = max; + else if (buf[i] < min) + buf[i] = min; + } +} + +static inline void saturate_to_int16(s32 *buf, u64 size, int res_sign) +{ + s32 max, min; + if (res_sign) { + max = 32767; + min = -32768; + } else { + max = 65535; + min = 0; + } + + for (u64 i = 0; i < size; i++) { + if (buf[i] > max) + buf[i] = max; + else if (buf[i] < min) + buf[i] = min; + } +} + +static inline void arith_right_shift( + s32 *buf, u64 size, int shift_bits, int round_up) +{ + if (shift_bits == 0) + return; + + for (u64 i = 0; i < size; i++) { + buf[i] >>= shift_bits - 1; + if (round_up) + buf[i] += 1; + buf[i] >>= 1; + } +} + +static inline void logic_right_shift( + s32 *buf, u64 size, int shift_bits, int round_up) +{ + if (shift_bits == 0) + return; + + for (u64 i = 0; i < size; i++) { + buf[i] = (u32)buf[i] >> (shift_bits - 1); + if (round_up) + buf[i] += 1; + buf[i] = (u32)buf[i] >> 1; + } +} + +#if 0 +/* + * \return closest large or equal divisor, -1 means no divisors >= \match_divisor + */ +static inline int get_all_divisors(std::vector *v, int n, int match_divisor) +{ + int match = -1; + for (int i=1; i<=sqrt(n); i++) + { + if (n%i==0) + { + if (n/i == i) // check if divisors are equal + printf("%d ", i); + else + { + printf("%d ", i); + + // push the second divisor in the vector + v->push_back(n/i); + } + } + } + + // The vector will be printed in reverse + for (int i=v->size()-1; i>=0; i--) { + int d = (*v)[i]; + if (match == -1 && d >= match_divisor && d != 1) { + match = d; + } + printf("%d ", d); + } + + return match; +} + +/* + * \return -1 means fail to reshape, 0 means success + */ +static inline int get_dup_shape(int in, int ic, int ih, int iw, int dilation_h, + bmk1880v2_tensor_lmem_shape_t* tl_shape, bmk1880v2_tensor_lmem_stride_t* tl_stride, + bmk1880v2_tensor_tgmem_shape_t* tg_shape, bmk1880v2_tensor_tgmem_stride_t* tg_stride, + fmt_t src_tg_fmt, fmt_t dst_tl_fmt + ) { + + // 1. reshape and extend c,h axis in order + int ret = 0; + int ch = ic * ih; + int c_h_gcd = std::__gcd(ch, 32); + if (c_h_gcd == 1) { + printf("cant reshape it\n"); + c_h_gcd = ic; + ret = -1; + } + + int oc = ch / c_h_gcd; + int oh = ch / oc; + + if (oh < dilation_h) { + // TODO: get property c h + std::vector all_divisors; + oh = get_all_divisors(&all_divisors, ch, dilation_h); + if (oh == -1) { + printf("cant reshape it with dilation_h %d\n", dilation_h); + ret = -1; + oh = ih; + } + oc = ch / oh; + } + + // 2 means 2 bytes + int src_tg_fmt_sz = src_tg_fmt == FMT_BF16 ? 2 : 1; + int dst_tl_fmt_sz = dst_tl_fmt == FMT_BF16 ? 
2 : 1; + + printf ("ic:ih is %d %d, oc:oh is %d:%d, c_h_gcd %d\n", ic, ih, oc, oh, c_h_gcd); + + assert(dilation_h * oc * iw <= oh * oc * iw); + tl_shape->n = tg_shape->n = in * 2; + tl_shape->c = tg_shape->c = oc; + tl_shape->h = tg_shape->h = oh; + tl_shape->w = tg_shape->w = iw; + + //tl_stride->n = tg_stride->n = iw * oh * oc; + + tl_stride->c = tg_stride->c = iw * oh; + tl_stride->h = tg_stride->h = iw; + + tg_stride->n = iw * (oh) * src_tg_fmt_sz; + tg_stride->c *= src_tg_fmt_sz; + tg_stride->h *= src_tg_fmt_sz; + + tl_stride->n = iw * oh * oc * dst_tl_fmt_sz; + tl_stride->c *= dst_tl_fmt_sz; + tl_stride->h *= dst_tl_fmt_sz; + tl_stride->w = dst_tl_fmt_sz; + + return ret; +} + +static inline void get_dup_first_channel_shape(int in, int ic, int ih, int iw, + bmk1880v2_tensor_lmem_shape_t* tl_shape, bmk1880v2_tensor_lmem_stride_t* tl_stride, + bmk1880v2_tensor_tgmem_shape_t* tg_shape, bmk1880v2_tensor_tgmem_stride_t* tg_stride, + fmt_t src_tg_fmt, fmt_t dst_tl_fmt + ) { + + // 2 means 2 bytes + int src_tg_fmt_sz = src_tg_fmt == FMT_BF16 ? 2 : 1; + int dst_tl_fmt_sz = dst_tl_fmt == FMT_BF16 ? 2 : 1; + + tl_shape->n = tg_shape->n = in; + tl_shape->c = tg_shape->c = ic; + tl_shape->h = tg_shape->h = ih; + tl_shape->w = tg_shape->w = iw; + + tl_stride->c = tg_stride->c = iw * ih; + tl_stride->h = tg_stride->h = iw; + + tg_stride->n = 0; + tg_stride->c = 0; + tg_stride->h *= src_tg_fmt_sz; + + tl_stride->n = iw * ih * ic * dst_tl_fmt_sz; + tl_stride->c *= dst_tl_fmt_sz; + tl_stride->h *= dst_tl_fmt_sz; + tl_stride->w = dst_tl_fmt_sz; +} +#endif +#endif /* INC_1880v2_TEST_UTIL_H */ diff --git a/cviruntime/test/1880v2/1880v2_vlc_random_gen_nn_data.h b/cviruntime/test/1880v2/1880v2_vlc_random_gen_nn_data.h new file mode 100644 index 000000000..7ccbcf71b --- /dev/null +++ b/cviruntime/test/1880v2/1880v2_vlc_random_gen_nn_data.h @@ -0,0 +1,91 @@ +/** + * copy from git@gitlab-ai.bitmain.vip:2290/wesley.teng/tpu_compress.git tpu_compress/test_vlc_compress.c + only include random_gen_nn_data relative function + */ + +#ifndef __BM_VLC_COMPRESS_RANDOM_GEN_NN_DATA_H__ +#define __BM_VLC_COMPRESS_RANDOM_GEN_NN_DATA_H__ +#include +#include +#ifdef __cplusplus +extern "C" +{ +#endif + +#include +#include +#include +#include +#include + +// --- contrain random test --- +double getGaussianRandomVar(double mean, double std) +{ + double PI = 3.1415926; + double u0 = (double)rand() / RAND_MAX; + double u1 = (double)rand() / RAND_MAX; + double n = sqrt(-2 * log(u0)) * cos(2 * PI * u1); + return n * std + mean; +} + +double getExpRandomVar(double lambda) +{ + double x = (double)rand() / RAND_MAX; + return log(1 - x) / (-lambda); +} + +void random_gen_nn_data(uint8_t *ibuf, size_t in_num, bool signedness, bool data_type, double zero_ratio) +{ + float *random_buf = (float *)malloc(in_num * sizeof(float)); + int zero_thr = (int)(100 * zero_ratio); + double lambda = getGaussianRandomVar(0, 0.5); + double mean = getGaussianRandomVar(0, 8); + bool pdf_sel = ((rand() % 10) < 9); // 9 over 10 choose exponential pdf + double max_v = 0; + double eps = 0.0001; + lambda += (lambda > 0) ? eps : -eps; + for (size_t i = 0; i < in_num; i++) + { + double val = (pdf_sel) ? getExpRandomVar(lambda) : getGaussianRandomVar(mean, lambda); + val = ((signedness || data_type) && rand() % 2) ? -val : val; + random_buf[i] = ((rand() % 100) < zero_thr) ? 0 : val; + max_v = (fabs(random_buf[i]) > max_v) ? fabs(random_buf[i]) : max_v; + } + + if (data_type == 0) // INT8 + { + double cali_decay = (signedness) ? 
(rand() / (double)RAND_MAX) + 1 : 1; // weight dacay by calibration + uint8_t pruned_thr = (signedness && !data_type && (rand() % 2)) ? rand() % 12 : 0; + for (size_t i = 0; i < in_num; i++) + { + int val = (int)((random_buf[i] * 127) / (max_v * cali_decay)); + ibuf[i] = (abs(val) < pruned_thr) + ? 0 + : (val > 127) + ? 127 + : (val < (-128)) + ? -128 + : val; + } + } + else // BFloat16 + { + uint16_t *bf16_buf = (uint16_t *)random_buf; + for (size_t i = 0; i < in_num; i++) + { + short bf16_val = bf16_buf[(i << 1) + 1]; + // WARNING: set subnormal value to zero since HW do NOT support + int exp = ((bf16_val >> 7) & 0xFF); + bf16_val = (exp) ? bf16_val : 0; + + ibuf[i << 1] = (uint8_t)(bf16_val & 0xFF); + ibuf[(i << 1) + 1] = (uint8_t)(bf16_val >> 8); + } + } + free(random_buf); +} + #ifdef __cplusplus +} +#endif + +#endif /* __BM_VLC_COMPRESS_RANDOM_GEN_NN_DATA_H__ */ diff --git a/cviruntime/test/1880v2/bf16/1880v2_bf16_util.h b/cviruntime/test/1880v2/bf16/1880v2_bf16_util.h new file mode 100644 index 000000000..80da8bef8 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/1880v2_bf16_util.h @@ -0,0 +1,60 @@ +#ifndef INC_1880v2_BF16_UTIL_H +#define INC_1880v2_BF16_UTIL_H + +#define RAND_SEED_MOD 10 +#define COMPARE_PASS 0 + +u16 corner_val[] = { + 0x0000, // 0 00000000 0000000 = zero + 0x8000, // 1 00000000 0000000 = −zero + 0x7f80, // 0 11111111 0000000 = infinity + 0xff80, // 1 11111111 0000000 = −infinity + 0x4049, // 0 10000000 1001001 = 3.140625 ≈ π ( pi ) + 0x3eab, // 0 01111101 0101011 = 0.333984375 ≈ 1/3 + 0xffc1, // x 11111111 1000001 => qNaN + 0xff81, // x 11111111 0000001 => sNaN + 0x00ff, // x 00000000 1111111 => denormal +}; + +u16 generate_bf16_corner_val(float val) +{ + if( rand()%RAND_SEED_MOD == 0 ) { + return corner_val[ rand() % (sizeof(corner_val)/sizeof(u16)) ]; + } else { + return convert_fp32_bf16(val); + } +} + +int compare_result( void *ref_x, void *result_x , fmt_t fmt, int stride_size) +{ + u8 *u8result_x = NULL; + u16 *u16result_x = NULL; + u8 *u8ref_x = NULL; + u16 *u16ref_x = NULL; + + if(fmt == FMT_BF16) { + u16result_x = (u16 *)result_x; + u16ref_x = (u16 *)ref_x; + for (int i = 0; i < stride_size; i++) { + if (u16result_x[i] != u16ref_x[i]) { + printf("compare failed at result_x[%d], got %d, exp %d\n", + i, u16result_x[i], u16ref_x[i]); + return -1; + } + } + } else { + u8result_x = (u8 *)result_x; + u8ref_x = (u8 *)ref_x; + for (int i = 0; i < stride_size; i++) { + if (u8result_x[i] != u8ref_x[i]) { + printf("compare failed at result_x[%d], got %d, exp %d\n", + i, u8result_x[i], u8ref_x[i]); + return -1; + } + } + } + + return 0; +} + +#endif /* INC_1880v2_BF16_UTIL_H */ diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_atan.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_atan.cpp new file mode 100644 index 000000000..a782fa605 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_atan.cpp @@ -0,0 +1,2624 @@ +/** + */ +#include "../1880v2_test_util.h" +#define OUT +#define IN +#include +#include +#include +#include +#include +#include +//#include +//#define DBG + +using namespace std; + +//TODO: get from ctx +static u32 channel = 32; // +typename std::enable_if::is_integer, bool>::type + almost_equal(T x, T y, int ulp) +{ + // the machine epsilon has to be scaled to the magnitude of the values used + // and multiplied by the desired precision in ULPs (units in the last place) + return std::abs(x-y) < std::numeric_limits::epsilon() * std::abs(x+y) * ulp + // unless the result is subnormal + || std::abs(x-y) < std::numeric_limits::min(); +} 
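+/* almost_equal(): ULP-scaled epsilon comparison (the classic cppreference
+ * recipe); intended for the accuracy-compare test modes defined below,
+ * which check against an epsilon rather than exact bit equality. */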
+/** + * pre_data means we test fixed pattern, it should be same sa lut + */ +enum TEST_MODE { + PRE_DATA_COMPARE_FIX = 0, // pre-data + fix compare + DATA_COMPARE_ACCURACY, //generate \range_start to \range_end value that check epsilon + DATA_COMPARE_U8, //generate \range_start to \range_end value that check epsilon, result bf16->u8 + TEST_MODE_MAX, +}; + +static TEST_MODE mode; + +static u16 test_pattern[] = { + 0x0000, + 0x38D2, + 0x3952, + 0x399D, + 0x39D2, + 0x3A03, + 0x3A1D, + 0x3A38, + 0x3A52, + 0x3A6C, + 0x3A83, + 0x3A90, + 0x3A9D, + 0x3AAA, + 0x3AB8, + 0x3AC5, + 0x3AD2, + 0x3ADF, + 0x3AEC, + 0x3AF9, + 0x3B03, + 0x3B0A, + 0x3B10, + 0x3B17, + 0x3B1D, + 0x3B24, + 0x3B2A, + 0x3B31, + 0x3B38, + 0x3B3E, + 0x3B45, + 0x3B4B, + 0x3B52, + 0x3B58, + 0x3B5F, + 0x3B65, + 0x3B6C, + 0x3B72, + 0x3B79, + 0x3B80, + 0x3B83, + 0x3B86, + 0x3B8A, + 0x3B8D, + 0x3B90, + 0x3B93, + 0x3B97, + 0x3B9A, + 0x3B9D, + 0x3BA1, + 0x3BA4, + 0x3BA7, + 0x3BAA, + 0x3BAE, + 0x3BB1, + 0x3BB4, + 0x3BB8, + 0x3BBB, + 0x3BBE, + 0x3BC1, + 0x3BC5, + 0x3BC8, + 0x3BCB, + 0x3BCE, + 0x3BD2, + 0x3BD5, + 0x3BD8, + 0x3BDC, + 0x3BDF, + 0x3BE2, + 0x3BE5, + 0x3BE9, + 0x3BEC, + 0x3BEF, + 0x3BF2, + 0x3BF6, + 0x3BF9, + 0x3BFC, + 0x3C00, + 0x3C01, + 0x3C03, + 0x3C05, + 0x3C06, + 0x3C08, + 0x3C0A, + 0x3C0B, + 0x3C0D, + 0x3C0F, + 0x3C10, + 0x3C12, + 0x3C13, + 0x3C15, + 0x3C17, + 0x3C18, + 0x3C1A, + 0x3C1C, + 0x3C1D, + 0x3C1F, + 0x3C21, + 0x3C22, + 0x3C24, + 0x3C25, + 0x3C27, + 0x3C29, + 0x3C2A, + 0x3C2C, + 0x3C2E, + 0x3C2F, + 0x3C31, + 0x3C33, + 0x3C34, + 0x3C36, + 0x3C38, + 0x3C39, + 0x3C3B, + 0x3C3C, + 0x3C3E, + 0x3C40, + 0x3C41, + 0x3C43, + 0x3C45, + 0x3C46, + 0x3C48, + 0x3C4A, + 0x3C4B, + 0x3C4D, + 0x3C4E, + 0x3C50, + 0x3C52, + 0x3C53, + 0x3C55, + 0x3C57, + 0x3C58, + 0x3C5A, + 0x3C5C, + 0x3C5D, + 0x3C5F, + 0x3C60, + 0x3C62, + 0x3C64, + 0x3C65, + 0x3C67, + 0x3C69, + 0x3C6A, + 0x3C6C, + 0x3C6E, + 0x3C6F, + 0x3C71, + 0x3C72, + 0x3C74, + 0x3C76, + 0x3C77, + 0x3C79, + 0x3C7B, + 0x3C7C, + 0x3C7E, + 0x3C80, + 0x3C81, + 0x3C81, + 0x3C82, + 0x3C83, + 0x3C84, + 0x3C85, + 0x3C86, + 0x3C86, + 0x3C87, + 0x3C88, + 0x3C89, + 0x3C8A, + 0x3C8A, + 0x3C8B, + 0x3C8C, + 0x3C8D, + 0x3C8E, + 0x3C8F, + 0x3C8F, + 0x3C90, + 0x3C91, + 0x3C92, + 0x3C93, + 0x3C93, + 0x3C94, + 0x3C95, + 0x3C96, + 0x3C97, + 0x3C98, + 0x3C98, + 0x3C99, + 0x3C9A, + 0x3C9B, + 0x3C9C, + 0x3C9C, + 0x3C9D, + 0x3C9E, + 0x3C9F, + 0x3CA0, + 0x3CA1, + 0x3CA1, + 0x3CA2, + 0x3CA3, + 0x3CA4, + 0x3CA5, + 0x3CA5, + 0x3CA6, + 0x3CA7, + 0x3CA8, + 0x3CA9, + 0x3CAA, + 0x3CAA, + 0x3CAB, + 0x3CAC, + 0x3CAD, + 0x3CAE, + 0x3CAE, + 0x3CAF, + 0x3CB0, + 0x3CB1, + 0x3CB2, + 0x3CB3, + 0x3CB3, + 0x3CB4, + 0x3CB5, + 0x3CB6, + 0x3CB7, + 0x3CB8, + 0x3CB8, + 0x3CB9, + 0x3CBA, + 0x3CBB, + 0x3CBC, + 0x3CBC, + 0x3CBD, + 0x3CBE, + 0x3CBF, + 0x3CC0, + 0x3CC1, + 0x3CC1, + 0x3CC2, + 0x3CC3, + 0x3CC4, + 0x3CC5, + 0x3CC5, + 0x3CC6, + 0x3CC7, + 0x3CC8, + 0x3CC9, + 0x3CCA, + 0x3CCA, + 0x3CCB, + 0x3CCC, + 0x3CCD, + 0x3CCE, + 0x3CCE, + 0x3CCF, + 0x3CD0, + 0x3CD1, + 0x3CD2, + 0x3CD3, + 0x3CD3, + 0x3CD4, + 0x3CD5, + 0x3CD6, + 0x3CD7, + 0x3CD7, + 0x3CD8, + 0x3CD9, + 0x3CDA, + 0x3CDB, + 0x3CDC, + 0x3CDC, + 0x3CDD, + 0x3CDE, + 0x3CDF, + 0x3CE0, + 0x3CE0, + 0x3CE1, + 0x3CE2, + 0x3CE3, + 0x3CE4, + 0x3CE5, + 0x3CE5, + 0x3CE6, + 0x3CE7, + 0x3CE8, + 0x3CE9, + 0x3CE9, + 0x3CEA, + 0x3CEB, + 0x3CEC, + 0x3CED, + 0x3CEE, + 0x3CEE, + 0x3CEF, + 0x3CF0, + 0x3CF1, + 0x3CF2, + 0x3CF2, + 0x3CF3, + 0x3CF4, + 0x3CF5, + 0x3CF6, + 0x3CF7, + 0x3CF7, + 0x3CF8, + 0x3CF9, + 0x3CFA, + 0x3CFB, + 0x3CFB, + 0x3CFC, + 0x3CFD, + 0x3CFE, + 0x3CFF, + 0x3D00, + 
0x3D00, + 0x3D01, + 0x3D01, + 0x3D01, + 0x3D02, + 0x3D02, + 0x3D03, + 0x3D03, + 0x3D03, + 0x3D04, + 0x3D04, + 0x3D05, + 0x3D05, + 0x3D06, + 0x3D06, + 0x3D06, + 0x3D07, + 0x3D07, + 0x3D08, + 0x3D08, + 0x3D08, + 0x3D09, + 0x3D09, + 0x3D0A, + 0x3D0A, + 0x3D0A, + 0x3D0B, + 0x3D0B, + 0x3D0C, + 0x3D0C, + 0x3D0C, + 0x3D0D, + 0x3D0D, + 0x3D0E, + 0x3D0E, + 0x3D0F, + 0x3D0F, + 0x3D0F, + 0x3D10, + 0x3D10, + 0x3D11, + 0x3D11, + 0x3D11, + 0x3D12, + 0x3D12, + 0x3D13, + 0x3D13, + 0x3D13, + 0x3D14, + 0x3D14, + 0x3D15, + 0x3D15, + 0x3D16, + 0x3D16, + 0x3D16, + 0x3D17, + 0x3D17, + 0x3D18, + 0x3D18, + 0x3D18, + 0x3D19, + 0x3D19, + 0x3D1A, + 0x3D1A, + 0x3D1A, + 0x3D1B, + 0x3D1B, + 0x3D1C, + 0x3D1C, + 0x3D1C, + 0x3D1D, + 0x3D1D, + 0x3D1E, + 0x3D1E, + 0x3D1F, + 0x3D1F, + 0x3D1F, + 0x3D20, + 0x3D20, + 0x3D21, + 0x3D21, + 0x3D21, + 0x3D22, + 0x3D22, + 0x3D23, + 0x3D23, + 0x3D23, + 0x3D24, + 0x3D24, + 0x3D25, + 0x3D25, + 0x3D25, + 0x3D26, + 0x3D26, + 0x3D27, + 0x3D27, + 0x3D28, + 0x3D28, + 0x3D28, + 0x3D29, + 0x3D29, + 0x3D2A, + 0x3D2A, + 0x3D2A, + 0x3D2B, + 0x3D2B, + 0x3D2C, + 0x3D2C, + 0x3D2C, + 0x3D2D, + 0x3D2D, + 0x3D2E, + 0x3D2E, + 0x3D2E, + 0x3D2F, + 0x3D2F, + 0x3D30, + 0x3D30, + 0x3D31, + 0x3D31, + 0x3D31, + 0x3D32, + 0x3D32, + 0x3D33, + 0x3D33, + 0x3D33, + 0x3D34, + 0x3D34, + 0x3D35, + 0x3D35, + 0x3D35, + 0x3D36, + 0x3D36, + 0x3D37, + 0x3D37, + 0x3D38, + 0x3D38, + 0x3D38, + 0x3D39, + 0x3D39, + 0x3D3A, + 0x3D3A, + 0x3D3A, + 0x3D3B, + 0x3D3B, + 0x3D3C, + 0x3D3C, + 0x3D3C, + 0x3D3D, + 0x3D3D, + 0x3D3E, + 0x3D3E, + 0x3D3E, + 0x3D3F, + 0x3D3F, + 0x3D40, + 0x3D40, + 0x3D41, + 0x3D41, + 0x3D41, + 0x3D42, + 0x3D42, + 0x3D43, + 0x3D43, + 0x3D43, + 0x3D44, + 0x3D44, + 0x3D45, + 0x3D45, + 0x3D45, + 0x3D46, + 0x3D46, + 0x3D47, + 0x3D47, + 0x3D47, + 0x3D48, + 0x3D48, + 0x3D49, + 0x3D49, + 0x3D4A, + 0x3D4A, + 0x3D4A, + 0x3D4B, + 0x3D4B, + 0x3D4C, + 0x3D4C, + 0x3D4C, + 0x3D4D, + 0x3D4D, + 0x3D4E, + 0x3D4E, + 0x3D4E, + 0x3D4F, + 0x3D4F, + 0x3D50, + 0x3D50, + 0x3D50, + 0x3D51, + 0x3D51, + 0x3D52, + 0x3D52, + 0x3D53, + 0x3D53, + 0x3D53, + 0x3D54, + 0x3D54, + 0x3D55, + 0x3D55, + 0x3D55, + 0x3D56, + 0x3D56, + 0x3D57, + 0x3D57, + 0x3D57, + 0x3D58, + 0x3D58, + 0x3D59, + 0x3D59, + 0x3D59, + 0x3D5A, + 0x3D5A, + 0x3D5B, + 0x3D5B, + 0x3D5C, + 0x3D5C, + 0x3D5C, + 0x3D5D, + 0x3D5D, + 0x3D5E, + 0x3D5E, + 0x3D5E, + 0x3D5F, + 0x3D5F, + 0x3D60, + 0x3D60, + 0x3D60, + 0x3D61, + 0x3D61, + 0x3D62, + 0x3D62, + 0x3D63, + 0x3D63, + 0x3D63, + 0x3D64, + 0x3D64, + 0x3D65, + 0x3D65, + 0x3D65, + 0x3D66, + 0x3D66, + 0x3D67, + 0x3D67, + 0x3D67, + 0x3D68, + 0x3D68, + 0x3D69, + 0x3D69, + 0x3D69, + 0x3D6A, + 0x3D6A, + 0x3D6B, + 0x3D6B, + 0x3D6C, + 0x3D6C, + 0x3D6C, + 0x3D6D, + 0x3D6D, + 0x3D6E, + 0x3D6E, + 0x3D6E, + 0x3D6F, + 0x3D6F, + 0x3D70, + 0x3D70, + 0x3D70, + 0x3D71, + 0x3D71, + 0x3D72, + 0x3D72, + 0x3D72, + 0x3D73, + 0x3D73, + 0x3D74, + 0x3D74, + 0x3D75, + 0x3D75, + 0x3D75, + 0x3D76, + 0x3D76, + 0x3D77, + 0x3D77, + 0x3D77, + 0x3D78, + 0x3D78, + 0x3D79, + 0x3D79, + 0x3D79, + 0x3D7A, + 0x3D7A, + 0x3D7B, + 0x3D7B, + 0x3D7B, + 0x3D7C, + 0x3D7C, + 0x3D7D, + 0x3D7D, + 0x3D7E, + 0x3D7E, + 0x3D7E, + 0x3D7F, + 0x3D7F, + 0x3D80, + 0x3D80, + 0x3D80, + 0x3D80, + 0x3D81, + 0x3D81, + 0x3D81, + 0x3D81, + 0x3D81, + 0x3D82, + 0x3D82, + 0x3D82, + 0x3D82, + 0x3D82, + 0x3D83, + 0x3D83, + 0x3D83, + 0x3D83, + 0x3D83, + 0x3D84, + 0x3D84, + 0x3D84, + 0x3D84, + 0x3D85, + 0x3D85, + 0x3D85, + 0x3D85, + 0x3D85, + 0x3D86, + 0x3D86, + 0x3D86, + 0x3D86, + 0x3D86, + 0x3D87, + 0x3D87, + 0x3D87, + 0x3D87, + 0x3D87, + 0x3D88, + 0x3D88, + 0x3D88, + 0x3D88, + 0x3D88, + 0x3D89, + 
0x3D89, + 0x3D89, + 0x3D89, + 0x3D89, + 0x3D8A, + 0x3D8A, + 0x3D8A, + 0x3D8A, + 0x3D8A, + 0x3D8B, + 0x3D8B, + 0x3D8B, + 0x3D8B, + 0x3D8B, + 0x3D8C, + 0x3D8C, + 0x3D8C, + 0x3D8C, + 0x3D8C, + 0x3D8D, + 0x3D8D, + 0x3D8D, + 0x3D8D, + 0x3D8E, + 0x3D8E, + 0x3D8E, + 0x3D8E, + 0x3D8E, + 0x3D8F, + 0x3D8F, + 0x3D8F, + 0x3D8F, + 0x3D8F, + 0x3D90, + 0x3D90, + 0x3D90, + 0x3D90, + 0x3D90, + 0x3D91, + 0x3D91, + 0x3D91, + 0x3D91, + 0x3D91, + 0x3D92, + 0x3D92, + 0x3D92, + 0x3D92, + 0x3D92, + 0x3D93, + 0x3D93, + 0x3D93, + 0x3D93, + 0x3D93, + 0x3D94, + 0x3D94, + 0x3D94, + 0x3D94, + 0x3D94, + 0x3D95, + 0x3D95, + 0x3D95, + 0x3D95, + 0x3D96, + 0x3D96, + 0x3D96, + 0x3D96, + 0x3D96, + 0x3D97, + 0x3D97, + 0x3D97, + 0x3D97, + 0x3D97, + 0x3D98, + 0x3D98, + 0x3D98, + 0x3D98, + 0x3D98, + 0x3D99, + 0x3D99, + 0x3D99, + 0x3D99, + 0x3D99, + 0x3D9A, + 0x3D9A, + 0x3D9A, + 0x3D9A, + 0x3D9A, + 0x3D9B, + 0x3D9B, + 0x3D9B, + 0x3D9B, + 0x3D9B, + 0x3D9C, + 0x3D9C, + 0x3D9C, + 0x3D9C, + 0x3D9C, + 0x3D9D, + 0x3D9D, + 0x3D9D, + 0x3D9D, + 0x3D9D, + 0x3D9E, + 0x3D9E, + 0x3D9E, + 0x3D9E, + 0x3D9F, + 0x3D9F, + 0x3D9F, + 0x3D9F, + 0x3D9F, + 0x3DA0, + 0x3DA0, + 0x3DA0, + 0x3DA0, + 0x3DA0, + 0x3DA1, + 0x3DA1, + 0x3DA1, + 0x3DA1, + 0x3DA1, + 0x3DA2, + 0x3DA2, + 0x3DA2, + 0x3DA2, + 0x3DA2, + 0x3DA3, + 0x3DA3, + 0x3DA3, + 0x3DA3, + 0x3DA3, + 0x3DA4, + 0x3DA4, + 0x3DA4, + 0x3DA4, + 0x3DA4, + 0x3DA5, + 0x3DA5, + 0x3DA5, + 0x3DA5, + 0x3DA5, + 0x3DA6, + 0x3DA6, + 0x3DA6, + 0x3DA6, + 0x3DA7, + 0x3DA7, + 0x3DA7, + 0x3DA7, + 0x3DA7, + 0x3DA8, + 0x3DA8, + 0x3DA8, + 0x3DA8, + 0x3DA8, + 0x3DA9, + 0x3DA9, + 0x3DA9, + 0x3DA9, + 0x3DA9, + 0x3DAA, + 0x3DAA, + 0x3DAA, + 0x3DAA, + 0x3DAA, + 0x3DAB, + 0x3DAB, + 0x3DAB, + 0x3DAB, + 0x3DAB, + 0x3DAC, + 0x3DAC, + 0x3DAC, + 0x3DAC, + 0x3DAC, + 0x3DAD, + 0x3DAD, + 0x3DAD, + 0x3DAD, + 0x3DAD, + 0x3DAE, + 0x3DAE, + 0x3DAE, + 0x3DAE, + 0x3DAE, + 0x3DAF, + 0x3DAF, + 0x3DAF, + 0x3DAF, + 0x3DB0, + 0x3DB0, + 0x3DB0, + 0x3DB0, + 0x3DB0, + 0x3DB1, + 0x3DB1, + 0x3DB1, + 0x3DB1, + 0x3DB1, + 0x3DB2, + 0x3DB2, + 0x3DB2, + 0x3DB2, + 0x3DB2, + 0x3DB3, + 0x3DB3, + 0x3DB3, + 0x3DB3, + 0x3DB3, + 0x3DB4, + 0x3DB4, + 0x3DB4, + 0x3DB4, + 0x3DB4, + 0x3DB5, + 0x3DB5, + 0x3DB5, + 0x3DB5, + 0x3DB5, + 0x3DB6, + 0x3DB6, + 0x3DB6, + 0x3DB6, + 0x3DB6, + 0x3DB7, + 0x3DB7, + 0x3DB7, + 0x3DB7, + 0x3DB8, + 0x3DB8, + 0x3DB8, + 0x3DB8, + 0x3DB8, + 0x3DB9, + 0x3DB9, + 0x3DB9, + 0x3DB9, + 0x3DB9, + 0x3DBA, + 0x3DBA, + 0x3DBA, + 0x3DBA, + 0x3DBA, + 0x3DBB, + 0x3DBB, + 0x3DBB, + 0x3DBB, + 0x3DBB, + 0x3DBC, + 0x3DBC, + 0x3DBC, + 0x3DBC, + 0x3DBC, + 0x3DBD, + 0x3DBD, + 0x3DBD, + 0x3DBD, + 0x3DBD, + 0x3DBE, + 0x3DBE, + 0x3DBE, + 0x3DBE, + 0x3DBE, + 0x3DBF, + 0x3DBF, + 0x3DBF, + 0x3DBF, + 0x3DBF, + 0x3DC0, + 0x3DC0, + 0x3DC0, + 0x3DC0, + 0x3DC1, + 0x3DC1, + 0x3DC1, + 0x3DC1, + 0x3DC1, + 0x3DC2, + 0x3DC2, + 0x3DC2, + 0x3DC2, + 0x3DC2, + 0x3DC3, + 0x3DC3, + 0x3DC3, + 0x3DC3, + 0x3DC3, + 0x3DC4, + 0x3DC4, + 0x3DC4, + 0x3DC4, + 0x3DC4, + 0x3DC5, + 0x3DC5, + 0x3DC5, + 0x3DC5, + 0x3DC5, + 0x3DC6, + 0x3DC6, + 0x3DC6, + 0x3DC6, + 0x3DC6, + 0x3DC7, + 0x3DC7, + 0x3DC7, + 0x3DC7, + 0x3DC7, + 0x3DC8, + 0x3DC8, + 0x3DC8, + 0x3DC8, + 0x3DC8, + 0x3DC9, + 0x3DC9, + 0x3DC9, + 0x3DC9, + 0x3DCA, + 0x3DCA, + 0x3DCA, + 0x3DCA, + 0x3DCA, + 0x3DCB, + 0x3DCB, + 0x3DCB, + 0x3DCB, + 0x3DCB, + 0x3DCC, + 0x3DCC, + 0x3DCC, + 0x3DCC, + 0x3DCC, + 0x3DCD, + 0x3DCE, + 0x3DCF, + 0x3DD0, + 0x3DD1, + 0x3DD2, + 0x3DD3, + 0x3DD4, + 0x3DD5, + 0x3DD6, + 0x3DD7, + 0x3DD8, + 0x3DD9, + 0x3DDA, + 0x3DDB, + 0x3DDC, + 0x3DDD, + 0x3DDE, + 0x3DDF, + 0x3DE0, + 0x3DE1, + 0x3DE2, + 0x3DE3, + 0x3DE4, + 
0x3DE5, +}; + +static u16 golden_bf16[] = { + 0x0, + 0x38d2, + 0x3952, + 0x399d, + 0x39d2, + 0x3a03, + 0x3a1d, + 0x3a38, + 0x3a52, + 0x3a6c, + 0x3a83, + 0x3a90, + 0x3a9d, + 0x3aaa, + 0x3ab8, + 0x3ac5, + 0x3ad2, + 0x3adf, + 0x3aec, + 0x3af9, + 0x3b03, + 0x3b0a, + 0x3b10, + 0x3b17, + 0x3b1d, + 0x3b24, + 0x3b2a, + 0x3b31, + 0x3b38, + 0x3b3e, + 0x3b45, + 0x3b4b, + 0x3b52, + 0x3b58, + 0x3b5f, + 0x3b65, + 0x3b6c, + 0x3b72, + 0x3b79, + 0x3b80, + 0x3b83, + 0x3b86, + 0x3b8a, + 0x3b8d, + 0x3b90, + 0x3b93, + 0x3b97, + 0x3b9a, + 0x3b9d, + 0x3ba1, + 0x3ba4, + 0x3ba7, + 0x3baa, + 0x3bae, + 0x3bb1, + 0x3bb4, + 0x3bb8, + 0x3bbb, + 0x3bbe, + 0x3bc1, + 0x3bc5, + 0x3bc8, + 0x3bcb, + 0x3bce, + 0x3bd2, + 0x3bd5, + 0x3bd8, + 0x3bdc, + 0x3bdf, + 0x3be2, + 0x3be5, + 0x3be9, + 0x3bec, + 0x3bef, + 0x3bf2, + 0x3bf6, + 0x3bf9, + 0x3bfc, + 0x3c00, + 0x3c01, + 0x3c03, + 0x3c05, + 0x3c06, + 0x3c08, + 0x3c0a, + 0x3c0b, + 0x3c0d, + 0x3c0f, + 0x3c10, + 0x3c12, + 0x3c13, + 0x3c15, + 0x3c17, + 0x3c18, + 0x3c1a, + 0x3c1c, + 0x3c1d, + 0x3c1f, + 0x3c21, + 0x3c22, + 0x3c24, + 0x3c25, + 0x3c27, + 0x3c29, + 0x3c2a, + 0x3c2c, + 0x3c2e, + 0x3c2f, + 0x3c31, + 0x3c33, + 0x3c34, + 0x3c36, + 0x3c38, + 0x3c39, + 0x3c3b, + 0x3c3c, + 0x3c3e, + 0x3c40, + 0x3c41, + 0x3c43, + 0x3c45, + 0x3c46, + 0x3c48, + 0x3c4a, + 0x3c4b, + 0x3c4d, + 0x3c4e, + 0x3c50, + 0x3c52, + 0x3c53, + 0x3c55, + 0x3c57, + 0x3c58, + 0x3c5a, + 0x3c5c, + 0x3c5d, + 0x3c5f, + 0x3c60, + 0x3c62, + 0x3c64, + 0x3c65, + 0x3c67, + 0x3c69, + 0x3c6a, + 0x3c6c, + 0x3c6e, + 0x3c6f, + 0x3c71, + 0x3c72, + 0x3c74, + 0x3c76, + 0x3c77, + 0x3c79, + 0x3c7b, + 0x3c7c, + 0x3c7e, + 0x3c80, + 0x3c81, + 0x3c81, + 0x3c82, + 0x3c83, + 0x3c84, + 0x3c85, + 0x3c86, + 0x3c86, + 0x3c87, + 0x3c88, + 0x3c89, + 0x3c8a, + 0x3c8a, + 0x3c8b, + 0x3c8c, + 0x3c8d, + 0x3c8e, + 0x3c8f, + 0x3c8f, + 0x3c90, + 0x3c91, + 0x3c92, + 0x3c93, + 0x3c93, + 0x3c94, + 0x3c95, + 0x3c96, + 0x3c97, + 0x3c98, + 0x3c98, + 0x3c99, + 0x3c9a, + 0x3c9b, + 0x3c9c, + 0x3c9c, + 0x3c9d, + 0x3c9e, + 0x3c9f, + 0x3ca0, + 0x3ca1, + 0x3ca1, + 0x3ca2, + 0x3ca3, + 0x3ca4, + 0x3ca5, + 0x3ca5, + 0x3ca6, + 0x3ca7, + 0x3ca8, + 0x3ca9, + 0x3caa, + 0x3caa, + 0x3cab, + 0x3cac, + 0x3cad, + 0x3cae, + 0x3cae, + 0x3caf, + 0x3cb0, + 0x3cb1, + 0x3cb2, + 0x3cb3, + 0x3cb3, + 0x3cb4, + 0x3cb5, + 0x3cb6, + 0x3cb7, + 0x3cb8, + 0x3cb8, + 0x3cb9, + 0x3cba, + 0x3cbb, + 0x3cbc, + 0x3cbc, + 0x3cbd, + 0x3cbe, + 0x3cbf, + 0x3cc0, + 0x3cc1, + 0x3cc1, + 0x3cc2, + 0x3cc3, + 0x3cc4, + 0x3cc5, + 0x3cc5, + 0x3cc6, + 0x3cc7, + 0x3cc8, + 0x3cc9, + 0x3cca, + 0x3cca, + 0x3ccb, + 0x3ccc, + 0x3ccd, + 0x3cce, + 0x3cce, + 0x3ccf, + 0x3cd0, + 0x3cd1, + 0x3cd2, + 0x3cd3, + 0x3cd3, + 0x3cd4, + 0x3cd5, + 0x3cd6, + 0x3cd7, + 0x3cd7, + 0x3cd8, + 0x3cd9, + 0x3cda, + 0x3cdb, + 0x3cdc, + 0x3cdc, + 0x3cdd, + 0x3cde, + 0x3cdf, + 0x3ce0, + 0x3ce0, + 0x3ce1, + 0x3ce2, + 0x3ce3, + 0x3ce4, + 0x3ce5, + 0x3ce5, + 0x3ce6, + 0x3ce7, + 0x3ce8, + 0x3ce9, + 0x3ce9, + 0x3cea, + 0x3ceb, + 0x3cec, + 0x3ced, + 0x3cee, + 0x3cee, + 0x3cef, + 0x3cf0, + 0x3cf1, + 0x3cf2, + 0x3cf2, + 0x3cf3, + 0x3cf4, + 0x3cf5, + 0x3cf6, + 0x3cf7, + 0x3cf7, + 0x3cf8, + 0x3cf9, + 0x3cfa, + 0x3cfb, + 0x3cfb, + 0x3cfc, + 0x3cfd, + 0x3cfe, + 0x3cff, + 0x3d00, + 0x3d00, + 0x3d01, + 0x3d01, + 0x3d01, + 0x3d02, + 0x3d02, + 0x3d03, + 0x3d03, + 0x3d03, + 0x3d04, + 0x3d04, + 0x3d05, + 0x3d05, + 0x3d06, + 0x3d06, + 0x3d06, + 0x3d07, + 0x3d07, + 0x3d08, + 0x3d08, + 0x3d08, + 0x3d09, + 0x3d09, + 0x3d0a, + 0x3d0a, + 0x3d0a, + 0x3d0b, + 0x3d0b, + 0x3d0c, + 0x3d0c, + 0x3d0c, + 0x3d0d, + 0x3d0d, + 0x3d0e, + 0x3d0e, + 0x3d0f, + 0x3d0f, + 0x3d0f, + 
0x3d10, + 0x3d10, + 0x3d11, + 0x3d11, + 0x3d11, + 0x3d12, + 0x3d12, + 0x3d13, + 0x3d13, + 0x3d13, + 0x3d14, + 0x3d14, + 0x3d15, + 0x3d15, + 0x3d16, + 0x3d16, + 0x3d16, + 0x3d17, + 0x3d17, + 0x3d18, + 0x3d18, + 0x3d18, + 0x3d19, + 0x3d19, + 0x3d1a, + 0x3d1a, + 0x3d1a, + 0x3d1b, + 0x3d1b, + 0x3d1c, + 0x3d1c, + 0x3d1c, + 0x3d1d, + 0x3d1d, + 0x3d1e, + 0x3d1e, + 0x3d1f, + 0x3d1f, + 0x3d1f, + 0x3d20, + 0x3d20, + 0x3d21, + 0x3d21, + 0x3d21, + 0x3d22, + 0x3d22, + 0x3d23, + 0x3d23, + 0x3d23, + 0x3d24, + 0x3d24, + 0x3d25, + 0x3d25, + 0x3d25, + 0x3d26, + 0x3d26, + 0x3d27, + 0x3d27, + 0x3d28, + 0x3d28, + 0x3d28, + 0x3d29, + 0x3d29, + 0x3d2a, + 0x3d2a, + 0x3d2a, + 0x3d2b, + 0x3d2b, + 0x3d2c, + 0x3d2c, + 0x3d2c, + 0x3d2d, + 0x3d2d, + 0x3d2e, + 0x3d2e, + 0x3d2e, + 0x3d2f, + 0x3d2f, + 0x3d30, + 0x3d30, + 0x3d31, + 0x3d31, + 0x3d31, + 0x3d32, + 0x3d32, + 0x3d33, + 0x3d33, + 0x3d33, + 0x3d34, + 0x3d34, + 0x3d35, + 0x3d35, + 0x3d35, + 0x3d36, + 0x3d36, + 0x3d37, + 0x3d37, + 0x3d38, + 0x3d38, + 0x3d38, + 0x3d39, + 0x3d39, + 0x3d3a, + 0x3d3a, + 0x3d3a, + 0x3d3b, + 0x3d3b, + 0x3d3c, + 0x3d3c, + 0x3d3c, + 0x3d3d, + 0x3d3d, + 0x3d3e, + 0x3d3e, + 0x3d3e, + 0x3d3f, + 0x3d3f, + 0x3d40, + 0x3d40, + 0x3d41, + 0x3d41, + 0x3d41, + 0x3d42, + 0x3d42, + 0x3d43, + 0x3d43, + 0x3d43, + 0x3d44, + 0x3d44, + 0x3d45, + 0x3d45, + 0x3d45, + 0x3d46, + 0x3d46, + 0x3d47, + 0x3d47, + 0x3d47, + 0x3d48, + 0x3d48, + 0x3d49, + 0x3d49, + 0x3d4a, + 0x3d4a, + 0x3d4a, + 0x3d4b, + 0x3d4b, + 0x3d4c, + 0x3d4c, + 0x3d4c, + 0x3d4d, + 0x3d4d, + 0x3d4e, + 0x3d4e, + 0x3d4e, + 0x3d4f, + 0x3d4f, + 0x3d50, + 0x3d50, + 0x3d50, + 0x3d51, + 0x3d51, + 0x3d52, + 0x3d52, + 0x3d53, + 0x3d53, + 0x3d53, + 0x3d54, + 0x3d54, + 0x3d55, + 0x3d55, + 0x3d55, + 0x3d56, + 0x3d56, + 0x3d57, + 0x3d57, + 0x3d57, + 0x3d58, + 0x3d58, + 0x3d59, + 0x3d59, + 0x3d59, + 0x3d5a, + 0x3d5a, + 0x3d5b, + 0x3d5b, + 0x3d5c, + 0x3d5c, + 0x3d5c, + 0x3d5d, + 0x3d5d, + 0x3d5e, + 0x3d5e, + 0x3d5e, + 0x3d5f, + 0x3d5f, + 0x3d60, + 0x3d60, + 0x3d60, + 0x3d61, + 0x3d61, + 0x3d62, + 0x3d62, + 0x3d63, + 0x3d63, + 0x3d63, + 0x3d64, + 0x3d64, + 0x3d65, + 0x3d65, + 0x3d65, + 0x3d66, + 0x3d66, + 0x3d67, + 0x3d67, + 0x3d67, + 0x3d68, + 0x3d68, + 0x3d69, + 0x3d69, + 0x3d69, + 0x3d6a, + 0x3d6a, + 0x3d6b, + 0x3d6b, + 0x3d6c, + 0x3d6c, + 0x3d6c, + 0x3d6d, + 0x3d6d, + 0x3d6e, + 0x3d6e, + 0x3d6e, + 0x3d6f, + 0x3d6f, + 0x3d70, + 0x3d70, + 0x3d70, + 0x3d71, + 0x3d71, + 0x3d72, + 0x3d72, + 0x3d72, + 0x3d73, + 0x3d73, + 0x3d74, + 0x3d74, + 0x3d75, + 0x3d75, + 0x3d75, + 0x3d76, + 0x3d76, + 0x3d77, + 0x3d77, + 0x3d77, + 0x3d78, + 0x3d78, + 0x3d79, + 0x3d79, + 0x3d79, + 0x3d7a, + 0x3d7a, + 0x3d7b, + 0x3d7b, + 0x3d7b, + 0x3d7c, + 0x3d7c, + 0x3d7d, + 0x3d7d, + 0x3d7e, + 0x3d7e, + 0x3d7e, + 0x3d7f, + 0x3d7f, + 0x3d80, + 0x3d80, + 0x3d80, + 0x3d80, + 0x3d81, + 0x3d81, + 0x3d81, + 0x3d81, + 0x3d81, + 0x3d82, + 0x3d82, + 0x3d82, + 0x3d82, + 0x3d82, + 0x3d83, + 0x3d83, + 0x3d83, + 0x3d83, + 0x3d83, + 0x3d84, + 0x3d84, + 0x3d84, + 0x3d84, + 0x3d85, + 0x3d85, + 0x3d85, + 0x3d85, + 0x3d85, + 0x3d86, + 0x3d86, + 0x3d86, + 0x3d86, + 0x3d86, + 0x3d87, + 0x3d87, + 0x3d87, + 0x3d87, + 0x3d87, + 0x3d88, + 0x3d88, + 0x3d88, + 0x3d88, + 0x3d88, + 0x3d89, + 0x3d89, + 0x3d89, + 0x3d89, + 0x3d89, + 0x3d8a, + 0x3d8a, + 0x3d8a, + 0x3d8a, + 0x3d8a, + 0x3d8b, + 0x3d8b, + 0x3d8b, + 0x3d8b, + 0x3d8b, + 0x3d8c, + 0x3d8c, + 0x3d8c, + 0x3d8c, + 0x3d8c, + 0x3d8d, + 0x3d8d, + 0x3d8d, + 0x3d8d, + 0x3d8e, + 0x3d8e, + 0x3d8e, + 0x3d8e, + 0x3d8e, + 0x3d8f, + 0x3d8f, + 0x3d8f, + 0x3d8f, + 0x3d8f, + 0x3d90, + 0x3d90, + 0x3d90, + 0x3d90, + 0x3d90, + 
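// The two tables above are raw bf16 bit patterns: 1 sign bit, 8 exponent
// bits, 7 mantissa bits, i.e. the top half of an IEEE-754 float. A minimal
// host-side decoder (a sketch; the tests themselves use convert_bf16_fp32
// from the shared test util, and this helper name is hypothetical):
static inline float bf16_bits_to_fp32(u16 bits) {
  u32 w = (u32)bits << 16;     // bf16 is fp32 with the low 16 bits dropped
  float f;
  memcpy(&f, &w, sizeof(f));   // type-pun via memcpy to avoid aliasing UB
  return f;
}
// e.g. the final input 0x3DE5 decodes to ~0.1118, and since atan(x) ~= x for
// small x the golden table closely tracks the input ramp (it ends just below
// at 0x3de4).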
0x3d91, + 0x3d91, + 0x3d91, + 0x3d91, + 0x3d91, + 0x3d92, + 0x3d92, + 0x3d92, + 0x3d92, + 0x3d92, + 0x3d93, + 0x3d93, + 0x3d93, + 0x3d93, + 0x3d93, + 0x3d94, + 0x3d94, + 0x3d94, + 0x3d94, + 0x3d94, + 0x3d95, + 0x3d95, + 0x3d95, + 0x3d95, + 0x3d96, + 0x3d96, + 0x3d96, + 0x3d96, + 0x3d96, + 0x3d97, + 0x3d97, + 0x3d97, + 0x3d97, + 0x3d97, + 0x3d98, + 0x3d98, + 0x3d98, + 0x3d98, + 0x3d98, + 0x3d99, + 0x3d99, + 0x3d99, + 0x3d99, + 0x3d99, + 0x3d9a, + 0x3d9a, + 0x3d9a, + 0x3d9a, + 0x3d9a, + 0x3d9b, + 0x3d9b, + 0x3d9b, + 0x3d9b, + 0x3d9b, + 0x3d9c, + 0x3d9c, + 0x3d9c, + 0x3d9c, + 0x3d9c, + 0x3d9d, + 0x3d9d, + 0x3d9d, + 0x3d9d, + 0x3d9d, + 0x3d9e, + 0x3d9e, + 0x3d9e, + 0x3d9e, + 0x3d9f, + 0x3d9f, + 0x3d9f, + 0x3d9f, + 0x3d9f, + 0x3da0, + 0x3da0, + 0x3da0, + 0x3da0, + 0x3da0, + 0x3da1, + 0x3da1, + 0x3da1, + 0x3da1, + 0x3da1, + 0x3da2, + 0x3da2, + 0x3da2, + 0x3da2, + 0x3da2, + 0x3da3, + 0x3da3, + 0x3da3, + 0x3da3, + 0x3da3, + 0x3da4, + 0x3da4, + 0x3da4, + 0x3da4, + 0x3da4, + 0x3da5, + 0x3da5, + 0x3da5, + 0x3da5, + 0x3da5, + 0x3da6, + 0x3da6, + 0x3da6, + 0x3da6, + 0x3da7, + 0x3da7, + 0x3da7, + 0x3da7, + 0x3da7, + 0x3da8, + 0x3da8, + 0x3da8, + 0x3da8, + 0x3da8, + 0x3da9, + 0x3da9, + 0x3da9, + 0x3da9, + 0x3da9, + 0x3daa, + 0x3daa, + 0x3daa, + 0x3daa, + 0x3daa, + 0x3dab, + 0x3dab, + 0x3dab, + 0x3dab, + 0x3dab, + 0x3dac, + 0x3dac, + 0x3dac, + 0x3dac, + 0x3dac, + 0x3dad, + 0x3dad, + 0x3dad, + 0x3dad, + 0x3dad, + 0x3dae, + 0x3dae, + 0x3dae, + 0x3dae, + 0x3dae, + 0x3daf, + 0x3daf, + 0x3daf, + 0x3daf, + 0x3db0, + 0x3db0, + 0x3db0, + 0x3db0, + 0x3db0, + 0x3db1, + 0x3db1, + 0x3db1, + 0x3db1, + 0x3db1, + 0x3db2, + 0x3db2, + 0x3db2, + 0x3db2, + 0x3db2, + 0x3db3, + 0x3db3, + 0x3db3, + 0x3db3, + 0x3db3, + 0x3db4, + 0x3db4, + 0x3db4, + 0x3db4, + 0x3db4, + 0x3db5, + 0x3db5, + 0x3db5, + 0x3db5, + 0x3db5, + 0x3db6, + 0x3db6, + 0x3db6, + 0x3db6, + 0x3db6, + 0x3db7, + 0x3db7, + 0x3db7, + 0x3db7, + 0x3db8, + 0x3db8, + 0x3db8, + 0x3db8, + 0x3db8, + 0x3db9, + 0x3db9, + 0x3db9, + 0x3db9, + 0x3db9, + 0x3dba, + 0x3dba, + 0x3dba, + 0x3dba, + 0x3dba, + 0x3dbb, + 0x3dbb, + 0x3dbb, + 0x3dbb, + 0x3dbb, + 0x3dbc, + 0x3dbc, + 0x3dbc, + 0x3dbc, + 0x3dbc, + 0x3dbd, + 0x3dbd, + 0x3dbd, + 0x3dbd, + 0x3dbd, + 0x3dbe, + 0x3dbe, + 0x3dbe, + 0x3dbe, + 0x3dbe, + 0x3dbf, + 0x3dbf, + 0x3dbf, + 0x3dbf, + 0x3dbf, + 0x3dc0, + 0x3dc0, + 0x3dc0, + 0x3dc0, + 0x3dc0, + 0x3dc0, + 0x3dc0, + 0x3dc0, + 0x3dc0, + 0x3dc1, + 0x3dc1, + 0x3dc1, + 0x3dc1, + 0x3dc1, + 0x3dc2, + 0x3dc2, + 0x3dc2, + 0x3dc2, + 0x3dc2, + 0x3dc3, + 0x3dc3, + 0x3dc3, + 0x3dc3, + 0x3dc3, + 0x3dc4, + 0x3dc4, + 0x3dc4, + 0x3dc4, + 0x3dc4, + 0x3dc5, + 0x3dc5, + 0x3dc5, + 0x3dc5, + 0x3dc5, + 0x3dc6, + 0x3dc6, + 0x3dc6, + 0x3dc6, + 0x3dc6, + 0x3dc7, + 0x3dc7, + 0x3dc7, + 0x3dc7, + 0x3dc7, + 0x3dc8, + 0x3dc8, + 0x3dc8, + 0x3dc8, + 0x3dc9, + 0x3dc9, + 0x3dc9, + 0x3dc9, + 0x3dc9, + 0x3dca, + 0x3dca, + 0x3dca, + 0x3dca, + 0x3dca, + 0x3dcb, + 0x3dcb, + 0x3dcb, + 0x3dcb, + 0x3dcb, + 0x3dcc, + 0x3dcd, + 0x3dce, + 0x3dcf, + 0x3dd0, + 0x3dd1, + 0x3dd2, + 0x3dd3, + 0x3dd4, + 0x3dd5, + 0x3dd6, + 0x3dd7, + 0x3dd8, + 0x3dd9, + 0x3dda, + 0x3ddb, + 0x3ddc, + 0x3ddd, + 0x3dde, + 0x3ddf, + 0x3de0, + 0x3de1, + 0x3de2, + 0x3de3, + 0x3de4, +}; + +// &1 > 2\n", + range_start, range_end, + INFP32FILE, OUTBF16FILE); + + int r; + r = system(command); + printf ("command is %s, return %d\n", command, r); + + pFile = fopen(OUTBF16FILE, "rb"); + if (!pFile) { + fprintf(stderr, "open golden %s fail\n", OUTBF16FILE); + exit(-1); + } + + size_t file_length; + file_length = fread(ofmap, sizeof(u16), 
tl_shape_size(&ifmap_shape), pFile); + printf("read from golden, file size %" PRIu64 "\n", file_length); + fclose(pFile); +#else + for (u32 i = 0; i < tl_shape_size(&ifmap_shape); i++) { + float f = convert_bf16_fp32(ifmap[i]); + double v = _gen_atan(f); + ofmap[i] = convert_fp32_bf16(v); + + if (mode == PRE_DATA_COMPARE_FIX) { + ofmap[i] = golden_bf16[i]; + } + else if (mode == DATA_COMPARE_U8) { + ofmap[i] = (u8) convert_bf16_s8(ofmap[i]); + } + } +#endif +} + +static void gen_y0(u16 *table_data_y0, u64 table_size, + int range_start, int range_end) { + + float scale = table_hw / (1.0 * abs(range_start - range_end)); + // half) { + x0 = lut[i]; + x1 = lut[i-1]; + delta = -1.0; + } + double s = (x1 - x0) / delta; // x1 already scale up + table_slope[i] = convert_fp32_bf16((float)s); +#ifdef DBG + printf ("slope table [%u] = (bf16 %f double %.8lf float %f), 0x%x, %.8lf - %.8lf(%.8lf)\n", + i, convert_bf16_fp32(table_slope[i]), s, (float)s, table_slope[i], x1, x0, x1-x0); +#endif + } + +#if 0 //def DBG + for (u32 i = 0; i < 2 * half; i++) { + printf("slope [%u] is %lf, 0x%x\n", i, convert_bf16_fp32(table_slope[i]), + table_slope[i]); + } +#endif /* ifdef DBG */ + + // duplicate channel #1 to #31 + //TODO: tensor copy + for (u64 i = 1; i < channel; i++) { + memcpy(&table_slope[table_hw * i], &table_slope[0], sizeof(u16) * table_hw); + } +} + +static bool verify(u16 *ofmap_data, u16 *ref_data, u16* ifmap, u64 ifmap_size, float epsilon) { + u64 size = ifmap_size; + + for (u64 i = 0; i < size; i++) { + bool is_close; + u16 ref = ref_data[i]; + u16 ofmap_data_bf16; + float ref_f; + float ofmap_data_f; + u32 shift; + + if (mode == DATA_COMPARE_U8) { + shift = (i%2)*8; + ofmap_data_bf16 = (u16)ofmap_data[i/2]; + ofmap_data_f = (float)(ofmap_data[i/2] >> shift); + ref_f = (float)(ref); + + is_close = ((u8)(ofmap_data[i/2] >> shift)) == (u8)ref; + + //printf("[%" PRIu64 "] of is %x ref is %x\n", i, (u8)(ofmap_data[i/2] >> shift), (u8)ref); + } + else { + ref_f = convert_bf16_fp32(ref); + ofmap_data_f = convert_bf16_fp32(ofmap_data[i]); + ofmap_data_bf16 = ofmap_data[i]; + + if (mode == PRE_DATA_COMPARE_FIX) { + is_close = ofmap_data[i] == ref; + } + else { + is_close = almost_equal(ref_f, ofmap_data_f, 1); + is_close = fabs(ref_f-ofmap_data_f) < epsilon; + } + } + +#if 0 + if (i == 0) { + fprintf(stderr, + "input, ofmap, ref, diff, diff / ref_f\n"); + } + + fprintf(stderr, + "%.16f, %f, %lf, %lf, %lf\n", + convert_bf16_fp32(ifmap[i]), + ofmap_data_f, ref_f, fabs(ref_f - ofmap_data_f), fabs(ref_f - ofmap_data_f) / ref_f); + //if (ofmap_data[i] != ref && fabs(ref_f-ofmap_data_f) > 0.07) + //if (ofmap_data[i] != ref && AlmostEqual2sComplement(ref_f, ofmap_data_f, 1)) + //if (ofmap_data[i] != ref && AlmostEqual(ref_f, ofmap_data_f, FLT_EPSILON)) +#endif + if (!is_close) { + float input = convert_bf16_fp32(ifmap[i]); + fprintf(stderr, + "comparing failed at ofmap_data[%" PRIu64 "](input:%f)\n" + "\tgot %x, exp %x, fp32: got %f exp %f, atan(%f) = %f\n", + i, input, + ofmap_data_bf16, ref, ofmap_data_f, ref_f, + input, _gen_atan(input)); + exit(-1); + } + } + + return true; +} + +/* + * NOTICE: it could occupy 2 lookup table size which shape is <1,32,32,8> with bf16 data type + * + * \tl_buf tmp buffer, the shape MUST be same type/shape with \tl_ifmap + * \tl_y0 tmp buffer for lut used, shape should be <1,32,32,8> + * \tl_slope tmp buffer for lut used, shape should be <1,32,32,8> + * \tl_ofmap_u8 result as u8 type, NULL means use bf16 result + * \tl_ofmap_bf16 result as bf16, MUST given for tmp buffer used 
+ * \range_start, \range_end specify the data range, default range is -8 ~ +8 + */ +static int bf16_emit(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, + tl_t* tl_ifmap, + tl_t* tl_buf, + tl_t* tl_y0_buf, + tl_t* tl_slope_buf, + tl_t* OUT tl_ofmap_bf16, + tl_t* OUT tl_ofmap_u8, + int range_start, int range_end + ) { + + assert(tl_y0_buf->shape.n == tl_slope_buf->shape.n); + assert(tl_y0_buf->shape.c == tl_slope_buf->shape.c); + assert(tl_y0_buf->shape.h == tl_slope_buf->shape.h); + assert(tl_y0_buf->shape.w == tl_slope_buf->shape.w); + + tl_shape_t tl_shape_int8 = {1, channel, tl_ofmap_bf16->shape.h * tl_ofmap_bf16->shape.w, 1}; + + fmt_t fmt = FMT_BF16; + + int data_type_size = bytesize_of_fmt(fmt); + u64 table_size = tl_shape_size(&tl_y0_buf->shape); + u64 table_bytesize = table_size * data_type_size; + + u16 *table_data_y0 = (u16 *)xmalloc(table_bytesize); + gen_y0(table_data_y0, table_size, range_start, range_end); + + u16 *table_data_slope = (u16 *)xmalloc(table_bytesize); + gen_slope(table_data_y0, table_data_slope, table_size, range_start, range_end); + + float scale = table_hw / (1.0 * abs(range_start - range_end)); + + // prepare load of the tables from system to local memory + bmk1880v2_tdma_tg2l_tensor_copy_param_t copy_p2, copy_p3; + memset(&copy_p2, 0, sizeof(copy_p2)); + memset(&copy_p3, 0, sizeof(copy_p3)); + + prepare_put_bf16_tensor_g2l(ctx, bmk, tl_y0_buf, table_data_y0, fmt, &copy_p2); + prepare_put_bf16_tensor_g2l(ctx, bmk, tl_slope_buf, table_data_slope, fmt, &copy_p3); + + launch_put_bf16_tensor_g2l(ctx, bmk, copy_p2.src, &copy_p2); // y0 table + launch_put_bf16_tensor_g2l(ctx, bmk, copy_p3.src, &copy_p3); // slope table + + bmk1880v2_tdma_l2l_tensor_copy_param_t p10; + memset(&p10, 0, sizeof(p10)); + + // scale the input to remap it from (-x~x) onto the (-127~127) index range; clobbers tl_ifmap + bmk1880v2_tiu_element_wise_mul_param_t p1; + memset(&p1, 0, sizeof(p1)); + p1.res_high = NULL; + p1.res_low = tl_ifmap; + p1.a = tl_ifmap; + p1.b_is_const = 1; + p1.b_const.val = convert_fp32_bf16(scale); + p1.rshift_bits = 0; + p1.relu_enable = 0; + bmk1880v2_tiu_element_wise_mul(bmk, &p1); + + // int8: strip the fraction by copying into an FMT_I8 view with doubled h stride + memset(&p10, 0x00, sizeof(bmk1880v2_tdma_l2l_tensor_copy_param_t)); + bmk1880v2_tensor_lmem_t dst; + memcpy(&dst, tl_ofmap_bf16, sizeof(bmk1880v2_tensor_lmem_t)); + dst.fmt = FMT_I8; + dst.shape = tl_shape_int8; + dst.stride = bmk1880v2_tensor_lmem_default_stride(bmk, dst.shape, dst.fmt, /*eu_align*/ 1); + dst.stride.h = dst.stride.h * 2; + dst.int8_rnd_mode = 1; + p10.dst = &dst; + p10.src = tl_ifmap; + bmk1880v2_tdma_l2l_bf16_tensor_copy(bmk, &p10); + test_submit(ctx); + dst.int8_rnd_mode = 0; // reset + + // back to a bf16 view + dst.fmt = fmt; + dst.shape = tl_buf->shape; + dst.stride = tl_buf->stride; + + // dist(range_start, range_end); + for (u64 i = 0; i < ifmap_size; i++) { + // input range is -8 ~ +8 + float input = ((int)i % (range_end-2)) * (((int)i % 2) ? 1 : -1) + 0.03 + (i % table_hw) * 0.002; + //float input = ((int)i % 10) * (((int)i % 2) ? 
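// Host-side model of the sequence bf16_emit drives on the TPU (a sketch;
// lut_atan_model and the flat y0/slope arrays are hypothetical stand-ins for
// the <1,32,32,8> local tensors). The scaled input splits into an int8 table
// index -- that is what the strided FMT_I8 copy above extracts -- plus a
// fractional remainder, and the result is the linear interpolation
// y0[idx] + slope[idx] * frac.
static float lut_atan_model(float x, const float *y0, const float *slope,
                            float scale) {
  float scaled = x * scale;          // remap -8~+8 onto the -127~127 index range
  int idx = (int)scaled;             // integer part, as the int8 copy truncates
  float frac = scaled - (float)idx;  // fraction left behind in the bf16 lanes
  return y0[idx & 0xff] + slope[idx & 0xff] * frac;  // negative idx wraps to the upper table half
}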
1 : -1) + 0.03 + (i % table_hw) * 0.002; + //float input = dist(e2); + input_data[i] = convert_fp32_bf16(input); + } + } + +#ifdef DBG + for (u64 i = 0; i < ifmap_size; i++) { + printf("source if[%" PRIu64 "] bf16 %f 0x%x, log2f is %f\n", i, convert_bf16_fp32(input_data[i]), input_data[i], floor(log2((convert_bf16_fp32(input_data[i]))))); + } +#endif /* ifdef DBG */ + +} + +static void test_tl_int8_lut_bf16(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk) +{ + // TODO: check more shape / align + u32 input_n = 1; + u32 input_c = channel; + u32 input_h = 16; + u32 input_w = 16; + float epsilon = 0.01; + int range_start = -8; + int range_end = 8; + + if (mode == PRE_DATA_COMPARE_FIX) { + input_h = 4; + input_w = 8; + } + + tl_shape_t ifmap_shape= {input_n, input_c, input_h, input_w}; + tl_shape_t ofmap_shape = ifmap_shape; + tl_shape_t table_shape = {input_n, channel, table_h, table_w}; // hard code for hw, hw:32x8 + + u64 ifmap_size = tl_shape_size(&ifmap_shape); + u64 ofmap_size = tl_shape_size(&ofmap_shape); + + fmt_t fmt = FMT_BF16; + + tl_t *tl_ifmap = alloc_tl(bmk,ifmap_shape, fmt, /*align*/1); + tl_t *tl_buf = alloc_tl(bmk,ofmap_shape, fmt, /*align*/1); + tl_t *tl_ofmap_bf16 = alloc_tl(bmk,ofmap_shape, fmt, /*align*/1); + tl_t *tl_table_answer_y0 = alloc_tl(bmk, table_shape, fmt, /*align*/1); + tl_t *tl_table_answer_slope = alloc_tl(bmk, table_shape, fmt, /*align*/1); + tl_t *tl_ofmap_u8 = nullptr; + + + int data_type_size = bytesize_of_fmt(fmt); + u64 ifmap_bytesize = ifmap_size * data_type_size; + u64 ofmap_bytesize = ofmap_size * data_type_size; + + u16 *input_data = (u16 *)xmalloc(ifmap_bytesize); + u16 *ref_data = (u16 *)xmalloc(ofmap_bytesize); + + gen_input(input_data, ifmap_size, mode, range_start, range_end); + tl_lut_ref(ref_data, input_data, ifmap_shape); + + tl_t *out = tl_ofmap_bf16; + + if (mode == DATA_COMPARE_U8) { + tl_ofmap_u8 = + alloc_tl(bmk,ofmap_shape, FMT_U8, /*align*/1); + out = tl_ofmap_u8; + } + + // fmt); + verify(ofmap_data, ref_data, input_data, ifmap_size, epsilon); + + if (tl_ofmap_u8) { + free_tl(bmk, tl_ofmap_u8); + } + + free_tl(bmk, tl_table_answer_slope); + free_tl(bmk, tl_table_answer_y0); + free_tl(bmk, tl_ofmap_bf16); + free_tl(bmk, tl_buf); + free_tl(bmk, tl_ifmap); + + free(input_data); + free(ref_data); + free(ofmap_data); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + int round_mode; + + round_mode = set_store_feround(); + + test_init(&ctx, &bmk); + + //for (int i = PRE_DATA_COMPARE_FIX; i < TEST_MODE_MAX; i++) + for (int i = PRE_DATA_COMPARE_FIX; i < DATA_COMPARE_ACCURACY; i++) + //for (int i = DATA_COMPARE_ACCURACY; i < DATA_COMPARE_U8; i++) + { + mode = static_cast(i); + printf ("test mode %d...\n", mode); + test_tl_int8_lut_bf16(&ctx, bmk); + } + + test_exit(&ctx); + restore_feround(round_mode); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_atan2.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_atan2.cpp new file mode 100644 index 000000000..195632f9e --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_atan2.cpp @@ -0,0 +1,2975 @@ +/** + */ +#include "../1880v2_test_util.h" +#define OUT +#define IN +#include +#include +#include +#include +#include +#include +//#include +//#define DBG +//#define LOCAL_MEM_SIZE (32*1024) //u8 + TEST_MODE_MAX, +}; + +static TEST_MODE mode; + +static u16 test_pattern[] = { + 0x0000, + 0x38D2, + 0x3952, + 0x399D, + 0x39D2, + 0x3A03, + 0x3A1D, + 0x3A38, + 0x3A52, + 0x3A6C, + 0x3A83, + 0x3A90, + 0x3A9D, + 0x3AAA, + 0x3AB8, + 0x3AC5, + 0x3AD2, + 0x3ADF, + 0x3AEC, + 
0x3AF9, + 0x3B03, + 0x3B0A, + 0x3B10, + 0x3B17, + 0x3B1D, + 0x3B24, + 0x3B2A, + 0x3B31, + 0x3B38, + 0x3B3E, + 0x3B45, + 0x3B4B, + 0x3B52, + 0x3B58, + 0x3B5F, + 0x3B65, + 0x3B6C, + 0x3B72, + 0x3B79, + 0x3B80, + 0x3B83, + 0x3B86, + 0x3B8A, + 0x3B8D, + 0x3B90, + 0x3B93, + 0x3B97, + 0x3B9A, + 0x3B9D, + 0x3BA1, + 0x3BA4, + 0x3BA7, + 0x3BAA, + 0x3BAE, + 0x3BB1, + 0x3BB4, + 0x3BB8, + 0x3BBB, + 0x3BBE, + 0x3BC1, + 0x3BC5, + 0x3BC8, + 0x3BCB, + 0x3BCE, + 0x3BD2, + 0x3BD5, + 0x3BD8, + 0x3BDC, + 0x3BDF, + 0x3BE2, + 0x3BE5, + 0x3BE9, + 0x3BEC, + 0x3BEF, + 0x3BF2, + 0x3BF6, + 0x3BF9, + 0x3BFC, + 0x3C00, + 0x3C01, + 0x3C03, + 0x3C05, + 0x3C06, + 0x3C08, + 0x3C0A, + 0x3C0B, + 0x3C0D, + 0x3C0F, + 0x3C10, + 0x3C12, + 0x3C13, + 0x3C15, + 0x3C17, + 0x3C18, + 0x3C1A, + 0x3C1C, + 0x3C1D, + 0x3C1F, + 0x3C21, + 0x3C22, + 0x3C24, + 0x3C25, + 0x3C27, + 0x3C29, + 0x3C2A, + 0x3C2C, + 0x3C2E, + 0x3C2F, + 0x3C31, + 0x3C33, + 0x3C34, + 0x3C36, + 0x3C38, + 0x3C39, + 0x3C3B, + 0x3C3C, + 0x3C3E, + 0x3C40, + 0x3C41, + 0x3C43, + 0x3C45, + 0x3C46, + 0x3C48, + 0x3C4A, + 0x3C4B, + 0x3C4D, + 0x3C4E, + 0x3C50, + 0x3C52, + 0x3C53, + 0x3C55, + 0x3C57, + 0x3C58, + 0x3C5A, + 0x3C5C, + 0x3C5D, + 0x3C5F, + 0x3C60, + 0x3C62, + 0x3C64, + 0x3C65, + 0x3C67, + 0x3C69, + 0x3C6A, + 0x3C6C, + 0x3C6E, + 0x3C6F, + 0x3C71, + 0x3C72, + 0x3C74, + 0x3C76, + 0x3C77, + 0x3C79, + 0x3C7B, + 0x3C7C, + 0x3C7E, + 0x3C80, + 0x3C81, + 0x3C81, + 0x3C82, + 0x3C83, + 0x3C84, + 0x3C85, + 0x3C86, + 0x3C86, + 0x3C87, + 0x3C88, + 0x3C89, + 0x3C8A, + 0x3C8A, + 0x3C8B, + 0x3C8C, + 0x3C8D, + 0x3C8E, + 0x3C8F, + 0x3C8F, + 0x3C90, + 0x3C91, + 0x3C92, + 0x3C93, + 0x3C93, + 0x3C94, + 0x3C95, + 0x3C96, + 0x3C97, + 0x3C98, + 0x3C98, + 0x3C99, + 0x3C9A, + 0x3C9B, + 0x3C9C, + 0x3C9C, + 0x3C9D, + 0x3C9E, + 0x3C9F, + 0x3CA0, + 0x3CA1, + 0x3CA1, + 0x3CA2, + 0x3CA3, + 0x3CA4, + 0x3CA5, + 0x3CA5, + 0x3CA6, + 0x3CA7, + 0x3CA8, + 0x3CA9, + 0x3CAA, + 0x3CAA, + 0x3CAB, + 0x3CAC, + 0x3CAD, + 0x3CAE, + 0x3CAE, + 0x3CAF, + 0x3CB0, + 0x3CB1, + 0x3CB2, + 0x3CB3, + 0x3CB3, + 0x3CB4, + 0x3CB5, + 0x3CB6, + 0x3CB7, + 0x3CB8, + 0x3CB8, + 0x3CB9, + 0x3CBA, + 0x3CBB, + 0x3CBC, + 0x3CBC, + 0x3CBD, + 0x3CBE, + 0x3CBF, + 0x3CC0, + 0x3CC1, + 0x3CC1, + 0x3CC2, + 0x3CC3, + 0x3CC4, + 0x3CC5, + 0x3CC5, + 0x3CC6, + 0x3CC7, + 0x3CC8, + 0x3CC9, + 0x3CCA, + 0x3CCA, + 0x3CCB, + 0x3CCC, + 0x3CCD, + 0x3CCE, + 0x3CCE, + 0x3CCF, + 0x3CD0, + 0x3CD1, + 0x3CD2, + 0x3CD3, + 0x3CD3, + 0x3CD4, + 0x3CD5, + 0x3CD6, + 0x3CD7, + 0x3CD7, + 0x3CD8, + 0x3CD9, + 0x3CDA, + 0x3CDB, + 0x3CDC, + 0x3CDC, + 0x3CDD, + 0x3CDE, + 0x3CDF, + 0x3CE0, + 0x3CE0, + 0x3CE1, + 0x3CE2, + 0x3CE3, + 0x3CE4, + 0x3CE5, + 0x3CE5, + 0x3CE6, + 0x3CE7, + 0x3CE8, + 0x3CE9, + 0x3CE9, + 0x3CEA, + 0x3CEB, + 0x3CEC, + 0x3CED, + 0x3CEE, + 0x3CEE, + 0x3CEF, + 0x3CF0, + 0x3CF1, + 0x3CF2, + 0x3CF2, + 0x3CF3, + 0x3CF4, + 0x3CF5, + 0x3CF6, + 0x3CF7, + 0x3CF7, + 0x3CF8, + 0x3CF9, + 0x3CFA, + 0x3CFB, + 0x3CFB, + 0x3CFC, + 0x3CFD, + 0x3CFE, + 0x3CFF, + 0x3D00, + 0x3D00, + 0x3D01, + 0x3D01, + 0x3D01, + 0x3D02, + 0x3D02, + 0x3D03, + 0x3D03, + 0x3D03, + 0x3D04, + 0x3D04, + 0x3D05, + 0x3D05, + 0x3D06, + 0x3D06, + 0x3D06, + 0x3D07, + 0x3D07, + 0x3D08, + 0x3D08, + 0x3D08, + 0x3D09, + 0x3D09, + 0x3D0A, + 0x3D0A, + 0x3D0A, + 0x3D0B, + 0x3D0B, + 0x3D0C, + 0x3D0C, + 0x3D0C, + 0x3D0D, + 0x3D0D, + 0x3D0E, + 0x3D0E, + 0x3D0F, + 0x3D0F, + 0x3D0F, + 0x3D10, + 0x3D10, + 0x3D11, + 0x3D11, + 0x3D11, + 0x3D12, + 0x3D12, + 0x3D13, + 0x3D13, + 0x3D13, + 0x3D14, + 0x3D14, + 0x3D15, + 0x3D15, + 0x3D16, + 0x3D16, + 0x3D16, + 0x3D17, + 0x3D17, + 0x3D18, + 0x3D18, + 0x3D18, + 0x3D19, + 
0x3D19, + 0x3D1A, + 0x3D1A, + 0x3D1A, + 0x3D1B, + 0x3D1B, + 0x3D1C, + 0x3D1C, + 0x3D1C, + 0x3D1D, + 0x3D1D, + 0x3D1E, + 0x3D1E, + 0x3D1F, + 0x3D1F, + 0x3D1F, + 0x3D20, + 0x3D20, + 0x3D21, + 0x3D21, + 0x3D21, + 0x3D22, + 0x3D22, + 0x3D23, + 0x3D23, + 0x3D23, + 0x3D24, + 0x3D24, + 0x3D25, + 0x3D25, + 0x3D25, + 0x3D26, + 0x3D26, + 0x3D27, + 0x3D27, + 0x3D28, + 0x3D28, + 0x3D28, + 0x3D29, + 0x3D29, + 0x3D2A, + 0x3D2A, + 0x3D2A, + 0x3D2B, + 0x3D2B, + 0x3D2C, + 0x3D2C, + 0x3D2C, + 0x3D2D, + 0x3D2D, + 0x3D2E, + 0x3D2E, + 0x3D2E, + 0x3D2F, + 0x3D2F, + 0x3D30, + 0x3D30, + 0x3D31, + 0x3D31, + 0x3D31, + 0x3D32, + 0x3D32, + 0x3D33, + 0x3D33, + 0x3D33, + 0x3D34, + 0x3D34, + 0x3D35, + 0x3D35, + 0x3D35, + 0x3D36, + 0x3D36, + 0x3D37, + 0x3D37, + 0x3D38, + 0x3D38, + 0x3D38, + 0x3D39, + 0x3D39, + 0x3D3A, + 0x3D3A, + 0x3D3A, + 0x3D3B, + 0x3D3B, + 0x3D3C, + 0x3D3C, + 0x3D3C, + 0x3D3D, + 0x3D3D, + 0x3D3E, + 0x3D3E, + 0x3D3E, + 0x3D3F, + 0x3D3F, + 0x3D40, + 0x3D40, + 0x3D41, + 0x3D41, + 0x3D41, + 0x3D42, + 0x3D42, + 0x3D43, + 0x3D43, + 0x3D43, + 0x3D44, + 0x3D44, + 0x3D45, + 0x3D45, + 0x3D45, + 0x3D46, + 0x3D46, + 0x3D47, + 0x3D47, + 0x3D47, + 0x3D48, + 0x3D48, + 0x3D49, + 0x3D49, + 0x3D4A, + 0x3D4A, + 0x3D4A, + 0x3D4B, + 0x3D4B, + 0x3D4C, + 0x3D4C, + 0x3D4C, + 0x3D4D, + 0x3D4D, + 0x3D4E, + 0x3D4E, + 0x3D4E, + 0x3D4F, + 0x3D4F, + 0x3D50, + 0x3D50, + 0x3D50, + 0x3D51, + 0x3D51, + 0x3D52, + 0x3D52, + 0x3D53, + 0x3D53, + 0x3D53, + 0x3D54, + 0x3D54, + 0x3D55, + 0x3D55, + 0x3D55, + 0x3D56, + 0x3D56, + 0x3D57, + 0x3D57, + 0x3D57, + 0x3D58, + 0x3D58, + 0x3D59, + 0x3D59, + 0x3D59, + 0x3D5A, + 0x3D5A, + 0x3D5B, + 0x3D5B, + 0x3D5C, + 0x3D5C, + 0x3D5C, + 0x3D5D, + 0x3D5D, + 0x3D5E, + 0x3D5E, + 0x3D5E, + 0x3D5F, + 0x3D5F, + 0x3D60, + 0x3D60, + 0x3D60, + 0x3D61, + 0x3D61, + 0x3D62, + 0x3D62, + 0x3D63, + 0x3D63, + 0x3D63, + 0x3D64, + 0x3D64, + 0x3D65, + 0x3D65, + 0x3D65, + 0x3D66, + 0x3D66, + 0x3D67, + 0x3D67, + 0x3D67, + 0x3D68, + 0x3D68, + 0x3D69, + 0x3D69, + 0x3D69, + 0x3D6A, + 0x3D6A, + 0x3D6B, + 0x3D6B, + 0x3D6C, + 0x3D6C, + 0x3D6C, + 0x3D6D, + 0x3D6D, + 0x3D6E, + 0x3D6E, + 0x3D6E, + 0x3D6F, + 0x3D6F, + 0x3D70, + 0x3D70, + 0x3D70, + 0x3D71, + 0x3D71, + 0x3D72, + 0x3D72, + 0x3D72, + 0x3D73, + 0x3D73, + 0x3D74, + 0x3D74, + 0x3D75, + 0x3D75, + 0x3D75, + 0x3D76, + 0x3D76, + 0x3D77, + 0x3D77, + 0x3D77, + 0x3D78, + 0x3D78, + 0x3D79, + 0x3D79, + 0x3D79, + 0x3D7A, + 0x3D7A, + 0x3D7B, + 0x3D7B, + 0x3D7B, + 0x3D7C, + 0x3D7C, + 0x3D7D, + 0x3D7D, + 0x3D7E, + 0x3D7E, + 0x3D7E, + 0x3D7F, + 0x3D7F, + 0x3D80, + 0x3D80, + 0x3D80, + 0x3D80, + 0x3D81, + 0x3D81, + 0x3D81, + 0x3D81, + 0x3D81, + 0x3D82, + 0x3D82, + 0x3D82, + 0x3D82, + 0x3D82, + 0x3D83, + 0x3D83, + 0x3D83, + 0x3D83, + 0x3D83, + 0x3D84, + 0x3D84, + 0x3D84, + 0x3D84, + 0x3D85, + 0x3D85, + 0x3D85, + 0x3D85, + 0x3D85, + 0x3D86, + 0x3D86, + 0x3D86, + 0x3D86, + 0x3D86, + 0x3D87, + 0x3D87, + 0x3D87, + 0x3D87, + 0x3D87, + 0x3D88, + 0x3D88, + 0x3D88, + 0x3D88, + 0x3D88, + 0x3D89, + 0x3D89, + 0x3D89, + 0x3D89, + 0x3D89, + 0x3D8A, + 0x3D8A, + 0x3D8A, + 0x3D8A, + 0x3D8A, + 0x3D8B, + 0x3D8B, + 0x3D8B, + 0x3D8B, + 0x3D8B, + 0x3D8C, + 0x3D8C, + 0x3D8C, + 0x3D8C, + 0x3D8C, + 0x3D8D, + 0x3D8D, + 0x3D8D, + 0x3D8D, + 0x3D8E, + 0x3D8E, + 0x3D8E, + 0x3D8E, + 0x3D8E, + 0x3D8F, + 0x3D8F, + 0x3D8F, + 0x3D8F, + 0x3D8F, + 0x3D90, + 0x3D90, + 0x3D90, + 0x3D90, + 0x3D90, + 0x3D91, + 0x3D91, + 0x3D91, + 0x3D91, + 0x3D91, + 0x3D92, + 0x3D92, + 0x3D92, + 0x3D92, + 0x3D92, + 0x3D93, + 0x3D93, + 0x3D93, + 0x3D93, + 0x3D93, + 0x3D94, + 0x3D94, + 0x3D94, + 0x3D94, + 0x3D94, + 0x3D95, + 0x3D95, + 0x3D95, + 
0x3D95, + 0x3D96, + 0x3D96, + 0x3D96, + 0x3D96, + 0x3D96, + 0x3D97, + 0x3D97, + 0x3D97, + 0x3D97, + 0x3D97, + 0x3D98, + 0x3D98, + 0x3D98, + 0x3D98, + 0x3D98, + 0x3D99, + 0x3D99, + 0x3D99, + 0x3D99, + 0x3D99, + 0x3D9A, + 0x3D9A, + 0x3D9A, + 0x3D9A, + 0x3D9A, + 0x3D9B, + 0x3D9B, + 0x3D9B, + 0x3D9B, + 0x3D9B, + 0x3D9C, + 0x3D9C, + 0x3D9C, + 0x3D9C, + 0x3D9C, + 0x3D9D, + 0x3D9D, + 0x3D9D, + 0x3D9D, + 0x3D9D, + 0x3D9E, + 0x3D9E, + 0x3D9E, + 0x3D9E, + 0x3D9F, + 0x3D9F, + 0x3D9F, + 0x3D9F, + 0x3D9F, + 0x3DA0, + 0x3DA0, + 0x3DA0, + 0x3DA0, + 0x3DA0, + 0x3DA1, + 0x3DA1, + 0x3DA1, + 0x3DA1, + 0x3DA1, + 0x3DA2, + 0x3DA2, + 0x3DA2, + 0x3DA2, + 0x3DA2, + 0x3DA3, + 0x3DA3, + 0x3DA3, + 0x3DA3, + 0x3DA3, + 0x3DA4, + 0x3DA4, + 0x3DA4, + 0x3DA4, + 0x3DA4, + 0x3DA5, + 0x3DA5, + 0x3DA5, + 0x3DA5, + 0x3DA5, + 0x3DA6, + 0x3DA6, + 0x3DA6, + 0x3DA6, + 0x3DA7, + 0x3DA7, + 0x3DA7, + 0x3DA7, + 0x3DA7, + 0x3DA8, + 0x3DA8, + 0x3DA8, + 0x3DA8, + 0x3DA8, + 0x3DA9, + 0x3DA9, + 0x3DA9, + 0x3DA9, + 0x3DA9, + 0x3DAA, + 0x3DAA, + 0x3DAA, + 0x3DAA, + 0x3DAA, + 0x3DAB, + 0x3DAB, + 0x3DAB, + 0x3DAB, + 0x3DAB, + 0x3DAC, + 0x3DAC, + 0x3DAC, + 0x3DAC, + 0x3DAC, + 0x3DAD, + 0x3DAD, + 0x3DAD, + 0x3DAD, + 0x3DAD, + 0x3DAE, + 0x3DAE, + 0x3DAE, + 0x3DAE, + 0x3DAE, + 0x3DAF, + 0x3DAF, + 0x3DAF, + 0x3DAF, + 0x3DB0, + 0x3DB0, + 0x3DB0, + 0x3DB0, + 0x3DB0, + 0x3DB1, + 0x3DB1, + 0x3DB1, + 0x3DB1, + 0x3DB1, + 0x3DB2, + 0x3DB2, + 0x3DB2, + 0x3DB2, + 0x3DB2, + 0x3DB3, + 0x3DB3, + 0x3DB3, + 0x3DB3, + 0x3DB3, + 0x3DB4, + 0x3DB4, + 0x3DB4, + 0x3DB4, + 0x3DB4, + 0x3DB5, + 0x3DB5, + 0x3DB5, + 0x3DB5, + 0x3DB5, + 0x3DB6, + 0x3DB6, + 0x3DB6, + 0x3DB6, + 0x3DB6, + 0x3DB7, + 0x3DB7, + 0x3DB7, + 0x3DB7, + 0x3DB8, + 0x3DB8, + 0x3DB8, + 0x3DB8, + 0x3DB8, + 0x3DB9, + 0x3DB9, + 0x3DB9, + 0x3DB9, + 0x3DB9, + 0x3DBA, + 0x3DBA, + 0x3DBA, + 0x3DBA, + 0x3DBA, + 0x3DBB, + 0x3DBB, + 0x3DBB, + 0x3DBB, + 0x3DBB, + 0x3DBC, + 0x3DBC, + 0x3DBC, + 0x3DBC, + 0x3DBC, + 0x3DBD, + 0x3DBD, + 0x3DBD, + 0x3DBD, + 0x3DBD, + 0x3DBE, + 0x3DBE, + 0x3DBE, + 0x3DBE, + 0x3DBE, + 0x3DBF, + 0x3DBF, + 0x3DBF, + 0x3DBF, + 0x3DBF, + 0x3DC0, + 0x3DC0, + 0x3DC0, + 0x3DC0, + 0x3DC1, + 0x3DC1, + 0x3DC1, + 0x3DC1, + 0x3DC1, + 0x3DC2, + 0x3DC2, + 0x3DC2, + 0x3DC2, + 0x3DC2, + 0x3DC3, + 0x3DC3, + 0x3DC3, + 0x3DC3, + 0x3DC3, + 0x3DC4, + 0x3DC4, + 0x3DC4, + 0x3DC4, + 0x3DC4, + 0x3DC5, + 0x3DC5, + 0x3DC5, + 0x3DC5, + 0x3DC5, + 0x3DC6, + 0x3DC6, + 0x3DC6, + 0x3DC6, + 0x3DC6, + 0x3DC7, + 0x3DC7, + 0x3DC7, + 0x3DC7, + 0x3DC7, + 0x3DC8, + 0x3DC8, + 0x3DC8, + 0x3DC8, + 0x3DC8, + 0x3DC9, + 0x3DC9, + 0x3DC9, + 0x3DC9, + 0x3DCA, + 0x3DCA, + 0x3DCA, + 0x3DCA, + 0x3DCA, + 0x3DCB, + 0x3DCB, + 0x3DCB, + 0x3DCB, + 0x3DCB, + 0x3DCC, + 0x3DCC, + 0x3DCC, + 0x3DCC, + 0x3DCC, + 0x3DCD, + 0x3DCE, + 0x3DCF, + 0x3DD0, + 0x3DD1, + 0x3DD2, + 0x3DD3, + 0x3DD4, + 0x3DD5, + 0x3DD6, + 0x3DD7, + 0x3DD8, + 0x3DD9, + 0x3DDA, + 0x3DDB, + 0x3DDC, + 0x3DDD, + 0x3DDE, + 0x3DDF, + 0x3DE0, + 0x3DE1, + 0x3DE2, + 0x3DE3, + 0x3DE4, + 0x3DE5, +}; + +static u16 golden_bf16[] = { + 0x3fc9, + 0x3fbe, + 0x3fb4, + 0x3fac, + 0x3fa6, + 0x3fa0, + 0x3f9b, + 0x3f97, + 0x3f93, + 0x3f90, + 0x3f8e, + 0x3f8b, + 0x3f89, + 0x3f87, + 0x3f84, + 0x3f84, + 0x3f83, + 0x3f81, + 0x3f80, + 0x3f7d, + 0x3f7b, + 0x3f7a, + 0x3f78, + 0x3f76, + 0x3f75, + 0x3f73, + 0x3f72, + 0x3f70, + 0x3f70, + 0x3f6f, + 0x3f6d, + 0x3f6c, + 0x3f6c, + 0x3f6c, + 0x3f69, + 0x3f68, + 0x3f68, + 0x3f68, + 0x3f67, + 0x3f66, + 0x3f66, + 0x3f64, + 0x3f63, + 0x3f64, + 0x3f63, + 0x3f62, + 0x3f63, + 0x3f61, + 0x3f60, + 0x3f61, + 0x3f60, + 0x3f5f, + 0x3f5f, + 0x3f5f, + 0x3f5e, + 0x3f5e, + 0x3f5e, 
+ 0x3f5e, + 0x3f5c, + 0x3f5c, + 0x3f5c, + 0x3f5e, + 0x3f5b, + 0x3f5c, + 0x3f5c, + 0x3f5b, + 0x3f5b, + 0x3f5a, + 0x3f5a, + 0x3f5a, + 0x3f5a, + 0x3f5a, + 0x3f59, + 0x3f5a, + 0x3f59, + 0x3f59, + 0x3f59, + 0x3f58, + 0x3f59, + 0x3f57, + 0x3f58, + 0x3f57, + 0x3f57, + 0x3f57, + 0x3f58, + 0x3f56, + 0x3f56, + 0x3f57, + 0x3f56, + 0x3f56, + 0x3f56, + 0x3f55, + 0x3f55, + 0x3f55, + 0x3f55, + 0x3f55, + 0x3f55, + 0x3f55, + 0x3f55, + 0x3f54, + 0x3f55, + 0x3f55, + 0x3f55, + 0x3f54, + 0x3f54, + 0x3f54, + 0x3f55, + 0x3f55, + 0x3f54, + 0x3f54, + 0x3f54, + 0x3f55, + 0x3f54, + 0x3f53, + 0x3f53, + 0x3f54, + 0x3f53, + 0x3f53, + 0x3f54, + 0x3f53, + 0x3f53, + 0x3f53, + 0x3f53, + 0x3f54, + 0x3f53, + 0x3f53, + 0x3f53, + 0x3f53, + 0x3f52, + 0x3f53, + 0x3f53, + 0x3f52, + 0x3f53, + 0x3f53, + 0x3f52, + 0x3f53, + 0x3f51, + 0x3f52, + 0x3f51, + 0x3f53, + 0x3f52, + 0x3f53, + 0x3f53, + 0x3f53, + 0x3f50, + 0x3f52, + 0x3f52, + 0x3f52, + 0x3f52, + 0x3f51, + 0x3f51, + 0x3f53, + 0x3f52, + 0x3f51, + 0x3f51, + 0x3f51, + 0x3f50, + 0x3f52, + 0x3f52, + 0x3f51, + 0x3f52, + 0x3f51, + 0x3f50, + 0x3f52, + 0x3f52, + 0x3f50, + 0x3f50, + 0x3f51, + 0x3f50, + 0x3f50, + 0x3f52, + 0x3f51, + 0x3f50, + 0x3f50, + 0x3f4f, + 0x3f4f, + 0x3f51, + 0x3f50, + 0x3f50, + 0x3f50, + 0x3f50, + 0x3f4f, + 0x3f51, + 0x3f50, + 0x3f50, + 0x3f50, + 0x3f50, + 0x3f4f, + 0x3f50, + 0x3f4f, + 0x3f50, + 0x3f50, + 0x3f50, + 0x3f4f, + 0x3f4f, + 0x3f4f, + 0x3f4e, + 0x3f4e, + 0x3f4f, + 0x3f4f, + 0x3f4f, + 0x3f4f, + 0x3f4f, + 0x3f4f, + 0x3f50, + 0x3f4f, + 0x3f4e, + 0x3f50, + 0x3f50, + 0x3f50, + 0x3f4f, + 0x3f4f, + 0x3f4f, + 0x3f4f, + 0x3f4f, + 0x3f4e, + 0x3f50, + 0x3f4e, + 0x3f4f, + 0x3f4f, + 0x3f4f, + 0x3f4f, + 0x3f50, + 0x3f4e, + 0x3f4f, + 0x3f4f, + 0x3f4f, + 0x3f4e, + 0x3f4f, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4e, + 0x3f4e, + 0x3f4d, + 0x3f4e, + 0x3f4e, + 0x3f4e, + 0x3f4f, + 0x3f4e, + 0x3f4f, + 0x3f4f, + 0x3f4d, + 0x3f4d, + 0x3f4e, + 0x3f4d, + 0x3f4e, + 0x3f4e, + 0x3f4e, + 0x3f4f, + 0x3f4e, + 0x3f4f, + 0x3f4f, + 0x3f4e, + 0x3f4e, + 0x3f4d, + 0x3f4e, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4e, + 0x3f4f, + 0x3f4e, + 0x3f4e, + 0x3f4d, + 0x3f4e, + 0x3f4d, + 0x3f4d, + 0x3f4e, + 0x3f4e, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4e, + 0x3f4e, + 0x3f4e, + 0x3f4d, + 0x3f4e, + 0x3f4e, + 0x3f4d, + 0x3f4d, + 0x3f4e, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4e, + 0x3f4d, + 0x3f4d, + 0x3f4e, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4e, + 0x3f4e, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4e, + 0x3f4e, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4e, + 0x3f4e, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4c, + 0x3f4c, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4d, + 0x3f4d, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4d, + 0x3f4d, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 
0x3f4c, + 0x3f4c, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4d, + 0x3f4d, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4d, + 0x3f4d, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4b, + 0x3f4b, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4d, + 0x3f4d, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4d, + 0x3f4d, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4c, + 0x3f4c, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4d, + 0x3f4d, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4b, + 0x3f4b, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4d, + 0x3f4d, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4d, + 0x3f4d, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4c, + 0x3f4c, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4c, + 0x3f4c, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4c, + 0x3f4c, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4d, + 0x3f4d, + 0x3f4c, + 0x3f4c, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4b, + 0x3f4b, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4b, + 0x3f4b, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4c, + 0x3f4c, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4c, + 0x3f4c, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4c, + 0x3f4c, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4c, + 0x3f4c, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4c, + 0x3f4c, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4b, + 0x3f4b, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4d, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 
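// Note on this golden table: gen_test_pattern below derives the second input
// as ifmap + 0.001, so these entries are bf16(atan2(x + 0.001, x)); they open
// at 0x3fc9 (~pi/2, the x = 0 case) and settle just above 0x3f49 (~pi/4) as
// the two arguments grow nearly equal.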
0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4c, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4a, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4b, + 0x3f4a, + 0x3f4a, + 0x3f4b, + 0x3f4a, + 0x3f4b, + 0x3f4b, + 0x3f4a, + 0x3f4b, + 0x3f4a, + 0x3f4a, + 0x3f4b, + 0x3f4c, + 0x3f4b, + 0x3f4a, + 0x3f49, + 0x3f4a, + 0x3f4a, + 0x3f4b, + 0x3f4b, + 0x3f4a, + 0x3f4b, + 0x3f4a, + 0x3f4b, + 0x3f4a, + 0x3f4a, +}; + +// 0, exp from 0 -62 -61 .. 62 63 + for (int i = 0; i < half; i++) { + //float exp = round((exp_start + i) / 2) * 2; + int shift = (exp_start + i); + bool is_odd = (shift % 2); + float exp = shift; + if (is_odd) { + exp = shift > 0 ? 
exp - 1 : exp - 1; + } + + double s = _gen_sqrt(2, exp); + table_data[idx] = convert_fp32_bf16(s); +#ifdef DBG + printf("t [%" PRIu64 "] is %f [idx:%f][2^%f(%f)] bf %x\n", idx, + convert_bf16_fp32(table_data[idx]), + float(exp_start + i), exp/2, (exp_start + i) / 2.0, + table_data[idx]); +#endif + idx++; + } + + //// idx = 127 dont care +#if 0 + s = _gen_sqrt(2, -0); + table_data[idx] = convert_fp32_bf16(s); +#if 1 + printf("t [%" PRIu64 "] is %f[%d] bf %x\n", idx, convert_bf16_fp32(table_data[idx]), 0, table_data[idx]); +#endif + idx++; + + for (int i = 1; i < half; i++) { + float exp = exp_start + i; + double s = _gen_sqrt(-2, exp); + table_data[idx] = convert_fp32_bf16(s); +#ifdef DBG + printf("t [%" PRIu64 "] is %f(%e - %.8lf)[(-2)^%f] bf %x\n", idx, convert_bf16_fp32(table_data[idx]), s, s, exp, table_data[idx]); +#endif + idx++; + } + + // idx = 255 dont care + //s = _gen_sqrt(2, 0); + //table_data[idx] = convert_fp32_bf16(s); + //printf("t [%" PRIu64 "] is %f[%d]\n", idx, convert_bf16_fp32(table_data[idx]), 0); + //idx++; +#endif + + // duplicate channel #1 to #31 + //TODO: tensor copy + for (u32 i = 1; i < channel; i++) { + memcpy(&table_data[i * table_hw], &table_data[0], sizeof(u16) * table_hw); + } +} + +static void gen_sqrt_mantissa(u16 IN *table_data, u16* OUT table_mantissa, u64 table_size) { + + u32 half = table_size / channel / 2; + assert(half == 128); + assert(table_data); + + int idx = 0; + double d; + for (u32 i = 0; i < half; i++) { + d = 1 + i * 1 / 128.0; + d = (double) pow(d, 0.5); + table_mantissa[128+idx] = convert_fp32_bf16(d); +#ifdef DBG + //printf(", [%u] is %lf\n", i+128, d); +#endif /* ifdef DBG */ + + //13=2^3x1.625=(2^2)x(2^1x1.625) + d = 2 * (1 + i * 1 / 128.0); + d = (double) pow(d, 0.5); + table_mantissa[idx] = convert_fp32_bf16(d); +#ifdef DBG + //printf("mantissa [%u] is %lf", i, d); +#endif /* ifdef DBG */ + idx++; + } +#ifdef DBG + for (u32 i = 0; i < 2 * half; i++) { + printf("mantissa [%u] is %lf, 0x%x\n", i, convert_bf16_fp32(table_mantissa[i]), + table_mantissa[i]); + } +#endif /* ifdef DBG */ + + // duplicate channel #1 to #31 + //TODO: tensor copy + for (u64 i = 1; i < channel; i++) { + memcpy(&table_mantissa[table_hw * i], &table_mantissa[0], sizeof(u16) * table_hw); + } +} + +static void gen_reciprocal(u16 *table_data, u64 table_size) { + // 0, exp from 0 -62 -61 .. 62 63 + for (int i = 0; i < half; i++) { + int shift = (exp_start + i); + bool is_odd = (shift % 2); + float exp = shift; + if (is_odd) { + exp = shift > 0 ? exp - 1 : exp - 1; + } + + double s = _gen_reciprocal(2, exp); + table_data[idx] = convert_fp32_bf16(s); +#ifdef DBG + printf("t [%" PRIu64 "] is %f [idx:%f][2^%f] bf %x\n", idx, + convert_bf16_fp32(table_data[idx]), + float(exp_start + i), -1 * exp, + table_data[idx]); +#endif + idx++; + } + + s = _gen_reciprocal(2, -0); + table_data[idx] = convert_fp32_bf16(s); +#ifdef DBG + printf("t [%" PRIu64 "] is %f[%d] bf %x\n", idx, convert_bf16_fp32(table_data[idx]), 0, table_data[idx]); +#endif + idx++; + + for (int i = 1; i < half; i++) { + int shift = (exp_start + i); + bool is_odd = (shift % 2); + float exp = shift; + if (is_odd) { + exp = shift > 0 ? 
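// Scalar model of the exponent/mantissa table pair generated here (a sketch;
// lut_sqrt_model is a hypothetical name). The input splits as m * 2^e with m
// in [1, 2); an odd exponent donates one factor of 2 to the mantissa lookup,
// which is why gen_sqrt_mantissa stores both sqrt(m) and sqrt(2m) halves
// ("13 = 2^3 x 1.625 = (2^2) x (2^1 x 1.625)"). gen_reciprocal, in progress
// just below, applies the same split with pow(m, -1) and the negated exponent.
static float lut_sqrt_model(float x) {
  int e;
  float m = 2.0f * frexpf(x, &e);  // x = m * 2^(e-1), with m in [1, 2)
  e -= 1;
  int odd = e & 1;
  if (odd) e -= 1;                 // make the exponent even so it halves exactly
  return ldexpf(1.0f, e / 2) * sqrtf(odd ? 2.0f * m : m);
}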
exp - 1 : exp - 1; + } + + double s = _gen_reciprocal(-2, exp); + table_data[idx] = convert_fp32_bf16(s); +#ifdef DBG + printf("t [%" PRIu64 "] is %f(%e - %.8lf)[(-2)^%f] bf %x\n", idx, convert_bf16_fp32(table_data[idx]), s, s, exp, table_data[idx]); +#endif + idx++; + } + + // idx = 255 dont care + //s = _gen_reciprocal(2, 0); + //table_data[idx] = convert_fp32_bf16(s); + //printf("t [%" PRIu64 "] is %f[%d]\n", idx, convert_bf16_fp32(table_data[idx]), 0); + //idx++; + + // duplicate channel #1 to #31 + //TODO: tensor copy + for (u32 i = 1; i < channel; i++) { + memcpy(&table_data[i * table_hw], &table_data[0], sizeof(u16) * table_hw); + } +} + +static void gen_reciprocal_mantissa(u16 IN *table_data, u16* OUT table_mantissa, u64 table_size) { + + u32 half = table_size / channel / 2; + assert(half == 128); + assert(table_data); + + int idx = 0; + double d; + for (u32 i = 0; i < half; i++) { + d = 1 + i * 1 / 128.0; + d = (double) pow(d, -1); + table_mantissa[128+idx] = convert_fp32_bf16(d); + + //13=2^3x1.625=(2^2)x(2^1x1.625) + d = 2 * (1 + i * 1 / 128.0); + d = (double) pow(d, -1); + table_mantissa[idx] = convert_fp32_bf16(d); + idx++; + } + +#ifdef DBG + for (u32 i = 0; i < 2 * half; i++) { + printf("mantissa [%u] is %lf, 0x%x\n", i, convert_bf16_fp32(table_mantissa[i]), + table_mantissa[i]); + } +#endif /* ifdef DBG */ + + // duplicate channel #1 to #31 + //TODO: tensor copy + for (u64 i = 1; i < channel; i++) { + memcpy(&table_mantissa[table_hw * i], &table_mantissa[0], sizeof(u16) * table_hw); + } +} + +static void gen_atan_y0(u16 *table_data_y0, u64 table_size, + int range_start, int range_end) { + float scale = table_hw / (1.0 * abs(range_start - range_end)); + // half) { + x0 = lut[i]; + x1 = lut[i-1]; + delta = -1.0; + } + double s = (x1 - x0) / delta; // x1 already scale up + table_slope[i] = convert_fp32_bf16((float)s); +#ifdef DBG + printf ("slope table [%u] = (bf16 %f double %.8lf float %f), 0x%x, %.8lf - %.8lf(%.8lf)\n", + i, convert_bf16_fp32(table_slope[i]), s, (float)s, table_slope[i], x1, x0, x1-x0); +#endif + } + +#if 0 //def DBG + for (u32 i = 0; i < 2 * half; i++) { + printf("slope [%u] is %lf, 0x%x\n", i, convert_bf16_fp32(table_slope[i]), + table_slope[i]); + } +#endif /* ifdef DBG */ + + // duplicate channel #1 to #31 + //TODO: tensor copy + for (u64 i = 1; i < channel; i++) { + memcpy(&table_slope[table_hw * i], &table_slope[0], sizeof(u16) * table_hw); + } +} + +static bool verify(u16 *ofmap_data, u16 *ref_data, u16* ifmap, u64 ifmap_shape_size, + TEST_MODE mode, float epsilon) { + u64 size = ifmap_shape_size; + + for (u64 i = 0; i < size; i++) { + bool is_close; + u16 ref; + u16 ofmap_data_bf16; + float ref_f; + float ofmap_data_f; + u32 shift; + + if (mode == DATA_COMPARE_U8) { + shift = (i%2)*8; + ref = ref_data[i]; + ofmap_data_bf16 = (u16)ofmap_data[i/2]; + ofmap_data_f = (float)(ofmap_data[i/2] >> shift); + ref_f = (float)(ref); + + is_close = ((u8)(ofmap_data[i/2] >> shift)) == (u8)ref; + + //printf("[%" PRIu64 "] of is %x ref is %x\n", i, (u8)(ofmap_data[i/2] >> shift), (u8)ref); + } + else { + ref = ref_data[i]; + ref_f = convert_bf16_fp32(ref); + ofmap_data_f = convert_bf16_fp32(ofmap_data[i]); + ofmap_data_bf16 = ofmap_data[i]; + + if (mode == PRE_DATA_COMPARE_FIX) { + is_close = ofmap_data[i] == ref; + } + else { + is_close = fabs(ref_f-ofmap_data_f) < epsilon; + } + } + + if (!is_close) { + fprintf(stderr, + "comparing failed at ofmap_data[%" PRIu64 "](input:%e), got %x, exp %x, fp32: got %e exp %e\n", + i, convert_bf16_fp32(ifmap[i]), + 
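// The pass/fail rule coded above, isolated for reference (a sketch; these
// helper names are hypothetical, while convert_bf16_fp32 is the util the
// tests already use): bf16 words widen to fp32 and compare within epsilon,
// except in U8 mode, where two 8-bit results pack into one 16-bit word and
// lane i sits at bit (i % 2) * 8.
static bool bf16_within(u16 got, u16 want, float epsilon) {
  return fabs(convert_bf16_fp32(got) - convert_bf16_fp32(want)) < epsilon;
}
static u8 packed_u8_lane(const u16 *ofmap_data, u64 i) {
  return (u8)(ofmap_data[i / 2] >> ((i % 2) * 8));
}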
ofmap_data_bf16, ref, ofmap_data_f, ref_f); + exit(-1); + } + } + + return true; +} + +/* + * NOTICE: this occupies two lookup tables whose shape is <1,32,32,8> with bf16 data type + * + * \tl_buf, \tl_buf2 tmp buffers; the shape MUST be the same as \tl_ifmap + * \tl_ofmap_u8 result as u8 type; NULL means use the bf16 result + * \tl_ofmap_bf16 result as bf16; MUST be given since it also serves as a tmp buffer + * \range_start, \range_end specify the data range, default range is -8 ~ +8 + */ +static int bf16_emit(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, + tl_t* tl_ifmap, + tl_t* tl_ifmap2, + tl_t* tl_buf, + tl_t* tl_buf2, + tl_t* OUT tl_ofmap_bf16, + tl_t* OUT tl_ofmap_u8, + tl_t *tl_table_answer, tl_t *tl_table_answer_mantissa, + int range_start, int range_end + ) { + assert(tl_ofmap_bf16); + assert(tl_ifmap2); + assert(tl_buf2); + assert(tl_buf); + + fmt_t fmt = FMT_BF16; + float scale = table_hw / (1.0 * abs(range_start - range_end)); + int data_type_size = bytesize_of_fmt(fmt); + u64 table_size = tl_shape_size(&tl_table_answer->shape); + u64 table_bytesize = table_size * data_type_size; + + bmk1880v2_tdma_tg2l_tensor_copy_param_t copy_p2, copy_p3; + memset(&copy_p2, 0, sizeof(copy_p2)); + memset(&copy_p3, 0, sizeof(copy_p3)); + + bmk1880v2_tdma_l2l_tensor_copy_param_t p10; + memset(&p10, 0, sizeof(p10)); + + // 1. get x^2 + y^2 + bmk1880v2_tiu_element_wise_mul_param_t p1; + memset(&p1, 0, sizeof(p1)); + p1.res_high = NULL; + p1.res_low = tl_buf2; + p1.a = tl_ifmap; + p1.b_is_const = 0; + p1.b = tl_ifmap; + p1.rshift_bits = 0; + p1.relu_enable = 0; + bmk1880v2_tiu_element_wise_mul(bmk, &p1); + + bmk1880v2_tiu_element_wise_mac_param_t p2; + memset(&p2, 0, sizeof(p2)); + p2.res_high = 0; + p2.res_low = tl_buf2; + p2.res_is_int8 = 0; + p2.a = tl_ifmap2; + p2.b_is_const = 0; + p2.b = tl_ifmap2; + p2.lshift_bits = 0; + p2.rshift_bits = 0; + p2.relu_enable = 0; + bmk1880v2_tiu_element_wise_mac(bmk, &p2); + + // 2. 
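// Host-side shape of steps 1 and 2 (a sketch; magnitude_model is a
// hypothetical name): the element-wise mul squares x into tl_buf2, the mac
// accumulates y*y on top, and the exponent/mantissa table pair loaded next
// models the square root, leaving the vector magnitude in local memory.
static float magnitude_model(float x, float y) {
  return sqrtf(x * x + y * y);
}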
sqrt + + // prepare exp table + u16 *table_data = (u16 *)xmalloc(table_bytesize); + gen_sqrt (table_data, table_size); + + // prepare mantissa table + u16 *table_data_mantissa = (u16 *)xmalloc(table_bytesize); + gen_sqrt_mantissa(table_data, table_data_mantissa, table_size); + + prepare_put_bf16_tensor_g2l(ctx, bmk, tl_table_answer, table_data, fmt, ©_p2); + prepare_put_bf16_tensor_g2l(ctx, bmk, tl_table_answer_mantissa, table_data_mantissa, fmt, ©_p3); + + launch_put_bf16_tensor_g2l(ctx, bmk, copy_p2.src, ©_p2); // table value + launch_put_bf16_tensor_g2l(ctx, bmk, copy_p3.src, ©_p3); // table mantissa + + // remove low 8 bits by int8 copy with stride + // int8 + tl_shape_t tl_shape_int8 = {1, channel, tl_ofmap_bf16->shape.h * tl_ofmap_bf16->shape.w, 1}; + memset(&p10, 0x00, sizeof(bmk1880v2_tdma_l2l_tensor_copy_param_t)); + bmk1880v2_tensor_lmem_t dst; + memcpy(&dst, tl_ifmap, sizeof(bmk1880v2_tensor_lmem_t)); + dst.fmt = FMT_I8; + dst.shape = tl_shape_int8; + dst.stride = bmk1880v2_tensor_lmem_default_stride(bmk, dst.shape, dst.fmt, /*align*/1); + dst.stride.h = dst.stride.h * 2; + dst.int8_rnd_mode = 1; + p10.dst = &dst; + p10.src = tl_ofmap_bf16; + bmk1880v2_tdma_l2l_bf16_tensor_copy(bmk, &p10); + test_submit(ctx); + dst.int8_rnd_mode = 0; // reset + + // 16 + dst.fmt = fmt; + dst.shape = tl_buf->shape; + dst.stride = tl_buf->stride; + + // (time(0))); + std::random_device rd; + std::mt19937 e2(rd()); + //std::uniform_real_distribution<> dist(pow(2,-62), pow(2,63)); + for (u64 i = 0; i < ifmap_shape_size; i++) { + //float r3 = dist(e2); + float r3 = LO + static_cast (rand()) /( static_cast (RAND_MAX/(HI-LO))); + ifmap[i] = convert_fp32_bf16(r3); + } + } + } + + for (u64 i = 0; i < ifmap_shape_size; i++) { + ifmap2[i] = convert_fp32_bf16(convert_bf16_fp32(ifmap[i]) + 0.001); + } + +#if 0//#ifdef DBG + for (u64 i = 0; i < ifmap_shape_size; i++) { + printf("source if[%" PRIu64 "] bf16 %f 0x%x, log2f is %f\n", i, convert_bf16_fp32(ifmap[i]), ifmap[i], floor(log2((convert_bf16_fp32(ifmap[i]))))); + } +#endif /* ifdef DBG */ +} + +static void test_tl_int8_lut_bf16(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, + u32 input_n, u32 input_c, u32 input_h, u32 input_w, + float epsilon, int range_start, int range_end) +{ + // TODO: check more shape / align + tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w}; + tl_shape_t ofmap_shape = ifmap_shape; + tl_shape_t table_shape = {1, channel, table_h, table_w}; // hard code for hw, hw:32x8 + + u64 ifmap_shape_size = tl_shape_size(&ifmap_shape); + u64 ofmap_size = tl_shape_size(&ofmap_shape); + + fmt_t fmt = FMT_BF16; + + int data_type_size = bytesize_of_fmt(fmt); + u64 ifmap_bytesize = ifmap_shape_size * data_type_size; + u64 ofmap_bytesize = ofmap_size * data_type_size; + + u16 *ifmap = (u16 *)xmalloc(ifmap_bytesize); + u16 *ifmap2 = (u16 *)xmalloc(ifmap_bytesize); + u16 *ref_data = (u16 *)xmalloc(ofmap_bytesize); + + tl_t *tl_ifmap = alloc_tl(bmk,ifmap_shape, fmt, /*align*/1); + tl_t *tl_ifmap2 = alloc_tl(bmk,ifmap_shape, fmt, /*align*/1); + tl_t *tl_ofmap_bf16 = alloc_tl(bmk,ofmap_shape, fmt, /*align*/1); + tl_t *tl_buf = tl_ifmap ? 
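// A self-contained form of the uniform sampler used in gen_test_pattern above
// (a sketch; the helper name is hypothetical and the <float> template
// arguments are an assumption inferred from the surrounding float types):
static float uniform_float(float lo, float hi) {
  return lo + static_cast<float>(rand()) / (static_cast<float>(RAND_MAX / (hi - lo)));
}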
alloc_tl(bmk, tl_ifmap->shape, fmt, /*align*/1) : nullptr; + tl_t *tl_buf2 = alloc_tl(bmk, tl_ifmap->shape, fmt, /*align*/1); + tl_t *tl_table_answer = alloc_tl(bmk, table_shape, fmt, /*align*/1); + tl_t *tl_table_answer_mantissa = alloc_tl(bmk, table_shape, fmt, /*align*/1); + + gen_test_pattern(ifmap, ifmap2, mode, ifmap_shape_size, range_start, range_end); + tl_lut_ref(ref_data, ifmap, ifmap2, ifmap_shape); + + tl_t *tl_ofmap_u8 = nullptr; + tl_t *out = tl_ofmap_bf16; + + if (mode == DATA_COMPARE_U8) { + tl_ofmap_u8 = + alloc_tl(bmk,ofmap_shape, FMT_U8, /*align*/1); + out = tl_ofmap_u8; + } + + // fmt); + + verify(ofmap_data, ref_data, ifmap, ifmap_shape_size, mode, epsilon); + + if (tl_ofmap_u8) { + free_tl(bmk, tl_ofmap_u8); + } + + free_tl(bmk, tl_buf2); + free_tl(bmk, tl_buf); + free_tl(bmk, tl_ofmap_bf16); + free_tl(bmk, tl_ifmap2); + free_tl(bmk, tl_ifmap); + + free(ifmap); + free(ifmap2); + free(ref_data); + free(ofmap_data); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + int round_mode; + + round_mode = set_store_feround(); + + test_init(&ctx, &bmk); + + for (int i = PRE_DATA_COMPARE_FIX; i < DATA_COMPARE_U8; i++) + //for (int i = PRE_DATA_COMPARE_FIX; i < DATA_COMPARE; i++) + //for (int i = DATA_COMPARE; i < DATA_COMPARE_U8; i++) + { + mode = static_cast(i); + printf ("test mode %d...\n", mode); + + int input_n = 1; + int input_c = channel; + int input_h = 1; + int input_w = 1; + float epsilon = 0.1; + int range_start = -8; + int range_end = 8; + + if (mode == PRE_DATA_COMPARE_FIX) { + input_h = 4; + input_w = 8; + } + else { + input_h = input_w = 16; + } + + test_tl_int8_lut_bf16(&ctx, bmk, + input_n, input_c, input_h, input_w, + epsilon, range_start, range_end); + } + + test_exit(&ctx); + restore_feround(round_mode); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_atan2_fast_degree_kernel.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_atan2_fast_degree_kernel.cpp new file mode 100644 index 000000000..0b34520a3 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_atan2_fast_degree_kernel.cpp @@ -0,0 +1,745 @@ +/** + * \breif atan2 is implemented by atan, you can refer + * [wiki](https://en.wikipedia.org/wiki/Atan2) for more details + */ + +#include "../1880v2_test_util.h" +#define OUT +#define IN +#include +#include +#include +#include +#include +#include +//#define DBG + +/** + * pre_data means we test fixed pattern, it should be same sa lut + */ +enum TEST_MODE { + PRE_DATA_COMPARE_FIX = 0, // pre-data + fix compare + DATA_COMPARE_ACCURACY, // generate \range_start to \range_end value that + // check epsilon, default set x > 0, y > 0 + + DATA_COMPARE_ACCURACY_X_GT_0, // atan(y/x), x > 0, y = 0 + DATA_COMPARE_ACCURACY_X_LT_0_Y_GE_0, // atan(y/x) + PI , x < 0 and y >= 0 + DATA_COMPARE_ACCURACY_X_LT_0_Y_LT_0, // atan(y/x) - PI , x < 0 and y < 0 + DATA_COMPARE_ACCURACY_X_0_Y_GT_0, // pi / 2, x = 0 and y > 0 + DATA_COMPARE_ACCURACY_X_0_Y_LT_0, // -pi / 2, x = 0 and y < 0 + DATA_COMPARE_U8, // generate \range_start to \range_end value that check + // epsilon, result bf16->u8 + TEST_MODE_MAX, +}; + + +static TEST_MODE mode; + +static u16 test_pattern[] = { + 0x0000, 0x38D2, 0x3952, 0x399D, 0x39D2, 0x3A03, 0x3A1D, 0x3A38, 0x3A52, + 0x3A6C, 0x3A83, 0x3A90, 0x3A9D, 0x3AAA, 0x3AB8, 0x3AC5, 0x3AD2, 0x3ADF, + 0x3AEC, 0x3AF9, 0x3B03, 0x3B0A, 0x3B10, 0x3B17, 0x3B1D, 0x3B24, 0x3B2A, + 0x3B31, 0x3B38, 0x3B3E, 0x3B45, 0x3B4B, 0x3B52, 0x3B58, 0x3B5F, 0x3B65, + 0x3B6C, 0x3B72, 0x3B79, 0x3B80, 0x3B83, 0x3B86, 0x3B8A, 0x3B8D, 0x3B90, + 
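// Reference reduction behind the mode list above (a sketch; atan2_model is a
// hypothetical name). The kernel builds atan2 from atan plus these quadrant
// fixups, and the "fast degree" variant reports the result in degrees, i.e.
// the value below times 180 / pi.
static float atan2_model(float y, float x) {
  if (x > 0)           return atanf(y / x);
  if (x < 0 && y >= 0) return atanf(y / x) + (float)M_PI;
  if (x < 0 && y < 0)  return atanf(y / x) - (float)M_PI;
  if (y > 0)           return (float)M_PI / 2;   // x == 0, y > 0
  if (y < 0)           return -(float)M_PI / 2;  // x == 0, y < 0
  return 0.0f;                                   // x == 0, y == 0: undefined, 0 by convention
}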
0x3B93, 0x3B97, 0x3B9A, 0x3B9D, 0x3BA1, 0x3BA4, 0x3BA7, 0x3BAA, 0x3BAE, + 0x3BB1, 0x3BB4, 0x3BB8, 0x3BBB, 0x3BBE, 0x3BC1, 0x3BC5, 0x3BC8, 0x3BCB, + 0x3BCE, 0x3BD2, 0x3BD5, 0x3BD8, 0x3BDC, 0x3BDF, 0x3BE2, 0x3BE5, 0x3BE9, + 0x3BEC, 0x3BEF, 0x3BF2, 0x3BF6, 0x3BF9, 0x3BFC, 0x3C00, 0x3C01, 0x3C03, + 0x3C05, 0x3C06, 0x3C08, 0x3C0A, 0x3C0B, 0x3C0D, 0x3C0F, 0x3C10, 0x3C12, + 0x3C13, 0x3C15, 0x3C17, 0x3C18, 0x3C1A, 0x3C1C, 0x3C1D, 0x3C1F, 0x3C21, + 0x3C22, 0x3C24, 0x3C25, 0x3C27, 0x3C29, 0x3C2A, 0x3C2C, 0x3C2E, 0x3C2F, + 0x3C31, 0x3C33, 0x3C34, 0x3C36, 0x3C38, 0x3C39, 0x3C3B, 0x3C3C, 0x3C3E, + 0x3C40, 0x3C41, 0x3C43, 0x3C45, 0x3C46, 0x3C48, 0x3C4A, 0x3C4B, 0x3C4D, + 0x3C4E, 0x3C50, 0x3C52, 0x3C53, 0x3C55, 0x3C57, 0x3C58, 0x3C5A, 0x3C5C, + 0x3C5D, 0x3C5F, 0x3C60, 0x3C62, 0x3C64, 0x3C65, 0x3C67, 0x3C69, 0x3C6A, + 0x3C6C, 0x3C6E, 0x3C6F, 0x3C71, 0x3C72, 0x3C74, 0x3C76, 0x3C77, 0x3C79, + 0x3C7B, 0x3C7C, 0x3C7E, 0x3C80, 0x3C81, 0x3C81, 0x3C82, 0x3C83, 0x3C84, + 0x3C85, 0x3C86, 0x3C86, 0x3C87, 0x3C88, 0x3C89, 0x3C8A, 0x3C8A, 0x3C8B, + 0x3C8C, 0x3C8D, 0x3C8E, 0x3C8F, 0x3C8F, 0x3C90, 0x3C91, 0x3C92, 0x3C93, + 0x3C93, 0x3C94, 0x3C95, 0x3C96, 0x3C97, 0x3C98, 0x3C98, 0x3C99, 0x3C9A, + 0x3C9B, 0x3C9C, 0x3C9C, 0x3C9D, 0x3C9E, 0x3C9F, 0x3CA0, 0x3CA1, 0x3CA1, + 0x3CA2, 0x3CA3, 0x3CA4, 0x3CA5, 0x3CA5, 0x3CA6, 0x3CA7, 0x3CA8, 0x3CA9, + 0x3CAA, 0x3CAA, 0x3CAB, 0x3CAC, 0x3CAD, 0x3CAE, 0x3CAE, 0x3CAF, 0x3CB0, + 0x3CB1, 0x3CB2, 0x3CB3, 0x3CB3, 0x3CB4, 0x3CB5, 0x3CB6, 0x3CB7, 0x3CB8, + 0x3CB8, 0x3CB9, 0x3CBA, 0x3CBB, 0x3CBC, 0x3CBC, 0x3CBD, 0x3CBE, 0x3CBF, + 0x3CC0, 0x3CC1, 0x3CC1, 0x3CC2, 0x3CC3, 0x3CC4, 0x3CC5, 0x3CC5, 0x3CC6, + 0x3CC7, 0x3CC8, 0x3CC9, 0x3CCA, 0x3CCA, 0x3CCB, 0x3CCC, 0x3CCD, 0x3CCE, + 0x3CCE, 0x3CCF, 0x3CD0, 0x3CD1, 0x3CD2, 0x3CD3, 0x3CD3, 0x3CD4, 0x3CD5, + 0x3CD6, 0x3CD7, 0x3CD7, 0x3CD8, 0x3CD9, 0x3CDA, 0x3CDB, 0x3CDC, 0x3CDC, + 0x3CDD, 0x3CDE, 0x3CDF, 0x3CE0, 0x3CE0, 0x3CE1, 0x3CE2, 0x3CE3, 0x3CE4, + 0x3CE5, 0x3CE5, 0x3CE6, 0x3CE7, 0x3CE8, 0x3CE9, 0x3CE9, 0x3CEA, 0x3CEB, + 0x3CEC, 0x3CED, 0x3CEE, 0x3CEE, 0x3CEF, 0x3CF0, 0x3CF1, 0x3CF2, 0x3CF2, + 0x3CF3, 0x3CF4, 0x3CF5, 0x3CF6, 0x3CF7, 0x3CF7, 0x3CF8, 0x3CF9, 0x3CFA, + 0x3CFB, 0x3CFB, 0x3CFC, 0x3CFD, 0x3CFE, 0x3CFF, 0x3D00, 0x3D00, 0x3D01, + 0x3D01, 0x3D01, 0x3D02, 0x3D02, 0x3D03, 0x3D03, 0x3D03, 0x3D04, 0x3D04, + 0x3D05, 0x3D05, 0x3D06, 0x3D06, 0x3D06, 0x3D07, 0x3D07, 0x3D08, 0x3D08, + 0x3D08, 0x3D09, 0x3D09, 0x3D0A, 0x3D0A, 0x3D0A, 0x3D0B, 0x3D0B, 0x3D0C, + 0x3D0C, 0x3D0C, 0x3D0D, 0x3D0D, 0x3D0E, 0x3D0E, 0x3D0F, 0x3D0F, 0x3D0F, + 0x3D10, 0x3D10, 0x3D11, 0x3D11, 0x3D11, 0x3D12, 0x3D12, 0x3D13, 0x3D13, + 0x3D13, 0x3D14, 0x3D14, 0x3D15, 0x3D15, 0x3D16, 0x3D16, 0x3D16, 0x3D17, + 0x3D17, 0x3D18, 0x3D18, 0x3D18, 0x3D19, 0x3D19, 0x3D1A, 0x3D1A, 0x3D1A, + 0x3D1B, 0x3D1B, 0x3D1C, 0x3D1C, 0x3D1C, 0x3D1D, 0x3D1D, 0x3D1E, 0x3D1E, + 0x3D1F, 0x3D1F, 0x3D1F, 0x3D20, 0x3D20, 0x3D21, 0x3D21, 0x3D21, 0x3D22, + 0x3D22, 0x3D23, 0x3D23, 0x3D23, 0x3D24, 0x3D24, 0x3D25, 0x3D25, 0x3D25, + 0x3D26, 0x3D26, 0x3D27, 0x3D27, 0x3D28, 0x3D28, 0x3D28, 0x3D29, 0x3D29, + 0x3D2A, 0x3D2A, 0x3D2A, 0x3D2B, 0x3D2B, 0x3D2C, 0x3D2C, 0x3D2C, 0x3D2D, + 0x3D2D, 0x3D2E, 0x3D2E, 0x3D2E, 0x3D2F, 0x3D2F, 0x3D30, 0x3D30, 0x3D31, + 0x3D31, 0x3D31, 0x3D32, 0x3D32, 0x3D33, 0x3D33, 0x3D33, 0x3D34, 0x3D34, + 0x3D35, 0x3D35, 0x3D35, 0x3D36, 0x3D36, 0x3D37, 0x3D37, 0x3D38, 0x3D38, + 0x3D38, 0x3D39, 0x3D39, 0x3D3A, 0x3D3A, 0x3D3A, 0x3D3B, 0x3D3B, 0x3D3C, + 0x3D3C, 0x3D3C, 0x3D3D, 0x3D3D, 0x3D3E, 0x3D3E, 0x3D3E, 0x3D3F, 0x3D3F, + 0x3D40, 0x3D40, 0x3D41, 0x3D41, 0x3D41, 0x3D42, 0x3D42, 0x3D43, 0x3D43, + 
0x3D43, 0x3D44, 0x3D44, 0x3D45, 0x3D45, 0x3D45, 0x3D46, 0x3D46, 0x3D47, + 0x3D47, 0x3D47, 0x3D48, 0x3D48, 0x3D49, 0x3D49, 0x3D4A, 0x3D4A, 0x3D4A, + 0x3D4B, 0x3D4B, 0x3D4C, 0x3D4C, 0x3D4C, 0x3D4D, 0x3D4D, 0x3D4E, 0x3D4E, + 0x3D4E, 0x3D4F, 0x3D4F, 0x3D50, 0x3D50, 0x3D50, 0x3D51, 0x3D51, 0x3D52, + 0x3D52, 0x3D53, 0x3D53, 0x3D53, 0x3D54, 0x3D54, 0x3D55, 0x3D55, 0x3D55, + 0x3D56, 0x3D56, 0x3D57, 0x3D57, 0x3D57, 0x3D58, 0x3D58, 0x3D59, 0x3D59, + 0x3D59, 0x3D5A, 0x3D5A, 0x3D5B, 0x3D5B, 0x3D5C, 0x3D5C, 0x3D5C, 0x3D5D, + 0x3D5D, 0x3D5E, 0x3D5E, 0x3D5E, 0x3D5F, 0x3D5F, 0x3D60, 0x3D60, 0x3D60, + 0x3D61, 0x3D61, 0x3D62, 0x3D62, 0x3D63, 0x3D63, 0x3D63, 0x3D64, 0x3D64, + 0x3D65, 0x3D65, 0x3D65, 0x3D66, 0x3D66, 0x3D67, 0x3D67, 0x3D67, 0x3D68, + 0x3D68, 0x3D69, 0x3D69, 0x3D69, 0x3D6A, 0x3D6A, 0x3D6B, 0x3D6B, 0x3D6C, + 0x3D6C, 0x3D6C, 0x3D6D, 0x3D6D, 0x3D6E, 0x3D6E, 0x3D6E, 0x3D6F, 0x3D6F, + 0x3D70, 0x3D70, 0x3D70, 0x3D71, 0x3D71, 0x3D72, 0x3D72, 0x3D72, 0x3D73, + 0x3D73, 0x3D74, 0x3D74, 0x3D75, 0x3D75, 0x3D75, 0x3D76, 0x3D76, 0x3D77, + 0x3D77, 0x3D77, 0x3D78, 0x3D78, 0x3D79, 0x3D79, 0x3D79, 0x3D7A, 0x3D7A, + 0x3D7B, 0x3D7B, 0x3D7B, 0x3D7C, 0x3D7C, 0x3D7D, 0x3D7D, 0x3D7E, 0x3D7E, + 0x3D7E, 0x3D7F, 0x3D7F, 0x3D80, 0x3D80, 0x3D80, 0x3D80, 0x3D81, 0x3D81, + 0x3D81, 0x3D81, 0x3D81, 0x3D82, 0x3D82, 0x3D82, 0x3D82, 0x3D82, 0x3D83, + 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D84, 0x3D84, 0x3D84, 0x3D84, 0x3D85, + 0x3D85, 0x3D85, 0x3D85, 0x3D85, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D86, + 0x3D87, 0x3D87, 0x3D87, 0x3D87, 0x3D87, 0x3D88, 0x3D88, 0x3D88, 0x3D88, + 0x3D88, 0x3D89, 0x3D89, 0x3D89, 0x3D89, 0x3D89, 0x3D8A, 0x3D8A, 0x3D8A, + 0x3D8A, 0x3D8A, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8C, 0x3D8C, + 0x3D8C, 0x3D8C, 0x3D8C, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8E, 0x3D8E, + 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D90, + 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D91, 0x3D91, 0x3D91, 0x3D91, 0x3D91, + 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D93, 0x3D93, 0x3D93, 0x3D93, + 0x3D93, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D95, 0x3D95, 0x3D95, + 0x3D95, 0x3D96, 0x3D96, 0x3D96, 0x3D96, 0x3D96, 0x3D97, 0x3D97, 0x3D97, + 0x3D97, 0x3D97, 0x3D98, 0x3D98, 0x3D98, 0x3D98, 0x3D98, 0x3D99, 0x3D99, + 0x3D99, 0x3D99, 0x3D99, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9B, + 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, + 0x3D9D, 0x3D9D, 0x3D9D, 0x3D9D, 0x3D9D, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9E, + 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA0, + 0x3DA0, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA2, 0x3DA2, 0x3DA2, + 0x3DA2, 0x3DA2, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA4, 0x3DA4, + 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA6, + 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA7, 0x3DA7, 0x3DA7, 0x3DA7, 0x3DA7, 0x3DA8, + 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, + 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, + 0x3DAB, 0x3DAC, 0x3DAC, 0x3DAC, 0x3DAC, 0x3DAC, 0x3DAD, 0x3DAD, 0x3DAD, + 0x3DAD, 0x3DAD, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAF, 0x3DAF, + 0x3DAF, 0x3DAF, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB1, 0x3DB1, + 0x3DB1, 0x3DB1, 0x3DB1, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB3, + 0x3DB3, 0x3DB3, 0x3DB3, 0x3DB3, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, + 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB6, + 0x3DB6, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB8, 0x3DB8, 0x3DB8, 0x3DB8, + 0x3DB8, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DBA, 0x3DBA, 0x3DBA, + 
0x3DBA, 0x3DBA, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBC, 0x3DBC, + 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBD, 0x3DBD, 0x3DBD, 0x3DBD, 0x3DBD, 0x3DBE, + 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, + 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, + 0x3DC2, 0x3DC2, 0x3DC2, 0x3DC2, 0x3DC2, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, + 0x3DC3, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC5, 0x3DC5, 0x3DC5, + 0x3DC5, 0x3DC5, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC7, 0x3DC7, + 0x3DC7, 0x3DC7, 0x3DC7, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC9, + 0x3DC9, 0x3DC9, 0x3DC9, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCB, + 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCC, 0x3DCC, 0x3DCC, 0x3DCC, 0x3DCC, + 0x3DCD, 0x3DCE, 0x3DCF, 0x3DD0, 0x3DD1, 0x3DD2, 0x3DD3, 0x3DD4, 0x3DD5, + 0x3DD6, 0x3DD7, 0x3DD8, 0x3DD9, 0x3DDA, 0x3DDB, 0x3DDC, 0x3DDD, 0x3DDE, + 0x3DDF, 0x3DE0, 0x3DE1, 0x3DE2, 0x3DE3, 0x3DE4, 0x3DE5, +}; + +static u16 golden_bf16[] = { + 0x42b4, 0x42b4, 0x42b4, 0x42b4, 0x42b4, 0x42b4, 0x42b3, 0x42b3, 0x42b3, + 0x42b3, 0x42b3, 0x42b3, 0x42b3, 0x42b3, 0x42b3, 0x42b3, 0x42b2, 0x42b2, + 0x42b2, 0x42b2, 0x42b2, 0x42b2, 0x42b2, 0x42b2, 0x42b2, 0x42b1, 0x42b1, + 0x42b1, 0x42b1, 0x42b1, 0x42b1, 0x42b1, 0x42b1, 0x42b1, 0x42b1, 0x42af, + 0x42af, 0x42af, 0x42af, 0x42af, 0x42af, 0x42af, 0x42af, 0x42af, 0x42af, + 0x42ae, 0x42ae, 0x42ae, 0x42ae, 0x42ae, 0x42ae, 0x42ae, 0x42ae, 0x42ae, + 0x42ad, 0x42ad, 0x42ad, 0x42ad, 0x42ad, 0x42ad, 0x42ad, 0x42ad, 0x42ad, + 0x42ac, 0x42ac, 0x42ac, 0x42ac, 0x42ac, 0x42ac, 0x42ac, 0x42ac, 0x42ac, + 0x42ab, 0x42ab, 0x42ab, 0x42ab, 0x42ab, 0x42ab, 0x42ab, 0x42ab, 0x42ab, + 0x42aa, 0x42aa, 0x42aa, 0x42aa, 0x42aa, 0x42aa, 0x42aa, 0x42aa, 0x42a9, + 0x42a9, 0x42a9, 0x42a9, 0x42a9, 0x42a9, 0x42a9, 0x42a9, 0x42a9, 0x42a7, + 0x42a7, 0x42a7, 0x42a7, 0x42a7, 0x42a7, 0x42a7, 0x42a7, 0x42a6, 0x42a6, + 0x42a6, 0x42a6, 0x42a6, 0x42a6, 0x42a6, 0x42a6, 0x42a5, 0x42a5, 0x42a5, + 0x42a5, 0x42a5, 0x42a5, 0x42a5, 0x42a4, 0x42a4, 0x42a4, 0x42a4, 0x42a4, + 0x42a4, 0x42a4, 0x42a4, 0x42a4, 0x42a3, 0x42a3, 0x42a3, 0x42a3, 0x42a3, + 0x42a3, 0x42a3, 0x42a2, 0x42a2, 0x42a2, 0x42a2, 0x42a2, 0x42a2, 0x42a2, + 0x42a2, 0x42a2, 0x42a1, 0x42a1, 0x42a1, 0x42a1, 0x42a1, 0x42a1, 0x42a1, + 0x42a0, 0x42a0, 0x42a0, 0x42a0, 0x42a0, 0x42a0, 0x42a0, 0x42a0, 0x429e, + 0x429e, 0x429e, 0x429e, 0x429e, 0x429e, 0x429d, 0x429d, 0x429d, 0x429d, + 0x429d, 0x429d, 0x429d, 0x429d, 0x429d, 0x429c, 0x429c, 0x429c, 0x429c, + 0x429c, 0x429b, 0x429b, 0x429b, 0x429b, 0x429b, 0x429b, 0x429b, 0x429b, + 0x429a, 0x429a, 0x429a, 0x429a, 0x429a, 0x429a, 0x4299, 0x4299, 0x4299, + 0x4299, 0x4299, 0x4299, 0x4299, 0x4299, 0x4298, 0x4298, 0x4298, 0x4298, + 0x4298, 0x4298, 0x4297, 0x4297, 0x4297, 0x4297, 0x4297, 0x4297, 0x4296, + 0x4296, 0x4296, 0x4296, 0x4296, 0x4295, 0x4295, 0x4295, 0x4295, 0x4295, + 0x4295, 0x4295, 0x4295, 0x4294, 0x4294, 0x4294, 0x4294, 0x4294, 0x4294, + 0x4293, 0x4293, 0x4293, 0x4293, 0x4293, 0x4293, 0x4292, 0x4292, 0x4292, + 0x4292, 0x4292, 0x4291, 0x4291, 0x4291, 0x4291, 0x4291, 0x4291, 0x4291, + 0x4291, 0x428f, 0x428f, 0x428f, 0x428e, 0x428e, 0x428e, 0x428e, 0x428e, + 0x428e, 0x428e, 0x428e, 0x428d, 0x428d, 0x428d, 0x428d, 0x428c, 0x428c, + 0x428c, 0x428c, 0x428c, 0x428c, 0x428c, 0x428c, 0x428b, 0x428b, 0x428b, + 0x428a, 0x428a, 0x428a, 0x428a, 0x428a, 0x428a, 0x428a, 0x4289, 0x4289, + 0x4289, 0x4288, 0x4288, 0x4288, 0x4288, 0x4288, 0x4288, 0x4287, 0x4287, + 0x4287, 0x4287, 0x4287, 0x4286, 0x4286, 0x4286, 0x4286, 0x4286, 0x4286, + 0x4286, 0x4286, 0x4285, 0x4285, 0x4285, 0x4285, 0x4285, 
0x4285, 0x4285, + 0x4285, 0x4285, 0x4284, 0x4284, 0x4284, 0x4284, 0x4284, 0x4283, 0x4283, + 0x4282, 0x4282, 0x4282, 0x4282, 0x4282, 0x4281, 0x4281, 0x4281, 0x4281, + 0x4281, 0x4281, 0x4281, 0x4280, 0x4280, 0x4280, 0x427e, 0x427e, 0x427e, + 0x427e, 0x427e, 0x427c, 0x427c, 0x427c, 0x427a, 0x427a, 0x427a, 0x427a, + 0x427a, 0x427a, 0x4278, 0x4278, 0x4278, 0x4277, 0x4277, 0x4277, 0x4277, + 0x4277, 0x4277, 0x4275, 0x4275, 0x4275, 0x4273, 0x4273, 0x4273, 0x4273, + 0x4273, 0x4271, 0x4271, 0x4271, 0x4270, 0x4270, 0x4270, 0x4270, 0x4270, + 0x4270, 0x4270, 0x426e, 0x426c, 0x426c, 0x426c, 0x426c, 0x426c, 0x426a, + 0x426a, 0x426a, 0x426a, 0x4269, 0x4269, 0x4269, 0x4269, 0x4269, 0x4267, + 0x4267, 0x4266, 0x4266, 0x4266, 0x4266, 0x4266, 0x4264, 0x4264, 0x4264, + 0x4262, 0x4262, 0x4262, 0x4262, 0x4261, 0x4261, 0x4261, 0x425f, 0x425f, + 0x425f, 0x425f, 0x425f, 0x425e, 0x425e, 0x425c, 0x425c, 0x425c, 0x425c, + 0x425c, 0x425b, 0x425b, 0x425b, 0x4259, 0x4259, 0x4259, 0x4259, 0x4257, + 0x4257, 0x4257, 0x4256, 0x4256, 0x4256, 0x4256, 0x4256, 0x4253, 0x4253, + 0x4253, 0x4253, 0x4253, 0x4253, 0x4253, 0x4250, 0x4250, 0x4250, 0x4250, + 0x4250, 0x424f, 0x424f, 0x424d, 0x424d, 0x424d, 0x424d, 0x424d, 0x424b, + 0x424b, 0x424b, 0x424b, 0x424b, 0x4249, 0x4249, 0x4249, 0x4248, 0x4248, + 0x4248, 0x4248, 0x4247, 0x4247, 0x4247, 0x4245, 0x4245, 0x4244, 0x4244, + 0x4244, 0x4243, 0x4243, 0x4241, 0x4241, 0x4241, 0x4240, 0x4240, 0x4240, + 0x4240, 0x4240, 0x423e, 0x423e, 0x423e, 0x423e, 0x423b, 0x423b, 0x423b, + 0x423b, 0x423b, 0x423a, 0x423a, 0x423a, 0x4239, 0x4239, 0x4237, 0x4237, + 0x4237, 0x4236, 0x4236, 0x4236, 0x4236, 0x4236, 0x4235, 0x4235, 0x4234, + 0x4234, 0x4232, 0x4232, 0x4232, 0x4232, 0x4232, 0x4231, 0x4231, 0x4231, + 0x422f, 0x422f, 0x422d, 0x422d, 0x422d, 0x422d, 0x422d, 0x422c, 0x422c, + 0x422c, 0x422a, 0x422a, 0x422a, 0x422a, 0x4228, 0x4228, 0x4228, 0x4228, + 0x4228, 0x4227, 0x4227, 0x4227, 0x4225, 0x4225, 0x4223, 0x4223, 0x4223, + 0x4223, 0x4223, 0x4223, 0x4223, 0x4221, 0x4220, 0x4220, 0x4220, 0x4220, + 0x421f, 0x421f, 0x421f, 0x421d, 0x421d, 0x421d, 0x421d, 0x421d, 0x421b, + 0x421b, 0x421b, 0x421b, 0x421b, 0x4219, 0x4219, 0x4218, 0x4218, 0x4218, + 0x4218, 0x4218, 0x4215, 0x4215, 0x4215, 0x4215, 0x4215, 0x4215, 0x4215, + 0x4213, 0x4213, 0x4213, 0x4212, 0x4212, 0x4211, 0x4211, 0x4211, 0x420f, + 0x420f, 0x420f, 0x420f, 0x420d, 0x420d, 0x420d, 0x420c, 0x420c, 0x420c, + 0x420c, 0x420c, 0x420a, 0x420a, 0x4209, 0x4209, 0x4209, 0x4209, 0x4209, + 0x4207, 0x4207, 0x4207, 0x4206, 0x4206, 0x4206, 0x4206, 0x4204, 0x4204, + 0x4204, 0x4202, 0x4202, 0x4202, 0x4202, 0x4202, 0x4201, 0x4201, 0x41fe, + 0x41fe, 0x41fe, 0x41fe, 0x41fe, 0x41fb, 0x41fb, 0x41fb, 0x41fb, 0x41f8, + 0x41f8, 0x41f8, 0x41f8, 0x41f8, 0x41f4, 0x41f1, 0x41f1, 0x41f1, 0x41f1, + 0x41f1, 0x41f1, 0x41f1, 0x41ed, 0x41ed, 0x41ed, 0x41ed, 0x41ed, 0x41ea, + 0x41ea, 0x41ea, 0x41e6, 0x41e6, 0x41e6, 0x41e3, 0x41e3, 0x41e3, 0x41e3, + 0x41e3, 0x41df, 0x41df, 0x41df, 0x41df, 0x41dc, 0x41dc, 0x41dc, 0x41dc, + 0x41dc, 0x41d8, 0x41d8, 0x41d8, 0x41d8, 0x41d5, 0x41d5, 0x41d5, 0x41d5, + 0x41d5, 0x41d1, 0x41d1, 0x41d1, 0x41cd, 0x41cd, 0x41cd, 0x41cd, 0x41cd, + 0x41cd, 0x41cd, 0x41c9, 0x41c9, 0x41c9, 0x41c6, 0x41c6, 0x41c6, 0x41c6, + 0x41c6, 0x41c6, 0x41c6, 0x41c2, 0x41c2, 0x41be, 0x41be, 0x41be, 0x41be, + 0x41be, 0x41be, 0x41ba, 0x41ba, 0x41ba, 0x41ba, 0x41ba, 0x41b6, 0x41b6, + 0x41b6, 0x41b6, 0x41b6, 0x41b6, 0x41b6, 0x41b2, 0x41b2, 0x41ae, 0x41ae, + 0x41ae, 0x41ae, 0x41ae, 0x41ae, 0x41ae, 0x41ae, 0x41aa, 0x41aa, 0x41aa, + 0x41aa, 0x41aa, 0x41a6, 0x41a6, 0x41a6, 0x41a6, 0x41a6, 
0x41a2, 0x41a2,
+    0x41a2, 0x41a2, 0x419e, 0x419e, 0x419e, 0x419e, 0x419e, 0x419e, 0x419a,
+    0x419a, 0x419a, 0x419a, 0x419a, 0x4196, 0x4196, 0x4196, 0x4196, 0x4196,
+    0x4196, 0x4196, 0x4196, 0x4196, 0x4192, 0x418e, 0x418e, 0x418e, 0x418e,
+    0x418e, 0x418e, 0x418e, 0x418e, 0x418e, 0x418a, 0x418a, 0x418a, 0x418a,
+    0x418a, 0x4186, 0x4186, 0x4186, 0x4186, 0x4186, 0x4186, 0x4186, 0x4181,
+    0x4181, 0x4181, 0x4181, 0x4181, 0x417a, 0x417a, 0x417a, 0x417a, 0x417a,
+    0x417a, 0x417a, 0x417a, 0x4172, 0x4172, 0x4172, 0x4172, 0x4172, 0x4169,
+    0x4169, 0x4169, 0x4169, 0x4169, 0x4169, 0x4161, 0x4161, 0x4161, 0x4161,
+    0x4161, 0x4161, 0x4158, 0x4158, 0x4158, 0x4158, 0x4158, 0x4158, 0x4158,
+    0x4158, 0x4158, 0x414f, 0x414f, 0x414f, 0x414f, 0x414f, 0x4147, 0x4147,
+    0x4147, 0x4147, 0x4147, 0x4147, 0x4147, 0x4147, 0x413e, 0x413e, 0x413e,
+    0x413e, 0x413e, 0x4135, 0x4135, 0x4135, 0x4135, 0x4135, 0x4135, 0x4135,
+    0x4135, 0x4135, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, 0x4123,
+    0x4123, 0x4123, 0x4123, 0x4123, 0x4123, 0x4123, 0x4123, 0x4123, 0x411a,
+    0x411a, 0x411a, 0x411a, 0x411a, 0x411a, 0x4111, 0x4111, 0x4111, 0x4111,
+    0x4111, 0x4111, 0x4111, 0x4111, 0x4108, 0x4108, 0x4108, 0x4108, 0x4108,
+    0x4108, 0x4108, 0x4108, 0x40ff, 0x40ff, 0x40ff, 0x40ff, 0x40ff, 0x40ff,
+    0x40ff, 0x40ff, 0x40ed, 0x40ed, 0x40ed, 0x40ed, 0x40ed, 0x40ed, 0x40ed,
+    0x40ed, 0x40db, 0x40db, 0x40db, 0x40db, 0x40db, 0x40db, 0x40db, 0x40db,
+    0x40c9, 0x40c9, 0x40c9, 0x40c9, 0x40c9, 0x40c9, 0x40c9, 0x40c9, 0x40b7,
+    0x40b7, 0x40b7, 0x40b7, 0x40b7, 0x40b7, 0x40b7, 0x40b7, 0x40b7, 0x40b7,
+    0x40a5, 0x40a5, 0x40a5, 0x40a5, 0x40a5, 0x40a5, 0x40a5, 0x4092, 0x4092,
+    0x4092, 0x4092, 0x4092, 0x4092, 0x4092, 0x4092, 0x4092, 0x4080, 0x4080,
+    0x4080, 0x4080, 0x4080, 0x4080, 0x4080, 0x4080, 0x4080, 0x405c, 0x405c,
+    0x405c, 0x405c, 0x405c, 0x405c, 0x405c, 0x405c, 0x405c, 0x4037, 0x4037,
+    0x4037, 0x4037, 0x4037, 0x4037, 0x4037, 0x4037, 0x4037, 0x4013, 0x4013,
+    0x4013, 0x4013, 0x4013, 0x4013, 0x4013, 0x4013, 0x4013, 0x4013, 0x3fdc,
+    0x3fdc, 0x3fdc, 0x3fdc, 0x3fdc, 0x3fdc, 0x3fdc, 0x3fdc, 0x3fdc, 0x3fdc,
+    0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f93,
+    0x3f13, 0x3f13, 0x3f13, 0x3f13, 0x3f13, 0x3f13, 0x3f13, 0x3f13, 0x3f13,
+    0x3f13, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+};
+
+static void _gen_input(u16 *input_data, u64 ifmap_size, int range_start,
+                       int range_end)
+{
+  // dist(range_start, range_end);
+
+  float LO = pow(2, range_start);
+  float HI = pow(2, range_end);
+  for (u64 i = 0; i < ifmap_size; i++) {
+    // input range is -8 ~ +8
+    int table_hw = 256;
+    float input = ((int)i % (range_end - 2)) * (((int)i % 2) ? 1 : -1) + 0.03 +
+                  (i % table_hw) * 0.002;
+    input = ((int)i % (range_end - 2)) * (((int)i % 2) ? 1 : 1) + 0.03 +
+            (i % table_hw) * 0.002;
+    input_data[i] = convert_fp32_bf16(input);
+    // input = dist(e2); // disabled: dist/e2 (a C++ <random> generator) are not set up here
+    input = LO + static_cast<float>(rand()) /
+                     (static_cast<float>(RAND_MAX / (HI - LO)));
+  }
+}
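+
+/*
+ * Illustrative scalar reference (an aid for the reader, not code the test
+ * calls): the TEST_MODE comments above are the usual atan2 quadrant rules,
+ * and this "fast degree" variant reports the angle in degrees, which is why
+ * golden_bf16[0] == 0x42b4 (90.0) for test_pattern[0] == 0 (x = 0, y > 0).
+ * atan2_degree_ref is a hypothetical helper name; it assumes atanf and M_PI
+ * from math.h.
+ */
+static inline float atan2_degree_ref(float y, float x)
+{
+  const float rad2deg = 180.0f / (float)M_PI;
+  if (x > 0)
+    return atanf(y / x) * rad2deg;          // atan(y/x), x > 0
+  if (x < 0 && y >= 0)
+    return atanf(y / x) * rad2deg + 180.0f; // atan(y/x) + PI, x < 0, y >= 0
+  if (x < 0)
+    return atanf(y / x) * rad2deg - 180.0f; // atan(y/x) - PI, x < 0, y < 0
+  if (y > 0)
+    return 90.0f;                           // pi / 2, x = 0, y > 0
+  if (y < 0)
+    return -90.0f;                          // -pi / 2, x = 0, y < 0
+  return 0.0f;                              // x = 0, y = 0: undefined, use 0
+}
+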
+static void gen_input(u16 *x, u16 *y, u64 ifmap_size, TEST_MODE mode,
+                      int range_start, int range_end)
+{
+
+  if (mode == PRE_DATA_COMPARE_FIX) {
+    memcpy(x, &test_pattern, sizeof(test_pattern));
+  } else {
+    range_start = abs(range_start);
+    range_end = abs(range_end);
+    _gen_input(x, ifmap_size, range_start, range_end);
+  }
+
+  // invert for test (y is x reversed)
+  for (u64 i = 0; i < ifmap_size; i++) {
+    y[i] = x[(ifmap_size - 1) - i];
+  }
+
+  if (mode == DATA_COMPARE_ACCURACY_X_GT_0) {
+    // y = any
+    u32 i = 0;
+    for (; i < ifmap_size / 4; i++) {
+      // y < 0; negate the bf16 value (decode, negate, re-encode)
+      y[i] = convert_fp32_bf16(-1 * convert_bf16_fp32(y[i]));
+      y[i + ifmap_size / 4] = convert_fp32_bf16(0);
+    }
+  } else if (mode == DATA_COMPARE_ACCURACY_X_LT_0_Y_GE_0) {
+    // x < 0 and y >= 0
+    for (u32 i = 0; i < ifmap_size; i++) {
+      x[i] = convert_fp32_bf16(-1 * convert_bf16_fp32(x[i]));
+    }
+
+    for (u32 i = 0; i < ifmap_size / 4; i++) {
+      y[i + ifmap_size / 4] = convert_fp32_bf16(0);
+    }
+  } else if (mode == DATA_COMPARE_ACCURACY_X_LT_0_Y_LT_0) {
+    // x < 0 and y < 0
+    for (u32 i = 0; i < ifmap_size; i++) {
+      x[i] = convert_fp32_bf16(-1 * convert_bf16_fp32(x[i]));
+      y[i] = convert_fp32_bf16(-1 * convert_bf16_fp32(y[i]));
+    }
+  } else if (mode == DATA_COMPARE_ACCURACY_X_0_Y_GT_0) {
+    // pi / 2, x = 0 and y > 0
+    for (u32 i = 0; i < ifmap_size; i++) {
+      x[i] = convert_fp32_bf16(0);
+    }
+  } else if (mode == DATA_COMPARE_ACCURACY_X_0_Y_LT_0) {
+    // -pi / 2, x = 0 and y < 0
+    for (u32 i = 0; i < ifmap_size; i++) {
+      x[i] = convert_fp32_bf16(0);
+      y[i] = convert_fp32_bf16(-1 * convert_bf16_fp32(y[i]));
+    }
+  }
+
+  if (mode != PRE_DATA_COMPARE_FIX) {
+    int i = 0;
+    x[i] = convert_fp32_bf16(-10.0);
+    y[i++] = convert_fp32_bf16(6.0);
+    y[i] = convert_fp32_bf16(1.000000);
+    x[i++] = convert_fp32_bf16(19.000000);
+    y[i] = convert_fp32_bf16(5.000000);
+    x[i++] = convert_fp32_bf16(-125.000000);
+    y[i] = convert_fp32_bf16(1.070312);
+    x[i++] = convert_fp32_bf16(0.498046);
+    y[i] = convert_fp32_bf16(0.000000);
+    x[i++] = convert_fp32_bf16(-8.000000);
+    x[i] = convert_fp32_bf16(424.000);
+    y[i++] = convert_fp32_bf16(-1.00);
+    x[i] = convert_fp32_bf16(2.484375);
+    y[i++] = convert_fp32_bf16(-7.531250);
+    x[i] = convert_fp32_bf16(-2.484375);
+    y[i++] = convert_fp32_bf16(-7.531250);
+    x[i] = convert_fp32_bf16(0);
+    y[i++] = convert_fp32_bf16(7.531250);
+    x[i] = convert_fp32_bf16(0);
+    y[i++] = convert_fp32_bf16(-7.531250);
+    x[i] = convert_fp32_bf16(0);
+    y[i++] = convert_fp32_bf16(0);
+    x[i] = convert_fp32_bf16(0);
+    y[i++] = convert_fp32_bf16(0.394531);
+    y[i] = convert_fp32_bf16(-4.000000);
+    x[i++] = convert_fp32_bf16(-64.000000);
+    y[i] = convert_fp32_bf16(0.000000);
+    x[i++] = convert_fp32_bf16(-4.000000);
+    y[i] = convert_fp32_bf16(0.000000);
+    x[i++] = convert_fp32_bf16(-40.000000);
+    y[i] = convert_fp32_bf16(1.000000);
+    x[i++] = convert_fp32_bf16(-53.000000);
+    y[i] = convert_fp32_bf16(-9.000000);
+    x[i++] = convert_fp32_bf16(-91.000000);
+    y[i] = convert_fp32_bf16(12.000000);
+    x[i++] = convert_fp32_bf16(-164.000000);
+    y[i] = convert_fp32_bf16(-20.000000);
+    x[i++] = convert_fp32_bf16(-320.000000);
+    y[i] = convert_fp32_bf16(1.000000);
+    x[i++] = convert_fp32_bf16(-71.000000);
+    y[i] = convert_fp32_bf16(1.000000);
+    x[i++] = convert_fp32_bf16(-155.000000);
+    y[i] = convert_fp32_bf16(1.000000);
+    x[i++] = convert_fp32_bf16(-247.000000);
+    y[i] =
convert_fp32_bf16(-2.000000); + x[i++] = convert_fp32_bf16(-118.000000); + y[i] = convert_fp32_bf16(-2.000000); + x[i++] = convert_fp32_bf16(-54.000000); + y[i] = convert_fp32_bf16(-5.000000); + x[i++] = convert_fp32_bf16(-392.000000); + y[i] = convert_fp32_bf16(-37.000000); + x[i++] = convert_fp32_bf16(-520.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-19.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-10.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-8.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-2.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-14.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-2.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-6.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-21.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-14.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-17.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-17.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-8.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-4.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-10.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-8.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-14.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-4.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-2.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-41.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-69.000000); + y[i] = convert_fp32_bf16(4.000000); + x[i++] = convert_fp32_bf16(-86.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-41.000000); + y[i] = convert_fp32_bf16(-2.000000); + x[i++] = convert_fp32_bf16(-34.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-6.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-41.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-136.000000); + y[i] = convert_fp32_bf16(-3.000000); + x[i++] = convert_fp32_bf16(-79.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-38.000000); + y[i] = convert_fp32_bf16(5.000000); + x[i++] = convert_fp32_bf16(-173.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-78.000000); + y[i] = convert_fp32_bf16(-2.000000); + x[i++] = convert_fp32_bf16(-60.000000); + y[i] = convert_fp32_bf16(3.000000); + x[i++] = convert_fp32_bf16(-123.000000); + y[i] = convert_fp32_bf16(-9.000000); + x[i++] = convert_fp32_bf16(-280.000000); + y[i] = convert_fp32_bf16(3.000000); + x[i++] = convert_fp32_bf16(-39.000000); + y[i] = convert_fp32_bf16(2.000000); + x[i++] = convert_fp32_bf16(-524.000000); + y[i] = convert_fp32_bf16(11.000000); + x[i++] = convert_fp32_bf16(-376.000000); + y[i] = convert_fp32_bf16(5.000000); + x[i++] = convert_fp32_bf16(-131.000000); + y[i] = convert_fp32_bf16(11.000000); + x[i++] = convert_fp32_bf16(-324.000000); + y[i] = convert_fp32_bf16(9.000000); + x[i++] = convert_fp32_bf16(-125.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-92.000000); + y[i] = 
convert_fp32_bf16(-7.000000);
+    x[i++] = convert_fp32_bf16(-233.000000);
+    y[i] = convert_fp32_bf16(10.000000);
+    x[i++] = convert_fp32_bf16(-170.000000);
+    y[i] = convert_fp32_bf16(0.000000);
+    x[i++] = convert_fp32_bf16(-4.000000);
+    y[i] = convert_fp32_bf16(0.000000);
+    x[i++] = convert_fp32_bf16(-4.000000);
+    y[i] = convert_fp32_bf16(0.000000);
+    x[i++] = convert_fp32_bf16(-10.000000);
+    y[i] = convert_fp32_bf16(-1.000000);
+    x[i++] = convert_fp32_bf16(-23.000000);
+    y[i] = convert_fp32_bf16(0.000000);
+    x[i++] = convert_fp32_bf16(-6.000000);
+    y[i] = convert_fp32_bf16(0.000000);
+    x[i++] = convert_fp32_bf16(-6.000000);
+    y[i] = convert_fp32_bf16(-3.000000);
+    x[i++] = convert_fp32_bf16(-37.000000);
+
+    y[i] = convert_fp32_bf16(-9);
+    x[i++] = convert_fp32_bf16(-1);
+
+    y[i] = convert_fp32_bf16(7.0);
+    x[i++] = convert_fp32_bf16(-1);
+
+    y[i] = convert_fp32_bf16(0);
+    x[i++] = convert_fp32_bf16(-1);
+  }
+
+#ifdef DBG
+  for (u64 i = 0; i < ifmap_size; i++) {
+    printf("source[%ld] y %f x %f\n", i, convert_bf16_fp32(y[i]),
+           convert_bf16_fp32(x[i]));
+  }
+#endif /* ifdef DBG */
+}
+
+static void testbench(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk)
+{
+  // TODO: check more shape / align
+  bmk1880v2_chip_info_t chip_info = bmk1880v2_chip_info();
+
+  u32 input_n = 1;
+  u32 input_c = chip_info.npu_num;
+  u32 input_h = 16;
+  u32 input_w = 16;
+  float epsilon = 0.2;
+  int range_start = -8;
+  int range_end = 8;
+
+  if (mode == PRE_DATA_COMPARE_FIX) {
+    input_h = 4;
+    input_w = 8;
+  }
+
+  fmt_t fmt = FMT_BF16;
+
+  tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w};
+  tl_shape_t ofmap_shape = ifmap_shape;
+
+  // get lut table shape and size
+  tl_shape_t table_shape;
+  u64 table_bytesize = bf16_lut_tbl_bytesize(bmk, &table_shape, fmt);
+
+  // get input / output size
+  u64 ifmap_size = tl_shape_size(&ifmap_shape);
+  u64 ofmap_size = tl_shape_size(&ofmap_shape);
+  int data_type_size = bytesize_of_fmt(fmt);
+  u64 ifmap_bytesize = ifmap_size * data_type_size;
+  u64 ofmap_bytesize = ofmap_size * data_type_size;
+
+  // atan2 has two inputs
+  tl_t *tl_ifmap = alloc_tl(bmk, ifmap_shape, fmt, /*align*/1);
+  tl_t *tl_ifmap2 = alloc_tl(bmk, ifmap_shape, fmt, /*align*/1);
+  tl_t *tl_ofmap_bf16 = alloc_tl(bmk, ofmap_shape, fmt, /*align*/1);
+  tl_t *out = tl_ofmap_bf16;
+
+  // atan buf
+  tl_t *tl_y0_buf = alloc_tl(bmk, table_shape, fmt, /*align*/1);
+  tl_t *tl_invert_buf = alloc_tl(bmk, table_shape, fmt, /*align*/1);
+  tl_t *tl_pos_neg_buf = alloc_tl(bmk, table_shape, fmt, /*align*/1);
+
+  // reciprocal buf
+  tl_t *tl_reciprocal_table_answer =
+      alloc_tl(bmk, table_shape, fmt, /*align*/1);
+  tl_t *tl_reciprocal_table_answer_mantissa =
+      alloc_tl(bmk, table_shape, fmt, /*align*/1);
+
+  // temp buf
+  tl_t *tl_buf = alloc_tl(bmk, ofmap_shape, fmt, /*align*/1);
+  tl_t *tl_buf2 = alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/1);
+  tl_t *tl_buf3 = alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/1);
+
+  u16 *input_data = (u16 *)xmalloc(ifmap_bytesize);
+  u16 *input_data2 = (u16 *)xmalloc(ifmap_bytesize);
+  u16 *ref_data = (u16 *)xmalloc(ofmap_bytesize);
+
+  // for reciprocal
+  u16 *table_reciprocal_data = (u16 *)xmalloc(table_bytesize);
+  u16 *table_reciprocal_data_mantissa = (u16 *)xmalloc(table_bytesize);
+
+  // for atan
+  u16 *table_data_atan_y0 = (u16 *)xmalloc(table_bytesize);
+  u16 *table_data_atan_invert = (u16 *)xmalloc(table_bytesize);
+  u16 *table_data_atan_pos_neg = (u16 *)xmalloc(table_bytesize);
+
+  // for search '0' index
+  u16 *idx_0_table_data = (u16 *)xmalloc(table_bytesize);
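+
+  /*
+   * From here the testbench follows the usual LUT-test flow: generate the
+   * host-side inputs and the reference result, build the lookup tables, copy
+   * everything system->local, emit the atan2 kernel, copy the result back,
+   * and compare against the reference within epsilon.
+   */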
+  // init input / ref
+  // input_data is x, input_data2 is y
+  gen_input(input_data, input_data2, ifmap_size, mode, range_start, range_end);
+  tl_lut_ref(ref_data, input_data, input_data2, ifmap_shape);
+
+  // init lut table
+  bf16_reciprocal_tbl(table_reciprocal_data, table_reciprocal_data_mantissa,
+                      &table_shape);
+  bf16_atan_fast_degree_tbl(table_data_atan_y0, table_data_atan_invert,
+                            table_data_atan_pos_neg, &table_shape);
+  bf16_gen_0_tbl(idx_0_table_data, &table_shape);
+
+  // sys->local
+  put_bf16_tensor_g2l(ctx, bmk, tl_ifmap, (u16 *)input_data, fmt);
+  put_bf16_tensor_g2l(ctx, bmk, tl_ifmap2, (u16 *)input_data2, fmt);
+  put_bf16_tensor_g2l(ctx, bmk, tl_reciprocal_table_answer,
+                      (u16 *)table_reciprocal_data, fmt);
+  put_bf16_tensor_g2l(ctx, bmk, tl_reciprocal_table_answer_mantissa,
+                      (u16 *)table_reciprocal_data_mantissa, fmt);
+
+  put_bf16_tensor_g2l(ctx, bmk, tl_y0_buf, (u16 *)table_data_atan_y0, fmt);
+  put_bf16_tensor_g2l(ctx, bmk, tl_invert_buf, (u16 *)table_data_atan_invert,
+                      fmt);
+  put_bf16_tensor_g2l(ctx, bmk, tl_pos_neg_buf, (u16 *)table_data_atan_pos_neg,
+                      fmt);
+
+  bf16_atan2_fast_degree_emit(
+      bmk, tl_ifmap2, tl_ifmap, tl_buf, tl_buf2, tl_buf3, tl_y0_buf,
+      tl_invert_buf, tl_pos_neg_buf, tl_reciprocal_table_answer,
+      tl_reciprocal_table_answer_mantissa, OUT tl_ofmap_bf16, fmt);
+
+  u16 *ofmap_data = (u16 *)get_bf16_tensor_l2g(ctx, bmk, out, out->fmt);
+  verify(ofmap_data, ref_data, input_data, input_data2, ifmap_size, epsilon);
+
+  free_tl(bmk, tl_buf3);
+  free_tl(bmk, tl_buf2);
+  free_tl(bmk, tl_buf);
+  free_tl(bmk, tl_reciprocal_table_answer_mantissa);
+  free_tl(bmk, tl_reciprocal_table_answer);
+  free_tl(bmk, tl_pos_neg_buf);
+  free_tl(bmk, tl_invert_buf);
+  free_tl(bmk, tl_y0_buf);
+  free_tl(bmk, tl_ofmap_bf16);
+  free_tl(bmk, tl_ifmap2);
+  free_tl(bmk, tl_ifmap);
+
+  free(input_data);
+  free(input_data2);
+  free(ref_data);
+  free(table_reciprocal_data);
+  free(table_reciprocal_data_mantissa);
+  free(table_data_atan_y0);
+  free(table_data_atan_invert);
+  free(table_data_atan_pos_neg);
+  free(idx_0_table_data);
+  free(ofmap_data);
+}
+
+int main()
+{
+  CVI_RT_HANDLE ctx;
+  bmk_ctx_t *bmk;
+  int round_mode;
+
+  round_mode = set_store_feround();
+
+  test_init(&ctx, &bmk);
+
+  // for (int i = PRE_DATA_COMPARE_FIX; i < TEST_MODE_MAX; i++)
+  // for (int i = PRE_DATA_COMPARE_FIX; i < DATA_COMPARE_ACCURACY; i++) {
+  // for (int i = DATA_COMPARE_ACCURACY; i < DATA_COMPARE_U8; i++) {
+  // for (int i = DATA_COMPARE_ACCURACY; i < DATA_COMPARE_ACCURACY_X_GT_0; i++)
+  // {
+  for (int i = PRE_DATA_COMPARE_FIX; i < DATA_COMPARE_U8; i++) {
+    mode = static_cast<TEST_MODE>(i);
+    printf("test mode %d...\n", mode);
+    testbench(&ctx, bmk);
+  }
+  printf("pass\n");
+
+  test_exit(&ctx);
+  restore_feround(round_mode);
+  return 0;
+}
diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_atan2_kernel.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_atan2_kernel.cpp
new file mode 100644
index 000000000..9dbc2684a
--- /dev/null
+++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_atan2_kernel.cpp
@@ -0,0 +1,772 @@
+/**
+ * \brief atan2 is implemented by atan; you can refer to
+ * [wiki](https://en.wikipedia.org/wiki/Atan2) for more details
+ */
+
+#include "../1880v2_test_util.h"
+#define OUT
+#define IN
+#include
+#include
+#include
+#include
+#include
+#include
+//#define DBG
+
+/**
+ * pre_data means we test a fixed pattern; it should be the same as the lut
+ */
+enum TEST_MODE {
+  PRE_DATA_COMPARE_FIX = 0, // pre-data + fix compare
+  DATA_COMPARE_ACCURACY, // generate \range_start to \range_end values and
+                         //
check epsilon, default set x > 0, y > 0 + + DATA_COMPARE_ACCURACY_X_GT_0, // atan(y/x), x > 0, y = 0 + DATA_COMPARE_ACCURACY_X_LT_0_Y_GE_0, // atan(y/x) + PI , x < 0 and y >= 0 + DATA_COMPARE_ACCURACY_X_LT_0_Y_LT_0, // atan(y/x) - PI , x < 0 and y < 0 + DATA_COMPARE_ACCURACY_X_0_Y_GT_0, // pi / 2, x = 0 and y > 0 + DATA_COMPARE_ACCURACY_X_0_Y_LT_0, // -pi / 2, x = 0 and y < 0 + DATA_COMPARE_U8, // generate \range_start to \range_end value that check + // epsilon, result bf16->u8 + TEST_MODE_MAX, +}; + + +static TEST_MODE mode; + +static u16 test_pattern[] = { + 0x0000, 0x38D2, 0x3952, 0x399D, 0x39D2, 0x3A03, 0x3A1D, 0x3A38, 0x3A52, + 0x3A6C, 0x3A83, 0x3A90, 0x3A9D, 0x3AAA, 0x3AB8, 0x3AC5, 0x3AD2, 0x3ADF, + 0x3AEC, 0x3AF9, 0x3B03, 0x3B0A, 0x3B10, 0x3B17, 0x3B1D, 0x3B24, 0x3B2A, + 0x3B31, 0x3B38, 0x3B3E, 0x3B45, 0x3B4B, 0x3B52, 0x3B58, 0x3B5F, 0x3B65, + 0x3B6C, 0x3B72, 0x3B79, 0x3B80, 0x3B83, 0x3B86, 0x3B8A, 0x3B8D, 0x3B90, + 0x3B93, 0x3B97, 0x3B9A, 0x3B9D, 0x3BA1, 0x3BA4, 0x3BA7, 0x3BAA, 0x3BAE, + 0x3BB1, 0x3BB4, 0x3BB8, 0x3BBB, 0x3BBE, 0x3BC1, 0x3BC5, 0x3BC8, 0x3BCB, + 0x3BCE, 0x3BD2, 0x3BD5, 0x3BD8, 0x3BDC, 0x3BDF, 0x3BE2, 0x3BE5, 0x3BE9, + 0x3BEC, 0x3BEF, 0x3BF2, 0x3BF6, 0x3BF9, 0x3BFC, 0x3C00, 0x3C01, 0x3C03, + 0x3C05, 0x3C06, 0x3C08, 0x3C0A, 0x3C0B, 0x3C0D, 0x3C0F, 0x3C10, 0x3C12, + 0x3C13, 0x3C15, 0x3C17, 0x3C18, 0x3C1A, 0x3C1C, 0x3C1D, 0x3C1F, 0x3C21, + 0x3C22, 0x3C24, 0x3C25, 0x3C27, 0x3C29, 0x3C2A, 0x3C2C, 0x3C2E, 0x3C2F, + 0x3C31, 0x3C33, 0x3C34, 0x3C36, 0x3C38, 0x3C39, 0x3C3B, 0x3C3C, 0x3C3E, + 0x3C40, 0x3C41, 0x3C43, 0x3C45, 0x3C46, 0x3C48, 0x3C4A, 0x3C4B, 0x3C4D, + 0x3C4E, 0x3C50, 0x3C52, 0x3C53, 0x3C55, 0x3C57, 0x3C58, 0x3C5A, 0x3C5C, + 0x3C5D, 0x3C5F, 0x3C60, 0x3C62, 0x3C64, 0x3C65, 0x3C67, 0x3C69, 0x3C6A, + 0x3C6C, 0x3C6E, 0x3C6F, 0x3C71, 0x3C72, 0x3C74, 0x3C76, 0x3C77, 0x3C79, + 0x3C7B, 0x3C7C, 0x3C7E, 0x3C80, 0x3C81, 0x3C81, 0x3C82, 0x3C83, 0x3C84, + 0x3C85, 0x3C86, 0x3C86, 0x3C87, 0x3C88, 0x3C89, 0x3C8A, 0x3C8A, 0x3C8B, + 0x3C8C, 0x3C8D, 0x3C8E, 0x3C8F, 0x3C8F, 0x3C90, 0x3C91, 0x3C92, 0x3C93, + 0x3C93, 0x3C94, 0x3C95, 0x3C96, 0x3C97, 0x3C98, 0x3C98, 0x3C99, 0x3C9A, + 0x3C9B, 0x3C9C, 0x3C9C, 0x3C9D, 0x3C9E, 0x3C9F, 0x3CA0, 0x3CA1, 0x3CA1, + 0x3CA2, 0x3CA3, 0x3CA4, 0x3CA5, 0x3CA5, 0x3CA6, 0x3CA7, 0x3CA8, 0x3CA9, + 0x3CAA, 0x3CAA, 0x3CAB, 0x3CAC, 0x3CAD, 0x3CAE, 0x3CAE, 0x3CAF, 0x3CB0, + 0x3CB1, 0x3CB2, 0x3CB3, 0x3CB3, 0x3CB4, 0x3CB5, 0x3CB6, 0x3CB7, 0x3CB8, + 0x3CB8, 0x3CB9, 0x3CBA, 0x3CBB, 0x3CBC, 0x3CBC, 0x3CBD, 0x3CBE, 0x3CBF, + 0x3CC0, 0x3CC1, 0x3CC1, 0x3CC2, 0x3CC3, 0x3CC4, 0x3CC5, 0x3CC5, 0x3CC6, + 0x3CC7, 0x3CC8, 0x3CC9, 0x3CCA, 0x3CCA, 0x3CCB, 0x3CCC, 0x3CCD, 0x3CCE, + 0x3CCE, 0x3CCF, 0x3CD0, 0x3CD1, 0x3CD2, 0x3CD3, 0x3CD3, 0x3CD4, 0x3CD5, + 0x3CD6, 0x3CD7, 0x3CD7, 0x3CD8, 0x3CD9, 0x3CDA, 0x3CDB, 0x3CDC, 0x3CDC, + 0x3CDD, 0x3CDE, 0x3CDF, 0x3CE0, 0x3CE0, 0x3CE1, 0x3CE2, 0x3CE3, 0x3CE4, + 0x3CE5, 0x3CE5, 0x3CE6, 0x3CE7, 0x3CE8, 0x3CE9, 0x3CE9, 0x3CEA, 0x3CEB, + 0x3CEC, 0x3CED, 0x3CEE, 0x3CEE, 0x3CEF, 0x3CF0, 0x3CF1, 0x3CF2, 0x3CF2, + 0x3CF3, 0x3CF4, 0x3CF5, 0x3CF6, 0x3CF7, 0x3CF7, 0x3CF8, 0x3CF9, 0x3CFA, + 0x3CFB, 0x3CFB, 0x3CFC, 0x3CFD, 0x3CFE, 0x3CFF, 0x3D00, 0x3D00, 0x3D01, + 0x3D01, 0x3D01, 0x3D02, 0x3D02, 0x3D03, 0x3D03, 0x3D03, 0x3D04, 0x3D04, + 0x3D05, 0x3D05, 0x3D06, 0x3D06, 0x3D06, 0x3D07, 0x3D07, 0x3D08, 0x3D08, + 0x3D08, 0x3D09, 0x3D09, 0x3D0A, 0x3D0A, 0x3D0A, 0x3D0B, 0x3D0B, 0x3D0C, + 0x3D0C, 0x3D0C, 0x3D0D, 0x3D0D, 0x3D0E, 0x3D0E, 0x3D0F, 0x3D0F, 0x3D0F, + 0x3D10, 0x3D10, 0x3D11, 0x3D11, 0x3D11, 0x3D12, 0x3D12, 0x3D13, 0x3D13, + 0x3D13, 0x3D14, 0x3D14, 
0x3D15, 0x3D15, 0x3D16, 0x3D16, 0x3D16, 0x3D17, + 0x3D17, 0x3D18, 0x3D18, 0x3D18, 0x3D19, 0x3D19, 0x3D1A, 0x3D1A, 0x3D1A, + 0x3D1B, 0x3D1B, 0x3D1C, 0x3D1C, 0x3D1C, 0x3D1D, 0x3D1D, 0x3D1E, 0x3D1E, + 0x3D1F, 0x3D1F, 0x3D1F, 0x3D20, 0x3D20, 0x3D21, 0x3D21, 0x3D21, 0x3D22, + 0x3D22, 0x3D23, 0x3D23, 0x3D23, 0x3D24, 0x3D24, 0x3D25, 0x3D25, 0x3D25, + 0x3D26, 0x3D26, 0x3D27, 0x3D27, 0x3D28, 0x3D28, 0x3D28, 0x3D29, 0x3D29, + 0x3D2A, 0x3D2A, 0x3D2A, 0x3D2B, 0x3D2B, 0x3D2C, 0x3D2C, 0x3D2C, 0x3D2D, + 0x3D2D, 0x3D2E, 0x3D2E, 0x3D2E, 0x3D2F, 0x3D2F, 0x3D30, 0x3D30, 0x3D31, + 0x3D31, 0x3D31, 0x3D32, 0x3D32, 0x3D33, 0x3D33, 0x3D33, 0x3D34, 0x3D34, + 0x3D35, 0x3D35, 0x3D35, 0x3D36, 0x3D36, 0x3D37, 0x3D37, 0x3D38, 0x3D38, + 0x3D38, 0x3D39, 0x3D39, 0x3D3A, 0x3D3A, 0x3D3A, 0x3D3B, 0x3D3B, 0x3D3C, + 0x3D3C, 0x3D3C, 0x3D3D, 0x3D3D, 0x3D3E, 0x3D3E, 0x3D3E, 0x3D3F, 0x3D3F, + 0x3D40, 0x3D40, 0x3D41, 0x3D41, 0x3D41, 0x3D42, 0x3D42, 0x3D43, 0x3D43, + 0x3D43, 0x3D44, 0x3D44, 0x3D45, 0x3D45, 0x3D45, 0x3D46, 0x3D46, 0x3D47, + 0x3D47, 0x3D47, 0x3D48, 0x3D48, 0x3D49, 0x3D49, 0x3D4A, 0x3D4A, 0x3D4A, + 0x3D4B, 0x3D4B, 0x3D4C, 0x3D4C, 0x3D4C, 0x3D4D, 0x3D4D, 0x3D4E, 0x3D4E, + 0x3D4E, 0x3D4F, 0x3D4F, 0x3D50, 0x3D50, 0x3D50, 0x3D51, 0x3D51, 0x3D52, + 0x3D52, 0x3D53, 0x3D53, 0x3D53, 0x3D54, 0x3D54, 0x3D55, 0x3D55, 0x3D55, + 0x3D56, 0x3D56, 0x3D57, 0x3D57, 0x3D57, 0x3D58, 0x3D58, 0x3D59, 0x3D59, + 0x3D59, 0x3D5A, 0x3D5A, 0x3D5B, 0x3D5B, 0x3D5C, 0x3D5C, 0x3D5C, 0x3D5D, + 0x3D5D, 0x3D5E, 0x3D5E, 0x3D5E, 0x3D5F, 0x3D5F, 0x3D60, 0x3D60, 0x3D60, + 0x3D61, 0x3D61, 0x3D62, 0x3D62, 0x3D63, 0x3D63, 0x3D63, 0x3D64, 0x3D64, + 0x3D65, 0x3D65, 0x3D65, 0x3D66, 0x3D66, 0x3D67, 0x3D67, 0x3D67, 0x3D68, + 0x3D68, 0x3D69, 0x3D69, 0x3D69, 0x3D6A, 0x3D6A, 0x3D6B, 0x3D6B, 0x3D6C, + 0x3D6C, 0x3D6C, 0x3D6D, 0x3D6D, 0x3D6E, 0x3D6E, 0x3D6E, 0x3D6F, 0x3D6F, + 0x3D70, 0x3D70, 0x3D70, 0x3D71, 0x3D71, 0x3D72, 0x3D72, 0x3D72, 0x3D73, + 0x3D73, 0x3D74, 0x3D74, 0x3D75, 0x3D75, 0x3D75, 0x3D76, 0x3D76, 0x3D77, + 0x3D77, 0x3D77, 0x3D78, 0x3D78, 0x3D79, 0x3D79, 0x3D79, 0x3D7A, 0x3D7A, + 0x3D7B, 0x3D7B, 0x3D7B, 0x3D7C, 0x3D7C, 0x3D7D, 0x3D7D, 0x3D7E, 0x3D7E, + 0x3D7E, 0x3D7F, 0x3D7F, 0x3D80, 0x3D80, 0x3D80, 0x3D80, 0x3D81, 0x3D81, + 0x3D81, 0x3D81, 0x3D81, 0x3D82, 0x3D82, 0x3D82, 0x3D82, 0x3D82, 0x3D83, + 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D84, 0x3D84, 0x3D84, 0x3D84, 0x3D85, + 0x3D85, 0x3D85, 0x3D85, 0x3D85, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D86, + 0x3D87, 0x3D87, 0x3D87, 0x3D87, 0x3D87, 0x3D88, 0x3D88, 0x3D88, 0x3D88, + 0x3D88, 0x3D89, 0x3D89, 0x3D89, 0x3D89, 0x3D89, 0x3D8A, 0x3D8A, 0x3D8A, + 0x3D8A, 0x3D8A, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8C, 0x3D8C, + 0x3D8C, 0x3D8C, 0x3D8C, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8E, 0x3D8E, + 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D90, + 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D91, 0x3D91, 0x3D91, 0x3D91, 0x3D91, + 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D93, 0x3D93, 0x3D93, 0x3D93, + 0x3D93, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D95, 0x3D95, 0x3D95, + 0x3D95, 0x3D96, 0x3D96, 0x3D96, 0x3D96, 0x3D96, 0x3D97, 0x3D97, 0x3D97, + 0x3D97, 0x3D97, 0x3D98, 0x3D98, 0x3D98, 0x3D98, 0x3D98, 0x3D99, 0x3D99, + 0x3D99, 0x3D99, 0x3D99, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9B, + 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, + 0x3D9D, 0x3D9D, 0x3D9D, 0x3D9D, 0x3D9D, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9E, + 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA0, + 0x3DA0, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA2, 0x3DA2, 0x3DA2, + 0x3DA2, 0x3DA2, 0x3DA3, 
0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA4, 0x3DA4, + 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA6, + 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA7, 0x3DA7, 0x3DA7, 0x3DA7, 0x3DA7, 0x3DA8, + 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, + 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, + 0x3DAB, 0x3DAC, 0x3DAC, 0x3DAC, 0x3DAC, 0x3DAC, 0x3DAD, 0x3DAD, 0x3DAD, + 0x3DAD, 0x3DAD, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAF, 0x3DAF, + 0x3DAF, 0x3DAF, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB1, 0x3DB1, + 0x3DB1, 0x3DB1, 0x3DB1, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB3, + 0x3DB3, 0x3DB3, 0x3DB3, 0x3DB3, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, + 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB6, + 0x3DB6, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB8, 0x3DB8, 0x3DB8, 0x3DB8, + 0x3DB8, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DBA, 0x3DBA, 0x3DBA, + 0x3DBA, 0x3DBA, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBC, 0x3DBC, + 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBD, 0x3DBD, 0x3DBD, 0x3DBD, 0x3DBD, 0x3DBE, + 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, + 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, + 0x3DC2, 0x3DC2, 0x3DC2, 0x3DC2, 0x3DC2, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, + 0x3DC3, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC5, 0x3DC5, 0x3DC5, + 0x3DC5, 0x3DC5, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC7, 0x3DC7, + 0x3DC7, 0x3DC7, 0x3DC7, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC9, + 0x3DC9, 0x3DC9, 0x3DC9, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCB, + 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCC, 0x3DCC, 0x3DCC, 0x3DCC, 0x3DCC, + 0x3DCD, 0x3DCE, 0x3DCF, 0x3DD0, 0x3DD1, 0x3DD2, 0x3DD3, 0x3DD4, 0x3DD5, + 0x3DD6, 0x3DD7, 0x3DD8, 0x3DD9, 0x3DDA, 0x3DDB, 0x3DDC, 0x3DDD, 0x3DDE, + 0x3DDF, 0x3DE0, 0x3DE1, 0x3DE2, 0x3DE3, 0x3DE4, 0x3DE5, +}; + +static u16 golden_bf16[] = { + + 0x3fc9, 0x3fc9, 0x3fc9, 0x3fc9, 0x3fc9, 0x3fc8, 0x3fc9, 0x3fc8, 0x3fc8, + 0x3fc8, 0x3fc8, 0x3fc8, 0x3fc8, 0x3fc8, 0x3fc8, 0x3fc7, 0x3fc8, 0x3fc7, + 0x3fc7, 0x3fc7, 0x3fc7, 0x3fc7, 0x3fc7, 0x3fc7, 0x3fc6, 0x3fc6, 0x3fc6, + 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc4, 0x3fc5, + 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc3, 0x3fc3, + 0x3fc4, 0x3fc3, 0x3fc3, 0x3fc3, 0x3fc3, 0x3fc3, 0x3fc3, 0x3fc3, 0x3fc2, + 0x3fc2, 0x3fc1, 0x3fc1, 0x3fc1, 0x3fc1, 0x3fc1, 0x3fc1, 0x3fc1, 0x3fc0, + 0x3fc1, 0x3fc0, 0x3fc0, 0x3fc0, 0x3fc0, 0x3fc0, 0x3fc0, 0x3fc0, 0x3fbf, + 0x3fc0, 0x3fbf, 0x3fbf, 0x3fbf, 0x3fbf, 0x3fbf, 0x3fc9, 0x3fbe, 0x3fbe, + 0x3fbe, 0x3fbe, 0x3fbe, 0x3fbe, 0x3fbe, 0x3fbe, 0x3fbe, 0x3fbd, 0x3fbd, + 0x3fbc, 0x3fbc, 0x3fbc, 0x3fbc, 0x3fbc, 0x3fbc, 0x3fbc, 0x3fbb, 0x3fbc, + 0x3fbb, 0x3fbb, 0x3fbb, 0x3fbb, 0x3fbb, 0x3fbb, 0x3fba, 0x3fbb, 0x3fba, + 0x3fba, 0x3fba, 0x3fba, 0x3fba, 0x3fba, 0x3fb9, 0x3fba, 0x3fb9, 0x3fb9, + 0x3fb9, 0x3fb9, 0x3fb9, 0x3fb9, 0x3fb8, 0x3fb8, 0x3fb7, 0x3fb7, 0x3fb7, + 0x3fb7, 0x3fb7, 0x3fb7, 0x3fb6, 0x3fb7, 0x3fb6, 0x3fb6, 0x3fb6, 0x3fb6, + 0x3fb6, 0x3fb6, 0x3fb6, 0x3fb5, 0x3fb5, 0x3fb5, 0x3fb5, 0x3fb5, 0x3fb5, + 0x3fb4, 0x3fb4, 0x3fb4, 0x3fb4, 0x3fb4, 0x3fb4, 0x3fb4, 0x3fb4, 0x3fb4, + 0x3fb3, 0x3fb2, 0x3fb2, 0x3fb2, 0x3fb2, 0x3fb2, 0x3fb2, 0x3fb1, 0x3fb1, + 0x3fb1, 0x3fb1, 0x3fb1, 0x3fb1, 0x3fb1, 0x3fb0, 0x3fb0, 0x3fb0, 0x3fb0, + 0x3fb0, 0x3fb0, 0x3faf, 0x3faf, 0x3faf, 0x3faf, 0x3faf, 0x3faf, 0x3faf, + 0x3faf, 0x3fae, 0x3fae, 0x3fad, 0x3fad, 0x3fad, 0x3fad, 0x3fad, 0x3fac, + 0x3fac, 0x3fac, 0x3fac, 0x3fac, 0x3fac, 0x3fac, 0x3fac, 0x3fab, 0x3fab, + 
0x3fab, 0x3fab, 0x3fab, 0x3faa, 0x3faa, 0x3faa, 0x3faa, 0x3faa, 0x3faa, + 0x3faa, 0x3faa, 0x3fa9, 0x3fa9, 0x3fa9, 0x3fa9, 0x3fa9, 0x3fa9, 0x3fa7, + 0x3fa7, 0x3fa7, 0x3fa7, 0x3fa7, 0x3fa7, 0x3fa7, 0x3fa6, 0x3fa6, 0x3fa6, + 0x3fa6, 0x3fa5, 0x3fa5, 0x3fa5, 0x3fa5, 0x3fa5, 0x3fa5, 0x3fa5, 0x3fa5, + 0x3fa4, 0x3fa4, 0x3fa4, 0x3fa4, 0x3fa4, 0x3fa3, 0x3fa3, 0x3fa3, 0x3fa3, + 0x3fa3, 0x3fa3, 0x3fa1, 0x3fa1, 0x3fa1, 0x3fa1, 0x3fa1, 0x3fa1, 0x3fa0, + 0x3fa0, 0x3fa0, 0x3fa0, 0x3fa0, 0x3f9f, 0x3f9f, 0x3f9f, 0x3f9f, 0x3f9f, + 0x3f9f, 0x3f9e, 0x3f9e, 0x3f9e, 0x3f9e, 0x3f9e, 0x3f9e, 0x3f9e, 0x3f9d, + 0x3f9d, 0x3f9d, 0x3f9d, 0x3f9d, 0x3f9c, 0x3f9c, 0x3f9c, 0x3f9c, 0x3f9c, + 0x3f9b, 0x3f9b, 0x3f9b, 0x3f9b, 0x3f9b, 0x3f9a, 0x3f9a, 0x3f99, 0x3f99, + 0x3f99, 0x3f99, 0x3f99, 0x3f99, 0x3f98, 0x3f98, 0x3f98, 0x3f97, 0x3f97, + 0x3f97, 0x3f97, 0x3f97, 0x3f96, 0x3f96, 0x3f96, 0x3f96, 0x3f96, 0x3f95, + 0x3f95, 0x3f95, 0x3f95, 0x3f95, 0x3f94, 0x3f94, 0x3fc9, 0x3fc9, 0x3f93, + 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f92, 0x3f92, + 0x3f91, 0x3f91, 0x3f91, 0x3f91, 0x3f91, 0x3f91, 0x3f91, 0x3f90, 0x3f90, + 0x3f90, 0x3f90, 0x3f90, 0x3f8f, 0x3f8f, 0x3f8f, 0x3f8e, 0x3f8e, 0x3f8e, + 0x3f8e, 0x3f8e, 0x3f8d, 0x3f8d, 0x3f8d, 0x3f8c, 0x3f8c, 0x3f8c, 0x3f8c, + 0x3f8c, 0x3f8c, 0x3f8b, 0x3f8b, 0x3f8b, 0x3f8a, 0x3f8a, 0x3f8a, 0x3f8a, + 0x3f8a, 0x3f8a, 0x3f89, 0x3f89, 0x3f89, 0x3f89, 0x3f89, 0x3f88, 0x3f88, + 0x3f88, 0x3f87, 0x3f87, 0x3f87, 0x3f86, 0x3f86, 0x3f86, 0x3f86, 0x3f86, + 0x3f85, 0x3f85, 0x3f85, 0x3f85, 0x3f85, 0x3f84, 0x3f84, 0x3f83, 0x3f83, + 0x3f83, 0x3f83, 0x3f83, 0x3f82, 0x3f82, 0x3f82, 0x3f82, 0x3f82, 0x3f81, + 0x3f81, 0x3f80, 0x3f80, 0x3f80, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7e, 0x3f7e, 0x3f7c, 0x3f7c, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7a, 0x3f7a, + 0x3f79, 0x3f79, 0x3f79, 0x3f78, 0x3f78, 0x3f77, 0x3f77, 0x3f77, 0x3f75, + 0x3f75, 0x3f74, 0x3f74, 0x3f74, 0x3f72, 0x3f72, 0x3f72, 0x3f72, 0x3f71, + 0x3f71, 0x3f71, 0x3f6f, 0x3f6f, 0x3f6e, 0x3f6e, 0x3f6e, 0x3f6d, 0x3f6d, + 0x3f6c, 0x3f6c, 0x3f6c, 0x3f6b, 0x3f6b, 0x3f69, 0x3f69, 0x3f69, 0x3f68, + 0x3f68, 0x3f67, 0x3f67, 0x3f65, 0x3f65, 0x3f65, 0x3f64, 0x3f64, 0x3f63, + 0x3f63, 0x3f63, 0x3f61, 0x3f61, 0x3f61, 0x3f61, 0x3f61, 0x3f60, 0x3f60, + 0x3f5e, 0x3f5e, 0x3f5e, 0x3f5e, 0x3f5e, 0x3f5c, 0x3f5c, 0x3f5b, 0x3f5b, + 0x3f5b, 0x3f59, 0x3f59, 0x3f58, 0x3f58, 0x3f58, 0x3f57, 0x3f57, 0x3f57, + 0x3f57, 0x3f57, 0x3f55, 0x3f55, 0x3f54, 0x3f54, 0x3f52, 0x3f52, 0x3f52, + 0x3f51, 0x3f51, 0x3f50, 0x3f50, 0x3f50, 0x3f4e, 0x3f4e, 0x3f4d, 0x3f4d, + 0x3f4d, 0x3f4c, 0x3f4c, 0x3f4b, 0x3f4b, 0x3f4b, 0x3f4a, 0x3f4a, 0x3f49, + 0x3f49, 0x3f47, 0x3f47, 0x3f47, 0x3f46, 0x3f46, 0x3f45, 0x3f45, 0x3f45, + 0x3f44, 0x3f44, 0x3f42, 0x3f42, 0x3f42, 0x3f41, 0x3f41, 0x3f40, 0x3f40, + 0x3f40, 0x3f3e, 0x3f3e, 0x3f3d, 0x3f3d, 0x3f3d, 0x3f3d, 0x3f3d, 0x3f3b, + 0x3f3b, 0x3f3a, 0x3f3a, 0x3f3a, 0x3f39, 0x3f39, 0x3f37, 0x3f37, 0x3f37, + 0x3f36, 0x3f36, 0x3f35, 0x3f35, 0x3f34, 0x3f34, 0x3f34, 0x3f33, 0x3f33, + 0x3f31, 0x3f31, 0x3f31, 0x3f30, 0x3f30, 0x3f2f, 0x3f2f, 0x3f2f, 0x3f2e, + 0x3f2e, 0x3f2d, 0x3f2d, 0x3f2d, 0x3f2b, 0x3f2b, 0x3f2a, 0x3f2a, 0x3f2a, + 0x3f2a, 0x3f2a, 0x3f27, 0x3f27, 0x3f26, 0x3f26, 0x3f26, 0x3f25, 0x3f25, + 0x3f25, 0x3f25, 0x3f25, 0x3f23, 0x3f23, 0x3f21, 0x3f21, 0x3f21, 0x3f21, + 0x3f21, 0x3f1f, 0x3f1f, 0x3f1e, 0x3f1e, 0x3f1e, 0x3f1d, 0x3f1d, 0x3f1c, + 0x3f1c, 0x3f1c, 0x3f1b, 0x3f1b, 0x3f1a, 0x3f1a, 0x3f1a, 0x3f18, 0x3f18, + 0x3f17, 0x3f17, 0x3f17, 0x3f16, 0x3f16, 0x3f15, 0x3f15, 0x3f14, 0x3f14, + 0x3f14, 0x3f13, 0x3f13, 0x3f11, 0x3f11, 0x3f11, 0x3f11, 0x3f10, 0x3f0f, + 
0x3f0f, 0x3f0f, 0x3f0e, 0x3f0d, 0x3f0d, 0x3f0d, 0x3f0d, 0x3f0c, 0x3f0a,
+    0x3f0a, 0x3f0a, 0x3f09, 0x3f09, 0x3f08, 0x3f08, 0x3f08, 0x3f07, 0x3f07,
+    0x3f07, 0x3f06, 0x3f06, 0x3f05, 0x3f04, 0x3f04, 0x3f04, 0x3f04, 0x3f03,
+    0x3f02, 0x3f02, 0x3f01, 0x3f01, 0x3f00, 0x3eff, 0x3efe, 0x3efe, 0x3efe,
+    0x3efc, 0x3efa, 0x3ef9, 0x3ef9, 0x3ef9, 0x3ef7, 0x3ef6, 0x3ef4, 0x3ef4,
+    0x3ef4, 0x3ef2, 0x3ef2, 0x3ef0, 0x3ef0, 0x3eef, 0x3eef, 0x3eed, 0x3eeb,
+    0x3eeb, 0x3eea, 0x3eea, 0x3ee8, 0x3ee7, 0x3ee7, 0x3ee5, 0x3ee4, 0x3ee4,
+    0x3ee4, 0x3ee4, 0x3ee2, 0x3ee0, 0x3ee0, 0x3edf, 0x3edf, 0x3edd, 0x3edd,
+    0x3edb, 0x3edb, 0x3edb, 0x3eda, 0x3ed9, 0x3ed6, 0x3ed6, 0x3ed6, 0x3ed5,
+    0x3ed3, 0x3ed2, 0x3ed2, 0x3ed1, 0x3ed0, 0x3ecf, 0x3ecf, 0x3ecf, 0x3ece,
+    0x3ecd, 0x3eca, 0x3eca, 0x3ec9, 0x3ec9, 0x3ec8, 0x3ec6, 0x3ec6, 0x3ec6,
+    0x3ec5, 0x3ec2, 0x3ec2, 0x3ec1, 0x3ec1, 0x3ec1, 0x3ebe, 0x3ebd, 0x3ebd,
+    0x3ebd, 0x3ebd, 0x3ebb, 0x3eba, 0x3eba, 0x3eb9, 0x3eb9, 0x3eb6, 0x3eb5,
+    0x3eb5, 0x3eb4, 0x3eb3, 0x3eb2, 0x3eb2, 0x3eb2, 0x3eb1, 0x3eb0, 0x3eae,
+    0x3eae, 0x3ead, 0x3eac, 0x3eab, 0x3eaa, 0x3eaa, 0x3eaa, 0x3ea9, 0x3ea8,
+    0x3ea6, 0x3ea6, 0x3ea5, 0x3ea5, 0x3ea4, 0x3ea2, 0x3ea2, 0x3ea2, 0x3ea1,
+    0x3e9f, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9d, 0x3e9b, 0x3e9a, 0x3e9a, 0x3e99,
+    0x3e99, 0x3e97, 0x3e96, 0x3e96, 0x3e95, 0x3e95, 0x3e94, 0x3e93, 0x3e92,
+    0x3e91, 0x3e91, 0x3e91, 0x3e90, 0x3e8e, 0x3e8e, 0x3e8d, 0x3e8d, 0x3e8c,
+    0x3e8b, 0x3e8a, 0x3e8a, 0x3e88, 0x3e88, 0x3e87, 0x3e87, 0x3e86, 0x3e84,
+    0x3e83, 0x3e83, 0x3e82, 0x3e81, 0x3e80, 0x3e7f, 0x3e7f, 0x3e7d, 0x3e7b,
+    0x3e79, 0x3e78, 0x3e76, 0x3e76, 0x3e74, 0x3e72, 0x3e71, 0x3e6f, 0x3e6f,
+    0x3e6c, 0x3e6c, 0x3e69, 0x3e68, 0x3e67, 0x3e66, 0x3e64, 0x3e63, 0x3e62,
+    0x3e61, 0x3e60, 0x3e5e, 0x3e5d, 0x3e5b, 0x3e5a, 0x3e56, 0x3e56, 0x3e55,
+    0x3e53, 0x3e52, 0x3e4f, 0x3e4f, 0x3e4e, 0x3e4c, 0x3e4b, 0x3e48, 0x3e47,
+    0x3e47, 0x3e45, 0x3e43, 0x3e41, 0x3e40, 0x3e40, 0x3e3e, 0x3e3d, 0x3e3a,
+    0x3e3a, 0x3e38, 0x3e38, 0x3e36, 0x3e35, 0x3e33, 0x3e32, 0x3e31, 0x3e2f,
+    0x3e2e, 0x3e2c, 0x3e2b, 0x3e2a, 0x3e28, 0x3e27, 0x3e26, 0x3e24, 0x3e24,
+    0x3e21, 0x3e20, 0x3e1f, 0x3e1e, 0x3e1c, 0x3e1b, 0x3e19, 0x3e19, 0x3e17,
+    0x3e16, 0x3e14, 0x3e14, 0x3e13, 0x3e12, 0x3e10, 0x3e0e, 0x3e0e, 0x3e0c,
+    0x3e0b, 0x3e09, 0x3e08, 0x3e07, 0x3e06, 0x3e04, 0x3e03, 0x3e02, 0x3e00,
+    0x3e00, 0x3dfd, 0x3dfb, 0x3df8, 0x3df5, 0x3df4, 0x3df1, 0x3ded, 0x3dec,
+    0x3de9, 0x3de6, 0x3de5, 0x3de1, 0x3dde, 0x3ddd, 0x3dda, 0x3dd9, 0x3dd5,
+    0x3dd2, 0x3dd1, 0x3dce, 0x3dcb, 0x3dca, 0x3dc7, 0x3dc5, 0x3dc3, 0x3dc1,
+    0x3dbf, 0x3dbc, 0x3dba, 0x3db8, 0x3db5, 0x3db2, 0x3db1, 0x3dad, 0x3daa,
+    0x3da8, 0x3da5, 0x3da3, 0x3da1, 0x3d9f, 0x3d9c, 0x3d9a, 0x3d99, 0x3d96,
+    0x3d94, 0x3d91, 0x3d90, 0x3d8d, 0x3d8b, 0x3d89, 0x3d86, 0x3d84, 0x3d82,
+    0x3d7e, 0x3d79, 0x3d74, 0x3d70, 0x3d6c, 0x3d67, 0x3d63, 0x3d60, 0x3d5b,
+    0x3d57, 0x3d53, 0x3d50, 0x3d49, 0x3d45, 0x3d41, 0x3d3c, 0x3d38, 0x3d34,
+    0x3d30, 0x3d2b, 0x3d27, 0x3d22, 0x3d1e, 0x3d19, 0x3d15, 0x3d11, 0x3d0c,
+    0x3d08, 0x3d04, 0x3cff, 0x3cf8, 0x3cef, 0x3ce7, 0x3cdf, 0x3cd6, 0x3cce,
+    0x3cc4, 0x3cbc, 0x3cb2, 0x3caa, 0x3ca1, 0x3c98, 0x3c8f, 0x3c87, 0x3c7e,
+    0x3c6b, 0x3c5a, 0x3c4a, 0x3c39, 0x3c29, 0x3c1a, 0x3c09, 0x3bf4, 0x3bd6,
+    0x3bb4, 0x3b95, 0x3b71, 0x3b32, 0x3aec, 0x3a6c, 0x0,
+};
+
+static void _gen_input(u16 *input_data, u64 ifmap_size, int range_start,
+                       int range_end)
+{
+  // dist(range_start, range_end);
+
+  float LO = pow(2, range_start);
+  float HI = pow(2, range_end);
+  for (u64 i = 0; i < ifmap_size; i++) {
+    // input range is -8 ~ +8
+    int table_hw = 256;
+    float input = ((int)i % (range_end - 2)) * (((int)i % 2) ? 1 : -1) + 0.03 +
+                  (i % table_hw) * 0.002;
+    input = ((int)i % (range_end - 2)) * (((int)i % 2) ? 1 : 1) + 0.03 +
+            (i % table_hw) * 0.002;
+    input_data[i] = convert_fp32_bf16(input);
+    // input = dist(e2); // disabled: dist/e2 (a C++ <random> generator) are not set up here
+    input = LO + static_cast<float>(rand()) /
+                     (static_cast<float>(RAND_MAX / (HI - LO)));
+  }
+}
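+
+/*
+ * Note for the reader: unlike the "fast degree" variant, golden_bf16 here is
+ * in radians; its first entry 0x3fc9 is the bf16 encoding of ~1.57 (pi / 2),
+ * matching test_pattern[0] == 0, i.e. the x = 0, y > 0 case.
+ */
+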
+static void gen_input(u16 *x, u16 *y, u64 ifmap_size, TEST_MODE mode,
+                      int range_start, int range_end)
+{
+
+  if (mode == PRE_DATA_COMPARE_FIX) {
+    memcpy(x, &test_pattern, sizeof(test_pattern));
+  } else {
+    range_start = abs(range_start);
+    range_end = abs(range_end);
+    _gen_input(x, ifmap_size, range_start, range_end);
+  }
+
+  // invert for test (y is x reversed)
+  for (u64 i = 0; i < ifmap_size; i++) {
+    y[i] = x[(ifmap_size - 1) - i];
+  }
+
+  if (mode == DATA_COMPARE_ACCURACY_X_GT_0) {
+    // y = any
+    u32 i = 0;
+    for (; i < ifmap_size / 4; i++) {
+      // y < 0; negate the bf16 value (decode, negate, re-encode)
+      y[i] = convert_fp32_bf16(-1 * convert_bf16_fp32(y[i]));
+      y[i + ifmap_size / 4] = convert_fp32_bf16(0);
+    }
+  } else if (mode == DATA_COMPARE_ACCURACY_X_LT_0_Y_GE_0) {
+    // x < 0 and y >= 0
+    for (u32 i = 0; i < ifmap_size; i++) {
+      x[i] = convert_fp32_bf16(-1 * convert_bf16_fp32(x[i]));
+    }
+
+    for (u32 i = 0; i < ifmap_size / 4; i++) {
+      y[i + ifmap_size / 4] = convert_fp32_bf16(0);
+    }
+  } else if (mode == DATA_COMPARE_ACCURACY_X_LT_0_Y_LT_0) {
+    // x < 0 and y < 0
+    for (u32 i = 0; i < ifmap_size; i++) {
+      x[i] = convert_fp32_bf16(-1 * convert_bf16_fp32(x[i]));
+      y[i] = convert_fp32_bf16(-1 * convert_bf16_fp32(y[i]));
+    }
+  } else if (mode == DATA_COMPARE_ACCURACY_X_0_Y_GT_0) {
+    // pi / 2, x = 0 and y > 0
+    for (u32 i = 0; i < ifmap_size; i++) {
+      x[i] = convert_fp32_bf16(0);
+    }
+  } else if (mode == DATA_COMPARE_ACCURACY_X_0_Y_LT_0) {
+    // -pi / 2, x = 0 and y < 0
+    for (u32 i = 0; i < ifmap_size; i++) {
+      x[i] = convert_fp32_bf16(0);
+      y[i] = convert_fp32_bf16(-1 * convert_bf16_fp32(y[i]));
+    }
+  }
+
+  if (mode != PRE_DATA_COMPARE_FIX) {
+    int i = 0;
+    y[i] = convert_fp32_bf16(5.000000);
+    x[i++] = convert_fp32_bf16(-125.000000);
+    y[i] = convert_fp32_bf16(1.070312);
+    x[i++] = convert_fp32_bf16(0.498046);
+    y[i] = convert_fp32_bf16(0.000000);
+    x[i++] = convert_fp32_bf16(-8.000000);
+    x[i] = convert_fp32_bf16(424.000);
+    y[i++] = convert_fp32_bf16(-1.00);
+    x[i] = convert_fp32_bf16(2.484375);
+    y[i++] = convert_fp32_bf16(-7.531250);
+    x[i] = convert_fp32_bf16(-2.484375);
+    y[i++] = convert_fp32_bf16(-7.531250);
+    x[i] = convert_fp32_bf16(0);
+    y[i++] = convert_fp32_bf16(7.531250);
+    x[i] = convert_fp32_bf16(0);
+    y[i++] = convert_fp32_bf16(-7.531250);
+    x[i] = convert_fp32_bf16(0);
+    y[i++] = convert_fp32_bf16(0);
+    x[i] = convert_fp32_bf16(0);
+    y[i++] = convert_fp32_bf16(0.394531);
+    y[i] = convert_fp32_bf16(-4.000000);
+    x[i++] = convert_fp32_bf16(-64.000000);
+    y[i] = convert_fp32_bf16(0.000000);
+    x[i++] = convert_fp32_bf16(-4.000000);
+    y[i] = convert_fp32_bf16(0.000000);
+    x[i++] = convert_fp32_bf16(-40.000000);
+    y[i] = convert_fp32_bf16(1.000000);
+    x[i++] = convert_fp32_bf16(-53.000000);
+    y[i] = convert_fp32_bf16(-9.000000);
+    x[i++] = convert_fp32_bf16(-91.000000);
+    y[i] = convert_fp32_bf16(12.000000);
+    x[i++] = convert_fp32_bf16(-164.000000);
+    y[i] = convert_fp32_bf16(-20.000000);
+    x[i++] = convert_fp32_bf16(-320.000000);
+    y[i] = convert_fp32_bf16(1.000000);
+    x[i++] = convert_fp32_bf16(-71.000000);
+    y[i] = convert_fp32_bf16(1.000000);
+    x[i++] = convert_fp32_bf16(-155.000000);
+    y[i] = convert_fp32_bf16(1.000000);
+    x[i++] = convert_fp32_bf16(-247.000000);
+    y[i] = convert_fp32_bf16(-2.000000);
+    x[i++] =
convert_fp32_bf16(-118.000000); + y[i] = convert_fp32_bf16(-2.000000); + x[i++] = convert_fp32_bf16(-54.000000); + y[i] = convert_fp32_bf16(-5.000000); + x[i++] = convert_fp32_bf16(-392.000000); + y[i] = convert_fp32_bf16(-37.000000); + x[i++] = convert_fp32_bf16(-520.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-19.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-10.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-8.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-2.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-14.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-2.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-6.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-21.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-14.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-17.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-17.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-8.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-4.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-10.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-8.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-14.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-4.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-2.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-41.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-69.000000); + y[i] = convert_fp32_bf16(4.000000); + x[i++] = convert_fp32_bf16(-86.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-41.000000); + y[i] = convert_fp32_bf16(-2.000000); + x[i++] = convert_fp32_bf16(-34.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-6.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-41.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-136.000000); + y[i] = convert_fp32_bf16(-3.000000); + x[i++] = convert_fp32_bf16(-79.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-38.000000); + y[i] = convert_fp32_bf16(5.000000); + x[i++] = convert_fp32_bf16(-173.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-78.000000); + y[i] = convert_fp32_bf16(-2.000000); + x[i++] = convert_fp32_bf16(-60.000000); + y[i] = convert_fp32_bf16(3.000000); + x[i++] = convert_fp32_bf16(-123.000000); + y[i] = convert_fp32_bf16(-9.000000); + x[i++] = convert_fp32_bf16(-280.000000); + y[i] = convert_fp32_bf16(3.000000); + x[i++] = convert_fp32_bf16(-39.000000); + y[i] = convert_fp32_bf16(2.000000); + x[i++] = convert_fp32_bf16(-524.000000); + y[i] = convert_fp32_bf16(11.000000); + x[i++] = convert_fp32_bf16(-376.000000); + y[i] = convert_fp32_bf16(5.000000); + x[i++] = convert_fp32_bf16(-131.000000); + y[i] = convert_fp32_bf16(11.000000); + x[i++] = convert_fp32_bf16(-324.000000); + y[i] = convert_fp32_bf16(9.000000); + x[i++] = convert_fp32_bf16(-125.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-92.000000); + y[i] = convert_fp32_bf16(-7.000000); + x[i++] = 
convert_fp32_bf16(-233.000000);
+    y[i] = convert_fp32_bf16(10.000000);
+    x[i++] = convert_fp32_bf16(-170.000000);
+    y[i] = convert_fp32_bf16(0.000000);
+    x[i++] = convert_fp32_bf16(-4.000000);
+    y[i] = convert_fp32_bf16(0.000000);
+    x[i++] = convert_fp32_bf16(-4.000000);
+    y[i] = convert_fp32_bf16(0.000000);
+    x[i++] = convert_fp32_bf16(-10.000000);
+    y[i] = convert_fp32_bf16(-1.000000);
+    x[i++] = convert_fp32_bf16(-23.000000);
+    y[i] = convert_fp32_bf16(0.000000);
+    x[i++] = convert_fp32_bf16(-6.000000);
+    y[i] = convert_fp32_bf16(0.000000);
+    x[i++] = convert_fp32_bf16(-6.000000);
+    y[i] = convert_fp32_bf16(-3.000000);
+    x[i++] = convert_fp32_bf16(-37.000000);
+
+    y[i] = convert_fp32_bf16(-9);
+    x[i++] = convert_fp32_bf16(-1);
+
+    y[i] = convert_fp32_bf16(7.0);
+    x[i++] = convert_fp32_bf16(-1);
+
+    y[i] = convert_fp32_bf16(0);
+    x[i++] = convert_fp32_bf16(-1);
+  }
+
+#ifdef DBG
+  for (u64 i = 0; i < ifmap_size; i++) {
+    printf("source[%ld] y %f x %f\n", i, convert_bf16_fp32(y[i]),
+           convert_bf16_fp32(x[i]));
+  }
+#endif /* ifdef DBG */
+}
+
+static void testbench(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk)
+{
+  // TODO: check more shape / align
+  bmk1880v2_chip_info_t chip_info = bmk1880v2_chip_info();
+
+  u32 input_n = 1;
+  u32 input_c = chip_info.npu_num;
+  u32 input_h = 16;
+  u32 input_w = 16;
+  float epsilon = 0.1;
+  int range_start = -8;
+  int range_end = 8;
+
+  if (mode == PRE_DATA_COMPARE_FIX) {
+    input_h = 4;
+    input_w = 8;
+  }
+
+  fmt_t fmt = FMT_BF16;
+
+  tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w};
+  tl_shape_t ofmap_shape = ifmap_shape;
+
+  // get lut table shape and size
+  tl_shape_t table_shape;
+  u64 table_bytesize = bf16_lut_tbl_bytesize(bmk, &table_shape, fmt);
+
+  // get input / output size
+  u64 ifmap_size = tl_shape_size(&ifmap_shape);
+  u64 ofmap_size = tl_shape_size(&ofmap_shape);
+  int data_type_size = bytesize_of_fmt(fmt);
+  u64 ifmap_bytesize = ifmap_size * data_type_size;
+  u64 ofmap_bytesize = ofmap_size * data_type_size;
+
+  // atan2 has two inputs
+  tl_t *tl_ifmap = alloc_tl(bmk, ifmap_shape, fmt, /*align*/1);
+  tl_t *tl_ifmap2 = alloc_tl(bmk, ifmap_shape, fmt, /*align*/1);
+  tl_t *tl_ofmap_bf16 = alloc_tl(bmk, ofmap_shape, fmt, /*align*/1);
+  tl_t *out = tl_ofmap_bf16;
+
+  // atan buf
+  tl_t *tl_y0_buf = alloc_tl(bmk, table_shape, fmt, /*align*/1);
+  tl_t *tl_slope_buf = alloc_tl(bmk, table_shape, fmt, /*align*/1);
+  tl_t *tl_invert_buf = alloc_tl(bmk, table_shape, fmt, /*align*/1);
+  tl_t *tl_pos_neg_buf = alloc_tl(bmk, table_shape, fmt, /*align*/1);
+
+  // reciprocal buf
+  tl_t *tl_reciprocal_table_answer =
+      alloc_tl(bmk, table_shape, fmt, /*align*/1);
+  tl_t *tl_reciprocal_table_answer_mantissa =
+      alloc_tl(bmk, table_shape, fmt, /*align*/1);
+
+  // temp buf
+  tl_t *tl_buf = alloc_tl(bmk, ofmap_shape, fmt, /*align*/1);
+  tl_t *tl_buf2 = alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/1);
+  tl_t *tl_buf3 = alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/1);
+  tl_t *tl_buf4 = alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/1);
+  tl_t *tl_buf5 = alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/1);
+  tl_t *tl_buf6 = alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/1);
+
+  // sqrt buf
+  tl_t *tl_sqrt_table_answer = alloc_tl(bmk, table_shape, fmt, /*align*/1);
+  tl_t *tl_sqrt_table_answer_mantissa =
+      alloc_tl(bmk, table_shape, fmt, /*align*/1);
+
+  // search '0' index
+  tl_t *tl_0_idx_table = alloc_tl(bmk, table_shape, fmt, /*align*/1);
+
+  u16 *input_data = (u16 *)xmalloc(ifmap_bytesize);
+  u16 *input_data2 = (u16 
*)xmalloc(ifmap_bytesize); + u16 *ref_data = (u16 *)xmalloc(ofmap_bytesize); + + // for reciprocal + u16 *table_reciprocal_data = (u16 *)xmalloc(table_bytesize); + u16 *table_reciprocal_data_mantissa = (u16 *)xmalloc(table_bytesize); + + // for atan + u16 *table_data_atan_y0 = (u16 *)xmalloc(table_bytesize); + u16 *table_data_atan_slope = (u16 *)xmalloc(table_bytesize); + u16 *table_data_atan_invert = (u16 *)xmalloc(table_bytesize); + u16 *table_data_atan_pos_neg = (u16 *)xmalloc(table_bytesize); + + // for sqrt + u16 *sqrt_table_data = (u16 *)xmalloc(table_bytesize); + u16 *sqrt_table_data_mantissa = (u16 *)xmalloc(table_bytesize); + + // for search '0' index + u16 *idx_0_table_data = (u16 *)xmalloc(table_bytesize); + + // init input / ref + // input_data is x, input_data2 is y + gen_input(input_data, input_data2, ifmap_size, mode, range_start, range_end); + tl_lut_ref(ref_data, input_data, input_data2, ifmap_shape); + + // init lut table + bf16_reciprocal_tbl(table_reciprocal_data, table_reciprocal_data_mantissa, + &table_shape); + bf16_sqrt_tbl(sqrt_table_data, sqrt_table_data_mantissa, &table_shape); + bf16_atan_tbl(table_data_atan_y0, table_data_atan_slope, + table_data_atan_invert, table_data_atan_pos_neg, &table_shape); + bf16_gen_0_tbl(idx_0_table_data, &table_shape); + + // sys->local + put_bf16_tensor_g2l(ctx, bmk, tl_ifmap, (u16 *)input_data, fmt); + put_bf16_tensor_g2l(ctx, bmk, tl_ifmap2, (u16 *)input_data2, fmt); + put_bf16_tensor_g2l(ctx, bmk, tl_reciprocal_table_answer, + (u16 *)table_reciprocal_data, fmt); + put_bf16_tensor_g2l(ctx, bmk, tl_reciprocal_table_answer_mantissa, + (u16 *)table_reciprocal_data_mantissa, fmt); + + put_bf16_tensor_g2l(ctx, bmk, tl_y0_buf, (u16 *)table_data_atan_y0, fmt); + put_bf16_tensor_g2l(ctx, bmk, tl_slope_buf, (u16 *)table_data_atan_slope, + fmt); + put_bf16_tensor_g2l(ctx, bmk, tl_invert_buf, (u16 *)table_data_atan_invert, + fmt); + put_bf16_tensor_g2l(ctx, bmk, tl_pos_neg_buf, (u16 *)table_data_atan_pos_neg, + fmt); + put_bf16_tensor_g2l(ctx, bmk, tl_sqrt_table_answer, (u16 *)sqrt_table_data, + fmt); + put_bf16_tensor_g2l(ctx, bmk, tl_sqrt_table_answer_mantissa, + (u16 *)sqrt_table_data_mantissa, fmt); + put_bf16_tensor_g2l(ctx, bmk, tl_0_idx_table, (u16 *)idx_0_table_data, fmt); + + bf16_atan2_emit(bmk, tl_ifmap2, tl_ifmap, tl_buf, tl_buf2, tl_buf3, tl_buf4, + tl_buf5, tl_buf6, tl_y0_buf, tl_slope_buf, tl_invert_buf, + tl_pos_neg_buf, tl_reciprocal_table_answer, + tl_reciprocal_table_answer_mantissa, tl_sqrt_table_answer, + tl_sqrt_table_answer_mantissa, tl_0_idx_table, + OUT tl_ofmap_bf16, fmt); + + u16 *ofmap_data = (u16 *)get_bf16_tensor_l2g(ctx, bmk, out, out->fmt); + verify(ofmap_data, ref_data, input_data, input_data2, ifmap_size, epsilon); + + free_tl(bmk, tl_0_idx_table); + free_tl(bmk, tl_sqrt_table_answer_mantissa); + free_tl(bmk, tl_sqrt_table_answer); + free_tl(bmk, tl_buf6); + free_tl(bmk, tl_buf5); + free_tl(bmk, tl_buf4); + free_tl(bmk, tl_buf3); + free_tl(bmk, tl_buf2); + free_tl(bmk, tl_buf); + free_tl(bmk, tl_reciprocal_table_answer_mantissa); + free_tl(bmk, tl_reciprocal_table_answer); + free_tl(bmk, tl_pos_neg_buf); + free_tl(bmk, tl_invert_buf); + free_tl(bmk, tl_slope_buf); + free_tl(bmk, tl_y0_buf); + free_tl(bmk, tl_ofmap_bf16); + free_tl(bmk, tl_ifmap2); + free_tl(bmk, tl_ifmap); + + free(input_data); + free(input_data2); + free(ref_data); + free(table_reciprocal_data); + free(table_reciprocal_data_mantissa); + + free(table_data_atan_y0); + free(table_data_atan_slope); + free(table_data_atan_invert); + 
free(table_data_atan_pos_neg);
+  free(sqrt_table_data);
+  free(sqrt_table_data_mantissa);
+  free(idx_0_table_data);
+  free(ofmap_data);
+}
+
+int main()
+{
+  CVI_RT_HANDLE ctx;
+  bmk_ctx_t *bmk;
+  int round_mode;
+
+  round_mode = set_store_feround();
+
+  test_init(&ctx, &bmk);
+
+  // for (int i = PRE_DATA_COMPARE_FIX; i < TEST_MODE_MAX; i++)
+  // for (int i = PRE_DATA_COMPARE_FIX; i < DATA_COMPARE_ACCURACY; i++) {
+  // for (int i = DATA_COMPARE_ACCURACY; i < DATA_COMPARE_U8; i++) {
+  // for (int i = DATA_COMPARE_ACCURACY; i < DATA_COMPARE_ACCURACY_X_GT_0; i++)
+  // {
+  for (int i = PRE_DATA_COMPARE_FIX; i < DATA_COMPARE_U8; i++) {
+    mode = static_cast<TEST_MODE>(i);
+    printf("test mode %d...\n", mode);
+    testbench(&ctx, bmk);
+  }
+  printf("pass\n");
+
+  test_exit(&ctx);
+  restore_feround(round_mode);
+  return 0;
+}
diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_atan2_merge_kernel.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_atan2_merge_kernel.cpp
new file mode 100644
index 000000000..c4da95951
--- /dev/null
+++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_atan2_merge_kernel.cpp
@@ -0,0 +1,783 @@
+/**
+ * \brief atan2 is implemented in terms of atan; see
+ * [wiki](https://en.wikipedia.org/wiki/Atan2) for more details
+ */
+
+#include "../1880v2_test_util.h"
+#define OUT
+#define IN
+#include <math.h>
+#include <inttypes.h>
+#include <random>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+//#define DBG
+
+/**
+ * pre_data means we test a fixed pattern; it must match the lut
+ */
+enum TEST_MODE {
+  PRE_DATA_COMPARE_FIX = 0, // pre-data + fix compare
+  DATA_COMPARE_ACCURACY,    // generate \range_start to \range_end values that
+                            // are checked against epsilon; by default x > 0, y > 0
+
+  DATA_COMPARE_ACCURACY_X_GT_0,        // atan(y/x), x > 0, y = 0
+  DATA_COMPARE_ACCURACY_X_LT_0_Y_GE_0, // atan(y/x) + PI , x < 0 and y >= 0
+  DATA_COMPARE_ACCURACY_X_LT_0_Y_LT_0, // atan(y/x) - PI , x < 0 and y < 0
+  DATA_COMPARE_ACCURACY_X_0_Y_GT_0,    // pi / 2, x = 0 and y > 0
+  DATA_COMPARE_ACCURACY_X_0_Y_LT_0,    // -pi / 2, x = 0 and y < 0
+  DATA_COMPARE_U8, // generate \range_start to \range_end values that are
+                   // checked against epsilon, result bf16->u8
+  TEST_MODE_MAX,
+};
+
+
+static TEST_MODE mode;
+
+static u16 test_pattern[] = {
+    0x0000, 0x38D2, 0x3952, 0x399D, 0x39D2, 0x3A03, 0x3A1D, 0x3A38, 0x3A52,
+    0x3A6C, 0x3A83, 0x3A90, 0x3A9D, 0x3AAA, 0x3AB8, 0x3AC5, 0x3AD2, 0x3ADF,
+    0x3AEC, 0x3AF9, 0x3B03, 0x3B0A, 0x3B10, 0x3B17, 0x3B1D, 0x3B24, 0x3B2A,
+    0x3B31, 0x3B38, 0x3B3E, 0x3B45, 0x3B4B, 0x3B52, 0x3B58, 0x3B5F, 0x3B65,
+    0x3B6C, 0x3B72, 0x3B79, 0x3B80, 0x3B83, 0x3B86, 0x3B8A, 0x3B8D, 0x3B90,
+    0x3B93, 0x3B97, 0x3B9A, 0x3B9D, 0x3BA1, 0x3BA4, 0x3BA7, 0x3BAA, 0x3BAE,
+    0x3BB1, 0x3BB4, 0x3BB8, 0x3BBB, 0x3BBE, 0x3BC1, 0x3BC5, 0x3BC8, 0x3BCB,
+    0x3BCE, 0x3BD2, 0x3BD5, 0x3BD8, 0x3BDC, 0x3BDF, 0x3BE2, 0x3BE5, 0x3BE9,
+    0x3BEC, 0x3BEF, 0x3BF2, 0x3BF6, 0x3BF9, 0x3BFC, 0x3C00, 0x3C01, 0x3C03,
+    0x3C05, 0x3C06, 0x3C08, 0x3C0A, 0x3C0B, 0x3C0D, 0x3C0F, 0x3C10, 0x3C12,
+    0x3C13, 0x3C15, 0x3C17, 0x3C18, 0x3C1A, 0x3C1C, 0x3C1D, 0x3C1F, 0x3C21,
+    0x3C22, 0x3C24, 0x3C25, 0x3C27, 0x3C29, 0x3C2A, 0x3C2C, 0x3C2E, 0x3C2F,
+    0x3C31, 0x3C33, 0x3C34, 0x3C36, 0x3C38, 0x3C39, 0x3C3B, 0x3C3C, 0x3C3E,
+    0x3C40, 0x3C41, 0x3C43, 0x3C45, 0x3C46, 0x3C48, 0x3C4A, 0x3C4B, 0x3C4D,
+    0x3C4E, 0x3C50, 0x3C52, 0x3C53, 0x3C55, 0x3C57, 0x3C58, 0x3C5A, 0x3C5C,
+    0x3C5D, 0x3C5F, 0x3C60, 0x3C62, 0x3C64, 0x3C65, 0x3C67, 0x3C69, 0x3C6A,
+    0x3C6C, 0x3C6E, 0x3C6F, 0x3C71, 0x3C72, 0x3C74, 0x3C76, 0x3C77, 0x3C79,
+    0x3C7B, 0x3C7C, 0x3C7E, 0x3C80, 0x3C81, 0x3C81, 0x3C82, 0x3C83, 0x3C84,
+    0x3C85, 0x3C86, 0x3C86, 0x3C87, 
0x3C88, 0x3C89, 0x3C8A, 0x3C8A, 0x3C8B, + 0x3C8C, 0x3C8D, 0x3C8E, 0x3C8F, 0x3C8F, 0x3C90, 0x3C91, 0x3C92, 0x3C93, + 0x3C93, 0x3C94, 0x3C95, 0x3C96, 0x3C97, 0x3C98, 0x3C98, 0x3C99, 0x3C9A, + 0x3C9B, 0x3C9C, 0x3C9C, 0x3C9D, 0x3C9E, 0x3C9F, 0x3CA0, 0x3CA1, 0x3CA1, + 0x3CA2, 0x3CA3, 0x3CA4, 0x3CA5, 0x3CA5, 0x3CA6, 0x3CA7, 0x3CA8, 0x3CA9, + 0x3CAA, 0x3CAA, 0x3CAB, 0x3CAC, 0x3CAD, 0x3CAE, 0x3CAE, 0x3CAF, 0x3CB0, + 0x3CB1, 0x3CB2, 0x3CB3, 0x3CB3, 0x3CB4, 0x3CB5, 0x3CB6, 0x3CB7, 0x3CB8, + 0x3CB8, 0x3CB9, 0x3CBA, 0x3CBB, 0x3CBC, 0x3CBC, 0x3CBD, 0x3CBE, 0x3CBF, + 0x3CC0, 0x3CC1, 0x3CC1, 0x3CC2, 0x3CC3, 0x3CC4, 0x3CC5, 0x3CC5, 0x3CC6, + 0x3CC7, 0x3CC8, 0x3CC9, 0x3CCA, 0x3CCA, 0x3CCB, 0x3CCC, 0x3CCD, 0x3CCE, + 0x3CCE, 0x3CCF, 0x3CD0, 0x3CD1, 0x3CD2, 0x3CD3, 0x3CD3, 0x3CD4, 0x3CD5, + 0x3CD6, 0x3CD7, 0x3CD7, 0x3CD8, 0x3CD9, 0x3CDA, 0x3CDB, 0x3CDC, 0x3CDC, + 0x3CDD, 0x3CDE, 0x3CDF, 0x3CE0, 0x3CE0, 0x3CE1, 0x3CE2, 0x3CE3, 0x3CE4, + 0x3CE5, 0x3CE5, 0x3CE6, 0x3CE7, 0x3CE8, 0x3CE9, 0x3CE9, 0x3CEA, 0x3CEB, + 0x3CEC, 0x3CED, 0x3CEE, 0x3CEE, 0x3CEF, 0x3CF0, 0x3CF1, 0x3CF2, 0x3CF2, + 0x3CF3, 0x3CF4, 0x3CF5, 0x3CF6, 0x3CF7, 0x3CF7, 0x3CF8, 0x3CF9, 0x3CFA, + 0x3CFB, 0x3CFB, 0x3CFC, 0x3CFD, 0x3CFE, 0x3CFF, 0x3D00, 0x3D00, 0x3D01, + 0x3D01, 0x3D01, 0x3D02, 0x3D02, 0x3D03, 0x3D03, 0x3D03, 0x3D04, 0x3D04, + 0x3D05, 0x3D05, 0x3D06, 0x3D06, 0x3D06, 0x3D07, 0x3D07, 0x3D08, 0x3D08, + 0x3D08, 0x3D09, 0x3D09, 0x3D0A, 0x3D0A, 0x3D0A, 0x3D0B, 0x3D0B, 0x3D0C, + 0x3D0C, 0x3D0C, 0x3D0D, 0x3D0D, 0x3D0E, 0x3D0E, 0x3D0F, 0x3D0F, 0x3D0F, + 0x3D10, 0x3D10, 0x3D11, 0x3D11, 0x3D11, 0x3D12, 0x3D12, 0x3D13, 0x3D13, + 0x3D13, 0x3D14, 0x3D14, 0x3D15, 0x3D15, 0x3D16, 0x3D16, 0x3D16, 0x3D17, + 0x3D17, 0x3D18, 0x3D18, 0x3D18, 0x3D19, 0x3D19, 0x3D1A, 0x3D1A, 0x3D1A, + 0x3D1B, 0x3D1B, 0x3D1C, 0x3D1C, 0x3D1C, 0x3D1D, 0x3D1D, 0x3D1E, 0x3D1E, + 0x3D1F, 0x3D1F, 0x3D1F, 0x3D20, 0x3D20, 0x3D21, 0x3D21, 0x3D21, 0x3D22, + 0x3D22, 0x3D23, 0x3D23, 0x3D23, 0x3D24, 0x3D24, 0x3D25, 0x3D25, 0x3D25, + 0x3D26, 0x3D26, 0x3D27, 0x3D27, 0x3D28, 0x3D28, 0x3D28, 0x3D29, 0x3D29, + 0x3D2A, 0x3D2A, 0x3D2A, 0x3D2B, 0x3D2B, 0x3D2C, 0x3D2C, 0x3D2C, 0x3D2D, + 0x3D2D, 0x3D2E, 0x3D2E, 0x3D2E, 0x3D2F, 0x3D2F, 0x3D30, 0x3D30, 0x3D31, + 0x3D31, 0x3D31, 0x3D32, 0x3D32, 0x3D33, 0x3D33, 0x3D33, 0x3D34, 0x3D34, + 0x3D35, 0x3D35, 0x3D35, 0x3D36, 0x3D36, 0x3D37, 0x3D37, 0x3D38, 0x3D38, + 0x3D38, 0x3D39, 0x3D39, 0x3D3A, 0x3D3A, 0x3D3A, 0x3D3B, 0x3D3B, 0x3D3C, + 0x3D3C, 0x3D3C, 0x3D3D, 0x3D3D, 0x3D3E, 0x3D3E, 0x3D3E, 0x3D3F, 0x3D3F, + 0x3D40, 0x3D40, 0x3D41, 0x3D41, 0x3D41, 0x3D42, 0x3D42, 0x3D43, 0x3D43, + 0x3D43, 0x3D44, 0x3D44, 0x3D45, 0x3D45, 0x3D45, 0x3D46, 0x3D46, 0x3D47, + 0x3D47, 0x3D47, 0x3D48, 0x3D48, 0x3D49, 0x3D49, 0x3D4A, 0x3D4A, 0x3D4A, + 0x3D4B, 0x3D4B, 0x3D4C, 0x3D4C, 0x3D4C, 0x3D4D, 0x3D4D, 0x3D4E, 0x3D4E, + 0x3D4E, 0x3D4F, 0x3D4F, 0x3D50, 0x3D50, 0x3D50, 0x3D51, 0x3D51, 0x3D52, + 0x3D52, 0x3D53, 0x3D53, 0x3D53, 0x3D54, 0x3D54, 0x3D55, 0x3D55, 0x3D55, + 0x3D56, 0x3D56, 0x3D57, 0x3D57, 0x3D57, 0x3D58, 0x3D58, 0x3D59, 0x3D59, + 0x3D59, 0x3D5A, 0x3D5A, 0x3D5B, 0x3D5B, 0x3D5C, 0x3D5C, 0x3D5C, 0x3D5D, + 0x3D5D, 0x3D5E, 0x3D5E, 0x3D5E, 0x3D5F, 0x3D5F, 0x3D60, 0x3D60, 0x3D60, + 0x3D61, 0x3D61, 0x3D62, 0x3D62, 0x3D63, 0x3D63, 0x3D63, 0x3D64, 0x3D64, + 0x3D65, 0x3D65, 0x3D65, 0x3D66, 0x3D66, 0x3D67, 0x3D67, 0x3D67, 0x3D68, + 0x3D68, 0x3D69, 0x3D69, 0x3D69, 0x3D6A, 0x3D6A, 0x3D6B, 0x3D6B, 0x3D6C, + 0x3D6C, 0x3D6C, 0x3D6D, 0x3D6D, 0x3D6E, 0x3D6E, 0x3D6E, 0x3D6F, 0x3D6F, + 0x3D70, 0x3D70, 0x3D70, 0x3D71, 0x3D71, 0x3D72, 0x3D72, 0x3D72, 0x3D73, + 0x3D73, 0x3D74, 0x3D74, 0x3D75, 
0x3D75, 0x3D75, 0x3D76, 0x3D76, 0x3D77, + 0x3D77, 0x3D77, 0x3D78, 0x3D78, 0x3D79, 0x3D79, 0x3D79, 0x3D7A, 0x3D7A, + 0x3D7B, 0x3D7B, 0x3D7B, 0x3D7C, 0x3D7C, 0x3D7D, 0x3D7D, 0x3D7E, 0x3D7E, + 0x3D7E, 0x3D7F, 0x3D7F, 0x3D80, 0x3D80, 0x3D80, 0x3D80, 0x3D81, 0x3D81, + 0x3D81, 0x3D81, 0x3D81, 0x3D82, 0x3D82, 0x3D82, 0x3D82, 0x3D82, 0x3D83, + 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D84, 0x3D84, 0x3D84, 0x3D84, 0x3D85, + 0x3D85, 0x3D85, 0x3D85, 0x3D85, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D86, + 0x3D87, 0x3D87, 0x3D87, 0x3D87, 0x3D87, 0x3D88, 0x3D88, 0x3D88, 0x3D88, + 0x3D88, 0x3D89, 0x3D89, 0x3D89, 0x3D89, 0x3D89, 0x3D8A, 0x3D8A, 0x3D8A, + 0x3D8A, 0x3D8A, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8C, 0x3D8C, + 0x3D8C, 0x3D8C, 0x3D8C, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8E, 0x3D8E, + 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D90, + 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D91, 0x3D91, 0x3D91, 0x3D91, 0x3D91, + 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D93, 0x3D93, 0x3D93, 0x3D93, + 0x3D93, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D95, 0x3D95, 0x3D95, + 0x3D95, 0x3D96, 0x3D96, 0x3D96, 0x3D96, 0x3D96, 0x3D97, 0x3D97, 0x3D97, + 0x3D97, 0x3D97, 0x3D98, 0x3D98, 0x3D98, 0x3D98, 0x3D98, 0x3D99, 0x3D99, + 0x3D99, 0x3D99, 0x3D99, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9B, + 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, + 0x3D9D, 0x3D9D, 0x3D9D, 0x3D9D, 0x3D9D, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9E, + 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA0, + 0x3DA0, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA2, 0x3DA2, 0x3DA2, + 0x3DA2, 0x3DA2, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA4, 0x3DA4, + 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA6, + 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA7, 0x3DA7, 0x3DA7, 0x3DA7, 0x3DA7, 0x3DA8, + 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, + 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, + 0x3DAB, 0x3DAC, 0x3DAC, 0x3DAC, 0x3DAC, 0x3DAC, 0x3DAD, 0x3DAD, 0x3DAD, + 0x3DAD, 0x3DAD, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAF, 0x3DAF, + 0x3DAF, 0x3DAF, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB1, 0x3DB1, + 0x3DB1, 0x3DB1, 0x3DB1, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB3, + 0x3DB3, 0x3DB3, 0x3DB3, 0x3DB3, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, + 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB6, + 0x3DB6, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB8, 0x3DB8, 0x3DB8, 0x3DB8, + 0x3DB8, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DBA, 0x3DBA, 0x3DBA, + 0x3DBA, 0x3DBA, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBC, 0x3DBC, + 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBD, 0x3DBD, 0x3DBD, 0x3DBD, 0x3DBD, 0x3DBE, + 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, + 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, + 0x3DC2, 0x3DC2, 0x3DC2, 0x3DC2, 0x3DC2, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, + 0x3DC3, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC5, 0x3DC5, 0x3DC5, + 0x3DC5, 0x3DC5, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC7, 0x3DC7, + 0x3DC7, 0x3DC7, 0x3DC7, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC9, + 0x3DC9, 0x3DC9, 0x3DC9, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCB, + 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCC, 0x3DCC, 0x3DCC, 0x3DCC, 0x3DCC, + 0x3DCD, 0x3DCE, 0x3DCF, 0x3DD0, 0x3DD1, 0x3DD2, 0x3DD3, 0x3DD4, 0x3DD5, + 0x3DD6, 0x3DD7, 0x3DD8, 0x3DD9, 0x3DDA, 0x3DDB, 0x3DDC, 0x3DDD, 0x3DDE, + 0x3DDF, 0x3DE0, 0x3DE1, 0x3DE2, 0x3DE3, 0x3DE4, 0x3DE5, +}; + +static u16 golden_bf16[] = { + 0x3fc9, 
0x3fc9, 0x3fc9, 0x3fc9, 0x3fc9, 0x3fc9, 0x3fc8, 0x3fc8, 0x3fc8, + 0x3fc8, 0x3fc8, 0x3fc8, 0x3fc8, 0x3fc8, 0x3fc8, 0x3fc8, 0x3fc7, 0x3fc7, + 0x3fc7, 0x3fc7, 0x3fc7, 0x3fc7, 0x3fc7, 0x3fc7, 0x3fc7, 0x3fc5, 0x3fc5, + 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc4, + 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc4, + 0x3fc3, 0x3fc3, 0x3fc3, 0x3fc3, 0x3fc3, 0x3fc3, 0x3fc3, 0x3fc3, 0x3fc3, + 0x3fc1, 0x3fc1, 0x3fc1, 0x3fc1, 0x3fc1, 0x3fc1, 0x3fc1, 0x3fc1, 0x3fc1, + 0x3fc0, 0x3fc0, 0x3fc0, 0x3fc0, 0x3fc0, 0x3fc0, 0x3fc0, 0x3fc0, 0x3fc0, + 0x3fbf, 0x3fbf, 0x3fbf, 0x3fbf, 0x3fbf, 0x3fbf, 0x3fbf, 0x3fbf, 0x3fbf, + 0x3fbe, 0x3fbe, 0x3fbe, 0x3fbe, 0x3fbe, 0x3fbe, 0x3fbe, 0x3fbe, 0x3fbc, + 0x3fbc, 0x3fbc, 0x3fbc, 0x3fbc, 0x3fbc, 0x3fbc, 0x3fbc, 0x3fbc, 0x3fbb, + 0x3fbb, 0x3fbb, 0x3fbb, 0x3fbb, 0x3fbb, 0x3fbb, 0x3fbb, 0x3fba, 0x3fba, + 0x3fba, 0x3fba, 0x3fba, 0x3fba, 0x3fba, 0x3fba, 0x3fb9, 0x3fb9, 0x3fb9, + 0x3fb9, 0x3fb9, 0x3fb9, 0x3fb9, 0x3fb7, 0x3fb7, 0x3fb7, 0x3fb7, 0x3fb7, + 0x3fb7, 0x3fb7, 0x3fb7, 0x3fb7, 0x3fb6, 0x3fb6, 0x3fb6, 0x3fb6, 0x3fb6, + 0x3fb6, 0x3fb6, 0x3fb5, 0x3fb5, 0x3fb5, 0x3fb5, 0x3fb5, 0x3fb5, 0x3fb5, + 0x3fb5, 0x3fb5, 0x3fb4, 0x3fb4, 0x3fb4, 0x3fb4, 0x3fb4, 0x3fb4, 0x3fb4, + 0x3fb2, 0x3fb2, 0x3fb2, 0x3fb2, 0x3fb2, 0x3fb2, 0x3fb2, 0x3fb2, 0x3fb1, + 0x3fb1, 0x3fb1, 0x3fb1, 0x3fb1, 0x3fb1, 0x3fb0, 0x3fb0, 0x3fb0, 0x3fb0, + 0x3fb0, 0x3fb0, 0x3fb0, 0x3fb0, 0x3fb0, 0x3faf, 0x3faf, 0x3faf, 0x3faf, + 0x3faf, 0x3fad, 0x3fad, 0x3fad, 0x3fad, 0x3fad, 0x3fad, 0x3fad, 0x3fad, + 0x3fac, 0x3fac, 0x3fac, 0x3fac, 0x3fac, 0x3fac, 0x3fab, 0x3fab, 0x3fab, + 0x3fab, 0x3fab, 0x3fab, 0x3fab, 0x3fab, 0x3faa, 0x3faa, 0x3faa, 0x3faa, + 0x3faa, 0x3faa, 0x3fa9, 0x3fa9, 0x3fa9, 0x3fa9, 0x3fa9, 0x3fa9, 0x3fa7, + 0x3fa7, 0x3fa7, 0x3fa7, 0x3fa7, 0x3fa6, 0x3fa6, 0x3fa6, 0x3fa6, 0x3fa6, + 0x3fa6, 0x3fa6, 0x3fa6, 0x3fa5, 0x3fa5, 0x3fa5, 0x3fa5, 0x3fa5, 0x3fa5, + 0x3fa4, 0x3fa4, 0x3fa4, 0x3fa4, 0x3fa4, 0x3fa4, 0x3fa3, 0x3fa3, 0x3fa3, + 0x3fa3, 0x3fa3, 0x3fa1, 0x3fa1, 0x3fa1, 0x3fa1, 0x3fa1, 0x3fa1, 0x3fa1, + 0x3fa1, 0x3fa0, 0x3fa0, 0x3fa0, 0x3f9f, 0x3f9f, 0x3f9f, 0x3f9f, 0x3f9f, + 0x3f9f, 0x3f9f, 0x3f9f, 0x3f9e, 0x3f9e, 0x3f9e, 0x3f9e, 0x3f9d, 0x3f9d, + 0x3f9d, 0x3f9d, 0x3f9d, 0x3f9d, 0x3f9d, 0x3f9d, 0x3f9c, 0x3f9c, 0x3f9c, + 0x3f9b, 0x3f9b, 0x3f9b, 0x3f9b, 0x3f9b, 0x3f9b, 0x3f9b, 0x3f99, 0x3f99, + 0x3f99, 0x3f98, 0x3f98, 0x3f98, 0x3f98, 0x3f98, 0x3f98, 0x3f97, 0x3f97, + 0x3f97, 0x3f97, 0x3f97, 0x3f96, 0x3f96, 0x3f96, 0x3f96, 0x3f96, 0x3f96, + 0x3f96, 0x3f96, 0x3f95, 0x3f95, 0x3f94, 0x3f94, 0x3f94, 0x3f94, 0x3f94, + 0x3f94, 0x3f94, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f92, 0x3f92, + 0x3f91, 0x3f91, 0x3f91, 0x3f91, 0x3f91, 0x3f90, 0x3f90, 0x3f90, 0x3f90, + 0x3f90, 0x3f90, 0x3f90, 0x3f8f, 0x3f8f, 0x3f8f, 0x3f8e, 0x3f8e, 0x3f8e, + 0x3f8e, 0x3f8e, 0x3f8d, 0x3f8d, 0x3f8d, 0x3f8c, 0x3f8c, 0x3f8c, 0x3f8c, + 0x3f8c, 0x3f8c, 0x3f8b, 0x3f8b, 0x3f8b, 0x3f8a, 0x3f8a, 0x3f8a, 0x3f8a, + 0x3f8a, 0x3f8a, 0x3f89, 0x3f89, 0x3f89, 0x3f88, 0x3f88, 0x3f88, 0x3f88, + 0x3f88, 0x3f87, 0x3f87, 0x3f87, 0x3f86, 0x3f86, 0x3f86, 0x3f86, 0x3f86, + 0x3f86, 0x3f86, 0x3f85, 0x3f84, 0x3f84, 0x3f84, 0x3f84, 0x3f84, 0x3f83, + 0x3f83, 0x3f83, 0x3f83, 0x3f82, 0x3f82, 0x3f82, 0x3f82, 0x3f82, 0x3f81, + 0x3f81, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f79, 0x3f79, + 0x3f79, 0x3f79, 0x3f79, 0x3f78, 0x3f78, 0x3f76, 0x3f76, 0x3f76, 0x3f76, + 0x3f76, 0x3f74, 0x3f74, 0x3f74, 0x3f72, 0x3f72, 0x3f72, 0x3f72, 0x3f71, + 0x3f71, 
0x3f71, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6c, 0x3f6c, + 0x3f6c, 0x3f6c, 0x3f6c, 0x3f6c, 0x3f6c, 0x3f69, 0x3f69, 0x3f69, 0x3f69, + 0x3f69, 0x3f67, 0x3f67, 0x3f65, 0x3f65, 0x3f65, 0x3f65, 0x3f65, 0x3f62, + 0x3f62, 0x3f62, 0x3f62, 0x3f62, 0x3f61, 0x3f61, 0x3f61, 0x3f5f, 0x3f5f, + 0x3f5f, 0x3f5f, 0x3f5e, 0x3f5e, 0x3f5e, 0x3f5c, 0x3f5c, 0x3f5b, 0x3f5b, + 0x3f5b, 0x3f59, 0x3f59, 0x3f58, 0x3f58, 0x3f58, 0x3f57, 0x3f57, 0x3f57, + 0x3f57, 0x3f57, 0x3f54, 0x3f54, 0x3f54, 0x3f54, 0x3f51, 0x3f51, 0x3f51, + 0x3f51, 0x3f51, 0x3f50, 0x3f50, 0x3f50, 0x3f4e, 0x3f4e, 0x3f4d, 0x3f4d, + 0x3f4d, 0x3f4c, 0x3f4c, 0x3f4c, 0x3f4c, 0x3f4c, 0x3f4a, 0x3f4a, 0x3f49, + 0x3f49, 0x3f46, 0x3f46, 0x3f46, 0x3f46, 0x3f46, 0x3f45, 0x3f45, 0x3f45, + 0x3f44, 0x3f44, 0x3f41, 0x3f41, 0x3f41, 0x3f41, 0x3f41, 0x3f40, 0x3f40, + 0x3f40, 0x3f3e, 0x3f3e, 0x3f3e, 0x3f3e, 0x3f3c, 0x3f3c, 0x3f3c, 0x3f3c, + 0x3f3c, 0x3f3a, 0x3f3a, 0x3f3a, 0x3f39, 0x3f39, 0x3f36, 0x3f36, 0x3f36, + 0x3f36, 0x3f36, 0x3f36, 0x3f36, 0x3f34, 0x3f33, 0x3f33, 0x3f33, 0x3f33, + 0x3f31, 0x3f31, 0x3f31, 0x3f30, 0x3f30, 0x3f30, 0x3f30, 0x3f30, 0x3f2d, + 0x3f2d, 0x3f2d, 0x3f2d, 0x3f2d, 0x3f2b, 0x3f2b, 0x3f2a, 0x3f2a, 0x3f2a, + 0x3f2a, 0x3f2a, 0x3f26, 0x3f26, 0x3f26, 0x3f26, 0x3f26, 0x3f26, 0x3f26, + 0x3f25, 0x3f25, 0x3f25, 0x3f23, 0x3f23, 0x3f21, 0x3f21, 0x3f21, 0x3f20, + 0x3f20, 0x3f20, 0x3f20, 0x3f1e, 0x3f1e, 0x3f1e, 0x3f1c, 0x3f1c, 0x3f1c, + 0x3f1c, 0x3f1c, 0x3f1b, 0x3f1b, 0x3f19, 0x3f19, 0x3f19, 0x3f19, 0x3f19, + 0x3f17, 0x3f17, 0x3f17, 0x3f15, 0x3f15, 0x3f15, 0x3f15, 0x3f14, 0x3f14, + 0x3f14, 0x3f12, 0x3f12, 0x3f12, 0x3f12, 0x3f12, 0x3f10, 0x3f10, 0x3f0e, + 0x3f0e, 0x3f0e, 0x3f0e, 0x3f0e, 0x3f0c, 0x3f0c, 0x3f0c, 0x3f0c, 0x3f0a, + 0x3f0a, 0x3f0a, 0x3f0a, 0x3f0a, 0x3f08, 0x3f07, 0x3f07, 0x3f07, 0x3f07, + 0x3f07, 0x3f07, 0x3f07, 0x3f05, 0x3f05, 0x3f05, 0x3f05, 0x3f05, 0x3f03, + 0x3f03, 0x3f03, 0x3f01, 0x3f01, 0x3f01, 0x3efe, 0x3efe, 0x3efe, 0x3efe, + 0x3efe, 0x3efa, 0x3efa, 0x3efa, 0x3efa, 0x3ef6, 0x3ef6, 0x3ef6, 0x3ef6, + 0x3ef6, 0x3ef1, 0x3ef1, 0x3ef1, 0x3ef1, 0x3eed, 0x3eed, 0x3eed, 0x3eed, + 0x3eed, 0x3ee9, 0x3ee9, 0x3ee9, 0x3ee5, 0x3ee5, 0x3ee5, 0x3ee5, 0x3ee5, + 0x3ee5, 0x3ee5, 0x3ee1, 0x3ee1, 0x3ee1, 0x3edd, 0x3edd, 0x3edd, 0x3edd, + 0x3edd, 0x3edd, 0x3edd, 0x3ed9, 0x3ed9, 0x3ed4, 0x3ed4, 0x3ed4, 0x3ed4, + 0x3ed4, 0x3ed4, 0x3ed0, 0x3ed0, 0x3ed0, 0x3ed0, 0x3ed0, 0x3ecc, 0x3ecc, + 0x3ecc, 0x3ecc, 0x3ecc, 0x3ecc, 0x3ecc, 0x3ec7, 0x3ec7, 0x3ec3, 0x3ec3, + 0x3ec3, 0x3ec3, 0x3ec3, 0x3ec3, 0x3ec3, 0x3ec3, 0x3ebe, 0x3ebe, 0x3ebe, + 0x3ebe, 0x3ebe, 0x3eba, 0x3eba, 0x3eba, 0x3eba, 0x3eba, 0x3eb5, 0x3eb5, + 0x3eb5, 0x3eb5, 0x3eb1, 0x3eb1, 0x3eb1, 0x3eb1, 0x3eb1, 0x3eb1, 0x3eac, + 0x3eac, 0x3eac, 0x3eac, 0x3eac, 0x3ea8, 0x3ea8, 0x3ea8, 0x3ea8, 0x3ea8, + 0x3ea8, 0x3ea8, 0x3ea8, 0x3ea8, 0x3ea3, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, + 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9a, 0x3e9a, 0x3e9a, 0x3e9a, + 0x3e9a, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e91, + 0x3e91, 0x3e91, 0x3e91, 0x3e91, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, + 0x3e8c, 0x3e8c, 0x3e8c, 0x3e87, 0x3e87, 0x3e87, 0x3e87, 0x3e87, 0x3e82, + 0x3e82, 0x3e82, 0x3e82, 0x3e82, 0x3e82, 0x3e7b, 0x3e7b, 0x3e7b, 0x3e7b, + 0x3e7b, 0x3e7b, 0x3e71, 0x3e71, 0x3e71, 0x3e71, 0x3e71, 0x3e71, 0x3e71, + 0x3e71, 0x3e71, 0x3e67, 0x3e67, 0x3e67, 0x3e67, 0x3e67, 0x3e5e, 0x3e5e, + 0x3e5e, 0x3e5e, 0x3e5e, 0x3e5e, 0x3e5e, 0x3e5e, 0x3e54, 0x3e54, 0x3e54, + 0x3e54, 0x3e54, 0x3e4a, 0x3e4a, 0x3e4a, 0x3e4a, 0x3e4a, 0x3e4a, 0x3e4a, + 0x3e4a, 0x3e4a, 0x3e40, 0x3e40, 0x3e40, 0x3e40, 0x3e40, 0x3e40, 0x3e36, + 0x3e36, 
0x3e36, 0x3e36, 0x3e36, 0x3e36, 0x3e36, 0x3e36, 0x3e36, 0x3e2c,
+    0x3e2c, 0x3e2c, 0x3e2c, 0x3e2c, 0x3e2c, 0x3e22, 0x3e22, 0x3e22, 0x3e22,
+    0x3e22, 0x3e22, 0x3e22, 0x3e22, 0x3e18, 0x3e18, 0x3e18, 0x3e18, 0x3e18,
+    0x3e18, 0x3e18, 0x3e18, 0x3e0e, 0x3e0e, 0x3e0e, 0x3e0e, 0x3e0e, 0x3e0e,
+    0x3e0e, 0x3e0e, 0x3e04, 0x3e04, 0x3e04, 0x3e04, 0x3e04, 0x3e04, 0x3e04,
+    0x3e04, 0x3df5, 0x3df5, 0x3df5, 0x3df5, 0x3df5, 0x3df5, 0x3df5, 0x3df5,
+    0x3de0, 0x3de0, 0x3de0, 0x3de0, 0x3de0, 0x3de0, 0x3de0, 0x3de0, 0x3dcc,
+    0x3dcc, 0x3dcc, 0x3dcc, 0x3dcc, 0x3dcc, 0x3dcc, 0x3dcc, 0x3dcc, 0x3dcc,
+    0x3db8, 0x3db8, 0x3db8, 0x3db8, 0x3db8, 0x3db8, 0x3db8, 0x3da3, 0x3da3,
+    0x3da3, 0x3da3, 0x3da3, 0x3da3, 0x3da3, 0x3da3, 0x3da3, 0x3d8f, 0x3d8f,
+    0x3d8f, 0x3d8f, 0x3d8f, 0x3d8f, 0x3d8f, 0x3d8f, 0x3d8f, 0x3d75, 0x3d75,
+    0x3d75, 0x3d75, 0x3d75, 0x3d75, 0x3d75, 0x3d75, 0x3d75, 0x3d4d, 0x3d4d,
+    0x3d4d, 0x3d4d, 0x3d4d, 0x3d4d, 0x3d4d, 0x3d4d, 0x3d4d, 0x3d24, 0x3d24,
+    0x3d24, 0x3d24, 0x3d24, 0x3d24, 0x3d24, 0x3d24, 0x3d24, 0x3d24, 0x3cf6,
+    0x3cf6, 0x3cf6, 0x3cf6, 0x3cf6, 0x3cf6, 0x3cf6, 0x3cf6, 0x3cf6, 0x3cf6,
+    0x3ca4, 0x3ca4, 0x3ca4, 0x3ca4, 0x3ca4, 0x3ca4, 0x3ca4, 0x3ca4, 0x3ca4,
+    0x3c24, 0x3c24, 0x3c24, 0x3c24, 0x3c24, 0x3c24, 0x3c24, 0x3c24, 0x3c24,
+    0x3c24, 0x0,    0x0,    0x0,    0x0,    0x0,
+};
+
+// compare results against golden data (fixed mode) or reference within the
+// given tolerance
+static bool verify(u16 *ofmap_data, u16 *ref_data, u16 *x, u16 *y,
+                   u64 ifmap_size, float tolerant_max)
+{
+  for (u64 i = 0; i < ifmap_size; i++) {
+    if (mode == PRE_DATA_COMPARE_FIX) {
+      // fixed-pattern mode compares bit-exactly against the golden table
+      if (ofmap_data[i] != golden_bf16[i]) {
+        printf("comparing failed at ofmap_data[%" PRIu64 "] got %x exp %x\n",
+               i, ofmap_data[i], golden_bf16[i]);
+        exit(-1);
+      }
+    } else {
+      float got = convert_bf16_fp32(ofmap_data[i]);
+      float exp = convert_bf16_fp32(ref_data[i]);
+      float error = fabs(got - exp);
+      if (error >= tolerant_max) {
+        printf("comparing failed at ofmap_data[%" PRIu64 "] got %f exp %f "
+               "(y %f x %f)\n", i, got, exp,
+               convert_bf16_fp32(y[i]), convert_bf16_fp32(x[i]));
+        exit(-1);
+      }
+    }
+  }
+
+  return true;
+}
+
+static void _gen_input(u16 *input_data, u64 ifmap_size, int range_start,
+                       int range_end)
+{
+  std::random_device rd;
+  std::mt19937 e2(rd());
+  std::uniform_real_distribution<> dist(range_start, range_end);
+
+  float LO = pow(2, range_start);
+  float HI = pow(2, range_end);
+  for (u64 i = 0; i < ifmap_size; i++) {
+    // input range is -8 ~ +8
+    int table_hw = 256;
+    float input = ((int)i % (range_end - 2)) * (((int)i % 2) ? 1 : -1) + 0.03 +
+                  (i % table_hw) * 0.002;
+    input = ((int)i % (range_end - 2)) * (((int)i % 2) ? 1 : 1) + 0.03 +
+            (i % table_hw) * 0.002;
+    input_data[i] = convert_fp32_bf16(input);
+    // leftover experiments: the random draws below are not stored
+    input = dist(e2);
+    input = LO + static_cast<float>(rand()) /
+                 (static_cast<float>(RAND_MAX / (HI - LO)));
+  }
+}
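+
+/*
+ * gen_input below prepares (x, y) pairs so that every atan2 quadrant case
+ * listed in TEST_MODE is exercised:
+ *   x > 0          : atan2(y, x) = atan(y / x)
+ *   x < 0, y >= 0  : atan2(y, x) = atan(y / x) + pi
+ *   x < 0, y <  0  : atan2(y, x) = atan(y / x) - pi
+ *   x = 0, y >  0  : atan2(y, x) =  pi / 2
+ *   x = 0, y <  0  : atan2(y, x) = -pi / 2
+ */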
+
+static void gen_input(u16 *x, u16 *y, u64 ifmap_size, TEST_MODE mode,
+                      int range_start, int range_end)
+{
+
+  if (mode == PRE_DATA_COMPARE_FIX) {
+    memcpy(x, &test_pattern, sizeof(test_pattern));
+  } else {
+    range_start = abs(range_start);
+    range_end = abs(range_end);
+    _gen_input(x, ifmap_size, range_start, range_end);
+  }
+
+  // y is a reversed copy of x
+  for (u64 i = 0; i < ifmap_size; i++) {
+    y[i] = x[(ifmap_size - 1) - i];
+  }
+
+  if (mode == DATA_COMPARE_ACCURACY_X_GT_0) {
+    // y = any
+    u32 i = 0;
+    for (; i < ifmap_size / 4; i++) {
+      // y < 0
+      y[i] = convert_fp32_bf16(-1 * convert_bf16_fp32(y[i]));
+      y[i + ifmap_size / 4] = convert_fp32_bf16(0);
+    }
+  } else if (mode == DATA_COMPARE_ACCURACY_X_LT_0_Y_GE_0) {
+    // x < 0 and y >= 0
+    for (u32 i = 0; i < ifmap_size; i++) {
+      x[i] = convert_fp32_bf16(-1 * convert_bf16_fp32(x[i]));
+    }
+
+    for (u32 i = 0; i < ifmap_size / 4; i++) {
+      y[i + ifmap_size / 4] = convert_fp32_bf16(0);
+    }
+  } else if (mode == DATA_COMPARE_ACCURACY_X_LT_0_Y_LT_0) {
+    // x < 0 and y < 0
+    for (u32 i = 0; i < ifmap_size; i++) {
+      x[i] = convert_fp32_bf16(-1 * convert_bf16_fp32(x[i]));
+      y[i] = convert_fp32_bf16(-1 * convert_bf16_fp32(y[i]));
+    }
+  } else if (mode == DATA_COMPARE_ACCURACY_X_0_Y_GT_0) {
+    // pi / 2, x = 0 and y > 0
+    for (u32 i = 0; i < ifmap_size; i++) {
+      x[i] = convert_fp32_bf16(0);
+    }
+  } else if (mode == DATA_COMPARE_ACCURACY_X_0_Y_LT_0) {
+    // -pi / 2, x = 0 and y < 0
+    for (u32 i = 0; i < ifmap_size; i++) {
+      x[i] = convert_fp32_bf16(0);
+      y[i] = convert_fp32_bf16(-1 * convert_bf16_fp32(y[i]));
+    }
+  }
+
+#if 1
+
+  if (mode != PRE_DATA_COMPARE_FIX) {
+    int i = 0;
+    x[i] = convert_fp32_bf16(0);
+    y[i++] = convert_fp32_bf16(1.394531);
+    x[i] = convert_fp32_bf16(0);
+    y[i++] = convert_fp32_bf16(0.394531);
+    x[i] = convert_fp32_bf16(0);
+    y[i++] = convert_fp32_bf16(0.594531);
+    x[i] = convert_fp32_bf16(-10.0);
+    y[i++] = convert_fp32_bf16(6.0);
+    x[i] = convert_fp32_bf16(1.0);
+    y[i++] = convert_fp32_bf16(-1.);
+    x[i] = convert_fp32_bf16(-1.0);
+    y[i++] = convert_fp32_bf16(1.);
+    x[i] = convert_fp32_bf16(0.111816);
+    y[i++] = convert_fp32_bf16(0);
+    x[i] = convert_fp32_bf16(2.031250);
+    y[i++] = convert_fp32_bf16(0.0);
+    x[i] = convert_fp32_bf16(-2.031250);
+    x[i] = convert_fp32_bf16(0);
+    y[i++] = convert_fp32_bf16(-1.394531);
+    y[i++] = convert_fp32_bf16(0.0);
+    x[i] = convert_fp32_bf16(0);
+    y[i++] = convert_fp32_bf16(-6.0);
+    x[i] = convert_fp32_bf16(0);
+    y[i++] = convert_fp32_bf16(-0.394531);
+    x[i] = convert_fp32_bf16(0);
+    y[i++] = convert_fp32_bf16(-0.594531);
+    x[i] = convert_fp32_bf16(0);
+    y[i++] = convert_fp32_bf16(0.0);
+    x[i] = convert_fp32_bf16(-8);
+    y[i++] = convert_fp32_bf16(0);
+    x[i] = convert_fp32_bf16(0);
+    y[i++] = convert_fp32_bf16(3.0);
+    x[i] = convert_fp32_bf16(-1.0);
+    y[i++] = convert_fp32_bf16(-5.0);
+    x[i] = convert_fp32_bf16(-2.484375);
+    y[i++] = convert_fp32_bf16(-7.531250);
+    x[i++] = convert_fp32_bf16(-125.000000);
+    y[i] = convert_fp32_bf16(5.000000);
+    x[i++] = convert_fp32_bf16(-8.000000);
+    y[i] = convert_fp32_bf16(1.000000);
+    x[i++] = convert_fp32_bf16(19.000000);
+    y[i] = convert_fp32_bf16(1.070312);
+    x[i++] = convert_fp32_bf16(0.498046);
+    y[i] = convert_fp32_bf16(0.000000);
+    x[i] = convert_fp32_bf16(424.000);
+    y[i++] = convert_fp32_bf16(-1.00);
+    x[i] = convert_fp32_bf16(2.484375);
+    y[i++] = 
convert_fp32_bf16(-7.531250); + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(7.531250); + + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(-7.531250); + + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(0.394531); + y[i] = convert_fp32_bf16(-4.000000); + x[i++] = convert_fp32_bf16(-64.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-4.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-40.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-53.000000); + y[i] = convert_fp32_bf16(-9.000000); + x[i++] = convert_fp32_bf16(-91.000000); + y[i] = convert_fp32_bf16(12.000000); + x[i++] = convert_fp32_bf16(-164.000000); + y[i] = convert_fp32_bf16(-20.000000); + x[i++] = convert_fp32_bf16(-320.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-71.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-155.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-247.000000); + y[i] = convert_fp32_bf16(-2.000000); + x[i++] = convert_fp32_bf16(-118.000000); + y[i] = convert_fp32_bf16(-2.000000); + x[i++] = convert_fp32_bf16(-54.000000); + y[i] = convert_fp32_bf16(-5.000000); + x[i++] = convert_fp32_bf16(-392.000000); + y[i] = convert_fp32_bf16(-37.000000); + x[i++] = convert_fp32_bf16(-520.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-19.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-10.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-8.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-2.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-14.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-2.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-6.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-21.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-14.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-17.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-17.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-8.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-4.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-10.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-8.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-14.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-4.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-2.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-41.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-69.000000); + y[i] = convert_fp32_bf16(4.000000); + x[i++] = convert_fp32_bf16(-86.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-41.000000); + y[i] = convert_fp32_bf16(-2.000000); + x[i++] = convert_fp32_bf16(-34.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-6.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-41.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-136.000000); + y[i] = convert_fp32_bf16(-3.000000); + x[i++] = convert_fp32_bf16(-79.000000); + y[i] = 
convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-38.000000); + y[i] = convert_fp32_bf16(5.000000); + x[i++] = convert_fp32_bf16(-173.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-78.000000); + y[i] = convert_fp32_bf16(-2.000000); + x[i++] = convert_fp32_bf16(-60.000000); + y[i] = convert_fp32_bf16(3.000000); + x[i++] = convert_fp32_bf16(-123.000000); + y[i] = convert_fp32_bf16(-9.000000); + x[i++] = convert_fp32_bf16(-280.000000); + y[i] = convert_fp32_bf16(3.000000); + x[i++] = convert_fp32_bf16(-39.000000); + y[i] = convert_fp32_bf16(2.000000); + x[i++] = convert_fp32_bf16(-524.000000); + y[i] = convert_fp32_bf16(11.000000); + x[i++] = convert_fp32_bf16(-376.000000); + y[i] = convert_fp32_bf16(5.000000); + x[i++] = convert_fp32_bf16(-131.000000); + y[i] = convert_fp32_bf16(11.000000); + x[i++] = convert_fp32_bf16(-324.000000); + y[i] = convert_fp32_bf16(9.000000); + x[i++] = convert_fp32_bf16(-125.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-92.000000); + y[i] = convert_fp32_bf16(-7.000000); + x[i++] = convert_fp32_bf16(-233.000000); + y[i] = convert_fp32_bf16(10.000000); + x[i++] = convert_fp32_bf16(-170.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-4.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-4.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-10.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-23.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-6.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-6.000000); + y[i] = convert_fp32_bf16(-3.000000); + x[i++] = convert_fp32_bf16(-37.000000); + + y[i] = convert_fp32_bf16(-9); + x[i++] = convert_fp32_bf16(-1); + + y[i] = convert_fp32_bf16(7.0); + x[i++] = convert_fp32_bf16(-1); + + y[i] = convert_fp32_bf16(0); + x[i++] = convert_fp32_bf16(-1); + } +#else + for (u64 i = 0; i < ifmap_size; i++) { + x[i] = convert_fp32_bf16(5.375000); + y[i] = convert_fp32_bf16(2.203125); + } +#endif + +#ifdef DBG + for (u64 i = 0; i < ifmap_size; i++) { + printf("source[%ld] y %f x %f\n", i, convert_bf16_fp32(y[i]), + convert_bf16_fp32(x[i])); + } +#endif /* ifdef DBG */ +} + +static void testbench(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk) +{ + // TODO: check more shape / align + bmk1880v2_chip_info_t chip_info = bmk1880v2_chip_info(); + + u32 input_n = 1; + u32 input_c = chip_info.npu_num; + u32 input_h = 16; + u32 input_w = 16; + float epsilon = 0.1; + int range_start = -8; + int range_end = 8; + + if (mode == PRE_DATA_COMPARE_FIX) { + input_h = 4; + input_w = 8; + } + + fmt_t fmt = FMT_BF16; + + tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w}; + tl_shape_t ofmap_shape = ifmap_shape; + + // get lut table shape and size + tl_shape_t table_shape; + u64 table_bytesize = bf16_lut_tbl_bytesize(bmk, &table_shape, fmt); + + // get input / output size + u64 ifmap_size = tl_shape_size(&ifmap_shape); + u64 ofmap_size = tl_shape_size(&ofmap_shape); + int data_type_size = bytesize_of_fmt(fmt); + u64 ifmap_bytesize = ifmap_size * data_type_size; + u64 ofmap_bytesize = ofmap_size * data_type_size; + + // atan2 was two inputs + tl_t *tl_ifmap = alloc_tl(bmk, ifmap_shape, fmt, /*align*/1); + tl_t *tl_ifmap2 = alloc_tl(bmk, ifmap_shape, fmt, /*align*/1); + tl_t *tl_ofmap_bf16 = alloc_tl(bmk, ofmap_shape, fmt, /*align*/1); + tl_t *out = tl_ofmap_bf16; + + // atan buf + tl_t *tl_y0_buf = alloc_tl(bmk, table_shape, fmt, 
/*align*/1); + tl_t *tl_invert_buf = alloc_tl(bmk, table_shape, fmt, /*align*/1); + tl_t *tl_pos_neg_buf = alloc_tl(bmk, table_shape, fmt, /*align*/1); + + // reciprocal buf + tl_t *tl_reciprocal_table_answer = + alloc_tl(bmk, table_shape, fmt, /*align*/1); + tl_t *tl_reciprocal_table_answer_mantissa = + alloc_tl(bmk, table_shape, fmt, /*align*/1); + + // temp buf + tl_t *tl_buf = alloc_tl(bmk, ofmap_shape, fmt, /*align*/1); + tl_t *tl_buf2 = alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/1); + tl_t *tl_buf3 = alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/1); + + u16 *input_data = (u16 *)xmalloc(ifmap_bytesize); + u16 *input_data2 = (u16 *)xmalloc(ifmap_bytesize); + u16 *ref_data = (u16 *)xmalloc(ofmap_bytesize); + + // for reciprocal + u16 *table_reciprocal_data = (u16 *)xmalloc(table_bytesize); + u16 *table_reciprocal_data_mantissa = (u16 *)xmalloc(table_bytesize); + + // for atan + u16 *table_data_atan_y0 = (u16 *)xmalloc(table_bytesize); + u16 *table_data_atan_invert = (u16 *)xmalloc(table_bytesize); + u16 *table_data_atan_pos_neg = (u16 *)xmalloc(table_bytesize); + + // for search '0' index + u16 *idx_0_table_data = (u16 *)xmalloc(table_bytesize); + + // init input / ref + // input_data is x, input_data2 is y + gen_input(input_data, input_data2, ifmap_size, mode, range_start, range_end); + tl_lut_ref(ref_data, input_data, input_data2, ifmap_shape); + + // init lut table + bf16_reciprocal_tbl(table_reciprocal_data, table_reciprocal_data_mantissa, + &table_shape); + bf16_atan_tbl(table_data_atan_y0, NULL, table_data_atan_invert, + table_data_atan_pos_neg, &table_shape); + bf16_gen_0_tbl(idx_0_table_data, &table_shape); + + // sys->local + put_bf16_tensor_g2l(ctx, bmk, tl_ifmap, (u16 *)input_data, fmt); + put_bf16_tensor_g2l(ctx, bmk, tl_ifmap2, (u16 *)input_data2, fmt); + put_bf16_tensor_g2l(ctx, bmk, tl_reciprocal_table_answer, + (u16 *)table_reciprocal_data, fmt); + put_bf16_tensor_g2l(ctx, bmk, tl_reciprocal_table_answer_mantissa, + (u16 *)table_reciprocal_data_mantissa, fmt); + + put_bf16_tensor_g2l(ctx, bmk, tl_y0_buf, (u16 *)table_data_atan_y0, fmt); + put_bf16_tensor_g2l(ctx, bmk, tl_invert_buf, (u16 *)table_data_atan_invert, + fmt); + put_bf16_tensor_g2l(ctx, bmk, tl_pos_neg_buf, (u16 *)table_data_atan_pos_neg, + fmt); + + bf16_atan2_merge_emit( + bmk, tl_ifmap2, tl_ifmap, tl_buf, tl_buf2, tl_buf3, tl_y0_buf, + tl_invert_buf, tl_pos_neg_buf, tl_reciprocal_table_answer, + tl_reciprocal_table_answer_mantissa, OUT tl_ofmap_bf16, fmt); + + u16 *ofmap_data = (u16 *)get_bf16_tensor_l2g(ctx, bmk, out, out->fmt); + verify(ofmap_data, ref_data, input_data, input_data2, ifmap_size, epsilon); + + free_tl(bmk, tl_buf3); + free_tl(bmk, tl_buf2); + free_tl(bmk, tl_buf); + free_tl(bmk, tl_reciprocal_table_answer_mantissa); + free_tl(bmk, tl_reciprocal_table_answer); + free_tl(bmk, tl_pos_neg_buf); + free_tl(bmk, tl_invert_buf); + free_tl(bmk, tl_y0_buf); + free_tl(bmk, tl_ofmap_bf16); + free_tl(bmk, tl_ifmap2); + free_tl(bmk, tl_ifmap); + + free(idx_0_table_data); + free(table_data_atan_y0); + free(table_data_atan_invert); + free(table_data_atan_pos_neg); + free(table_reciprocal_data); + free(table_reciprocal_data_mantissa); + free(input_data); + free(ref_data); + free(ofmap_data); + free(input_data2); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + int round_mode; + + round_mode = set_store_feround(); + + test_init(&ctx, &bmk); + + // for (int i = PRE_DATA_COMPARE_FIX; i < TEST_MODE_MAX; i++) + // for (int i = PRE_DATA_COMPARE_FIX; i < DATA_COMPARE_ACCURACY; i++) { + // 
for (int i = DATA_COMPARE_ACCURACY; i < DATA_COMPARE_U8; i++) {
+  // for (int i = DATA_COMPARE_ACCURACY; i < DATA_COMPARE_ACCURACY_X_GT_0; i++)
+  // {
+  for (int i = PRE_DATA_COMPARE_FIX; i < DATA_COMPARE_U8; i++) {
+    mode = static_cast<TEST_MODE>(i);
+    printf("test mode %d...\n", mode);
+    testbench(&ctx, bmk);
+  }
+  printf("pass\n");
+
+  test_exit(&ctx);
+  restore_feround(round_mode);
+  return 0;
+}
diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_atan_kernel.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_atan_kernel.cpp
new file mode 100644
index 000000000..4c5c7cdac
--- /dev/null
+++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_atan_kernel.cpp
@@ -0,0 +1,553 @@
+/**
+ * please refer to [git](https://github.com/xiezhq-hermann/atan_lookup);
+ * the input range is `all real numbers` and the output range is
+ * -pi/2 < x < pi/2, see [here](https://www.mathopenref.com/arctan.html)
+ * for more details
+ */
+//
+// xiezhq@shanghaitech.edu.cn && wanghe@shanghaitech.edu.cn
+/* Reference:
+   [1] Abhisek Ukil, Vishal H Shah, Bernhard Deck,
+   "Fast Computation of arctangent Functions for Embedded Applications: A
+   Comparative Analysis" IEEE International Symposium on Industrial Electronics,
+   Pages: 1206 - 1211, DOI: 10.1109/ISIE.2011.5984330, 2011
+   [2] Sreeraman Rajan, Sichun Wang, Robert Inkol, and Alain Joyal
+   "Efficient Approximations for the Arctangent Function"
+   IEEE SIGNAL PROCESSING MAGAZINE [108] MAY 2006
+ */
+
+
+#include "../1880v2_test_util.h"
+#define OUT
+#define IN
+#include <math.h>
+#include <inttypes.h>
+#include <random>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+//#define DBG
+
+using namespace std;
+
+#if 0
+double atan_double(double x) {
+  /*
+  More precise look-up table is used for higher accuracy
+  */
+  if (x >= 0) {
+    if (x <= 1) {
+      int index = round(x * 100);
+      return (LUT_d[index] + (x * 100 - index) * (LUT_d[index + 1] - LUT_d[index]));
+    } else {
+      double re_x = 1 / x;
+      int index = round(re_x * 100);
+      return (M_PI_2 - (LUT_d[index] + (re_x * 100 - index) * (LUT_d[index + 1] - LUT_d[index])));
+      // No recursive is better here
+    }
+  } else {
+    if (x >= -1) {
+      double abs_x = -x;
+      int index = round(abs_x * 100);
+      return -(LUT_d[index] + (abs_x * 100 - index) * (LUT_d[index + 1] - LUT_d[index]));
+    } else {
+      double re_x = 1 / (-x);
+      int index = round(re_x * 100);
+      return (LUT_d[index] + (re_x * 100 - index) * (LUT_d[index+1] - LUT_d[index])) - M_PI_2;
+    }
+  }
+}
+#endif
+
+/**
+ * pre_data means we test a fixed pattern; it must match the lut
+ */
+enum TEST_MODE {
+  PRE_DATA_COMPARE_FIX = 0, // pre-data + fix compare
+  DATA_COMPARE_ACCURACY,    // generate \range_start to \range_end values that
+                            // are checked against epsilon
+  DATA_COMPARE_U8,          // generate \range_start to \range_end values that
+                            // are checked against epsilon, result bf16->u8
+  TEST_MODE_MAX,
+};
+
+static TEST_MODE mode;
+
+static u16 test_pattern[] = {
+    0x0000, 0x38D2, 0x3952, 0x399D, 0x39D2, 0x3A03, 0x3A1D, 0x3A38, 0x3A52,
+    0x3A6C, 0x3A83, 0x3A90, 0x3A9D, 0x3AAA, 0x3AB8, 0x3AC5, 0x3AD2, 0x3ADF,
+    0x3AEC, 0x3AF9, 0x3B03, 0x3B0A, 0x3B10, 0x3B17, 0x3B1D, 0x3B24, 0x3B2A,
+    0x3B31, 0x3B38, 0x3B3E, 0x3B45, 0x3B4B, 0x3B52, 0x3B58, 0x3B5F, 0x3B65,
+    0x3B6C, 0x3B72, 0x3B79, 0x3B80, 0x3B83, 0x3B86, 0x3B8A, 0x3B8D, 0x3B90,
+    0x3B93, 0x3B97, 0x3B9A, 0x3B9D, 0x3BA1, 0x3BA4, 0x3BA7, 0x3BAA, 0x3BAE,
+    0x3BB1, 0x3BB4, 0x3BB8, 0x3BBB, 0x3BBE, 0x3BC1, 0x3BC5, 0x3BC8, 0x3BCB,
+    0x3BCE, 0x3BD2, 0x3BD5, 0x3BD8, 0x3BDC, 0x3BDF, 0x3BE2, 0x3BE5, 0x3BE9,
+    0x3BEC, 0x3BEF, 0x3BF2, 0x3BF6, 0x3BF9, 0x3BFC, 0x3C00, 0x3C01, 0x3C03,
+    0x3C05, 0x3C06, 0x3C08, 0x3C0A, 0x3C0B, 0x3C0D, 0x3C0F, 
0x3C10, 0x3C12, + 0x3C13, 0x3C15, 0x3C17, 0x3C18, 0x3C1A, 0x3C1C, 0x3C1D, 0x3C1F, 0x3C21, + 0x3C22, 0x3C24, 0x3C25, 0x3C27, 0x3C29, 0x3C2A, 0x3C2C, 0x3C2E, 0x3C2F, + 0x3C31, 0x3C33, 0x3C34, 0x3C36, 0x3C38, 0x3C39, 0x3C3B, 0x3C3C, 0x3C3E, + 0x3C40, 0x3C41, 0x3C43, 0x3C45, 0x3C46, 0x3C48, 0x3C4A, 0x3C4B, 0x3C4D, + 0x3C4E, 0x3C50, 0x3C52, 0x3C53, 0x3C55, 0x3C57, 0x3C58, 0x3C5A, 0x3C5C, + 0x3C5D, 0x3C5F, 0x3C60, 0x3C62, 0x3C64, 0x3C65, 0x3C67, 0x3C69, 0x3C6A, + 0x3C6C, 0x3C6E, 0x3C6F, 0x3C71, 0x3C72, 0x3C74, 0x3C76, 0x3C77, 0x3C79, + 0x3C7B, 0x3C7C, 0x3C7E, 0x3C80, 0x3C81, 0x3C81, 0x3C82, 0x3C83, 0x3C84, + 0x3C85, 0x3C86, 0x3C86, 0x3C87, 0x3C88, 0x3C89, 0x3C8A, 0x3C8A, 0x3C8B, + 0x3C8C, 0x3C8D, 0x3C8E, 0x3C8F, 0x3C8F, 0x3C90, 0x3C91, 0x3C92, 0x3C93, + 0x3C93, 0x3C94, 0x3C95, 0x3C96, 0x3C97, 0x3C98, 0x3C98, 0x3C99, 0x3C9A, + 0x3C9B, 0x3C9C, 0x3C9C, 0x3C9D, 0x3C9E, 0x3C9F, 0x3CA0, 0x3CA1, 0x3CA1, + 0x3CA2, 0x3CA3, 0x3CA4, 0x3CA5, 0x3CA5, 0x3CA6, 0x3CA7, 0x3CA8, 0x3CA9, + 0x3CAA, 0x3CAA, 0x3CAB, 0x3CAC, 0x3CAD, 0x3CAE, 0x3CAE, 0x3CAF, 0x3CB0, + 0x3CB1, 0x3CB2, 0x3CB3, 0x3CB3, 0x3CB4, 0x3CB5, 0x3CB6, 0x3CB7, 0x3CB8, + 0x3CB8, 0x3CB9, 0x3CBA, 0x3CBB, 0x3CBC, 0x3CBC, 0x3CBD, 0x3CBE, 0x3CBF, + 0x3CC0, 0x3CC1, 0x3CC1, 0x3CC2, 0x3CC3, 0x3CC4, 0x3CC5, 0x3CC5, 0x3CC6, + 0x3CC7, 0x3CC8, 0x3CC9, 0x3CCA, 0x3CCA, 0x3CCB, 0x3CCC, 0x3CCD, 0x3CCE, + 0x3CCE, 0x3CCF, 0x3CD0, 0x3CD1, 0x3CD2, 0x3CD3, 0x3CD3, 0x3CD4, 0x3CD5, + 0x3CD6, 0x3CD7, 0x3CD7, 0x3CD8, 0x3CD9, 0x3CDA, 0x3CDB, 0x3CDC, 0x3CDC, + 0x3CDD, 0x3CDE, 0x3CDF, 0x3CE0, 0x3CE0, 0x3CE1, 0x3CE2, 0x3CE3, 0x3CE4, + 0x3CE5, 0x3CE5, 0x3CE6, 0x3CE7, 0x3CE8, 0x3CE9, 0x3CE9, 0x3CEA, 0x3CEB, + 0x3CEC, 0x3CED, 0x3CEE, 0x3CEE, 0x3CEF, 0x3CF0, 0x3CF1, 0x3CF2, 0x3CF2, + 0x3CF3, 0x3CF4, 0x3CF5, 0x3CF6, 0x3CF7, 0x3CF7, 0x3CF8, 0x3CF9, 0x3CFA, + 0x3CFB, 0x3CFB, 0x3CFC, 0x3CFD, 0x3CFE, 0x3CFF, 0x3D00, 0x3D00, 0x3D01, + 0x3D01, 0x3D01, 0x3D02, 0x3D02, 0x3D03, 0x3D03, 0x3D03, 0x3D04, 0x3D04, + 0x3D05, 0x3D05, 0x3D06, 0x3D06, 0x3D06, 0x3D07, 0x3D07, 0x3D08, 0x3D08, + 0x3D08, 0x3D09, 0x3D09, 0x3D0A, 0x3D0A, 0x3D0A, 0x3D0B, 0x3D0B, 0x3D0C, + 0x3D0C, 0x3D0C, 0x3D0D, 0x3D0D, 0x3D0E, 0x3D0E, 0x3D0F, 0x3D0F, 0x3D0F, + 0x3D10, 0x3D10, 0x3D11, 0x3D11, 0x3D11, 0x3D12, 0x3D12, 0x3D13, 0x3D13, + 0x3D13, 0x3D14, 0x3D14, 0x3D15, 0x3D15, 0x3D16, 0x3D16, 0x3D16, 0x3D17, + 0x3D17, 0x3D18, 0x3D18, 0x3D18, 0x3D19, 0x3D19, 0x3D1A, 0x3D1A, 0x3D1A, + 0x3D1B, 0x3D1B, 0x3D1C, 0x3D1C, 0x3D1C, 0x3D1D, 0x3D1D, 0x3D1E, 0x3D1E, + 0x3D1F, 0x3D1F, 0x3D1F, 0x3D20, 0x3D20, 0x3D21, 0x3D21, 0x3D21, 0x3D22, + 0x3D22, 0x3D23, 0x3D23, 0x3D23, 0x3D24, 0x3D24, 0x3D25, 0x3D25, 0x3D25, + 0x3D26, 0x3D26, 0x3D27, 0x3D27, 0x3D28, 0x3D28, 0x3D28, 0x3D29, 0x3D29, + 0x3D2A, 0x3D2A, 0x3D2A, 0x3D2B, 0x3D2B, 0x3D2C, 0x3D2C, 0x3D2C, 0x3D2D, + 0x3D2D, 0x3D2E, 0x3D2E, 0x3D2E, 0x3D2F, 0x3D2F, 0x3D30, 0x3D30, 0x3D31, + 0x3D31, 0x3D31, 0x3D32, 0x3D32, 0x3D33, 0x3D33, 0x3D33, 0x3D34, 0x3D34, + 0x3D35, 0x3D35, 0x3D35, 0x3D36, 0x3D36, 0x3D37, 0x3D37, 0x3D38, 0x3D38, + 0x3D38, 0x3D39, 0x3D39, 0x3D3A, 0x3D3A, 0x3D3A, 0x3D3B, 0x3D3B, 0x3D3C, + 0x3D3C, 0x3D3C, 0x3D3D, 0x3D3D, 0x3D3E, 0x3D3E, 0x3D3E, 0x3D3F, 0x3D3F, + 0x3D40, 0x3D40, 0x3D41, 0x3D41, 0x3D41, 0x3D42, 0x3D42, 0x3D43, 0x3D43, + 0x3D43, 0x3D44, 0x3D44, 0x3D45, 0x3D45, 0x3D45, 0x3D46, 0x3D46, 0x3D47, + 0x3D47, 0x3D47, 0x3D48, 0x3D48, 0x3D49, 0x3D49, 0x3D4A, 0x3D4A, 0x3D4A, + 0x3D4B, 0x3D4B, 0x3D4C, 0x3D4C, 0x3D4C, 0x3D4D, 0x3D4D, 0x3D4E, 0x3D4E, + 0x3D4E, 0x3D4F, 0x3D4F, 0x3D50, 0x3D50, 0x3D50, 0x3D51, 0x3D51, 0x3D52, + 0x3D52, 0x3D53, 0x3D53, 0x3D53, 0x3D54, 0x3D54, 0x3D55, 
0x3D55, 0x3D55, + 0x3D56, 0x3D56, 0x3D57, 0x3D57, 0x3D57, 0x3D58, 0x3D58, 0x3D59, 0x3D59, + 0x3D59, 0x3D5A, 0x3D5A, 0x3D5B, 0x3D5B, 0x3D5C, 0x3D5C, 0x3D5C, 0x3D5D, + 0x3D5D, 0x3D5E, 0x3D5E, 0x3D5E, 0x3D5F, 0x3D5F, 0x3D60, 0x3D60, 0x3D60, + 0x3D61, 0x3D61, 0x3D62, 0x3D62, 0x3D63, 0x3D63, 0x3D63, 0x3D64, 0x3D64, + 0x3D65, 0x3D65, 0x3D65, 0x3D66, 0x3D66, 0x3D67, 0x3D67, 0x3D67, 0x3D68, + 0x3D68, 0x3D69, 0x3D69, 0x3D69, 0x3D6A, 0x3D6A, 0x3D6B, 0x3D6B, 0x3D6C, + 0x3D6C, 0x3D6C, 0x3D6D, 0x3D6D, 0x3D6E, 0x3D6E, 0x3D6E, 0x3D6F, 0x3D6F, + 0x3D70, 0x3D70, 0x3D70, 0x3D71, 0x3D71, 0x3D72, 0x3D72, 0x3D72, 0x3D73, + 0x3D73, 0x3D74, 0x3D74, 0x3D75, 0x3D75, 0x3D75, 0x3D76, 0x3D76, 0x3D77, + 0x3D77, 0x3D77, 0x3D78, 0x3D78, 0x3D79, 0x3D79, 0x3D79, 0x3D7A, 0x3D7A, + 0x3D7B, 0x3D7B, 0x3D7B, 0x3D7C, 0x3D7C, 0x3D7D, 0x3D7D, 0x3D7E, 0x3D7E, + 0x3D7E, 0x3D7F, 0x3D7F, 0x3D80, 0x3D80, 0x3D80, 0x3D80, 0x3D81, 0x3D81, + 0x3D81, 0x3D81, 0x3D81, 0x3D82, 0x3D82, 0x3D82, 0x3D82, 0x3D82, 0x3D83, + 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D84, 0x3D84, 0x3D84, 0x3D84, 0x3D85, + 0x3D85, 0x3D85, 0x3D85, 0x3D85, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D86, + 0x3D87, 0x3D87, 0x3D87, 0x3D87, 0x3D87, 0x3D88, 0x3D88, 0x3D88, 0x3D88, + 0x3D88, 0x3D89, 0x3D89, 0x3D89, 0x3D89, 0x3D89, 0x3D8A, 0x3D8A, 0x3D8A, + 0x3D8A, 0x3D8A, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8C, 0x3D8C, + 0x3D8C, 0x3D8C, 0x3D8C, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8E, 0x3D8E, + 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D90, + 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D91, 0x3D91, 0x3D91, 0x3D91, 0x3D91, + 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D93, 0x3D93, 0x3D93, 0x3D93, + 0x3D93, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D95, 0x3D95, 0x3D95, + 0x3D95, 0x3D96, 0x3D96, 0x3D96, 0x3D96, 0x3D96, 0x3D97, 0x3D97, 0x3D97, + 0x3D97, 0x3D97, 0x3D98, 0x3D98, 0x3D98, 0x3D98, 0x3D98, 0x3D99, 0x3D99, + 0x3D99, 0x3D99, 0x3D99, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9B, + 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, + 0x3D9D, 0x3D9D, 0x3D9D, 0x3D9D, 0x3D9D, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9E, + 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA0, + 0x3DA0, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA2, 0x3DA2, 0x3DA2, + 0x3DA2, 0x3DA2, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA4, 0x3DA4, + 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA6, + 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA7, 0x3DA7, 0x3DA7, 0x3DA7, 0x3DA7, 0x3DA8, + 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, + 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, + 0x3DAB, 0x3DAC, 0x3DAC, 0x3DAC, 0x3DAC, 0x3DAC, 0x3DAD, 0x3DAD, 0x3DAD, + 0x3DAD, 0x3DAD, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAF, 0x3DAF, + 0x3DAF, 0x3DAF, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB1, 0x3DB1, + 0x3DB1, 0x3DB1, 0x3DB1, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB3, + 0x3DB3, 0x3DB3, 0x3DB3, 0x3DB3, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, + 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB6, + 0x3DB6, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB8, 0x3DB8, 0x3DB8, 0x3DB8, + 0x3DB8, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DBA, 0x3DBA, 0x3DBA, + 0x3DBA, 0x3DBA, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBC, 0x3DBC, + 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBD, 0x3DBD, 0x3DBD, 0x3DBD, 0x3DBD, 0x3DBE, + 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, + 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, + 0x3DC2, 0x3DC2, 0x3DC2, 0x3DC2, 0x3DC2, 0x3DC3, 0x3DC3, 
0x3DC3, 0x3DC3, + 0x3DC3, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC5, 0x3DC5, 0x3DC5, + 0x3DC5, 0x3DC5, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC7, 0x3DC7, + 0x3DC7, 0x3DC7, 0x3DC7, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC9, + 0x3DC9, 0x3DC9, 0x3DC9, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCB, + 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCC, 0x3DCC, 0x3DCC, 0x3DCC, 0x3DCC, + 0x3DCD, 0x3DCE, 0x3DCF, 0x3DD0, 0x3DD1, 0x3DD2, 0x3DD3, 0x3DD4, 0x3DD5, + 0x3DD6, 0x3DD7, 0x3DD8, 0x3DD9, 0x3DDA, 0x3DDB, 0x3DDC, 0x3DDD, 0x3DDE, + 0x3DDF, 0x3DE0, 0x3DE1, 0x3DE2, 0x3DE3, 0x3DE4, 0x3DE5, +}; + +static u16 golden_bf16[] = { + 0x0, 0x38d2, 0x3952, 0x399d, 0x39d2, 0x3a03, 0x3a1d, 0x3a38, 0x3a52, + 0x3a6c, 0x3a83, 0x3a90, 0x3a9d, 0x3aaa, 0x3ab8, 0x3ac5, 0x3ad2, 0x3adf, + 0x3aec, 0x3afa, 0x3b03, 0x3b0a, 0x3b10, 0x3b17, 0x3b1d, 0x3b24, 0x3b2a, + 0x3b31, 0x3b38, 0x3b3e, 0x3b45, 0x3b4c, 0x3b52, 0x3b59, 0x3b5f, 0x3b65, + 0x3b6c, 0x3b72, 0x3b7a, 0x3b80, 0x3b83, 0x3b86, 0x3b8a, 0x3b8d, 0x3b90, + 0x3b93, 0x3b97, 0x3b9a, 0x3b9d, 0x3ba1, 0x3ba4, 0x3ba7, 0x3baa, 0x3bae, + 0x3bb1, 0x3bb4, 0x3bb8, 0x3bbb, 0x3bbe, 0x3bc1, 0x3bc5, 0x3bc8, 0x3bcb, + 0x3bce, 0x3bd2, 0x3bd6, 0x3bd8, 0x3bdc, 0x3bdf, 0x3be2, 0x3be6, 0x3be9, + 0x3bec, 0x3bef, 0x3bf2, 0x3bf6, 0x3bf9, 0x3bfc, 0x3c00, 0x3c01, 0x3c03, + 0x3c05, 0x3c06, 0x3c08, 0x3c0a, 0x3c0b, 0x3c0d, 0x3c0f, 0x3c10, 0x3c12, + 0x3c13, 0x3c15, 0x3c17, 0x3c18, 0x3c1a, 0x3c1c, 0x3c1d, 0x3c1f, 0x3c21, + 0x3c22, 0x3c24, 0x3c25, 0x3c27, 0x3c29, 0x3c2a, 0x3c2c, 0x3c2e, 0x3c2f, + 0x3c31, 0x3c33, 0x3c34, 0x3c36, 0x3c38, 0x3c39, 0x3c3b, 0x3c3c, 0x3c3e, + 0x3c40, 0x3c41, 0x3c43, 0x3c45, 0x3c46, 0x3c48, 0x3c4a, 0x3c4b, 0x3c4d, + 0x3c4e, 0x3c50, 0x3c52, 0x3c53, 0x3c55, 0x3c57, 0x3c58, 0x3c5a, 0x3c5c, + 0x3c5d, 0x3c5f, 0x3c60, 0x3c62, 0x3c64, 0x3c66, 0x3c68, 0x3c69, 0x3c6a, + 0x3c6c, 0x3c6e, 0x3c70, 0x3c71, 0x3c72, 0x3c74, 0x3c76, 0x3c78, 0x3c79, + 0x3c7b, 0x3c7c, 0x3c7e, 0x3c80, 0x3c81, 0x3c81, 0x3c82, 0x3c83, 0x3c84, + 0x3c85, 0x3c86, 0x3c86, 0x3c87, 0x3c88, 0x3c89, 0x3c8a, 0x3c8a, 0x3c8b, + 0x3c8c, 0x3c8d, 0x3c8e, 0x3c8f, 0x3c8f, 0x3c90, 0x3c91, 0x3c92, 0x3c93, + 0x3c93, 0x3c94, 0x3c95, 0x3c96, 0x3c97, 0x3c98, 0x3c98, 0x3c99, 0x3c9a, + 0x3c9b, 0x3c9c, 0x3c9c, 0x3c9d, 0x3c9e, 0x3c9f, 0x3ca0, 0x3ca1, 0x3ca1, + 0x3ca2, 0x3ca3, 0x3ca4, 0x3ca5, 0x3ca5, 0x3ca6, 0x3ca7, 0x3ca8, 0x3ca9, + 0x3caa, 0x3caa, 0x3cab, 0x3cac, 0x3cad, 0x3cae, 0x3cae, 0x3caf, 0x3cb0, + 0x3cb1, 0x3cb2, 0x3cb3, 0x3cb3, 0x3cb4, 0x3cb5, 0x3cb6, 0x3cb7, 0x3cb8, + 0x3cb8, 0x3cb9, 0x3cba, 0x3cbb, 0x3cbc, 0x3cbc, 0x3cbd, 0x3cbe, 0x3cbf, + 0x3cc0, 0x3cc1, 0x3cc1, 0x3cc2, 0x3cc3, 0x3cc4, 0x3cc5, 0x3cc5, 0x3cc6, + 0x3cc7, 0x3cc8, 0x3cc9, 0x3cca, 0x3cca, 0x3ccb, 0x3ccc, 0x3ccd, 0x3cce, + 0x3cce, 0x3ccf, 0x3cd0, 0x3cd1, 0x3cd2, 0x3cd3, 0x3cd3, 0x3cd4, 0x3cd5, + 0x3cd6, 0x3cd7, 0x3cd7, 0x3cd8, 0x3cd9, 0x3cda, 0x3cdb, 0x3cdc, 0x3cdc, + 0x3cdd, 0x3cde, 0x3cdf, 0x3ce0, 0x3ce0, 0x3ce1, 0x3ce2, 0x3ce3, 0x3ce4, + 0x3ce5, 0x3ce5, 0x3ce6, 0x3ce7, 0x3ce8, 0x3ce9, 0x3ce9, 0x3cea, 0x3ceb, + 0x3cec, 0x3ced, 0x3cee, 0x3cee, 0x3cef, 0x3cf0, 0x3cf1, 0x3cf2, 0x3cf2, + 0x3cf3, 0x3cf4, 0x3cf5, 0x3cf6, 0x3cf7, 0x3cf7, 0x3cf8, 0x3cf9, 0x3cfa, + 0x3cfb, 0x3cfb, 0x3cfc, 0x3cfd, 0x3cfe, 0x3cff, 0x3d00, 0x3d00, 0x3d01, + 0x3d01, 0x3d01, 0x3d02, 0x3d02, 0x3d03, 0x3d03, 0x3d03, 0x3d04, 0x3d04, + 0x3d05, 0x3d05, 0x3d06, 0x3d06, 0x3d06, 0x3d07, 0x3d07, 0x3d08, 0x3d08, + 0x3d08, 0x3d09, 0x3d09, 0x3d0a, 0x3d0a, 0x3d0a, 0x3d0b, 0x3d0b, 0x3d0c, + 0x3d0c, 0x3d0c, 0x3d0d, 0x3d0d, 0x3d0e, 0x3d0e, 0x3d0f, 0x3d0f, 0x3d0f, + 0x3d10, 0x3d10, 0x3d11, 0x3d11, 0x3d11, 
0x3d12, 0x3d12, 0x3d13, 0x3d13, + 0x3d13, 0x3d14, 0x3d14, 0x3d15, 0x3d15, 0x3d16, 0x3d16, 0x3d16, 0x3d17, + 0x3d17, 0x3d18, 0x3d18, 0x3d18, 0x3d19, 0x3d19, 0x3d1a, 0x3d1a, 0x3d1a, + 0x3d1b, 0x3d1b, 0x3d1c, 0x3d1c, 0x3d1c, 0x3d1d, 0x3d1d, 0x3d1e, 0x3d1e, + 0x3d1f, 0x3d1f, 0x3d1f, 0x3d20, 0x3d20, 0x3d21, 0x3d21, 0x3d21, 0x3d22, + 0x3d22, 0x3d23, 0x3d23, 0x3d23, 0x3d24, 0x3d24, 0x3d25, 0x3d25, 0x3d25, + 0x3d26, 0x3d26, 0x3d27, 0x3d27, 0x3d28, 0x3d28, 0x3d28, 0x3d29, 0x3d29, + 0x3d2a, 0x3d2a, 0x3d2a, 0x3d2b, 0x3d2b, 0x3d2c, 0x3d2c, 0x3d2c, 0x3d2d, + 0x3d2d, 0x3d2e, 0x3d2e, 0x3d2e, 0x3d2f, 0x3d2f, 0x3d30, 0x3d30, 0x3d31, + 0x3d31, 0x3d31, 0x3d32, 0x3d32, 0x3d33, 0x3d33, 0x3d33, 0x3d34, 0x3d34, + 0x3d35, 0x3d35, 0x3d35, 0x3d36, 0x3d36, 0x3d37, 0x3d37, 0x3d38, 0x3d38, + 0x3d38, 0x3d39, 0x3d39, 0x3d3a, 0x3d3a, 0x3d3a, 0x3d3b, 0x3d3b, 0x3d3c, + 0x3d3c, 0x3d3c, 0x3d3d, 0x3d3d, 0x3d3e, 0x3d3e, 0x3d3e, 0x3d3f, 0x3d3f, + 0x3d40, 0x3d40, 0x3d41, 0x3d41, 0x3d41, 0x3d42, 0x3d42, 0x3d43, 0x3d43, + 0x3d43, 0x3d44, 0x3d44, 0x3d45, 0x3d45, 0x3d45, 0x3d46, 0x3d46, 0x3d47, + 0x3d47, 0x3d47, 0x3d48, 0x3d48, 0x3d49, 0x3d49, 0x3d4a, 0x3d4a, 0x3d4a, + 0x3d4b, 0x3d4b, 0x3d4c, 0x3d4c, 0x3d4c, 0x3d4d, 0x3d4d, 0x3d4e, 0x3d4e, + 0x3d4e, 0x3d4f, 0x3d4f, 0x3d50, 0x3d50, 0x3d50, 0x3d51, 0x3d51, 0x3d52, + 0x3d52, 0x3d53, 0x3d53, 0x3d53, 0x3d54, 0x3d54, 0x3d55, 0x3d55, 0x3d55, + 0x3d56, 0x3d56, 0x3d57, 0x3d57, 0x3d57, 0x3d58, 0x3d58, 0x3d59, 0x3d59, + 0x3d59, 0x3d5a, 0x3d5a, 0x3d5b, 0x3d5b, 0x3d5c, 0x3d5c, 0x3d5c, 0x3d5d, + 0x3d5d, 0x3d5e, 0x3d5e, 0x3d5e, 0x3d5f, 0x3d5f, 0x3d60, 0x3d60, 0x3d60, + 0x3d60, 0x3d60, 0x3d61, 0x3d61, 0x3d62, 0x3d62, 0x3d62, 0x3d63, 0x3d63, + 0x3d64, 0x3d64, 0x3d64, 0x3d65, 0x3d65, 0x3d66, 0x3d66, 0x3d66, 0x3d67, + 0x3d67, 0x3d68, 0x3d68, 0x3d68, 0x3d69, 0x3d69, 0x3d6a, 0x3d6a, 0x3d6b, + 0x3d6b, 0x3d6b, 0x3d6c, 0x3d6c, 0x3d6d, 0x3d6d, 0x3d6d, 0x3d6e, 0x3d6e, + 0x3d6f, 0x3d6f, 0x3d6f, 0x3d70, 0x3d70, 0x3d71, 0x3d71, 0x3d71, 0x3d72, + 0x3d72, 0x3d73, 0x3d73, 0x3d74, 0x3d74, 0x3d74, 0x3d75, 0x3d75, 0x3d76, + 0x3d76, 0x3d76, 0x3d77, 0x3d77, 0x3d78, 0x3d78, 0x3d78, 0x3d79, 0x3d79, + 0x3d7a, 0x3d7a, 0x3d7a, 0x3d7b, 0x3d7b, 0x3d7c, 0x3d7c, 0x3d7d, 0x3d7d, + 0x3d7d, 0x3d7e, 0x3d7e, 0x3d7f, 0x3d7f, 0x3d7f, 0x3d7f, 0x3d81, 0x3d81, + 0x3d81, 0x3d81, 0x3d81, 0x3d82, 0x3d82, 0x3d82, 0x3d82, 0x3d82, 0x3d83, + 0x3d83, 0x3d83, 0x3d83, 0x3d83, 0x3d84, 0x3d84, 0x3d84, 0x3d84, 0x3d85, + 0x3d85, 0x3d85, 0x3d85, 0x3d85, 0x3d86, 0x3d86, 0x3d86, 0x3d86, 0x3d86, + 0x3d87, 0x3d87, 0x3d87, 0x3d87, 0x3d87, 0x3d88, 0x3d88, 0x3d88, 0x3d88, + 0x3d88, 0x3d89, 0x3d89, 0x3d89, 0x3d89, 0x3d89, 0x3d8a, 0x3d8a, 0x3d8a, + 0x3d8a, 0x3d8a, 0x3d8b, 0x3d8b, 0x3d8b, 0x3d8b, 0x3d8b, 0x3d8c, 0x3d8c, + 0x3d8c, 0x3d8c, 0x3d8c, 0x3d8d, 0x3d8d, 0x3d8d, 0x3d8d, 0x3d8e, 0x3d8e, + 0x3d8e, 0x3d8e, 0x3d8e, 0x3d8f, 0x3d8f, 0x3d8f, 0x3d8f, 0x3d8f, 0x3d90, + 0x3d90, 0x3d90, 0x3d90, 0x3d90, 0x3d91, 0x3d91, 0x3d91, 0x3d91, 0x3d91, + 0x3d92, 0x3d92, 0x3d92, 0x3d92, 0x3d92, 0x3d93, 0x3d93, 0x3d93, 0x3d93, + 0x3d93, 0x3d94, 0x3d94, 0x3d94, 0x3d94, 0x3d94, 0x3d95, 0x3d95, 0x3d95, + 0x3d95, 0x3d96, 0x3d96, 0x3d96, 0x3d96, 0x3d96, 0x3d97, 0x3d97, 0x3d97, + 0x3d97, 0x3d97, 0x3d98, 0x3d98, 0x3d98, 0x3d98, 0x3d98, 0x3d99, 0x3d99, + 0x3d99, 0x3d99, 0x3d99, 0x3d99, 0x3d99, 0x3d99, 0x3d99, 0x3d99, 0x3d9a, + 0x3d9a, 0x3d9a, 0x3d9a, 0x3d9a, 0x3d9b, 0x3d9b, 0x3d9b, 0x3d9b, 0x3d9b, + 0x3d9c, 0x3d9c, 0x3d9c, 0x3d9c, 0x3d9c, 0x3d9d, 0x3d9d, 0x3d9d, 0x3d9d, + 0x3d9e, 0x3d9e, 0x3d9e, 0x3d9e, 0x3d9e, 0x3d9f, 0x3d9f, 0x3d9f, 0x3d9f, + 0x3d9f, 0x3da0, 0x3da0, 0x3da0, 0x3da0, 
0x3da0, 0x3da1, 0x3da1, 0x3da1,
+    0x3da1, 0x3da1, 0x3da2, 0x3da2, 0x3da2, 0x3da2, 0x3da2, 0x3da3, 0x3da3,
+    0x3da3, 0x3da3, 0x3da3, 0x3da4, 0x3da4, 0x3da4, 0x3da4, 0x3da4, 0x3da5,
+    0x3da5, 0x3da5, 0x3da5, 0x3da6, 0x3da6, 0x3da6, 0x3da6, 0x3da6, 0x3da7,
+    0x3da7, 0x3da7, 0x3da7, 0x3da7, 0x3da8, 0x3da8, 0x3da8, 0x3da8, 0x3da8,
+    0x3da9, 0x3da9, 0x3da9, 0x3da9, 0x3da9, 0x3daa, 0x3daa, 0x3daa, 0x3daa,
+    0x3daa, 0x3dab, 0x3dab, 0x3dab, 0x3dab, 0x3dab, 0x3dac, 0x3dac, 0x3dac,
+    0x3dac, 0x3dac, 0x3dad, 0x3dad, 0x3dad, 0x3dad, 0x3dad, 0x3daf, 0x3daf,
+    0x3daf, 0x3daf, 0x3db0, 0x3db0, 0x3db0, 0x3db0, 0x3db0, 0x3db1, 0x3db1,
+    0x3db1, 0x3db1, 0x3db1, 0x3db2, 0x3db2, 0x3db2, 0x3db2, 0x3db2, 0x3db3,
+    0x3db3, 0x3db3, 0x3db3, 0x3db3, 0x3db4, 0x3db4, 0x3db4, 0x3db4, 0x3db4,
+    0x3db5, 0x3db5, 0x3db5, 0x3db5, 0x3db5, 0x3db6, 0x3db6, 0x3db6, 0x3db6,
+    0x3db6, 0x3db7, 0x3db7, 0x3db7, 0x3db7, 0x3db8, 0x3db8, 0x3db8, 0x3db8,
+    0x3db8, 0x3db9, 0x3db9, 0x3db9, 0x3db9, 0x3db9, 0x3dba, 0x3dba, 0x3dba,
+    0x3dba, 0x3dba, 0x3dbb, 0x3dbb, 0x3dbb, 0x3dbb, 0x3dbb, 0x3dbc, 0x3dbc,
+    0x3dbc, 0x3dbc, 0x3dbc, 0x3dbd, 0x3dbd, 0x3dbd, 0x3dbd, 0x3dbd, 0x3dbe,
+    0x3dbe, 0x3dbe, 0x3dbe, 0x3dbe, 0x3dbf, 0x3dbf, 0x3dbf, 0x3dbf, 0x3dbf,
+    0x3dc0, 0x3dc0, 0x3dc0, 0x3dc0, 0x3dc1, 0x3dc1, 0x3dc1, 0x3dc1, 0x3dc1,
+    0x3dc1, 0x3dc1, 0x3dc1, 0x3dc1, 0x3dc1, 0x3dc2, 0x3dc2, 0x3dc2, 0x3dc2,
+    0x3dc2, 0x3dc3, 0x3dc3, 0x3dc3, 0x3dc3, 0x3dc3, 0x3dc4, 0x3dc4, 0x3dc4,
+    0x3dc4, 0x3dc4, 0x3dc5, 0x3dc5, 0x3dc5, 0x3dc5, 0x3dc5, 0x3dc6, 0x3dc6,
+    0x3dc6, 0x3dc6, 0x3dc6, 0x3dc7, 0x3dc7, 0x3dc7, 0x3dc7, 0x3dc7, 0x3dc8,
+    0x3dc8, 0x3dc8, 0x3dc8, 0x3dc9, 0x3dc9, 0x3dc9, 0x3dc9, 0x3dc9, 0x3dca,
+    0x3dca, 0x3dca, 0x3dca, 0x3dca, 0x3dcb, 0x3dcb, 0x3dcb, 0x3dcb, 0x3dcb,
+    0x3dcc, 0x3dcd, 0x3dce, 0x3dcf, 0x3dd0, 0x3dd1, 0x3dd2, 0x3dd3, 0x3dd4,
+    0x3dd5, 0x3dd6, 0x3dd7, 0x3dd8, 0x3dd9, 0x3dda, 0x3ddb, 0x3ddc, 0x3ddd,
+    0x3dde, 0x3ddf, 0x3de0, 0x3de1, 0x3de2, 0x3de3, 0x3de4,
+};
+
+// compare results against golden data (fixed mode) or reference within the
+// given tolerance
+static bool verify(u16 *ofmap_data, u16 *ref_data, u16 *input_data,
+                   u64 ifmap_size, float tolerant_max)
+{
+  for (u64 i = 0; i < ifmap_size; i++) {
+    if (mode == PRE_DATA_COMPARE_FIX) {
+      if (ofmap_data[i] != golden_bf16[i]) {
+        printf("comparing failed at ofmap_data[%" PRIu64 "] got %x exp %x\n",
+               i, ofmap_data[i], golden_bf16[i]);
+        exit(-1);
+      }
+    } else {
+      float got = convert_bf16_fp32(ofmap_data[i]);
+      float exp = convert_bf16_fp32(ref_data[i]);
+      if (fabs(got - exp) >= tolerant_max) {
+        printf("comparing failed at ofmap_data[%" PRIu64 "] got %f exp %f "
+               "(input %f)\n", i, got, exp,
+               convert_bf16_fp32(input_data[i]));
+        exit(-1);
+      }
+    }
+  }
+
+  return true;
+}
+
+static void gen_input(u16 *input_data, u64 ifmap_size, TEST_MODE mode,
+                      int range_start, int range_end)
+{
+  if (mode == PRE_DATA_COMPARE_FIX) {
+    memcpy(input_data, &test_pattern, sizeof(test_pattern));
+  } else {
+    std::random_device rd;
+    std::mt19937 e2(rd());
+    std::uniform_real_distribution<> dist(range_start, range_end);
+    int table_hw = 256;
+    for (u64 i = 0; i < ifmap_size; i++) {
+      // input range is -8 ~ +8
+      float input = ((int)i % (range_end - 2)) * (((int)i % 2) ? 1 : -1)
+                    + 0.03 + (i % table_hw) * 0.002;
+      // float input = ((int)i % 10) * (((int)i % 2) ? 1 : -1) + 0.03 + (i %
+      // table_hw) * 0.002; float input = dist(e2); input = ((int)i %
+      // (range_end-2)) * (((int)i % 2) ? 
1 : 1) + 0.03 + (i % table_hw) * + // 0.002; if (input < 1 && input > 0) { + // input = 111.9; + //} + input_data[i] = convert_fp32_bf16(input); + } + input_data[0] = convert_fp32_bf16(0); + input_data[1] = convert_fp32_bf16(1); + input_data[2] = convert_fp32_bf16(-1); + } + +#ifdef DBG + for (u64 i = 0; i < ifmap_size; i++) { + printf("source if[%" PRIu64 "] bf16 %f 0x%x, log2f is %f\n", i, + convert_bf16_fp32(input_data[i]), input_data[i], + floor(log2((convert_bf16_fp32(input_data[i]))))); + } +#endif /* ifdef DBG */ +} + +static void testbench(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk) +{ + // TODO: check more shape / align + bmk1880v2_chip_info_t chip_info = bmk1880v2_chip_info(); + + u32 input_n = 1; + u32 input_c = chip_info.npu_num; + u32 input_h = 16; + u32 input_w = 16; + float epsilon = 0.01; + int range_start = -8; + int range_end = 8; + fmt_t fmt = FMT_BF16; + + if (mode == PRE_DATA_COMPARE_FIX) { + input_h = 4; + input_w = 8; + } + + tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w}; + tl_shape_t ofmap_shape = ifmap_shape; + u64 ifmap_size = tl_shape_size(&ifmap_shape); + u64 ofmap_size = tl_shape_size(&ofmap_shape); + + int data_type_size = bytesize_of_fmt(fmt); + u64 ifmap_bytesize = ifmap_size * data_type_size; + u64 ofmap_bytesize = ofmap_size * data_type_size; + + // get lut table shape and size + tl_shape_t table_shape; + u64 table_bytesize = bf16_lut_tbl_bytesize(bmk, &table_shape, fmt); + + tl_t *tl_ifmap = alloc_tl(bmk, ifmap_shape, fmt, /*align*/1); + tl_t *tl_ofmap_bf16 = alloc_tl(bmk, ofmap_shape, fmt, /*align*/1); + tl_t *out = tl_ofmap_bf16; + + // atan buf + tl_t *tl_y0_buf = alloc_tl(bmk, table_shape, fmt, /*align*/1); + tl_t *tl_slope_buf = alloc_tl(bmk, table_shape, fmt, /*align*/1); + tl_t *tl_invert_buf = alloc_tl(bmk, table_shape, fmt, /*align*/1); + tl_t *tl_pos_neg_buf = alloc_tl(bmk, table_shape, fmt, /*align*/1); + + // reciprocal buf + tl_t *tl_reciprocal_table_answer = + alloc_tl(bmk, table_shape, fmt, /*align*/1); + tl_t *tl_reciprocal_table_answer_mantissa = + alloc_tl(bmk, table_shape, fmt, /*align*/1); + + // temp buf + tl_t *tl_buf = alloc_tl(bmk, ofmap_shape, fmt, /*align*/1); + tl_t *tl_buf2 = alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/1); + tl_t *tl_buf4 = alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/1); + + u16 *input_data = (u16 *)xmalloc(ifmap_bytesize); + u16 *ref_data = (u16 *)xmalloc(ofmap_bytesize); + + // for reciprocal + u16 *table_reciprocal_data = (u16 *)xmalloc(table_bytesize); + u16 *table_reciprocal_data_mantissa = (u16 *)xmalloc(table_bytesize); + + // for atan + u16 *table_data_atan_y0 = (u16 *)xmalloc(table_bytesize); + u16 *table_data_atan_slope = (u16 *)xmalloc(table_bytesize); + u16 *table_data_atan_invert = (u16 *)xmalloc(table_bytesize); + u16 *table_data_atan_pos_neg = (u16 *)xmalloc(table_bytesize); + + gen_input(input_data, ifmap_size, mode, range_start, range_end); + tl_lut_ref(ref_data, input_data, ifmap_shape); + + bf16_reciprocal_tbl(table_reciprocal_data, table_reciprocal_data_mantissa, + &table_shape); + bf16_atan_tbl(table_data_atan_y0, table_data_atan_slope, + table_data_atan_invert, table_data_atan_pos_neg, &table_shape); + + put_bf16_tensor_g2l(ctx, bmk, tl_ifmap, (u16 *)input_data, fmt); + put_bf16_tensor_g2l(ctx, bmk, tl_reciprocal_table_answer, + (u16 *)table_reciprocal_data, fmt); + put_bf16_tensor_g2l(ctx, bmk, tl_reciprocal_table_answer_mantissa, + (u16 *)table_reciprocal_data_mantissa, fmt); + + // prepare atan + put_bf16_tensor_g2l(ctx, bmk, tl_y0_buf, (u16 *)table_data_atan_y0, 
fmt);
+  put_bf16_tensor_g2l(ctx, bmk, tl_slope_buf, (u16 *)table_data_atan_slope,
+                      fmt);
+  put_bf16_tensor_g2l(ctx, bmk, tl_invert_buf, (u16 *)table_data_atan_invert,
+                      fmt);
+  put_bf16_tensor_g2l(ctx, bmk, tl_pos_neg_buf, (u16 *)table_data_atan_pos_neg,
+                      fmt);
+
+  bf16_atan_emit(bmk, tl_ifmap, tl_buf, tl_buf2, tl_buf4, tl_y0_buf,
+                 tl_slope_buf, tl_invert_buf, tl_pos_neg_buf,
+                 tl_reciprocal_table_answer,
+                 tl_reciprocal_table_answer_mantissa, tl_ofmap_bf16, fmt);
+
+  test_submit(ctx);
+
+  u16 *ofmap_data = (u16 *)get_bf16_tensor_l2g(ctx, bmk, out, out->fmt);
+  verify(ofmap_data, ref_data, input_data, ifmap_size, epsilon);
+
+  free_tl(bmk, tl_buf4);
+  free_tl(bmk, tl_buf2);
+  free_tl(bmk, tl_buf);
+  free_tl(bmk, tl_reciprocal_table_answer_mantissa);
+  free_tl(bmk, tl_reciprocal_table_answer);
+  free_tl(bmk, tl_pos_neg_buf);
+  free_tl(bmk, tl_invert_buf);
+  free_tl(bmk, tl_slope_buf);
+  free_tl(bmk, tl_y0_buf);
+  free_tl(bmk, tl_ofmap_bf16);
+  free_tl(bmk, tl_ifmap);
+
+  free(table_data_atan_y0);
+  free(table_data_atan_slope);
+  free(table_data_atan_invert);
+  free(table_data_atan_pos_neg);
+  free(table_reciprocal_data);
+  free(table_reciprocal_data_mantissa);
+  free(input_data);
+  free(ref_data);
+  free(ofmap_data);
+}
+
+int main()
+{
+  CVI_RT_HANDLE ctx;
+  bmk_ctx_t *bmk;
+  int round_mode;
+
+  round_mode = set_store_feround();
+
+  test_init(&ctx, &bmk);
+
+  // for (int i = PRE_DATA_COMPARE_FIX; i < TEST_MODE_MAX; i++)
+  // for (int i = PRE_DATA_COMPARE_FIX; i < DATA_COMPARE_ACCURACY; i++)
+  for (int i = PRE_DATA_COMPARE_FIX; i < DATA_COMPARE_U8; i++)
+  // for (int i = DATA_COMPARE_ACCURACY; i < DATA_COMPARE_U8; i++)
+  {
+    mode = static_cast<decltype(mode)>(i);
+    printf("test mode %d...\n", mode);
+    testbench(&ctx, bmk);
+  }
+  printf("pass\n");
+
+  test_exit(&ctx);
+  restore_feround(round_mode);
+  return 0;
+}
diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_avg_pooling.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_avg_pooling.cpp
new file mode 100644
index 000000000..e4cfeebd6
--- /dev/null
+++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_avg_pooling.cpp
@@ -0,0 +1,305 @@
+#include "../1880v2_test_util.h"
+
+typedef bmk1880v2_tiu_average_pooling_param_t param_t;
+int random_seed;
+static void print_pooling_param(const param_t *p)
+{
+  int in = p->ifmap->shape.n;
+  int ic = p->ifmap->shape.c;
+  int ih = p->ifmap->shape.h;
+  int iw = p->ifmap->shape.w;
+
+  printf("  Pooling parameters:\n");
+  printf("    random_seed : %d \n", random_seed);
+  printf("    ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw);
+  printf("    opd0_sign = %d\n", p->ifmap->fmt == FMT_I8);
+  printf("    weight = (%d, %d)\n", p->kh, p->kw);
+  printf("    padding = (%d, %d, %d, %d)\n",
+         p->pad_top, p->pad_bottom, p->pad_left, p->pad_right);
+  printf("    stride = (%d, %d)\n", p->stride_h, p->stride_w);
+  printf("    ins0 = (%d, %d, %d, %d)\n",
+         p->ins_h, p->ins_last_h, p->ins_w, p->ins_last_w);
+  printf("    avg_pooling_const = %d\n", p->avg_pooling_const);
+  printf("    rshift_bits = %d\n", p->rshift_bits);
+}
+static int index_get(int h, int w1, int w2)
+{
+  return h * w1 + w2;
+}
+
+int native_pooling_avg_bf16(
+    const u16* i_fmap,
+    const void* weight,
+    const u32 *bias,
+    u16 * o_fmap,
+    int input_n, int input_c, int input_h, int input_w,
+    int kh, int kw,
+    int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r,
+    int stride_h, int stride_w,
+    int ins_h, int ins_w,
+    int ins_h_last, int ins_w_last,
+    int const_weight)
+{
+  if (kh * kw <= 0)
+    return BM_ERR_INVALID_ARGUMENT;
+
+  float *avg_pooling_mac_a = (float *)malloc(kh * kw * sizeof(float));
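+  /* Aside: every element below goes through convert_fp32_bf16() /
+   * convert_bf16_fp32() from 1880v2_test_util.h. For reference, here is a
+   * minimal standalone sketch of the common bf16 encoding; it assumes
+   * round-to-nearest-even and ignores NaN edge cases, so the SDK helpers
+   * may differ in detail.
+   */
+#if 0
+#include <stdint.h>
+#include <string.h>
+
+static uint16_t fp32_to_bf16(float f)
+{
+  uint32_t u;
+  memcpy(&u, &f, sizeof(u));            // fp32 bit pattern
+  u += 0x7fff + ((u >> 16) & 1);        // round to nearest, ties to even
+  return (uint16_t)(u >> 16);           // keep sign, exponent, top 7 mantissa bits
+}
+
+static float bf16_to_fp32(uint16_t h)
+{
+  uint32_t u = (uint32_t)h << 16;       // bf16 is the high half of an fp32
+  float f;
+  memcpy(&f, &u, sizeof(f));
+  return f;
+}
+#endif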
+ float *avg_pooling_mac_b = (float *)malloc(kh * kw * sizeof(float)); + + u16 avg_const_weight = *(u16 *)weight; + const u16 *weight_arr = (u16*)weight; + + int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); + int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); + + int output_h = calc_output_hw(h_after, kh, stride_h); + int output_w = calc_output_hw(w_after, kw, stride_w); + u16 *i_fmap_pad = NULL; + for (int n = 0; n < input_n; n++) { + if (const_weight == 0) + weight_arr = (u16*)weight; + + for (int c = 0; c < input_c; ++c) { + fill_pad_fmap_bf16(i_fmap, &i_fmap_pad, convert_fp32_bf16(0), + pad_w_l, pad_w_r, pad_h_t, pad_h_b, + ins_h, ins_w, ins_h_last, ins_w_last, + input_h, input_w); + + for (int ph = 0; ph < output_h; ++ph) { + for (int pw = 0; pw < output_w; ++pw) { + int hstart = ph * stride_h; + int wstart = pw * stride_w; + int pool_index = index_get(ph, output_w, pw); + int mac_index = 0; + float avg_pool_result=0; + for (int h = 0; h < kh; h++) { + for (int w = 0; w < kw; w++) { + int index = index_get((hstart+h), w_after, (w+wstart)); + mac_index = index_get(h, kw, w); + float a = convert_bf16_fp32(i_fmap_pad[index]); + float b = const_weight ? + convert_bf16_fp32(avg_const_weight) : convert_bf16_fp32(weight_arr[mac_index]); + + avg_pool_result += a*b; + } + } + + if(bias) { + avg_pool_result += convert_hex_fp32(bias[c]); + } + *(o_fmap+pool_index) = convert_fp32_bf16(avg_pool_result); + } + } + i_fmap += input_w * input_h; + if (const_weight == 0) + weight_arr += kh * kw; + + o_fmap += output_w * output_h; + } + } + free(i_fmap_pad); + + free(avg_pooling_mac_a); + free(avg_pooling_mac_b); + + return BM_SUCCESS; +} + +static u16 *alloc_input(param_t *p) +{ + u64 size = tl_shape_size(&p->ifmap->shape); + u16 *data = (u16 *)xmalloc(size*2); + if (!data) + return NULL; + + for (u64 i = 0; i < size; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX/2; //100 ~ -100 + val = (float)(rand()-RAND_MAX2)*1000 / (float)RAND_MAX; + data[i] = convert_fp32_bf16(val);//rand() % 256 - 128; + } + return data; +} + +static u16 *alloc_output(param_t *p) +{ + u64 size = tl_shape_size(&p->ofmap->shape); + return (u16 *)xmalloc(size*2); +} + +static int pooling_ih_ext(param_t *p, int ih) +{ + int ins = p->ins_h; + int ins_last = p->ins_last_h; + int pad = p->pad_top + p->pad_bottom; + return (ih - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_iw_ext(param_t *p, int iw) +{ + int ins = p->ins_w; + int ins_last = p->ins_last_w; + int pad = p->pad_left + p->pad_right; + return (iw - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_oh(param_t *p, int ih) +{ + int ih_ext = pooling_ih_ext(p, ih); + return (ih_ext - p->kh) / p->stride_h + 1; +} + +static int pooling_ow(param_t *p, int iw) +{ + int iw_ext = pooling_iw_ext(p, iw); + return (iw_ext - p->kw) / p->stride_w + 1; +} + +static void free_pooling_param( + bmk_ctx_t *ctx, + param_t *p) +{ + if (p->ifmap) + free_tl(ctx, p->ifmap); + if (p->ofmap) + free_tl(ctx, p->ofmap); +} + +static param_t random_pooling_param(bmk_ctx_t *ctx) +{ + param_t p; + + memset(&p, 0, sizeof(p)); + +retry: + random_seed = clock(); + srand(random_seed); + + int in = rand() % 5 + 1; + int ic = rand() % (3 * BM1880V2_HW_NPU_NUM) + 1; + int ih = rand() % 30 + 3; + int iw = rand() % 30 + 6; + + p.kh = rand() % 7 + 1; + p.kw = rand() % 7 + 1; + p.stride_h = rand() % p.kh + 1; + p.stride_w = rand() % p.kw + 1; + p.ins_h = rand() % p.kh; + p.ins_w = rand() % p.kw; + p.ins_last_h = rand() % p.kh; + 
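+  /* The retry check at the end of this function leans on pooling_ih_ext()
+   * and pooling_oh() defined above:
+   *   ih_ext = (ih - 1) * (ins_h + 1) + ins_last_h + 1 + pad_top + pad_bottom
+   *   oh     = (ih_ext - kh) / stride_h + 1
+   * A standalone sanity check of those formulas with made-up numbers
+   * (purely illustrative):
+   */
+#if 0
+#include <assert.h>
+int main(void)
+{
+  int ih = 5, ins_h = 1, ins_last_h = 0, pad = 2, kh = 3, stride_h = 2;
+  int ih_ext = (ih - 1) * (ins_h + 1) + ins_last_h + 1 + 2 * pad; // 8 + 1 + 4 = 13
+  int oh = (ih_ext - kh) / stride_h + 1;                          // (13 - 3) / 2 + 1 = 6
+  assert(ih_ext == 13 && oh == 6);
+  return 0;
+}
+#endif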
p.ins_last_w = rand() % p.kw; + p.pad_top = rand() % p.kh; + p.pad_bottom = rand() % p.kh; + p.pad_left = rand() % p.kw; + p.pad_right= rand() % p.kw; + p.rshift_bits = rand() % 32; + p.avg_pooling_const = convert_fp32_bf16(rand()%0x1000);//rand() % 256; + tl_shape_t ifmap_shape; + ifmap_shape.n = in; + ifmap_shape.c = ic; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + + int on = in; + int oc = ic; + int oh = pooling_oh(&p, ih); + int ow = pooling_ow(&p, iw); + tl_shape_t ofmap_shape; + ofmap_shape.n = on; + ofmap_shape.c = oc; + ofmap_shape.h = oh; + ofmap_shape.w = ow; + + p.ofmap = bmk1880v2_lmem_alloc_tensor(ctx, ofmap_shape, FMT_BF16, 1); + p.ifmap = bmk1880v2_lmem_alloc_tensor(ctx, ifmap_shape, FMT_BF16, 1); + + if ((p.kh > pooling_ih_ext(&p, ih)) + || (p.kw > pooling_iw_ext(&p, iw)) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || !p.ofmap + || !p.ifmap) { + printf("retry init_pooling_param\n"); + free_pooling_param(ctx, &p); + goto retry; + } + + return p; +} + +static void compare_results( + param_t *p, + u16 input[], + u16 output[]) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + + u16 *output_ref = alloc_output(p); + p->avg_pooling_const = convert_fp32_bf16(convert_bf16_fp32(p->avg_pooling_const)/(p->kh * p->kw)); + bmerr_t ret = native_pooling_avg_bf16( + input, &p->avg_pooling_const, NULL, output_ref, + in, ic, ih, iw, p->kh, p->kw, + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right, + p->stride_h, p->stride_w, + p->ins_h, p->ins_w, p->ins_last_h, p->ins_last_w,1 + ); + assert(ret == BM_SUCCESS); + int cmp_res = array_cmp_int8( + "Comparing results ...\n", (s8*)output_ref, (s8*) output, + tl_shape_size(&p->ofmap->shape)*2); + + if (cmp_res != 0) { + printf("Comparison FAILED!!!\n"); + print_pooling_param(p); + exit(-1); + } + + free(output_ref); +} + +static int test_pooling(CVI_RT_HANDLE ctx, bmk_ctx_t *bk_ctx) +{ + param_t p = random_pooling_param(bk_ctx); +// print_pooling_param(&p); + + u16 *input = alloc_input(&p); + + put_bf16_tensor_g2l(&ctx, bk_ctx, p.ifmap, (u16 *)input, FMT_BF16); + bmk1880v2_tiu_average_pooling(bk_ctx, &p); + u16 *output = (u16 *)get_bf16_tensor_l2g(&ctx, bk_ctx, p.ofmap, FMT_BF16); + + compare_results(&p, input, output); + + free_pooling_param(bk_ctx, &p); + free(output); + free(input); + + return 1; +} + +static void test_avg_pooling(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx) +{ + int test_finished_num = 0; + for (u64 i = 0; i < 20; i++) + test_finished_num += test_pooling(*ctx, bk_ctx); + printf("Test finished %d\n", test_finished_num); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + int round_mode; + round_mode = set_store_feround(); + test_avg_pooling(&ctx, bk_ctx); + restore_feround(round_mode); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_conv.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_conv.cpp new file mode 100644 index 000000000..864c81831 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_conv.cpp @@ -0,0 +1,693 @@ +#include "../1880v2_test_util.h" +//#include +//#undef printf +//#define printf(...) 
{}
+
+typedef struct {
+  int random_seed;
+  int input_n;
+  int input_c;
+  int input_h;
+  int input_w;
+  int kw;
+  int kh;
+  int dh;
+  int dw;
+  int pad_top;
+  int pad_bot;
+  int pad_left;
+  int pad_right;
+  int ins_h;
+  int ins_h_last;
+  int ins_w;
+  int ins_w_last;
+  int stride_h;
+  int stride_w;
+  int output_c;
+  int using_bias;
+  int bReLU_EN;
+  int r_shift_m;
+  int opd0_sign;
+  int opd1_sign;
+  int opd2_sign;
+} conv_param_t;
+
+static void print_conv_param(const conv_param_t *p);
+
+static inline void bf16_relu(float *buf, u64 size)
+{
+  for (u64 i = 0; i < size; i++)
+    if (buf[i] < 0)
+      buf[i] = 0;
+}
+
+static int conv_ref(
+    const conv_param_t *p_param,
+    const u16 *ifmap,
+    const u16 *weight,
+    const u32 *bias,
+    u16 *ofmap)
+{
+  int in = p_param->input_n;
+  int ic = p_param->input_c;
+  int ih = p_param->input_h;
+  int iw = p_param->input_w;
+  int oc = p_param->output_c;
+  int kh = p_param->kh;
+  int kw = p_param->kw;
+  int dh = p_param->dh;
+  int dw = p_param->dw;
+  int stride_h = p_param->stride_h;
+  int stride_w = p_param->stride_w;
+  int pad_top = p_param->pad_top;
+  int pad_bot = p_param->pad_bot;
+  int pad_left = p_param->pad_left;
+  int pad_right = p_param->pad_right;
+  int ins_h = p_param->ins_h;
+  int ins_h_last = p_param->ins_h_last;
+  int ins_w = p_param->ins_w;
+  int ins_w_last = p_param->ins_w_last;
+  int do_relu = p_param->bReLU_EN;
+
+  int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot);
+  int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right);
+  int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0);
+  int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0);
+  int oh = calc_output_hw(ih_ext, kh_ext, stride_h);
+  int ow = calc_output_hw(iw_ext, kw_ext, stride_w);
+
+  float *result = (float *)malloc(sizeof(float) * in * oc * oh * ow);
+  if (!result)
+    return BM_ERR_FAILURE;
+
+  memset(result, 0, sizeof(float) * in * oc * oh * ow);
+  int ret = BM_SUCCESS;
+
+  for (int n = 0; n < in; ++n) {
+    for (int c = 0; c < oc; ++c) {
+      u16 *i_fmap_pad[ic];
+      u16 *kernel_pad[ic];
+
+      for (int iic = 0; iic < ic; ++iic) {
+        i_fmap_pad[iic] = NULL;
+        kernel_pad[iic] = NULL;
+        fill_pad_fmap_bf16(
+            (ifmap + n*ic*ih*iw + iic*ih*iw), &i_fmap_pad[iic], convert_fp32_bf16(0),
+            pad_left, pad_right, pad_top, pad_bot,
+            ins_h, ins_w, ins_h_last, ins_w_last, ih, iw);
+        //kernel_dilation(
+        fill_pad_fmap_bf16(
+            (weight + c*ic*kh*kw + iic*kh*kw), &kernel_pad[iic], convert_fp32_bf16(0),
+            0, 0, 0, 0, // no padding
+            dh - 1, dw - 1, 0, 0,
+            kh, kw);
+      }
+      for (int ph = 0; ph < oh; ++ph) {
+        for (int pw = 0; pw < ow; ++pw) {
+          float result_val = result[n*oc*oh*ow + c*oh*ow + ph*ow + pw];
+          for (int idxh = 0; idxh < kh_ext; ++idxh) {
+            for (int idxw = 0; idxw < kw_ext; ++idxw) {
+              for (int iic = 0; iic < ic; ++iic){
+                float ifv = convert_bf16_fp32(i_fmap_pad[iic][(idxh+ph*stride_h) * iw_ext + idxw + pw*stride_w]);
+                float ikv = convert_bf16_fp32(kernel_pad[iic][idxh* kw_ext + idxw]);
+                result_val += ifv*ikv;
+              }
+            }
+          }
+          result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] = result_val;
+        }
+      }
+
+      if (p_param->using_bias) {
+        for (int ph = 0; ph < oh; ++ph) {
+          for (int pw = 0; pw < ow; ++pw) {
+            result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += convert_hex_fp32(bias[c]); //bias+c ;
+          }
+        }
+      }
+
+      if (do_relu)
+        bf16_relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow);
+
+      for (int i = 0; i < oh * ow; i++)
+        ofmap[n*oc*oh*ow + c*oh*ow + i] =
+            convert_fp32_bf16(result[n*oc*oh*ow + c*oh*ow + i]);
+
+      for (int iic = 0; iic < ic; ++iic) {
+        free(i_fmap_pad[iic]);
+        free(kernel_pad[iic]);
+      }
+    }
+  }
+  free(result);
+
+  return ret;
+}
+
+static u16 * transform_weight(const tl_shape_t *s, u16 before[])
+{
+  u32 ic = s->n;
+  u32 oc = s->c;
+  u32 kh = s->h;
+  u32 kw = s->w;
+
+  u32 size = ic * oc * kh * kw;
+  u16 *after = (u16 *)malloc(sizeof(u16) * size);
+
+  /*
+   * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic)
+   */
+  for (u32 oci =
0; oci < oc; oci++) { + for (u32 ici = 0; ici < ic; ici++) { + for (u32 khi = 0; khi < kh; khi++) { + for (u32 kwi = 0; kwi < kw; kwi++) { + u32 src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + u32 dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + u16 *data) +{ + const tl_shape_t *s = &tl->shape; + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw*2); + CVI_RT_MEM dev_mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = CVI_RT_MemGetPAddr(dev_mem); + u16 *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. CVI_RT_MemCopyS2D regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. + */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //CVI_RT_MEM ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = CVI_RT_MemCopyS2D(*ctx, ab_dev_mem, (u8*)transformed_data); + int ret = CVI_RT_MemCopyS2D(*ctx, dev_mem, (u8*)transformed_data); + + assert(ret == BM_SUCCESS); + tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_BF16; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1880v2_tensor_tgmem_default_stride(tdma_tg.shape, FMT_BF16); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1880v2_tensor_lmem_default_stride(bk_ctx, tdma_shape, FMT_BF16, 0); + + bmk1880v2_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1880v2_tdma_g2l_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + CVI_RT_MemFree(*ctx, dev_mem); + + free(transformed_data); +} + + +static u16 * transform_bias(int oc, u32 before[]) +{ + u16 *after = (u16 *)malloc(sizeof(u16) * 2 * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = (before[i] >> 16) & 0xffff; + after[i + oc] = before[i] & 0xffff; + } + return after; +} + +static void put_conv_bias( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + u32 *data) +{ + + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2 * sizeof(short), oc, 1, 1); + CVI_RT_MEM dev_mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = CVI_RT_MemGetPAddr(dev_mem); + u16 *transformed_data = transform_bias(oc, data); + int ret = CVI_RT_MemCopyS2D(*ctx, dev_mem, (u8 *)transformed_data); + free(transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_BF16; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1880v2_tensor_tgmem_default_stride(tg.shape, FMT_BF16); + + bmk1880v2_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1880v2_tdma_g2l_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + CVI_RT_MemFree(*ctx, dev_mem); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) 
* (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static u16 * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + u16 *buf = (u16 *)malloc(sizeof(u16) * size); + for (int i = 0; i < size; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX/2; //5 ~ -5 + val = (float)(rand()-RAND_MAX2)*5 / (float)RAND_MAX; + buf[i] = convert_fp32_bf16(val); + } + return buf; +} + +static u16 * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + u16 *buf = (u16 *)malloc(sizeof(u16) * size); + for (int i = 0; i < size; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX/2; // 5 ~ -5 + val = (float)(rand()-RAND_MAX2)*5 / (float)RAND_MAX; + buf[i] = convert_fp32_bf16(val); + } + + return buf; +} + +static u32 * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + u32 *bias = (u32 *)malloc(sizeof(u32) * oc); + for (int i = 0; i < oc; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX/2; // 5 ~ -5 + val = (float)(rand()-RAND_MAX2)*5 / (float)RAND_MAX; + bias[i] = convert_fp32_hex(val); + } + + return bias; +} + +static tl_t * conv_ifmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = FMT_BF16;//p->opd0_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 1); +} + +static tl_t * conv_weight_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = FMT_BF16;//p->opd1_sign? 
FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static tl_t * conv_ofmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + tl_shape_t s; + fmt_t fmt = FMT_BF16; + s.n = p->input_n; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 1); +} + +static tl_t * conv_bias_tensor( + bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = FMT_BF16; + tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const bmk1880v2_tiu_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1880v2_tiu_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + + dst->ifmap = conv_ifmap_tensor(ctx, p); + dst->weight = conv_weight_tensor(ctx, p); + dst->ofmap = conv_ofmap_tensor(ctx, p); + dst->bias = NULL; + dst->ps32_mode = 0; + if (p->using_bias) + dst->bias = conv_bias_tensor(ctx, p); + dst->w_is_const = 0; +} + +static void free_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1880v2_tiu_convolution_param_t *r, + const conv_param_t *p) +{ + if (p->using_bias && r->bias) + free_tl(ctx, r->bias); + if (r->ofmap) + free_tl(ctx, r->ofmap); + if (r->weight) + free_tl(ctx, r->weight); + if (r->ifmap) + free_tl(ctx, r->ifmap); +} + +static void init_conv_param(conv_param_t &p) +{ + printf("init_conv_param\n"); + memset(&p, 0, sizeof(p)); + p.random_seed = clock(); + srand(p.random_seed); + +retry: + + p.input_n = rand() % 5 + 1; + p.input_c = rand() % (5 * 32) + 1; + p.kh = rand() % 7 + 1; + p.kw = rand() % 7 + 1; + p.input_h = rand() % 40 + p.kh; + p.input_w = rand() % 40 + p.kw; + p.output_c = rand() % 10 + 3; + p.stride_h = rand() % (p.kh) + 1; + p.stride_w = rand() % (p.kw) + 1; + p.ins_h = rand() % p.kh; + p.ins_w = rand() % p.kw; + p.ins_h_last = rand() % p.kh;; + p.ins_w_last = rand() % p.kw;; + p.dh = rand() % 3 + 1; + p.dw = rand() % 3 + 1; + int kh_ext = conv_kh_ext(&p); + int kw_ext = conv_kw_ext(&p); + p.pad_top = rand() % kh_ext; + p.pad_bot = rand() % kh_ext; + p.pad_left = rand() % kw_ext; + p.pad_right = rand() % kw_ext; + + if (!conv_param_is_ok(&p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p.using_bias = rand() % 2; + p.bReLU_EN = rand() % 2; + + p.opd0_sign = rand() % 2; 
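+  /* conv_param_is_ok() above only accepts a draw when the dilated kernel
+   * still fits inside the diluted + padded input and every pad fits the
+   * 4-bit hardware field. The extents it compares, worked through with
+   * made-up numbers (illustrative only):
+   */
+#if 0
+#include <assert.h>
+int main(void)
+{
+  int kh = 3, dh = 2;
+  int kh_ext = (kh - 1) * dh + 1;       // 5 effective taps after dilation
+  int ih = 10, ins_h = 0, ins_h_last = 0, pad_top = 2, pad_bot = 2;
+  int ih_ext = (ih - 1) * (ins_h + 1) + ins_h_last + 1 + pad_top + pad_bot; // 14
+  int stride_h = 2;
+  int oh = (ih_ext - kh_ext) / stride_h + 1;  // (14 - 5) / 2 + 1 = 5
+  assert(kh_ext == 5 && ih_ext == 14 && oh == 5);
+  return 0;
+}
+#endif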
+ p.opd1_sign = 1; + p.opd2_sign = 1; + + assert(p.opd1_sign == 1 && p.opd2_sign == 1); + + int ih_ext = conv_ih_ext(&p); + int iw_ext = conv_iw_ext(&p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); +} + +#if 1 +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", "p->input_w = ", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} +#endif + +static int test_conv( + conv_param_t &p_param, CVI_RT_HANDLE ctx, bmk_ctx_t *bk_ctx) +{ + u16 *input = alloc_input(&p_param); + u16 *weight = alloc_weight(&p_param); + u32 *bias = alloc_bias(&p_param); + + //print_conv_param(&p_param); + + u16 *output_ref = (u16 *)malloc(sizeof(u16) * conv_output_size(&p_param)); + if (!output_ref) + return BM_ERR_FAILURE; + + bmerr_t ret = conv_ref(&p_param, input, weight, bias, output_ref); + + assert(ret == BM_SUCCESS); + + bmk1880v2_tiu_convolution_param_t conv_param; + make_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + if (tl_alloc_success) { + put_bf16_tensor_g2l(&ctx, bk_ctx, conv_param.ifmap, (u16 *)input, FMT_BF16); + put_conv_weight(&ctx, bk_ctx, conv_param.weight, (u16 *)weight); + if (p_param.using_bias) + put_conv_bias(&ctx, bk_ctx, conv_param.bias, bias); + bmk1880v2_tiu_convolution(bk_ctx, &conv_param); + u16 *output = (u16 *) get_bf16_tensor_l2g(&ctx, bk_ctx, conv_param.ofmap, FMT_BF16 ); + + int has_error = array_cmp_int8( + "Comparing results ...\n", + (s8*)output_ref, (s8*)output, conv_output_size(&p_param)*2); + + if (has_error) { + print_conv_param(&p_param); + printf("Comparison FAILED\n"); + exit(-1); + } + free(output); + } + + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + free(input); + free(weight); + free(output_ref); + free(bias); + return tl_alloc_success ? 
1 : 0; +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + int round_mode; + round_mode = set_store_feround(); + int test_finished_num = 0; + + for (int i = 0; i < 20; i++) { + printf("random_test_conv iteration: %d\n", i); + conv_param_t test_conv_param; + init_conv_param(test_conv_param); + + test_finished_num += test_conv(test_conv_param, ctx, bk_ctx); + + if (!test_conv_param.using_bias) + test_conv_param.using_bias = 1; + + if (test_conv_param.output_c <= 32) + { + test_conv_param.output_c += 32; + } + test_finished_num += test_conv(test_conv_param, ctx, bk_ctx); + } + restore_feround(round_mode); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_conv_ps32.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_conv_ps32.cpp new file mode 100644 index 000000000..431a0fcd9 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_conv_ps32.cpp @@ -0,0 +1,1077 @@ +#include "../1880v2_test_util.h" +typedef struct { + int random_seed; + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int stride_w; + int output_c; + int using_bias; + int bReLU_EN; + int r_shift_m; + int opd0_sign; + int opd1_sign; + int opd2_sign; + int bf16_enable; +} conv_param_t; + +static inline void bf16_relu(float *buf, u64 size) +{ + for (u64 i = 0; i < size; i++) + if (buf[i] < 0) + buf[i] = 0; +} + +static int ps32_conv_ref( + const conv_param_t *p_param, + const u16 *ifmap, + const u16 *weight, + const u32 *bias, + u16 *ofmap, int ps32_mode) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + float *result = (float *)malloc(sizeof(float) * in * oc * oh * ow); + if (!result) + return BM_ERR_FAILURE; + + u32 bstride = in * oc * oh * ow; + int ret = BM_SUCCESS; + + if (ps32_mode == 2 || ps32_mode == 0) + memset(result, 0, sizeof(float) * in * oc * oh * ow); + else { + for (int i = 0; i < in * oc * oh * ow; i++) { + result[i] = convert_hex_fp32((ofmap[i + bstride * 0] << 16) | ofmap[i + bstride * 1]); + } + } + + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + u16 *i_fmap_pad[ic]; + u16 *kernel_pad[ic]; + + for (int iic = 0; iic < ic; ++iic) { + i_fmap_pad[iic] = NULL; + kernel_pad[iic] = NULL; + fill_pad_fmap_bf16( + (ifmap + n*ic*ih*iw + iic*ih*iw), &i_fmap_pad[iic], convert_fp32_bf16(0), + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + //kernel_dilation( + fill_pad_fmap_bf16( + (weight + 
c*ic*kh*kw + iic*kh*kw), &kernel_pad[iic], convert_fp32_bf16(0), + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + } + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + float result_val= result[n*oc*oh*ow + c*oh*ow + ph*ow + pw]; + for (int idxh = 0; idxh < kh_ext; ++idxh) { + for (int idxw = 0; idxw < kw_ext; ++idxw) { + for (int iic = 0; iic < ic; ++iic){ + float ifv = convert_bf16_fp32(i_fmap_pad[iic][(idxh+ph*stride_h) * iw_ext + idxw + pw*stride_w]); + float ikv = convert_bf16_fp32(kernel_pad[iic][idxh* kw_ext + idxw]); + result_val += ifv*ikv; + } + } + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] = result_val; + } + } + for(int i = 0; i < ic; i++) { + free(i_fmap_pad[i]); + free(kernel_pad[i]); + } + } //end for (int c = 0; c < oc; ++c) + } + + if( ps32_mode & 0x2) { + for (int i = 0; i < in * oc * oh * ow; i ++) { + ofmap[i] = convert_fp32_hex(result[i]) >> 16; + ofmap[bstride + i] = convert_fp32_hex(result[i]) & 0xFFFF; + } + } else { + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += convert_hex_fp32(bias[c]); //bias+c ; + } + } + } + if (p_param->bReLU_EN) + bf16_relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + } + } + for (int i = 0; i < in * oc * oh * ow; i++) { + ofmap[i] = convert_fp32_bf16(result[i]); + } + } + free(result); + return ret; +} + +static u16 * transform_weight(const tl_shape_t *s, u16 before[]) +{ + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + u32 size = ic * oc * kh * kw; + u16 *after = (u16 *)malloc(sizeof(u16) * size); + + /* + * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) + */ + for (u32 oci = 0; oci < oc; oci++) { + for (u32 ici = 0; ici < ic; ici++) { + for (u32 khi = 0; khi < kh; khi++) { + for (u32 kwi = 0; kwi < kw; kwi++) { + u32 src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + u32 dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + u16 *data) +{ + const tl_shape_t *s = &tl->shape; + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw*2); + CVI_RT_MEM dev_mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = CVI_RT_MemGetPAddr(dev_mem); + u16 *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. CVI_RT_MemCopyS2D regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. 
+ */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //CVI_RT_MEM ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = CVI_RT_MemCopyS2D(*ctx, ab_dev_mem, (u8*)transformed_data); + int ret = CVI_RT_MemCopyS2D(*ctx, dev_mem, (u8*)transformed_data); + assert(ret == BM_SUCCESS); + tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_BF16; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1880v2_tensor_tgmem_default_stride(tdma_tg.shape, FMT_BF16); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1880v2_tensor_lmem_default_stride(bk_ctx, tdma_shape, FMT_BF16, 0); + + bmk1880v2_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1880v2_tdma_g2l_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + CVI_RT_MemFree(*ctx, dev_mem); + free(transformed_data); +} + +static u16 * transform_bias(int oc, u32 before[]) +{ + u16 *after = (u16 *)malloc(sizeof(u16) * 2 * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = (before[i] >> 16) & 0xFFFF; + after[i + oc] = before[i] & 0xFFFF; + } + return after; +} + +static void put_conv_bias( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + u32 *data) +{ + + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2 * sizeof(short), oc, 1, 1); + CVI_RT_MEM dev_mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = CVI_RT_MemGetPAddr(dev_mem); + u16 *transformed_data = transform_bias(oc, data); + int ret = CVI_RT_MemCopyS2D(*ctx, dev_mem, (u8 *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_BF16; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1880v2_tensor_tgmem_default_stride(tg.shape, FMT_BF16); + + bmk1880v2_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1880v2_tdma_g2l_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + CVI_RT_MemFree(*ctx, dev_mem); + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic 
* kh * kw; +} + +static u16 * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + u16 *buf = (u16 *)malloc(sizeof(u16) * size); + for (int i = 0; i < size; i++) { + buf[i] = convert_fp32_bf16(i); + } + + return buf; +} + +static u16 * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + u16 *buf = (u16 *)malloc(sizeof(u16) * size); + for (int i = 0; i < size; i++) { + buf[i] = convert_fp32_bf16(i); + } + + return buf; +} + +static u32 * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + u32 *bias = (u32 *)malloc(sizeof(u32) * oc); + float val = 100; + for (int i = 0; i < oc; i++) { + bias[i] = convert_fp32_hex(val); + val += 1; + } + return bias; +} + +static tl_t * conv_ifmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = FMT_BF16; + tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 1); +} + +static uint32_t conv_ifmap_tensor_size(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = FMT_BF16; + tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return bmk1880v2_lmem_tensor_to_size(ctx, s, fmt, 1); +} + +static tl_t * conv_weight_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = FMT_BF16; + tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static uint32_t conv_weight_tensor_size(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = FMT_BF16; + tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return bmk1880v2_lmem_tensor_to_size(ctx, s, fmt, 0); +} + +static tl_t * conv_ofmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + tl_shape_t s; + s.n = p->input_n; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return bmk1880v2_lmem_alloc_ps32_tensor(ctx, s, FMT_BF16, 1); +} + +static uint32_t conv_ofmap_tensor_size(bmk_ctx_t *ctx, const conv_param_t *p) +{ + tl_shape_t s; + s.n = p->input_n * sizeof(u32) / sizeof(u16); + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return bmk1880v2_lmem_tensor_to_size(ctx, s, FMT_BF16, 1); +} + +static tl_t * conv_bias_tensor( + bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = FMT_BF16; + tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static uint32_t conv_bias_tensor_size( + bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = FMT_BF16; + tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return bmk1880v2_lmem_tensor_to_size(ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const bmk1880v2_tiu_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + if(p->ps32_mode==1) + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param_ps32( + bmk_ctx_t *ctx, + 
bmk1880v2_tiu_convolution_param_t *dst, + const conv_param_t *p, u32 ps32_mode) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + if(ps32_mode==2) + { + u32 ifmap_size = conv_ifmap_tensor_size(ctx, p); + u32 weight_size = conv_weight_tensor_size(ctx, p); + u32 ofmap_size = conv_ofmap_tensor_size(ctx, p); + u32 bias_size = p->using_bias ? conv_bias_tensor_size(ctx, p) : 0; + u32 total_size = ifmap_size + weight_size + ofmap_size + bias_size; + + // Allocation if size fit. + // Assertion check in bmk1880v2_lmem_alloc_ps32_tensor(). + bmk1880v2_chip_info_t chip_info = bmk1880v2_chip_info(); + if (total_size <= chip_info.lmem_size) { + dst->ifmap = conv_ifmap_tensor(ctx, p); + dst->weight = conv_weight_tensor(ctx, p); + dst->ofmap = conv_ofmap_tensor(ctx, p); + } else { + dst->ifmap = nullptr; + dst->weight = nullptr; + dst->ofmap = nullptr; + } + } + + dst->ps32_mode = ps32_mode; + + dst->bias = NULL; + dst->relu_enable = 0; + dst->rshift_bits = 0; + if(ps32_mode==1) + { + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + // only mode=1 can use bias + if (p->using_bias) + dst->bias = conv_bias_tensor(ctx, p); + } +} + +static void make_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1880v2_tiu_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + dst->ifmap = conv_ifmap_tensor(ctx, p); + dst->weight = conv_weight_tensor(ctx, p); + dst->ofmap = conv_ofmap_tensor(ctx, p); + dst->bias = NULL; + dst->ps32_mode = 0; + if (p->using_bias) + dst->bias = conv_bias_tensor(ctx, p); + +} + +static void free_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1880v2_tiu_convolution_param_t *r, + const conv_param_t *p) +{ + if (p->using_bias && r->bias) + free_tl(ctx, r->bias); + + if (r->ofmap) + free_tl(ctx, r->ofmap); + + if (r->weight) + free_tl(ctx, r->weight); + + if (r->ifmap) + free_tl(ctx, r->ifmap); +} + +static void init_conv_param(conv_param_t &p) +{ + printf("init_conv_param\n"); + memset(&p, 0, sizeof(p)); + p.random_seed = clock(); + srand(p.random_seed); + +retry: + + p.input_n = 1; + p.input_c = rand() % (10) + 2; + p.kh = rand() % 6 + 1; + p.kw = rand() % 6 + 1; + p.input_h = rand() % 10 + p.kh; + p.input_w = rand() % 10 + p.kw; + p.output_c = rand() % 10 + 3; + p.stride_h = rand() % (p.kh) + 1; + p.stride_w = rand() % (p.kw) + 1; + p.ins_h = rand() % p.kh; + p.ins_w = rand() % p.kw; + p.ins_h_last = rand() % p.kh;; + p.ins_w_last = rand() % p.kw;; + p.dh = rand() % 3 + 1; + p.dw = rand() % 3 + 1; + + int kh_ext = conv_kh_ext(&p); + int kw_ext = conv_kw_ext(&p); + p.pad_top = rand() % kh_ext; + p.pad_bot = rand() % kh_ext; + p.pad_left = rand() % kw_ext; + p.pad_right = rand() % kw_ext; + + if (!conv_param_is_ok(&p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p.using_bias = 
rand() % 2; + p.r_shift_m = rand() % 8; + p.bReLU_EN = rand() % 2; + p.opd0_sign = rand() % 2; + p.opd1_sign = 1; + p.opd2_sign = 1; + + assert(p.opd1_sign == 1 && p.opd2_sign == 1); + + int ih_ext = conv_ih_ext(&p); + int iw_ext = conv_iw_ext(&p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); +} + +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", "p->input_w = ", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} + +static int test_ps32_ut( + conv_param_t &p_param, CVI_RT_HANDLE ctx, bmk_ctx_t *bk_ctx) +{ + printf("test_ps32_ut\n"); + u16 *input = alloc_input(&p_param); + u16 *weight = alloc_weight(&p_param); + u32 *bias = alloc_bias(&p_param); + u16 *output_ref = (u16 *)malloc(sizeof(u16) * conv_output_size(&p_param) * sizeof(short)); + if (!output_ref) + return BM_ERR_FAILURE; + + bmerr_t ret = ps32_conv_ref(&p_param, input, weight, bias, output_ref, 2); + assert(ret == BM_SUCCESS); + bmk1880v2_tiu_convolution_param_t conv_param; + make_bmk_conv_param_ps32(bk_ctx, &conv_param, &p_param, 2); + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + if (tl_alloc_success) { + + put_bf16_tensor_g2l(&ctx, bk_ctx, conv_param.ifmap, (u16 *)input, FMT_BF16); + put_conv_weight(&ctx, bk_ctx, conv_param.weight, (u16 *)weight); + bmk1880v2_tiu_convolution(bk_ctx, &conv_param); + bmk1880v2_tensor_lmem_t ps32_ofmap; + ps32_ofmap = *conv_param.ofmap; + ps32_ofmap.shape.n = ps32_ofmap.shape.n * sizeof(short); + u16 *output = (u16*) get_bf16_tensor_l2g(&ctx, bk_ctx, &ps32_ofmap, FMT_BF16); + + int has_error = array_cmp_int8( + "Comparing M2 begin_mode results ...\n", + (s8*)output_ref, (s8 *)output, conv_output_size(&p_param) * sizeof(int)); + + if (has_error) { + print_conv_param(&p_param); + printf("Comparison M2 FAILED\n"); + exit(-1); + } else + printf("Comparison M2 PASS\n"); + + free(output); + } + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + 
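+  /* In modes 2 and 3, ps32_conv_ref() keeps the running sum in full fp32 and
+   * stores the bit pattern as two u16 planes (high word in plane 0, low word
+   * at a plane stride of n*oc*oh*ow), which is why the M2/M3 readbacks in
+   * this test double shape.n before copying out. A minimal sketch of that
+   * lossless pack/unpack round trip:
+   */
+#if 0
+#include <stdint.h>
+#include <string.h>
+#include <assert.h>
+int main(void)
+{
+  float sum = 123.456f;
+  uint32_t bits;
+  memcpy(&bits, &sum, sizeof(bits));
+  uint16_t hi = (uint16_t)(bits >> 16);           // plane 0
+  uint16_t lo = (uint16_t)(bits & 0xFFFF);        // plane 1
+  uint32_t back = ((uint32_t)hi << 16) | lo;      // reassemble
+  float sum2;
+  memcpy(&sum2, &back, sizeof(sum2));
+  assert(sum2 == sum);                            // exact, no precision loss
+  return 0;
+}
+#endif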
printf("test_ps32_intermediate_mode\n"); + for (int i=0; i < conv_input_size(&p_param); i++) + input[i] = convert_fp32_bf16(i); + + for (int i=0; i < conv_weight_size(&p_param); i++) + weight[i] = convert_fp32_bf16(i); + + ret = ps32_conv_ref(&p_param, input, weight, bias, output_ref, 3); + assert(ret == BM_SUCCESS); + + make_bmk_conv_param_ps32(bk_ctx, &conv_param, &p_param, 3); + tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + + if (tl_alloc_success) { + put_bf16_tensor_g2l(&ctx, bk_ctx, conv_param.ifmap, (u16 *)input, FMT_BF16); + put_conv_weight(&ctx, bk_ctx, conv_param.weight, (u16 *)weight); + + bmk1880v2_tiu_convolution(bk_ctx, &conv_param); + + bmk1880v2_tensor_lmem_t ps32_ofmap; + ps32_ofmap = *conv_param.ofmap; + ps32_ofmap.shape.n = ps32_ofmap.shape.n * sizeof(short); + + u16 *output = (u16*) get_bf16_tensor_l2g(&ctx, bk_ctx, &ps32_ofmap, FMT_BF16); + + int has_error = array_cmp_int8( + "Comparing M3 intermediate results ...\n", + (s8*)output_ref, (s8 *)output, conv_output_size(&p_param) * sizeof(int)); + + if (has_error) { + print_conv_param(&p_param); + printf("Comparison M3 FAILED\n"); + exit(-1); + } else + printf("Comparison M3 PASS\n"); + + free(output); + } + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + printf("test_ps32_end_mode\n"); + for (int i=0; i < conv_input_size(&p_param); i++) + input[i] = convert_fp32_bf16(i); + + for (int i=0; i < conv_weight_size(&p_param); i++) + weight[i] = convert_fp32_bf16(i); + + ret = ps32_conv_ref(&p_param, input, weight, bias, output_ref, 1); + assert(ret == BM_SUCCESS); + + make_bmk_conv_param_ps32(bk_ctx, &conv_param, &p_param, 1); + + tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + + if (tl_alloc_success) { + + put_bf16_tensor_g2l(&ctx, bk_ctx, conv_param.ifmap, (u16 *)input, FMT_BF16); + put_conv_weight(&ctx, bk_ctx, conv_param.weight, (u16 *)weight); + if (p_param.using_bias) { + put_conv_bias(&ctx, bk_ctx, conv_param.bias, bias); + } + bmk1880v2_tiu_convolution(bk_ctx, &conv_param); + u16 *output = (u16*) get_bf16_tensor_l2g(&ctx, bk_ctx, conv_param.ofmap, FMT_BF16); + + int has_error = array_cmp_int8( + "Comparing M1 end results ...\n", + (s8*)output_ref, (s8 *)output, conv_output_size(&p_param) * 2); + + if (has_error) { + print_conv_param(&p_param); + printf("Comparison M1 FAILED\n"); + exit(-1); + } else + printf("Comparison M1 PASS\n"); + + free(output); + } + + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + free(input); + free(weight); + free(bias); + free(output_ref); + + return tl_alloc_success ? 
1 : 0; +} + +static int test_ic_tiling_conv( + conv_param_t &p_param, CVI_RT_HANDLE ctx, bmk_ctx_t *bk_ctx) +{ + printf("test tiled ps32 conv\n"); + u16 *input = alloc_input(&p_param); + u16 *weight = alloc_weight(&p_param); + u32 *bias = alloc_bias(&p_param); + p_param.r_shift_m = 0; + u16 *output_ref = (u16 *)malloc(sizeof(u16) * conv_output_size(&p_param)); + if (!output_ref) + return BM_ERR_FAILURE; + + memset((u8*)output_ref, 0, conv_output_size(&p_param)*2); + bmerr_t ret = ps32_conv_ref(&p_param, input, weight, bias, output_ref, 0); + assert(ret == BM_SUCCESS); + + bmk1880v2_tiu_convolution_param_t conv_tmp_param; + bmk1880v2_tiu_convolution_param_t conv_param; + make_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + if (tl_alloc_success) { + if (p_param.using_bias) { + conv_tmp_param.bias = conv_param.bias; + put_conv_bias(&ctx, bk_ctx, conv_param.bias, bias); + } + // for ps32_md[1] = 1, relu_enable & rshift_bits need to set to 0 + // so we store those parameters to conv_tmp_para + conv_tmp_param.relu_enable = conv_param.relu_enable; + conv_tmp_param.rshift_bits = conv_param.rshift_bits; + conv_tmp_param.bias = conv_param.bias; + + u32 ic_step = 1; + u32 n_step = 1; + tl_t ifmap = *conv_param.ifmap; + tl_t ofmap = *conv_param.ofmap; + tg_shape_t s; + s.n = conv_param.ifmap->shape.n; + s.c = conv_param.ifmap->shape.c; + s.h = conv_param.ifmap->shape.h; + s.w = conv_param.ifmap->shape.w; + tg_t *tg_ifmap = alloc_tg_bf16_gmem(&ctx, s, FMT_BF16); + put_tg_bf16_gmem(&ctx, tg_ifmap, (u8 *)input); + + s.n = conv_param.weight->shape.n; + s.c = conv_param.weight->shape.c; + s.h = conv_param.weight->shape.h; + s.w = conv_param.weight->shape.w; + u16 *transformed_weight = + transform_weight(&conv_param.weight->shape, (u16 *)weight); + tg_t *tg_weight = alloc_tg_bf16_gmem(&ctx, s, FMT_BF16); + put_tg_bf16_gmem(&ctx, tg_weight, (u8 *)transformed_weight); + free(transformed_weight); + + tl_shape_t cur_tl_ifmap_shape = { + n_step, + ic_step, + ifmap.shape.h, + ifmap.shape.w + }; + + tg_shape_t cur_tg_ifmap_shape = { + cur_tl_ifmap_shape.n, + cur_tl_ifmap_shape.c, + cur_tl_ifmap_shape.h, + cur_tl_ifmap_shape.w + }; + + tg_stride_t cur_tg_ifmap_stride = { + tg_ifmap->stride.n, + tg_ifmap->stride.c, + tg_ifmap->stride.h, + }; + + tg_t cur_tg_ifmap; + cur_tg_ifmap.base_reg_index = 0; + cur_tg_ifmap.start_address = tg_ifmap->start_address; + cur_tg_ifmap.shape = cur_tg_ifmap_shape; + cur_tg_ifmap.stride = cur_tg_ifmap_stride; + cur_tg_ifmap.fmt = FMT_BF16; + + tl_t cur_tl_ifmap; + cur_tl_ifmap.shape = cur_tl_ifmap_shape; + cur_tl_ifmap.stride = + bmk1880v2_tensor_lmem_default_stride(bk_ctx, cur_tl_ifmap_shape, FMT_BF16, 1); + cur_tl_ifmap.start_address = ifmap.start_address; + cur_tl_ifmap.fmt = ifmap.fmt; + + tl_t cur_tl_ofmap; + cur_tl_ofmap.start_address = ofmap.start_address; + cur_tl_ofmap.shape = ofmap.shape; + cur_tl_ofmap.shape.n = n_step; + cur_tl_ofmap.stride = + bmk1880v2_tensor_lmem_default_stride(bk_ctx, cur_tl_ofmap.shape, FMT_BF16, 1); + cur_tl_ofmap.fmt = ofmap.fmt; + + tl_t cur_tl_weight; + cur_tl_weight.start_address = conv_param.weight->start_address; + cur_tl_weight.shape = conv_param.weight->shape; + cur_tl_weight.shape.n = ic_step; + cur_tl_weight.stride = { + 2, + cur_tl_weight.shape.n * cur_tl_weight.shape.h * cur_tl_weight.shape.w * 2, + cur_tl_weight.shape.n * cur_tl_weight.shape.w * 2, + cur_tl_weight.shape.n * 2 + }; + cur_tl_weight.fmt = conv_param.weight->fmt; + + const tl_t *saved_tl_weight = 
conv_param.weight; + const tl_t *saved_tl_ifmap = conv_param.ifmap; + for (u32 ci = 0; ci < ifmap.shape.c; ci += ic_step) { + { + u32 ic = tg_weight->shape.n; + u32 oc = tg_weight->shape.c; + u32 kh = tg_weight->shape.h; + u32 kw = tg_weight->shape.w; + + tg_t cur_tdma_tg_weight; + cur_tdma_tg_weight.base_reg_index = tg_weight->base_reg_index; + cur_tdma_tg_weight.start_address = tg_weight->start_address + ci * (tg_weight->fmt == FMT_BF16 ? 2 : 1); + cur_tdma_tg_weight.fmt = tg_weight->fmt; + cur_tdma_tg_weight.shape = {1, oc, kh * kw, ic}; + cur_tdma_tg_weight.stride = + bmk1880v2_tensor_tgmem_default_stride(cur_tdma_tg_weight.shape, FMT_BF16); + cur_tdma_tg_weight.shape = {1, oc, kh * kw, ic_step}; + + tl_t cur_tdma_tl_weight; + cur_tdma_tl_weight = cur_tl_weight; + cur_tdma_tl_weight.shape.n = cur_tdma_tg_weight.shape.n; + cur_tdma_tl_weight.shape.c = cur_tdma_tg_weight.shape.c; + cur_tdma_tl_weight.shape.h = cur_tdma_tg_weight.shape.h; + cur_tdma_tl_weight.shape.w = cur_tdma_tg_weight.shape.w; + cur_tdma_tl_weight.stride = bmk1880v2_tensor_lmem_default_stride( + bk_ctx, cur_tdma_tl_weight.shape, cur_tdma_tl_weight.fmt, 0); + + bmk1880v2_tdma_tg2l_tensor_copy_param_t p1; + memset(&p1, 0, sizeof(p1)); + p1.src = &cur_tdma_tg_weight; + p1.dst = &cur_tdma_tl_weight; + bmk1880v2_tdma_g2l_bf16_tensor_copy(bk_ctx, &p1); + test_submit(&ctx); + } + { + bmk1880v2_tdma_tg2l_tensor_copy_param_t p2; + memset(&p2, 0, sizeof(p2)); + cur_tg_ifmap.start_address = + tg_ifmap->start_address + ci * tg_ifmap->stride.c; + p2.src = &cur_tg_ifmap; + p2.dst = &cur_tl_ifmap; + bmk1880v2_tdma_g2l_bf16_tensor_copy(bk_ctx, &p2); + test_submit(&ctx); + } + + conv_param.ifmap = &cur_tl_ifmap; + conv_param.weight = &cur_tl_weight; + + // for ps32_md[1] = 1, relu_enable & rshift_bits need to set to 0 + // so we store those parameters to conv_tmp_para + if (ci == (ifmap.shape.c - 1)) + { + conv_param.relu_enable = conv_tmp_param.relu_enable; + conv_param.rshift_bits = conv_tmp_param.rshift_bits; + conv_param.bias = conv_tmp_param.bias; + conv_param.ps32_mode = 1; + } + else if (ci == 0) + { + conv_param.relu_enable = 0; + conv_param.rshift_bits = 0; + conv_param.bias = 0;; + conv_param.ps32_mode = 2; + } + else + { + conv_param.relu_enable = 0; + conv_param.rshift_bits = 0; + conv_param.bias = 0;; + conv_param.ps32_mode = 3; + } + bmk1880v2_tiu_convolution(bk_ctx, &conv_param); + conv_param.weight = saved_tl_weight; + conv_param.ifmap = saved_tl_ifmap; + } + + u16 *output = (u16*) get_bf16_tensor_l2g(&ctx, bk_ctx, conv_param.ofmap, FMT_BF16); + + free_tg_gmem(&ctx, tg_ifmap); + free_tg_gmem(&ctx, tg_weight); + + int has_error = array_cmp_int8( + "Comparing results ...\n", + (s8*) output_ref, (s8 *)output, conv_output_size(&p_param)*2); + if (has_error) { + print_conv_param(&p_param); + printf("Comparison FAILED\n"); + exit(-1); + } + free(output); + } + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + free(input); + free(weight); + free(output_ref); + free(bias); + + return tl_alloc_success ? 
1 : 0; +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + int test_finished_num = 0; + int round_mode; + round_mode = set_store_feround(); + + for (int i = 0; i < 15; i++) { + printf("random_test_conv iteration: %d\n", i); + conv_param_t test_conv_param; + init_conv_param(test_conv_param); + //print_conv_param(&test_conv_param); + test_finished_num += test_ic_tiling_conv(test_conv_param, ctx, bk_ctx); + test_finished_num += test_ps32_ut(test_conv_param, ctx, bk_ctx); + if (!test_conv_param.using_bias) + test_conv_param.using_bias = 1; + if (test_conv_param.output_c <= 9) + test_conv_param.output_c += 3; + //print_conv_param(&test_conv_param); + test_finished_num += test_ic_tiling_conv(test_conv_param, ctx, bk_ctx); + test_finished_num += test_ps32_ut(test_conv_param, ctx, bk_ctx); + } + printf("test_finished_num: %d\n", test_finished_num); + restore_feround(round_mode); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_conv_zero_ratio.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_conv_zero_ratio.cpp new file mode 100644 index 000000000..a55dba42a --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_conv_zero_ratio.cpp @@ -0,0 +1,741 @@ +#include "../1880v2_test_util.h" + +typedef struct{ + u16 *conv_input; + u16 *conv_weight; + u32 *conv_bias; + u16 *conv_output; + u16 *conv_output_ref; +}u_test_data; + +typedef struct { + int random_seed; + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int stride_w; + int output_c; + int using_bias; + int bReLU_EN; + int r_shift_m; + int opd0_sign; + int opd1_sign; + int opd2_sign; + int izratio; + int kzratio; +} conv_param_t; + +conv_param_t conv_param; +u_test_data u16_test_data; +bmk1880v2_tiu_convolution_param_t bmk_conv_param; + +bmk1880v2_tensor_lmem_t *skip_tensor_lmem[10]; +u32 skip_tensor_num=0; + +/* need to make sure the free order of alloc_tl for skip_tensor_lmem*/ +void skip_tensor_lmem_size(bmk_ctx_t *bmk, const bmk1880v2_tensor_lmem_t *p) +{ + u32 needed = align_up(p->shape.n * p->stride.n, BM1880V2_HW_EU_NUM); + u32 start_addr = p->start_address + needed; //src_shape.n*src_shape.c*src_shape.h*src_shape.w/32; + u32 remain_size = start_addr % BM1880V2_HW_LMEM_BANK_SIZE ? 
(BM1880V2_HW_LMEM_BANK_SIZE - start_addr % BM1880V2_HW_LMEM_BANK_SIZE) : 0; // remaining bytes to the next bank boundary, per lane
+  if(remain_size)
+  {
+    tl_shape_t src_shape2 = {1, BM1880V2_HW_NPU_NUM, 1, remain_size};
+    skip_tensor_lmem[skip_tensor_num] = alloc_tl(bmk, src_shape2, FMT_BF16, 1); // consume the leftover lmem so the next tl allocation aligns to a bank boundary
+  }
+  else
+  {
+    skip_tensor_lmem[skip_tensor_num] = NULL; // nothing to pad; clear the slot so free_skip_tensor_lmem() never frees a stale pointer
+  }
+  skip_tensor_num++;
+}
+
+void free_skip_tensor_lmem(bmk_ctx_t *ctx)
+{
+  if(skip_tensor_lmem[--skip_tensor_num]!=NULL)
+    free_tl(ctx, skip_tensor_lmem[skip_tensor_num]);
+}
+
+static inline void bf16_relu(float *buf, u64 size)
+{
+  for (u64 i = 0; i < size; i++)
+    if (buf[i] < 0)
+      buf[i] = 0;
+}
+
+static int conv_ref(
+    const conv_param_t *p_param,
+    const u16 *ifmap,
+    const u16 *weight,
+    const u32 *bias,
+    u16 *ofmap)
+{
+  int in = p_param->input_n;
+  int ic = p_param->input_c;
+  int ih = p_param->input_h;
+  int iw = p_param->input_w;
+  int oc = p_param->output_c;
+  int kh = p_param->kh;
+  int kw = p_param->kw;
+  int dh = p_param->dh;
+  int dw = p_param->dw;
+  int stride_h = p_param->stride_h;
+  int stride_w = p_param->stride_w;
+  int pad_top = p_param->pad_top;
+  int pad_bot = p_param->pad_bot;
+  int pad_left = p_param->pad_left;
+  int pad_right = p_param->pad_right;
+  int ins_h = p_param->ins_h;
+  int ins_h_last = p_param->ins_h_last;
+  int ins_w = p_param->ins_w;
+  int ins_w_last = p_param->ins_w_last;
+  int do_relu = p_param->bReLU_EN;
+
+  int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot);
+  int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right);
+  int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0);
+  int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0);
+  int oh = calc_output_hw(ih_ext, kh_ext, stride_h);
+  int ow = calc_output_hw(iw_ext, kw_ext, stride_w);
+  float *result = (float *)malloc(sizeof(float) * in * oc * oh * ow);
+  if (!result)
+    return BM_ERR_FAILURE;
+
+  memset(result, 0, sizeof(float) * in * oc * oh * ow);
+  int ret = BM_SUCCESS;
+
+  for (int n = 0; n < in; ++n) {
+    for (int c = 0; c < oc; ++c) {
+      u16 *i_fmap_pad[ic];
+      u16 *kernel_pad[ic];
+
+      for (int iic = 0; iic < ic; ++iic) {
+        i_fmap_pad[iic] = NULL;
+        kernel_pad[iic] = NULL;
+        fill_pad_fmap_bf16(
+            (ifmap + n*ic*ih*iw + iic*ih*iw), &i_fmap_pad[iic], convert_fp32_bf16(0),
+            pad_left, pad_right, pad_top, pad_bot,
+            ins_h, ins_w, ins_h_last, ins_w_last, ih, iw);
+        //kernel_dilation(
+        fill_pad_fmap_bf16(
+            (weight + c*ic*kh*kw + iic*kh*kw), &kernel_pad[iic], convert_fp32_bf16(0),
+            0, 0, 0, 0, // no padding
+            dh - 1, dw - 1, 0, 0,
+            kh, kw);
+      }
+      for (int ph = 0; ph < oh; ++ph) {
+        for (int pw = 0; pw < ow; ++pw) {
+          float result_val = result[n*oc*oh*ow + c*oh*ow + ph*ow + pw];
+          for (int idxh = 0; idxh < kh_ext; idxh += dh) {
+            for (int idxw = 0; idxw < kw_ext; idxw += dw) {
+              for (int iic = 0; iic < ic; ++iic){
+                float ifv = convert_bf16_fp32(i_fmap_pad[iic][(idxh+ph*stride_h) * iw_ext + idxw + pw*stride_w]);
+                float ikv = convert_bf16_fp32(kernel_pad[iic][idxh* kw_ext + idxw]);
+                result_val += ifv*ikv;
+              }
+            }
+          }
+          result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] = result_val;
+        }
+      }
+      if (p_param->using_bias) {
+        for (int ph = 0; ph < oh; ++ph) {
+          for (int pw = 0; pw < ow; ++pw) {
+            result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += convert_hex_fp32(bias[c]); //bias+c ;
+          }
+        }
+      }
+      if (do_relu)
+        bf16_relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow);
+      for (int i = 0; i < ic; i++) {
+        free(i_fmap_pad[i]);
+        free(kernel_pad[i]);
+      }
+    }
+  }
+
+  for (int i = 0; i < in * oc * oh * ow; i++)
+    ofmap[i] = convert_fp32_bf16(result[i]);
+  free(result);
+
+  return ret;
+}
+
+static u16 * transform_weight(const tl_shape_t *s, u16 before[])
+{
+  u32 ic = s->n;
+  u32 oc = s->c;
+  u32 kh = s->h;
+  u32 kw = s->w;
+
+  u32 size = ic * oc * kh * kw;
+  u16 *after = (u16 *)malloc(sizeof(u16) * size);
+
+  /*
+   * (oc, ic, kh, kw) -> (1, oc, kh * kw,
ic) + */ + for (u32 oci = 0; oci < oc; oci++) { + for (u32 ici = 0; ici < ic; ici++) { + for (u32 khi = 0; khi < kh; khi++) { + for (u32 kwi = 0; kwi < kw; kwi++) { + u32 src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + u32 dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + u16 *data) +{ + const tl_shape_t *s = &tl->shape; + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw * 2); + CVI_RT_MEM dev_mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = CVI_RT_MemGetPAddr(dev_mem); + u16 *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. CVI_RT_MemCopyS2D regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. + */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //CVI_RT_MEM ab_dev_mem = bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + //CVI_RT_MEM ab_dev_mem = bmmem_device_prealloc_raw(*ctx, NULL, ab_addr + gaddr, bmshape_get_size(&bms)); + + int ret = CVI_RT_MemCopyS2D(*ctx, dev_mem, (u8 *)transformed_data); + assert(ret == BM_SUCCESS); + + tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_BF16; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1880v2_tensor_tgmem_default_stride(tdma_tg.shape, FMT_BF16); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1880v2_tensor_lmem_default_stride(bk_ctx, tdma_shape, FMT_BF16, 0); + + bmk1880v2_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1880v2_tdma_g2l_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + CVI_RT_MemFree(*ctx, dev_mem); + //CVI_RT_MemFree(*ctx, ab_dev_mem); + free(transformed_data); +} + +static u16 * transform_bias(int oc, u32 before[]) +{ + u16 *after = (u16 *)malloc(sizeof(u16) * 2 * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = (before[i] >> 16) & 0xffff; + after[i + oc] = before[i] & 0xffff; + } + return after; +} + +static void put_conv_bias( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + u32 *data) +{ + + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2 * sizeof(short), oc, 1, 1); + CVI_RT_MEM dev_mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = CVI_RT_MemGetPAddr(dev_mem); + u16 *transformed_data = transform_bias(oc, data); + int ret = CVI_RT_MemCopyS2D(*ctx, dev_mem, (u8 *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_BF16; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1880v2_tensor_tgmem_default_stride(tg.shape, FMT_BF16); + + bmk1880v2_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1880v2_tdma_g2l_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + CVI_RT_MemFree(*ctx, dev_mem); + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * 
p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static u16 * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + u16 *buf = (u16 *)malloc(sizeof(u16) * size); + for (int i = 0; i < size; i++) { + if (p->izratio == 0) //almost 100% not zero + buf[i] = convert_fp32_bf16(rand() % 256 - 128); + else if (p->izratio == 1) + buf[i] = convert_fp32_bf16(rand() % 2 ? rand() % 256 - 128 : 0); + else + buf[i] = 0; + } + return buf; +} + +static u16 * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + u16 *buf = (u16 *)malloc(sizeof(u16) * size); + for (int i = 0; i < size; i++) { + if (p->kzratio == 0) //almost 100% not zero + buf[i] = convert_fp32_bf16(rand() % 256 - 128); + else if (p->kzratio == 1) + buf[i] = convert_fp32_bf16(rand() % 2 ? rand() % 256 - 128 : 0); + else + buf[i] = 0; + } + return buf; +} + +static u32 * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + u32 *bias = (u32 *)malloc(sizeof(u32) * oc); + for (int i = 0; i < oc; i++) + bias[i] = convert_fp32_hex(rand() % 65536 - 32768); + + return bias; +} + +static tl_t * conv_ifmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + //fmt_t fmt = p->opd0_sign? FMT_I8: FMT_U8; + fmt_t fmt = FMT_BF16; + tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 1); +} + +static tl_t * conv_weight_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + //fmt_t fmt = p->opd1_sign? FMT_I8: FMT_U8; + fmt_t fmt = FMT_BF16; + tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static tl_t * conv_ofmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + tl_shape_t s; + s.n = p->input_n; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return bmk1880v2_lmem_alloc_tensor(ctx, s, FMT_BF16, 1); +} + +static tl_t * conv_bias_tensor( + bmk_ctx_t *ctx, const conv_param_t *p) +{ + //fmt_t fmt = p->opd2_sign? 
FMT_I8: FMT_U8;
+  fmt_t fmt = FMT_BF16;
+  tl_shape_t s;
+  s.n = 2;
+  s.c = p->output_c;
+  s.h = 1;
+  s.w = 1;
+  return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 0);
+}
+
+static int conv_param_is_ok(const conv_param_t *p)
+{
+  int kh_ext = conv_kh_ext(p);
+  int kw_ext = conv_kw_ext(p);
+  int ih_ext = conv_ih_ext(p);
+  int iw_ext = conv_iw_ext(p);
+
+  if ((kh_ext > ih_ext)
+      || (kw_ext > iw_ext)
+      || (kh_ext <= p->pad_top)
+      || (kh_ext <= p->pad_bot)
+      || (kw_ext <= p->pad_left)
+      || (kw_ext <= p->pad_right)
+      || (p->pad_top >= (1 << 4))
+      || (p->pad_bot >= (1 << 4))
+      || (p->pad_left >= (1 << 4))
+      || (p->pad_right >= (1 << 4))) {
+    return 0;
+  }
+
+  return 1;
+}
+
+static int bmk_conv_param_alloc_ok(
+    const bmk1880v2_tiu_convolution_param_t *p,
+    const conv_param_t *param)
+{
+  if (!p->ifmap || !p->ofmap || !p->weight)
+    return 0;
+
+  if (param->using_bias)
+    if (!p->bias)
+      return 0;
+
+  return 1;
+}
+
+static void make_bmk_conv_param(
+    bmk_ctx_t *ctx,
+    bmk1880v2_tiu_convolution_param_t *dst,
+    const conv_param_t *p)
+{
+  memset(dst, 0, sizeof(*dst));
+
+  dst->ins_h = p->ins_h;
+  dst->ins_last_h = p->ins_h_last;
+  dst->ins_w = p->ins_w;
+  dst->ins_last_w = p->ins_w_last;
+  dst->pad_top = p->pad_top;
+  dst->pad_bottom = p->pad_bot;
+  dst->pad_left = p->pad_left;
+  dst->pad_right = p->pad_right;
+  dst->stride_h = p->stride_h;
+  dst->stride_w = p->stride_w;
+  dst->dilation_h = p->dh;
+  dst->dilation_w = p->dw;
+  dst->relu_enable = p->bReLU_EN;
+  dst->rshift_bits = p->r_shift_m;
+
+  dst->ifmap = conv_ifmap_tensor(ctx, p);
+  skip_tensor_lmem_size(ctx, dst->ifmap);
+  dst->weight = conv_weight_tensor(ctx, p);
+  skip_tensor_lmem_size(ctx, dst->weight);
+  dst->ofmap = conv_ofmap_tensor(ctx, p);
+  skip_tensor_lmem_size(ctx, dst->ofmap);
+  dst->bias = NULL;
+  dst->ps32_mode = 0;
+  if (p->using_bias)
+  {
+    dst->bias = conv_bias_tensor(ctx, p);
+    skip_tensor_lmem_size(ctx, dst->bias);
+  }
+  dst->w_is_const = 0;
+}
+
+static void free_bmk_conv_param(
+    bmk_ctx_t *ctx,
+    bmk1880v2_tiu_convolution_param_t *r,
+    const conv_param_t *p)
+{
+  if (p->using_bias && r->bias)
+  {
+    free_skip_tensor_lmem(ctx);
+    free_tl(ctx, r->bias);
+  }
+  if (r->ofmap)
+  {
+    free_skip_tensor_lmem(ctx);
+    free_tl(ctx, r->ofmap);
+  }
+  if (r->weight)
+  {
+    free_skip_tensor_lmem(ctx);
+    free_tl(ctx, r->weight);
+  }
+  if (r->ifmap)
+  {
+    free_skip_tensor_lmem(ctx);
+    free_tl(ctx, r->ifmap);
+  }
+}
+
+static void init_conv_param(conv_param_t &p)
+{
+retry:
+  p.input_n = 1;
+  p.input_c = 64;
+  p.input_h = 2;
+  p.input_w = 600;
+
+  p.kh = 2;
+  p.kw = 16;
+  p.output_c = 64;
+
+  p.stride_h = 1;
+  p.stride_w = 15;
+  p.ins_h = 0;
+  p.ins_w = 0;
+  p.ins_h_last = 0;
+  p.ins_w_last = 0;
+  p.dh = 1;
+  p.dw = 1;
+
+  int kh_ext = conv_kh_ext(&p);
+  int kw_ext = conv_kw_ext(&p);
+  p.pad_top = 1;
+  p.pad_bot = 0;
+  p.pad_left = 0;
+  p.pad_right = 0;
+
+  if (!conv_param_is_ok(&p)) {
+    printf("retry init_conv_param\n");
+    goto retry;
+  }
+
+  p.using_bias = 0;
+  p.r_shift_m = 7;
+  p.bReLU_EN = 1;
+
+  p.opd0_sign = 0;
+  p.opd1_sign = 1;
+  p.opd2_sign = 1;
+
+  assert(p.opd1_sign == 1 && p.opd2_sign == 1);
+
+  int ih_ext = conv_ih_ext(&p);
+  int iw_ext = conv_iw_ext(&p);
+  assert(ih_ext >= kh_ext);
+  assert(iw_ext >= kw_ext);
+}
+
+static void print_conv_param(const conv_param_t *p)
+{
+  printf("%s\n", "Conv parameters:");
+  printf("  %s%d;\n", "p->random_seed = ", p->random_seed);
+
+  printf("  %s%d;\n", "p->input_n = ", p->input_n);
+  printf("  %s%d;\n", "p->input_c = ", p->input_c);
+  printf("  %s%d;\n", "p->input_h = ", p->input_h);
+  printf(" 
%s%d;\n", "p->input_w = ", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} + +static int setup_conv( + conv_param_t &p_param, CVI_RT_HANDLE ctx, bmk_ctx_t *bk_ctx) +{ + u16_test_data.conv_input = alloc_input(&p_param); + u16_test_data.conv_weight = alloc_weight(&p_param); + u16_test_data.conv_bias = alloc_bias(&p_param); + //p_param.r_shift_m = calc_rshift_m(&p_param, s8_test_data.conv_weight); + u16_test_data.conv_output_ref = (u16 *)malloc(sizeof(u16) * conv_output_size(&p_param)); + if (!u16_test_data.conv_output_ref) + return BM_ERR_FAILURE; + + bmerr_t ret = conv_ref(&p_param, u16_test_data.conv_input, u16_test_data.conv_weight, u16_test_data.conv_bias, u16_test_data.conv_output_ref); + assert(ret == BM_SUCCESS); + make_bmk_conv_param(bk_ctx, &bmk_conv_param, &p_param); + + bmk_conv_param_alloc_ok(&bmk_conv_param, &p_param); + + put_bf16_tensor_g2l(&ctx, bk_ctx, bmk_conv_param.ifmap, (u16 *)u16_test_data.conv_input, FMT_BF16); + put_conv_weight(&ctx, bk_ctx, bmk_conv_param.weight, (u16 *)u16_test_data.conv_weight); + if (p_param.using_bias) + put_conv_bias(&ctx, bk_ctx, bmk_conv_param.bias, u16_test_data.conv_bias); + + return 1; +} + +void get_result(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk) +{ + u16_test_data.conv_output = (u16*) get_bf16_tensor_l2g(ctx, bmk, bmk_conv_param.ofmap, FMT_BF16); +} + +void check_result() +{ + int has_error = array_cmp_int8( + "conv Comparing results ...\n", + (s8*)u16_test_data.conv_output_ref, (s8 *)u16_test_data.conv_output, conv_output_size(&conv_param)*2); + + if (has_error) { + print_conv_param(&conv_param); + printf("Comparison FAILED\n"); + exit(-1); + } + +} + +void trigger_max_power(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk) +{ + bmk1880v2_tiu_convolution(bmk, &bmk_conv_param); + test_submit(ctx); +} + +void free_s8_data() +{ + free(u16_test_data.conv_input); + free(u16_test_data.conv_weight); + free(u16_test_data.conv_bias); + free(u16_test_data.conv_output); + free(u16_test_data.conv_output_ref); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + for (int i = 0; i < 3; i++) { + for (int k = 0; k < 3; k++) { + printf("bf16 conv zero ratio test: ( %d ) ( %d )\n",i,k); + init_conv_param(conv_param); + 
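+      // izratio / kzratio select the zero density used by alloc_input() /
+      // alloc_weight(): 0 keeps the data dense (almost no zeros), 1 zeroes
+      // roughly half of the values, 2 zeroes everything, so this 3x3 loop
+      // sweeps every input/kernel sparsity combination.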
conv_param.izratio = i; + conv_param.kzratio = k; + setup_conv(conv_param, ctx, bk_ctx); + + trigger_max_power(&ctx, bk_ctx); + get_result(&ctx, bk_ctx); + check_result(); + + free_bmk_conv_param(bk_ctx, &bmk_conv_param, &conv_param); + free_s8_data(); + } + } + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_depthwise.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_depthwise.cpp new file mode 100644 index 000000000..47c12c14c --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_depthwise.cpp @@ -0,0 +1,427 @@ +#include "../1880v2_test_util.h" + +typedef bmk1880v2_tiu_depthwise_convolution_param_t param_t; +int random_seed; +static void print_pooling_param(param_t *p) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int kh = p->weight->shape.h; + int kw = p->weight->shape.w; + + printf(" Pooling parameters:\n"); + printf(" random_seed : %d \n", random_seed); + printf(" ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw); + printf(" opd0_sign = %d\n", p->ifmap->fmt == FMT_I8); + printf(" weight = (%d, %d)\n", kh, kw); + printf(" padding = (%d, %d, %d, %d)\n", + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right); + printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w); + printf(" ins0 = (%d, %d, %d, %d)\n", + p->ins_h, p->ins_last_h, p->ins_w, p->ins_last_w); + printf(" dilation = (%d, %d)\n",p->dilation_h, p->dilation_w); + printf(" rshift_bits = %d\n", p->rshift_bits); + printf(" relu_enable = %d\n", p->relu_enable); + printf(" res0_sign = %d\n", p->ofmap->fmt == FMT_I8); +} + +static u16 *alloc_input(param_t *p) +{ + u64 size = tl_shape_size(&p->ifmap->shape); + u16 *data = (u16 *)xmalloc(size * 2); + if (!data) + return NULL; + + for (u64 i = 0; i < size; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX/2; //5 ~ -5 + val = (float)(rand()-RAND_MAX2)*5 / (float)RAND_MAX; + data[i] = convert_fp32_bf16(val); + } + return data; +} + +static u16 *alloc_weight(param_t *p) +{ + int size = tl_shape_size(&p->weight->shape); + u16 *data = (u16 *)xmalloc(size * 2); + if (!data) + return NULL; + + for (int i = 0; i < size; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX/2; //5 ~ -5 + val = (float)(rand()-RAND_MAX2)*5 / (float)RAND_MAX; + data[i] = convert_fp32_bf16(val); + } + return data; +} + +static u32 *alloc_bias(param_t *p) +{ + int c = p->bias->shape.c; + u32 *bias = (u32 *)malloc(sizeof(u32) * c); + if (!bias) + return NULL; + + for (int i = 0; i < c; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX/2; //2 ~ -2 + val = (float)(rand()-RAND_MAX2)*2 / (float)RAND_MAX; + bias[i] = convert_fp32_hex(val); + } + return bias; +} + +static u16 *alloc_output(param_t *p) +{ + u64 size = tl_shape_size(&p->ofmap->shape); + return (u16 *)xmalloc(size * 2); +} + +static inline void bf16_relu(u16 *buf, u64 size) +{ + for (u64 i = 0; i < size; i++) + if (convert_bf16_fp32(buf[i]) < 0) + buf[i] = convert_fp32_bf16(0); +} + +static int index_get(int h, int w1, int w2) +{ + return h * w1 + w2; +} + +int native_pooling_avg_bf16( + const u16* i_fmap, + const void* weight, + const u32 *bias, + u16 * o_fmap, + int input_n, int input_c, int input_h, int input_w, + int kh, int kw, + int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, + int stride_h, int stride_w, + int ins_h, int ins_w, + int ins_h_last, int ins_w_last, + int dh, int dw, + int const_weight) +{ + if (kh * kw <= 0) + return BM_ERR_INVALID_ARGUMENT; + + u16 avg_const_weight = *(u16 *)weight; + u16 *weight_arr = 
(u16*)weight; + int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); + int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); + int d_kh = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int d_kw = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int output_h = calc_output_hw(h_after, d_kh, stride_h); + int output_w = calc_output_hw(w_after, d_kw, stride_w); + float *avg_pooling_mac_a = (float *)malloc(d_kh * d_kw * sizeof(float)); + float *avg_pooling_mac_b = (float *)malloc(d_kh * d_kw * sizeof(float)); + + u16 *i_fmap_pad = NULL; + u16 *i_kmap_pad = NULL; + for (int n = 0; n < input_n; n++) { + if (const_weight == 0) + weight_arr = (u16*)weight; + + for (int c = 0; c < input_c; ++c) { + fill_pad_fmap_bf16(i_fmap, &i_fmap_pad, 0, + pad_w_l, pad_w_r, pad_h_t, pad_h_b, + ins_h, ins_w, ins_h_last, ins_w_last, + input_h, input_w); + + //kernel_dilation( + if (const_weight == 0) + fill_pad_fmap_bf16( + (weight_arr ), &i_kmap_pad, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + float avg_pool_result; + for (int ph = 0; ph < output_h; ++ph) { + for (int pw = 0; pw < output_w; ++pw) { + int hstart = ph * stride_h; + int wstart = pw * stride_w; + int pool_index = index_get(ph, output_w, pw); + int mac_index = 0; + + for (int h = 0; h < d_kh; h++) { + for (int w = 0; w < d_kw; w++) { + int index = index_get((hstart+h), w_after, (w+wstart)); + mac_index = h*d_kw + w; + + avg_pooling_mac_a[mac_index] = convert_bf16_fp32(i_fmap_pad[index]); + + avg_pooling_mac_b[h*d_kw+w] = const_weight ? + convert_bf16_fp32(avg_const_weight) : convert_bf16_fp32(i_kmap_pad[mac_index]); + } + } + inner_float_product(avg_pooling_mac_a, avg_pooling_mac_b, d_kh * d_kw, + &avg_pool_result); + + if(bias) { + avg_pool_result += convert_hex_fp32(bias[c]); + } + *(o_fmap+pool_index) = convert_fp32_bf16(avg_pool_result); + } + } + weight_arr += kh * kw; + i_fmap += input_w * input_h; + o_fmap += output_w * output_h; + } + } + free(i_fmap_pad); + free(i_kmap_pad); + free(avg_pooling_mac_a); + free(avg_pooling_mac_b); + + return BM_SUCCESS; +} + +static void compare_results( + param_t *p, + u16 input[], + u16 weight[], + u32 bias[], + u16 output[]) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int kh = p->weight->shape.h; + int kw = p->weight->shape.w; + + u16 *output_ref = alloc_output(p); + bmerr_t ret = native_pooling_avg_bf16( + input, weight, p->bias ? 
bias : NULL, output_ref, + in, ic, ih, iw, kh, kw, + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right, + p->stride_h, p->stride_w, + p->ins_h, p->ins_w, p->ins_last_h, p->ins_last_w, + p->dilation_h, p->dilation_w, 0 + ); + assert(ret == BM_SUCCESS); + + if(p->relu_enable ) + bf16_relu(output_ref, tl_shape_size(&p->ofmap->shape)); + + int cmp_res = array_cmp_int8( + "Comparing results ...\n", (s8*) output_ref, (s8*) output, + tl_shape_size(&p->ofmap->shape) * 2); + + if (cmp_res != 0) { + printf("Comparison FAILED!!!\n"); + print_pooling_param(p); + exit(-1); + } + + free(output_ref); +} + +static int pooling_ih_ext(param_t *p, int ih) +{ + int ins = p->ins_h; + int ins_last = p->ins_last_h; + int pad = p->pad_top + p->pad_bottom; + return (ih - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_iw_ext(param_t *p, int iw) +{ + int ins = p->ins_w; + int ins_last = p->ins_last_w; + int pad = p->pad_left + p->pad_right; + return (iw - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_oh(param_t *p, int ih, int kh, int dh) +{ + int ih_ext = pooling_ih_ext(p, ih); + int d_h = (kh -1) * dh + 1; + return (ih_ext - d_h) / p->stride_h + 1; +} + +static int pooling_ow(param_t *p, int iw, int kw, int dw) +{ + int iw_ext = pooling_iw_ext(p, iw); + int d_w = (kw -1) * dw +1; + return (iw_ext - d_w) / p->stride_w + 1; +} + +static void free_depthwise_param( + bmk_ctx_t *ctx, + param_t *p) +{ + if (p->bias) + free_tl(ctx, p->bias); + + if (p->weight) + free_tl(ctx, p->weight); + + if (p->ifmap) + free_tl(ctx, p->ifmap); + + if (p->ofmap) + free_tl(ctx, p->ofmap); +} + +static param_t random_depthwise_param(bmk_ctx_t *ctx) +{ + param_t p; + + memset(&p, 0, sizeof(p)); + +retry: + random_seed = clock(); + srand(random_seed); + int using_bias = rand() % 2; + int n = rand() % 5 + 1; + int c = rand() % (3 * BM1880V2_HW_NPU_NUM) + 1; + int ih = rand() % 30 + 3; + int iw = rand() % 30 + 6; + int kh = rand() % 7 + 1; + int kw = rand() % 7 + 1; + + p.ins_h = rand() % kh; + p.ins_w = rand() % kw; + p.ins_last_h = rand() % kh; + p.ins_last_w = rand() % kw; + p.stride_h = rand() % kh + 1; + p.stride_w = rand() % kw + 1; + p.pad_top = rand() % kh; + p.pad_bottom = rand() % kh; + p.pad_left = rand() % kw; + p.pad_right = rand() % kw; + p.rshift_bits = rand() % 32; + p.dilation_h = rand()%4 + 1; + p.dilation_w = rand()%4 + 1; + + int oh = pooling_oh(&p, ih, kh, p.dilation_h); + int ow = pooling_ow(&p, iw, kw, p.dilation_w); + int d_kh = calc_dilute_hw(kh, p.dilation_h - 1, 0, 0, 0); + int d_kw = calc_dilute_hw(kw, p.dilation_w - 1, 0, 0, 0); + + tl_shape_t ofmap_shape; + ofmap_shape.n = n; + ofmap_shape.c = c; + ofmap_shape.h = oh; + ofmap_shape.w = ow; + tl_shape_t ifmap_shape; + ifmap_shape.n = n; + ifmap_shape.c = c; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + tl_shape_t weight_shape; + weight_shape.n = 1; + weight_shape.c = c; + weight_shape.h = kh; + weight_shape.w = kw; + tl_shape_t bias_shape; + bias_shape.n = 2; + bias_shape.c = c; + bias_shape.h = 1; + bias_shape.w = 1; + p.relu_enable = rand()%2; + + fmt_t ifmt = FMT_BF16; + p.ofmap = bmk1880v2_lmem_alloc_tensor(ctx, ofmap_shape, FMT_BF16, 1); + p.ifmap = bmk1880v2_lmem_alloc_tensor(ctx, ifmap_shape, ifmt, 1); + p.weight = bmk1880v2_lmem_alloc_tensor(ctx, weight_shape, FMT_BF16, 1); + p.bias = NULL; + if (using_bias) + p.bias = bmk1880v2_lmem_alloc_tensor(ctx, bias_shape, FMT_BF16, 0); + + if ((kh > pooling_ih_ext(&p, ih)) + || (kw > pooling_iw_ext(&p, iw)) + || (oh < d_kh) + || (ow < d_kw) + || (p.pad_top >= (1 << 4)) + || 
(p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || !p.ofmap + || !p.ifmap + || !p.weight + || (using_bias && !p.bias)) { + printf("retry init_pooling_param\n"); + free_depthwise_param(ctx, &p); + goto retry; + } + return p; +} + +static void put_bias_tensor( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + u32 data[]) +{ + int c = tl->shape.c; + + u16 *hi_lo = (u16 *)xmalloc(2 * c * 2); + if (!hi_lo) + return; + + for (int i = 0; i < c; i++) { + hi_lo[i] = (data[i] >> 16) & 0xffff; + hi_lo[i + c] = (data[i] & 0xffff); + } + put_bf16_tensor_g2l(ctx, bk_ctx, tl, (u16 *)hi_lo, FMT_BF16); + + free(hi_lo); +} + +static int test_pooling(CVI_RT_HANDLE ctx, bmk_ctx_t *bk_ctx) +{ + param_t param = random_depthwise_param(bk_ctx); + //print_pooling_param(¶m); + u16 *input = alloc_input(¶m); + u16 *weight = alloc_weight(¶m); + u32 *bias = NULL; + if (param.bias) + bias = alloc_bias(¶m); + + put_bf16_tensor_g2l(&ctx, bk_ctx, param.ifmap, (u16 *)input, FMT_BF16); + put_bf16_tensor_g2l(&ctx, bk_ctx, param.weight, (u16 *)weight, FMT_BF16); + if (param.bias) + put_bias_tensor(&ctx, bk_ctx, param.bias, bias); + + bmk1880v2_tiu_depthwise_convolution(bk_ctx, ¶m); + u16 *output = (u16 *)get_bf16_tensor_l2g(&ctx, bk_ctx, param.ofmap, FMT_BF16); + compare_results(¶m, input, weight, bias, output); + + free_depthwise_param(bk_ctx, ¶m); + free(input); + free(weight); + free(bias); + free(output); + + return 1; +} + +static void test_depthwise_pooling(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx) +{ + int test_finished_num = 0; + for (u64 i = 0; i < 20; i++) + test_finished_num += test_pooling(*ctx, bk_ctx); + printf("Test finished %d\n", test_finished_num); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + int round_mode; + round_mode = set_store_feround(); + test_depthwise_pooling(&ctx, bk_ctx); + restore_feround(round_mode); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_depthwise_reshape_same_kernel.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_depthwise_reshape_same_kernel.cpp new file mode 100644 index 000000000..8e39066b9 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_depthwise_reshape_same_kernel.cpp @@ -0,0 +1,977 @@ +#include "../1880v2_test_util.h" + +typedef bmk1880v2_tiu_depthwise_convolution_param_t param_t; +int random_seed; +static void print_pooling_param(param_t *p) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int kh = p->weight->shape.h; + int kw = p->weight->shape.w; + + printf(" Pooling parameters:\n"); + // printf(" random_seed : %d \n", random_seed); + printf(" ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw); + printf(" opd0_sign = %d\n", p->ifmap->fmt == FMT_I8); + printf(" weight = (%d, %d)\n", kh, kw); + printf(" padding = (%d, %d, %d, %d)\n", p->pad_top, p->pad_bottom, + p->pad_left, p->pad_right); + printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w); + // printf(" ins0 = (%d, %d, %d, %d)\n", + // p->ins_h, p->ins_last_h, p->ins_w, p->ins_last_w); + // printf(" dilation = (%d, %d)\n",p->dilation_h, p->dilation_w); + // printf(" rshift_bits = %d\n", p->rshift_bits); + // printf(" relu_enable = %d\n", p->relu_enable); + printf(" res0_sign = %d\n", p->ofmap->fmt == FMT_I8); +} + +static u16 *alloc_input(int ic, int ih, int iw, fmt_t ifmt) +{ + u64 size = ic * ih * iw; + u16 *data = (u16 *)malloc(sizeof(u16) * (size)); + if (!data) + return NULL; + + if 
(ifmt == FMT_BF16) { + for (u64 i = 0; i < size; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX / 2; // 5 ~ -5 + val = (float)(rand() - RAND_MAX2) * 5 / (float)RAND_MAX; + val = i; + data[i] = convert_fp32_bf16(val); + } + } else { + u8 *d = (u8 *)data; + for (u64 i = 0; i < size; i++) { + d[i] = i % 10 * (i % 2 ? -1 : 1); + } + } + + return data; +} + +static u16 *alloc_weight(int ic, int kh, int kw, fmt_t fmt) +{ + int size = ic * kh * kw; + u16 *data = (u16 *)malloc(size * sizeof(u16)); + if (!data) + return NULL; + + // printf("weight size is %d\n", size * 2); + if (fmt == FMT_BF16) { + for (int i = 0; i < size; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX / 2; // 5 ~ -5 + val = (float)(rand() - RAND_MAX2) * 5 / (float)RAND_MAX; + val = i; + data[i] = convert_fp32_bf16(val); + } + } else { + u8 *d = (u8 *)data; + for (int i = 0; i < size; i++) { + d[i] = i % 5 * (i % 2 ? -1 : 1); + } + } + return data; +} + +static u32 *alloc_bias(int ic, fmt_t fmt) +{ + int c = ic; + u64 size = c; + u32 *bias = (u32 *)malloc(sizeof(u32) * c); + if (!bias) + return NULL; + + if (fmt == FMT_BF16) { + for (int i = 0; i < c; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX / 2; // 2 ~ -2 + val = (float)(rand() - RAND_MAX2) * 2 / (float)RAND_MAX; + val = i; + bias[i] = convert_fp32_hex(val); + } + } else { + u16 *d = (u16 *)bias; + for (u64 i = 0; i < size; i++) { + d[i] = i % 0xf * (i % 2 ? -1 : 1); + } + } + return bias; +} + +static u16 *alloc_output(int ic, int oh, int ow) +{ + u64 size = ic * oh * ow; + return (u16 *)malloc(sizeof(u16) * size); +} + +static inline void bf16_relu(u16 *buf, u64 size, fmt_t fmt) +{ + if (fmt == FMT_BF16) { + for (u64 i = 0; i < size; i++) + if (convert_bf16_fp32(buf[i]) < 0) + buf[i] = convert_fp32_bf16(0); + } else { + s8 *buf_s8 = (s8 *)buf; + for (u64 i = 0; i < size; i++) { + if (buf_s8[i] < 0) + buf_s8[i] = 0; + } + } +} + +static int index_get(int h, int w1, int w2) +{ + return h * w1 + w2; +} + +int native_pooling_avg_bf16(const u16 *i_fmap, const void *weight, + const u32 *bias, u16 *o_fmap, int input_n, + int input_c, int input_h, int input_w, int kh, + int kw, int pad_h_t, int pad_h_b, int pad_w_l, + int pad_w_r, int stride_h, int stride_w, int ins_h, + int ins_w, int ins_h_last, int ins_w_last, int dh, + int dw, int const_weight) +{ + if (kh * kw <= 0) + return BM_ERR_INVALID_ARGUMENT; + + u16 avg_const_weight = *(u16 *)weight; + u16 *weight_arr = (u16 *)weight; + int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); + int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); + int d_kh = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int d_kw = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int output_h = calc_output_hw(h_after, d_kh, stride_h); + int output_w = calc_output_hw(w_after, d_kw, stride_w); + // printf("output_h/output_w is %d/%d\n", output_h, output_w); + float *avg_pooling_mac_a = (float *)malloc(d_kh * d_kw * sizeof(float)); + float *avg_pooling_mac_b = (float *)malloc(d_kh * d_kw * sizeof(float)); + + u16 *i_fmap_pad = NULL; + u16 *i_kmap_pad = NULL; + for (int n = 0; n < input_n; n++) { + if (const_weight == 0) + weight_arr = (u16 *)weight; + + for (int c = 0; c < input_c; ++c) { + fill_pad_fmap_bf16(i_fmap, &i_fmap_pad, 0, pad_w_l, pad_w_r, pad_h_t, + pad_h_b, ins_h, ins_w, ins_h_last, ins_w_last, input_h, + input_w); + + // kernel_dilation( + if (const_weight == 0) + fill_pad_fmap_bf16((weight_arr), &i_kmap_pad, 0, 0, 0, 0, + 0, // no padding + dh - 1, dw - 1, 0, 0, kh, kw); + + float avg_pool_result; + 
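+      // For each output point (ph, pw): gather the dilated input window into
+      // avg_pooling_mac_a and the dilated kernel into avg_pooling_mac_b, then
+      // reduce them with an fp32 inner product -- a plain per-channel
+      // (depthwise) convolution reference, with the optional bias added last.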
for (int ph = 0; ph < output_h; ++ph) { + for (int pw = 0; pw < output_w; ++pw) { + int hstart = ph * stride_h; + int wstart = pw * stride_w; + int pool_index = index_get(ph, output_w, pw); + int mac_index = 0; + + float r = 0; + for (int h = 0; h < d_kh; h++) { + for (int w = 0; w < d_kw; w++) { + int index = index_get((hstart + h), w_after, (w + wstart)); + mac_index = h * d_kw + w; + + avg_pooling_mac_a[mac_index] = + convert_bf16_fp32(i_fmap_pad[index]); + + avg_pooling_mac_b[h * d_kw + w] = + const_weight ? convert_bf16_fp32(avg_const_weight) : + convert_bf16_fp32(i_kmap_pad[mac_index]); + +#if 0 + printf ("ref[ni %u][ci %u][oh/ow %u/%u][kh/kw %u/%u] o[%d]" + " %.1f * %.1f + %.1f = %.1f\n", + n, c, ph, pw, h, w, pool_index, + avg_pooling_mac_a[mac_index], avg_pooling_mac_b[h*d_kw+w], + r, r + avg_pooling_mac_a[mac_index] * avg_pooling_mac_b[h*d_kw+w]); +#endif + + r += avg_pooling_mac_a[mac_index] * + avg_pooling_mac_b[h * d_kw + w]; + } + } + + inner_float_product(avg_pooling_mac_a, avg_pooling_mac_b, d_kh * d_kw, + &avg_pool_result); + + if (bias) { + avg_pool_result += convert_hex_fp32(bias[c]); + } + *(o_fmap + pool_index) = convert_fp32_bf16(avg_pool_result); + } + } + weight_arr += kh * kw; + i_fmap += input_w * input_h; + o_fmap += output_w * output_h; + } + } + free(i_fmap_pad); + free(i_kmap_pad); + free(avg_pooling_mac_a); + free(avg_pooling_mac_b); + + return BM_SUCCESS; +} + +static int get_fsz(fmt_t fmt) +{ + assert(fmt == FMT_BF16 || fmt == FMT_I8 || fmt == FMT_U8); + return fmt == FMT_BF16 ? 2 : 1; +} + +static void compare_results(param_t *p, u16 input[], u16 weight[], u32 bias[], + u16 output[], u16 output_ref[], + u32 org_o_shape_size, int is_valid_pack, int org_oc, + int org_oh, int org_ow) +{ + assert(input); + assert(weight); + printf("bias at %p\n", bias); + int f_sz = get_fsz(p->ofmap->fmt); + + if (p->relu_enable) { + bf16_relu(output_ref, org_o_shape_size, p->ofmap->fmt); + } + + int cmp_res = -1; + if (!is_valid_pack) { + // we reshape c with SAME mode padding with garbage + // \is_valid_pack set to false means we skip garbage part + int org_hw = org_oh * org_ow; + int new_hw = p->ofmap->shape.h * p->ofmap->shape.w; + int duplicated_c = p->ofmap->shape.c / org_oc; + + assert(new_hw >= org_hw / duplicated_c); + + s8 *output_c = ((s8 *)output); + s8 *output_ref_c = ((s8 *)output_ref); + for (int c = 0; c < org_oc; c++) { + cmp_res = array_cmp_int8("Comparing results ...\n", + output_c + c * duplicated_c * new_hw * f_sz, + output_ref_c + org_hw * c * f_sz, org_hw * f_sz); + + if (cmp_res != 0) { + break; + } + // printf("compare [%d] pass, org len is %u, new len is %u\n", c, + // org_hw, duplicated_c * new_hw); + } + } else { + cmp_res = array_cmp_int8("Comparing results ...\n", (s8 *)output_ref, + (s8 *)output, org_o_shape_size * f_sz); + } + if (cmp_res != 0) { + printf("Comparison FAILED!!!\n"); + // print_pooling_param(p); + exit(-1); + } + + free(output_ref); +} + +static int pooling_ih_ext(int ins_h, int ins_last_h, int pad_top, + int pad_bottom, int ih) +{ + int ins = ins_h; + int ins_last = ins_last_h; + int pad = pad_top + pad_bottom; + return (ih - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_iw_ext(int ins_w, int ins_last_w, int pad_left, + int pad_right, int iw) +{ + int ins = ins_w; + int ins_last = ins_last_w; + int pad = pad_left + pad_right; + return (iw - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_oh(int ins_h, int ins_last_h, int pad_top, int pad_bottom, + int stride_h, int ih, int kh, int dh) +{ + int ih_ext 
= pooling_ih_ext(ins_h, ins_last_h, pad_top, pad_bottom, ih); + int d_h = (kh - 1) * dh + 1; + return (ih_ext - d_h) / stride_h + 1; +} + +static int pooling_ow(int ins_w, int ins_last_w, int pad_left, int pad_right, + int stride_w, int iw, int kw, int dw) +{ + int iw_ext = pooling_iw_ext(ins_w, ins_last_w, pad_left, pad_right, iw); + int d_w = (kw - 1) * dw + 1; + return (iw_ext - d_w) / stride_w + 1; +} + +static void free_depthwise_struct(param_t *p) +{ + + free((void *)p->ofmap); + free((void *)p->ifmap); + free((void *)p->weight); + if (p->bias) { + free((void *)p->bias); + } + + p->ofmap = NULL; + p->ifmap = NULL; + p->weight = NULL; + p->bias = NULL; +} + +static void free_depthwise_param(bmk_ctx_t *ctx, param_t *p) +{ + if (p->ofmap) + free_tl(ctx, p->ofmap); + + if (p->weight) + free_tl(ctx, p->weight); + + if (p->bias) + free_tl(ctx, p->bias); + + if (p->ifmap) + free_tl(ctx, p->ifmap); +} + +static param_t random_depthwise_param(bmk_ctx_t *ctx, int _ih, int _iw, + int _stride_h, fmt_t _fmt) +{ + param_t p; + + memset(&p, 0, sizeof(p)); + + // retry: + random_seed = clock(); + srand(random_seed); + int using_bias = rand() % 2; + int n = rand() % 5 + 1; + n = 1; + int c = rand() % (3 * BM1880V2_HW_NPU_NUM) + 1; + c = 3; + int ih = rand() % 30 + 3; + int iw = rand() % 30 + 6; + int kh = rand() % 7 + 1; + int kw = rand() % 7 + 1; + + p.ins_h = rand() % kh; + p.ins_w = rand() % kw; + p.ins_last_h = rand() % kh; + p.ins_last_w = rand() % kw; + p.stride_h = rand() % kh + 1; + p.stride_w = rand() % kw + 1; + p.pad_top = rand() % kh; + p.pad_bottom = rand() % kh; + p.pad_left = rand() % kw; + p.pad_right = rand() % kw; + p.rshift_bits = rand() % 32; + p.dilation_h = rand() % 4 + 1; + p.dilation_w = rand() % 4 + 1; + + // default + fmt_t ifmt = FMT_BF16; + fmt_t other_fmt = FMT_BF16; + ih = 24; + iw = 16; + kw = 2; + kh = 4; + p.stride_h = 3; + p.stride_w = 2; + p.rshift_bits = 0; + + ih = _ih; + p.stride_h = _stride_h; + iw = _iw; + ifmt = _fmt; + other_fmt = FMT_I8; + if (ifmt == FMT_BF16) { + other_fmt = FMT_BF16; + } + + p.pad_left = 0; + p.pad_right = 1; + p.pad_top = 0; + p.pad_bottom = 0; + // TODO: pad / ins / dilation + p.ins_h = 0; + p.ins_last_h = 0; + p.ins_w = 0; + p.ins_last_w = 0; + p.dilation_h = 1; + p.dilation_w = 1; + + int oh = pooling_oh(p.ins_h, p.ins_last_h, p.pad_top, p.pad_bottom, + p.stride_h, ih, kh, p.dilation_h); + int ow = pooling_ow(p.ins_w, p.ins_last_w, p.pad_left, p.pad_right, + p.stride_w, iw, kw, p.dilation_w); + + tl_shape_t ofmap_shape; + ofmap_shape.n = n; + ofmap_shape.c = c; + ofmap_shape.h = oh; + ofmap_shape.w = ow; + tl_shape_t ifmap_shape; + ifmap_shape.n = n; + ifmap_shape.c = c; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + tl_shape_t weight_shape; + weight_shape.n = 1; + weight_shape.c = c; + weight_shape.h = kh; + weight_shape.w = kw; + tl_shape_t bias_shape; + bias_shape.n = 2; + bias_shape.c = c; + bias_shape.h = 1; + bias_shape.w = 1; + p.relu_enable = rand() % 2; + + // fake init for ref + bmk1880v2_tensor_lmem_t *bias, *weight, *ofmap, *ifmap; + ifmap = (bmk1880v2_tensor_lmem_t *)malloc(sizeof(bmk1880v2_tensor_lmem_t)); + if (using_bias) { + bias = (bmk1880v2_tensor_lmem_t *)malloc(sizeof(bmk1880v2_tensor_lmem_t)); + } + weight = (bmk1880v2_tensor_lmem_t *)malloc(sizeof(bmk1880v2_tensor_lmem_t)); + ofmap = (bmk1880v2_tensor_lmem_t *)malloc(sizeof(bmk1880v2_tensor_lmem_t)); + + p.bias = NULL; + if (using_bias) { + bias->start_address = -1; + bias->fmt = other_fmt; + bias->shape = bias_shape; + bias->stride = 
bmk1880v2_tensor_lmem_default_stride( + ctx, bias->shape, other_fmt, /*eu_align*/0); + p.bias = bias; + } + + weight->start_address = -1; + weight->fmt = other_fmt; + weight->shape = weight_shape; + weight->stride = bmk1880v2_tensor_lmem_default_stride( + ctx, weight->shape, other_fmt, /*align*/1); + p.weight = weight; + + ofmap->start_address = -1; + ofmap->fmt = other_fmt; + ofmap->shape = ofmap_shape; + ofmap->stride = bmk1880v2_tensor_lmem_default_stride(ctx, ofmap->shape, + other_fmt, /*align*/1); + p.ofmap = ofmap; + + ifmap->start_address = -1; + ifmap->fmt = ifmt; + ifmap->shape = ifmap_shape; + ifmap->stride = bmk1880v2_tensor_lmem_default_stride(ctx, ifmap->shape, + ifmt, /*align*/1); + p.ifmap = ifmap; + +#if 0 + int d_kh = calc_dilute_hw(kh, p.dilation_h - 1, 0, 0, 0); + int d_kw = calc_dilute_hw(kw, p.dilation_w - 1, 0, 0, 0); + if ((kh > pooling_ih_ext(&p, ih)) + || (kw > pooling_iw_ext(&p, iw)) + || (oh < d_kh) + || (ow < d_kw) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || !p.ofmap + || !p.ifmap + || !p.weight + || (using_bias && !p.bias) +) { + LOG(INFO) << "retry init_pooling_param"; + assert(0 && "it MUST valid param pass"); + goto retry; + } +#endif + return p; +} + +static void put_bias_tensor(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, const tl_t *tl, + u32 data[]) +{ + int c = tl->shape.c; + + u16 *hi_lo = (u16 *)malloc(sizeof(u16) * 2 * c); + if (!hi_lo) + return; + + if (tl->fmt == FMT_BF16) { + for (int i = 0; i < c; i++) { + hi_lo[i] = (data[i] >> 16) & 0xffff; + hi_lo[i + c] = (data[i] & 0xffff); + } + } else { + u8 *hi_lo_u8 = (u8 *)hi_lo; + u16 *data_u16 = (u16 *)data; + for (int i = 0; i < c; i++) { + hi_lo_u8[i] = data_u16[i] & 0xff; + hi_lo_u8[i + c] = (data_u16[i] >> 8) & 0xff; + } + } + put_bf16_tensor_g2l(ctx, bk_ctx, tl, (u16 *)hi_lo, tl->fmt); + + free(hi_lo); +} + +/** + * \brief + */ +static int reshape_valid_output(bmk_ctx_t *bk_ctx, + const bmk1880v2_tensor_lmem_t *ofmap, + int org_oc, int org_oh, int org_ow, + bmk1880v2_tensor_lmem_shape_t *tl_shape, + bmk1880v2_tensor_lmem_stride_t *tl_load_stride, + bmk1880v2_tensor_tgmem_shape_t *tg_shape, + bmk1880v2_tensor_tgmem_stride_t *tg_stride, + fmt_t fmt) +{ + + assert(fmt == FMT_BF16 || fmt == FMT_I8 || fmt == FMT_U8); + + // skip redundant one + // store to sys and re-slice, maybe use next layer + // sys->local skip redundant one + + tg_shape->n = tl_shape->n = 1; + tg_shape->c = tl_shape->c = org_oc; + tg_shape->h = tl_shape->h = org_oh; + tg_shape->w = tl_shape->w = org_ow; + + bmk1880v2_tensor_lmem_stride_t s = + bmk1880v2_tensor_lmem_default_stride(bk_ctx, *tl_shape, fmt, /*eu_align*/0); + + tl_load_stride->n = s.n; + tl_load_stride->c = s.c; + tl_load_stride->h = s.h; + tl_load_stride->w = s.w; + + int duplicat_c = ofmap->shape.c / org_oc; + tg_stride->n = tg_stride->c = + duplicat_c * ofmap->shape.h * ofmap->shape.w * get_fsz(fmt); + tg_stride->h = org_ow * get_fsz(fmt); + + return 0; +} + +static bmerr_t init_ref(int ic, int ih, int iw, int kh, int kw, int pad_right, + int pad_left, int stride_h, int stride_w, fmt_t fmt, + u16 *input, u16 *weight, u32 *bias, u16 *output_ref) +{ + bmerr_t ret; + int in = 1; + int ins_h = 0; + int ins_w = 0; + int ins_last_h = 0; + int ins_last_w = 0; + int dilation_h = 1; + int dilation_w = 1; + int pad_top = 0; + int pad_bottom = 0; + int rshift_bits = 0; + + if (fmt == FMT_BF16) { + ret = native_pooling_avg_bf16( + input, weight, bias ? 
bias : NULL, output_ref, in, ic, ih, iw, kh, kw, + pad_top, pad_bottom, pad_left, pad_right, stride_h, stride_w, ins_h, + ins_w, ins_last_h, ins_last_w, dilation_h, dilation_w, 0); + } else { + int opd0_sign = fmt == FMT_I8; + int res0_sign = true; //(ofmap->fmt == FMT_I8); + ret = native_pooling_ave_int8( + (s8 *)input, (s8 *)weight, bias ? (s16 *)bias : NULL, (s8 *)output_ref, + in, ic, ih, iw, kh, kw, pad_top, pad_bottom, pad_left, pad_right, + stride_h, stride_w, ins_h, ins_w, ins_last_h, ins_last_w, opd0_sign, + res0_sign, rshift_bits, 0); + } + return ret; +} + +static int test_depthwise(CVI_RT_HANDLE ctx, bmk_ctx_t *bk_ctx, int ic, int ih, + int iw, int kh, int kw, int pad_right, int pad_left, + int stride_h, int stride_w, bool has_bias, fmt_t ifmt) +{ + // print_pooling_param(param); + param_t param; + param_t *p = ¶m; + assert(ifmt == FMT_BF16 || ifmt == FMT_I8 || ifmt == FMT_U8); + memset(p, 0, sizeof(*p)); + + int in = 1; + // TODO: verify dialate > 1 + int dilation_h = 1; + int dilation_w = 1; + int relu_enable = 0; + int rshift_bits = 0; + + // TODO: verity ins_x + int org_oh = pooling_oh(0, 0, 0, 0, stride_h, ih, kh, dilation_h); + int org_ow = + pooling_ow(0, 0, pad_left, pad_right, stride_w, iw, kw, dilation_w); + int org_oc = ic; + int org_o_shape_size = in * org_oc * org_oh * org_ow; + u16 *output; + bmk1880v2_tdma_tg2l_tensor_copy_param_t p1; + bmk1880v2_tdma_l2tg_tensor_copy_param_t p2; + memset(&p1, 0, sizeof(p1)); + memset(&p2, 0, sizeof(p2)); + // weight / ofmap not support U8 format + fmt_t other_fmt = ifmt == FMT_BF16 ? FMT_BF16 : FMT_I8; + + // alloc testbench, input/ref + u16 *input = alloc_input(ic, ih, iw, ifmt); + u16 *weight = alloc_weight(ic, kh, kw, ifmt); + u32 *bias = NULL; + if (has_bias) + bias = alloc_bias(ic, ifmt); + + u16 *output_ref = alloc_output(ic, org_oh, org_ow); + + // init ref + init_ref(ic, ih, iw, kh, kw, pad_right, pad_left, stride_h, stride_w, ifmt, + input, weight, bias, output_ref); + // assert(ret == BM_SUCCESS); + + // init param + // TODO: verify pad_top/pad_bottom + // TODO: verify ins_h_x + p->pad_left = pad_left; + p->pad_right = pad_right; + p->pad_top = 0; + p->pad_bottom = 0; + p->ins_h = 0; + p->ins_last_h = 0; + p->ins_w = 0; + p->ins_last_w = 0; + p->dilation_h = dilation_h; + p->dilation_w = dilation_w; + p->stride_h = stride_h; + p->stride_w = stride_w; + p->relu_enable = relu_enable; + p->rshift_bits = rshift_bits; + p->bias = NULL; + + // prepard load / input / weight / bias / output new shape / stride + bmk1880v2_tensor_lmem_shape_t tl_load_shape; + bmk1880v2_tensor_lmem_stride_t tl_load_stride; + bmk1880v2_tensor_tgmem_shape_t tg_shape; + bmk1880v2_tensor_tgmem_stride_t tg_stride; + bmk1880v2_tensor_lmem_shape_t tl_weight_shape; + bmk1880v2_tensor_lmem_shape_t tl_bias_shape; + bmk1880v2_tensor_lmem_shape_t tl_output_shape; + bmk1880v2_tensor_lmem_t *tmp_tl_load; + bmk1880v2_tensor_tgmem_t *tmp_tg; + + // get reshaped information + int r = bm1880v2_reshape_channel_same( + bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, stride_h, stride_w, + &tl_load_shape, &tl_load_stride, &tg_shape, &tg_stride, &tl_weight_shape, + &tl_bias_shape, &tl_output_shape, ifmt, /*align*/1); + + if (r == -1) { + printf("could not reshape it, 81\n"); + free_depthwise_param(bk_ctx, p); + + free(input); + free(weight); + free(bias); + free(output_ref); + return -1; + } + + // prepare input tg + { + bmk1880v2_tensor_tgmem_shape_t put_tg_shape; + + put_tg_shape.n = in; + put_tg_shape.c = ic; + put_tg_shape.h = ih; + put_tg_shape.w = iw; + 
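+    // Stage the input to global memory in its natural (n, c, ih, iw) layout;
+    // the reload below then reads it back through the tg_shape/tg_stride
+    // produced by bm1880v2_reshape_channel_same, which re-slices the rows
+    // across extra channels for the SAME-mode reshape under test.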
bmk1880v2_tensor_tgmem_t *put_tg = + alloc_tg_bf16_gmem(&ctx, put_tg_shape, ifmt); + put_tg_bf16_gmem(&ctx, put_tg, (u8 *)input); + free_tg_gmem(&ctx, put_tg); + } + + // prepare load input, put to tg and load back + { + tmp_tl_load = alloc_tl(bk_ctx, tl_load_shape, ifmt, /*eu_align*/0); + assert(tmp_tl_load); + + tmp_tg = alloc_tg_bf16_gmem(&ctx, tg_shape, ifmt); + tmp_tg->stride = tg_stride; + + p1.src = tmp_tg; + p1.dst = tmp_tl_load; + + bmk1880v2_tdma_g2l_bf16_tensor_copy(bk_ctx, &p1); + test_submit(&ctx); + free_tg_gmem(&ctx, tmp_tg); + + + // fit for hw + tmp_tl_load->stride = bmk1880v2_tensor_lmem_default_stride( + bk_ctx, tmp_tl_load->shape, ifmt, /*align*/1); + p->ifmap = tmp_tl_load; + } + + // prepare load bias, put to tg and load back + if (has_bias) { + // bias must i8 + fmt_t bias_fmt = ifmt == FMT_BF16 ? FMT_BF16 : FMT_I8; + p->bias = + bmk1880v2_lmem_alloc_tensor(bk_ctx, tl_bias_shape, bias_fmt, 0); + + // duplicate bias and replace old + u32 *new_bias = bm1880v2_reshape_channel_bias( + (u8 *)bias, tl_bias_shape.n, tl_bias_shape.c, tl_bias_shape.h, + tl_bias_shape.w, org_oc, ifmt); + + // free old one + free(bias); + bias = new_bias; + put_bias_tensor(&ctx, bk_ctx, p->bias, bias); + } + + // prepare load weight, put to tg and load back + { + p->weight = bmk1880v2_lmem_alloc_tensor(bk_ctx, tl_weight_shape, + other_fmt, /*align*/1); + assert(p->weight); + + // duplicate kernel with c + u8 *new_weight = bm1880v2_reshape_channel_weight( + (u8 *)weight, tl_weight_shape.n, tl_weight_shape.c, tl_weight_shape.h, + tl_weight_shape.w, org_oc, ifmt); + + // free old one + free(weight); + weight = (u16 *)new_weight; + put_bf16_tensor_g2l(&ctx, bk_ctx, p->weight, (u16 *)weight, ifmt); + } + + // prepard ofmap + { + // we allocate 'same' mode shape + p->ofmap = bmk1880v2_lmem_alloc_tensor(bk_ctx, tl_output_shape, + other_fmt, /*align*/1); + assert(p->ofmap); + } + + // printf("p->ifmap at %p, c is %d\n", p->ifmap, tmp_tl_load->shape.c); + + // emit + if (ifmt == FMT_BF16) { + bmk1880v2_tiu_depthwise_convolution(bk_ctx, p); + } else { + bmk1880v2_tiu_depthwise_convolution(bk_ctx, p); + } + + // output = (u16 *)get_bf16_tensor_l2g(&ctx, bk_ctx, p->ofmap, ifmt); + + // check with no pad if true + int is_valid_pack = false; + bmk1880v2_tensor_lmem_shape_t r_ofmap_shape; + bmk1880v2_tensor_lmem_stride_t r_ofmap_stride; + bmk1880v2_tensor_tgmem_shape_t r_tg_shape; + bmk1880v2_tensor_tgmem_stride_t r_tg_stride; + + reshape_valid_output(bk_ctx, p->ofmap, org_oc, org_oh, org_ow, &r_ofmap_shape, + &r_ofmap_stride, &r_tg_shape, &r_tg_stride, ifmt); + + p1.dst = p->ofmap; + + if (is_valid_pack) { + bmk1880v2_tensor_tgmem_shape_t dst_shape; + dst_shape.n = p->ofmap->shape.n; + dst_shape.c = p->ofmap->shape.c; + dst_shape.h = p->ofmap->shape.h; + dst_shape.w = p->ofmap->shape.w; + tg_t *tg_tmp = alloc_tg_bf16_gmem(&ctx, dst_shape, ifmt); + + p2.src = p->ofmap; + p2.dst = tg_tmp; + + // store for later reshape + bmk1880v2_tdma_l2g_bf16_tensor_copy(bk_ctx, &p2); + test_submit(&ctx); + + // free useless for later reallocate + free_depthwise_param(bk_ctx, p); + + p->ofmap = bmk1880v2_lmem_alloc_tensor(bk_ctx, r_ofmap_shape, ifmt, + /*eu_align*/0); + assert(p->ofmap); + + tg_tmp->shape = r_tg_shape; + tg_tmp->stride = r_tg_stride; + + p1.src = tg_tmp; + p1.dst = p->ofmap; + bmk1880v2_tdma_g2l_bf16_tensor_copy(bk_ctx, &p1); + free_tg_gmem(&ctx, tg_tmp); + } + + + fmt_t ofmap_fmt = ifmt == FMT_BF16 ? 
FMT_BF16 : FMT_I8; + output = (u16 *)get_bf16_tensor_l2g(&ctx, bk_ctx, p1.dst, ofmap_fmt); + compare_results(p, input, weight, bias, output, output_ref, org_o_shape_size, + is_valid_pack, org_oc, org_oh, org_ow); + + // free resource + if (is_valid_pack) { + free_tl(bk_ctx, p->ofmap); + } else { + free_depthwise_param(bk_ctx, p); + } + + free(input); + free(weight); + free(bias); + free(output); + + return 1; +} + +static void init_input(param_t *p, int *ic, int *ih, int *iw, int *kh, int *kw, + int *pad_right, int *pad_left) +{ + *ic = p->ifmap->shape.c; + *ih = p->ifmap->shape.h; + *iw = p->ifmap->shape.w; + *kh = p->weight->shape.h; + *kw = p->weight->shape.w; + *pad_right = p->pad_right; + *pad_left = p->pad_left; +} + +static int test_depthwise_pooling(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx) +{ + int loop = 1; + int test_finished_num = 0; + int ihs[] = {24, 96, 120, 480, 0}; + int iws[] = {16, 17, 19, 23, 128, 256, 0}; + int stride_hs[] = {3, 4, 0}; + fmt_t formats[] = {FMT_I8, FMT_U8, FMT_BF16, FMT_F32}; + int ic, ih, iw, kh, kw, pad_right, pad_left; + fmt_t ifmt; + param_t param; + assert(print_pooling_param); + + ifmt = FMT_U8; + param = random_depthwise_param(bk_ctx, 36, 11, 3, ifmt); + init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); + test_finished_num += + test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, + param.stride_h, param.stride_w, param.bias, ifmt); + print_pooling_param(¶m); + free_depthwise_struct(¶m); + + ifmt = FMT_U8; + param = random_depthwise_param(bk_ctx, 24, 29, 3, ifmt); + init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); + free_depthwise_struct(¶m); + test_finished_num += + test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, + param.stride_h, param.stride_w, param.bias, ifmt); + + ifmt = FMT_BF16; + param = random_depthwise_param(bk_ctx, 480, 53, 3, ifmt); + init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); + free_depthwise_struct(¶m); + test_finished_num += + test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, + param.stride_h, param.stride_w, param.bias, ifmt); + + ifmt = FMT_I8; + param = random_depthwise_param(bk_ctx, 480, 61, 3, ifmt); + init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); + free_depthwise_struct(¶m); + test_finished_num += + test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, + param.stride_h, param.stride_w, param.bias, ifmt); + + ifmt = FMT_U8; + param = random_depthwise_param(bk_ctx, 24, 17, 3, ifmt); + init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); + free_depthwise_struct(¶m); + test_finished_num += + test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, + param.stride_h, param.stride_w, param.bias, ifmt); + + ifmt = FMT_BF16; + param = random_depthwise_param(bk_ctx, 48, 65, 3, ifmt); + init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); + free_depthwise_struct(¶m); + test_finished_num += + test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, + param.stride_h, param.stride_w, param.bias, ifmt); + + ifmt = FMT_I8; + param = random_depthwise_param(bk_ctx, 48, 63, 3, ifmt); + init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); + free_depthwise_struct(¶m); + test_finished_num += + test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, + param.stride_h, param.stride_w, param.bias, ifmt); + + for (int i = 0; i < loop; i++) { + for (int i = 0; ihs[i] != 0; i++) { + for (int j = 0; iws[j] != 0; j++) { + for (int k = 0; 
stride_hs[k] != 0; k++) { + for (int l = 0; formats[l] != 0; l++) { + if (ihs[i] >= 480 && formats[l] == FMT_BF16) { + continue; + } + param = random_depthwise_param(bk_ctx, ihs[i], iws[j], stride_hs[k], + formats[l]); + ifmt = formats[l]; + printf("test[%d] ih/iw/sh/fmt is {%d, %d, %d, %d}\n", + test_finished_num, ihs[i], iws[j], stride_hs[k], formats[l]); + + init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); + free_depthwise_struct(¶m); + int r = test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, + pad_left, param.stride_h, param.stride_w, + param.bias, ifmt); + test_finished_num += r; + } + } + } + } + } + printf("Test finished %u\n", test_finished_num); + + return test_finished_num; +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + int round_mode; + round_mode = set_store_feround(); + int ret = test_depthwise_pooling(&ctx, bk_ctx); + assert(ret >= 0); + printf("pass\n"); + restore_feround(round_mode); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_inv_sqrt.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_inv_sqrt.cpp new file mode 100644 index 000000000..bae707df6 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_inv_sqrt.cpp @@ -0,0 +1,2494 @@ +/** + */ +#include "../1880v2_test_util.h" +#define OUT +#define IN + +using namespace std; +//TODO: get from ctx +static u32 channel = 32; // -128.0 && input < 128.0; + if (!ret) { + printf("invalid int8 range, input is %f\n", input); + } + return ret; +} + +// -2 * (-2^-30) + m = base; + if (p == 0) { + m = 1; // pow(base, 0) its fine + } + else if (p > 0) { + p = p - 1; + } + else { + // p < 0 + p = p + 1; + } + } + + double f = (double) (m * pow(base, p * -0.5)); + + if (isnan(f)) { + assert(0); + } + return f; +} + +static void tl_lut_ref( + u16 *ofmap, + u16 *ifmap, + u16 *table, + u16 *table_slope, + tl_shape_t ifmap_shape, + tl_shape_t table_shape) +{ + int tn, th, tw; + + tn = table_shape.n; + th = table_shape.h; + tw = table_shape.w; + assert(tn == 1); + assert(th * tw == 256); + assert(ofmap); + assert(ifmap); + assert(table); + assert(table_slope); + assert(tl_shape_size(&ifmap_shape)); + + // TODO: use c function + // TODO: cal error with `eval_lut.py` +#if 0 + // 1. dump all input as binary file + #define INFP32FILE "inv_infp32file.bin" + #define OUTBF16FILE "inv_lutbf16out.bin" + FILE* pFile; + pFile = fopen(INFP32FILE, "wb"); + fwrite(ifmap, 1, tl_shape_size(&ifmap_shape) *sizeof(u16), pFile); + fclose(pFile); + + // 2. read result from `eval_lut.py` + char command[256]; + // func_id 4 means invsqrt + // lut_type_id 1 means exp + sprintf(command, "python eval_lut.py --lut_input_range_start %d --lut_input_range_end %d --func_id 4 --lut_type_id 1 --inputfloat32 %s --outputbf16 %s 2>&1 > 2\n", + exp_start, exp_end, + INFP32FILE, OUTBF16FILE); + + // printf ("command is %s\n", command); + system(command); + + pFile = fopen(OUTBF16FILE, "rb"); + if (!pFile) { + fprintf(stderr, "open golden %s fail\n", OUTBF16FILE); + exit(-1); + } + + fread(ofmap, sizeof(u16), tl_shape_size(&ifmap_shape), pFile); + fclose(pFile); +#endif + +#if 0 + for (u64 i = 0; i < tl_shape_size(&ifmap_shape); i++) { + printf ("ref %" PRIu64 " input %x golden %x\n", i, ifmap[i], ofmap[i]); + } +#endif +} + +static void gen_sqrt_inv(u16 *table_data, u64 table_size) { + // S(x) = 1 / (1 + (e^-x)) + // 0, exp from 0 -62 -61 .. 
62 63 + for (int i = 0; i < half; i++) { + float exp = exp_start + i; + double s = _gen_sqrt_inv(2, exp); + sqrt_hw[idx] = s; + table_data[idx] = convert_fp32_bf16(s); +#if 0 + printf("t [%" PRIu64 "] is %f(%e - %.8lf)[2^%f] bf %x\n", idx, convert_bf16_fp32(table_data[idx]), s, s, exp, table_data[idx]); +#endif + idx++; + } + + //// idx = 127 dont care + s = _gen_sqrt_inv(2, -0); + sqrt_hw[idx] = s; + table_data[idx] = convert_fp32_bf16(s); +#if 0 + printf("t [%" PRIu64 "] is %f[%d] bf %x\n", idx, convert_bf16_fp32(table_data[idx]), 0, table_data[idx]); +#endif + idx++; + + for (int i = 1; i < half; i++) { + float exp = exp_start + i; + double s = _gen_sqrt_inv(-2, exp); + sqrt_hw[idx] = s; + table_data[idx] = convert_fp32_bf16(s); +#if 0 + printf("t [%" PRIu64 "] is %f(%e - %.8lf)[(-2)^%f] bf %x\n", idx, convert_bf16_fp32(table_data[idx]), s, s, exp, table_data[idx]); +#endif + idx++; + } + + // idx = 255 dont care + //s = _gen_sqrt_inv(2, 0); + //table_data[idx] = convert_fp32_bf16(s); + //printf("t [%" PRIu64 "] is %f[%d]\n", idx, convert_bf16_fp32(table_data[idx]), 0); + //idx++; + +#if 0 + for (u32 i = 0; i < table_hw; i++) { + printf("t [%u] is %f\n", i, convert_bf16_fp32(table_data[i])); + } +#endif + + // duplicate channel #1 to #31 + //TODO: tensor copy + for (u32 i = 1; i < channel; i++) { + memcpy(&table_data[i * table_hw], &table_data[0], sizeof(u16) * table_hw); + } +} + +static void gen_sqrt_inv_slope(u16 IN *table_data, u16* OUT table_slope, u64 table_size) { + + u32 half = table_size / channel / 2; + assert(half == 128); + assert(table_data); + + int idx = 0; + int i = 0; + double f_x0 = sqrt_hw[i]; + double f_x1 = sqrt_hw[i+1]; + double x0 = 0; + double x1 = pow(2.0, exp_start); + double s = (f_x1 - f_x0) / (x1 - x0); + table_slope[idx] = convert_fp32_bf16(s); +#if 0 + printf ("slope [%u] = %f, 0x%x(org:%e(%.8lf)) f_x0 %lf f_x1 %lf\n", + i, convert_bf16_fp32(table_slope[i]), table_slope[i], s, s, f_x0, f_x1); +#endif + idx++; + + for (u32 i = 0; i < table_hw; i++) { + double f_x0 = sqrt_hw[idx]; + double f_x1 = sqrt_hw[idx+1]; + int shift = 0; + int sign = 1; + if (idx >= 128) { + shift = 128; + sign = -1; + } + double exp = exp_start + (double)i - (double)shift; + double x0 = pow(sign * 2.0, exp); + double x1 = pow(sign * 2.0, exp + 1); + if (idx == 127 || idx >= 255) { + double s = 0.0; + table_slope[idx] = convert_fp32_bf16(s); // not used + idx++; + continue; + } + else if (idx == 128) { + x0 = 0; + exp = exp_start; //= exp_start && exp <= exp_end); + + double s = (f_x1 - f_x0) / (x1 - x0); + table_slope[idx] = convert_fp32_bf16(s); +#if 0 + printf ("slope [%u] = %f, 0x%x(org:%e(%.8lf)) (%.8lf - %.8lf) / (%.8lf - %.8lf), diif is %d\n", + idx, convert_bf16_fp32(table_slope[idx]), table_slope[idx], s, s, + f_x1, f_x0, x1, x0, exp_start + i - shift); +#endif + idx++; + } + + // duplicate channel #1 to #31 + //TODO: tensor copy + for (u64 i = 1; i < channel; i++) { + memcpy(&table_slope[table_hw * i], &table_slope[0], sizeof(u16) * table_hw); + } +} + +static bool verify(u16 *ofmap_data, u16 *ref_data, u64 ofmap_size) { + u64 size = ofmap_size; + if (mode == PRE_DATA_COMPARE_FIX) { + size = sizeof(sigmode_golden_bf16) / sizeof(sigmode_golden_bf16[0]); + } + + for (u64 i = 0; i < size; i++) { + u16 ref = ref_data[i]; + if (mode == PRE_DATA_COMPARE_FIX) { + ref = sigmode_golden_bf16[i]; + } + + if (ofmap_data[i] != ref) { + fprintf(stderr, + "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, ofmap_data[i], ref_data[i]); + exit(-1); + +#if 0 + for (u64 i = 
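+  /* added note on the layout built by gen_sqrt_inv()/gen_sqrt_inv_slope():
+     indices 0..127 cover the positive-exponent half (the slope segment at
+     127 is unused), index 128 is the boundary entry for exponent 0, and
+     indices 129..255 cover the negative-base half (segment 255 unused);
+     the slope table stores per-segment (f(x1) - f(x0)) / (x1 - x0) for
+     linear interpolation between adjacent exponent breakpoints. */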
0; i < ofmap_size; i++) { + printf("error, dump all to [%" PRIx64 "]%" PRIu64 " source %x ref %x\n", i, i, ofmap_data[i], ref_data[i]); + } + +#endif + } + } + return true; +} + +static void test_tl_int8_lut_bf16(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk) +{ + // TODO: check more shape / align + tl_shape_t ifmap_shape; + if (mode == PRE_DATA_COMPARE_FIX) { + ifmap_shape = {1, channel, 8, 8}; + } + else { + ifmap_shape = {1, channel, 16, 16}; + } + + tl_shape_t table_shape = {1, channel, table_h, table_w}; // hard code for hw, hw:32x8 + tl_shape_t ofmap_shape = ifmap_shape; + + u64 ifmap_size = tl_shape_size(&ifmap_shape); + u64 table_size = tl_shape_size(&table_shape); + u64 ofmap_size = tl_shape_size(&ofmap_shape); + + fmt_t fmt = FMT_BF16; + + int data_type_size = bytesize_of_fmt(fmt); + u64 ifmap_bytesize = ifmap_size * data_type_size; + u64 table_bytesize = table_size * data_type_size; + u64 ofmap_bytesize = ofmap_size * data_type_size; + + // hw ONLY support index in int8 + u16 *ifmap = (u16 *)xmalloc(ifmap_bytesize); + memset(ifmap, 0x00, ifmap_bytesize); + + u16 *ifmap_slope = (u16 *)xmalloc(ifmap_bytesize); + memset(ifmap_slope, 0x00, ifmap_bytesize); + + if (mode == PRE_DATA_COMPARE_FIX) { + memcpy(ifmap, &test_pattern, sizeof(test_pattern)); +#if 0 + for (u64 i = 0; i < ifmap_size; i++) { + printf("source if[%" PRIu64 "] is %e bf16 %f (bf16)with 0x%x log2f is %f\n", i, convert_bf16_fp32(ifmap[i]), convert_bf16_fp32(ifmap[i]), ifmap[i], + log2f(convert_bf16_fp32(ifmap[i]))); + } +#endif + } + else { + for (u64 i = 0; i < ifmap_size; i++) { + // input range 0.001 - 32 + float input = ((int)i % 31) + (i % 100) * 0.012; + assert(check_input_int8_range(input)); + ifmap[i] = convert_fp32_bf16(input); +#if 0 + printf("source if[%" PRIu64 "] is bf16 %f, input is %f (bf16)with 0x%x\n", i, convert_bf16_fp32(ifmap[i]), input, ifmap[i]); +#endif + } + } + + u16 *table_data = (u16 *)xmalloc(table_bytesize); + gen_sqrt_inv (table_data, table_size); + + u16 *table_data_slope = (u16 *)xmalloc(table_bytesize); + gen_sqrt_inv_slope(table_data, table_data_slope, table_size); + + u16 *ref_data = (u16 *)xmalloc(ofmap_bytesize); + tl_lut_ref(ref_data, ifmap, table_data, table_data_slope, ifmap_shape, table_shape); + + tl_t *tl_ifmap = + alloc_tl(bmk,ifmap_shape, fmt, /*align*/1); + tl_t *tl_table_answer = + alloc_tl(bmk, table_shape, fmt, /*align*/1); + tl_t *tl_table_answer_slope = + alloc_tl(bmk, table_shape, fmt, /*align*/1); + + tl_t *tl_ofmap_A_idx = + alloc_tl(bmk,ofmap_shape, fmt, /*align*/1); + tl_t *tl_ofmap_B_slope = + alloc_tl(bmk,ofmap_shape, fmt, /*align*/1); + tl_t *tl_ofmap_A_base_val = + alloc_tl(bmk,ofmap_shape, fmt, /*align*/1); + tl_t *tl_ofmap_A_base = + alloc_tl(bmk,ofmap_shape, fmt, /*align*/1); + tl_t *tl_ofmap_C = + alloc_tl(bmk,ofmap_shape, fmt, /*align*/1); + + // (i); + printf ("test mode %d...\n", mode); + test_tl_int8_lut_bf16(&ctx, bmk); + } + + test_exit(&ctx); + restore_feround(round_mode); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_lut.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_lut.cpp new file mode 100644 index 000000000..912b6d237 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_lut.cpp @@ -0,0 +1,107 @@ +#include "../1880v2_test_util.h" + +static u64 shape_size(tl_shape_t s) +{ + return s.n * s.c * s.h * s.w; +} + +static void tl_lut_ref( + u16 *ofmap, + u16 *ifmap, + u16 *table, + tl_shape_t ifmap_shape, + tl_shape_t table_shape) +{ + int ih, iw; + int tn, th, tw; + + ih = ifmap_shape.h; + iw = ifmap_shape.w; + tn = 
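+  /* added note: each of the 32 lanes carries its own 256-entry table; the
+     reference loop below indexes lane ici's table directly with the raw
+     input value, i.e. ofmap[i] = table[ici * (th * tw) + ifmap[i]]. */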
table_shape.n; + th = table_shape.h; + tw = table_shape.w; + assert(tn == 1); + assert(th * tw == 256); + + for (u64 i = 0; i < shape_size(ifmap_shape); i++) { + int ici = i / (ih * iw) % 32; + ofmap[i] = table[ici * (th * tw) + ifmap[i]]; + } +} + +static void test_tl_lut(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx) +{ + tl_shape_t ifmap_shape = {1, 32, 1, 224}; + tl_shape_t table_shape = {1, 32, 32, 8}; + tl_shape_t ofmap_shape = ifmap_shape; + + u64 ifmap_size = shape_size(ifmap_shape); + u64 table_size = shape_size(table_shape); + u64 ofmap_size = shape_size(ofmap_shape); + + fmt_t fmt = FMT_BF16; + + int data_type_size = bytesize_of_fmt(fmt); + u64 ifmap_bytesize = ifmap_size * data_type_size; + u64 table_bytesize = table_size * data_type_size; + u64 ofmap_bytesize = ofmap_size * data_type_size; + + u16 *ifmap_data = (u16 *)xmalloc(ifmap_bytesize); + for (u64 i = 0; i < ifmap_size; i++) + ifmap_data[i] = 0; + //ifmap_data[i] = i - 20; + + u16 *table_data = (u16 *)xmalloc(table_bytesize); + for (u64 i = 0; i < table_size; i++) + table_data[i] = i + i / 256 * 3; + + u16 *ref_data = (u16 *)xmalloc(ofmap_bytesize); + tl_lut_ref(ref_data, ifmap_data, table_data, ifmap_shape, table_shape); + + tl_t *tl_ifmap = + alloc_tl(bk_ctx,ifmap_shape, fmt, 1); + tl_t *tl_table = + alloc_tl(bk_ctx, table_shape, fmt, /*align*/1); + tl_t *tl_ofmap = + alloc_tl(bk_ctx,ofmap_shape, fmt, /*align*/1); + + put_bf16_tensor_g2l(ctx, bk_ctx, tl_ifmap, ifmap_data, fmt); + put_bf16_tensor_g2l(ctx, bk_ctx, tl_table, table_data, fmt); + + bmk1880v2_tiu_lookup_table_param_t p12; + memset(&p12, 0, sizeof(p12)); + p12.ofmap = tl_ofmap; + p12.ifmap = tl_ifmap; + p12.table = tl_table; + bmk1880v2_tiu_lookup_table(bk_ctx, &p12); + test_submit(ctx); + + u16 *ofmap_data = (u16*)get_bf16_tensor_l2g(ctx, bk_ctx, tl_ofmap, fmt); + for (u64 i = 0; i < ofmap_size; i++) { + if (ofmap_data[i] != ref_data[i]) { + fprintf(stderr, + "comparing failed at ofmap_data[%" PRIu64 "], got %d, exp %d\n", + i, ofmap_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_ofmap); + free_tl(bk_ctx, tl_table); + free_tl(bk_ctx, tl_ifmap); + + free(ifmap_data); + free(table_data); + free(ref_data); + free(ofmap_data); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + test_tl_lut(&ctx, bk_ctx); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_mask_kernel.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_mask_kernel.cpp new file mode 100644 index 000000000..cafb35db8 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_mask_kernel.cpp @@ -0,0 +1,160 @@ +#include "../1880v2_test_util.h" +#define OUT +#define IN +#include +#include +#include +#include +#include +#include +//#define DBG + +using namespace std; + +/** + * pre_data means we test fixed pattern, it should be same sa lut + */ +// enum TEST_MODE { +// BF16_MASK_TYPE_GT_0 = 0, // remain > 0 +// //BF16_MASK_TYPE_GE_0, // remain >= 0 +// //BF16_MASK_TYPE_EQ_0, // remain = 0 +// //BF16_MASK_TYPE_LT_0, // remain < 0 +// //BF16_MASK_TYPE_LE_0, // remain <= 0 +// BF16_MASK_MAX +//}; + +enum BF16_MASK_TYPE mode; + +struct pattern { + float *input; + float *ref; + int len; +}; +#define SIZEOF(x) (sizeof(x) / sizeof(x[0])) +float bf16_mask_type_gt_0_input[] = { + -1 * pow(2, -62), -0.003, -1.0, -100000, 0.000001, 1, 1000, pow(2, 62), 0}; + +float bf16_mask_type_gt_0_output[] = {0, 0, 0, 0, 1, 1, 1, 1, 0}; +float bf16_mask_type_ge_0_output[] = {0, 0, 0, 0, 1, 1, 1, 1, 1}; +float 
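+/* added note: the five expected-output vectors below all share the single
+   bf16_mask_type_gt_0_input vector; patterns[] is indexed in
+   BF16_MASK_TYPE enum order (>0, >=0, ==0, <0, <=0). */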
bf16_mask_type_eq_0_output[] = {0, 0, 0, 0, 0, 0, 0, 0, 1}; +float bf16_mask_type_lt_0_output[] = {1, 1, 1, 1, 0, 0, 0, 0, 0}; +float bf16_mask_type_le_0_output[] = {1, 1, 1, 1, 0, 0, 0, 0, 1}; + +int input_sz = + sizeof(bf16_mask_type_gt_0_input) / sizeof(bf16_mask_type_gt_0_input[0]); + +static struct pattern patterns[] = { + {bf16_mask_type_gt_0_input, bf16_mask_type_gt_0_output, input_sz}, + {bf16_mask_type_gt_0_input, bf16_mask_type_ge_0_output, input_sz}, + {bf16_mask_type_gt_0_input, bf16_mask_type_eq_0_output, input_sz}, + {bf16_mask_type_gt_0_input, bf16_mask_type_lt_0_output, input_sz}, + {bf16_mask_type_gt_0_input, bf16_mask_type_le_0_output, input_sz}, +}; + +static void testbench(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk) +{ + fmt_t fmt = FMT_BF16; + struct pattern *p = &patterns[mode]; + u32 input_n = 1; + u32 input_c = 1; + u32 input_h = 1; + u32 input_w = p->len; + + tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w}; + tl_shape_t ofmap_shape = ifmap_shape; + u64 ifmap_size = tl_shape_size(&ifmap_shape); + u64 ofmap_size = tl_shape_size(&ofmap_shape); + + int data_type_size = bytesize_of_fmt(fmt); + u64 ifmap_bytesize = ifmap_size * data_type_size; + u64 ofmap_bytesize = ofmap_size * data_type_size; + + tl_shape_t table_shape; + u64 table_bytesize = bf16_lut_tbl_bytesize(bmk, &table_shape, fmt); + + tl_t *tl_ifmap = alloc_tl(bmk, ifmap_shape, fmt, /*align*/1); + tl_t *tl_ofmap_bf16 = alloc_tl(bmk, ofmap_shape, fmt, /*align*/1); + tl_t *out = tl_ofmap_bf16; + tl_t *tl_pos_neg_buf = alloc_tl(bmk, table_shape, fmt, /*align*/1); + tl_t *tl_0_idx_table = alloc_tl(bmk, table_shape, fmt, /*align*/1); + + // temp buf + tl_t *tl_buf = alloc_tl(bmk, ofmap_shape, fmt, /*align*/1); + tl_t *tl_buf2 = alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/1); + tl_t *tl_buf4 = alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/1); + + u16 *input_data = (u16 *)xmalloc(ifmap_bytesize); + u16 *ref_data = (u16 *)xmalloc(ofmap_bytesize); + u16 *table_data_atan_pos_neg = (u16 *)xmalloc(table_bytesize); + u16 *idx_0_table_data = (u16 *)xmalloc(table_bytesize); + + bf16_gen_0_tbl(idx_0_table_data, &table_shape); + bf16_atan_pos_neg(table_data_atan_pos_neg, &table_shape); + + for (u32 i = 0; i < ifmap_size; i++) { + input_data[i] = convert_fp32_bf16(p->input[i]); + ref_data[i] = convert_fp32_bf16(p->ref[i]); + } + + put_bf16_tensor_g2l(ctx, bmk, tl_ifmap, (u16 *)input_data, fmt); + put_bf16_tensor_g2l(ctx, bmk, tl_pos_neg_buf, (u16 *)table_data_atan_pos_neg, + fmt); + put_bf16_tensor_g2l(ctx, bmk, tl_0_idx_table, (u16 *)idx_0_table_data, fmt); + + bf16_emit_mask(bmk, tl_ifmap, tl_buf, tl_buf2, tl_buf4, tl_pos_neg_buf, + tl_0_idx_table, out, fmt, mode); + + test_submit(ctx); + + u16 *ofmap_data = (u16 *)get_bf16_tensor_l2g(ctx, bmk, out, out->fmt); + + for (u32 i = 0; i < ifmap_size; i++) { + if (ref_data[i] != ofmap_data[i]) { + fprintf(stderr, + "comparing failed at mode %d ofmap_data[%u] got %f(0x%x), ref " + "%f(0x%x)\n", + mode, i, convert_bf16_fp32(ofmap_data[i]), ofmap_data[i], + convert_bf16_fp32(ref_data[i]), ref_data[i]); + exit(-1); + } + } +#if 0 + if (!is_close) { + float input = convert_bf16_fp32(ifmap[i]); + } +#endif + free_tl(bmk, tl_buf4); + free_tl(bmk, tl_buf2); + free_tl(bmk, tl_buf); + free_tl(bmk, tl_0_idx_table); + free_tl(bmk, tl_pos_neg_buf); + free_tl(bmk, tl_ofmap_bf16); + free_tl(bmk, tl_ifmap); + + free(input_data); + free(ref_data); + free(ofmap_data); + free(table_data_atan_pos_neg); + free(idx_0_table_data); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; 
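+  /* added sketch (not part of the original test): a scalar reference for
+     the five mask modes exercised below; the GE/EQ/LT/LE enumerator names
+     are assumptions taken from the commented-out enum above. */
+#if 0
+  auto mask_ref = [](float x, int m) -> float {
+    switch (m) {
+    case BF16_MASK_TYPE_GT_0: return x > 0.0f ? 1.0f : 0.0f;
+    case BF16_MASK_TYPE_GE_0: return x >= 0.0f ? 1.0f : 0.0f;
+    case BF16_MASK_TYPE_EQ_0: return x == 0.0f ? 1.0f : 0.0f;
+    case BF16_MASK_TYPE_LT_0: return x < 0.0f ? 1.0f : 0.0f;
+    default:                  return x <= 0.0f ? 1.0f : 0.0f; /* LE_0 */
+    }
+  };
+  (void)mask_ref;
+#endif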
+ int round_mode; + + round_mode = set_store_feround(); + + test_init(&ctx, &bmk); + + for (int i = BF16_MASK_TYPE_GT_0; i < BF16_MASK_MAX; i++) { + mode = static_cast(i); + printf("test mode %d...\n", mode); + testbench(&ctx, bmk); + } + + test_exit(&ctx); + restore_feround(round_mode); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_matrix_mac.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_matrix_mac.cpp new file mode 100644 index 000000000..49bcb38e8 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_matrix_mac.cpp @@ -0,0 +1,328 @@ +#include "../1880v2_test_util.h" + +typedef bmk1880v2_tiu_matrix_multiplication_param_t param_t; +int random_seed; + +static u64 matrix_size(const ml_t *ml) +{ + + u64 row = ml->shape.n; + u64 col = ml->shape.col; + return row * col; +} + +static u64 res_size(param_t *p) +{ + return matrix_size(p->res); +} + +static u16 * alloc_left(param_t *p) +{ + u64 size = matrix_size(p->left); + u16 *buf = (u16 *)malloc(sizeof(u16) * size); + for (u64 i = 0; i < size; i++) { + buf[i] = convert_fp32_bf16(i); + } + + return buf; +} + +static u16 * alloc_right(param_t *p) +{ + u64 size = matrix_size(p->right); + u16 *buf = (u16 *)malloc(sizeof(u16) * size); + for (u64 i = 0; i < size; i++) { + float val = 0.01; + buf[i] = convert_fp32_bf16(i); + val += 0.01; + } + return buf; +} + +static u32 * alloc_bias(param_t *p) +{ + if (!p->bias) + return NULL; + + u64 size = matrix_size(p->bias); + u32 *buf = (u32 *)malloc(sizeof(u32) * size); + for (u64 i = 0; i < size; i++) { + buf[i] = convert_fp32_hex(i); + } + return buf; +} + +static u32 * alloc_res(param_t *p) +{ + u64 size = res_size(p); + u32 *buf = (u32 *)malloc(sizeof(u32) * size); + for (u64 i = 0; i < size; i++) { + buf[i] = convert_fp32_bf16(i); + } + return buf; +} + +static inline void bf16_relu(float *buf, u64 size) +{ + for (u64 i = 0; i < size; i++) + if (buf[i] < 0) + buf[i] = 0; +} + +static void matrix_mac_ref( + param_t *p, u16 left[], u16 right[], u32 bias[], u32 res[]) +{ + u64 size = res_size(p); + u32 left_col = p->left->shape.col; + u32 right_col = p->right->shape.col; + u32 res_row = p->left->shape.n; + u32 res_col = p->res->shape.col; + u32 left_c = p->left->shape.c; + u32 left_w = p->left->shape.w; + + float *tmp_res = (float *)malloc(sizeof(float) * size); + if (p->add_result) { + for (u32 i = 0; i < res_row * res_col; i++) + tmp_res[i] = convert_bf16_fp32(res[i]); + } else { + for (u32 i = 0; i < res_row * res_col; i++) + tmp_res[i] = 0; + } + for (u32 row = 0; row < res_row; row++) { + for (u32 col = 0; col < res_col; col++) { + for (u32 wi = 0; wi < left_w; wi++) { + for (u32 ci = 0; ci < left_c; ci++) { + if ((wi + (ci*left_w)) >= left_col) + continue; + u32 li = row * left_col + left_w * ci + wi; + u32 ri = (ci* left_w + wi )* right_col + col; + + float l = convert_bf16_fp32(left[li]); + float r = convert_bf16_fp32(right[ri]); + tmp_res[row * res_col + col] += l * r; + } + } + } + } + + if (p->bias && bias) { + for (u32 row = 0; row < res_row; row++) { + for (u32 col = 0; col < res_col; col++) { + float b = convert_hex_fp32(bias[col]); + tmp_res[row * res_col + col] += b; + } + } + } + + if (p->relu_enable) + bf16_relu(tmp_res, size); + + for (u64 i = 0; i < size; i++) { + res[i] = convert_fp32_bf16(tmp_res[i]); + } + free(tmp_res); +} + +static void put_bias( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + const ml_t *ml, + u32 data[]) +{ + u64 size = ml->shape.col; + + u16 *tmp = (u16 *)malloc(sizeof(u16) * size * 2); + if (!tmp) + return; + + for (u64 i 
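+  /* added note: an fp32 bias word is carried as two bf16 rows, the high
+     16 bits in row 0 and the low 16 bits in row 1, which is why the bias
+     matrix is allocated with shape.n == 2 further below. */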
= 0; i < size; i++) { + tmp[i] = (data[i] >> 16) & 0xFFFF; + tmp[i + size] = (data[i] & 0xFFFF); + } + + put_bf16_matrix_g2l(ctx, bk_ctx, ml, (u8*)tmp, FMT_BF16); + + free(tmp); +} + +static void put_res( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + const ml_t *ml, + u32 data[]) +{ + u64 size = ml->shape.n * ml->shape.col; + + u16 *tmp = (u16 *)malloc(sizeof(u16) * size); + if (!tmp) + return; + + for (u64 i = 0; i < size; i++) { + tmp[i] = (data[i] & 0xFFFF); + } + + put_bf16_matrix_g2l(ctx, bk_ctx, ml, (u8*)tmp, FMT_BF16); + + free(tmp); +} + +static u32 * get_res( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + param_t *p) +{ + u64 size = res_size(p); + u32 *res = (u32 *)malloc(sizeof(u32) * size); + + u16 *tmp = (u16 *)get_bf16_matrix_l2g(ctx, bk_ctx, p->res, FMT_BF16); + for (u64 i = 0; i < size; i++) + res[i] = tmp[i]; + + free(tmp); + return res; +} + +static void test_param(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, param_t *p) +{ + u16 *left = alloc_left(p); + u16 *right = alloc_right(p); + u32 *bias = alloc_bias(p); + u32 *ref = alloc_res(p); + put_bf16_matrix_g2l(ctx, bk_ctx, p->left, (u8*)left, FMT_BF16); + put_bf16_matrix_g2l(ctx, bk_ctx, p->right, (u8*)right, FMT_BF16); + if (bias) + put_bias(ctx, bk_ctx, p->bias, bias); + if (p->add_result) + put_res(ctx, bk_ctx, p->res, ref); + + bmk1880v2_tiu_matrix_multiplication(bk_ctx, p); + u32 *res = get_res(ctx, bk_ctx, p); + matrix_mac_ref(p, left, right, bias, ref); + u64 size = res_size(p); + for (u64 i = 0; i < size; i++) { + if (res[i] != ref[i]) { + fprintf(stderr, "comparing failed at out[%" PRIu64 "], got %x, exp %x\n", + i, res[i], ref[i]); + fprintf(stderr, "random_seed=%d\n", random_seed); + exit(-1); + } + } + free(left); + free(right); + free(bias); + free(ref); + free(res); +} + +static void destroy_param(bmk_ctx_t *bk_ctx, param_t *p) +{ + if (p->bias) + bmk1880v2_lmem_free_matrix(bk_ctx, p->bias); + if (p->res) + bmk1880v2_lmem_free_matrix(bk_ctx, p->res); + if (p->right) + bmk1880v2_lmem_free_matrix(bk_ctx, p->right); + if (p->left) + bmk1880v2_lmem_free_matrix(bk_ctx, p->left); +} + +static ml_t *alloc_param_res( + bmk_ctx_t *bk_ctx, param_t *p) +{ + ml_shape_t s; + + s.n = p->left->shape.n; + s.c = p->right->shape.c; + s.w = p->right->shape.w; + s.col = p->right->shape.col; + fmt_t fmt = FMT_BF16; + return bmk1880v2_lmem_alloc_matrix(bk_ctx, s, fmt, 1); +} + +static param_t param_0(bmk_ctx_t *bk_ctx) +{ + +retry: + random_seed = clock(); + srand(random_seed); + + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = rand()%2; + p.add_result = 0; /*bf16 HW does not support add_result*/ + p.ps32_mode = 0; + + u32 left_row = rand() % 100 +1; + u32 left_col = rand() % 100 + 1; + u32 left_w = rand() % (left_col/5+1) + 1; // c is generate by w, and make c is larger + u32 left_c = left_col / left_w + (left_col % left_w ? 1: 0); + + u32 right_row = left_col; + u32 right_col = rand() % 100 + 1; + u32 right_w = (rand() % (right_col/5+1) + 1); // make c is larger + u32 right_c = right_col / right_w + (right_col % right_w ? 
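+                /* i.e. right_c = ceil(right_col / right_w): a row of `col`
+                   elements is folded across c lanes of w elements each */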
1: 0) ; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + u32 bias = rand()%2; + p.bias = NULL; + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_BF16, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_BF16, 1); + if (!p.left || !p.right) { + printf("retry init_matrix_param\n"); + destroy_param(bk_ctx, &p); + goto retry; + } + + p.res = alloc_param_res(bk_ctx, &p); + if (bias) { + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + p.bias = bmk1880v2_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_BF16, 1); + } + + if (!p.res || (bias && !p.bias)) { + printf("retry init_matrix_param\n"); + destroy_param(bk_ctx, &p); + goto retry; + } + + return p; +} + + +#define test_one_param(n) \ + do { \ + param_t p = param_##n(bk_ctx); \ + test_param(&ctx, bk_ctx, &p); \ + destroy_param(bk_ctx, &p); \ + } while (0) + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + int round_mode; + round_mode = set_store_feround(); + + for (int i = 0 ; i < 30 ; i++) + test_one_param(0); + + restore_feround(round_mode); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_matrix_mac_ps32.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_matrix_mac_ps32.cpp new file mode 100644 index 000000000..8b7ab77e7 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_matrix_mac_ps32.cpp @@ -0,0 +1,519 @@ +#include "../1880v2_test_util.h" + +typedef bmk1880v2_tiu_matrix_multiplication_param_t param_t; + +typedef struct{ + u32 left_sign; + u32 left_row ; + u32 left_col ; + u32 left_c ; + u32 left_w ; + u32 right_sign; + u32 right_row ; + u32 right_col ; + u32 right_c ; + u32 right_w ; + u32 lshift_bits ; + u32 rshift_bits ; + u32 relu_enable ; + u32 using_bias; + u32 bias_sign; +} matrix_init_para_t; + +u32 random_seed; +matrix_init_para_t matrix_para_t; + +static void make_bmk_matrix_param_ps32(bmk_ctx_t *bk_ctx, param_t *p, int ps32_mode); +static param_t param_init(); + +void print_param(param_t *p) +{ + printf("random_seed =%d\n", random_seed); + printf("ps32_mode =%d\n",p->ps32_mode); + printf("left_shape.n =%d\n",p->left->shape.n); + printf("left_shape.col =%d\n",p->left->shape.col); + printf("left_shape.c =%d\n",p->left->shape.c); + printf("left_shape.w =%d\n",p->left->shape.w); + printf("left_fmt =%d\n",p->left->fmt); + printf("right_shape.n =%d\n",p->right->shape.n); + printf("right_shape.col =%d\n",p->right->shape.col); + printf("right_shape.c =%d\n",p->right->shape.c); + printf("right_shape.w =%d\n",p->right->shape.w); + printf("right_fmt =%d\n",p->right->fmt); + if(p->bias) + { + printf("bias_shape.n =%d\n",p->bias->shape.n); + printf("bias_shape.col =%d\n",p->bias->shape.col); + printf("bias_shape.c =%d\n",p->bias->shape.c); + printf("bias_shape.w =%d\n",p->bias->shape.w); + printf("bias_fmt =%d\n",p->bias->fmt); + } + printf("result_shape.n =%d\n",p->res->shape.n); + printf("result_shape.col =%d\n",p->res->shape.col); + printf("result_shape.c =%d\n",p->res->shape.c); + printf("result_shape.w =%d\n",p->res->shape.w); + printf("result_fmt =%d\n",p->res->fmt); + printf("relu_enable=%d\n",p->relu_enable); + printf("rshift_bits=%d\n",p->rshift_bits); +} + + +static u64 matrix_size(const ml_t *ml) +{ + u64 row = ml->shape.n; + u64 col = ml->shape.col; + return row * col; +} + 
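+/* added sketch (not part of the original commit; split_ps32/merge_ps32 are
+   hypothetical names): how the reference code below carries fp32 partial
+   sums as two 16-bit planes separated by bstride, using the
+   bit-reinterpret helpers from the test utilities. */
+#if 0
+static void split_ps32(float v, u16 *hi, u16 *lo)
+{
+  u32 bits = convert_fp32_hex(v);  /* reinterpret the float as raw bits */
+  *hi = (bits >> 16) & 0xFFFF;     /* plane 0: high half */
+  *lo = bits & 0xFFFF;             /* plane 1: low half */
+}
+
+static float merge_ps32(u16 hi, u16 lo)
+{
+  return convert_hex_fp32(((u32)hi << 16) | lo);
+}
+#endif
+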
+static u64 res_ps32_size(param_t *p) +{ + return matrix_size(p->res); +} + +static u64 res_size(param_t *p) +{ + return matrix_size(p->res); +} + +static u16 * alloc_left(param_t *p) +{ + u64 size = matrix_size(p->left); + u16 *buf = (u16 *)malloc(sizeof(u16) * size); + for (u64 i = 0; i < size; i++) + buf[i] = convert_fp32_bf16(i); + + return buf; +} + +static u16 * alloc_right(param_t *p) +{ + u64 size = matrix_size(p->right); + + u16 *buf = (u16 *)malloc(sizeof(u16) * size); + for (u64 i = 0; i < size; i++) + buf[i] = convert_fp32_bf16(i); + + return buf; +} +static u32 * alloc_bias(param_t *p) +{ + if (!p->bias) + return NULL; + + u64 size = matrix_size(p->bias) / 2; + + u32 *buf = (u32 *)malloc(sizeof(u32) * size); + for (u64 i = 0; i < size; i++) + buf[i] = convert_fp32_hex(i); + + return buf; +} + +static u16 * alloc_ps32_res(param_t *p) +{ + u64 size = res_ps32_size(p)*2; + u16 *buf = (u16 *)malloc(sizeof(u16) * size); + for (u64 i = 0; i < size; i++) + buf[i] = convert_fp32_bf16(i); + + return buf; +} + +static inline void bf16_relu(float *buf, u64 size) +{ + for (u64 i = 0; i < size; i++) + if (buf[i] < 0) + buf[i] = 0; +} + +static int ps32_m2_matrix_mac_ref( + param_t *p, + u16 *left, + u16 *right, + u16 *res) +{ + u64 size = res_ps32_size(p); + u32 left_col = p->left->shape.col; + u32 right_col = p->right->shape.col; + u32 res_row = p->left->shape.n; + u32 res_col = p->res->shape.col; + u32 left_c = p->left->shape.c; + u32 left_w = p->left->shape.w; + + int ret = BM_SUCCESS; + int bstride = res_row * res_col; + + float *tmp_res = (float *)malloc(sizeof(float) * size); + for (u32 i = 0; i < res_row * res_col; i++) + tmp_res[i] = 0; + for (u32 row = 0; row < res_row; row++) { + for (u32 col = 0; col < res_col; col++) { + for (u32 wi = 0; wi < left_w; wi++) { + for (u32 ci = 0; ci < left_c; ci++) { + if ((wi + (ci*left_w)) >= left_col) + continue; + u32 li = row * left_col + left_w * ci + wi; + u32 ri = (ci* left_w + wi )* right_col + col; + + float l = convert_bf16_fp32(left[li]); + float r = convert_bf16_fp32(right[ri]); + tmp_res[row * res_col + col] += l * r; + } + } + } + } + + for (u64 i = 0; i < size; i++) + res[i + bstride*0] = (convert_fp32_hex(tmp_res[i]) >> 16) & 0xFFFF; + for (u64 i = 0; i < size; i++) + res[i + bstride*1] = (convert_fp32_hex(tmp_res[i]) >> 0) & 0xFFFF; + + free(tmp_res); + + return ret; +} + +static int ps32_m3_matrix_mac_ref( + param_t *p, + u16 *left, + u16 *right, + u16 *res) +{ + u64 size = res_ps32_size(p); + u32 left_col = p->left->shape.col; + u32 right_col = p->right->shape.col; + u32 res_row = p->left->shape.n; + u32 res_col = p->res->shape.col; + u32 left_c = p->left->shape.c; + u32 left_w = p->left->shape.w; + + int ret = BM_SUCCESS; + int bstride = res_row * res_col; + + float *tmp_res = (float *)malloc(sizeof(float) * size); + + for (u64 i = 0; i < size; i++) + tmp_res[i] = convert_hex_fp32((res[i + bstride*0] << 16) | res[i + bstride*1]); + + for (u32 row = 0; row < res_row; row++) { + for (u32 col = 0; col < res_col; col++) { + for (u32 wi = 0; wi < left_w; wi++) { + for (u32 ci = 0; ci < left_c; ci++) { + if ((wi + (ci*left_w)) >= left_col) + continue; + u32 li = row * left_col + left_w * ci + wi; + u32 ri = (ci* left_w + wi )* right_col + col; + + float l = convert_bf16_fp32(left[li]); + float r = convert_bf16_fp32(right[ri]); + tmp_res[row * res_col + col] += l * r; + } + } + } + } + + for (u64 i = 0; i < size; i++) + res[i + bstride*0] = (convert_fp32_hex(tmp_res[i]) >> 16) & 0xFFFF; + for (u64 i = 0; i < size; i++) + res[i + 
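+                     /* low 16-bit plane; the high plane was written just above */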
bstride*1] = (convert_fp32_hex(tmp_res[i]) >> 0) & 0xFFFF; + + free(tmp_res); + + return ret; +} + +static int ps32_m1_matrix_mac_ref( + param_t *p, + u16 *left, + u16 *right, + u32 * bias, + u16 *res) +{ + u64 size = res_ps32_size(p); + u32 left_col = p->left->shape.col; + u32 right_col = p->right->shape.col; + u32 res_row = p->left->shape.n; + u32 res_col = p->res->shape.col; + u32 left_c = p->left->shape.c; + u32 left_w = p->left->shape.w; + + int ret = BM_SUCCESS; + int bstride = res_row * res_col; + + float *tmp_res = (float *)malloc(sizeof(float) * size); + + for (u64 i = 0; i < size; i++) { + tmp_res[i] = convert_hex_fp32((res[i + bstride*0] << 16) | res[i + bstride*1]); + } + + for (u32 row = 0; row < res_row; row++) { + for (u32 col = 0; col < res_col; col++) { + for (u32 wi = 0; wi < left_w; wi++) { + for (u32 ci = 0; ci < left_c; ci++) { + if ((wi + (ci*left_w)) >= left_col) + continue; + u32 li = row * left_col + left_w * ci + wi; + u32 ri = (ci* left_w + wi )* right_col + col; + + float l = convert_bf16_fp32(left[li]); + float r = convert_bf16_fp32(right[ri]); + tmp_res[row * res_col + col] += l * r; + } + } + } + } + + if (p->bias && bias) { + for (u32 row = 0; row < res_row; row++) { + for (u32 col = 0; col < res_col; col++) { + float b = convert_hex_fp32(bias[col]); + tmp_res[row * res_col + col] += b; + } + } + } + + if (p->relu_enable) + bf16_relu(tmp_res, size); + + for (u64 i = 0; i < size; i++) + res[i] = convert_fp32_bf16(tmp_res[i]); + + free(tmp_res); + + return ret; +} + +static void put_bias( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + const ml_t *ml, + u32 data[]) +{ + u64 size = ml->shape.col; + + u16 *tmp = (u16 *)malloc(sizeof(u16) * size * 2); + if (!tmp) + return; + + for (u64 i = 0; i < size; i++) { + tmp[i] = data[i] >> 16; + tmp[i + size] = data[i] & 0xFFFF; + } + put_bf16_matrix_g2l(ctx, bk_ctx, ml, (u8*) tmp, FMT_BF16); + + free(tmp); +} + + +static int test_matrix_ps32_ut(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, param_t *p) +{ + make_bmk_matrix_param_ps32(bk_ctx, p, 2); + u16 *left = alloc_left(p); + u16 *right = alloc_right(p); + u16 *ref = alloc_ps32_res(p); + + { + bmerr_t ret = ps32_m2_matrix_mac_ref(p, left, right, ref); + assert(ret == BM_SUCCESS); + + put_bf16_matrix_g2l(ctx, bk_ctx, p->left, (u8*) left, FMT_BF16); + put_bf16_matrix_g2l(ctx, bk_ctx, p->right, (u8*) right, FMT_BF16); + bmk1880v2_tiu_matrix_multiplication(bk_ctx, p); + bmk1880v2_matrix_lmem_t ps32_res; + ps32_res = *p->res; + ps32_res.shape.n *= sizeof(short); + u16 *res = (u16*) get_bf16_matrix_l2g(ctx, bk_ctx, &ps32_res, FMT_BF16); + + int has_error = array_cmp_int8( + "Comparing begin_mode results ...\n", + (s8 *)ref, (s8 *)res ,(int)res_ps32_size(p)*sizeof(int)); + if (has_error) { + printf("Comparison M2 FAILED\n"); + print_param(p); + exit(-1); + }else + printf("Comparison M2 PASS\n"); + free(res); + } + + { + make_bmk_matrix_param_ps32(bk_ctx, p, 3); + + bmerr_t ret = ps32_m3_matrix_mac_ref(p, left, right, ref); + assert(ret == BM_SUCCESS); + + bmk1880v2_tiu_matrix_multiplication(bk_ctx, p); + bmk1880v2_matrix_lmem_t ps32_res; + ps32_res = *p->res; + ps32_res.shape.n *= sizeof(short); + u16 *res = (u16 *) get_bf16_matrix_l2g(ctx, bk_ctx, &ps32_res, FMT_BF16); + + int has_error = array_cmp_int8( + "Comparing m3 results ...\n", + (s8 *)ref, (s8 *)res ,(int)res_ps32_size(p)*sizeof(int)); + if (has_error) { + printf("Comparison M3 FAILED\n"); + print_param(p); + exit(-1); + }else + printf("Comparison M3 PASS\n"); + + free(res); + } + { + make_bmk_matrix_param_ps32(bk_ctx, p, 1); 
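+    /* added note: the ps32 flow is mode 2 (write initial 32-bit partial
+       sums), mode 3 (accumulate on top of them), then mode 1 (add bias,
+       apply relu, and round back to bf16); intermediate read-backs double
+       shape.n because each partial sum occupies two 16-bit planes. */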
+ u32 *bias = alloc_bias(p); + + bmerr_t ret = ps32_m1_matrix_mac_ref(p, left, right, bias, ref); + assert(ret == BM_SUCCESS); + + if(p->bias) + put_bias(ctx, bk_ctx, p->bias, bias); + + bmk1880v2_tiu_matrix_multiplication(bk_ctx, p); + bmk1880v2_matrix_lmem_t ps32_res; + ps32_res = *p->res; + ps32_res.shape.n *= 2; + + u16 *res = (u16 *) get_bf16_matrix_l2g(ctx, bk_ctx, &ps32_res, FMT_BF16); + + int has_error = array_cmp_int8( + "Comparing m1 results ...\n", + (s8 *)ref, (s8 *)res ,(int)res_size(p)*2); + if (has_error) { + printf("Comparison M1 FAILED\n"); + print_param(p); + exit(-1); + }else + printf("Comparison M1 PASS\n"); + + free(res); + free(bias); + } + free(left); + free(right); + free(ref); + return 1; +} + +static void destroy_param(bmk_ctx_t *bk_ctx, param_t *p) +{ + if (p->bias) + bmk1880v2_lmem_free_matrix(bk_ctx, p->bias); + bmk1880v2_lmem_free_matrix(bk_ctx, p->res); + bmk1880v2_lmem_free_matrix(bk_ctx, p->right); + bmk1880v2_lmem_free_matrix(bk_ctx, p->left); +} + +static ml_t *alloc_param_res( + bmk_ctx_t *bk_ctx, param_t *p) +{ + ml_shape_t s; + s.n = p->left->shape.n; + s.c = p->right->shape.c; + s.w = p->right->shape.w; + s.col = p->right->shape.col; + + fmt_t fmt = FMT_BF16; + return bmk1880v2_lmem_alloc_ps32_matrix(bk_ctx, s, fmt, 1); +} + + +static void make_bmk_matrix_param_ps32(bmk_ctx_t *bk_ctx, param_t *p, int ps32_mode) +{ + + ml_shape_t left_shape; + ml_shape_t right_shape; + + p->ps32_mode = ps32_mode; + p->relu_enable = 0; + p->lshift_bits = 0; + p->rshift_bits = 0; + if(ps32_mode==2) + { + left_shape.n = matrix_para_t.left_row; + left_shape.c = matrix_para_t.left_c; + left_shape.w = matrix_para_t.left_w; + left_shape.col = matrix_para_t.left_col; + + right_shape.n = matrix_para_t.right_row; + right_shape.c = matrix_para_t.right_c; + right_shape.w = matrix_para_t.right_w; + right_shape.col = matrix_para_t.right_col; + p->left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_BF16, 1); + p->right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_BF16, 1); + p->bias = NULL; + p->res = alloc_param_res(bk_ctx, p); + }else if(ps32_mode==3) + { + + }else if(ps32_mode==1) + { + p->relu_enable = matrix_para_t.relu_enable; + p->rshift_bits = matrix_para_t.rshift_bits; + if(matrix_para_t.using_bias) + { + right_shape.n = matrix_para_t.right_row; + right_shape.c = matrix_para_t.right_c; + right_shape.w = matrix_para_t.right_w; + right_shape.col = matrix_para_t.right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + p->bias = bmk1880v2_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_BF16, 1); + assert(p->bias); + } + } + //print_param(p); +} + +static param_t param_init(void) +{ + param_t p; + + random_seed = clock(); + srand(random_seed); + + memset(&p, 0, sizeof(param_t)); + memset(&matrix_para_t, 0, sizeof(matrix_init_para_t)); + + matrix_para_t.using_bias = rand()%2; + matrix_para_t.relu_enable = rand()%2; + + matrix_para_t.left_row = rand()%60+1; + matrix_para_t.left_col = rand()%40+1; + matrix_para_t.left_w = matrix_para_t.left_col/0x10 ? rand()%8+8 : matrix_para_t.left_col; + matrix_para_t.left_c = + matrix_para_t.left_col%matrix_para_t.left_w? + matrix_para_t.left_col/matrix_para_t.left_w+1 : matrix_para_t.left_col/matrix_para_t.left_w; + + matrix_para_t.right_row = matrix_para_t.left_col; + matrix_para_t.right_col = rand()%50+1; + matrix_para_t.right_w = rand()%16+1; + matrix_para_t.right_c = + matrix_para_t.right_col%matrix_para_t.right_w? 
+ matrix_para_t.right_col/matrix_para_t.right_w+1 : matrix_para_t.right_col/matrix_para_t.right_w; + return p; +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + int round_mode; + round_mode = set_store_feround(); + + int test_finished_num = 0; + for (int i = 0; i < 30; i++) { + printf("random_test_conv iteration: %d\n", i); + param_t p = param_init(); + + test_finished_num += test_matrix_ps32_ut(&ctx, bk_ctx, &p); + destroy_param(bk_ctx, &p); + } + printf("test_finished_num: %d\n", test_finished_num); + restore_feround(round_mode); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_matrix_transfer.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_matrix_transfer.cpp new file mode 100644 index 000000000..4474e855a --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_matrix_transfer.cpp @@ -0,0 +1,101 @@ +#include "../1880v2_test_util.h" +#include "1880v2_bf16_util.h" + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_I8}, +}; + +static void test_put_and_get_matrix_l2g( + CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, fmt_t fmt, int eu_align) +{ + int row = 5; + int col = 16 * 5 + 2; + int size = row * col; + float val = -100; + + ml_shape_t s = bmk1880v2_matrix_lmem_default_shape(bk_ctx, row, col, fmt); + + u16 *u16data_x = (u16 *)malloc(sizeof(u16) * size); + u16 *u16data_y = (u16 *)malloc(sizeof(u16) * size); + s8 *s8data_x = (s8 *)malloc(sizeof(s8) * size); + s8 *s8data_y = (s8 *)malloc(sizeof(s8) * size); + void *data_x = NULL; + void *data_y = NULL; + u8 *result_x =NULL; + u8 *result_y = NULL; + + // prepare source data + for (int i = 0; i < size; i++) { + if(fmt == FMT_BF16) { + u16data_x[i] = generate_bf16_corner_val(val); + u16data_y[i] = generate_bf16_corner_val(val); + val += 0.1; + } else { + s8data_x[i] = i - 100; + s8data_y[i] = -i; + } + } + data_x = (fmt == FMT_BF16) ? (void *)u16data_x : (void *)s8data_x; + data_y = (fmt == FMT_BF16) ? (void *)u16data_y : (void *)s8data_y; + + // run tpu operations + ml_t *ml_x = bmk1880v2_lmem_alloc_matrix(bk_ctx, s, fmt, eu_align); + ml_t *ml_y = bmk1880v2_lmem_alloc_matrix(bk_ctx, s, fmt, eu_align); + /* + * Interleave two matrice in case the same devmem is reused between + * put_matrix_g2l() and get_matrix_l2g(), in which case the content of + * devmem is already what is expected before bmk1880v2_gdma_store_matrix(). + */ + put_bf16_matrix_g2l(ctx, bk_ctx, ml_x, (u8 *)data_x, fmt); + put_bf16_matrix_g2l(ctx, bk_ctx, ml_y, (u8 *)data_y, fmt); + + + // compare data + //// Get result_x before result_y. + result_x = get_bf16_matrix_l2g(ctx, bk_ctx, ml_x, fmt); + result_y = get_bf16_matrix_l2g(ctx, bk_ctx, ml_y, fmt); + if( COMPARE_PASS != compare_result( data_x, result_x, fmt, size)) + exit(-1); + if( COMPARE_PASS != compare_result( data_y, result_y, fmt, size)) + exit(-1); + free(result_x); + free(result_y); + + //// Get result_y before result_x. 
+ result_y = get_bf16_matrix_l2g(ctx, bk_ctx, ml_y, fmt); + result_x = get_bf16_matrix_l2g(ctx, bk_ctx, ml_x, fmt); + if( COMPARE_PASS != compare_result( data_x, result_x, fmt, size)) + exit(-1); + if( COMPARE_PASS != compare_result( data_y, result_y, fmt, size)) + exit(-1); + free(result_x); + free(result_y); + + // free variables + bmk1880v2_lmem_free_matrix(bk_ctx, ml_y); + bmk1880v2_lmem_free_matrix(bk_ctx, ml_x); + free(u16data_x); + free(u16data_y); + free(s8data_x); + free(s8data_y); +} + +#define TEST_ALIGNED 2 // 1: test unalign only, 2: test both align/unalign +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + + test_init(&ctx, &bk_ctx); + + for (u32 i = 0; i < nr_fmt; i++) { + for (u8 u8_align = 0; u8_align < TEST_ALIGNED; u8_align++) { + test_put_and_get_matrix_l2g(&ctx, bk_ctx, input_fmt[i].src_fmt, u8_align); + } + } + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_max_pooling.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_max_pooling.cpp new file mode 100644 index 000000000..6b2190b0a --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_max_pooling.cpp @@ -0,0 +1,296 @@ +#include "../1880v2_test_util.h" +#include + +typedef bmk1880v2_tiu_max_pooling_param_t param_t; +int random_seed; +static void print_pooling_param(param_t *p) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int on = p->ofmap->shape.n; + int oc = p->ofmap->shape.c; + int oh = p->ofmap->shape.h; + int ow = p->ofmap->shape.w; + + printf(" Pooling parameters:\n"); + printf(" random_seed : %d \n", random_seed); + printf(" ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw); + printf(" opd0_sign = %d\n", p->ifmap->fmt == FMT_I8); + printf(" weight = (%d, %d)\n", p->kh, p->kw); + printf(" padding = (%d, %d, %d, %d)\n", + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right); + printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w); + printf(" ofmap = (%d, %d, %d, %d)\n", on, oc, oh, ow); +} + +static int pooling_ih_ext(param_t *p, int ih) +{ + int pad = p->pad_top + p->pad_bottom; + return ih + pad; +} + +static int pooling_iw_ext(param_t *p, int iw) +{ + int pad = p->pad_left + p->pad_right; + return iw + pad; +} + +static int pooling_oh(param_t *p, int ih) +{ + int ih_ext = pooling_ih_ext(p, ih); + return (ih_ext - p->kh) / p->stride_h + 1; +} + +static int pooling_ow(param_t *p, int iw) +{ + int iw_ext = pooling_iw_ext(p, iw); + return (iw_ext - p->kw) / p->stride_w + 1; +} + +static u16 *alloc_input(param_t *p) +{ + u64 size = tl_shape_size(&p->ifmap->shape); + u16 *data = (u16 *)xmalloc(size*2); + if (!data) + return NULL; + + for (u64 i = 0; i < size; i++) { + float val; + int RAND_MAX2 = RAND_MAX/2; //100 ~ -100 + val = (float)(rand()-RAND_MAX2)*100 / (float)RAND_MAX; + data[i] = convert_fp32_bf16(val); + } + return data; +} + +static u16 *alloc_output(param_t *p) +{ + u64 size = tl_shape_size(&p->ofmap->shape); + return (u16 *)xmalloc(size * 2); +} + +static void free_pooling_param( + bmk_ctx_t *ctx, + param_t *r) +{ + if (r->ifmap) + free_tl(ctx, r->ifmap); + if (r->ofmap) + free_tl(ctx, r->ofmap); +} + +static param_t random_pooling_param(bmk_ctx_t *ctx) +{ + + param_t p; + + memset(&p, 0, sizeof(p)); + +retry: + random_seed = clock(); +// random_seed = 3058538; + srand(random_seed); + +#if 0 + int in = 1; + int ic = 1; + int ih = 6; + int iw = 6; + //int opd0_sign = rand() % 2; + + p.kh = 3; + p.kw = 3; + 
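+  /* added worked example: with these fixed values pooling_oh() yields
+     (6 + 3 + 3 - 3) / 3 + 1 = 4, and pooling_ow() likewise yields 4. */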
p.stride_h = p.kh; + p.stride_w = p.kw; + p.pad_top = 3;//rand() % p.kh; + p.pad_bottom = 3;//rand() % p.kh; + p.pad_left = 3;//rand() % p.kw; + p.pad_right = 3;//rand() % p.kw; + + tl_shape_t ifmap_shape; + ifmap_shape.n = in; + ifmap_shape.c = ic; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + tl_shape_t ofmap_shape; + ofmap_shape.n = in; + ofmap_shape.c = ic; + ofmap_shape.h = pooling_oh(&p, ih); + ofmap_shape.w = pooling_ow(&p, iw); + +#else + int in = rand() % 5 + 1; + int ic = rand() % (3 * BM1880V2_HW_NPU_NUM) + 1; + int ih = rand() % 30 + 3; + int iw = rand() % 30 + 6; + //int opd0_sign = rand() % 2; + + p.kh = rand() % 5 + 1; + p.kw = rand() % 5 + 1; + p.stride_h = rand() % (p.kh) + 1; + p.stride_w = rand() % (p.kw) + 1; + p.pad_top = rand() % p.kh; + p.pad_bottom = rand() % p.kh; + p.pad_left = rand() % p.kw; + p.pad_right = rand() % p.kw; + + tl_shape_t ifmap_shape; + ifmap_shape.n = in; + ifmap_shape.c = ic; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + tl_shape_t ofmap_shape; + ofmap_shape.n = in; + ofmap_shape.c = ic; + ofmap_shape.h = pooling_oh(&p, ih); + ofmap_shape.w = pooling_ow(&p, iw); +#endif +// fmt_t fmt = opd0_sign? FMT_I8: FMT_U8; + p.ofmap = bmk1880v2_lmem_alloc_tensor(ctx, ofmap_shape, FMT_BF16, 1); + p.ifmap = bmk1880v2_lmem_alloc_tensor(ctx, ifmap_shape, FMT_BF16, 1); + + int RAND_MAX2 = RAND_MAX/2; //20 ~ -20 + float ins_val = (float)(rand()-RAND_MAX2)*20 / (float)RAND_MAX; + p.ins_fp = convert_fp32_bf16(ins_val); + + if ((p.kh > pooling_ih_ext(&p, ih)) + || (p.kw > pooling_iw_ext(&p, iw)) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || (p.kh * p.kw == 1) + || !p.ofmap || !p.ifmap) { + printf("retry init_pooling_param\n"); + free_pooling_param(ctx, &p); + goto retry; + } + + return p; +} +static int index_get(int h, int w1, int w2) +{ + return h * w1 + w2; +} + +int native_pooling_max_bf16( + const u16* i_fmap, + u16* o_fmap, + int input_n, int input_c, int input_h, int input_w, + int kh, int kw, + int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, + int stride_h, int stride_w, + int ins_h, int ins_w, + int ins_h_last, int ins_w_last, + u16 ins_fp + ) +{ + if (ins_h != 0 || ins_w != 0 || ins_h_last != 0 || ins_w_last !=0) + return BM_ERR_INVALID_ARGUMENT; + + int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); + int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); + + int output_h = calc_output_hw(h_after, kh, stride_h); + int output_w = calc_output_hw(w_after, kw, stride_w); + + const float max_init = -FLT_MAX;//convert_bf16_fp32(ins_fp); + u16 *i_fmap_pad = NULL; + for (int nc = 0; nc < input_n * input_c; nc++) { + fill_pad_fmap_bf16(i_fmap, &i_fmap_pad, ins_fp, + pad_w_l, pad_w_r, pad_h_t, pad_h_b, + 0, 0, 0, 0, input_h, input_w); + + for (int ph = 0; ph < output_h; ++ph) { + for (int pw = 0; pw < output_w; ++pw) { + int hstart = ph * stride_h; + int wstart = pw * stride_w; + int pool_index = index_get(ph, output_w, pw); + float max = max_init; + for (int h = 0; h < kh; h++) { + for (int w = 0; w < kw; w++) { + int index = index_get((hstart + h), (input_w + pad_w_l + pad_w_r), + (w + wstart)); + float val = convert_bf16_fp32(i_fmap_pad[index]); + max = (val > max)? 
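+          /* running max over the kh x kw window; padded cells were filled
+             with ins_fp by fill_pad_fmap_bf16() above */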
val: max; + } + } + o_fmap[pool_index] = convert_fp32_bf16(max); + } + } + i_fmap += input_w * input_h; + o_fmap += output_w * output_h; + } + free(i_fmap_pad); + + return BM_SUCCESS; +} + + +static void compare_results( + param_t *p, + u16 input[], + u16 output[]) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + + u16 *output_ref = alloc_output(p); + bmerr_t ret = native_pooling_max_bf16( + input, output_ref, in, ic, ih, iw, p->kh, p->kw, + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right, + p->stride_h, p->stride_w, 0, 0, 0, 0, p->ins_fp); + assert(ret == BM_SUCCESS); + int cmp_res = array_cmp_int8( + "Comparing results ...\n", (s8*) output_ref, (s8*)output, + tl_shape_size(&p->ofmap->shape)*2); + if (cmp_res != 0) { + printf("Comparison FAILED!!!\n"); + print_pooling_param(p); + exit(-1); + } + free(output_ref); +} + +static int test_pooling(CVI_RT_HANDLE ctx, bmk_ctx_t *bk_ctx) +{ + param_t param = random_pooling_param(bk_ctx); + //print_pooling_param(¶m); + u16 *input = alloc_input(¶m); + put_bf16_tensor_g2l(&ctx, bk_ctx, param.ifmap, (u16 *)input, FMT_BF16); + bmk1880v2_tiu_max_pooling(bk_ctx, ¶m); + + u16 *output = (u16 *)get_bf16_tensor_l2g(&ctx, bk_ctx, param.ofmap, FMT_BF16); + + compare_results(¶m, input, output); + + free_pooling_param(bk_ctx, ¶m); + free(output); + free(input); + + return 1; +} + +static void test_max_pooling(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx) +{ + int test_finished_num = 0; + for (u64 i = 0; i < 20; i++) + test_finished_num += test_pooling(*ctx, bk_ctx); + printf("Test finished %d\n", test_finished_num); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_max_pooling(&ctx, bk_ctx); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_reciprocal.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_reciprocal.cpp new file mode 100644 index 000000000..f2396891f --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_reciprocal.cpp @@ -0,0 +1,2491 @@ +/** + */ +#include "../1880v2_test_util.h" +#define OUT +#define IN +#include +#include +#include +#include +#include +#include +//#include +//#define DBG + +using namespace std; +//TODO: get from ctx +static u32 channel = 32; // +typename std::enable_if::is_integer, bool>::type + almost_equal(T x, T y, int ulp) +{ + // the machine epsilon has to be scaled to the magnitude of the values used + // and multiplied by the desired precision in ULPs (units in the last place) + return std::abs(x-y) < std::numeric_limits::epsilon() * std::abs(x+y) * ulp + // unless the result is subnormal + || std::abs(x-y) < std::numeric_limits::min(); +} +/** + * pre_data means we test fixed pattern, it should be same sa lut + */ +enum TEST_MODE { + PRE_DATA_COMPARE_FIX = 0, // pre-data + fix compare + GEN_POW_20_DATA_MAX_ERROR, //generate 2^-20 ~ 2^20 value that check epsilon + GEN_POW_20_DATA_MAX_ERROR_U8, //generate 2^-20 ~ 2^20 value that check epsilon, result bf16->u8 + TEST_MODE_MAX, +}; + +static TEST_MODE mode; + +static u16 test_pattern[] = { + 0x0000, + 0x38D2, + 0x3952, + 0x399D, + 0x39D2, + 0x3A03, + 0x3A1D, + 0x3A38, + 0x3A52, + 0x3A6C, + 0x3A83, + 0x3A90, + 0x3A9D, + 0x3AAA, + 0x3AB8, + 0x3AC5, + 0x3AD2, + 0x3ADF, + 0x3AEC, + 0x3AF9, + 0x3B03, + 0x3B0A, + 0x3B10, + 0x3B17, + 0x3B1D, + 0x3B24, + 0x3B2A, + 0x3B31, + 0x3B38, + 0x3B3E, + 0x3B45, + 0x3B4B, + 0x3B52, + 0x3B58, + 0x3B5F, + 0x3B65, + 0x3B6C, + 0x3B72, + 0x3B79, + 0x3B80, + 0x3B83, + 0x3B86, 
+ 0x3B8A, + 0x3B8D, + 0x3B90, + 0x3B93, + 0x3B97, + 0x3B9A, + 0x3B9D, + 0x3BA1, + 0x3BA4, + 0x3BA7, + 0x3BAA, + 0x3BAE, + 0x3BB1, + 0x3BB4, + 0x3BB8, + 0x3BBB, + 0x3BBE, + 0x3BC1, + 0x3BC5, + 0x3BC8, + 0x3BCB, + 0x3BCE, + 0x3BD2, + 0x3BD5, + 0x3BD8, + 0x3BDC, + 0x3BDF, + 0x3BE2, + 0x3BE5, + 0x3BE9, + 0x3BEC, + 0x3BEF, + 0x3BF2, + 0x3BF6, + 0x3BF9, + 0x3BFC, + 0x3C00, + 0x3C01, + 0x3C03, + 0x3C05, + 0x3C06, + 0x3C08, + 0x3C0A, + 0x3C0B, + 0x3C0D, + 0x3C0F, + 0x3C10, + 0x3C12, + 0x3C13, + 0x3C15, + 0x3C17, + 0x3C18, + 0x3C1A, + 0x3C1C, + 0x3C1D, + 0x3C1F, + 0x3C21, + 0x3C22, + 0x3C24, + 0x3C25, + 0x3C27, + 0x3C29, + 0x3C2A, + 0x3C2C, + 0x3C2E, + 0x3C2F, + 0x3C31, + 0x3C33, + 0x3C34, + 0x3C36, + 0x3C38, + 0x3C39, + 0x3C3B, + 0x3C3C, + 0x3C3E, + 0x3C40, + 0x3C41, + 0x3C43, + 0x3C45, + 0x3C46, + 0x3C48, + 0x3C4A, + 0x3C4B, + 0x3C4D, + 0x3C4E, + 0x3C50, + 0x3C52, + 0x3C53, + 0x3C55, + 0x3C57, + 0x3C58, + 0x3C5A, + 0x3C5C, + 0x3C5D, + 0x3C5F, + 0x3C60, + 0x3C62, + 0x3C64, + 0x3C65, + 0x3C67, + 0x3C69, + 0x3C6A, + 0x3C6C, + 0x3C6E, + 0x3C6F, + 0x3C71, + 0x3C72, + 0x3C74, + 0x3C76, + 0x3C77, + 0x3C79, + 0x3C7B, + 0x3C7C, + 0x3C7E, + 0x3C80, + 0x3C81, + 0x3C81, + 0x3C82, + 0x3C83, + 0x3C84, + 0x3C85, + 0x3C86, + 0x3C86, + 0x3C87, + 0x3C88, + 0x3C89, + 0x3C8A, + 0x3C8A, + 0x3C8B, + 0x3C8C, + 0x3C8D, + 0x3C8E, + 0x3C8F, + 0x3C8F, + 0x3C90, + 0x3C91, + 0x3C92, + 0x3C93, + 0x3C93, + 0x3C94, + 0x3C95, + 0x3C96, + 0x3C97, + 0x3C98, + 0x3C98, + 0x3C99, + 0x3C9A, + 0x3C9B, + 0x3C9C, + 0x3C9C, + 0x3C9D, + 0x3C9E, + 0x3C9F, + 0x3CA0, + 0x3CA1, + 0x3CA1, + 0x3CA2, + 0x3CA3, + 0x3CA4, + 0x3CA5, + 0x3CA5, + 0x3CA6, + 0x3CA7, + 0x3CA8, + 0x3CA9, + 0x3CAA, + 0x3CAA, + 0x3CAB, + 0x3CAC, + 0x3CAD, + 0x3CAE, + 0x3CAE, + 0x3CAF, + 0x3CB0, + 0x3CB1, + 0x3CB2, + 0x3CB3, + 0x3CB3, + 0x3CB4, + 0x3CB5, + 0x3CB6, + 0x3CB7, + 0x3CB8, + 0x3CB8, + 0x3CB9, + 0x3CBA, + 0x3CBB, + 0x3CBC, + 0x3CBC, + 0x3CBD, + 0x3CBE, + 0x3CBF, + 0x3CC0, + 0x3CC1, + 0x3CC1, + 0x3CC2, + 0x3CC3, + 0x3CC4, + 0x3CC5, + 0x3CC5, + 0x3CC6, + 0x3CC7, + 0x3CC8, + 0x3CC9, + 0x3CCA, + 0x3CCA, + 0x3CCB, + 0x3CCC, + 0x3CCD, + 0x3CCE, + 0x3CCE, + 0x3CCF, + 0x3CD0, + 0x3CD1, + 0x3CD2, + 0x3CD3, + 0x3CD3, + 0x3CD4, + 0x3CD5, + 0x3CD6, + 0x3CD7, + 0x3CD7, + 0x3CD8, + 0x3CD9, + 0x3CDA, + 0x3CDB, + 0x3CDC, + 0x3CDC, + 0x3CDD, + 0x3CDE, + 0x3CDF, + 0x3CE0, + 0x3CE0, + 0x3CE1, + 0x3CE2, + 0x3CE3, + 0x3CE4, + 0x3CE5, + 0x3CE5, + 0x3CE6, + 0x3CE7, + 0x3CE8, + 0x3CE9, + 0x3CE9, + 0x3CEA, + 0x3CEB, + 0x3CEC, + 0x3CED, + 0x3CEE, + 0x3CEE, + 0x3CEF, + 0x3CF0, + 0x3CF1, + 0x3CF2, + 0x3CF2, + 0x3CF3, + 0x3CF4, + 0x3CF5, + 0x3CF6, + 0x3CF7, + 0x3CF7, + 0x3CF8, + 0x3CF9, + 0x3CFA, + 0x3CFB, + 0x3CFB, + 0x3CFC, + 0x3CFD, + 0x3CFE, + 0x3CFF, + 0x3D00, + 0x3D00, + 0x3D01, + 0x3D01, + 0x3D01, + 0x3D02, + 0x3D02, + 0x3D03, + 0x3D03, + 0x3D03, + 0x3D04, + 0x3D04, + 0x3D05, + 0x3D05, + 0x3D06, + 0x3D06, + 0x3D06, + 0x3D07, + 0x3D07, + 0x3D08, + 0x3D08, + 0x3D08, + 0x3D09, + 0x3D09, + 0x3D0A, + 0x3D0A, + 0x3D0A, + 0x3D0B, + 0x3D0B, + 0x3D0C, + 0x3D0C, + 0x3D0C, + 0x3D0D, + 0x3D0D, + 0x3D0E, + 0x3D0E, + 0x3D0F, + 0x3D0F, + 0x3D0F, + 0x3D10, + 0x3D10, + 0x3D11, + 0x3D11, + 0x3D11, + 0x3D12, + 0x3D12, + 0x3D13, + 0x3D13, + 0x3D13, + 0x3D14, + 0x3D14, + 0x3D15, + 0x3D15, + 0x3D16, + 0x3D16, + 0x3D16, + 0x3D17, + 0x3D17, + 0x3D18, + 0x3D18, + 0x3D18, + 0x3D19, + 0x3D19, + 0x3D1A, + 0x3D1A, + 0x3D1A, + 0x3D1B, + 0x3D1B, + 0x3D1C, + 0x3D1C, + 0x3D1C, + 0x3D1D, + 0x3D1D, + 0x3D1E, + 0x3D1E, + 0x3D1F, + 0x3D1F, + 0x3D1F, + 0x3D20, + 0x3D20, + 0x3D21, + 0x3D21, + 0x3D21, + 0x3D22, + 0x3D22, + 
0x3D23, + 0x3D23, + 0x3D23, + 0x3D24, + 0x3D24, + 0x3D25, + 0x3D25, + 0x3D25, + 0x3D26, + 0x3D26, + 0x3D27, + 0x3D27, + 0x3D28, + 0x3D28, + 0x3D28, + 0x3D29, + 0x3D29, + 0x3D2A, + 0x3D2A, + 0x3D2A, + 0x3D2B, + 0x3D2B, + 0x3D2C, + 0x3D2C, + 0x3D2C, + 0x3D2D, + 0x3D2D, + 0x3D2E, + 0x3D2E, + 0x3D2E, + 0x3D2F, + 0x3D2F, + 0x3D30, + 0x3D30, + 0x3D31, + 0x3D31, + 0x3D31, + 0x3D32, + 0x3D32, + 0x3D33, + 0x3D33, + 0x3D33, + 0x3D34, + 0x3D34, + 0x3D35, + 0x3D35, + 0x3D35, + 0x3D36, + 0x3D36, + 0x3D37, + 0x3D37, + 0x3D38, + 0x3D38, + 0x3D38, + 0x3D39, + 0x3D39, + 0x3D3A, + 0x3D3A, + 0x3D3A, + 0x3D3B, + 0x3D3B, + 0x3D3C, + 0x3D3C, + 0x3D3C, + 0x3D3D, + 0x3D3D, + 0x3D3E, + 0x3D3E, + 0x3D3E, + 0x3D3F, + 0x3D3F, + 0x3D40, + 0x3D40, + 0x3D41, + 0x3D41, + 0x3D41, + 0x3D42, + 0x3D42, + 0x3D43, + 0x3D43, + 0x3D43, + 0x3D44, + 0x3D44, + 0x3D45, + 0x3D45, + 0x3D45, + 0x3D46, + 0x3D46, + 0x3D47, + 0x3D47, + 0x3D47, + 0x3D48, + 0x3D48, + 0x3D49, + 0x3D49, + 0x3D4A, + 0x3D4A, + 0x3D4A, + 0x3D4B, + 0x3D4B, + 0x3D4C, + 0x3D4C, + 0x3D4C, + 0x3D4D, + 0x3D4D, + 0x3D4E, + 0x3D4E, + 0x3D4E, + 0x3D4F, + 0x3D4F, + 0x3D50, + 0x3D50, + 0x3D50, + 0x3D51, + 0x3D51, + 0x3D52, + 0x3D52, + 0x3D53, + 0x3D53, + 0x3D53, + 0x3D54, + 0x3D54, + 0x3D55, + 0x3D55, + 0x3D55, + 0x3D56, + 0x3D56, + 0x3D57, + 0x3D57, + 0x3D57, + 0x3D58, + 0x3D58, + 0x3D59, + 0x3D59, + 0x3D59, + 0x3D5A, + 0x3D5A, + 0x3D5B, + 0x3D5B, + 0x3D5C, + 0x3D5C, + 0x3D5C, + 0x3D5D, + 0x3D5D, + 0x3D5E, + 0x3D5E, + 0x3D5E, + 0x3D5F, + 0x3D5F, + 0x3D60, + 0x3D60, + 0x3D60, + 0x3D61, + 0x3D61, + 0x3D62, + 0x3D62, + 0x3D63, + 0x3D63, + 0x3D63, + 0x3D64, + 0x3D64, + 0x3D65, + 0x3D65, + 0x3D65, + 0x3D66, + 0x3D66, + 0x3D67, + 0x3D67, + 0x3D67, + 0x3D68, + 0x3D68, + 0x3D69, + 0x3D69, + 0x3D69, + 0x3D6A, + 0x3D6A, + 0x3D6B, + 0x3D6B, + 0x3D6C, + 0x3D6C, + 0x3D6C, + 0x3D6D, + 0x3D6D, + 0x3D6E, + 0x3D6E, + 0x3D6E, + 0x3D6F, + 0x3D6F, + 0x3D70, + 0x3D70, + 0x3D70, + 0x3D71, + 0x3D71, + 0x3D72, + 0x3D72, + 0x3D72, + 0x3D73, + 0x3D73, + 0x3D74, + 0x3D74, + 0x3D75, + 0x3D75, + 0x3D75, + 0x3D76, + 0x3D76, + 0x3D77, + 0x3D77, + 0x3D77, + 0x3D78, + 0x3D78, + 0x3D79, + 0x3D79, + 0x3D79, + 0x3D7A, + 0x3D7A, + 0x3D7B, + 0x3D7B, + 0x3D7B, + 0x3D7C, + 0x3D7C, + 0x3D7D, + 0x3D7D, + 0x3D7E, + 0x3D7E, + 0x3D7E, + 0x3D7F, + 0x3D7F, + 0x3D80, + 0x3D80, + 0x3D80, + 0x3D80, + 0x3D81, + 0x3D81, + 0x3D81, + 0x3D81, + 0x3D81, + 0x3D82, + 0x3D82, + 0x3D82, + 0x3D82, + 0x3D82, + 0x3D83, + 0x3D83, + 0x3D83, + 0x3D83, + 0x3D83, + 0x3D84, + 0x3D84, + 0x3D84, + 0x3D84, + 0x3D85, + 0x3D85, + 0x3D85, + 0x3D85, + 0x3D85, + 0x3D86, + 0x3D86, + 0x3D86, + 0x3D86, + 0x3D86, + 0x3D87, + 0x3D87, + 0x3D87, + 0x3D87, + 0x3D87, + 0x3D88, + 0x3D88, + 0x3D88, + 0x3D88, + 0x3D88, + 0x3D89, + 0x3D89, + 0x3D89, + 0x3D89, + 0x3D89, + 0x3D8A, + 0x3D8A, + 0x3D8A, + 0x3D8A, + 0x3D8A, + 0x3D8B, + 0x3D8B, + 0x3D8B, + 0x3D8B, + 0x3D8B, + 0x3D8C, + 0x3D8C, + 0x3D8C, + 0x3D8C, + 0x3D8C, + 0x3D8D, + 0x3D8D, + 0x3D8D, + 0x3D8D, + 0x3D8E, + 0x3D8E, + 0x3D8E, + 0x3D8E, + 0x3D8E, + 0x3D8F, + 0x3D8F, + 0x3D8F, + 0x3D8F, + 0x3D8F, + 0x3D90, + 0x3D90, + 0x3D90, + 0x3D90, + 0x3D90, + 0x3D91, + 0x3D91, + 0x3D91, + 0x3D91, + 0x3D91, + 0x3D92, + 0x3D92, + 0x3D92, + 0x3D92, + 0x3D92, + 0x3D93, + 0x3D93, + 0x3D93, + 0x3D93, + 0x3D93, + 0x3D94, + 0x3D94, + 0x3D94, + 0x3D94, + 0x3D94, + 0x3D95, + 0x3D95, + 0x3D95, + 0x3D95, + 0x3D96, + 0x3D96, + 0x3D96, + 0x3D96, + 0x3D96, + 0x3D97, + 0x3D97, + 0x3D97, + 0x3D97, + 0x3D97, + 0x3D98, + 0x3D98, + 0x3D98, + 0x3D98, + 0x3D98, + 0x3D99, + 0x3D99, + 0x3D99, + 0x3D99, + 0x3D99, + 0x3D9A, + 0x3D9A, + 
0x3D9A, + 0x3D9A, + 0x3D9A, + 0x3D9B, + 0x3D9B, + 0x3D9B, + 0x3D9B, + 0x3D9B, + 0x3D9C, + 0x3D9C, + 0x3D9C, + 0x3D9C, + 0x3D9C, + 0x3D9D, + 0x3D9D, + 0x3D9D, + 0x3D9D, + 0x3D9D, + 0x3D9E, + 0x3D9E, + 0x3D9E, + 0x3D9E, + 0x3D9F, + 0x3D9F, + 0x3D9F, + 0x3D9F, + 0x3D9F, + 0x3DA0, + 0x3DA0, + 0x3DA0, + 0x3DA0, + 0x3DA0, + 0x3DA1, + 0x3DA1, + 0x3DA1, + 0x3DA1, + 0x3DA1, + 0x3DA2, + 0x3DA2, + 0x3DA2, + 0x3DA2, + 0x3DA2, + 0x3DA3, + 0x3DA3, + 0x3DA3, + 0x3DA3, + 0x3DA3, + 0x3DA4, + 0x3DA4, + 0x3DA4, + 0x3DA4, + 0x3DA4, + 0x3DA5, + 0x3DA5, + 0x3DA5, + 0x3DA5, + 0x3DA5, + 0x3DA6, + 0x3DA6, + 0x3DA6, + 0x3DA6, + 0x3DA7, + 0x3DA7, + 0x3DA7, + 0x3DA7, + 0x3DA7, + 0x3DA8, + 0x3DA8, + 0x3DA8, + 0x3DA8, + 0x3DA8, + 0x3DA9, + 0x3DA9, + 0x3DA9, + 0x3DA9, + 0x3DA9, + 0x3DAA, + 0x3DAA, + 0x3DAA, + 0x3DAA, + 0x3DAA, + 0x3DAB, + 0x3DAB, + 0x3DAB, + 0x3DAB, + 0x3DAB, + 0x3DAC, + 0x3DAC, + 0x3DAC, + 0x3DAC, + 0x3DAC, + 0x3DAD, + 0x3DAD, + 0x3DAD, + 0x3DAD, + 0x3DAD, + 0x3DAE, + 0x3DAE, + 0x3DAE, + 0x3DAE, + 0x3DAE, + 0x3DAF, + 0x3DAF, + 0x3DAF, + 0x3DAF, + 0x3DB0, + 0x3DB0, + 0x3DB0, + 0x3DB0, + 0x3DB0, + 0x3DB1, + 0x3DB1, + 0x3DB1, + 0x3DB1, + 0x3DB1, + 0x3DB2, + 0x3DB2, + 0x3DB2, + 0x3DB2, + 0x3DB2, + 0x3DB3, + 0x3DB3, + 0x3DB3, + 0x3DB3, + 0x3DB3, + 0x3DB4, + 0x3DB4, + 0x3DB4, + 0x3DB4, + 0x3DB4, + 0x3DB5, + 0x3DB5, + 0x3DB5, + 0x3DB5, + 0x3DB5, + 0x3DB6, + 0x3DB6, + 0x3DB6, + 0x3DB6, + 0x3DB6, + 0x3DB7, + 0x3DB7, + 0x3DB7, + 0x3DB7, + 0x3DB8, + 0x3DB8, + 0x3DB8, + 0x3DB8, + 0x3DB8, + 0x3DB9, + 0x3DB9, + 0x3DB9, + 0x3DB9, + 0x3DB9, + 0x3DBA, + 0x3DBA, + 0x3DBA, + 0x3DBA, + 0x3DBA, + 0x3DBB, + 0x3DBB, + 0x3DBB, + 0x3DBB, + 0x3DBB, + 0x3DBC, + 0x3DBC, + 0x3DBC, + 0x3DBC, + 0x3DBC, + 0x3DBD, + 0x3DBD, + 0x3DBD, + 0x3DBD, + 0x3DBD, + 0x3DBE, + 0x3DBE, + 0x3DBE, + 0x3DBE, + 0x3DBE, + 0x3DBF, + 0x3DBF, + 0x3DBF, + 0x3DBF, + 0x3DBF, + 0x3DC0, + 0x3DC0, + 0x3DC0, + 0x3DC0, + 0x3DC1, + 0x3DC1, + 0x3DC1, + 0x3DC1, + 0x3DC1, + 0x3DC2, + 0x3DC2, + 0x3DC2, + 0x3DC2, + 0x3DC2, + 0x3DC3, + 0x3DC3, + 0x3DC3, + 0x3DC3, + 0x3DC3, + 0x3DC4, + 0x3DC4, + 0x3DC4, + 0x3DC4, + 0x3DC4, + 0x3DC5, + 0x3DC5, + 0x3DC5, + 0x3DC5, + 0x3DC5, + 0x3DC6, + 0x3DC6, + 0x3DC6, + 0x3DC6, + 0x3DC6, + 0x3DC7, + 0x3DC7, + 0x3DC7, + 0x3DC7, + 0x3DC7, + 0x3DC8, + 0x3DC8, + 0x3DC8, + 0x3DC8, + 0x3DC8, + 0x3DC9, + 0x3DC9, + 0x3DC9, + 0x3DC9, + 0x3DCA, + 0x3DCA, + 0x3DCA, + 0x3DCA, + 0x3DCA, + 0x3DCB, + 0x3DCB, + 0x3DCB, + 0x3DCB, + 0x3DCB, + 0x3DCC, + 0x3DCC, + 0x3DCC, + 0x3DCC, + 0x3DCC, + 0x3DCD, + 0x3DCE, + 0x3DCF, + 0x3DD0, + 0x3DD1, + 0x3DD2, + 0x3DD3, + 0x3DD4, + 0x3DD5, + 0x3DD6, + 0x3DD7, + 0x3DD8, + 0x3DD9, + 0x3DDA, + 0x3DDB, + 0x3DDC, + 0x3DDD, + 0x3DDE, + 0x3DDF, + 0x3DE0, + 0x3DE1, + 0x3DE2, + 0x3DE3, + 0x3DE4, + 0x3DE5, +}; + +static u16 sigmode_golden_bf16[] = { + 0x0, + 0x461c, + 0x459c, + 0x4551, + 0x451c, + 0x44fa, + 0x44d1, + 0x44b2, + 0x449c, + 0x448b, + 0x447a, + 0x4464, + 0x4451, + 0x4441, + 0x4432, + 0x4426, + 0x441c, + 0x4413, + 0x440b, + 0x4404, + 0x43fa, + 0x43ed, + 0x43e4, + 0x43d9, + 0x43d1, + 0x43c8, + 0x43c1, + 0x43b9, + 0x43b2, + 0x43ac, + 0x43a6, + 0x43a1, + 0x439c, + 0x4398, + 0x4393, + 0x438f, + 0x438b, + 0x4387, + 0x4384, + 0x4380, + 0x437a, + 0x4375, + 0x436d, + 0x4368, + 0x4364, + 0x435f, + 0x4359, + 0x4355, + 0x4351, + 0x434c, + 0x4348, + 0x4344, + 0x4341, + 0x433c, + 0x4339, + 0x4336, + 0x4332, + 0x432f, + 0x432c, + 0x432a, + 0x4326, + 0x4324, + 0x4321, + 0x431f, + 0x431c, + 0x431a, + 0x4318, + 0x4315, + 0x4313, + 0x4311, + 0x430f, + 0x430d, + 0x430b, + 0x4309, + 0x4307, + 0x4305, + 0x4304, + 0x4302, + 0x4300, + 
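+  /* added note: despite the name (apparently copied from the sigmoid
+     test), these golden values appear to be the bf16 reciprocals of
+     test_pattern[], e.g. input 0x38D2 (~1.0e-4) maps to 0x461C (~9984). */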
0x42fe, + 0x42fa, + 0x42f6, + 0x42f5, + 0x42f1, + 0x42ed, + 0x42ec, + 0x42e8, + 0x42e5, + 0x42e4, + 0x42e0, + 0x42df, + 0x42dc, + 0x42d9, + 0x42d8, + 0x42d5, + 0x42d2, + 0x42d1, + 0x42ce, + 0x42cc, + 0x42ca, + 0x42c8, + 0x42c7, + 0x42c4, + 0x42c2, + 0x42c1, + 0x42bf, + 0x42bc, + 0x42bb, + 0x42b9, + 0x42b7, + 0x42b6, + 0x42b4, + 0x42b2, + 0x42b1, + 0x42af, + 0x42ae, + 0x42ac, + 0x42ab, + 0x42aa, + 0x42a8, + 0x42a6, + 0x42a5, + 0x42a4, + 0x42a2, + 0x42a1, + 0x42a0, + 0x429f, + 0x429e, + 0x429c, + 0x429b, + 0x429a, + 0x4298, + 0x4298, + 0x4296, + 0x4295, + 0x4294, + 0x4293, + 0x4292, + 0x4291, + 0x4290, + 0x428f, + 0x428e, + 0x428d, + 0x428c, + 0x428b, + 0x428a, + 0x4289, + 0x4288, + 0x4287, + 0x4286, + 0x4285, + 0x4285, + 0x4284, + 0x4283, + 0x4282, + 0x4281, + 0x4280, + 0x427e, + 0x427e, + 0x427c, + 0x427a, + 0x4278, + 0x4276, + 0x4275, + 0x4275, + 0x4273, + 0x4271, + 0x426f, + 0x426d, + 0x426d, + 0x426c, + 0x426a, + 0x4268, + 0x4267, + 0x4265, + 0x4265, + 0x4264, + 0x4262, + 0x4260, + 0x425f, + 0x425f, + 0x425d, + 0x425c, + 0x425a, + 0x4259, + 0x4258, + 0x4258, + 0x4256, + 0x4255, + 0x4253, + 0x4252, + 0x4252, + 0x4251, + 0x424f, + 0x424e, + 0x424d, + 0x424c, + 0x424c, + 0x424a, + 0x4249, + 0x4248, + 0x4247, + 0x4247, + 0x4245, + 0x4244, + 0x4243, + 0x4242, + 0x4241, + 0x4241, + 0x4240, + 0x423f, + 0x423d, + 0x423c, + 0x423c, + 0x423b, + 0x423a, + 0x4239, + 0x4238, + 0x4237, + 0x4237, + 0x4236, + 0x4235, + 0x4234, + 0x4233, + 0x4232, + 0x4232, + 0x4231, + 0x4230, + 0x422f, + 0x422e, + 0x422e, + 0x422d, + 0x422c, + 0x422c, + 0x422b, + 0x422a, + 0x422a, + 0x4229, + 0x4228, + 0x4227, + 0x4226, + 0x4226, + 0x4225, + 0x4225, + 0x4224, + 0x4223, + 0x4222, + 0x4222, + 0x4221, + 0x4221, + 0x4220, + 0x421f, + 0x421f, + 0x421e, + 0x421e, + 0x421d, + 0x421c, + 0x421b, + 0x421b, + 0x421b, + 0x421a, + 0x4219, + 0x4218, + 0x4218, + 0x4218, + 0x4217, + 0x4216, + 0x4216, + 0x4215, + 0x4215, + 0x4214, + 0x4214, + 0x4213, + 0x4212, + 0x4212, + 0x4212, + 0x4211, + 0x4210, + 0x4210, + 0x420f, + 0x420f, + 0x420e, + 0x420e, + 0x420d, + 0x420d, + 0x420d, + 0x420c, + 0x420b, + 0x420b, + 0x420a, + 0x420a, + 0x420a, + 0x4209, + 0x4209, + 0x4208, + 0x4207, + 0x4207, + 0x4207, + 0x4206, + 0x4206, + 0x4205, + 0x4205, + 0x4205, + 0x4204, + 0x4204, + 0x4203, + 0x4203, + 0x4203, + 0x4202, + 0x4202, + 0x4201, + 0x4201, + 0x4200, + 0x4200, + 0x41fe, + 0x41fe, + 0x41fe, + 0x41fc, + 0x41fc, + 0x41fa, + 0x41fa, + 0x41fa, + 0x41f8, + 0x41f8, + 0x41f6, + 0x41f6, + 0x41f5, + 0x41f5, + 0x41f5, + 0x41f3, + 0x41f3, + 0x41f1, + 0x41f1, + 0x41f1, + 0x41ef, + 0x41ef, + 0x41ed, + 0x41ed, + 0x41ed, + 0x41ec, + 0x41ec, + 0x41ea, + 0x41ea, + 0x41ea, + 0x41e8, + 0x41e8, + 0x41e7, + 0x41e7, + 0x41e5, + 0x41e5, + 0x41e5, + 0x41e4, + 0x41e4, + 0x41e2, + 0x41e2, + 0x41e2, + 0x41e0, + 0x41e0, + 0x41df, + 0x41df, + 0x41df, + 0x41dd, + 0x41dd, + 0x41dc, + 0x41dc, + 0x41da, + 0x41da, + 0x41da, + 0x41d9, + 0x41d9, + 0x41d8, + 0x41d8, + 0x41d8, + 0x41d6, + 0x41d6, + 0x41d5, + 0x41d5, + 0x41d5, + 0x41d3, + 0x41d3, + 0x41d2, + 0x41d2, + 0x41d2, + 0x41d1, + 0x41d1, + 0x41cf, + 0x41cf, + 0x41ce, + 0x41ce, + 0x41ce, + 0x41cd, + 0x41cd, + 0x41cc, + 0x41cc, + 0x41cc, + 0x41ca, + 0x41ca, + 0x41c9, + 0x41c9, + 0x41c9, + 0x41c8, + 0x41c8, + 0x41c7, + 0x41c7, + 0x41c7, + 0x41c5, + 0x41c5, + 0x41c4, + 0x41c4, + 0x41c3, + 0x41c3, + 0x41c3, + 0x41c2, + 0x41c2, + 0x41c1, + 0x41c1, + 0x41c1, + 0x41c0, + 0x41c0, + 0x41bf, + 0x41bf, + 0x41bf, + 0x41bd, + 0x41bd, + 0x41bc, + 0x41bc, + 0x41bc, + 0x41bb, + 0x41bb, + 0x41ba, + 0x41ba, + 0x41b9, + 0x41b9, + 0x41b9, + 
0x41b8, + 0x41b8, + 0x41b7, + 0x41b7, + 0x41b7, + 0x41b6, + 0x41b6, + 0x41b5, + 0x41b5, + 0x41b5, + 0x41b4, + 0x41b4, + 0x41b3, + 0x41b3, + 0x41b2, + 0x41b2, + 0x41b2, + 0x41b1, + 0x41b1, + 0x41b0, + 0x41b0, + 0x41b0, + 0x41af, + 0x41af, + 0x41ae, + 0x41ae, + 0x41ae, + 0x41ad, + 0x41ad, + 0x41ac, + 0x41ac, + 0x41ac, + 0x41ac, + 0x41ac, + 0x41ab, + 0x41ab, + 0x41aa, + 0x41aa, + 0x41aa, + 0x41a9, + 0x41a9, + 0x41a8, + 0x41a8, + 0x41a8, + 0x41a7, + 0x41a7, + 0x41a6, + 0x41a6, + 0x41a6, + 0x41a5, + 0x41a5, + 0x41a5, + 0x41a5, + 0x41a5, + 0x41a4, + 0x41a4, + 0x41a3, + 0x41a3, + 0x41a2, + 0x41a2, + 0x41a2, + 0x41a1, + 0x41a1, + 0x41a1, + 0x41a1, + 0x41a1, + 0x41a0, + 0x41a0, + 0x419f, + 0x419f, + 0x419f, + 0x419e, + 0x419e, + 0x419e, + 0x419e, + 0x419e, + 0x419d, + 0x419d, + 0x419c, + 0x419c, + 0x419b, + 0x419b, + 0x419b, + 0x419b, + 0x419b, + 0x419a, + 0x419a, + 0x419a, + 0x4199, + 0x4199, + 0x4198, + 0x4198, + 0x4198, + 0x4198, + 0x4198, + 0x4197, + 0x4197, + 0x4197, + 0x4196, + 0x4196, + 0x4196, + 0x4196, + 0x4195, + 0x4195, + 0x4195, + 0x4194, + 0x4194, + 0x4194, + 0x4194, + 0x4194, + 0x4193, + 0x4193, + 0x4192, + 0x4192, + 0x4192, + 0x4192, + 0x4192, + 0x4191, + 0x4191, + 0x4190, + 0x4190, + 0x4190, + 0x4190, + 0x4190, + 0x418f, + 0x418f, + 0x418f, + 0x418e, + 0x418e, + 0x418e, + 0x418e, + 0x418e, + 0x418d, + 0x418d, + 0x418d, + 0x418d, + 0x418d, + 0x418c, + 0x418c, + 0x418b, + 0x418b, + 0x418b, + 0x418b, + 0x418b, + 0x418a, + 0x418a, + 0x418a, + 0x418a, + 0x418a, + 0x4189, + 0x4189, + 0x4189, + 0x4189, + 0x4189, + 0x4188, + 0x4188, + 0x4187, + 0x4187, + 0x4187, + 0x4187, + 0x4187, + 0x4186, + 0x4186, + 0x4186, + 0x4186, + 0x4186, + 0x4185, + 0x4185, + 0x4185, + 0x4185, + 0x4185, + 0x4184, + 0x4184, + 0x4184, + 0x4184, + 0x4184, + 0x4183, + 0x4183, + 0x4183, + 0x4183, + 0x4183, + 0x4182, + 0x4182, + 0x4182, + 0x4182, + 0x4181, + 0x4181, + 0x4181, + 0x4181, + 0x4181, + 0x4180, + 0x4180, + 0x4180, + 0x4180, + 0x417e, + 0x417e, + 0x417e, + 0x417e, + 0x417e, + 0x417c, + 0x417c, + 0x417c, + 0x417c, + 0x417c, + 0x417a, + 0x417a, + 0x417a, + 0x417a, + 0x417a, + 0x4178, + 0x4178, + 0x4178, + 0x4178, + 0x4176, + 0x4176, + 0x4176, + 0x4176, + 0x4176, + 0x4175, + 0x4175, + 0x4175, + 0x4175, + 0x4175, + 0x4173, + 0x4173, + 0x4173, + 0x4173, + 0x4173, + 0x4171, + 0x4171, + 0x4171, + 0x4171, + 0x4171, + 0x416f, + 0x416f, + 0x416f, + 0x416f, + 0x416f, + 0x416d, + 0x416d, + 0x416d, + 0x416d, + 0x416d, + 0x416c, + 0x416c, + 0x416c, + 0x416c, + 0x416c, + 0x416a, + 0x416a, + 0x416a, + 0x416a, + 0x416a, + 0x4168, + 0x4168, + 0x4168, + 0x4168, + 0x4167, + 0x4167, + 0x4167, + 0x4167, + 0x4167, + 0x4165, + 0x4165, + 0x4165, + 0x4165, + 0x4165, + 0x4164, + 0x4164, + 0x4164, + 0x4164, + 0x4164, + 0x4162, + 0x4162, + 0x4162, + 0x4162, + 0x4162, + 0x4160, + 0x4160, + 0x4160, + 0x4160, + 0x4160, + 0x415f, + 0x415f, + 0x415f, + 0x415f, + 0x415f, + 0x415d, + 0x415d, + 0x415d, + 0x415d, + 0x415d, + 0x415c, + 0x415c, + 0x415c, + 0x415c, + 0x415a, + 0x415a, + 0x415a, + 0x415a, + 0x415a, + 0x4159, + 0x4159, + 0x4159, + 0x4159, + 0x4159, + 0x4158, + 0x4158, + 0x4158, + 0x4158, + 0x4158, + 0x4156, + 0x4156, + 0x4156, + 0x4156, + 0x4156, + 0x4155, + 0x4155, + 0x4155, + 0x4155, + 0x4155, + 0x4153, + 0x4153, + 0x4153, + 0x4153, + 0x4153, + 0x4152, + 0x4152, + 0x4152, + 0x4152, + 0x4152, + 0x4151, + 0x4151, + 0x4151, + 0x4151, + 0x4151, + 0x414f, + 0x414f, + 0x414f, + 0x414f, + 0x414e, + 0x414e, + 0x414e, + 0x414e, + 0x414e, + 0x414d, + 0x414d, + 0x414d, + 0x414d, + 0x414d, + 0x414c, + 0x414c, + 0x414c, + 0x414c, + 0x414c, + 
0x414a, + 0x414a, + 0x414a, + 0x414a, + 0x414a, + 0x4149, + 0x4149, + 0x4149, + 0x4149, + 0x4149, + 0x4148, + 0x4148, + 0x4148, + 0x4148, + 0x4148, + 0x4147, + 0x4147, + 0x4147, + 0x4147, + 0x4147, + 0x4145, + 0x4145, + 0x4145, + 0x4145, + 0x4144, + 0x4144, + 0x4144, + 0x4144, + 0x4144, + 0x4143, + 0x4143, + 0x4143, + 0x4143, + 0x4143, + 0x4142, + 0x4142, + 0x4142, + 0x4142, + 0x4142, + 0x4141, + 0x4141, + 0x4141, + 0x4141, + 0x4141, + 0x4140, + 0x4140, + 0x4140, + 0x4140, + 0x4140, + 0x413f, + 0x413f, + 0x413f, + 0x413f, + 0x413f, + 0x413d, + 0x413d, + 0x413d, + 0x413d, + 0x413d, + 0x413c, + 0x413c, + 0x413c, + 0x413c, + 0x413c, + 0x413b, + 0x413b, + 0x413b, + 0x413b, + 0x413a, + 0x413a, + 0x413a, + 0x413a, + 0x413a, + 0x4139, + 0x4139, + 0x4139, + 0x4139, + 0x4139, + 0x4138, + 0x4138, + 0x4138, + 0x4138, + 0x4138, + 0x4137, + 0x4137, + 0x4137, + 0x4137, + 0x4137, + 0x4136, + 0x4136, + 0x4136, + 0x4136, + 0x4136, + 0x4135, + 0x4135, + 0x4135, + 0x4135, + 0x4135, + 0x4134, + 0x4134, + 0x4134, + 0x4134, + 0x4134, + 0x4133, + 0x4133, + 0x4133, + 0x4133, + 0x4132, + 0x4132, + 0x4132, + 0x4132, + 0x4132, + 0x4131, + 0x4131, + 0x4131, + 0x4131, + 0x4131, + 0x4130, + 0x4130, + 0x4130, + 0x4130, + 0x4130, + 0x412f, + 0x412f, + 0x412f, + 0x412f, + 0x412f, + 0x412e, + 0x412e, + 0x412e, + 0x412e, + 0x412e, + 0x412d, + 0x412d, + 0x412d, + 0x412d, + 0x412d, + 0x412c, + 0x412c, + 0x412c, + 0x412c, + 0x412c, + 0x412c, + 0x412c, + 0x412c, + 0x412c, + 0x412c, + 0x412b, + 0x412b, + 0x412b, + 0x412b, + 0x412a, + 0x412a, + 0x412a, + 0x412a, + 0x412a, + 0x4129, + 0x4129, + 0x4129, + 0x4129, + 0x4129, + 0x4128, + 0x4128, + 0x4128, + 0x4128, + 0x4128, + 0x4127, + 0x4127, + 0x4127, + 0x4127, + 0x4127, + 0x4126, + 0x4126, + 0x4126, + 0x4126, + 0x4126, + 0x4125, + 0x4125, + 0x4125, + 0x4125, + 0x4125, + 0x4125, + 0x4125, + 0x4125, + 0x4125, + 0x4125, + 0x4124, + 0x4124, + 0x4124, + 0x4124, + 0x4124, + 0x4123, + 0x4123, + 0x4123, + 0x4123, + 0x4122, + 0x4122, + 0x4122, + 0x4122, + 0x4122, + 0x4121, + 0x4121, + 0x4121, + 0x4121, + 0x4121, + 0x4121, + 0x4121, + 0x4121, + 0x4121, + 0x4121, + 0x4120, + 0x411f, + 0x411e, + 0x411e, + 0x411d, + 0x411c, + 0x411b, + 0x411b, + 0x411a, + 0x4119, + 0x4118, + 0x4118, + 0x4117, + 0x4116, + 0x4116, + 0x4115, + 0x4114, + 0x4114, + 0x4113, + 0x4112, + 0x4112, + 0x4111, + 0x4110, + 0x4110, + 0x410f, +};
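+
+// The reciprocal LUT splits the computation across two per-lane tables: a bf16
+// input decomposes as x = m * 2^e with m in [1, 2), so 1/x = (1/m) * 2^-e.
+// gen_reciprocal() below fills the per-exponent 2^-e entries (odd exponents
+// are rounded down to the even one below; the leftover factor of two looks to
+// be covered by the 1/(2m) half of the table gen_reciprocal_mantissa() builds).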
+
+static void gen_reciprocal(u16 *table_data, u64 table_size) {
+  u32 half = table_size / channel / 2;
+  assert(half == 128);
+  assert(table_data);
+
+  int exp_start = -62;
+  u64 idx = 0;
+  double s;
+
+  // 0, exp from 0 -62 -61 .. 62 63
+  for (int i = 0; i < half; i++) {
+    int shift = (exp_start + i);
+    bool is_odd = (shift % 2);
+    float exp = shift;
+    if (is_odd) {
+      // round odd exponents down to the even one below
+      exp = exp - 1;
+    }
+
+    double s = _gen_reciprocal(2, exp);
+    table_data[idx] = convert_fp32_bf16(s);
+#ifdef DBG
+    printf("t [%" PRIu64 "] is %f [idx:%f][2^%f] bf %x\n", idx,
+           convert_bf16_fp32(table_data[idx]),
+           float(exp_start + i), -1 * exp,
+           table_data[idx]);
+#endif
+    idx++;
+  }
+
+  s = _gen_reciprocal(2, -0);
+  table_data[idx] = convert_fp32_bf16(s);
+#ifdef DBG
+  printf("t [%" PRIu64 "] is %f[%d] bf %x\n", idx, convert_bf16_fp32(table_data[idx]), 0, table_data[idx]);
+#endif
+  idx++;
+
+  for (int i = 1; i < half; i++) {
+    int shift = (exp_start + i);
+    bool is_odd = (shift % 2);
+    float exp = shift;
+    if (is_odd) {
+      // round odd exponents down, as above
+      exp = exp - 1;
+    }
+
+    double s = _gen_reciprocal(-2, exp);
+    table_data[idx] = convert_fp32_bf16(s);
+#ifdef DBG
+    printf("t [%" PRIu64 "] is %f(%e - %.8lf)[(-2)^%f] bf %x\n", idx, convert_bf16_fp32(table_data[idx]), s, s, exp, table_data[idx]);
+#endif
+    idx++;
+  }
+
+  // idx = 255 don't care
+  //s = _gen_reciprocal(2, 0);
+  //table_data[idx] = convert_fp32_bf16(s);
+  //printf("t [%" PRIu64 "] is %f[%d]\n", idx, convert_bf16_fp32(table_data[idx]), 0);
+  //idx++;
+
+  // duplicate channel #1 to #31
+  //TODO: tensor copy
+  for (u32 i = 1; i < channel; i++) {
+    memcpy(&table_data[i * table_hw], &table_data[0], sizeof(u16) * table_hw);
+  }
+}
+
+static void gen_reciprocal_mantissa(u16 IN *table_data, u16* OUT table_mantissa, u64 table_size) {
+
+  u32 half = table_size / channel / 2;
+  assert(half == 128);
+  assert(table_data);
+
+  int idx = 0;
+  double d;
+  for (u32 i = 0; i < half; i++) {
+    d = 1 + i * 1 / 128.0;
+    d = (double) pow(d, -1);
+    table_mantissa[128+idx] = convert_fp32_bf16(d);
+
+    //13=2^3x1.625=(2^2)x(2^1x1.625)
+    d = 2 * (1 + i * 1 / 128.0);
+    d = (double) pow(d, -1);
+    table_mantissa[idx] = convert_fp32_bf16(d);
+    idx++;
+  }
+
+#ifdef DBG
+  for (u32 i = 0; i < 2 * half; i++) {
+    printf("mantissa [%u] is %lf, 0x%x\n", i, convert_bf16_fp32(table_mantissa[i]),
+           table_mantissa[i]);
+  }
+#endif /* ifdef DBG */
+
+  // duplicate channel #1 to #31
+  //TODO: tensor copy
+  for (u64 i = 1; i < channel; i++) {
+    memcpy(&table_mantissa[table_hw * i], &table_mantissa[0], sizeof(u16) * table_hw);
+  }
+}
+
+static bool verify(u16 *ofmap_data, u16 *ref_data, u16* ifmap, u64 ifmap_size) {
+  u64 size = ifmap_size;
+
+  for (u64 i = 0; i < size; i++) {
+    bool is_close;
+    u16 ref = ref_data[i];
+    u16 ofmap_data_bf16;
+    float ref_f;
+    float ofmap_data_f;
+    u32 shift;
+
+    if (mode == GEN_POW_20_DATA_MAX_ERROR_U8) {
+      shift = (i%2)*8;
+      ofmap_data_bf16 = (u16)ofmap_data[i/2];
+      ofmap_data_f = (float)(ofmap_data[i/2] >> shift);
+      ref_f = (float)(ref);
+
+      is_close = ((u8)(ofmap_data[i/2] >> shift)) == (u8)ref;
+
+      //printf("[%" PRIu64 "] of is %x ref is %x\n", i, (u8)(ofmap_data[i/2] >> shift), (u8)ref);
+    }
+    else {
+      ref_f = convert_bf16_fp32(ref);
+      ofmap_data_f = convert_bf16_fp32(ofmap_data[i]);
+      ofmap_data_bf16 = ofmap_data[i];
+
+      if (mode == PRE_DATA_COMPARE_FIX) {
+        is_close = ofmap_data[i] == ref;
+      }
+      else {
+        //is_close = almost_equal(ref_f, ofmap_data_f, 1);
+        is_close = fabs(ref_f-ofmap_data_f) < 0.001;
+      }
+    }
+
+#if 0
+    if (i == 0) {
+      fprintf(stderr,
+              "input, ofmap, ref, diff, diff / ref_f\n");
+    }
+
+    fprintf(stderr,
+            "%.16f, %f, %lf, %lf, %lf\n",
+            convert_bf16_fp32(ifmap[i]),
+            ofmap_data_f, ref_f, fabs(ref_f - ofmap_data_f), fabs(ref_f - ofmap_data_f) / ref_f);
+    //if (ofmap_data[i] != ref && fabs(ref_f-ofmap_data_f) > 0.07)
+    //if (ofmap_data[i] != ref && AlmostEqual2sComplement(ref_f, ofmap_data_f, 1))
+    //if (ofmap_data[i] != ref && AlmostEqual(ref_f, ofmap_data_f, FLT_EPSILON))
+#endif
+    if (!is_close) {
+      fprintf(stderr,
+              "comparing failed at ofmap_data[%" PRIu64 "](input:%e), got %x, exp %x, fp32: got %e exp %e\n",
+              i, convert_bf16_fp32(ifmap[i]),
+              ofmap_data_bf16, ref, ofmap_data_f, ref_f);
+      exit(-1);
+    }
+  }
+
+  return true;
+}
+
+static void test_tl_int8_lut_bf16(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk)
+{
+  // TODO: check more shape / align
+  tl_shape_t ifmap_shape;
+  if (mode == PRE_DATA_COMPARE_FIX) {
+    ifmap_shape = {1, channel, 4, 8};
+  }
+  else {
+    ifmap_shape = {1, channel, 16, 16};
+  }
+
+  tl_shape_t table_shape = {1, channel, table_h, table_w}; // hard code
for hw, hw:32x8
+  tl_shape_t ofmap_shape = ifmap_shape;
+
+  u64 ifmap_size = tl_shape_size(&ifmap_shape);
+  u64 table_size = tl_shape_size(&table_shape);
+  u64 ofmap_size = tl_shape_size(&ofmap_shape);
+
+  fmt_t fmt = FMT_BF16;
+
+  int data_type_size = bytesize_of_fmt(fmt);
+  u64 ifmap_bytesize = ifmap_size * data_type_size;
+  u64 table_bytesize = table_size * data_type_size;
+  u64 ofmap_bytesize = ofmap_size * data_type_size;
+
+  // hw ONLY support index in int8
+  u16 *ifmap = (u16 *)xmalloc(ifmap_bytesize);
+  memset(ifmap, 0x00, ifmap_bytesize);
+
+  u16 *ifmap_mantissa = (u16 *)xmalloc(ifmap_bytesize);
+  memset(ifmap_mantissa, 0x00, ifmap_bytesize);
+
+  if (mode == PRE_DATA_COMPARE_FIX) {
+    memcpy(ifmap, &test_pattern, sizeof(test_pattern));
+  }
+  else {
+    srand (static_cast<unsigned>(time(0)));
+    std::random_device rd;
+    std::mt19937 e2(rd());
+    float LO = pow(2, -20);
+    float HI = pow(2, 20);
+    //std::uniform_real_distribution<> dist(pow(2,-62), pow(2,63));
+    for (u64 i = 0; i < ifmap_size; i++) {
+      //float r3 = dist(e2);
+      float r3 = LO + static_cast<float>(rand()) /( static_cast<float>(RAND_MAX/(HI-LO)));
+      ifmap[i] = convert_fp32_bf16(r3);
+    }
+  }
+
+#ifdef DBG
+  for (u64 i = 0; i < ifmap_size; i++) {
+    printf("source if[%" PRIu64 "] bf16 %f 0x%x, log2f is %f\n", i, convert_bf16_fp32(ifmap[i]), ifmap[i], floor(log2((convert_bf16_fp32(ifmap[i])))));
+  }
+#endif /* ifdef DBG */
+
+  u16 *table_data = (u16 *)xmalloc(table_bytesize);
+  gen_reciprocal (table_data, table_size);
+
+  u16 *table_data_mantissa = (u16 *)xmalloc(table_bytesize);
+  gen_reciprocal_mantissa(table_data, table_data_mantissa, table_size);
+
+  u16 *ref_data = (u16 *)xmalloc(ofmap_bytesize);
+  tl_lut_ref(ref_data, ifmap, ifmap_shape);
+
+  tl_t *tl_ifmap =
+    alloc_tl(bmk,ifmap_shape, fmt, /*align*/1);
+  tl_t *tl_table_answer =
+    alloc_tl(bmk, table_shape, fmt, /*align*/1);
+  tl_t *tl_table_answer_mantissa =
+    alloc_tl(bmk, table_shape, fmt, /*align*/1);
+
+  tl_t *tl_ofmap_exp =
+    alloc_tl(bmk,ofmap_shape, fmt, /*align*/1);
+  tl_t *tl_ofmap_mantissa =
+    alloc_tl(bmk,ofmap_shape, fmt, /*align*/1);
+  tl_t *tl_ofmap_exp_val =
+    alloc_tl(bmk,ofmap_shape, fmt, /*align*/1);
+
+  tl_t *tl_ofmap_exp_val_u8 = nullptr;
+  tl_t *out = tl_ofmap_exp_val;
+
+  if (mode == GEN_POW_20_DATA_MAX_ERROR_U8) {
+    tl_ofmap_exp_val_u8 =
+      alloc_tl(bmk,ofmap_shape, FMT_U8, /*align*/1);
+  }
+
+  u16 *ofmap_data = (u16 *)get_bf16_tensor_l2g(ctx, bmk, out, fmt);
+  verify(ofmap_data, ref_data, ifmap, ifmap_size);
+
+  if (tl_ofmap_exp_val_u8) {
+    free_tl(bmk, tl_ofmap_exp_val_u8);
+  }
+  free_tl(bmk, tl_ofmap_exp_val);
+  free_tl(bmk, tl_ofmap_mantissa);
+  free_tl(bmk, tl_ofmap_exp);
+  free_tl(bmk, tl_table_answer_mantissa);
+  free_tl(bmk, tl_table_answer);
+  free_tl(bmk, tl_ifmap);
+
+  free(ifmap);
+  free(ifmap_mantissa);
+  free(table_data);
+  free(table_data_mantissa);
+  free(ref_data);
+  free(ofmap_data);
+}
+
+int main()
+{
+  CVI_RT_HANDLE ctx;
+  bmk_ctx_t *bmk;
+  int round_mode;
+
+  round_mode = set_store_feround();
+
+  test_init(&ctx, &bmk);
+
+  for (int i = PRE_DATA_COMPARE_FIX; i < TEST_MODE_MAX; i++) {
+    mode = static_cast<TEST_MODE>(i);
+    printf ("test mode %d...\n", mode);
+    test_tl_int8_lut_bf16(&ctx, bmk);
+  }
+
+  test_exit(&ctx);
+  restore_feround(round_mode);
+  return 0;
+} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_reciprocal_kernel.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_reciprocal_kernel.cpp new file mode 100644 index 000000000..0351efe92 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_reciprocal_kernel.cpp @@ -0,0 +1,443 @@ +/** + */
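+// A plain-CPU sketch of the decomposition these reciprocal kernels realize
+// with lookup tables (illustrative only, not the TPU path):
+#if 0
+#include <cmath>
+static float reciprocal_by_decomposition(float x) {
+  int e;
+  float m = frexpf(x, &e);              // x = m * 2^e, m in [0.5, 1)
+  return (1.0f / m) * ldexpf(1.0f, -e); // so 1/x = (1/m) * 2^-e
+}
+#endif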
+#include "../1880v2_test_util.h" + +#include +#include +#include +#include +#include +#include +//#define DBG + +using namespace std; + +/** + * pre_data means we test fixed pattern, it should be same sa lut + */ +enum TEST_MODE { + PRE_DATA_COMPARE_FIX = 0, // pre-data + fix compare + GEN_POW_20_DATA_MAX_ERROR, // generate 2^-20 ~ 2^20 value that check epsilon + TEST_MODE_MAX, +}; + +static TEST_MODE mode; + +static u16 test_pattern[] = { + 0x0000, 0x38D2, 0x3952, 0x399D, 0x39D2, 0x3A03, 0x3A1D, 0x3A38, 0x3A52, + 0x3A6C, 0x3A83, 0x3A90, 0x3A9D, 0x3AAA, 0x3AB8, 0x3AC5, 0x3AD2, 0x3ADF, + 0x3AEC, 0x3AF9, 0x3B03, 0x3B0A, 0x3B10, 0x3B17, 0x3B1D, 0x3B24, 0x3B2A, + 0x3B31, 0x3B38, 0x3B3E, 0x3B45, 0x3B4B, 0x3B52, 0x3B58, 0x3B5F, 0x3B65, + 0x3B6C, 0x3B72, 0x3B79, 0x3B80, 0x3B83, 0x3B86, 0x3B8A, 0x3B8D, 0x3B90, + 0x3B93, 0x3B97, 0x3B9A, 0x3B9D, 0x3BA1, 0x3BA4, 0x3BA7, 0x3BAA, 0x3BAE, + 0x3BB1, 0x3BB4, 0x3BB8, 0x3BBB, 0x3BBE, 0x3BC1, 0x3BC5, 0x3BC8, 0x3BCB, + 0x3BCE, 0x3BD2, 0x3BD5, 0x3BD8, 0x3BDC, 0x3BDF, 0x3BE2, 0x3BE5, 0x3BE9, + 0x3BEC, 0x3BEF, 0x3BF2, 0x3BF6, 0x3BF9, 0x3BFC, 0x3C00, 0x3C01, 0x3C03, + 0x3C05, 0x3C06, 0x3C08, 0x3C0A, 0x3C0B, 0x3C0D, 0x3C0F, 0x3C10, 0x3C12, + 0x3C13, 0x3C15, 0x3C17, 0x3C18, 0x3C1A, 0x3C1C, 0x3C1D, 0x3C1F, 0x3C21, + 0x3C22, 0x3C24, 0x3C25, 0x3C27, 0x3C29, 0x3C2A, 0x3C2C, 0x3C2E, 0x3C2F, + 0x3C31, 0x3C33, 0x3C34, 0x3C36, 0x3C38, 0x3C39, 0x3C3B, 0x3C3C, 0x3C3E, + 0x3C40, 0x3C41, 0x3C43, 0x3C45, 0x3C46, 0x3C48, 0x3C4A, 0x3C4B, 0x3C4D, + 0x3C4E, 0x3C50, 0x3C52, 0x3C53, 0x3C55, 0x3C57, 0x3C58, 0x3C5A, 0x3C5C, + 0x3C5D, 0x3C5F, 0x3C60, 0x3C62, 0x3C64, 0x3C65, 0x3C67, 0x3C69, 0x3C6A, + 0x3C6C, 0x3C6E, 0x3C6F, 0x3C71, 0x3C72, 0x3C74, 0x3C76, 0x3C77, 0x3C79, + 0x3C7B, 0x3C7C, 0x3C7E, 0x3C80, 0x3C81, 0x3C81, 0x3C82, 0x3C83, 0x3C84, + 0x3C85, 0x3C86, 0x3C86, 0x3C87, 0x3C88, 0x3C89, 0x3C8A, 0x3C8A, 0x3C8B, + 0x3C8C, 0x3C8D, 0x3C8E, 0x3C8F, 0x3C8F, 0x3C90, 0x3C91, 0x3C92, 0x3C93, + 0x3C93, 0x3C94, 0x3C95, 0x3C96, 0x3C97, 0x3C98, 0x3C98, 0x3C99, 0x3C9A, + 0x3C9B, 0x3C9C, 0x3C9C, 0x3C9D, 0x3C9E, 0x3C9F, 0x3CA0, 0x3CA1, 0x3CA1, + 0x3CA2, 0x3CA3, 0x3CA4, 0x3CA5, 0x3CA5, 0x3CA6, 0x3CA7, 0x3CA8, 0x3CA9, + 0x3CAA, 0x3CAA, 0x3CAB, 0x3CAC, 0x3CAD, 0x3CAE, 0x3CAE, 0x3CAF, 0x3CB0, + 0x3CB1, 0x3CB2, 0x3CB3, 0x3CB3, 0x3CB4, 0x3CB5, 0x3CB6, 0x3CB7, 0x3CB8, + 0x3CB8, 0x3CB9, 0x3CBA, 0x3CBB, 0x3CBC, 0x3CBC, 0x3CBD, 0x3CBE, 0x3CBF, + 0x3CC0, 0x3CC1, 0x3CC1, 0x3CC2, 0x3CC3, 0x3CC4, 0x3CC5, 0x3CC5, 0x3CC6, + 0x3CC7, 0x3CC8, 0x3CC9, 0x3CCA, 0x3CCA, 0x3CCB, 0x3CCC, 0x3CCD, 0x3CCE, + 0x3CCE, 0x3CCF, 0x3CD0, 0x3CD1, 0x3CD2, 0x3CD3, 0x3CD3, 0x3CD4, 0x3CD5, + 0x3CD6, 0x3CD7, 0x3CD7, 0x3CD8, 0x3CD9, 0x3CDA, 0x3CDB, 0x3CDC, 0x3CDC, + 0x3CDD, 0x3CDE, 0x3CDF, 0x3CE0, 0x3CE0, 0x3CE1, 0x3CE2, 0x3CE3, 0x3CE4, + 0x3CE5, 0x3CE5, 0x3CE6, 0x3CE7, 0x3CE8, 0x3CE9, 0x3CE9, 0x3CEA, 0x3CEB, + 0x3CEC, 0x3CED, 0x3CEE, 0x3CEE, 0x3CEF, 0x3CF0, 0x3CF1, 0x3CF2, 0x3CF2, + 0x3CF3, 0x3CF4, 0x3CF5, 0x3CF6, 0x3CF7, 0x3CF7, 0x3CF8, 0x3CF9, 0x3CFA, + 0x3CFB, 0x3CFB, 0x3CFC, 0x3CFD, 0x3CFE, 0x3CFF, 0x3D00, 0x3D00, 0x3D01, + 0x3D01, 0x3D01, 0x3D02, 0x3D02, 0x3D03, 0x3D03, 0x3D03, 0x3D04, 0x3D04, + 0x3D05, 0x3D05, 0x3D06, 0x3D06, 0x3D06, 0x3D07, 0x3D07, 0x3D08, 0x3D08, + 0x3D08, 0x3D09, 0x3D09, 0x3D0A, 0x3D0A, 0x3D0A, 0x3D0B, 0x3D0B, 0x3D0C, + 0x3D0C, 0x3D0C, 0x3D0D, 0x3D0D, 0x3D0E, 0x3D0E, 0x3D0F, 0x3D0F, 0x3D0F, + 0x3D10, 0x3D10, 0x3D11, 0x3D11, 0x3D11, 0x3D12, 0x3D12, 0x3D13, 0x3D13, + 0x3D13, 0x3D14, 0x3D14, 0x3D15, 0x3D15, 0x3D16, 0x3D16, 0x3D16, 0x3D17, + 0x3D17, 0x3D18, 0x3D18, 0x3D18, 0x3D19, 0x3D19, 0x3D1A, 0x3D1A, 0x3D1A, + 
0x3D1B, 0x3D1B, 0x3D1C, 0x3D1C, 0x3D1C, 0x3D1D, 0x3D1D, 0x3D1E, 0x3D1E, + 0x3D1F, 0x3D1F, 0x3D1F, 0x3D20, 0x3D20, 0x3D21, 0x3D21, 0x3D21, 0x3D22, + 0x3D22, 0x3D23, 0x3D23, 0x3D23, 0x3D24, 0x3D24, 0x3D25, 0x3D25, 0x3D25, + 0x3D26, 0x3D26, 0x3D27, 0x3D27, 0x3D28, 0x3D28, 0x3D28, 0x3D29, 0x3D29, + 0x3D2A, 0x3D2A, 0x3D2A, 0x3D2B, 0x3D2B, 0x3D2C, 0x3D2C, 0x3D2C, 0x3D2D, + 0x3D2D, 0x3D2E, 0x3D2E, 0x3D2E, 0x3D2F, 0x3D2F, 0x3D30, 0x3D30, 0x3D31, + 0x3D31, 0x3D31, 0x3D32, 0x3D32, 0x3D33, 0x3D33, 0x3D33, 0x3D34, 0x3D34, + 0x3D35, 0x3D35, 0x3D35, 0x3D36, 0x3D36, 0x3D37, 0x3D37, 0x3D38, 0x3D38, + 0x3D38, 0x3D39, 0x3D39, 0x3D3A, 0x3D3A, 0x3D3A, 0x3D3B, 0x3D3B, 0x3D3C, + 0x3D3C, 0x3D3C, 0x3D3D, 0x3D3D, 0x3D3E, 0x3D3E, 0x3D3E, 0x3D3F, 0x3D3F, + 0x3D40, 0x3D40, 0x3D41, 0x3D41, 0x3D41, 0x3D42, 0x3D42, 0x3D43, 0x3D43, + 0x3D43, 0x3D44, 0x3D44, 0x3D45, 0x3D45, 0x3D45, 0x3D46, 0x3D46, 0x3D47, + 0x3D47, 0x3D47, 0x3D48, 0x3D48, 0x3D49, 0x3D49, 0x3D4A, 0x3D4A, 0x3D4A, + 0x3D4B, 0x3D4B, 0x3D4C, 0x3D4C, 0x3D4C, 0x3D4D, 0x3D4D, 0x3D4E, 0x3D4E, + 0x3D4E, 0x3D4F, 0x3D4F, 0x3D50, 0x3D50, 0x3D50, 0x3D51, 0x3D51, 0x3D52, + 0x3D52, 0x3D53, 0x3D53, 0x3D53, 0x3D54, 0x3D54, 0x3D55, 0x3D55, 0x3D55, + 0x3D56, 0x3D56, 0x3D57, 0x3D57, 0x3D57, 0x3D58, 0x3D58, 0x3D59, 0x3D59, + 0x3D59, 0x3D5A, 0x3D5A, 0x3D5B, 0x3D5B, 0x3D5C, 0x3D5C, 0x3D5C, 0x3D5D, + 0x3D5D, 0x3D5E, 0x3D5E, 0x3D5E, 0x3D5F, 0x3D5F, 0x3D60, 0x3D60, 0x3D60, + 0x3D61, 0x3D61, 0x3D62, 0x3D62, 0x3D63, 0x3D63, 0x3D63, 0x3D64, 0x3D64, + 0x3D65, 0x3D65, 0x3D65, 0x3D66, 0x3D66, 0x3D67, 0x3D67, 0x3D67, 0x3D68, + 0x3D68, 0x3D69, 0x3D69, 0x3D69, 0x3D6A, 0x3D6A, 0x3D6B, 0x3D6B, 0x3D6C, + 0x3D6C, 0x3D6C, 0x3D6D, 0x3D6D, 0x3D6E, 0x3D6E, 0x3D6E, 0x3D6F, 0x3D6F, + 0x3D70, 0x3D70, 0x3D70, 0x3D71, 0x3D71, 0x3D72, 0x3D72, 0x3D72, 0x3D73, + 0x3D73, 0x3D74, 0x3D74, 0x3D75, 0x3D75, 0x3D75, 0x3D76, 0x3D76, 0x3D77, + 0x3D77, 0x3D77, 0x3D78, 0x3D78, 0x3D79, 0x3D79, 0x3D79, 0x3D7A, 0x3D7A, + 0x3D7B, 0x3D7B, 0x3D7B, 0x3D7C, 0x3D7C, 0x3D7D, 0x3D7D, 0x3D7E, 0x3D7E, + 0x3D7E, 0x3D7F, 0x3D7F, 0x3D80, 0x3D80, 0x3D80, 0x3D80, 0x3D81, 0x3D81, + 0x3D81, 0x3D81, 0x3D81, 0x3D82, 0x3D82, 0x3D82, 0x3D82, 0x3D82, 0x3D83, + 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D84, 0x3D84, 0x3D84, 0x3D84, 0x3D85, + 0x3D85, 0x3D85, 0x3D85, 0x3D85, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D86, + 0x3D87, 0x3D87, 0x3D87, 0x3D87, 0x3D87, 0x3D88, 0x3D88, 0x3D88, 0x3D88, + 0x3D88, 0x3D89, 0x3D89, 0x3D89, 0x3D89, 0x3D89, 0x3D8A, 0x3D8A, 0x3D8A, + 0x3D8A, 0x3D8A, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8C, 0x3D8C, + 0x3D8C, 0x3D8C, 0x3D8C, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8E, 0x3D8E, + 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D90, + 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D91, 0x3D91, 0x3D91, 0x3D91, 0x3D91, + 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D93, 0x3D93, 0x3D93, 0x3D93, + 0x3D93, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D95, 0x3D95, 0x3D95, + 0x3D95, 0x3D96, 0x3D96, 0x3D96, 0x3D96, 0x3D96, 0x3D97, 0x3D97, 0x3D97, + 0x3D97, 0x3D97, 0x3D98, 0x3D98, 0x3D98, 0x3D98, 0x3D98, 0x3D99, 0x3D99, + 0x3D99, 0x3D99, 0x3D99, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9B, + 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, + 0x3D9D, 0x3D9D, 0x3D9D, 0x3D9D, 0x3D9D, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9E, + 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA0, + 0x3DA0, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA2, 0x3DA2, 0x3DA2, + 0x3DA2, 0x3DA2, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA4, 0x3DA4, + 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA6, + 
0x3DA6, 0x3DA6, 0x3DA6, 0x3DA7, 0x3DA7, 0x3DA7, 0x3DA7, 0x3DA7, 0x3DA8, + 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, + 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, + 0x3DAB, 0x3DAC, 0x3DAC, 0x3DAC, 0x3DAC, 0x3DAC, 0x3DAD, 0x3DAD, 0x3DAD, + 0x3DAD, 0x3DAD, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAF, 0x3DAF, + 0x3DAF, 0x3DAF, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB1, 0x3DB1, + 0x3DB1, 0x3DB1, 0x3DB1, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB3, + 0x3DB3, 0x3DB3, 0x3DB3, 0x3DB3, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, + 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB6, + 0x3DB6, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB8, 0x3DB8, 0x3DB8, 0x3DB8, + 0x3DB8, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DBA, 0x3DBA, 0x3DBA, + 0x3DBA, 0x3DBA, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBC, 0x3DBC, + 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBD, 0x3DBD, 0x3DBD, 0x3DBD, 0x3DBD, 0x3DBE, + 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, + 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, + 0x3DC2, 0x3DC2, 0x3DC2, 0x3DC2, 0x3DC2, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, + 0x3DC3, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC5, 0x3DC5, 0x3DC5, + 0x3DC5, 0x3DC5, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC7, 0x3DC7, + 0x3DC7, 0x3DC7, 0x3DC7, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC9, + 0x3DC9, 0x3DC9, 0x3DC9, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCB, + 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCC, 0x3DCC, 0x3DCC, 0x3DCC, 0x3DCC, + 0x3DCD, 0x3DCE, 0x3DCF, 0x3DD0, 0x3DD1, 0x3DD2, 0x3DD3, 0x3DD4, 0x3DD5, + 0x3DD6, 0x3DD7, 0x3DD8, 0x3DD9, 0x3DDA, 0x3DDB, 0x3DDC, 0x3DDD, 0x3DDE, + 0x3DDF, 0x3DE0, 0x3DE1, 0x3DE2, 0x3DE3, 0x3DE4, 0x3DE5, +}; + +static u16 test_pattern_ref[] = { + 0x7f7f, 0x461c, 0x459c, 0x4551, 0x451c, 0x44fa, 0x44d1, 0x44b2, 0x449c, + 0x448b, 0x447a, 0x4464, 0x4451, 0x4441, 0x4432, 0x4426, 0x441c, 0x4413, + 0x440b, 0x4404, 0x43fa, 0x43ed, 0x43e4, 0x43d9, 0x43d1, 0x43c8, 0x43c1, + 0x43b9, 0x43b2, 0x43ac, 0x43a6, 0x43a1, 0x439c, 0x4398, 0x4393, 0x438f, + 0x438b, 0x4387, 0x4384, 0x4380, 0x437a, 0x4375, 0x436d, 0x4368, 0x4364, + 0x435f, 0x4359, 0x4355, 0x4351, 0x434c, 0x4348, 0x4344, 0x4341, 0x433c, + 0x4339, 0x4336, 0x4332, 0x432f, 0x432c, 0x432a, 0x4326, 0x4324, 0x4321, + 0x431f, 0x431c, 0x431a, 0x4318, 0x4315, 0x4313, 0x4311, 0x430f, 0x430d, + 0x430b, 0x4309, 0x4307, 0x4305, 0x4304, 0x4302, 0x4300, 0x42fe, 0x42fa, + 0x42f6, 0x42f5, 0x42f1, 0x42ed, 0x42ec, 0x42e8, 0x42e5, 0x42e4, 0x42e0, + 0x42df, 0x42dc, 0x42d9, 0x42d8, 0x42d5, 0x42d2, 0x42d1, 0x42ce, 0x42cc, + 0x42ca, 0x42c8, 0x42c7, 0x42c4, 0x42c2, 0x42c1, 0x42bf, 0x42bc, 0x42bb, + 0x42b9, 0x42b7, 0x42b6, 0x42b4, 0x42b2, 0x42b1, 0x42af, 0x42ae, 0x42ac, + 0x42ab, 0x42aa, 0x42a8, 0x42a6, 0x42a5, 0x42a4, 0x42a2, 0x42a1, 0x42a0, + 0x429f, 0x429e, 0x429c, 0x429b, 0x429a, 0x4298, 0x4298, 0x4296, 0x4295, + 0x4294, 0x4293, 0x4292, 0x4291, 0x4290, 0x428f, 0x428e, 0x428d, 0x428c, + 0x428b, 0x428a, 0x4289, 0x4288, 0x4287, 0x4286, 0x4285, 0x4285, 0x4284, + 0x4283, 0x4282, 0x4281, 0x4280, 0x427e, 0x427e, 0x427c, 0x427a, 0x4278, + 0x4276, 0x4275, 0x4275, 0x4273, 0x4271, 0x426f, 0x426d, 0x426d, 0x426c, + 0x426a, 0x4268, 0x4267, 0x4265, 0x4265, 0x4264, 0x4262, 0x4260, 0x425f, + 0x425f, 0x425d, 0x425c, 0x425a, 0x4259, 0x4258, 0x4258, 0x4256, 0x4255, + 0x4253, 0x4252, 0x4252, 0x4251, 0x424f, 0x424e, 0x424d, 0x424c, 0x424c, + 0x424a, 0x4249, 0x4248, 0x4247, 0x4247, 0x4245, 0x4244, 0x4243, 0x4242, + 0x4241, 0x4241, 0x4240, 0x423f, 0x423d, 0x423c, 
0x423c, 0x423b, 0x423a, + 0x4239, 0x4238, 0x4237, 0x4237, 0x4236, 0x4235, 0x4234, 0x4233, 0x4232, + 0x4232, 0x4231, 0x4230, 0x422f, 0x422e, 0x422e, 0x422d, 0x422c, 0x422c, + 0x422b, 0x422a, 0x422a, 0x4229, 0x4228, 0x4227, 0x4226, 0x4226, 0x4225, + 0x4225, 0x4224, 0x4223, 0x4222, 0x4222, 0x4221, 0x4221, 0x4220, 0x421f, + 0x421f, 0x421e, 0x421e, 0x421d, 0x421c, 0x421b, 0x421b, 0x421b, 0x421a, + 0x4219, 0x4218, 0x4218, 0x4218, 0x4217, 0x4216, 0x4216, 0x4215, 0x4215, + 0x4214, 0x4214, 0x4213, 0x4212, 0x4212, 0x4212, 0x4211, 0x4210, 0x4210, + 0x420f, 0x420f, 0x420e, 0x420e, 0x420d, 0x420d, 0x420d, 0x420c, 0x420b, + 0x420b, 0x420a, 0x420a, 0x420a, 0x4209, 0x4209, 0x4208, 0x4207, 0x4207, + 0x4207, 0x4206, 0x4206, 0x4205, 0x4205, 0x4205, 0x4204, 0x4204, 0x4203, + 0x4203, 0x4203, 0x4202, 0x4202, 0x4201, 0x4201, 0x4200, 0x4200, 0x41fe, + 0x41fe, 0x41fe, 0x41fc, 0x41fc, 0x41fa, 0x41fa, 0x41fa, 0x41f8, 0x41f8, + 0x41f6, 0x41f6, 0x41f5, 0x41f5, 0x41f5, 0x41f3, 0x41f3, 0x41f1, 0x41f1, + 0x41f1, 0x41ef, 0x41ef, 0x41ed, 0x41ed, 0x41ed, 0x41ec, 0x41ec, 0x41ea, + 0x41ea, 0x41ea, 0x41e8, 0x41e8, 0x41e7, 0x41e7, 0x41e5, 0x41e5, 0x41e5, + 0x41e4, 0x41e4, 0x41e2, 0x41e2, 0x41e2, 0x41e0, 0x41e0, 0x41df, 0x41df, + 0x41df, 0x41dd, 0x41dd, 0x41dc, 0x41dc, 0x41da, 0x41da, 0x41da, 0x41d9, + 0x41d9, 0x41d8, 0x41d8, 0x41d8, 0x41d6, 0x41d6, 0x41d5, 0x41d5, 0x41d5, + 0x41d3, 0x41d3, 0x41d2, 0x41d2, 0x41d2, 0x41d1, 0x41d1, 0x41cf, 0x41cf, + 0x41ce, 0x41ce, 0x41ce, 0x41cd, 0x41cd, 0x41cc, 0x41cc, 0x41cc, 0x41ca, + 0x41ca, 0x41c9, 0x41c9, 0x41c9, 0x41c8, 0x41c8, 0x41c7, 0x41c7, 0x41c7, + 0x41c5, 0x41c5, 0x41c4, 0x41c4, 0x41c3, 0x41c3, 0x41c3, 0x41c2, 0x41c2, + 0x41c1, 0x41c1, 0x41c1, 0x41c0, 0x41c0, 0x41bf, 0x41bf, 0x41bf, 0x41bd, + 0x41bd, 0x41bc, 0x41bc, 0x41bc, 0x41bb, 0x41bb, 0x41ba, 0x41ba, 0x41b9, + 0x41b9, 0x41b9, 0x41b8, 0x41b8, 0x41b7, 0x41b7, 0x41b7, 0x41b6, 0x41b6, + 0x41b5, 0x41b5, 0x41b5, 0x41b4, 0x41b4, 0x41b3, 0x41b3, 0x41b2, 0x41b2, + 0x41b2, 0x41b1, 0x41b1, 0x41b0, 0x41b0, 0x41b0, 0x41af, 0x41af, 0x41ae, + 0x41ae, 0x41ae, 0x41ad, 0x41ad, 0x41ac, 0x41ac, 0x41ac, 0x41ac, 0x41ac, + 0x41ab, 0x41ab, 0x41aa, 0x41aa, 0x41aa, 0x41a9, 0x41a9, 0x41a8, 0x41a8, + 0x41a8, 0x41a7, 0x41a7, 0x41a6, 0x41a6, 0x41a6, 0x41a5, 0x41a5, 0x41a5, + 0x41a5, 0x41a5, 0x41a4, 0x41a4, 0x41a3, 0x41a3, 0x41a2, 0x41a2, 0x41a2, + 0x41a1, 0x41a1, 0x41a1, 0x41a1, 0x41a1, 0x41a0, 0x41a0, 0x419f, 0x419f, + 0x419f, 0x419e, 0x419e, 0x419e, 0x419e, 0x419e, 0x419d, 0x419d, 0x419c, + 0x419c, 0x419b, 0x419b, 0x419b, 0x419b, 0x419b, 0x419a, 0x419a, 0x419a, + 0x4199, 0x4199, 0x4198, 0x4198, 0x4198, 0x4198, 0x4198, 0x4197, 0x4197, + 0x4197, 0x4196, 0x4196, 0x4196, 0x4196, 0x4195, 0x4195, 0x4195, 0x4194, + 0x4194, 0x4194, 0x4194, 0x4194, 0x4193, 0x4193, 0x4192, 0x4192, 0x4192, + 0x4192, 0x4192, 0x4191, 0x4191, 0x4190, 0x4190, 0x4190, 0x4190, 0x4190, + 0x418f, 0x418f, 0x418f, 0x418e, 0x418e, 0x418e, 0x418e, 0x418e, 0x418d, + 0x418d, 0x418d, 0x418d, 0x418d, 0x418c, 0x418c, 0x418b, 0x418b, 0x418b, + 0x418b, 0x418b, 0x418a, 0x418a, 0x418a, 0x418a, 0x418a, 0x4189, 0x4189, + 0x4189, 0x4189, 0x4189, 0x4188, 0x4188, 0x4187, 0x4187, 0x4187, 0x4187, + 0x4187, 0x4186, 0x4186, 0x4186, 0x4186, 0x4186, 0x4185, 0x4185, 0x4185, + 0x4185, 0x4185, 0x4184, 0x4184, 0x4184, 0x4184, 0x4184, 0x4183, 0x4183, + 0x4183, 0x4183, 0x4183, 0x4182, 0x4182, 0x4182, 0x4182, 0x4181, 0x4181, + 0x4181, 0x4181, 0x4181, 0x4180, 0x4180, 0x4180, 0x4180, 0x417e, 0x417e, + 0x417e, 0x417e, 0x417e, 0x417c, 0x417c, 0x417c, 0x417c, 0x417c, 0x417a, + 0x417a, 0x417a, 0x417a, 0x417a, 0x4178, 0x4178, 
0x4178, 0x4178, 0x4176, + 0x4176, 0x4176, 0x4176, 0x4176, 0x4175, 0x4175, 0x4175, 0x4175, 0x4175, + 0x4173, 0x4173, 0x4173, 0x4173, 0x4173, 0x4171, 0x4171, 0x4171, 0x4171, + 0x4171, 0x416f, 0x416f, 0x416f, 0x416f, 0x416f, 0x416d, 0x416d, 0x416d, + 0x416d, 0x416d, 0x416c, 0x416c, 0x416c, 0x416c, 0x416c, 0x416a, 0x416a, + 0x416a, 0x416a, 0x416a, 0x4168, 0x4168, 0x4168, 0x4168, 0x4167, 0x4167, + 0x4167, 0x4167, 0x4167, 0x4165, 0x4165, 0x4165, 0x4165, 0x4165, 0x4164, + 0x4164, 0x4164, 0x4164, 0x4164, 0x4162, 0x4162, 0x4162, 0x4162, 0x4162, + 0x4160, 0x4160, 0x4160, 0x4160, 0x4160, 0x415f, 0x415f, 0x415f, 0x415f, + 0x415f, 0x415d, 0x415d, 0x415d, 0x415d, 0x415d, 0x415c, 0x415c, 0x415c, + 0x415c, 0x415a, 0x415a, 0x415a, 0x415a, 0x415a, 0x4159, 0x4159, 0x4159, + 0x4159, 0x4159, 0x4158, 0x4158, 0x4158, 0x4158, 0x4158, 0x4156, 0x4156, + 0x4156, 0x4156, 0x4156, 0x4155, 0x4155, 0x4155, 0x4155, 0x4155, 0x4153, + 0x4153, 0x4153, 0x4153, 0x4153, 0x4152, 0x4152, 0x4152, 0x4152, 0x4152, + 0x4151, 0x4151, 0x4151, 0x4151, 0x4151, 0x414f, 0x414f, 0x414f, 0x414f, + 0x414e, 0x414e, 0x414e, 0x414e, 0x414e, 0x414d, 0x414d, 0x414d, 0x414d, + 0x414d, 0x414c, 0x414c, 0x414c, 0x414c, 0x414c, 0x414a, 0x414a, 0x414a, + 0x414a, 0x414a, 0x4149, 0x4149, 0x4149, 0x4149, 0x4149, 0x4148, 0x4148, + 0x4148, 0x4148, 0x4148, 0x4147, 0x4147, 0x4147, 0x4147, 0x4147, 0x4145, + 0x4145, 0x4145, 0x4145, 0x4144, 0x4144, 0x4144, 0x4144, 0x4144, 0x4143, + 0x4143, 0x4143, 0x4143, 0x4143, 0x4142, 0x4142, 0x4142, 0x4142, 0x4142, + 0x4141, 0x4141, 0x4141, 0x4141, 0x4141, 0x4140, 0x4140, 0x4140, 0x4140, + 0x4140, 0x413f, 0x413f, 0x413f, 0x413f, 0x413f, 0x413d, 0x413d, 0x413d, + 0x413d, 0x413d, 0x413c, 0x413c, 0x413c, 0x413c, 0x413c, 0x413b, 0x413b, + 0x413b, 0x413b, 0x413a, 0x413a, 0x413a, 0x413a, 0x413a, 0x4139, 0x4139, + 0x4139, 0x4139, 0x4139, 0x4138, 0x4138, 0x4138, 0x4138, 0x4138, 0x4137, + 0x4137, 0x4137, 0x4137, 0x4137, 0x4136, 0x4136, 0x4136, 0x4136, 0x4136, + 0x4135, 0x4135, 0x4135, 0x4135, 0x4135, 0x4134, 0x4134, 0x4134, 0x4134, + 0x4134, 0x4133, 0x4133, 0x4133, 0x4133, 0x4132, 0x4132, 0x4132, 0x4132, + 0x4132, 0x4131, 0x4131, 0x4131, 0x4131, 0x4131, 0x4130, 0x4130, 0x4130, + 0x4130, 0x4130, 0x412f, 0x412f, 0x412f, 0x412f, 0x412f, 0x412e, 0x412e, + 0x412e, 0x412e, 0x412e, 0x412d, 0x412d, 0x412d, 0x412d, 0x412d, 0x412c, + 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, + 0x412b, 0x412b, 0x412b, 0x412b, 0x412a, 0x412a, 0x412a, 0x412a, 0x412a, + 0x4129, 0x4129, 0x4129, 0x4129, 0x4129, 0x4128, 0x4128, 0x4128, 0x4128, + 0x4128, 0x4127, 0x4127, 0x4127, 0x4127, 0x4127, 0x4126, 0x4126, 0x4126, + 0x4126, 0x4126, 0x4125, 0x4125, 0x4125, 0x4125, 0x4125, 0x4125, 0x4125, + 0x4125, 0x4125, 0x4125, 0x4124, 0x4124, 0x4124, 0x4124, 0x4124, 0x4123, + 0x4123, 0x4123, 0x4123, 0x4122, 0x4122, 0x4122, 0x4122, 0x4122, 0x4121, + 0x4121, 0x4121, 0x4121, 0x4121, 0x4121, 0x4121, 0x4121, 0x4121, 0x4121, + 0x4120, 0x411f, 0x411e, 0x411e, 0x411d, 0x411c, 0x411b, 0x411b, 0x411a, + 0x4119, 0x4118, 0x4118, 0x4117, 0x4116, 0x4116, 0x4115, 0x4114, 0x4114, + 0x4113, 0x4112, 0x4112, 0x4111, 0x4110, 0x4110, 0x410f, +}; + +static void tl_lut_ref(u16 *ofmap, u16 *ifmap, tl_shape_t ifmap_shape) +{ + for (u32 i = 0; i < tl_shape_size(&ifmap_shape); i++) { + if (mode == PRE_DATA_COMPARE_FIX) { + ofmap[i] = test_pattern_ref[i]; + } else { + u16 v = convert_fp32_bf16(1 / (1.0 * (convert_bf16_fp32(ifmap[i])))); + ofmap[i] = v; + } + } +} + +static bool verify(u16 *ofmap_data, u16 *ref_data, u16 *ifmap, + u64 ifmap_shape_size, TEST_MODE mode) +{ + u64 
size = ifmap_shape_size;
+
+  for (u64 i = 0; i < size; i++) {
+    bool is_close;
+    u16 ref;
+    u16 ofmap_data_bf16;
+    float ref_f;
+    float ofmap_data_f;
+
+    ref = ref_data[i];
+    ref_f = convert_bf16_fp32(ref);
+    ofmap_data_f = convert_bf16_fp32(ofmap_data[i]);
+    ofmap_data_bf16 = ofmap_data[i];
+
+    if (mode == PRE_DATA_COMPARE_FIX) {
+      is_close = ofmap_data[i] == ref;
+    } else {
+      is_close = fabs(ref_f - ofmap_data_f) < 0.001;
+    }
+
+    if (!is_close) {
+      fprintf(stderr,
+              "comparing failed at ofmap_data[%" PRIu64 "](input:%e), got %x, exp %x, "
+              "fp32: got %e exp %e\n",
+              i, convert_bf16_fp32(ifmap[i]), ofmap_data_bf16, ref,
+              ofmap_data_f, ref_f);
+      exit(-1);
+    }
+  }
+
+  return true;
+}
+
+static void gen_input(u16 *ifmap, u64 ifmap_shape_size)
+{
+
+  if (mode == PRE_DATA_COMPARE_FIX) {
+    memcpy(ifmap, &test_pattern, sizeof(test_pattern));
+  } else {
+    srand(static_cast<unsigned>(time(0)));
+    std::random_device rd;
+    std::mt19937 e2(rd());
+    float LO = pow(2, -10);
+    float HI = pow(2, 10);
+    // std::uniform_real_distribution<> dist(pow(2,-62), pow(2,63));
+    for (u64 i = 0; i < ifmap_shape_size; i++) {
+      // float r3 = dist(e2);
+      float r3 = LO + static_cast<float>(rand()) /
+                          (static_cast<float>(RAND_MAX / (HI - LO)));
+      ifmap[i] = convert_fp32_bf16(r3);
+    }
+  }
+
+#ifdef DBG
+  for (u64 i = 0; i < ifmap_shape_size; i++) {
+    printf("source if[%" PRIu64 "] bf16 %f 0x%x, log2f is %f\n", i,
+           convert_bf16_fp32(ifmap[i]), ifmap[i],
+           floor(log2((convert_bf16_fp32(ifmap[i])))));
+  }
+#endif /* ifdef DBG */
+}
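+
+// gen_input() and verify() round-trip through bf16, which is the top half of
+// an IEEE-754 fp32; a minimal truncating conversion is a 16-bit shift (sketch
+// below; the convert_fp32_bf16 helper in 1880v2_test_util may round to
+// nearest instead of truncating):
+#if 0
+#include <cstring>
+static unsigned short fp32_to_bf16_trunc(float f) {
+  unsigned int u;
+  memcpy(&u, &f, sizeof(u));        // type-pun without undefined behavior
+  return (unsigned short)(u >> 16); // keep sign, exponent, top 7 mantissa bits
+}
+#endif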
+
+static void testbench(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, u32 input_n, u32 input_c,
+                      u32 input_h, u32 input_w)
+{
+  fmt_t fmt = FMT_BF16;
+
+  // TODO: check more shape / align
+  tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w};
+  tl_shape_t ofmap_shape = ifmap_shape;
+  tl_shape_t table_shape;
+  bf16_table_shape(bmk, &table_shape);
+
+  u64 ifmap_shape_size = tl_shape_size(&ifmap_shape);
+  u64 ofmap_size = tl_shape_size(&ofmap_shape);
+  u64 table_size = tl_shape_size(&table_shape);
+
+  // prepare input data with size
+  int data_type_size = bytesize_of_fmt(fmt);
+  u64 ifmap_bytesize = ifmap_shape_size * data_type_size;
+  u64 ofmap_bytesize = ofmap_size * data_type_size;
+  u64 table_bytesize = table_size * data_type_size;
+
+  u16 *ifmap = (u16 *)xmalloc(ifmap_bytesize);
+  u16 *ref_data = (u16 *)xmalloc(ofmap_bytesize);
+  u16 *table_data = (u16 *)xmalloc(table_bytesize);
+  u16 *table_data_mantissa = (u16 *)xmalloc(table_bytesize);
+
+  // alloc lmem
+  tl_t *tl_ifmap = alloc_tl(bmk, ifmap_shape, fmt, /*align*/1);
+  tl_t *tl_ofmap_bf16 = alloc_tl(bmk, ofmap_shape, fmt, /*align*/1);
+  tl_t *tl_buf = tl_ifmap ? alloc_tl(bmk, tl_ifmap->shape, fmt, /*align*/1) : nullptr;
+  tl_t *tl_table_answer = alloc_tl(bmk, table_shape, fmt, /*align*/1);
+  tl_t *tl_table_answer_mantissa =
+      alloc_tl(bmk, table_shape, fmt, /*align*/1);
+
+  // generate test input and reference
+  gen_input(ifmap, ifmap_shape_size);
+  tl_lut_ref(ref_data, ifmap, ifmap_shape);
+
+  // prepare table
+  bf16_reciprocal_tbl(table_data, table_data_mantissa, &table_shape);
+
+  // sys->lmem
+  put_bf16_tensor_g2l(ctx, bmk, tl_ifmap, (u16 *)ifmap, FMT_BF16);
+  put_bf16_tensor_g2l(ctx, bmk, tl_table_answer, (u16 *)table_data, FMT_BF16);
+  put_bf16_tensor_g2l(ctx, bmk, tl_table_answer_mantissa,
+                      (u16 *)table_data_mantissa, FMT_BF16);
+
+  bf16_emit_reciprocal(bmk, tl_ifmap, tl_buf, tl_table_answer,
+                       tl_table_answer_mantissa, tl_ofmap_bf16);
+
+  // issue cmd
+  test_submit(ctx);
+
+  // get output from lmem->sys
+  u16 *ofmap_data =
+      (u16 *)get_bf16_tensor_l2g(ctx, bmk, tl_ofmap_bf16, tl_ofmap_bf16->fmt);
+
+  verify(ofmap_data, ref_data, ifmap, ifmap_shape_size, mode);
+
+  free_tl(bmk, tl_table_answer_mantissa);
+  free_tl(bmk, tl_table_answer);
+  free_tl(bmk, tl_buf);
+  free_tl(bmk, tl_ofmap_bf16);
+  free_tl(bmk, tl_ifmap);
+
+  free(ifmap);
+  free(ref_data);
+  free(ofmap_data);
+  free(table_data);
+  free(table_data_mantissa);
+}
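+
+// Note: the mode loop in main() below starts at GEN_POW_20_DATA_MAX_ERROR, so
+// the PRE_DATA_COMPARE_FIX fixed-pattern path above is not exercised by default.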
+
+int main()
+{
+  CVI_RT_HANDLE ctx;
+  bmk_ctx_t *bmk;
+  int round_mode;
+
+  round_mode = set_store_feround();
+
+  test_init(&ctx, &bmk);
+
+  for (int i = GEN_POW_20_DATA_MAX_ERROR; i < TEST_MODE_MAX; i++) {
+    mode = static_cast<TEST_MODE>(i);
+    printf("test mode %d...\n", mode);
+
+    int input_n = 1;
+    int input_c = 32;
+    int input_h = 1;
+    int input_w = 1;
+
+    if (mode == PRE_DATA_COMPARE_FIX) {
+      input_h = 4;
+      input_w = 8;
+    } else {
+      input_h = input_w = 16;
+    }
+
+    testbench(&ctx, bmk, input_n, input_c, input_h, input_w);
+  }
+
+  test_exit(&ctx);
+  restore_feround(round_mode);
+  return 0;
+} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_sigmoid.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_sigmoid.cpp new file mode 100644 index 000000000..8ad8ddb26 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_sigmoid.cpp @@ -0,0 +1,141 @@
+#include "../1880v2_test_util.h"
+#include <cmath>
+
+static u32 channel = 32;
+
+static bool check_input_int8_range(float input) {
+  bool ret = input > -128.0 && input < 128.0;
+  if (!ret) {
+    printf("invalid int8 range, input is %f\n", input);
+  }
+  return ret;
+}
+
+static double _gen_sigmoid(float x) {
+  return 1.0 / (1.0 + exp(-(x)));
+}
+
+static void tl_lut_ref(
+    u16 *ofmap,
+    u16 *ifmap,
+    u16 *table,
+    u16 *table_slope,
+    tl_shape_t ifmap_shape,
+    tl_shape_t table_shape)
+{
+  int tn, th, tw;
+
+  tn = table_shape.n;
+  th = table_shape.h;
+  tw = table_shape.w;
+  assert(tn == 1);
+  assert(th * tw == 256);
+  assert(table);
+  assert(table_slope);
+  assert(ifmap_shape.n);
+  assert(ifmap);
+  assert(ofmap);
+
+  // TODO: use c function
+  // 1. dump all input as binary file
+#if 0
+  #define INFP32FILE "infp32file.bin"
+  #define OUTBF16FILE "lutbf16out.bin"
+  FILE* pFile;
+  pFile = fopen(INFP32FILE, "wb");
+  int shape_sz = tl_shape_size(&ifmap_shape);
+  float *f = (float *)malloc(sizeof(float) * shape_sz);
+  for (int i = 0; i < shape_sz; i++) {
+    f[i] = convert_bf16_fp32(ifmap[i]);
+  }
+  fwrite(f, 1, shape_sz *sizeof(float), pFile);
+  fclose(pFile);
+
+  // 2. read result from `eval_lut.py`
+  char command[256];
+  sprintf(command, "python eval_lut.py --lut_input_range_start %d --lut_input_range_end %d --inputfloat32 %s --outputbf16 %s 2>&1 > 2\n",
+          range_start, range_end,
+          INFP32FILE, OUTBF16FILE);
+
+  int r;
+  r = system(command);
+  printf ("command is %s, return %d\n", command, r);
+
+  pFile = fopen(OUTBF16FILE, "rb");
+  if (!pFile) {
+    fprintf(stderr, "open golden %s fail\n", OUTBF16FILE);
+    exit(-1);
+  }
+
+  size_t file_length;
+  file_length = fread(ofmap, sizeof(u16), tl_shape_size(&ifmap_shape), pFile);
+  printf("read from golden, file size %" PRIu64 "\n", file_length);
+  fclose(pFile);
+#endif
+
+#if 0
+  for (u64 i = 0; i < tl_shape_size(&ifmap_shape); i++) {
+    printf ("ref %" PRIu64 " input %x golden %x\n", i, ifmap[i], ofmap[i]);
+  }
+#endif
+}
+
+static void gen_sigmoid(u16 *table_data, u64 table_size) {
+  // S(x) = 1 / (1 + (e^-x))
+  // half) {
+      x0 = sigmode_hw[i];
+      x1 = sigmode_hw[i-1];
+      delta = -1.0;
+    }
+    double s = (x1 - x0) / delta; // x1 already scale up
+    table_slope[i] = convert_fp32_bf16((float)s);
+#if 0
+    printf ("slope table [%u] = (bf16 %f double %.8lf float %f), 0x%x, %.8lf - %.8lf(%.8lf)\n",
+            i, convert_bf16_fp32(table_slope[i]), s, (float)s, table_slope[i], x1, x0, x1-x0);
+#endif
+  }
+
+  // duplicate channel #1 to #31
+
+  //TODO: tensor copy
+  for (u64 i = 1; i < channel; i++) {
+    memcpy(&table_slope[table_hw * i], &table_slope[0], sizeof(u16) * table_hw);
+  }
+}
+
+static bool verify(u16 *ofmap_data, u16 *ref_data, u64 ofmap_size) {
+  int count = 0;
+  u64 size = ofmap_size;
+  if (mode == PRE_DATA_COMPARE_FIX) {
+    size = sizeof(sigmode_golden_bf16) / sizeof(u16);
+  }
+  else if (mode == PRE_DATA_MAX_ERROR) {
+    size = sizeof(sigmode_golden) / sizeof(double);
+  }
+
+  for (u64 i = 0; i < size; i++) {
+    if (mode == PRE_DATA_COMPARE_FIX) {
+      if (ofmap_data[i] != sigmode_golden_bf16[i]) {
+        fprintf(stderr,
+                "[%d] comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n",
+                count, i, ofmap_data[i], sigmode_golden_bf16[i]);
+        exit(-1);
+      }
+    }
+    else {
+      float got = convert_bf16_fp32(ofmap_data[i]);
+      float exp = convert_bf16_fp32(ref_data[i]);
+
+      if (mode == PRE_DATA_MAX_ERROR) {
+        // since we have better accuracy ~ 0.0039
+        exp = sigmode_golden[i];
+      }
+
+      if (fabs(got - exp) > MAX_ERROR) {
+        fprintf(stderr,
+                "[%d] comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x, diff(%f - %f) is %f\n",
+                count, i, ofmap_data[i], ref_data[i], got, exp, fabs(got - exp)
+               );
+        count++;
+      }
+    }
+  }
+
+  if (count != 0) {
+    printf("error count is %d\n", count);
+    exit(-1);
+  }
+
+  return true;
+}
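+
+// The sigmoid test pairs the f(x0) table with the slope table from
+// gen_sigmoid_slope(): the int8 index picks a knot x0 and the result is
+// refined linearly around it, conceptually as below (names are illustrative,
+// not the kernel's own symbols):
+#if 0
+static float sigmoid_interp(float x, float x0, float f_x0, float slope_x0) {
+  return f_x0 + slope_x0 * (x - x0); // first-order expansion around x0
+}
+#endif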
+
+static void test_tl_int8_lut_bf16(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk)
+{
+  // TODO: check more shape / align
+  tl_shape_t ifmap_shape;
+  if (mode == PRE_DATA_COMPARE_FIX || mode == PRE_DATA_MAX_ERROR) {
+    ifmap_shape = {1, channel, 8, 8};
+  }
+  else {
+    ifmap_shape = {1, channel, 16, 16};
+  }
+
+  tl_shape_t table_shape = {1, channel, table_h, table_w}; // hard code for hw, hw:32x8
+  tl_shape_t ofmap_shape = ifmap_shape;
+
+  u64 ifmap_size = tl_shape_size(&ifmap_shape);
+  u64 table_size = tl_shape_size(&table_shape);
+  u64 ofmap_size = tl_shape_size(&ofmap_shape);
+
+  fmt_t fmt = FMT_BF16;
+
+  int data_type_size = bytesize_of_fmt(fmt);
+  u64 ifmap_bytesize = ifmap_size * data_type_size;
+  u64 table_bytesize = table_size * data_type_size;
+  u64 ofmap_bytesize = ofmap_size * data_type_size;
+
+  // hw ONLY support index in int8
+  u16 *ifmap = (u16 *)xmalloc(ifmap_bytesize);
+  memset(ifmap, 0x00, ifmap_bytesize);
+
+  u16 *ifmap_slope = (u16 *)xmalloc(ifmap_bytesize);
+  memset(ifmap_slope, 0x00, ifmap_bytesize);
+
+  if (mode == PRE_DATA_COMPARE_FIX || mode == PRE_DATA_MAX_ERROR) {
+    memcpy(ifmap, &test_pattern, sizeof(test_pattern));
+
+#if 0
+    for (u64 i = 0; i < ifmap_size; i++) {
+      printf("source if[%" PRIu64 "] is bf16 %f (bf16)with 0x%x\n", i, convert_bf16_fp32(ifmap[i]), ifmap[i]);
+    }
+#endif
+  }
+  else {
+    for (u64 i = 0; i < ifmap_size; i++) {
+      // input range is -8 ~ +8
+      float input = ((int)i % 7) * (((int)i % 2) ? 1 : -1) + 0.03 + (i % table_hw) * 0.002;
+      //float input = ((int)i % 10) * (((int)i % 2) ? 1 : -1) + 0.03 + (i % table_hw) * 0.002;
+      assert(check_input_int8_range(input));
+      ifmap[i] = convert_fp32_bf16(input);
+#if 1
+      printf("source if[%" PRIu64 "] is bf16 %f, input is %f (bf16)with 0x%x\n", i, convert_bf16_fp32(ifmap[i]), input, ifmap[i]);
+#endif
+    }
+  }
+
+  u16 *table_data = (u16 *)xmalloc(table_bytesize);
+  gen_sigmoid (table_data, table_size);
+
+  u16 *table_data_slope = (u16 *)xmalloc(table_bytesize);
+  gen_sigmoid_slope(table_data, table_data_slope, table_size);
+
+  u16 *ref_data = (u16 *)xmalloc(ofmap_bytesize);
+  tl_lut_ref(ref_data, ifmap, table_data, table_data_slope, ifmap_shape, table_shape);
+
+  tl_t *tl_ifmap =
+    alloc_tl(bmk,ifmap_shape, fmt, /*align*/1);
+  tl_t *tl_table_answer =
+    alloc_tl(bmk, table_shape, fmt, /*align*/1);
+  tl_t *tl_table_answer_slope =
+    alloc_tl(bmk, table_shape, fmt, /*align*/1);
+
+  tl_t *tl_ofmap_A_idx_int8 =
+    alloc_tl(bmk,ofmap_shape, fmt, /*align*/1);
+  tl_t *tl_ofmap_A_idx =
+    alloc_tl(bmk,ofmap_shape, fmt, /*align*/1);
+  tl_t *tl_ofmap_B_slope =
+    alloc_tl(bmk,ofmap_shape, fmt, /*align*/1);
+  tl_t *tl_ofmap_A_base =
+    alloc_tl(bmk,ofmap_shape, fmt, /*align*/1);
+  tl_t *tl_ofmap_C =
+    alloc_tl(bmk,ofmap_shape, fmt, /*align*/1);
+
+  tl_shape_t tl_ofmap_A_idx_int8_shape = {1, channel, ofmap_shape.h * ofmap_shape.w, 1};
+  tl_t *tl_ofmap_A_idx_int8_reshape =
+    alloc_tl(bmk,tl_ofmap_A_idx_int8_shape, FMT_I8, /*align*/1);
+
+  bmk1880v2_tdma_tg2l_tensor_copy_param_t copy_p1, copy_p2, copy_p3;
+  memset(&copy_p1, 0, sizeof(copy_p1));
+  memset(&copy_p2, 0, sizeof(copy_p2));
+  memset(&copy_p3, 0, sizeof(copy_p3));
+
+  // pre alloc gaddr
+  prepare_put_bf16_tensor_g2l(ctx, bmk, tl_ifmap, ifmap, fmt, &copy_p1);
+  prepare_put_bf16_tensor_g2l(ctx, bmk, tl_table_answer, table_data, fmt, &copy_p2);
+  prepare_put_bf16_tensor_g2l(ctx, bmk, tl_table_answer_slope, table_data_slope, fmt, &copy_p3);
+
+  // load it
+  launch_put_bf16_tensor_g2l(ctx, bmk, copy_p1.src, &copy_p1);
+  // load table f(x0)
+  launch_put_bf16_tensor_g2l(ctx, bmk, copy_p2.src, &copy_p2);
+  launch_put_bf16_tensor_g2l(ctx, bmk, copy_p3.src, &copy_p3);
+
+  bmk1880v2_tdma_l2l_tensor_copy_param_t p10;
+  memset(&p10, 0, sizeof(p10));
+
+  // scale the input to remap its index from (-x~x) to (-127~127); this dirties tl_ifmap
+  bmk1880v2_tiu_element_wise_mul_param_t p1;
+  memset(&p1, 0, sizeof(p1));
+  p1.res_high = NULL;
+  p1.res_low = tl_ifmap;
+  p1.a = tl_ifmap;
+  p1.b_is_const = 1;
+  p1.b_const.val = convert_fp32_bf16(scale);
+  p1.rshift_bits = 0;
+  p1.relu_enable = 0;
+  bmk1880v2_tiu_element_wise_mul(bmk, &p1);
+
+#if 0
+  // int8
+  // save by stride
+  memset(&p10, 0x00, sizeof(bmk1880v2_tdma_l2l_tensor_copy_param_t));
+  bmk1880v2_tensor_lmem_t dst;
+  memcpy(&dst, tl_ofmap_A_idx_int8_reshape, sizeof(bmk1880v2_tensor_lmem_t));
+  dst.stride.h = dst.stride.h * 2;
+  dst.int8_rnd_mode = 1;
+  p10.dst = &dst;
+  p10.src = tl_ifmap;
+  bmk1880v2_tdma_l2l_bf16_tensor_copy(bmk, &p10);
+  test_submit(ctx);
+  dst.int8_rnd_mode = 0; // reset
+
+  // 16
+  dst.fmt = fmt;
+  dst.shape =
tl_ofmap_B_slope->shape;
+  dst.stride = tl_ofmap_B_slope->stride;
+
+  // int8
+  // save by stride
+  memset(&p10, 0x00, sizeof(bmk1880v2_tdma_l2l_tensor_copy_param_t));
+  bmk1880v2_tensor_lmem_t dst;
+  memcpy(&dst, tl_ofmap_A_idx_int8, sizeof(bmk1880v2_tensor_lmem_t));
+  dst.stride = tl_ofmap_A_idx_int8_reshape->stride;
+  dst.stride.h = dst.stride.h * 2;
+  dst.fmt = FMT_I8;
+  dst.int8_rnd_mode = 1;
+  p10.dst = &dst;
+  p10.src = tl_ifmap;
+  bmk1880v2_tdma_l2l_bf16_tensor_copy(bmk, &p10);
+  test_submit(ctx);
+  dst.int8_rnd_mode = 0; // reset
+
+  // get f(x0) and slope(x)
+  // reshape, 16->16
+  dst.fmt = fmt;
+  dst.shape = tl_ofmap_B_slope->shape;
+  dst.stride = tl_ofmap_B_slope->stride;
+
+  // stride;
+  dst.shape = tl_ofmap_A_idx_int8_reshape->shape;
+  dst.stride.h = dst.stride.h * 2;
+  dst.int8_rnd_mode = 0; // reset
+  memset(&p10, 0x00, sizeof(bmk1880v2_tdma_l2l_tensor_copy_param_t));
+  p10.dst = tl_ofmap_A_idx_int8; //int8
+  // save by stride
+  assert(tl_ofmap_A_idx_int8_reshape);
+  memset(&p10, 0x00, sizeof(bmk1880v2_tdma_l2l_tensor_copy_param_t));
+  bmk1880v2_tensor_lmem_t dst;
+  memcpy(&dst, tl_ofmap_A_base, sizeof(bmk1880v2_tensor_lmem_t));
+  dst.fmt = FMT_I8;
+  dst.shape = tl_ofmap_A_idx_int8_shape;
+  dst.stride = bmk1880v2_tensor_lmem_default_stride(bmk, dst.shape, dst.fmt, /*eu_align*/ 1);
+  dst.stride.h = dst.stride.h * 2;
+  dst.int8_rnd_mode = 1;
+  p10.dst = &dst;
+  p10.src = tl_ifmap;
+  bmk1880v2_tdma_l2l_bf16_tensor_copy(bmk, &p10);
+  test_submit(ctx);
+  dst.int8_rnd_mode = 0; // reset
+
+  // 16
+  dst.fmt = fmt;
+  dst.shape = tl_ofmap_B_slope->shape;
+  dst.stride = tl_ofmap_B_slope->stride;
+
+#endif
+
+}
+
+int main()
+{
+  CVI_RT_HANDLE ctx;
+  bmk_ctx_t *bmk;
+  int round_mode;
+
+  round_mode = set_store_feround();
+
+  test_init(&ctx, &bmk);
+
+  for (int i = PRE_DATA_COMPARE_FIX; i < TEST_MODE_MAX; i++) {
+    mode = static_cast<TEST_MODE>(i);
+    printf ("test mode %d...\n", mode);
+    test_tl_int8_lut_bf16(&ctx, bmk);
+  }
+
+  test_exit(&ctx);
+  restore_feround(round_mode);
+  return 0;
+} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_sigmoid_linear_interp_kernel.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_sigmoid_linear_interp_kernel.cpp new file mode 100644 index 000000000..23b02b237 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_sigmoid_linear_interp_kernel.cpp @@ -0,0 +1,1143 @@
+//* TODO: you could re-range any value to -127~127
+#include "../1880v2_test_util.h"
+#define OUT
+#define IN
+//#define DBG
+
+/**
+ * pre_data means we test a fixed pattern; it should be the same as the lut
+ * compare fix means we MAKE SURE output values equal the golden data;
+ * comment it out to check against an error tolerance of `MAX_ERROR`
+ */
+enum TEST_MODE {
+  PRE_DATA_COMPARE_FIX = 0, // pre-data + fix compare
+  PRE_DATA_MAX_ERROR,       // pre-data + compare only diff < MAX_ERROR
+  GEN_DATA_MAX_ERROR,       // gen data + compare only diff < MAX_ERROR
+  TEST_MODE_MAX,
+};
+
+static TEST_MODE mode;
+#define MAX_ERROR (0.004)
+
+using namespace std;
+static u16 test_pattern[] = {
+    0x0000, 0x3C03, 0x3C83, 0x3CC5, 0x3D03, 0x3D24, 0x3D45, 0x3D65, 0x3D83,
+    0x3D93, 0x3DA4, 0x3DB4, 0x3DC5, 0x3DD5, 0x3DE5, 0x3DF6, 0x3E03, 0x3E0B,
+    0x3E13, 0x3E1C, 0x3E24, 0x3E2C, 0x3E34, 0x3E3C, 0x3E45, 0x3E4D, 0x3E55,
+    0x3E5D, 0x3E65, 0x3E6E, 0x3E76, 0x3E7E, 0x3E83, 0x3E87, 0x3E8B, 0x3E8F,
+    0x3E93, 0x3E98, 0x3E9C, 0x3EA0, 0x3EA4, 0x3EA8, 0x3EAC, 0x3EB0, 0x3EB4,
+    0x3EB8, 0x3EBC, 0x3EC1, 0x3EC5, 0x3EC9, 0x3ECD, 0x3ED1, 0x3ED5, 0x3ED9,
+    0x3EDD, 0x3EE1, 0x3EE5, 0x3EE9, 0x3EEE, 0x3EF2, 0x3EF6, 0x3EFA, 0x3EFE,
+    0x3F01, 0x3F03, 0x3F05, 0x3F07, 0x3F09, 0x3F0B, 0x3F0D, 0x3F0F, 0x3F11,
+    0x3F13, 0x3F16, 0x3F18, 0x3F1A, 0x3F1C, 0x3F1E, 0x3F20, 0x3F22, 0x3F24,
+    0x3F26, 0x3F28, 0x3F2A, 0x3F2C, 0x3F2E, 0x3F30, 0x3F32, 0x3F34, 0x3F36,
+    0x3F38, 0x3F3A, 0x3F3C, 0x3F3E,
0x3F41, 0x3F43, 0x3F45, 0x3F47, 0x3F49, + 0x3F4B, 0x3F4D, 0x3F4F, 0x3F51, 0x3F53, 0x3F55, 0x3F57, 0x3F59, 0x3F5B, + 0x3F5D, 0x3F5F, 0x3F61, 0x3F63, 0x3F65, 0x3F67, 0x3F69, 0x3F6C, 0x3F6E, + 0x3F70, 0x3F72, 0x3F74, 0x3F76, 0x3F78, 0x3F7A, 0x3F7C, 0x3F7E, 0x3F80, + 0x3F81, 0x3F82, 0x3F83, 0x3F84, 0x3F85, 0x3F86, 0x3F87, 0x3F88, 0x3F89, + 0x3F8A, 0x3F8B, 0x3F8C, 0x3F8D, 0x3F8E, 0x3F8F, 0x3F90, 0x3F91, 0x3F92, + 0x3F93, 0x3F94, 0x3F96, 0x3F97, 0x3F98, 0x3F99, 0x3F9A, 0x3F9B, 0x3F9C, + 0x3F9D, 0x3F9E, 0x3F9F, 0x3FA0, 0x3FA1, 0x3FA2, 0x3FA3, 0x3FA4, 0x3FA5, + 0x3FA6, 0x3FA7, 0x3FA8, 0x3FA9, 0x3FAA, 0x3FAB, 0x3FAC, 0x3FAD, 0x3FAE, + 0x3FAF, 0x3FB0, 0x3FB1, 0x3FB2, 0x3FB3, 0x3FB4, 0x3FB5, 0x3FB6, 0x3FB7, + 0x3FB8, 0x3FB9, 0x3FBA, 0x3FBB, 0x3FBC, 0x3FBD, 0x3FBE, 0x3FBF, 0x3FC1, + 0x3FC2, 0x3FC3, 0x3FC4, 0x3FC5, 0x3FC6, 0x3FC7, 0x3FC8, 0x3FC9, 0x3FCA, + 0x3FCB, 0x3FCC, 0x3FCD, 0x3FCE, 0x3FCF, 0x3FD0, 0x3FD1, 0x3FD2, 0x3FD3, + 0x3FD4, 0x3FD5, 0x3FD6, 0x3FD7, 0x3FD8, 0x3FD9, 0x3FDA, 0x3FDB, 0x3FDC, + 0x3FDD, 0x3FDE, 0x3FDF, 0x3FE0, 0x3FE1, 0x3FE2, 0x3FE3, 0x3FE4, 0x3FE5, + 0x3FE6, 0x3FE7, 0x3FE8, 0x3FE9, 0x3FEA, 0x3FEC, 0x3FED, 0x3FEE, 0x3FEF, + 0x3FF0, 0x3FF1, 0x3FF2, 0x3FF3, 0x3FF4, 0x3FF5, 0x3FF6, 0x3FF7, 0x3FF8, + 0x3FF9, 0x3FFA, 0x3FFB, 0x3FFC, 0x3FFD, 0x3FFE, 0x3FFF, 0x4000, 0x4001, + 0x4001, 0x4002, 0x4002, 0x4003, 0x4003, 0x4004, 0x4004, 0x4005, 0x4005, + 0x4006, 0x4006, 0x4007, 0x4007, 0x4008, 0x4008, 0x4009, 0x4009, 0x400A, + 0x400A, 0x400B, 0x400B, 0x400C, 0x400C, 0x400D, 0x400D, 0x400E, 0x400E, + 0x400F, 0x400F, 0x4010, 0x4010, 0x4011, 0x4011, 0x4012, 0x4012, 0x4013, + 0x4013, 0x4014, 0x4014, 0x4015, 0x4016, 0x4016, 0x4017, 0x4017, 0x4018, + 0x4018, 0x4019, 0x4019, 0x401A, 0x401A, 0x401B, 0x401B, 0x401C, 0x401C, + 0x401D, 0x401D, 0x401E, 0x401E, 0x401F, 0x401F, 0x4020, 0x4020, 0x4021, + 0x4021, 0x4022, 0x4022, 0x4023, 0x4023, 0x4024, 0x4024, 0x4025, 0x4025, + 0x4026, 0x4026, 0x4027, 0x4027, 0x4028, 0x4028, 0x4029, 0x4029, 0x402A, + 0x402A, 0x402B, 0x402C, 0x402C, 0x402D, 0x402D, 0x402E, 0x402E, 0x402F, + 0x402F, 0x4030, 0x4030, 0x4031, 0x4031, 0x4032, 0x4032, 0x4033, 0x4033, + 0x4034, 0x4034, 0x4035, 0x4035, 0x4036, 0x4036, 0x4037, 0x4037, 0x4038, + 0x4038, 0x4039, 0x4039, 0x403A, 0x403A, 0x403B, 0x403B, 0x403C, 0x403C, + 0x403D, 0x403D, 0x403E, 0x403E, 0x403F, 0x403F, 0x4040, 0x4041, 0x4041, + 0x4042, 0x4042, 0x4043, 0x4043, 0x4044, 0x4044, 0x4045, 0x4045, 0x4046, + 0x4046, 0x4047, 0x4047, 0x4048, 0x4048, 0x4049, 0x4049, 0x404A, 0x404A, + 0x404B, 0x404B, 0x404C, 0x404C, 0x404D, 0x404D, 0x404E, 0x404E, 0x404F, + 0x404F, 0x4050, 0x4050, 0x4051, 0x4051, 0x4052, 0x4052, 0x4053, 0x4053, + 0x4054, 0x4054, 0x4055, 0x4056, 0x4056, 0x4057, 0x4057, 0x4058, 0x4058, + 0x4059, 0x4059, 0x405A, 0x405A, 0x405B, 0x405B, 0x405C, 0x405C, 0x405D, + 0x405D, 0x405E, 0x405E, 0x405F, 0x405F, 0x4060, 0x4060, 0x4061, 0x4061, + 0x4062, 0x4062, 0x4063, 0x4063, 0x4064, 0x4064, 0x4065, 0x4065, 0x4066, + 0x4066, 0x4067, 0x4067, 0x4068, 0x4068, 0x4069, 0x4069, 0x406A, 0x406A, + 0x406B, 0x406C, 0x406C, 0x406D, 0x406D, 0x406E, 0x406E, 0x406F, 0x406F, + 0x4070, 0x4070, 0x4071, 0x4071, 0x4072, 0x4072, 0x4073, 0x4073, 0x4074, + 0x4074, 0x4075, 0x4075, 0x4076, 0x4076, 0x4077, 0x4077, 0x4078, 0x4078, + 0x4079, 0x4079, 0x407A, 0x407A, 0x407B, 0x407B, 0x407C, 0x407C, 0x407D, + 0x407D, 0x407E, 0x407E, 0x407F, 0x407F, 0x4080, 0x4080, 0x4081, 0x4081, + 0x4081, 0x4081, 0x4082, 0x4082, 0x4082, 0x4082, 0x4083, 0x4083, 0x4083, + 0x4083, 0x4084, 0x4084, 0x4084, 0x4084, 0x4085, 0x4085, 0x4085, 0x4085, + 0x4086, 0x4086, 0x4086, 0x4086, 
0x4087, 0x4087, 0x4087, 0x4087, 0x4088, + 0x4088, 0x4088, 0x4088, 0x4089, 0x4089, 0x4089, 0x4089, 0x408A, 0x408A, + 0x408A, 0x408A, 0x408B, 0x408B, 0x408B, 0x408C, 0x408C, 0x408C, 0x408C, + 0x408D, 0x408D, 0x408D, 0x408D, 0x408E, 0x408E, 0x408E, 0x408E, 0x408F, + 0x408F, 0x408F, 0x408F, 0x4090, 0x4090, 0x4090, 0x4090, 0x4091, 0x4091, + 0x4091, 0x4091, 0x4092, 0x4092, 0x4092, 0x4092, 0x4093, 0x4093, 0x4093, + 0x4093, 0x4094, 0x4094, 0x4094, 0x4094, 0x4095, 0x4095, 0x4095, 0x4096, + 0x4096, 0x4096, 0x4096, 0x4097, 0x4097, 0x4097, 0x4097, 0x4098, 0x4098, + 0x4098, 0x4098, 0x4099, 0x4099, 0x4099, 0x4099, 0x409A, 0x409A, 0x409A, + 0x409A, 0x409B, 0x409B, 0x409B, 0x409B, 0x409C, 0x409C, 0x409C, 0x409C, + 0x409D, 0x409D, 0x409D, 0x409D, 0x409E, 0x409E, 0x409E, 0x409E, 0x409F, + 0x409F, 0x409F, 0x409F, 0x40A0, 0x40A0, 0x40A0, 0x40A1, 0x40A1, 0x40A1, + 0x40A1, 0x40A2, 0x40A2, 0x40A2, 0x40A2, 0x40A3, 0x40A3, 0x40A3, 0x40A3, + 0x40A4, 0x40A4, 0x40A4, 0x40A4, 0x40A5, 0x40A5, 0x40A5, 0x40A5, 0x40A6, + 0x40A6, 0x40A6, 0x40A6, 0x40A7, 0x40A7, 0x40A7, 0x40A7, 0x40A8, 0x40A8, + 0x40A8, 0x40A8, 0x40A9, 0x40A9, 0x40A9, 0x40A9, 0x40AA, 0x40AA, 0x40AA, + 0x40AA, 0x40AB, 0x40AB, 0x40AB, 0x40AC, 0x40AC, 0x40AC, 0x40AC, 0x40AD, + 0x40AD, 0x40AD, 0x40AD, 0x40AE, 0x40AE, 0x40AE, 0x40AE, 0x40AF, 0x40AF, + 0x40AF, 0x40AF, 0x40B0, 0x40B0, 0x40B0, 0x40B0, 0x40B1, 0x40B1, 0x40B1, + 0x40B1, 0x40B2, 0x40B2, 0x40B2, 0x40B2, 0x40B3, 0x40B3, 0x40B3, 0x40B3, + 0x40B4, 0x40B4, 0x40B4, 0x40B4, 0x40B5, 0x40B5, 0x40B5, 0x40B6, 0x40B6, + 0x40B6, 0x40B6, 0x40B7, 0x40B7, 0x40B7, 0x40B7, 0x40B8, 0x40B8, 0x40B8, + 0x40B8, 0x40B9, 0x40B9, 0x40B9, 0x40B9, 0x40BA, 0x40BA, 0x40BA, 0x40BA, + 0x40BB, 0x40BB, 0x40BB, 0x40BB, 0x40BC, 0x40BC, 0x40BC, 0x40BC, 0x40BD, + 0x40BD, 0x40BD, 0x40BD, 0x40BE, 0x40BE, 0x40BE, 0x40BE, 0x40BF, 0x40BF, + 0x40BF, 0x40BF, 0x40C0, 0x40C0, 0x40C0, 0x40C1, 0x40C1, 0x40C1, 0x40C1, + 0x40C2, 0x40C2, 0x40C2, 0x40C2, 0x40C3, 0x40C3, 0x40C3, 0x40C3, 0x40C4, + 0x40C4, 0x40C4, 0x40C4, 0x40C5, 0x40C5, 0x40C5, 0x40C5, 0x40C6, 0x40C6, + 0x40C6, 0x40C6, 0x40C7, 0x40C7, 0x40C7, 0x40C7, 0x40C8, 0x40C8, 0x40C8, + 0x40C8, 0x40C9, 0x40C9, 0x40C9, 0x40C9, 0x40CA, 0x40CA, 0x40CA, 0x40CA, + 0x40CB, 0x40CB, 0x40CB, 0x40CC, 0x40CC, 0x40CC, 0x40CC, 0x40CD, 0x40CD, + 0x40CD, 0x40CD, 0x40CE, 0x40CE, 0x40CE, 0x40CE, 0x40CF, 0x40CF, 0x40CF, + 0x40CF, 0x40D0, 0x40D0, 0x40D0, 0x40D0, 0x40D1, 0x40D1, 0x40D1, 0x40D1, + 0x40D2, 0x40D2, 0x40D2, 0x40D2, 0x40D3, 0x40D3, 0x40D3, 0x40D3, 0x40D4, + 0x40D4, 0x40D4, 0x40D4, 0x40D5, 0x40D5, 0x40D5, 0x40D6, 0x40D6, 0x40D6, + 0x40D6, 0x40D7, 0x40D7, 0x40D7, 0x40D7, 0x40D8, 0x40D8, 0x40D8, 0x40D8, + 0x40D9, 0x40D9, 0x40D9, 0x40D9, 0x40DA, 0x40DA, 0x40DA, 0x40DA, 0x40DB, + 0x40DB, 0x40DB, 0x40DB, 0x40DC, 0x40DC, 0x40DC, 0x40DC, 0x40DD, 0x40DD, + 0x40DD, 0x40DD, 0x40DE, 0x40DE, 0x40DE, 0x40DE, 0x40DF, 0x40DF, 0x40DF, + 0x40DF, 0x40E0, 0x40E0, 0x40E0, 0x40E1, 0x40E1, 0x40E1, 0x40E1, 0x40E2, + 0x40E2, 0x40E2, 0x40E2, 0x40E3, 0x40E3, 0x40E3, 0x40E3, 0x40E4, 0x40E4, + 0x40E4, 0x40E4, 0x40E5, 0x40E5, 0x40E5, 0x40E5, 0x40E6, 0x40E6, 0x40E6, + 0x40E6, 0x40E7, 0x40E7, 0x40E7, 0x40E7, 0x40E8, 0x40E8, 0x40E8, 0x40E8, + 0x40E9, 0x40E9, 0x40E9, 0x40E9, 0x40EA, 0x40EA, 0x40EA, 0x40EA, 0x40EB, + 0x40EB, 0x40EB, 0x40EC, 0x40EC, 0x40EC, 0x40EC, 0x40ED, 0x40ED, 0x40ED, + 0x40ED, 0x40EE, 0x40EE, 0x40EE, 0x40EE, 0x40EF, 0x40EF, 0x40EF, 0x40EF, + 0x40F0, 0x40F0, 0x40F0, 0x40F0, 0x40F1, 0x40F1, 0x40F1, 0x40F1, 0x40F2, + 0x40F2, 0x40F2, 0x40F2, 0x40F3, 0x40F3, 0x40F3, 0x40F3, 0x40F4, 0x40F4, + 0x40F4, 0x40F4, 0x40F5, 0x40F5, 
0x40F5, 0x40F6, 0x40F6, 0x40F6, 0x40F6, + 0x40F7, 0x40F7, 0x40F7, 0x40F7, 0x40F8, 0x40F8, 0x40F8, 0x40F8, 0x40F9, + 0x40F9, 0x40F9, 0x40F9, 0x40FA, 0x40FA, 0x40FA, 0x40FA, 0x40FB, 0x40FB, + 0x40FB, 0x40FB, 0x40FC, 0x40FC, 0x40FC, 0x40FC, 0x40FD, 0x40FD, 0x40FD, + 0x40FD, 0x40FE, 0x40FE, 0x40FE, 0x40FE, 0x40FF, 0x40FF, 0x40FF, 0x40FF, + 0x4100, 0xBC03, 0xBC83, 0xBCC5, 0xBD03, 0xBD24, 0xBD45, 0xBD65, 0xBD83, + 0xBD93, 0xBDA4, 0xBDB4, 0xBDC5, 0xBDD5, 0xBDE5, 0xBDF6, 0xBE03, 0xBE0B, + 0xBE13, 0xBE1C, 0xBE24, 0xBE2C, 0xBE34, 0xBE3C, 0xBE45, 0xBE4D, 0xBE55, + 0xBE5D, 0xBE65, 0xBE6E, 0xBE76, 0xBE7E, 0xBE83, 0xBE87, 0xBE8B, 0xBE8F, + 0xBE93, 0xBE98, 0xBE9C, 0xBEA0, 0xBEA4, 0xBEA8, 0xBEAC, 0xBEB0, 0xBEB4, + 0xBEB8, 0xBEBC, 0xBEC1, 0xBEC5, 0xBEC9, 0xBECD, 0xBED1, 0xBED5, 0xBED9, + 0xBEDD, 0xBEE1, 0xBEE5, 0xBEE9, 0xBEEE, 0xBEF2, 0xBEF6, 0xBEFA, 0xBEFE, + 0xBF01, 0xBF03, 0xBF05, 0xBF07, 0xBF09, 0xBF0B, 0xBF0D, 0xBF0F, 0xBF11, + 0xBF13, 0xBF16, 0xBF18, 0xBF1A, 0xBF1C, 0xBF1E, 0xBF20, 0xBF22, 0xBF24, + 0xBF26, 0xBF28, 0xBF2A, 0xBF2C, 0xBF2E, 0xBF30, 0xBF32, 0xBF34, 0xBF36, + 0xBF38, 0xBF3A, 0xBF3C, 0xBF3E, 0xBF41, 0xBF43, 0xBF45, 0xBF47, 0xBF49, + 0xBF4B, 0xBF4D, 0xBF4F, 0xBF51, 0xBF53, 0xBF55, 0xBF57, 0xBF59, 0xBF5B, + 0xBF5D, 0xBF5F, 0xBF61, 0xBF63, 0xBF65, 0xBF67, 0xBF69, 0xBF6C, 0xBF6E, + 0xBF70, 0xBF72, 0xBF74, 0xBF76, 0xBF78, 0xBF7A, 0xBF7C, 0xBF7E, 0xBF80, + 0xBF81, 0xBF82, 0xBF83, 0xBF84, 0xBF85, 0xBF86, 0xBF87, 0xBF88, 0xBF89, + 0xBF8A, 0xBF8B, 0xBF8C, 0xBF8D, 0xBF8E, 0xBF8F, 0xBF90, 0xBF91, 0xBF92, + 0xBF93, 0xBF94, 0xBF96, 0xBF97, 0xBF98, 0xBF99, 0xBF9A, 0xBF9B, 0xBF9C, + 0xBF9D, 0xBF9E, 0xBF9F, 0xBFA0, 0xBFA1, 0xBFA2, 0xBFA3, 0xBFA4, 0xBFA5, + 0xBFA6, 0xBFA7, 0xBFA8, 0xBFA9, 0xBFAA, 0xBFAB, 0xBFAC, 0xBFAD, 0xBFAE, + 0xBFAF, 0xBFB0, 0xBFB1, 0xBFB2, 0xBFB3, 0xBFB4, 0xBFB5, 0xBFB6, 0xBFB7, + 0xBFB8, 0xBFB9, 0xBFBA, 0xBFBB, 0xBFBC, 0xBFBD, 0xBFBE, 0xBFBF, 0xBFC1, + 0xBFC2, 0xBFC3, 0xBFC4, 0xBFC5, 0xBFC6, 0xBFC7, 0xBFC8, 0xBFC9, 0xBFCA, + 0xBFCB, 0xBFCC, 0xBFCD, 0xBFCE, 0xBFCF, 0xBFD0, 0xBFD1, 0xBFD2, 0xBFD3, + 0xBFD4, 0xBFD5, 0xBFD6, 0xBFD7, 0xBFD8, 0xBFD9, 0xBFDA, 0xBFDB, 0xBFDC, + 0xBFDD, 0xBFDE, 0xBFDF, 0xBFE0, 0xBFE1, 0xBFE2, 0xBFE3, 0xBFE4, 0xBFE5, + 0xBFE6, 0xBFE7, 0xBFE8, 0xBFE9, 0xBFEA, 0xBFEC, 0xBFED, 0xBFEE, 0xBFEF, + 0xBFF0, 0xBFF1, 0xBFF2, 0xBFF3, 0xBFF4, 0xBFF5, 0xBFF6, 0xBFF7, 0xBFF8, + 0xBFF9, 0xBFFA, 0xBFFB, 0xBFFC, 0xBFFD, 0xBFFE, 0xBFFF, 0xC000, 0xC001, + 0xC001, 0xC002, 0xC002, 0xC003, 0xC003, 0xC004, 0xC004, 0xC005, 0xC005, + 0xC006, 0xC006, 0xC007, 0xC007, 0xC008, 0xC008, 0xC009, 0xC009, 0xC00A, + 0xC00A, 0xC00B, 0xC00B, 0xC00C, 0xC00C, 0xC00D, 0xC00D, 0xC00E, 0xC00E, + 0xC00F, 0xC00F, 0xC010, 0xC010, 0xC011, 0xC011, 0xC012, 0xC012, 0xC013, + 0xC013, 0xC014, 0xC014, 0xC015, 0xC016, 0xC016, 0xC017, 0xC017, 0xC018, + 0xC018, 0xC019, 0xC019, 0xC01A, 0xC01A, 0xC01B, 0xC01B, 0xC01C, 0xC01C, + 0xC01D, 0xC01D, 0xC01E, 0xC01E, 0xC01F, 0xC01F, 0xC020, 0xC020, 0xC021, + 0xC021, 0xC022, 0xC022, 0xC023, 0xC023, 0xC024, 0xC024, 0xC025, 0xC025, + 0xC026, 0xC026, 0xC027, 0xC027, 0xC028, 0xC028, 0xC029, 0xC029, 0xC02A, + 0xC02A, 0xC02B, 0xC02C, 0xC02C, 0xC02D, 0xC02D, 0xC02E, 0xC02E, 0xC02F, + 0xC02F, 0xC030, 0xC030, 0xC031, 0xC031, 0xC032, 0xC032, 0xC033, 0xC033, + 0xC034, 0xC034, 0xC035, 0xC035, 0xC036, 0xC036, 0xC037, 0xC037, 0xC038, + 0xC038, 0xC039, 0xC039, 0xC03A, 0xC03A, 0xC03B, 0xC03B, 0xC03C, 0xC03C, + 0xC03D, 0xC03D, 0xC03E, 0xC03E, 0xC03F, 0xC03F, 0xC040, 0xC041, 0xC041, + 0xC042, 0xC042, 0xC043, 0xC043, 0xC044, 0xC044, 0xC045, 0xC045, 0xC046, + 0xC046, 0xC047, 0xC047, 0xC048, 
0xC048, 0xC049, 0xC049, 0xC04A, 0xC04A, + 0xC04B, 0xC04B, 0xC04C, 0xC04C, 0xC04D, 0xC04D, 0xC04E, 0xC04E, 0xC04F, + 0xC04F, 0xC050, 0xC050, 0xC051, 0xC051, 0xC052, 0xC052, 0xC053, 0xC053, + 0xC054, 0xC054, 0xC055, 0xC056, 0xC056, 0xC057, 0xC057, 0xC058, 0xC058, + 0xC059, 0xC059, 0xC05A, 0xC05A, 0xC05B, 0xC05B, 0xC05C, 0xC05C, 0xC05D, + 0xC05D, 0xC05E, 0xC05E, 0xC05F, 0xC05F, 0xC060, 0xC060, 0xC061, 0xC061, + 0xC062, 0xC062, 0xC063, 0xC063, 0xC064, 0xC064, 0xC065, 0xC065, 0xC066, + 0xC066, 0xC067, 0xC067, 0xC068, 0xC068, 0xC069, 0xC069, 0xC06A, 0xC06A, + 0xC06B, 0xC06C, 0xC06C, 0xC06D, 0xC06D, 0xC06E, 0xC06E, 0xC06F, 0xC06F, + 0xC070, 0xC070, 0xC071, 0xC071, 0xC072, 0xC072, 0xC073, 0xC073, 0xC074, + 0xC074, 0xC075, 0xC075, 0xC076, 0xC076, 0xC077, 0xC077, 0xC078, 0xC078, + 0xC079, 0xC079, 0xC07A, 0xC07A, 0xC07B, 0xC07B, 0xC07C, 0xC07C, 0xC07D, + 0xC07D, 0xC07E, 0xC07E, 0xC07F, 0xC07F, 0xC080, 0xC080, 0xC081, 0xC081, + 0xC081, 0xC081, 0xC082, 0xC082, 0xC082, 0xC082, 0xC083, 0xC083, 0xC083, + 0xC083, 0xC084, 0xC084, 0xC084, 0xC084, 0xC085, 0xC085, 0xC085, 0xC085, + 0xC086, 0xC086, 0xC086, 0xC086, 0xC087, 0xC087, 0xC087, 0xC087, 0xC088, + 0xC088, 0xC088, 0xC088, 0xC089, 0xC089, 0xC089, 0xC089, 0xC08A, 0xC08A, + 0xC08A, 0xC08A, 0xC08B, 0xC08B, 0xC08B, 0xC08C, 0xC08C, 0xC08C, 0xC08C, + 0xC08D, 0xC08D, 0xC08D, 0xC08D, 0xC08E, 0xC08E, 0xC08E, 0xC08E, 0xC08F, + 0xC08F, 0xC08F, 0xC08F, 0xC090, 0xC090, 0xC090, 0xC090, 0xC091, 0xC091, + 0xC091, 0xC091, 0xC092, 0xC092, 0xC092, 0xC092, 0xC093, 0xC093, 0xC093, + 0xC093, 0xC094, 0xC094, 0xC094, 0xC094, 0xC095, 0xC095, 0xC095, 0xC096, + 0xC096, 0xC096, 0xC096, 0xC097, 0xC097, 0xC097, 0xC097, 0xC098, 0xC098, + 0xC098, 0xC098, 0xC099, 0xC099, 0xC099, 0xC099, 0xC09A, 0xC09A, 0xC09A, + 0xC09A, 0xC09B, 0xC09B, 0xC09B, 0xC09B, 0xC09C, 0xC09C, 0xC09C, 0xC09C, + 0xC09D, 0xC09D, 0xC09D, 0xC09D, 0xC09E, 0xC09E, 0xC09E, 0xC09E, 0xC09F, + 0xC09F, 0xC09F, 0xC09F, 0xC0A0, 0xC0A0, 0xC0A0, 0xC0A1, 0xC0A1, 0xC0A1, + 0xC0A1, 0xC0A2, 0xC0A2, 0xC0A2, 0xC0A2, 0xC0A3, 0xC0A3, 0xC0A3, 0xC0A3, + 0xC0A4, 0xC0A4, 0xC0A4, 0xC0A4, 0xC0A5, 0xC0A5, 0xC0A5, 0xC0A5, 0xC0A6, + 0xC0A6, 0xC0A6, 0xC0A6, 0xC0A7, 0xC0A7, 0xC0A7, 0xC0A7, 0xC0A8, 0xC0A8, + 0xC0A8, 0xC0A8, 0xC0A9, 0xC0A9, 0xC0A9, 0xC0A9, 0xC0AA, 0xC0AA, 0xC0AA, + 0xC0AA, 0xC0AB, 0xC0AB, 0xC0AB, 0xC0AC, 0xC0AC, 0xC0AC, 0xC0AC, 0xC0AD, + 0xC0AD, 0xC0AD, 0xC0AD, 0xC0AE, 0xC0AE, 0xC0AE, 0xC0AE, 0xC0AF, 0xC0AF, + 0xC0AF, 0xC0AF, 0xC0B0, 0xC0B0, 0xC0B0, 0xC0B0, 0xC0B1, 0xC0B1, 0xC0B1, + 0xC0B1, 0xC0B2, 0xC0B2, 0xC0B2, 0xC0B2, 0xC0B3, 0xC0B3, 0xC0B3, 0xC0B3, + 0xC0B4, 0xC0B4, 0xC0B4, 0xC0B4, 0xC0B5, 0xC0B5, 0xC0B5, 0xC0B6, 0xC0B6, + 0xC0B6, 0xC0B6, 0xC0B7, 0xC0B7, 0xC0B7, 0xC0B7, 0xC0B8, 0xC0B8, 0xC0B8, + 0xC0B8, 0xC0B9, 0xC0B9, 0xC0B9, 0xC0B9, 0xC0BA, 0xC0BA, 0xC0BA, 0xC0BA, + 0xC0BB, 0xC0BB, 0xC0BB, 0xC0BB, 0xC0BC, 0xC0BC, 0xC0BC, 0xC0BC, 0xC0BD, + 0xC0BD, 0xC0BD, 0xC0BD, 0xC0BE, 0xC0BE, 0xC0BE, 0xC0BE, 0xC0BF, 0xC0BF, + 0xC0BF, 0xC0BF, 0xC0C0, 0xC0C0, 0xC0C0, 0xC0C1, 0xC0C1, 0xC0C1, 0xC0C1, + 0xC0C2, 0xC0C2, 0xC0C2, 0xC0C2, 0xC0C3, 0xC0C3, 0xC0C3, 0xC0C3, 0xC0C4, + 0xC0C4, 0xC0C4, 0xC0C4, 0xC0C5, 0xC0C5, 0xC0C5, 0xC0C5, 0xC0C6, 0xC0C6, + 0xC0C6, 0xC0C6, 0xC0C7, 0xC0C7, 0xC0C7, 0xC0C7, 0xC0C8, 0xC0C8, 0xC0C8, + 0xC0C8, 0xC0C9, 0xC0C9, 0xC0C9, 0xC0C9, 0xC0CA, 0xC0CA, 0xC0CA, 0xC0CA, + 0xC0CB, 0xC0CB, 0xC0CB, 0xC0CC, 0xC0CC, 0xC0CC, 0xC0CC, 0xC0CD, 0xC0CD, + 0xC0CD, 0xC0CD, 0xC0CE, 0xC0CE, 0xC0CE, 0xC0CE, 0xC0CF, 0xC0CF, 0xC0CF, + 0xC0CF, 0xC0D0, 0xC0D0, 0xC0D0, 0xC0D0, 0xC0D1, 0xC0D1, 0xC0D1, 0xC0D1, + 0xC0D2, 0xC0D2, 0xC0D2, 0xC0D2, 
0xC0D3, 0xC0D3, 0xC0D3, 0xC0D3, 0xC0D4, + 0xC0D4, 0xC0D4, 0xC0D4, 0xC0D5, 0xC0D5, 0xC0D5, 0xC0D6, 0xC0D6, 0xC0D6, + 0xC0D6, 0xC0D7, 0xC0D7, 0xC0D7, 0xC0D7, 0xC0D8, 0xC0D8, 0xC0D8, 0xC0D8, + 0xC0D9, 0xC0D9, 0xC0D9, 0xC0D9, 0xC0DA, 0xC0DA, 0xC0DA, 0xC0DA, 0xC0DB, + 0xC0DB, 0xC0DB, 0xC0DB, 0xC0DC, 0xC0DC, 0xC0DC, 0xC0DC, 0xC0DD, 0xC0DD, + 0xC0DD, 0xC0DD, 0xC0DE, 0xC0DE, 0xC0DE, 0xC0DE, 0xC0DF, 0xC0DF, 0xC0DF, + 0xC0DF, 0xC0E0, 0xC0E0, 0xC0E0, 0xC0E1, 0xC0E1, 0xC0E1, 0xC0E1, 0xC0E2, + 0xC0E2, 0xC0E2, 0xC0E2, 0xC0E3, 0xC0E3, 0xC0E3, 0xC0E3, 0xC0E4, 0xC0E4, + 0xC0E4, 0xC0E4, 0xC0E5, 0xC0E5, 0xC0E5, 0xC0E5, 0xC0E6, 0xC0E6, 0xC0E6, + 0xC0E6, 0xC0E7, 0xC0E7, 0xC0E7, 0xC0E7, 0xC0E8, 0xC0E8, 0xC0E8, 0xC0E8, + 0xC0E9, 0xC0E9, 0xC0E9, 0xC0E9, 0xC0EA, 0xC0EA, 0xC0EA, 0xC0EA, 0xC0EB, + 0xC0EB, 0xC0EB, 0xC0EC, 0xC0EC, 0xC0EC, 0xC0EC, 0xC0ED, 0xC0ED, 0xC0ED, + 0xC0ED, 0xC0EE, 0xC0EE, 0xC0EE, 0xC0EE, 0xC0EF, 0xC0EF, 0xC0EF, 0xC0EF, + 0xC0F0, 0xC0F0, 0xC0F0, 0xC0F0, 0xC0F1, 0xC0F1, 0xC0F1, 0xC0F1, 0xC0F2, + 0xC0F2, 0xC0F2, 0xC0F2, 0xC0F3, 0xC0F3, 0xC0F3, 0xC0F3, 0xC0F4, 0xC0F4, + 0xC0F4, 0xC0F4, 0xC0F5, 0xC0F5, 0xC0F5, 0xC0F6, 0xC0F6, 0xC0F6, 0xC0F6, + 0xC0F7, 0xC0F7, 0xC0F7, 0xC0F7, 0xC0F8, 0xC0F8, 0xC0F8, 0xC0F8, 0xC0F9, + 0xC0F9, 0xC0F9, 0xC0F9, 0xC0FA, 0xC0FA, 0xC0FA, 0xC0FA, 0xC0FB, 0xC0FB, + 0xC0FB, 0xC0FB, 0xC0FC, 0xC0FC, 0xC0FC, 0xC0FC, 0xC0FD, 0xC0FD, 0xC0FD, + 0xC0FD, 0xC0FE, 0xC0FE, 0xC0FE, 0xC0FE, 0xC0FF, 0xC0FF, 0xC0FF, 0xC0FF, + 0xC100, 0xC100, +}; + +static u16 sigmode_golden_bf16[] = { + 0x3f00, 0x3f01, 0x3f01, 0x3f02, 0x3f02, 0x3f03, 0x3f03, 0x3f04, 0x3f04, + 0x3f05, 0x3f05, 0x3f06, 0x3f06, 0x3f07, 0x3f07, 0x3f08, 0x3f08, 0x3f09, + 0x3f09, 0x3f0a, 0x3f0a, 0x3f0b, 0x3f0b, 0x3f0c, 0x3f0c, 0x3f0d, 0x3f0d, + 0x3f0e, 0x3f0e, 0x3f0f, 0x3f0f, 0x3f10, 0x3f10, 0x3f11, 0x3f11, 0x3f12, + 0x3f12, 0x3f13, 0x3f13, 0x3f14, 0x3f14, 0x3f15, 0x3f15, 0x3f16, 0x3f16, + 0x3f17, 0x3f17, 0x3f18, 0x3f19, 0x3f19, 0x3f1a, 0x3f1a, 0x3f1b, 0x3f1b, + 0x3f1b, 0x3f1c, 0x3f1d, 0x3f1d, 0x3f1e, 0x3f1e, 0x3f1f, 0x3f1f, 0x3f20, + 0x3f1f, 0x3f20, 0x3f20, 0x3f21, 0x3f21, 0x3f22, 0x3f22, 0x3f23, 0x3f23, + 0x3f24, 0x3f24, 0x3f25, 0x3f25, 0x3f26, 0x3f26, 0x3f27, 0x3f27, 0x3f28, + 0x3f28, 0x3f29, 0x3f29, 0x3f2a, 0x3f2a, 0x3f2a, 0x3f2a, 0x3f2b, 0x3f2b, + 0x3f2c, 0x3f2c, 0x3f2d, 0x3f2d, 0x3f2e, 0x3f2f, 0x3f2f, 0x3f30, 0x3f30, + 0x3f30, 0x3f31, 0x3f31, 0x3f31, 0x3f32, 0x3f32, 0x3f32, 0x3f33, 0x3f33, + 0x3f34, 0x3f34, 0x3f35, 0x3f36, 0x3f36, 0x3f36, 0x3f37, 0x3f37, 0x3f38, + 0x3f38, 0x3f38, 0x3f39, 0x3f39, 0x3f3a, 0x3f3a, 0x3f3a, 0x3f3b, 0x3f3b, + 0x3f3b, 0x3f3c, 0x3f3c, 0x3f3d, 0x3f3d, 0x3f3d, 0x3f3e, 0x3f3e, 0x3f3e, + 0x3f3f, 0x3f3f, 0x3f40, 0x3f40, 0x3f40, 0x3f41, 0x3f41, 0x3f41, 0x3f42, + 0x3f42, 0x3f42, 0x3f43, 0x3f44, 0x3f44, 0x3f44, 0x3f45, 0x3f45, 0x3f45, + 0x3f46, 0x3f46, 0x3f46, 0x3f47, 0x3f47, 0x3f48, 0x3f48, 0x3f48, 0x3f49, + 0x3f49, 0x3f49, 0x3f4a, 0x3f4a, 0x3f4b, 0x3f4b, 0x3f4b, 0x3f4c, 0x3f4c, + 0x3f4c, 0x3f4c, 0x3f4c, 0x3f4d, 0x3f4d, 0x3f4d, 0x3f4e, 0x3f4e, 0x3f4e, + 0x3f4f, 0x3f4f, 0x3f50, 0x3f50, 0x3f50, 0x3f51, 0x3f51, 0x3f51, 0x3f51, + 0x3f52, 0x3f52, 0x3f52, 0x3f52, 0x3f53, 0x3f53, 0x3f54, 0x3f54, 0x3f55, + 0x3f55, 0x3f55, 0x3f55, 0x3f56, 0x3f56, 0x3f56, 0x3f56, 0x3f57, 0x3f57, + 0x3f57, 0x3f57, 0x3f58, 0x3f58, 0x3f58, 0x3f58, 0x3f59, 0x3f59, 0x3f59, + 0x3f59, 0x3f5a, 0x3f5a, 0x3f5a, 0x3f5a, 0x3f5a, 0x3f5b, 0x3f5b, 0x3f5b, + 0x3f5b, 0x3f5c, 0x3f5c, 0x3f5c, 0x3f5c, 0x3f5d, 0x3f5d, 0x3f5d, 0x3f5e, + 0x3f5e, 0x3f5e, 0x3f5e, 0x3f5f, 0x3f5f, 0x3f5f, 0x3f5f, 0x3f60, 0x3f60, + 0x3f60, 0x3f60, 0x3f61, 0x3f61, 0x3f61, 
0x3f61, 0x3f62, 0x3f61, 0x3f61, + 0x3f61, 0x3f62, 0x3f62, 0x3f62, 0x3f62, 0x3f63, 0x3f63, 0x3f63, 0x3f63, + 0x3f64, 0x3f64, 0x3f64, 0x3f64, 0x3f65, 0x3f65, 0x3f65, 0x3f65, 0x3f66, + 0x3f66, 0x3f66, 0x3f66, 0x3f66, 0x3f66, 0x3f66, 0x3f66, 0x3f67, 0x3f67, + 0x3f67, 0x3f67, 0x3f68, 0x3f68, 0x3f68, 0x3f68, 0x3f69, 0x3f69, 0x3f69, + 0x3f69, 0x3f69, 0x3f69, 0x3f69, 0x3f6a, 0x3f6a, 0x3f6a, 0x3f6a, 0x3f6a, + 0x3f6a, 0x3f6a, 0x3f6a, 0x3f6b, 0x3f6b, 0x3f6b, 0x3f6b, 0x3f6b, 0x3f6b, + 0x3f6b, 0x3f6b, 0x3f6c, 0x3f6c, 0x3f6c, 0x3f6c, 0x3f6d, 0x3f6d, 0x3f6d, + 0x3f6d, 0x3f6e, 0x3f6e, 0x3f6e, 0x3f6e, 0x3f6e, 0x3f6e, 0x3f6e, 0x3f6e, + 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, + 0x3f6f, 0x3f70, 0x3f70, 0x3f70, 0x3f70, 0x3f70, 0x3f70, 0x3f70, 0x3f71, + 0x3f71, 0x3f71, 0x3f71, 0x3f71, 0x3f71, 0x3f71, 0x3f71, 0x3f72, 0x3f72, + 0x3f71, 0x3f71, 0x3f71, 0x3f71, 0x3f71, 0x3f71, 0x3f72, 0x3f72, 0x3f72, + 0x3f72, 0x3f72, 0x3f72, 0x3f72, 0x3f72, 0x3f73, 0x3f73, 0x3f73, 0x3f73, + 0x3f73, 0x3f73, 0x3f73, 0x3f73, 0x3f74, 0x3f74, 0x3f74, 0x3f74, 0x3f74, + 0x3f74, 0x3f74, 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, + 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, + 0x3f75, 0x3f75, 0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f76, + 0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f76, + 0x3f77, 0x3f77, 0x3f77, 0x3f77, 0x3f77, 0x3f77, 0x3f77, 0x3f78, 0x3f78, + 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, + 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, + 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79, + 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79, + 0x3f79, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, + 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, + 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7b, 0x3f7b, + 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, + 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, + 0x3f7b, 0x3f7b, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, + 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, + 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, + 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7d, 0x3f7d, + 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, + 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, + 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, + 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, + 0x3f7d, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, + 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, + 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, + 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, + 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, + 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, + 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, + 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 
0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3eff, 0x3efe, 0x3efd, 0x3efc, 0x3efb, 0x3efa, 0x3ef9, 0x3ef8, + 0x3ef7, 0x3ef6, 0x3ef5, 0x3ef4, 0x3ef3, 0x3ef2, 0x3ef1, 0x3ef0, 0x3eef, + 0x3eee, 0x3eed, 0x3eec, 0x3eeb, 0x3eea, 0x3ee9, 0x3ee7, 0x3ee6, 0x3ee5, + 0x3ee4, 0x3ee3, 0x3ee2, 0x3ee1, 0x3ee0, 0x3edf, 0x3ede, 0x3edd, 0x3edc, + 0x3edb, 0x3eda, 0x3ed9, 0x3ed8, 0x3ed7, 0x3ed6, 0x3ed5, 0x3ed4, 0x3ed3, + 0x3ed2, 0x3ed1, 0x3ed1, 0x3ed0, 0x3ecf, 0x3ece, 0x3ecd, 0x3ecc, 0x3ecb, + 0x3eca, 0x3ec9, 0x3ec8, 0x3ec7, 0x3ec6, 0x3ec5, 0x3ec4, 0x3ec3, 0x3ec2, + 0x3ec1, 0x3ec0, 0x3ebf, 0x3ebe, 0x3ebd, 0x3ebc, 0x3ebb, 0x3eba, 0x3eba, + 0x3eb9, 0x3eb7, 0x3eb6, 0x3eb5, 0x3eb4, 0x3eb4, 0x3eb3, 0x3eb2, 0x3eb1, + 0x3eb0, 0x3eaf, 0x3eaf, 0x3eae, 0x3ead, 0x3eab, 0x3eaa, 0x3ea9, 0x3ea8, + 0x3ea7, 0x3ea7, 0x3ea6, 0x3ea5, 0x3ea4, 0x3ea3, 0x3ea2, 0x3ea1, 0x3ea0, + 0x3e9f, 0x3e9e, 0x3e9e, 0x3e9d, 0x3e9c, 0x3e9b, 0x3e9a, 0x3e99, 0x3e98, + 0x3e98, 0x3e97, 0x3e97, 0x3e96, 0x3e95, 
0x3e94, 0x3e93, 0x3e92, 0x3e91, + 0x3e90, 0x3e8f, 0x3e8e, 0x3e8e, 0x3e8d, 0x3e8c, 0x3e8b, 0x3e8a, 0x3e8a, + 0x3e89, 0x3e88, 0x3e88, 0x3e87, 0x3e86, 0x3e85, 0x3e85, 0x3e83, 0x3e82, + 0x3e82, 0x3e81, 0x3e80, 0x3e7e, 0x3e7d, 0x3e7c, 0x3e7b, 0x3e7a, 0x3e78, + 0x3e77, 0x3e75, 0x3e72, 0x3e71, 0x3e6f, 0x3e6e, 0x3e6c, 0x3e6b, 0x3e69, + 0x3e68, 0x3e67, 0x3e65, 0x3e64, 0x3e63, 0x3e61, 0x3e60, 0x3e5f, 0x3e5d, + 0x3e5c, 0x3e5a, 0x3e59, 0x3e58, 0x3e56, 0x3e55, 0x3e54, 0x3e52, 0x3e51, + 0x3e50, 0x3e4f, 0x3e4e, 0x3e4c, 0x3e4b, 0x3e4a, 0x3e49, 0x3e47, 0x3e46, + 0x3e45, 0x3e44, 0x3e43, 0x3e41, 0x3e40, 0x3e3f, 0x3e3e, 0x3e3c, 0x3e3a, + 0x3e39, 0x3e37, 0x3e36, 0x3e35, 0x3e34, 0x3e33, 0x3e31, 0x3e30, 0x3e2f, + 0x3e2e, 0x3e2c, 0x3e2b, 0x3e2a, 0x3e29, 0x3e28, 0x3e27, 0x3e26, 0x3e25, + 0x3e24, 0x3e23, 0x3e22, 0x3e20, 0x3e20, 0x3e1f, 0x3e1e, 0x3e1d, 0x3e1c, + 0x3e1b, 0x3e1a, 0x3e19, 0x3e18, 0x3e17, 0x3e16, 0x3e15, 0x3e14, 0x3e13, + 0x3e12, 0x3e11, 0x3e10, 0x3e0f, 0x3e0e, 0x3e0c, 0x3e0b, 0x3e0a, 0x3e09, + 0x3e08, 0x3e07, 0x3e06, 0x3e05, 0x3e04, 0x3e03, 0x3e03, 0x3e02, 0x3e01, + 0x3e00, 0x3dff, 0x3dfd, 0x3dfb, 0x3df9, 0x3df8, 0x3df6, 0x3df4, 0x3df1, + 0x3df1, 0x3ded, 0x3ded, 0x3dea, 0x3dea, 0x3de7, 0x3de7, 0x3de4, 0x3de4, + 0x3de1, 0x3de1, 0x3dde, 0x3dde, 0x3ddb, 0x3ddb, 0x3dd8, 0x3dd8, 0x3dd5, + 0x3dd5, 0x3dd2, 0x3dd2, 0x3dcf, 0x3dcf, 0x3dcc, 0x3dcc, 0x3dc9, 0x3dc9, + 0x3dc7, 0x3dc7, 0x3dc3, 0x3dc3, 0x3dc0, 0x3dc0, 0x3dbe, 0x3dbe, 0x3dbb, + 0x3dbb, 0x3db9, 0x3db9, 0x3db6, 0x3db4, 0x3db4, 0x3db1, 0x3db1, 0x3dae, + 0x3dae, 0x3dac, 0x3dac, 0x3da9, 0x3da9, 0x3da7, 0x3da7, 0x3da5, 0x3da5, + 0x3da3, 0x3da3, 0x3da0, 0x3da0, 0x3d9e, 0x3d9e, 0x3d9b, 0x3d9b, 0x3d99, + 0x3d99, 0x3d97, 0x3d97, 0x3d94, 0x3d94, 0x3d93, 0x3d93, 0x3d91, 0x3d91, + 0x3d8f, 0x3d8f, 0x3d8d, 0x3d8d, 0x3d8a, 0x3d8a, 0x3d88, 0x3d88, 0x3d86, + 0x3d86, 0x3d84, 0x3d82, 0x3d82, 0x3d80, 0x3d80, 0x3d7d, 0x3d7d, 0x3d79, + 0x3d79, 0x3d76, 0x3d76, 0x3d72, 0x3d72, 0x3d6f, 0x3d6f, 0x3d6b, 0x3d6b, + 0x3d68, 0x3d68, 0x3d65, 0x3d65, 0x3d61, 0x3d61, 0x3d5e, 0x3d5e, 0x3d5b, + 0x3d5b, 0x3d58, 0x3d58, 0x3d55, 0x3d55, 0x3d52, 0x3d52, 0x3d4e, 0x3d4e, + 0x3d4b, 0x3d4b, 0x3d48, 0x3d48, 0x3d45, 0x3d45, 0x3d42, 0x3d3f, 0x3d3f, + 0x3d3c, 0x3d3c, 0x3d3a, 0x3d3a, 0x3d37, 0x3d37, 0x3d34, 0x3d34, 0x3d32, + 0x3d32, 0x3d2f, 0x3d2f, 0x3d2c, 0x3d2c, 0x3d2a, 0x3d2a, 0x3d27, 0x3d27, + 0x3d24, 0x3d24, 0x3d22, 0x3d22, 0x3d20, 0x3d20, 0x3d1d, 0x3d1d, 0x3d1b, + 0x3d1b, 0x3d19, 0x3d19, 0x3d17, 0x3d17, 0x3d15, 0x3d15, 0x3d12, 0x3d12, + 0x3d10, 0x3d10, 0x3d0e, 0x3d0c, 0x3d0c, 0x3d0a, 0x3d0a, 0x3d08, 0x3d08, + 0x3d06, 0x3d06, 0x3d04, 0x3d04, 0x3d02, 0x3d02, 0x3cff, 0x3cff, 0x3cfb, + 0x3cfb, 0x3cf8, 0x3cf8, 0x3cf4, 0x3cf4, 0x3cf0, 0x3cf0, 0x3cec, 0x3cec, + 0x3ce9, 0x3ce9, 0x3ce5, 0x3ce5, 0x3ce2, 0x3ce2, 0x3cdf, 0x3cdf, 0x3cdb, + 0x3cdb, 0x3cd8, 0x3cd8, 0x3cd5, 0x3cd5, 0x3cd2, 0x3cd2, 0x3ccf, 0x3ccf, + 0x3ccc, 0x3cc8, 0x3cc8, 0x3cc5, 0x3cc5, 0x3cc2, 0x3cc2, 0x3cbf, 0x3cbf, + 0x3cbc, 0x3cbc, 0x3cb9, 0x3cb9, 0x3cb6, 0x3cb6, 0x3cb4, 0x3cb4, 0x3cb1, + 0x3cb1, 0x3cae, 0x3cae, 0x3cac, 0x3cac, 0x3ca9, 0x3ca9, 0x3ca7, 0x3ca7, + 0x3ca5, 0x3ca5, 0x3ca2, 0x3ca2, 0x3ca0, 0x3ca0, 0x3c9d, 0x3c9d, 0x3c9b, + 0x3c9b, 0x3c98, 0x3c98, 0x3c96, 0x3c96, 0x3c93, 0x3c93, 0x3c8f, 0x3c8f, + 0x3c8f, 0x3c8f, 0x3c8b, 0x3c8b, 0x3c8b, 0x3c8b, 0x3c87, 0x3c87, 0x3c87, + 0x3c87, 0x3c82, 0x3c82, 0x3c82, 0x3c82, 0x3c7c, 0x3c7c, 0x3c7c, 0x3c7c, + 0x3c75, 0x3c75, 0x3c75, 0x3c75, 0x3c6e, 0x3c6e, 0x3c6e, 0x3c6e, 0x3c66, + 0x3c66, 0x3c66, 0x3c66, 0x3c5f, 0x3c5f, 0x3c5f, 0x3c5f, 0x3c59, 0x3c59, + 0x3c59, 0x3c59, 0x3c53, 0x3c53, 0x3c53, 
0x3c4c, 0x3c4c, 0x3c4c, 0x3c4c, + 0x3c46, 0x3c46, 0x3c46, 0x3c46, 0x3c3f, 0x3c3f, 0x3c3f, 0x3c3f, 0x3c39, + 0x3c39, 0x3c39, 0x3c39, 0x3c34, 0x3c34, 0x3c34, 0x3c34, 0x3c2f, 0x3c2f, + 0x3c2f, 0x3c2f, 0x3c29, 0x3c29, 0x3c29, 0x3c29, 0x3c24, 0x3c24, 0x3c24, + 0x3c24, 0x3c1f, 0x3c1f, 0x3c1f, 0x3c1f, 0x3c1a, 0x3c1a, 0x3c1a, 0x3c16, + 0x3c16, 0x3c16, 0x3c16, 0x3c12, 0x3c12, 0x3c12, 0x3c12, 0x3c0d, 0x3c0d, + 0x3c0d, 0x3c0d, 0x3c09, 0x3c09, 0x3c09, 0x3c09, 0x3c04, 0x3c04, 0x3c04, + 0x3c04, 0x3c00, 0x3c00, 0x3c00, 0x3c00, 0x3bf8, 0x3bf8, 0x3bf8, 0x3bf8, + 0x3bf1, 0x3bf1, 0x3bf1, 0x3bf1, 0x3be9, 0x3be9, 0x3be9, 0x3be9, 0x3be2, + 0x3be2, 0x3be2, 0x3be2, 0x3bdb, 0x3bdb, 0x3bdb, 0x3bd4, 0x3bd4, 0x3bd4, + 0x3bd4, 0x3bce, 0x3bce, 0x3bce, 0x3bce, 0x3bc8, 0x3bc8, 0x3bc8, 0x3bc8, + 0x3bc2, 0x3bc2, 0x3bc2, 0x3bc2, 0x3bbc, 0x3bbc, 0x3bbc, 0x3bbc, 0x3bb6, + 0x3bb6, 0x3bb6, 0x3bb6, 0x3bb0, 0x3bb0, 0x3bb0, 0x3bb0, 0x3bab, 0x3bab, + 0x3bab, 0x3bab, 0x3ba6, 0x3ba6, 0x3ba6, 0x3ba6, 0x3ba1, 0x3ba1, 0x3ba1, + 0x3ba1, 0x3b9c, 0x3b9c, 0x3b9c, 0x3b97, 0x3b97, 0x3b97, 0x3b97, 0x3b92, + 0x3b92, 0x3b92, 0x3b92, 0x3b8e, 0x3b8e, 0x3b8e, 0x3b8e, 0x3b8a, 0x3b8a, + 0x3b8a, 0x3b8a, 0x3b85, 0x3b85, 0x3b85, 0x3b85, 0x3b81, 0x3b81, 0x3b81, + 0x3b81, 0x3b7b, 0x3b7b, 0x3b7b, 0x3b7b, 0x3b73, 0x3b73, 0x3b73, 0x3b73, + 0x3b6c, 0x3b6c, 0x3b6c, 0x3b6c, 0x3b65, 0x3b65, 0x3b65, 0x3b5d, 0x3b5d, + 0x3b5d, 0x3b5d, 0x3b56, 0x3b56, 0x3b56, 0x3b56, 0x3b50, 0x3b50, 0x3b50, + 0x3b50, 0x3b4a, 0x3b4a, 0x3b4a, 0x3b4a, 0x3b43, 0x3b43, 0x3b43, 0x3b43, + 0x3b3d, 0x3b3d, 0x3b3d, 0x3b3d, 0x3b38, 0x3b38, 0x3b38, 0x3b38, 0x3b32, + 0x3b32, 0x3b32, 0x3b32, 0x3b2c, 0x3b2c, 0x3b2c, 0x3b2c, 0x3b27, 0x3b27, + 0x3b27, 0x3b27, 0x3b22, 0x3b22, 0x3b22, 0x3b1d, 0x3b1d, 0x3b1d, 0x3b1d, + 0x3b18, 0x3b18, 0x3b18, 0x3b18, 0x3b13, 0x3b13, 0x3b13, 0x3b13, 0x3b0f, + 0x3b0f, 0x3b0f, 0x3b0f, 0x3b0b, 0x3b0b, 0x3b0b, 0x3b0b, 0x3b06, 0x3b06, + 0x3b06, 0x3b06, 0x3b02, 0x3b02, 0x3b02, 0x3b02, 0x3afd, 0x3afd, 0x3afd, + 0x3afd, 0x3af5, 0x3af5, 0x3af5, 0x3af5, 0x3aed, 0x3aed, 0x3aed, 0x3aed, + 0x3ae6, 0x3ae6, 0x3ae6, 0x3adf, 0x3adf, 0x3adf, 0x3adf, 0x3ad8, 0x3ad8, + 0x3ad8, 0x3ad8, 0x3ad1, 0x3ad1, 0x3ad1, 0x3ad1, 0x3acb, 0x3acb, 0x3acb, + 0x3acb, 0x3ac5, 0x3ac5, 0x3ac5, 0x3ac5, 0x3abf, 0x3abf, 0x3abf, 0x3abf, + 0x3ab9, 0x3ab9, 0x3ab9, 0x3ab9, 0x3ab3, 0x3ab3, 0x3ab3, 0x3ab3, 0x3aae, + 0x3aae, 0x3aae, 0x3aae, 0x3aa9, 0x3aa9, 0x3aa9, 0x3aa3, 0x3aa3, 0x3aa3, + 0x3aa3, 0x3a9e, 0x3a9e, 0x3a9e, 0x3a9e, 0x3a99, 0x3a99, 0x3a99, 0x3a99, + 0x3a94, 0x3a94, 0x3a94, 0x3a94, 0x3a90, 0x3a90, 0x3a90, 0x3a90, 0x3a8c, + 0x3a8c, 0x3a8c, 0x3a8c, 0x3a87, 0x3a87, 0x3a87, 0x3a87, 0x3a83, 0x3a83, + 0x3a83, 0x3a83, 0x3a7e, 0x3a7e, 0x3a7e, 0x3a7e, 0x3a76, 0x3a76, 0x3a76, + 0x3a76, 0x3a6f, 0x3a6f, 0x3a6f, 0x3a68, 0x3a68, 0x3a68, 0x3a68, 0x3a60, + 0x3a60, 0x3a60, 0x3a60, 0x3a59, 0x3a59, 0x3a59, 0x3a59, 0x3a53, 0x3a53, + 0x3a53, 0x3a53, 0x3a4d, 0x3a4d, 0x3a4d, 0x3a4d, 0x3a46, 0x3a46, 0x3a46, + 0x3a46, 0x3a40, 0x3a40, 0x3a40, 0x3a40, 0x3a3a, 0x3a3a, 0x3a3a, 0x3a3a, + 0x3a34, 0x3a34, 0x3a34, 0x3a34, 0x3a2f, 0x3a2f, 0x3a2f, 0x3a2f, 0x3a2a, + 0x3a2a, 0x3a2a, 0x3a24, 0x3a24, 0x3a24, 0x3a24, 0x3a1f, 0x3a1f, 0x3a1f, + 0x3a1f, 0x3a1a, 0x3a1a, 0x3a1a, 0x3a1a, 0x3a15, 0x3a15, 0x3a15, 0x3a15, + 0x3a11, 0x3a11, 0x3a11, 0x3a11, 0x3a0d, 0x3a0d, 0x3a0d, 0x3a0d, 0x3a08, + 0x3a08, 0x3a08, 0x3a08, 0x3a04, 0x3a04, 0x3a04, 0x3a04, 0x3a00, 0x3a00, + 0x3a00, 0x3a00, 0x39f8, 0x39f8, 0x39f8, 0x39f0, 0x39f0, 0x39f0, 0x39f0, + 0x39e9, 0x39e9, 0x39e9, 0x39e9, 0x39e2, 0x39e2, 0x39e2, 0x39e2, 0x39db, + 0x39db, 0x39db, 0x39db, 0x39d4, 0x39d4, 
0x39d4, 0x39d4, 0x39ce, 0x39ce, + 0x39ce, 0x39ce, 0x39c7, 0x39c7, 0x39c7, 0x39c7, 0x39c1, 0x39c1, 0x39c1, + 0x39c1, 0x39bb, 0x39bb, 0x39bb, 0x39bb, 0x39b5, 0x39b5, 0x39b5, 0x39b5, + 0x39b0, 0x39b0, +}; + +// FIXME: not hard code +// contribute from hw, fix with `PRE_DATA` input +static double sigmode_golden[] = { + 0.5, 0.501999989, 0.503999915, 0.505999712, 0.507999317, + 0.509998667, 0.511997697, 0.513996342, 0.515994541, 0.517992228, + 0.51998934, 0.521985814, 0.523981585, 0.525976591, 0.527970767, + 0.529964052, 0.531956381, 0.533947691, 0.535937921, 0.537927006, + 0.539914885, 0.541901494, 0.543886772, 0.545870657, 0.547853086, + 0.549833997, 0.55181333, 0.553791023, 0.555767014, 0.557741243, + 0.559713649, 0.561684172, 0.56365275, 0.565619325, 0.567583836, + 0.569546224, 0.571506429, 0.573464394, 0.575420058, 0.577373363, + 0.579324252, 0.581272667, 0.583218549, 0.585161842, 0.58710249, + 0.589040434, 0.59097562, 0.59290799, 0.594837491, 0.596764066, + 0.59868766, 0.60060822, 0.60252569, 0.604440017, 0.606351149, + 0.608259031, 0.610163611, 0.612064837, 0.613962657, 0.61585702, + 0.617747875, 0.61963517, 0.621518857, 0.623398885, 0.625275204, + 0.627147766, 0.629016523, 0.630881426, 0.632742428, 0.634599482, + 0.63645254, 0.638301558, 0.640146488, 0.641987286, 0.643823907, + 0.645656306, 0.64748444, 0.649308265, 0.651127739, 0.652942818, + 0.654753461, 0.656559626, 0.658361272, 0.66015836, 0.661950848, + 0.663738697, 0.665521869, 0.667300325, 0.669074026, 0.670842936, + 0.672607017, 0.674366233, 0.676120548, 0.677869926, 0.679614333, + 0.681353734, 0.683088095, 0.684817383, 0.686541565, 0.688260608, + 0.689974481, 0.691683153, 0.693386592, 0.695084769, 0.696777653, + 0.698465216, 0.700147429, 0.701824263, 0.703495691, 0.705161686, + 0.706822221, 0.70847727, 0.710126808, 0.71177081, 0.71340925, + 0.715042106, 0.716669353, 0.718290968, 0.71990693, 0.721517216, + 0.723121805, 0.724720676, 0.726313808, 0.727901182, 0.729482779, + 0.731058579, 0.732628564, 0.734192716, 0.735751018, 0.737303454, + 0.738850006, 0.740390659, 0.741925398, 0.743454208, 0.744977074, + 0.746493983, 0.748004922, 0.749509876, 0.751008835, 0.752501785, + 0.753988716, 0.755469617, 0.756944477, 0.758413287, 0.759876035, + 0.761332715, 0.762783316, 0.764227831, 0.765666252, 0.767098572, + 0.768524783, 0.769944881, 0.771358858, 0.772766709, 0.774168429, + 0.775564014, 0.77695346, 0.778336762, 0.779713917, 0.781084923, + 0.782449776, 0.783808476, 0.78516102, 0.786507407, 0.787847636, + 0.789181707, 0.790509619, 0.791831373, 0.79314697, 0.794456411, + 0.795759698, 0.797056831, 0.798347814, 0.79963265, 0.80091134, + 0.802183889, 0.803450299, 0.804710577, 0.805964724, 0.807212748, + 0.808454651, 0.809690441, 0.810920123, 0.812143702, 0.813361186, + 0.814572581, 0.815777894, 0.816977132, 0.818170304, 0.819357418, + 0.820538481, 0.821713502, 0.82288249, 0.824045455, 0.825202406, + 0.826353353, 0.827498306, 0.828637274, 0.82977027, 0.830897303, + 0.832018385, 0.833133528, 0.834242742, 0.83534604, 0.836443435, + 0.837534937, 0.838620561, 0.83970032, 0.840774225, 0.841842291, + 0.842904531, 0.843960959, 0.84501159, 0.846056436, 0.847095514, + 0.848128836, 0.84915642, 0.850178278, 0.851194427, 0.852204883, + 0.85320966, 0.854208775, 0.855202244, 0.856190082, 0.857172307, + 0.858148935, 0.859119982, 0.860085466, 0.861045403, 0.861999811, + 0.862948707, 0.863892109, 0.864830034, 0.8657625, 0.866689525, + 0.867611126, 0.868527324, 0.869438134, 0.870343577, 0.871243671, + 0.872138434, 0.873027885, 0.873912043, 0.874790928, 
0.875664558, + 0.876532952, 0.877396131, 0.878254114, 0.879106919, 0.879954567, + 0.880797078, 0.881634471, 0.882466767, 0.883293985, 0.884116145, + 0.884933268, 0.885745374, 0.886552483, 0.887354615, 0.888151792, + 0.888944033, 0.88973136, 0.890513792, 0.89129135, 0.892064056, + 0.89283193, 0.893594992, 0.894353264, 0.895106767, 0.895855521, + 0.896599549, 0.897338869, 0.898073505, 0.898803476, 0.899528804, + 0.900249511, 0.900965617, 0.901677143, 0.902384111, 0.903086543, + 0.903784458, 0.90447788, 0.905166828, 0.905851324, 0.90653139, + 0.907207047, 0.907878316, 0.908545218, 0.909207776, 0.90986601, + 0.910519941, 0.911169591, 0.911814981, 0.912456133, 0.913093067, + 0.913725806, 0.914354369, 0.91497878, 0.915599058, 0.916215226, + 0.916827304, 0.917435313, 0.918039275, 0.91863921, 0.919235141, + 0.919827088, 0.920415072, 0.920999114, 0.921579235, 0.922155456, + 0.922727798, 0.923296282, 0.923860929, 0.92442176, 0.924978795, + 0.925532055, 0.926081561, 0.926627334, 0.927169394, 0.927707762, + 0.928242458, 0.928773503, 0.929300917, 0.929824721, 0.930344935, + 0.93086158, 0.931374675, 0.931884241, 0.932390297, 0.932892865, + 0.933391964, 0.933887615, 0.934379836, 0.934868648, 0.93535407, + 0.935836124, 0.936314827, 0.9367902, 0.937262263, 0.937731034, + 0.938196534, 0.938658781, 0.939117796, 0.939573597, 0.940026203, + 0.940475634, 0.940921909, 0.941365046, 0.941805065, 0.942241985, + 0.942675824, 0.943106601, 0.943534335, 0.943959044, 0.944380747, + 0.944799462, 0.945215208, 0.945628003, 0.946037865, 0.946444813, + 0.946848864, 0.947250036, 0.947648348, 0.948043817, 0.948436462, + 0.948826299, 0.949213347, 0.949597623, 0.949979144, 0.950357929, + 0.950733994, 0.951107357, 0.951478034, 0.951846044, 0.952211402, + 0.952574127, 0.952934234, 0.953291742, 0.953646665, 0.953999022, + 0.954348829, 0.954696102, 0.955040858, 0.955383113, 0.955722883, + 0.956060185, 0.956395034, 0.956727447, 0.95705744, 0.957385028, + 0.957710228, 0.958033055, 0.958353525, 0.958671653, 0.958987455, + 0.959300946, 0.959612142, 0.959921058, 0.960227709, 0.960532111, + 0.960834277, 0.961134224, 0.961431966, 0.961727518, 0.962020894, + 0.962312109, 0.962601179, 0.962888117, 0.963172937, 0.963455655, + 0.963736284, 0.964014838, 0.964291332, 0.96456578, 0.964838195, + 0.965108591, 0.965376983, 0.965643384, 0.965907808, 0.966170267, + 0.966430777, 0.966689349, 0.966945998, 0.967200737, 0.967453578, + 0.967704535, 0.967953622, 0.96820085, 0.968446233, 0.968689784, + 0.968931516, 0.96917144, 0.969409571, 0.969645919, 0.969880498, + 0.97011332, 0.970344398, 0.970573743, 0.970801367, 0.971027284, + 0.971251504, 0.97147404, 0.971694904, 0.971914107, 0.972131661, + 0.972347578, 0.972561869, 0.972774546, 0.97298562, 0.973195103, + 0.973403006, 0.973609341, 0.973814117, 0.974017347, 0.974219042, + 0.974419212, 0.974617868, 0.974815021, 0.975010683, 0.975204863, + 0.975397572, 0.97558882, 0.975778619, 0.975966979, 0.97615391, + 0.976339422, 0.976523525, 0.97670623, 0.976887547, 0.977067486, + 0.977246057, 0.977423269, 0.977599132, 0.977773657, 0.977946853, + 0.978118729, 0.978289296, 0.978458562, 0.978626537, 0.978793231, + 0.978958653, 0.979122812, 0.979285717, 0.979447378, 0.979607804, + 0.979767003, 0.979924985, 0.980081758, 0.980237332, 0.980391715, + 0.980544915, 0.980696943, 0.980847805, 0.980997512, 0.981146071, + 0.98129349, 0.981439779, 0.981584945, 0.981728996, 0.981871942, + 0.98201379, 0.982154548, 0.982294225, 0.982432827, 0.982570364, + 0.982706843, 0.982842273, 0.982976659, 0.983110012, 0.983242337, + 0.983373644, 
0.983503939, 0.983633229, 0.983761524, 0.983888829, + 0.984015152, 0.9841405, 0.984264882, 0.984388303, 0.984510772, + 0.984632294, 0.984752879, 0.984872531, 0.984991259, 0.985109069, + 0.985225968, 0.985341963, 0.985457061, 0.985571269, 0.985684592, + 0.985797039, 0.985908614, 0.986019326, 0.98612918, 0.986238183, + 0.986346341, 0.986453661, 0.986560148, 0.98666581, 0.986770653, + 0.986874682, 0.986977903, 0.987080324, 0.98718195, 0.987282786, + 0.987382839, 0.987482115, 0.98758062, 0.98767836, 0.987775339, + 0.987871565, 0.987967043, 0.988061778, 0.988155776, 0.988249042, + 0.988341583, 0.988433404, 0.98852451, 0.988614907, 0.9887046, + 0.988793594, 0.988881895, 0.988969507, 0.989056437, 0.98914269, + 0.98922827, 0.989313183, 0.989397433, 0.989481027, 0.989563968, + 0.989646262, 0.989727914, 0.989808929, 0.989889312, 0.989969066, + 0.990048198, 0.990126712, 0.990204613, 0.990281905, 0.990358593, + 0.990434681, 0.990510175, 0.990585079, 0.990659397, 0.990733134, + 0.990806295, 0.990878883, 0.990950903, 0.99102236, 0.991093257, + 0.9911636, 0.991233391, 0.991302637, 0.99137134, 0.991439506, + 0.991507137, 0.991574239, 0.991640815, 0.991706869, 0.991772406, + 0.991837429, 0.991901942, 0.99196595, 0.992029456, 0.992092463, + 0.992154977, 0.992217, 0.992278537, 0.992339591, 0.992400166, + 0.992460265, 0.992519893, 0.992579053, 0.992637749, 0.992695983, + 0.99275376, 0.992811084, 0.992867957, 0.992924384, 0.992980367, + 0.993035911, 0.993091018, 0.993145692, 0.993199936, 0.993253754, + 0.993307149, 0.993360124, 0.993412683, 0.993464828, 0.993516563, + 0.993567892, 0.993618816, 0.99366934, 0.993719466, 0.993769198, + 0.993818539, 0.993867491, 0.993916059, 0.993964243, 0.994012049, + 0.994059478, 0.994106533, 0.994153219, 0.994199536, 0.994245489, + 0.994291079, 0.994336311, 0.994381186, 0.994425708, 0.994469878, + 0.994513701, 0.994557178, 0.994600313, 0.994643108, 0.994685565, + 0.994727688, 0.994769478, 0.994810939, 0.994852073, 0.994892883, + 0.994933371, 0.994973539, 0.995013391, 0.995052928, 0.995092153, + 0.995131069, 0.995169677, 0.995207981, 0.995245983, 0.995283685, + 0.995321089, 0.995358198, 0.995395014, 0.995431539, 0.995467776, + 0.995503727, 0.995539394, 0.995574779, 0.995609885, 0.995644713, + 0.995679266, 0.995713547, 0.995747556, 0.995781297, 0.995814772, + 0.995847981, 0.995880929, 0.995913616, 0.995946044, 0.995978217, + 0.996010135, 0.996041801, 0.996073216, 0.996104383, 0.996135304, + 0.99616598, 0.996196413, 0.996226606, 0.996256561, 0.996286278, + 0.99631576, 0.996345009, 0.996374027, 0.996402815, 0.996431375, + 0.99645971, 0.99648782, 0.996515708, 0.996543375, 0.996570823, + 0.996598054, 0.99662507, 0.996651872, 0.996678461, 0.99670484, + 0.99673101, 0.996756974, 0.996782731, 0.996808285, 0.996833636, + 0.996858787, 0.996883738, 0.996908492, 0.99693305, 0.996957413, + 0.996981584, 0.997005563, 0.997029352, 0.997052952, 0.997076366, + 0.997099594, 0.997122638, 0.9971455, 0.99716818, 0.997190681, + 0.997213004, 0.997235149, 0.99725712, 0.997278916, 0.997300539, + 0.997321991, 0.997343273, 0.997364386, 0.997385332, 0.997406112, + 0.997426727, 0.997447179, 0.997467468, 0.997487597, 0.997507566, + 0.997527377, 0.997547031, 0.997566528, 0.997585872, 0.997605062, + 0.997624099, 0.997642986, 0.997661723, 0.997680312, 0.997698752, + 0.997717047, 0.997735197, 0.997753202, 0.997771065, 0.997788786, + 0.997806367, 0.997823808, 0.99784111, 0.997858276, 0.997875305, + 0.997892199, 0.997908959, 0.997925586, 0.997942081, 0.997958445, + 0.99797468, 0.997990785, 0.998006763, 0.998022614, 
0.998038339, + 0.998053939, 0.998069415, 0.998084769, 0.998100001, 0.998115112, + 0.998130102, 0.998144974, 0.998159728, 0.998174365, 0.998188885, + 0.99820329, 0.998217581, 0.998231759, 0.998245823, 0.998259777, + 0.998273619, 0.998287351, 0.998300975, 0.99831449, 0.998327898, + 0.998341199, 0.998354395, 0.998367486, 0.998380473, 0.998393356, + 0.998406138, 0.998418818, 0.998431397, 0.998443876, 0.998456256, + 0.998468538, 0.998480723, 0.99849281, 0.998504802, 0.998516698, + 0.998528499, 0.998540207, 0.998551822, 0.998563345, 0.998574776, + 0.998586116, 0.998597366, 0.998608527, 0.998619599, 0.998630583, + 0.99864148, 0.99865229, 0.998663015, 0.998673654, 0.998684208, + 0.998694679, 0.998705066, 0.998715371, 0.998725594, 0.998735736, + 0.998745797, 0.998755778, 0.99876568, 0.998775503, 0.998785248, + 0.998794916, 0.998804507, 0.998814021, 0.99882346, 0.998832824, + 0.998842113, 0.998851329, 0.998860471, 0.998869541, 0.998878538, + 0.998887464, 0.998896319, 0.998905104, 0.998913818, 0.998922464, + 0.99893104, 0.998939549, 0.99894799, 0.998956364, 0.998964671, + 0.998972912, 0.998981088, 0.998989198, 0.998997244, 0.999005226, + 0.999013145, 0.999021001, 0.999028794, 0.999036525, 0.999044195, + 0.999051803, 0.999059352, 0.99906684, 0.999074268, 0.999081638, + 0.999088949, 0.999096202, 0.999103397, 0.999110535, 0.999117616, + 0.99912464, 0.999131609, 0.999138523, 0.999145381, 0.999152185, + 0.999158935, 0.999165631, 0.999172274, 0.999178864, 0.999185401, + 0.999191887, 0.999198321, 0.999204704, 0.999211036, 0.999217317, + 0.999223549, 0.999229731, 0.999235864, 0.999241948, 0.999247984, + 0.999253971, 0.999259911, 0.999265804, 0.99927165, 0.999277449, + 0.999283202, 0.99928891, 0.999294572, 0.999300189, 0.999305761, + 0.999311289, 0.999316773, 0.999322213, 0.99932761, 0.999332964, + 0.999338276, 0.999343545, 0.999348772, 0.999353958, 0.999359103, + 0.999364206, 0.999369269, 0.999374291, 0.999379274, 0.999384217, + 0.999389121, 0.999393985, 0.999398811, 0.999403599, 0.999408348, + 0.99941306, 0.999417734, 0.99942237, 0.99942697, 0.999431534, + 0.999436061, 0.999440552, 0.999445007, 0.999449427, 0.999453811, + 0.999458161, 0.999462476, 0.999466757, 0.999471004, 0.999475217, + 0.999479396, 0.999483542, 0.999487655, 0.999491735, 0.999495783, + 0.999499799, 0.999503783, 0.999507735, 0.999511655, 0.999515544, + 0.999519403, 0.99952323, 0.999527027, 0.999530794, 0.999534531, + 0.999538238, 0.999541916, 0.999545564, 0.999549184, 0.999552774, + 0.999556336, 0.99955987, 0.999563375, 0.999566853, 0.999570303, + 0.999573725, 0.99957712, 0.999580488, 0.99958383, 0.999587145, + 0.999590433, 0.999593695, 0.999596931, 0.999600142, 0.999603326, + 0.999606486, 0.99960962, 0.99961273, 0.999615814, 0.999618874, + 0.99962191, 0.999624921, 0.999627909, 0.999630873, 0.999633813, + 0.99963673, 0.999639623, 0.999642494, 0.999645341, 0.999648166, + 0.999650969, 0.999653749, 0.999656507, 0.999659243, 0.999661957, + 0.498000011, 0.496000085, 0.494000288, 0.492000683, 0.490001333, + 0.488002303, 0.486003658, 0.484005459, 0.482007772, 0.48001066, + 0.478014186, 0.476018415, 0.474023409, 0.472029233, 0.470035948, + 0.468043619, 0.466052309, 0.464062079, 0.462072994, 0.460085115, + 0.458098506, 0.456113228, 0.454129343, 0.452146914, 0.450166003, + 0.44818667, 0.446208977, 0.444232986, 0.442258757, 0.440286351, + 0.438315828, 0.43634725, 0.434380675, 0.432416164, 0.430453776, + 0.428493571, 0.426535606, 0.424579942, 0.422626637, 0.420675748, + 0.418727333, 0.416781451, 0.414838158, 0.41289751, 0.410959566, + 0.40902438, 
0.40709201, 0.405162509, 0.403235934, 0.40131234, + 0.39939178, 0.39747431, 0.395559983, 0.393648851, 0.391740969, + 0.389836389, 0.387935163, 0.386037343, 0.38414298, 0.382252125, + 0.38036483, 0.378481143, 0.376601115, 0.374724796, 0.372852234, + 0.370983477, 0.369118574, 0.367257572, 0.365400518, 0.36354746, + 0.361698442, 0.359853512, 0.358012714, 0.356176093, 0.354343694, + 0.35251556, 0.350691735, 0.348872261, 0.347057182, 0.345246539, + 0.343440374, 0.341638728, 0.33984164, 0.338049152, 0.336261303, + 0.334478131, 0.332699675, 0.330925974, 0.329157064, 0.327392983, + 0.325633767, 0.323879452, 0.322130074, 0.320385667, 0.318646266, + 0.316911905, 0.315182617, 0.313458435, 0.311739392, 0.310025519, + 0.308316847, 0.306613408, 0.304915231, 0.303222347, 0.301534784, + 0.299852571, 0.298175737, 0.296504309, 0.294838314, 0.293177779, + 0.29152273, 0.289873192, 0.28822919, 0.28659075, 0.284957894, + 0.283330647, 0.281709032, 0.28009307, 0.278482784, 0.276878195, + 0.275279324, 0.273686192, 0.272098818, 0.270517221, 0.268941421, + 0.267371436, 0.265807284, 0.264248982, 0.262696546, 0.261149994, + 0.259609341, 0.258074602, 0.256545792, 0.255022926, 0.253506017, + 0.251995078, 0.250490124, 0.248991165, 0.247498215, 0.246011284, + 0.244530383, 0.243055523, 0.241586713, 0.240123965, 0.238667285, + 0.237216684, 0.235772169, 0.234333748, 0.232901428, 0.231475217, + 0.230055119, 0.228641142, 0.227233291, 0.225831571, 0.224435986, + 0.22304654, 0.221663238, 0.220286083, 0.218915077, 0.217550224, + 0.216191524, 0.21483898, 0.213492593, 0.212152364, 0.210818293, + 0.209490381, 0.208168627, 0.20685303, 0.205543589, 0.204240302, + 0.202943169, 0.201652186, 0.20036735, 0.19908866, 0.197816111, + 0.196549701, 0.195289423, 0.194035276, 0.192787252, 0.191545349, + 0.190309559, 0.189079877, 0.187856298, 0.186638814, 0.185427419, + 0.184222106, 0.183022868, 0.181829696, 0.180642582, 0.179461519, + 0.178286498, 0.17711751, 0.175954545, 0.174797594, 0.173646647, + 0.172501694, 0.171362726, 0.17022973, 0.169102697, 0.167981615, + 0.166866472, 0.165757258, 0.16465396, 0.163556565, 0.162465063, + 0.161379439, 0.16029968, 0.159225775, 0.158157709, 0.157095469, + 0.156039041, 0.15498841, 0.153943564, 0.152904486, 0.151871164, + 0.15084358, 0.149821722, 0.148805573, 0.147795117, 0.14679034, + 0.145791225, 0.144797756, 0.143809918, 0.142827693, 0.141851065, + 0.140880018, 0.139914534, 0.138954597, 0.138000189, 0.137051293, + 0.136107891, 0.135169966, 0.1342375, 0.133310475, 0.132388874, + 0.131472676, 0.130561866, 0.129656423, 0.128756329, 0.127861566, + 0.126972115, 0.126087957, 0.125209072, 0.124335442, 0.123467048, + 0.122603869, 0.121745886, 0.120893081, 0.120045433, 0.119202922, + 0.118365529, 0.117533233, 0.116706015, 0.115883855, 0.115066732, + 0.114254626, 0.113447517, 0.112645385, 0.111848208, 0.111055967, + 0.11026864, 0.109486208, 0.10870865, 0.107935944, 0.10716807, + 0.106405008, 0.105646736, 0.104893233, 0.104144479, 0.103400451, + 0.102661131, 0.101926495, 0.101196524, 0.100471196, 0.099750489, + 0.099034383, 0.098322857, 0.097615889, 0.096913457, 0.096215542, + 0.09552212, 0.094833172, 0.094148676, 0.09346861, 0.092792953, + 0.092121684, 0.091454782, 0.090792224, 0.09013399, 0.089480059, + 0.088830409, 0.088185019, 0.087543867, 0.086906933, 0.086274194, + 0.085645631, 0.08502122, 0.084400942, 0.083784774, 0.083172696, + 0.082564687, 0.081960725, 0.08136079, 0.080764859, 0.080172912, + 0.079584928, 0.079000886, 0.078420765, 0.077844544, 0.077272202, + 0.076703718, 0.076139071, 0.07557824, 0.075021205, 
0.074467945, + 0.073918439, 0.073372666, 0.072830606, 0.072292238, 0.071757542, + 0.071226497, 0.070699083, 0.070175279, 0.069655065, 0.06913842, + 0.068625325, 0.068115759, 0.067609703, 0.067107135, 0.066608036, + 0.066112385, 0.065620164, 0.065131352, 0.06464593, 0.064163876, + 0.063685173, 0.0632098, 0.062737737, 0.062268966, 0.061803466, + 0.061341219, 0.060882204, 0.060426403, 0.059973797, 0.059524366, + 0.059078091, 0.058634954, 0.058194935, 0.057758015, 0.057324176, + 0.056893399, 0.056465665, 0.056040956, 0.055619253, 0.055200538, + 0.054784792, 0.054371997, 0.053962135, 0.053555187, 0.053151136, + 0.052749964, 0.052351652, 0.051956183, 0.051563538, 0.051173701, + 0.050786653, 0.050402377, 0.050020856, 0.049642071, 0.049266006, + 0.048892643, 0.048521966, 0.048153956, 0.047788598, 0.047425873, + 0.047065766, 0.046708258, 0.046353335, 0.046000978, 0.045651171, + 0.045303898, 0.044959142, 0.044616887, 0.044277117, 0.043939815, + 0.043604966, 0.043272553, 0.04294256, 0.042614972, 0.042289772, + 0.041966945, 0.041646475, 0.041328347, 0.041012545, 0.040699054, + 0.040387858, 0.040078942, 0.039772291, 0.039467889, 0.039165723, + 0.038865776, 0.038568034, 0.038272482, 0.037979106, 0.037687891, + 0.037398821, 0.037111883, 0.036827063, 0.036544345, 0.036263716, + 0.035985162, 0.035708668, 0.03543422, 0.035161805, 0.034891409, + 0.034623017, 0.034356616, 0.034092192, 0.033829733, 0.033569223, + 0.033310651, 0.033054002, 0.032799263, 0.032546422, 0.032295465, + 0.032046378, 0.03179915, 0.031553767, 0.031310216, 0.031068484, + 0.03082856, 0.030590429, 0.030354081, 0.030119502, 0.02988668, + 0.029655602, 0.029426257, 0.029198633, 0.028972716, 0.028748496, + 0.02852596, 0.028305096, 0.028085893, 0.027868339, 0.027652422, + 0.027438131, 0.027225454, 0.02701438, 0.026804897, 0.026596994, + 0.026390659, 0.026185883, 0.025982653, 0.025780958, 0.025580788, + 0.025382132, 0.025184979, 0.024989317, 0.024795137, 0.024602428, + 0.02441118, 0.024221381, 0.024033021, 0.02384609, 0.023660578, + 0.023476475, 0.02329377, 0.023112453, 0.022932514, 0.022753943, + 0.022576731, 0.022400868, 0.022226343, 0.022053147, 0.021881271, + 0.021710704, 0.021541438, 0.021373463, 0.021206769, 0.021041347, + 0.020877188, 0.020714283, 0.020552622, 0.020392196, 0.020232997, + 0.020075015, 0.019918242, 0.019762668, 0.019608285, 0.019455085, + 0.019303057, 0.019152195, 0.019002488, 0.018853929, 0.01870651, + 0.018560221, 0.018415055, 0.018271004, 0.018128058, 0.01798621, + 0.017845452, 0.017705775, 0.017567173, 0.017429636, 0.017293157, + 0.017157727, 0.017023341, 0.016889988, 0.016757663, 0.016626356, + 0.016496061, 0.016366771, 0.016238476, 0.016111171, 0.015984848, + 0.0158595, 0.015735118, 0.015611697, 0.015489228, 0.015367706, + 0.015247121, 0.015127469, 0.015008741, 0.014890931, 0.014774032, + 0.014658037, 0.014542939, 0.014428731, 0.014315408, 0.014202961, + 0.014091386, 0.013980674, 0.01387082, 0.013761817, 0.013653659, + 0.013546339, 0.013439852, 0.01333419, 0.013229347, 0.013125318, + 0.013022097, 0.012919676, 0.01281805, 0.012717214, 0.012617161, + 0.012517885, 0.01241938, 0.01232164, 0.012224661, 0.012128435, + 0.012032957, 0.011938222, 0.011844224, 0.011750958, 0.011658417, + 0.011566596, 0.01147549, 0.011385093, 0.0112954, 0.011206406, + 0.011118105, 0.011030493, 0.010943563, 0.01085731, 0.01077173, + 0.010686817, 0.010602567, 0.010518973, 0.010436032, 0.010353738, + 0.010272086, 0.010191071, 0.010110688, 0.010030934, 0.009951802, + 0.009873288, 0.009795387, 0.009718095, 0.009641407, 0.009565319, + 0.009489825, 
0.009414921, 0.009340603, 0.009266866, 0.009193705, + 0.009121117, 0.009049097, 0.00897764, 0.008906743, 0.0088364, + 0.008766609, 0.008697363, 0.00862866, 0.008560494, 0.008492863, + 0.008425761, 0.008359185, 0.008293131, 0.008227594, 0.008162571, + 0.008098058, 0.00803405, 0.007970544, 0.007907537, 0.007845023, + 0.007783, 0.007721463, 0.007660409, 0.007599834, 0.007539735, + 0.007480107, 0.007420947, 0.007362251, 0.007304017, 0.00724624, + 0.007188916, 0.007132043, 0.007075616, 0.007019633, 0.006964089, + 0.006908982, 0.006854308, 0.006800064, 0.006746246, 0.006692851, + 0.006639876, 0.006587317, 0.006535172, 0.006483437, 0.006432108, + 0.006381184, 0.00633066, 0.006280534, 0.006230802, 0.006181461, + 0.006132509, 0.006083941, 0.006035757, 0.005987951, 0.005940522, + 0.005893467, 0.005846781, 0.005800464, 0.005754511, 0.005708921, + 0.005663689, 0.005618814, 0.005574292, 0.005530122, 0.005486299, + 0.005442822, 0.005399687, 0.005356892, 0.005314435, 0.005272312, + 0.005230522, 0.005189061, 0.005147927, 0.005107117, 0.005066629, + 0.005026461, 0.004986609, 0.004947072, 0.004907847, 0.004868931, + 0.004830323, 0.004792019, 0.004754017, 0.004716315, 0.004678911, + 0.004641802, 0.004604986, 0.004568461, 0.004532224, 0.004496273, + 0.004460606, 0.004425221, 0.004390115, 0.004355287, 0.004320734, + 0.004286453, 0.004252444, 0.004218703, 0.004185228, 0.004152019, + 0.004119071, 0.004086384, 0.004053956, 0.004021783, 0.003989865, + 0.003958199, 0.003926784, 0.003895617, 0.003864696, 0.00383402, + 0.003803587, 0.003773394, 0.003743439, 0.003713722, 0.00368424, + 0.003654991, 0.003625973, 0.003597185, 0.003568625, 0.00354029, + 0.00351218, 0.003484292, 0.003456625, 0.003429177, 0.003401946, + 0.00337493, 0.003348128, 0.003321539, 0.00329516, 0.00326899, + 0.003243026, 0.003217269, 0.003191715, 0.003166364, 0.003141213, + 0.003116262, 0.003091508, 0.00306695, 0.003042587, 0.003018416, + 0.002994437, 0.002970648, 0.002947048, 0.002923634, 0.002900406, + 0.002877362, 0.0028545, 0.00283182, 0.002809319, 0.002786996, + 0.002764851, 0.00274288, 0.002721084, 0.002699461, 0.002678009, + 0.002656727, 0.002635614, 0.002614668, 0.002593888, 0.002573273, + 0.002552821, 0.002532532, 0.002512403, 0.002492434, 0.002472623, + 0.002452969, 0.002433472, 0.002414128, 0.002394938, 0.002375901, + 0.002357014, 0.002338277, 0.002319688, 0.002301248, 0.002282953, + 0.002264803, 0.002246798, 0.002228935, 0.002211214, 0.002193633, + 0.002176192, 0.00215889, 0.002141724, 0.002124695, 0.002107801, + 0.002091041, 0.002074414, 0.002057919, 0.002041555, 0.00202532, + 0.002009215, 0.001993237, 0.001977386, 0.001961661, 0.001946061, + 0.001930585, 0.001915231, 0.001899999, 0.001884888, 0.001869898, + 0.001855026, 0.001840272, 0.001825635, 0.001811115, 0.00179671, + 0.001782419, 0.001768241, 0.001754177, 0.001740223, 0.001726381, + 0.001712649, 0.001699025, 0.00168551, 0.001672102, 0.001658801, + 0.001645605, 0.001632514, 0.001619527, 0.001606644, 0.001593862, + 0.001581182, 0.001568603, 0.001556124, 0.001543744, 0.001531462, + 0.001519277, 0.00150719, 0.001495198, 0.001483302, 0.001471501, + 0.001459793, 0.001448178, 0.001436655, 0.001425224, 0.001413884, + 0.001402634, 0.001391473, 0.001380401, 0.001369417, 0.00135852, + 0.00134771, 0.001336985, 0.001326346, 0.001315792, 0.001305321, + 0.001294934, 0.001284629, 0.001274406, 0.001264264, 0.001254203, + 0.001244222, 0.00123432, 0.001224497, 0.001214752, 0.001205084, + 0.001195493, 0.001185979, 0.00117654, 0.001167176, 0.001157887, + 0.001148671, 0.001139529, 0.001130459, 
0.001121462, 0.001112536, + 0.001103681, 0.001094896, 0.001086182, 0.001077536, 0.00106896, + 0.001060451, 0.00105201, 0.001043636, 0.001035329, 0.001027088, + 0.001018912, 0.001010802, 0.001002756, 0.000994774, 0.000986855, + 0.000978999, 0.000971206, 0.000963475, 0.000955805, 0.000948197, + 0.000940648, 0.00093316, 0.000925732, 0.000918362, 0.000911051, + 0.000903798, 0.000896603, 0.000889465, 0.000882384, 0.00087536, + 0.000868391, 0.000861477, 0.000854619, 0.000847815, 0.000841065, + 0.000834369, 0.000827726, 0.000821136, 0.000814599, 0.000808113, + 0.000801679, 0.000795296, 0.000788964, 0.000782683, 0.000776451, + 0.000770269, 0.000764136, 0.000758052, 0.000752016, 0.000746029, + 0.000740089, 0.000734196, 0.00072835, 0.000722551, 0.000716798, + 0.00071109, 0.000705428, 0.000699811, 0.000694239, 0.000688711, + 0.000683227, 0.000677787, 0.00067239, 0.000667036, 0.000661724, + 0.000656455, 0.000651228, 0.000646042, 0.000640897, 0.000635794, + 0.000630731, 0.000625709, 0.000620726, 0.000615783, 0.000610879, + 0.000606015, 0.000601189, 0.000596401, 0.000591652, 0.00058694, + 0.000582266, 0.00057763, 0.00057303, 0.000568466, 0.000563939, + 0.000559448, 0.000554993, 0.000550573, 0.000546189, 0.000541839, + 0.000537524, 0.000533243, 0.000528996, 0.000524783, 0.000520604, + 0.000516458, 0.000512345, 0.000508265, 0.000504217, 0.000500201, + 0.000496217, 0.000492265, 0.000488345, 0.000484456, 0.000480597, + 0.00047677, 0.000472973, 0.000469206, 0.000465469, 0.000461762, + 0.000458084, 0.000454436, 0.000450816, 0.000447226, 0.000443664, + 0.00044013, 0.000436625, 0.000433147, 0.000429697, 0.000426275, + 0.00042288, 0.000419512, 0.00041617, 0.000412855, 0.000409567, + 0.000406305, 0.000403069, 0.000399858, 0.000396674, 0.000393514, + 0.00039038, 0.00038727, 0.000384186, 0.000381126, 0.00037809, + 0.000375079, 0.000372091, 0.000369127, 0.000366187, 0.00036327, + 0.000360377, 0.000357506, 0.000354659, 0.000351834, 0.000349031, + 0.000346251, 0.000343493, 0.000340757, 0.000338043, 0.00033535};
+
+static bool check_input_int8_range(float input)
+{
+  bool ret = input > -128.0 && input < 128.0;
+  if (!ret) {
+    printf("invalid int8 range, input is %f\n", input);
+  }
+  return ret;
+}
+
+static double _gen_sigmoid(float x)
+{
+  return 1.0 / (1.0 + exp(-(x)));
+}
+
+static void tl_lut_ref(u16 *ofmap, u16 *ifmap, u16 *table, u16 *table_slope,
+                       tl_shape_t ifmap_shape, tl_shape_t table_shape,
+                       int range_start, int range_end)
+{
+  int tn, th, tw;
+
+  tn = table_shape.n;
+  th = table_shape.h;
+  tw = table_shape.w;
+  assert(tn == 1);
+  assert(th * tw == 256);
+  assert(table);
+  assert(table_slope);
+  assert(ifmap_shape.n);
+  assert(ifmap);
+  assert(ofmap);
+
+  // TODO: use c function
+  // 1. dump all input as binary file
+#ifdef GDB
+#define INFP32FILE "infp32file.bin"
+#define OUTBF16FILE "lutbf16out.bin"
+  FILE *pFile;
+  pFile = fopen(INFP32FILE, "wb");
+  int shape_sz = tl_shape_size(&ifmap_shape);
+  float *f = (float *)malloc(sizeof(float) * shape_sz);
+  for (int i = 0; i < shape_sz; i++) {
+    f[i] = convert_bf16_fp32(ifmap[i]);
+  }
+  fwrite(f, 1, shape_sz * sizeof(float), pFile);
+  fclose(pFile);
+
+  // 2. read result from `eval_lut.py`
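+  // The GDB path is a three-step golden protocol: step 1 above dumps every
+  // bf16 input widened to fp32, step 2 below shells out to eval_lut.py
+  // (assumed to live next to this test), which presumably emulates the
+  // hardware lookup table over the given input range and emits the expected
+  // bf16 results, and step 3 reads those results back as the reference ofmap.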
+  char command[256];
+  sprintf(command,
+          "python eval_lut.py --lut_input_range_start %d --lut_input_range_end "
+          "%d --inputfloat32 %s --outputbf16 %s 2>&1 > 2\n",
+          range_start, range_end, INFP32FILE, OUTBF16FILE);
+
+  int r;
+  r = system(command);
+  printf("command is %s, return %d\n", command, r);
+  assert(r == 0); // system() returns 0 when eval_lut.py succeeds
+
+  pFile = fopen(OUTBF16FILE, "rb");
+  if (!pFile) {
+    fprintf(stderr, "open golden %s fail\n", OUTBF16FILE);
+    exit(-1);
+  }
+
+  size_t file_length;
+  file_length = fread(ofmap, sizeof(u16), tl_shape_size(&ifmap_shape), pFile);
+  printf("read from golden, got %zu elements\n", file_length);
+  fclose(pFile);
+#else
+  assert(range_start);
+  assert(range_end);
+  for (u64 i = 0; i < tl_shape_size(&ifmap_shape); i++) {
+    ofmap[i] = convert_fp32_bf16(_gen_sigmoid(convert_bf16_fp32(ifmap[i])));
+  }
+#endif
+
+#ifdef GDB
+  for (u64 i = 0; i < tl_shape_size(&ifmap_shape); i++) {
+    printf("ref %" PRIu64 " input 0x%x(%f) golden 0x%x(%f)\n", i, ifmap[i],
+           convert_bf16_fp32(ifmap[i]), ofmap[i], convert_bf16_fp32(ofmap[i]));
+  }
+#endif
+}
+
+static bool verify(u16 *ofmap_data, u16 *ref_data, u64 ofmap_size)
+{
+  int count = 0;
+  u64 size = ofmap_size;
+  if (mode == PRE_DATA_COMPARE_FIX) {
+    size = sizeof(sigmode_golden_bf16) / sizeof(u16);
+  } else if (mode == PRE_DATA_MAX_ERROR) {
+    size = sizeof(sigmode_golden) / sizeof(double);
+  }
+
+  for (u64 i = 0; i < size; i++) {
+    if (mode == PRE_DATA_COMPARE_FIX) {
+      if (ofmap_data[i] != sigmode_golden_bf16[i]) {
+        fprintf(stderr,
+                "[%d] comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n",
+                count, i, ofmap_data[i], sigmode_golden_bf16[i]);
+        exit(-1);
+      }
+    } else {
+      float got = convert_bf16_fp32(ofmap_data[i]);
+      float exp = convert_bf16_fp32(ref_data[i]);
+
+      if (mode == PRE_DATA_MAX_ERROR) {
+        // because the double-precision golden has better accuracy, ~0.0039
+        exp = sigmode_golden[i];
+      }
+
+      if (fabs(got - exp) > MAX_ERROR) {
+        fprintf(stderr,
+                "[%d] comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x, "
+                "diff(%f - %f) is %f\n",
+                count, i, ofmap_data[i], ref_data[i], got, exp,
+                fabs(got - exp));
+        count++;
+      }
+    }
+  }
+
+  if (count != 0) {
+    printf("error count is %d\n", count);
+    exit(-1);
+  }
+
+  return true;
+}
+
+static void gen_input(u16 *ifmap, u64 ifmap_size)
+{
+  if (mode == PRE_DATA_COMPARE_FIX || mode == PRE_DATA_MAX_ERROR) {
+    // ifmap_size 2048, pattern_size 2000
+    memset(ifmap, 0, sizeof(u16) * ifmap_size);
+    memcpy(ifmap, &test_pattern, sizeof(test_pattern));
+
+#ifdef GDB
+    for (u64 i = 0; i < ifmap_size; i++) {
+      printf("source if[%" PRIu64 "] is bf16 %f (bf16) with 0x%x\n", i,
+             convert_bf16_fp32(ifmap[i]), ifmap[i]);
+    }
+#endif
+  } else {
+    int table_hw = 256;
+    for (u64 i = 0; i < ifmap_size; i++) {
+      // input range is -8 ~ +8
+      float input = ((int)i % 7) * (((int)i % 2) ? 1 : -1) + 0.03 +
+                    (i % table_hw) * 0.002;
+      // float input = ((int)i % 10) * (((int)i % 2) ? 1 : -1) + 0.03 + (i %
+      // table_hw) * 0.002;
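+      // Note on the pattern: (i % 7) with the alternating sign walks the
+      // integers -6..+6, and the 0.03 and (i % 256) * 0.002 terms add small
+      // fractional offsets, so every generated value stays well inside the
+      // int8 range asserted below (and inside the -8 ~ +8 table range).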
+      assert(check_input_int8_range(input));
+      ifmap[i] = convert_fp32_bf16(input);
+#ifdef GDB
+      printf("source if[%" PRIu64 "] is bf16 %f, input is %f (bf16) with 0x%x\n", i,
+             convert_bf16_fp32(ifmap[i]), input, ifmap[i]);
+#endif
+    }
+  }
+}
+
+static void testbench(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk)
+{
+  // TODO: check more shape / align
+  tl_shape_t ifmap_shape;
+  if (mode == PRE_DATA_COMPARE_FIX || mode == PRE_DATA_MAX_ERROR) {
+    ifmap_shape = {1, 32, 8, 8};
+  } else {
+    ifmap_shape = {1, 32, 16, 16};
+  }
+
+  fmt_t fmt = FMT_BF16;
+
+  // get table / input shape
+  tl_shape_t table_shape;
+  bf16_table_shape(bmk, &table_shape);
+  tl_shape_t ofmap_shape = ifmap_shape;
+
+  u64 ifmap_size = tl_shape_size(&ifmap_shape);
+  u64 table_size = tl_shape_size(&table_shape);
+  u64 ofmap_size = tl_shape_size(&ofmap_shape);
+
+  int data_type_size = bytesize_of_fmt(fmt);
+  u64 ifmap_bytesize = ifmap_size * data_type_size;
+  u64 table_bytesize = table_size * data_type_size;
+  u64 ofmap_bytesize = ofmap_size * data_type_size;
+
+  // alloc tg
+  u16 *ifmap = (u16 *)xmalloc(ifmap_bytesize);
+  u16 *table_data = (u16 *)xmalloc(table_bytesize);
+  u16 *table_data_slope = (u16 *)xmalloc(table_bytesize);
+  u16 *ref_data = (u16 *)xmalloc(ofmap_bytesize);
+
+  memset(table_data, 0, table_bytesize);
+  memset(table_data_slope, 0, table_bytesize);
+
+  // range depends on your activation
+  int range_start = -8;
+  int range_end = 8;
+  float scale = bf16_sigmoid_scale(range_start, range_end);
+
+  // fill tg value
+  gen_input(ifmap, ifmap_size);
+  bf16_sigmoid_tbl(table_data, table_data_slope, &table_shape, range_start,
+                   range_end);
+  tl_lut_ref(ref_data, ifmap, table_data, table_data_slope, ifmap_shape,
+             table_shape, range_start, range_end);
+
+  // alloc tl
+  tl_t *tl_ifmap = alloc_tl(bmk, ifmap_shape, fmt, /*align*/1);
+  tl_t *tl_table_answer = alloc_tl(bmk, table_shape, fmt, /*align*/1);
+  tl_t *tl_table_answer_slope = alloc_tl(bmk, table_shape, fmt, /*align*/1);
+  tl_t *tl_buf = alloc_tl(bmk, ofmap_shape, fmt, /*align*/1);
+  tl_t *tl_ofmap_bf16 = alloc_tl(bmk, ofmap_shape, fmt, /*align*/1);
+
+  // sys->local
+  put_bf16_tensor_g2l(ctx, bmk, tl_ifmap, (u16 *)ifmap, FMT_BF16);
+  put_bf16_tensor_g2l(ctx, bmk, tl_table_answer, (u16 *)table_data, FMT_BF16);
+  put_bf16_tensor_g2l(ctx, bmk, tl_table_answer_slope, (u16 *)table_data_slope,
+                      FMT_BF16);
+
+  // emit core function
+  bf16_emit_sigmoid(bmk, tl_ifmap, tl_buf, tl_table_answer,
+                    tl_table_answer_slope, tl_ofmap_bf16, scale);
+
+  u16 *ofmap_data = (u16 *)get_bf16_tensor_l2g(ctx, bmk, tl_ofmap_bf16, fmt);
+
+  verify(ofmap_data, ref_data, ofmap_size);
+
+  free_tl(bmk, tl_ofmap_bf16);
+  free_tl(bmk, tl_buf);
+  free_tl(bmk, tl_table_answer_slope);
+  free_tl(bmk, tl_table_answer);
+  free_tl(bmk, tl_ifmap);
+
+  free(ifmap);
+  free(table_data);
+  free(table_data_slope);
+  free(ref_data);
+  free(ofmap_data);
+}
+
+int main()
+{
+  CVI_RT_HANDLE ctx;
+  bmk_ctx_t *bmk;
+  int round_mode;
+
+  round_mode = set_store_feround();
+
+  test_init(&ctx, &bmk);
+
+  for (int i = PRE_DATA_COMPARE_FIX; i < TEST_MODE_MAX; i++) {
+    // for (int i = GEN_DATA_MAX_ERROR; i < TEST_MODE_MAX; i++) {
+    // for (int i = PRE_DATA_MAX_ERROR; i < GEN_DATA_MAX_ERROR; i++) {
+    mode = static_cast<TEST_MODE>(i);
+    printf("test mode %d...\n", mode);
+    testbench(&ctx, bmk);
+  }
+
+  test_exit(&ctx);
+  restore_feround(round_mode);
+  return 0;
+}
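These bf16 tests keep their golden data as raw bit patterns; bf16 is just the
upper 16 bits of an IEEE-754 fp32 value, which is why 0x3f00 decodes to 0.5
(the first sigmode_golden entry, sigmoid(0)) and the table saturates at
0x3f80 = 1.0. A minimal sketch of that decoding, assuming the test utilities'
convert_bf16_fp32 follows the standard truncation form:

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    /* widen a bf16 bit pattern to fp32 by placing it in the top 16 bits */
    static float bf16_bits_to_fp32(uint16_t b) {
        uint32_t u = (uint32_t)b << 16;
        float f;
        memcpy(&f, &u, sizeof f);
        return f;
    }

    int main(void) {
        printf("%f\n", bf16_bits_to_fp32(0x3f00)); /* 0.500000 = sigmoid(0) */
        printf("%f\n", bf16_bits_to_fp32(0x3f80)); /* 1.000000 */
        return 0;
    }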
diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_sqrt.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_sqrt.cpp
new file mode 100644
index 000000000..898341bc3
--- /dev/null
+++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_sqrt.cpp
@@ -0,0 +1,2520 @@
+/**
+ */
+#include "../1880v2_test_util.h"
+#define OUT
+#define IN
+// NOTE: the bracketed header names were lost in extraction; this set is
+// inferred from what the file uses (pow/fabs, rand, time, random engines,
+// memcpy, numeric_limits/enable_if).
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <random>
+#include <limits>
+#include <type_traits>
+//#include
+//#define DBG
+
+using namespace std;
+//TODO: get from ctx
+static u32 channel = 32;
+static u32 table_h = 32;
+static u32 table_w = 8;
+static u32 table_hw = table_h * table_w; // one 32x8 = 256-entry tile per lane
+
+#if 0
+// Bruce Dawson's 2's-complement ULP comparison, kept for reference.
+static bool AlmostEqual2sComplement(float A, float B, int maxUlps)
+{
+  // Make sure maxUlps is non-negative and small enough that the
+  // default NAN won't compare as equal to anything.
+  assert(maxUlps > 0 && maxUlps < 4 * 1024 * 1024);
+  int aInt = *(int*)&A;
+  // Make aInt lexicographically ordered as a twos-complement int
+  if (aInt < 0)
+    aInt = 0x80000000 - aInt;
+  // Make bInt lexicographically ordered as a twos-complement int
+  int bInt = *(int*)&B;
+  if (bInt < 0)
+    bInt = 0x80000000 - bInt;
+  int intDiff = abs(aInt - bInt);
+  if (intDiff <= maxUlps)
+    return true;
+  return false;
+}
+#endif
+// http://www.enseignement.polytechnique.fr/informatique/INF478/docs/Cpp/en/cpp/types/numeric_limits/epsilon.html
+template<class T>
+typename std::enable_if<!std::numeric_limits<T>::is_integer, bool>::type
+  almost_equal(T x, T y, int ulp)
+{
+  // the machine epsilon has to be scaled to the magnitude of the values used
+  // and multiplied by the desired precision in ULPs (units in the last place)
+  return std::abs(x-y) < std::numeric_limits<T>::epsilon() * std::abs(x+y) * ulp
+    // unless the result is subnormal
+    || std::abs(x-y) < std::numeric_limits<T>::min();
+}
+/**
+ * pre_data means we test a fixed pattern; it should be the same as the lut test
+ */
+enum TEST_MODE {
+  PRE_DATA_COMPARE_FIX = 0, // pre-data + fix compare
+  GEN_POW_20_DATA_MAX_ERROR, // generate 2^-20 ~ 2^20 values and check epsilon
+  GEN_POW_20_DATA_MAX_ERROR_U8, // same, but result packed bf16->u8
+  TEST_MODE_MAX,
+};
+
+static TEST_MODE mode;
+
+static u16 test_pattern[] = {
+ 0x0000, + 0x38D2, + 0x3952, + 0x399D, + 0x39D2, + 0x3A03, + 0x3A1D, + 0x3A38, + 0x3A52, + 0x3A6C, + 0x3A83, + 0x3A90, + 0x3A9D, + 0x3AAA, + 0x3AB8, + 0x3AC5, + 0x3AD2, + 0x3ADF, + 0x3AEC, + 0x3AF9, + 0x3B03, + 0x3B0A, + 0x3B10, + 0x3B17, + 0x3B1D, + 0x3B24, + 0x3B2A, + 0x3B31, + 0x3B38, + 0x3B3E, + 0x3B45, + 0x3B4B, + 0x3B52, + 0x3B58, + 0x3B5F, + 0x3B65, + 0x3B6C, + 0x3B72, + 0x3B79, + 0x3B80, + 0x3B83, + 0x3B86, + 0x3B8A, + 0x3B8D, + 0x3B90, + 0x3B93, + 0x3B97, + 0x3B9A, + 0x3B9D, + 0x3BA1, + 0x3BA4, + 0x3BA7, + 0x3BAA, + 0x3BAE, + 0x3BB1, + 0x3BB4, + 0x3BB8, + 0x3BBB, + 0x3BBE, + 0x3BC1, + 0x3BC5, + 0x3BC8, + 0x3BCB, + 0x3BCE, + 0x3BD2, + 0x3BD5, + 0x3BD8, + 0x3BDC, + 0x3BDF, + 0x3BE2, + 0x3BE5, + 0x3BE9, + 0x3BEC, + 0x3BEF, + 0x3BF2, + 0x3BF6, + 0x3BF9, + 0x3BFC, + 0x3C00, + 0x3C01, + 0x3C03, + 0x3C05, + 0x3C06, + 0x3C08, + 0x3C0A, + 0x3C0B, + 0x3C0D, + 0x3C0F, + 0x3C10, + 0x3C12, + 0x3C13, + 0x3C15, + 0x3C17, + 0x3C18, + 0x3C1A, + 0x3C1C, + 0x3C1D, + 0x3C1F, + 0x3C21, + 0x3C22, + 0x3C24, + 0x3C25, + 0x3C27, + 0x3C29, + 0x3C2A, + 0x3C2C, + 0x3C2E, + 0x3C2F, + 0x3C31, + 0x3C33, + 0x3C34, + 0x3C36, + 0x3C38, + 0x3C39, + 0x3C3B, + 0x3C3C, + 0x3C3E, + 0x3C40, + 0x3C41, + 0x3C43, + 0x3C45, + 0x3C46, + 0x3C48, + 0x3C4A, + 0x3C4B, + 0x3C4D, + 0x3C4E, + 0x3C50, + 0x3C52, + 0x3C53, + 0x3C55, + 0x3C57, + 0x3C58, + 0x3C5A, + 0x3C5C, + 0x3C5D, + 0x3C5F, + 0x3C60, + 0x3C62, + 0x3C64, + 0x3C65, + 0x3C67, + 0x3C69, + 0x3C6A, + 0x3C6C, + 0x3C6E, + 0x3C6F, + 0x3C71, + 0x3C72, + 0x3C74, + 0x3C76, + 0x3C77, + 0x3C79, + 0x3C7B, + 0x3C7C, + 0x3C7E, + 0x3C80, + 0x3C81, + 0x3C81, + 0x3C82, + 0x3C83, + 0x3C84, + 0x3C85, + 0x3C86, + 0x3C86, + 0x3C87, + 0x3C88, + 0x3C89, + 0x3C8A, + 0x3C8A, + 0x3C8B, + 0x3C8C, + 0x3C8D, + 0x3C8E, + 0x3C8F, + 0x3C8F, + 0x3C90, + 0x3C91, + 0x3C92, + 0x3C93, + 0x3C93, + 0x3C94, + 0x3C95, + 0x3C96, + 0x3C97, +
0x3C98, + 0x3C98, + 0x3C99, + 0x3C9A, + 0x3C9B, + 0x3C9C, + 0x3C9C, + 0x3C9D, + 0x3C9E, + 0x3C9F, + 0x3CA0, + 0x3CA1, + 0x3CA1, + 0x3CA2, + 0x3CA3, + 0x3CA4, + 0x3CA5, + 0x3CA5, + 0x3CA6, + 0x3CA7, + 0x3CA8, + 0x3CA9, + 0x3CAA, + 0x3CAA, + 0x3CAB, + 0x3CAC, + 0x3CAD, + 0x3CAE, + 0x3CAE, + 0x3CAF, + 0x3CB0, + 0x3CB1, + 0x3CB2, + 0x3CB3, + 0x3CB3, + 0x3CB4, + 0x3CB5, + 0x3CB6, + 0x3CB7, + 0x3CB8, + 0x3CB8, + 0x3CB9, + 0x3CBA, + 0x3CBB, + 0x3CBC, + 0x3CBC, + 0x3CBD, + 0x3CBE, + 0x3CBF, + 0x3CC0, + 0x3CC1, + 0x3CC1, + 0x3CC2, + 0x3CC3, + 0x3CC4, + 0x3CC5, + 0x3CC5, + 0x3CC6, + 0x3CC7, + 0x3CC8, + 0x3CC9, + 0x3CCA, + 0x3CCA, + 0x3CCB, + 0x3CCC, + 0x3CCD, + 0x3CCE, + 0x3CCE, + 0x3CCF, + 0x3CD0, + 0x3CD1, + 0x3CD2, + 0x3CD3, + 0x3CD3, + 0x3CD4, + 0x3CD5, + 0x3CD6, + 0x3CD7, + 0x3CD7, + 0x3CD8, + 0x3CD9, + 0x3CDA, + 0x3CDB, + 0x3CDC, + 0x3CDC, + 0x3CDD, + 0x3CDE, + 0x3CDF, + 0x3CE0, + 0x3CE0, + 0x3CE1, + 0x3CE2, + 0x3CE3, + 0x3CE4, + 0x3CE5, + 0x3CE5, + 0x3CE6, + 0x3CE7, + 0x3CE8, + 0x3CE9, + 0x3CE9, + 0x3CEA, + 0x3CEB, + 0x3CEC, + 0x3CED, + 0x3CEE, + 0x3CEE, + 0x3CEF, + 0x3CF0, + 0x3CF1, + 0x3CF2, + 0x3CF2, + 0x3CF3, + 0x3CF4, + 0x3CF5, + 0x3CF6, + 0x3CF7, + 0x3CF7, + 0x3CF8, + 0x3CF9, + 0x3CFA, + 0x3CFB, + 0x3CFB, + 0x3CFC, + 0x3CFD, + 0x3CFE, + 0x3CFF, + 0x3D00, + 0x3D00, + 0x3D01, + 0x3D01, + 0x3D01, + 0x3D02, + 0x3D02, + 0x3D03, + 0x3D03, + 0x3D03, + 0x3D04, + 0x3D04, + 0x3D05, + 0x3D05, + 0x3D06, + 0x3D06, + 0x3D06, + 0x3D07, + 0x3D07, + 0x3D08, + 0x3D08, + 0x3D08, + 0x3D09, + 0x3D09, + 0x3D0A, + 0x3D0A, + 0x3D0A, + 0x3D0B, + 0x3D0B, + 0x3D0C, + 0x3D0C, + 0x3D0C, + 0x3D0D, + 0x3D0D, + 0x3D0E, + 0x3D0E, + 0x3D0F, + 0x3D0F, + 0x3D0F, + 0x3D10, + 0x3D10, + 0x3D11, + 0x3D11, + 0x3D11, + 0x3D12, + 0x3D12, + 0x3D13, + 0x3D13, + 0x3D13, + 0x3D14, + 0x3D14, + 0x3D15, + 0x3D15, + 0x3D16, + 0x3D16, + 0x3D16, + 0x3D17, + 0x3D17, + 0x3D18, + 0x3D18, + 0x3D18, + 0x3D19, + 0x3D19, + 0x3D1A, + 0x3D1A, + 0x3D1A, + 0x3D1B, + 0x3D1B, + 0x3D1C, + 0x3D1C, + 0x3D1C, + 0x3D1D, + 0x3D1D, + 0x3D1E, + 0x3D1E, + 0x3D1F, + 0x3D1F, + 0x3D1F, + 0x3D20, + 0x3D20, + 0x3D21, + 0x3D21, + 0x3D21, + 0x3D22, + 0x3D22, + 0x3D23, + 0x3D23, + 0x3D23, + 0x3D24, + 0x3D24, + 0x3D25, + 0x3D25, + 0x3D25, + 0x3D26, + 0x3D26, + 0x3D27, + 0x3D27, + 0x3D28, + 0x3D28, + 0x3D28, + 0x3D29, + 0x3D29, + 0x3D2A, + 0x3D2A, + 0x3D2A, + 0x3D2B, + 0x3D2B, + 0x3D2C, + 0x3D2C, + 0x3D2C, + 0x3D2D, + 0x3D2D, + 0x3D2E, + 0x3D2E, + 0x3D2E, + 0x3D2F, + 0x3D2F, + 0x3D30, + 0x3D30, + 0x3D31, + 0x3D31, + 0x3D31, + 0x3D32, + 0x3D32, + 0x3D33, + 0x3D33, + 0x3D33, + 0x3D34, + 0x3D34, + 0x3D35, + 0x3D35, + 0x3D35, + 0x3D36, + 0x3D36, + 0x3D37, + 0x3D37, + 0x3D38, + 0x3D38, + 0x3D38, + 0x3D39, + 0x3D39, + 0x3D3A, + 0x3D3A, + 0x3D3A, + 0x3D3B, + 0x3D3B, + 0x3D3C, + 0x3D3C, + 0x3D3C, + 0x3D3D, + 0x3D3D, + 0x3D3E, + 0x3D3E, + 0x3D3E, + 0x3D3F, + 0x3D3F, + 0x3D40, + 0x3D40, + 0x3D41, + 0x3D41, + 0x3D41, + 0x3D42, + 0x3D42, + 0x3D43, + 0x3D43, + 0x3D43, + 0x3D44, + 0x3D44, + 0x3D45, + 0x3D45, + 0x3D45, + 0x3D46, + 0x3D46, + 0x3D47, + 0x3D47, + 0x3D47, + 0x3D48, + 0x3D48, + 0x3D49, + 0x3D49, + 0x3D4A, + 0x3D4A, + 0x3D4A, + 0x3D4B, + 0x3D4B, + 0x3D4C, + 0x3D4C, + 0x3D4C, + 0x3D4D, + 0x3D4D, + 0x3D4E, + 0x3D4E, + 0x3D4E, + 0x3D4F, + 0x3D4F, + 0x3D50, + 0x3D50, + 0x3D50, + 0x3D51, + 0x3D51, + 0x3D52, + 0x3D52, + 0x3D53, + 0x3D53, + 0x3D53, + 0x3D54, + 0x3D54, + 0x3D55, + 0x3D55, + 0x3D55, + 0x3D56, + 0x3D56, + 0x3D57, + 0x3D57, + 0x3D57, + 0x3D58, + 0x3D58, + 0x3D59, + 0x3D59, + 0x3D59, + 0x3D5A, + 0x3D5A, + 0x3D5B, + 0x3D5B, + 0x3D5C, + 0x3D5C, + 0x3D5C, + 0x3D5D, + 
0x3D5D, + 0x3D5E, + 0x3D5E, + 0x3D5E, + 0x3D5F, + 0x3D5F, + 0x3D60, + 0x3D60, + 0x3D60, + 0x3D61, + 0x3D61, + 0x3D62, + 0x3D62, + 0x3D63, + 0x3D63, + 0x3D63, + 0x3D64, + 0x3D64, + 0x3D65, + 0x3D65, + 0x3D65, + 0x3D66, + 0x3D66, + 0x3D67, + 0x3D67, + 0x3D67, + 0x3D68, + 0x3D68, + 0x3D69, + 0x3D69, + 0x3D69, + 0x3D6A, + 0x3D6A, + 0x3D6B, + 0x3D6B, + 0x3D6C, + 0x3D6C, + 0x3D6C, + 0x3D6D, + 0x3D6D, + 0x3D6E, + 0x3D6E, + 0x3D6E, + 0x3D6F, + 0x3D6F, + 0x3D70, + 0x3D70, + 0x3D70, + 0x3D71, + 0x3D71, + 0x3D72, + 0x3D72, + 0x3D72, + 0x3D73, + 0x3D73, + 0x3D74, + 0x3D74, + 0x3D75, + 0x3D75, + 0x3D75, + 0x3D76, + 0x3D76, + 0x3D77, + 0x3D77, + 0x3D77, + 0x3D78, + 0x3D78, + 0x3D79, + 0x3D79, + 0x3D79, + 0x3D7A, + 0x3D7A, + 0x3D7B, + 0x3D7B, + 0x3D7B, + 0x3D7C, + 0x3D7C, + 0x3D7D, + 0x3D7D, + 0x3D7E, + 0x3D7E, + 0x3D7E, + 0x3D7F, + 0x3D7F, + 0x3D80, + 0x3D80, + 0x3D80, + 0x3D80, + 0x3D81, + 0x3D81, + 0x3D81, + 0x3D81, + 0x3D81, + 0x3D82, + 0x3D82, + 0x3D82, + 0x3D82, + 0x3D82, + 0x3D83, + 0x3D83, + 0x3D83, + 0x3D83, + 0x3D83, + 0x3D84, + 0x3D84, + 0x3D84, + 0x3D84, + 0x3D85, + 0x3D85, + 0x3D85, + 0x3D85, + 0x3D85, + 0x3D86, + 0x3D86, + 0x3D86, + 0x3D86, + 0x3D86, + 0x3D87, + 0x3D87, + 0x3D87, + 0x3D87, + 0x3D87, + 0x3D88, + 0x3D88, + 0x3D88, + 0x3D88, + 0x3D88, + 0x3D89, + 0x3D89, + 0x3D89, + 0x3D89, + 0x3D89, + 0x3D8A, + 0x3D8A, + 0x3D8A, + 0x3D8A, + 0x3D8A, + 0x3D8B, + 0x3D8B, + 0x3D8B, + 0x3D8B, + 0x3D8B, + 0x3D8C, + 0x3D8C, + 0x3D8C, + 0x3D8C, + 0x3D8C, + 0x3D8D, + 0x3D8D, + 0x3D8D, + 0x3D8D, + 0x3D8E, + 0x3D8E, + 0x3D8E, + 0x3D8E, + 0x3D8E, + 0x3D8F, + 0x3D8F, + 0x3D8F, + 0x3D8F, + 0x3D8F, + 0x3D90, + 0x3D90, + 0x3D90, + 0x3D90, + 0x3D90, + 0x3D91, + 0x3D91, + 0x3D91, + 0x3D91, + 0x3D91, + 0x3D92, + 0x3D92, + 0x3D92, + 0x3D92, + 0x3D92, + 0x3D93, + 0x3D93, + 0x3D93, + 0x3D93, + 0x3D93, + 0x3D94, + 0x3D94, + 0x3D94, + 0x3D94, + 0x3D94, + 0x3D95, + 0x3D95, + 0x3D95, + 0x3D95, + 0x3D96, + 0x3D96, + 0x3D96, + 0x3D96, + 0x3D96, + 0x3D97, + 0x3D97, + 0x3D97, + 0x3D97, + 0x3D97, + 0x3D98, + 0x3D98, + 0x3D98, + 0x3D98, + 0x3D98, + 0x3D99, + 0x3D99, + 0x3D99, + 0x3D99, + 0x3D99, + 0x3D9A, + 0x3D9A, + 0x3D9A, + 0x3D9A, + 0x3D9A, + 0x3D9B, + 0x3D9B, + 0x3D9B, + 0x3D9B, + 0x3D9B, + 0x3D9C, + 0x3D9C, + 0x3D9C, + 0x3D9C, + 0x3D9C, + 0x3D9D, + 0x3D9D, + 0x3D9D, + 0x3D9D, + 0x3D9D, + 0x3D9E, + 0x3D9E, + 0x3D9E, + 0x3D9E, + 0x3D9F, + 0x3D9F, + 0x3D9F, + 0x3D9F, + 0x3D9F, + 0x3DA0, + 0x3DA0, + 0x3DA0, + 0x3DA0, + 0x3DA0, + 0x3DA1, + 0x3DA1, + 0x3DA1, + 0x3DA1, + 0x3DA1, + 0x3DA2, + 0x3DA2, + 0x3DA2, + 0x3DA2, + 0x3DA2, + 0x3DA3, + 0x3DA3, + 0x3DA3, + 0x3DA3, + 0x3DA3, + 0x3DA4, + 0x3DA4, + 0x3DA4, + 0x3DA4, + 0x3DA4, + 0x3DA5, + 0x3DA5, + 0x3DA5, + 0x3DA5, + 0x3DA5, + 0x3DA6, + 0x3DA6, + 0x3DA6, + 0x3DA6, + 0x3DA7, + 0x3DA7, + 0x3DA7, + 0x3DA7, + 0x3DA7, + 0x3DA8, + 0x3DA8, + 0x3DA8, + 0x3DA8, + 0x3DA8, + 0x3DA9, + 0x3DA9, + 0x3DA9, + 0x3DA9, + 0x3DA9, + 0x3DAA, + 0x3DAA, + 0x3DAA, + 0x3DAA, + 0x3DAA, + 0x3DAB, + 0x3DAB, + 0x3DAB, + 0x3DAB, + 0x3DAB, + 0x3DAC, + 0x3DAC, + 0x3DAC, + 0x3DAC, + 0x3DAC, + 0x3DAD, + 0x3DAD, + 0x3DAD, + 0x3DAD, + 0x3DAD, + 0x3DAE, + 0x3DAE, + 0x3DAE, + 0x3DAE, + 0x3DAE, + 0x3DAF, + 0x3DAF, + 0x3DAF, + 0x3DAF, + 0x3DB0, + 0x3DB0, + 0x3DB0, + 0x3DB0, + 0x3DB0, + 0x3DB1, + 0x3DB1, + 0x3DB1, + 0x3DB1, + 0x3DB1, + 0x3DB2, + 0x3DB2, + 0x3DB2, + 0x3DB2, + 0x3DB2, + 0x3DB3, + 0x3DB3, + 0x3DB3, + 0x3DB3, + 0x3DB3, + 0x3DB4, + 0x3DB4, + 0x3DB4, + 0x3DB4, + 0x3DB4, + 0x3DB5, + 0x3DB5, + 0x3DB5, + 0x3DB5, + 0x3DB5, + 0x3DB6, + 0x3DB6, + 0x3DB6, + 0x3DB6, + 0x3DB6, + 0x3DB7, + 0x3DB7, + 0x3DB7, + 
0x3DB7, + 0x3DB8, + 0x3DB8, + 0x3DB8, + 0x3DB8, + 0x3DB8, + 0x3DB9, + 0x3DB9, + 0x3DB9, + 0x3DB9, + 0x3DB9, + 0x3DBA, + 0x3DBA, + 0x3DBA, + 0x3DBA, + 0x3DBA, + 0x3DBB, + 0x3DBB, + 0x3DBB, + 0x3DBB, + 0x3DBB, + 0x3DBC, + 0x3DBC, + 0x3DBC, + 0x3DBC, + 0x3DBC, + 0x3DBD, + 0x3DBD, + 0x3DBD, + 0x3DBD, + 0x3DBD, + 0x3DBE, + 0x3DBE, + 0x3DBE, + 0x3DBE, + 0x3DBE, + 0x3DBF, + 0x3DBF, + 0x3DBF, + 0x3DBF, + 0x3DBF, + 0x3DC0, + 0x3DC0, + 0x3DC0, + 0x3DC0, + 0x3DC1, + 0x3DC1, + 0x3DC1, + 0x3DC1, + 0x3DC1, + 0x3DC2, + 0x3DC2, + 0x3DC2, + 0x3DC2, + 0x3DC2, + 0x3DC3, + 0x3DC3, + 0x3DC3, + 0x3DC3, + 0x3DC3, + 0x3DC4, + 0x3DC4, + 0x3DC4, + 0x3DC4, + 0x3DC4, + 0x3DC5, + 0x3DC5, + 0x3DC5, + 0x3DC5, + 0x3DC5, + 0x3DC6, + 0x3DC6, + 0x3DC6, + 0x3DC6, + 0x3DC6, + 0x3DC7, + 0x3DC7, + 0x3DC7, + 0x3DC7, + 0x3DC7, + 0x3DC8, + 0x3DC8, + 0x3DC8, + 0x3DC8, + 0x3DC8, + 0x3DC9, + 0x3DC9, + 0x3DC9, + 0x3DC9, + 0x3DCA, + 0x3DCA, + 0x3DCA, + 0x3DCA, + 0x3DCA, + 0x3DCB, + 0x3DCB, + 0x3DCB, + 0x3DCB, + 0x3DCB, + 0x3DCC, + 0x3DCC, + 0x3DCC, + 0x3DCC, + 0x3DCC, + 0x3DCD, + 0x3DCE, + 0x3DCF, + 0x3DD0, + 0x3DD1, + 0x3DD2, + 0x3DD3, + 0x3DD4, + 0x3DD5, + 0x3DD6, + 0x3DD7, + 0x3DD8, + 0x3DD9, + 0x3DDA, + 0x3DDB, + 0x3DDC, + 0x3DDD, + 0x3DDE, + 0x3DDF, + 0x3DE0, + 0x3DE1, + 0x3DE2, + 0x3DE3, + 0x3DE4, + 0x3DE5, +}; + +static u16 sigmode_golden_bf16[] = { + 0x0, + 0x3c24, + 0x3c68, + 0x3c8e, + 0x3ca4, + 0x3cb7, + 0x3cc8, + 0x3cd9, + 0x3ce8, + 0x3cf6, + 0x3d01, + 0x3d08, + 0x3d0e, + 0x3d14, + 0x3d19, + 0x3d1f, + 0x3d24, + 0x3d29, + 0x3d2e, + 0x3d33, + 0x3d37, + 0x3d3c, + 0x3d40, + 0x3d45, + 0x3d48, + 0x3d4d, + 0x3d51, + 0x3d55, + 0x3d59, + 0x3d5d, + 0x3d61, + 0x3d64, + 0x3d68, + 0x3d6b, + 0x3d6f, + 0x3d72, + 0x3d76, + 0x3d79, + 0x3d7c, + 0x3d80, + 0x3d81, + 0x3d83, + 0x3d85, + 0x3d86, + 0x3d88, + 0x3d89, + 0x3d8b, + 0x3d8c, + 0x3d8e, + 0x3d90, + 0x3d91, + 0x3d92, + 0x3d94, + 0x3d95, + 0x3d97, + 0x3d98, + 0x3d99, + 0x3d9b, + 0x3d9c, + 0x3d9d, + 0x3d9f, + 0x3da0, + 0x3da1, + 0x3da2, + 0x3da4, + 0x3da5, + 0x3da6, + 0x3da8, + 0x3da9, + 0x3daa, + 0x3dab, + 0x3dad, + 0x3dae, + 0x3daf, + 0x3db0, + 0x3db1, + 0x3db3, + 0x3db4, + 0x3db5, + 0x3db6, + 0x3db7, + 0x3db9, + 0x3db9, + 0x3dbb, + 0x3dbc, + 0x3dbd, + 0x3dbe, + 0x3dbf, + 0x3dc0, + 0x3dc1, + 0x3dc2, + 0x3dc3, + 0x3dc5, + 0x3dc5, + 0x3dc7, + 0x3dc8, + 0x3dc8, + 0x3dca, + 0x3dcb, + 0x3dcc, + 0x3dcd, + 0x3dce, + 0x3dcf, + 0x3dd0, + 0x3dd1, + 0x3dd2, + 0x3dd3, + 0x3dd4, + 0x3dd5, + 0x3dd6, + 0x3dd7, + 0x3dd8, + 0x3dd9, + 0x3dda, + 0x3ddb, + 0x3ddb, + 0x3ddd, + 0x3dde, + 0x3dde, + 0x3ddf, + 0x3de1, + 0x3de1, + 0x3de2, + 0x3de3, + 0x3de4, + 0x3de5, + 0x3de6, + 0x3de7, + 0x3de8, + 0x3de8, + 0x3dea, + 0x3deb, + 0x3deb, + 0x3dec, + 0x3ded, + 0x3dee, + 0x3def, + 0x3def, + 0x3df1, + 0x3df2, + 0x3df2, + 0x3df3, + 0x3df4, + 0x3df5, + 0x3df6, + 0x3df7, + 0x3df7, + 0x3df8, + 0x3df9, + 0x3dfa, + 0x3dfb, + 0x3dfb, + 0x3dfc, + 0x3dfd, + 0x3dfe, + 0x3dff, + 0x3e00, + 0x3e00, + 0x3e00, + 0x3e01, + 0x3e01, + 0x3e02, + 0x3e02, + 0x3e03, + 0x3e03, + 0x3e03, + 0x3e04, + 0x3e04, + 0x3e05, + 0x3e05, + 0x3e05, + 0x3e06, + 0x3e06, + 0x3e07, + 0x3e07, + 0x3e07, + 0x3e08, + 0x3e08, + 0x3e09, + 0x3e09, + 0x3e09, + 0x3e0a, + 0x3e0a, + 0x3e0b, + 0x3e0b, + 0x3e0b, + 0x3e0b, + 0x3e0c, + 0x3e0c, + 0x3e0d, + 0x3e0d, + 0x3e0d, + 0x3e0e, + 0x3e0e, + 0x3e0f, + 0x3e0f, + 0x3e10, + 0x3e10, + 0x3e10, + 0x3e10, + 0x3e11, + 0x3e11, + 0x3e11, + 0x3e12, + 0x3e12, + 0x3e13, + 0x3e13, + 0x3e14, + 0x3e14, + 0x3e14, + 0x3e14, + 0x3e15, + 0x3e15, + 0x3e15, + 0x3e16, + 0x3e16, + 0x3e17, + 0x3e17, + 0x3e17, + 0x3e17, + 0x3e18, + 0x3e18, + 
0x3e19, + 0x3e19, + 0x3e19, + 0x3e19, + 0x3e1a, + 0x3e1a, + 0x3e1b, + 0x3e1b, + 0x3e1b, + 0x3e1c, + 0x3e1c, + 0x3e1c, + 0x3e1d, + 0x3e1d, + 0x3e1d, + 0x3e1e, + 0x3e1e, + 0x3e1e, + 0x3e1f, + 0x3e1f, + 0x3e1f, + 0x3e20, + 0x3e20, + 0x3e20, + 0x3e21, + 0x3e21, + 0x3e21, + 0x3e22, + 0x3e22, + 0x3e22, + 0x3e22, + 0x3e23, + 0x3e23, + 0x3e24, + 0x3e24, + 0x3e24, + 0x3e24, + 0x3e25, + 0x3e25, + 0x3e26, + 0x3e26, + 0x3e26, + 0x3e26, + 0x3e27, + 0x3e27, + 0x3e27, + 0x3e28, + 0x3e28, + 0x3e28, + 0x3e29, + 0x3e29, + 0x3e29, + 0x3e29, + 0x3e2a, + 0x3e2a, + 0x3e2a, + 0x3e2b, + 0x3e2b, + 0x3e2b, + 0x3e2c, + 0x3e2c, + 0x3e2c, + 0x3e2d, + 0x3e2d, + 0x3e2d, + 0x3e2d, + 0x3e2e, + 0x3e2e, + 0x3e2f, + 0x3e2f, + 0x3e2f, + 0x3e2f, + 0x3e30, + 0x3e30, + 0x3e30, + 0x3e30, + 0x3e31, + 0x3e31, + 0x3e31, + 0x3e32, + 0x3e32, + 0x3e32, + 0x3e33, + 0x3e33, + 0x3e33, + 0x3e33, + 0x3e34, + 0x3e34, + 0x3e34, + 0x3e35, + 0x3e35, + 0x3e35, + 0x3e36, + 0x3e36, + 0x3e36, + 0x3e36, + 0x3e36, + 0x3e37, + 0x3e37, + 0x3e37, + 0x3e38, + 0x3e38, + 0x3e39, + 0x3e39, + 0x3e39, + 0x3e39, + 0x3e39, + 0x3e3a, + 0x3e3a, + 0x3e3b, + 0x3e3b, + 0x3e3b, + 0x3e3b, + 0x3e3b, + 0x3e3c, + 0x3e3c, + 0x3e3c, + 0x3e3d, + 0x3e3d, + 0x3e3d, + 0x3e3d, + 0x3e3d, + 0x3e3e, + 0x3e3e, + 0x3e3f, + 0x3e3f, + 0x3e3f, + 0x3e3f, + 0x3e3f, + 0x3e40, + 0x3e40, + 0x3e41, + 0x3e41, + 0x3e41, + 0x3e41, + 0x3e41, + 0x3e42, + 0x3e42, + 0x3e42, + 0x3e43, + 0x3e43, + 0x3e43, + 0x3e43, + 0x3e44, + 0x3e44, + 0x3e44, + 0x3e45, + 0x3e45, + 0x3e45, + 0x3e45, + 0x3e45, + 0x3e46, + 0x3e46, + 0x3e47, + 0x3e47, + 0x3e47, + 0x3e47, + 0x3e47, + 0x3e48, + 0x3e48, + 0x3e48, + 0x3e48, + 0x3e48, + 0x3e49, + 0x3e49, + 0x3e4a, + 0x3e4a, + 0x3e4a, + 0x3e4a, + 0x3e4a, + 0x3e4b, + 0x3e4b, + 0x3e4b, + 0x3e4c, + 0x3e4c, + 0x3e4c, + 0x3e4c, + 0x3e4c, + 0x3e4d, + 0x3e4d, + 0x3e4e, + 0x3e4e, + 0x3e4e, + 0x3e4e, + 0x3e4e, + 0x3e4f, + 0x3e4f, + 0x3e4f, + 0x3e4f, + 0x3e4f, + 0x3e50, + 0x3e50, + 0x3e51, + 0x3e51, + 0x3e51, + 0x3e51, + 0x3e51, + 0x3e52, + 0x3e52, + 0x3e52, + 0x3e52, + 0x3e52, + 0x3e53, + 0x3e53, + 0x3e53, + 0x3e54, + 0x3e54, + 0x3e54, + 0x3e54, + 0x3e55, + 0x3e55, + 0x3e55, + 0x3e55, + 0x3e55, + 0x3e56, + 0x3e56, + 0x3e56, + 0x3e57, + 0x3e57, + 0x3e57, + 0x3e57, + 0x3e57, + 0x3e58, + 0x3e58, + 0x3e58, + 0x3e58, + 0x3e59, + 0x3e59, + 0x3e59, + 0x3e5a, + 0x3e5a, + 0x3e5a, + 0x3e5a, + 0x3e5a, + 0x3e5b, + 0x3e5b, + 0x3e5b, + 0x3e5b, + 0x3e5b, + 0x3e5c, + 0x3e5c, + 0x3e5d, + 0x3e5d, + 0x3e5d, + 0x3e5d, + 0x3e5d, + 0x3e5e, + 0x3e5e, + 0x3e5e, + 0x3e5e, + 0x3e5e, + 0x3e5f, + 0x3e5f, + 0x3e5f, + 0x3e5f, + 0x3e5f, + 0x3e60, + 0x3e60, + 0x3e61, + 0x3e61, + 0x3e61, + 0x3e61, + 0x3e61, + 0x3e62, + 0x3e62, + 0x3e62, + 0x3e62, + 0x3e62, + 0x3e63, + 0x3e63, + 0x3e63, + 0x3e63, + 0x3e63, + 0x3e64, + 0x3e64, + 0x3e65, + 0x3e65, + 0x3e65, + 0x3e65, + 0x3e65, + 0x3e66, + 0x3e66, + 0x3e66, + 0x3e66, + 0x3e66, + 0x3e67, + 0x3e67, + 0x3e67, + 0x3e67, + 0x3e67, + 0x3e68, + 0x3e68, + 0x3e68, + 0x3e68, + 0x3e68, + 0x3e69, + 0x3e69, + 0x3e6a, + 0x3e6a, + 0x3e6a, + 0x3e6a, + 0x3e6a, + 0x3e6b, + 0x3e6b, + 0x3e6b, + 0x3e6b, + 0x3e6b, + 0x3e6c, + 0x3e6c, + 0x3e6c, + 0x3e6c, + 0x3e6c, + 0x3e6d, + 0x3e6d, + 0x3e6d, + 0x3e6d, + 0x3e6d, + 0x3e6e, + 0x3e6e, + 0x3e6e, + 0x3e6e, + 0x3e6e, + 0x3e6f, + 0x3e6f, + 0x3e6f, + 0x3e6f, + 0x3e6f, + 0x3e70, + 0x3e70, + 0x3e71, + 0x3e71, + 0x3e71, + 0x3e71, + 0x3e71, + 0x3e72, + 0x3e72, + 0x3e72, + 0x3e72, + 0x3e72, + 0x3e73, + 0x3e73, + 0x3e73, + 0x3e73, + 0x3e73, + 0x3e74, + 0x3e74, + 0x3e74, + 0x3e74, + 0x3e74, + 0x3e75, + 0x3e75, + 0x3e75, + 0x3e75, + 0x3e76, + 0x3e76, + 
0x3e76, + 0x3e76, + 0x3e76, + 0x3e77, + 0x3e77, + 0x3e77, + 0x3e77, + 0x3e77, + 0x3e78, + 0x3e78, + 0x3e78, + 0x3e78, + 0x3e78, + 0x3e79, + 0x3e79, + 0x3e79, + 0x3e79, + 0x3e79, + 0x3e7a, + 0x3e7a, + 0x3e7a, + 0x3e7a, + 0x3e7a, + 0x3e7b, + 0x3e7b, + 0x3e7b, + 0x3e7b, + 0x3e7b, + 0x3e7c, + 0x3e7c, + 0x3e7c, + 0x3e7c, + 0x3e7c, + 0x3e7d, + 0x3e7d, + 0x3e7d, + 0x3e7d, + 0x3e7d, + 0x3e7e, + 0x3e7e, + 0x3e7e, + 0x3e7e, + 0x3e7f, + 0x3e7f, + 0x3e7f, + 0x3e7f, + 0x3e7f, + 0x3e80, + 0x3e80, + 0x3e80, + 0x3e80, + 0x3e80, + 0x3e80, + 0x3e80, + 0x3e80, + 0x3e80, + 0x3e81, + 0x3e81, + 0x3e81, + 0x3e81, + 0x3e81, + 0x3e81, + 0x3e81, + 0x3e81, + 0x3e81, + 0x3e81, + 0x3e82, + 0x3e82, + 0x3e82, + 0x3e82, + 0x3e82, + 0x3e82, + 0x3e82, + 0x3e82, + 0x3e82, + 0x3e83, + 0x3e83, + 0x3e83, + 0x3e83, + 0x3e83, + 0x3e83, + 0x3e83, + 0x3e83, + 0x3e83, + 0x3e83, + 0x3e84, + 0x3e84, + 0x3e84, + 0x3e84, + 0x3e84, + 0x3e84, + 0x3e84, + 0x3e84, + 0x3e84, + 0x3e84, + 0x3e85, + 0x3e85, + 0x3e85, + 0x3e85, + 0x3e85, + 0x3e85, + 0x3e85, + 0x3e85, + 0x3e85, + 0x3e85, + 0x3e86, + 0x3e86, + 0x3e86, + 0x3e86, + 0x3e86, + 0x3e86, + 0x3e86, + 0x3e86, + 0x3e86, + 0x3e87, + 0x3e87, + 0x3e87, + 0x3e87, + 0x3e87, + 0x3e87, + 0x3e87, + 0x3e87, + 0x3e87, + 0x3e87, + 0x3e88, + 0x3e88, + 0x3e88, + 0x3e88, + 0x3e88, + 0x3e88, + 0x3e88, + 0x3e88, + 0x3e88, + 0x3e88, + 0x3e89, + 0x3e89, + 0x3e89, + 0x3e89, + 0x3e89, + 0x3e89, + 0x3e89, + 0x3e89, + 0x3e89, + 0x3e89, + 0x3e8a, + 0x3e8a, + 0x3e8a, + 0x3e8a, + 0x3e8a, + 0x3e8a, + 0x3e8a, + 0x3e8a, + 0x3e8a, + 0x3e8b, + 0x3e8b, + 0x3e8b, + 0x3e8b, + 0x3e8b, + 0x3e8b, + 0x3e8b, + 0x3e8b, + 0x3e8b, + 0x3e8b, + 0x3e8b, + 0x3e8b, + 0x3e8b, + 0x3e8b, + 0x3e8b, + 0x3e8c, + 0x3e8c, + 0x3e8c, + 0x3e8c, + 0x3e8c, + 0x3e8c, + 0x3e8c, + 0x3e8c, + 0x3e8c, + 0x3e8c, + 0x3e8d, + 0x3e8d, + 0x3e8d, + 0x3e8d, + 0x3e8d, + 0x3e8d, + 0x3e8d, + 0x3e8d, + 0x3e8d, + 0x3e8d, + 0x3e8e, + 0x3e8e, + 0x3e8e, + 0x3e8e, + 0x3e8e, + 0x3e8e, + 0x3e8e, + 0x3e8e, + 0x3e8e, + 0x3e8f, + 0x3e8f, + 0x3e8f, + 0x3e8f, + 0x3e8f, + 0x3e8f, + 0x3e8f, + 0x3e8f, + 0x3e8f, + 0x3e8f, + 0x3e90, + 0x3e90, + 0x3e90, + 0x3e90, + 0x3e90, + 0x3e90, + 0x3e90, + 0x3e90, + 0x3e90, + 0x3e90, + 0x3e90, + 0x3e90, + 0x3e90, + 0x3e90, + 0x3e90, + 0x3e91, + 0x3e91, + 0x3e91, + 0x3e91, + 0x3e91, + 0x3e91, + 0x3e91, + 0x3e91, + 0x3e91, + 0x3e91, + 0x3e92, + 0x3e92, + 0x3e92, + 0x3e92, + 0x3e92, + 0x3e92, + 0x3e92, + 0x3e92, + 0x3e92, + 0x3e93, + 0x3e93, + 0x3e93, + 0x3e93, + 0x3e93, + 0x3e93, + 0x3e93, + 0x3e93, + 0x3e93, + 0x3e93, + 0x3e94, + 0x3e94, + 0x3e94, + 0x3e94, + 0x3e94, + 0x3e94, + 0x3e94, + 0x3e94, + 0x3e94, + 0x3e94, + 0x3e94, + 0x3e94, + 0x3e94, + 0x3e94, + 0x3e94, + 0x3e95, + 0x3e95, + 0x3e95, + 0x3e95, + 0x3e95, + 0x3e95, + 0x3e95, + 0x3e95, + 0x3e95, + 0x3e95, + 0x3e96, + 0x3e96, + 0x3e96, + 0x3e96, + 0x3e96, + 0x3e96, + 0x3e96, + 0x3e96, + 0x3e96, + 0x3e97, + 0x3e97, + 0x3e97, + 0x3e97, + 0x3e97, + 0x3e97, + 0x3e97, + 0x3e97, + 0x3e97, + 0x3e97, + 0x3e97, + 0x3e97, + 0x3e97, + 0x3e97, + 0x3e97, + 0x3e98, + 0x3e98, + 0x3e98, + 0x3e98, + 0x3e98, + 0x3e98, + 0x3e98, + 0x3e98, + 0x3e98, + 0x3e98, + 0x3e99, + 0x3e99, + 0x3e99, + 0x3e99, + 0x3e99, + 0x3e99, + 0x3e99, + 0x3e99, + 0x3e99, + 0x3e99, + 0x3e99, + 0x3e99, + 0x3e99, + 0x3e99, + 0x3e9a, + 0x3e9a, + 0x3e9a, + 0x3e9a, + 0x3e9a, + 0x3e9a, + 0x3e9a, + 0x3e9a, + 0x3e9a, + 0x3e9a, + 0x3e9b, + 0x3e9b, + 0x3e9b, + 0x3e9b, + 0x3e9b, + 0x3e9b, + 0x3e9b, + 0x3e9b, + 0x3e9b, + 0x3e9b, + 0x3e9c, + 0x3e9c, + 0x3e9c, + 0x3e9c, + 0x3e9c, + 0x3e9c, + 0x3e9c, + 0x3e9c, + 0x3e9c, + 0x3e9c, + 0x3e9c, + 
0x3e9c, + 0x3e9c, + 0x3e9c, + 0x3e9c, + 0x3e9d, + 0x3e9d, + 0x3e9d, + 0x3e9d, + 0x3e9d, + 0x3e9d, + 0x3e9d, + 0x3e9d, + 0x3e9d, + 0x3e9e, + 0x3e9e, + 0x3e9e, + 0x3e9e, + 0x3e9e, + 0x3e9e, + 0x3e9e, + 0x3e9e, + 0x3e9e, + 0x3e9e, + 0x3e9e, + 0x3e9e, + 0x3e9e, + 0x3e9e, + 0x3e9e, + 0x3e9f, + 0x3e9f, + 0x3e9f, + 0x3e9f, + 0x3e9f, + 0x3e9f, + 0x3e9f, + 0x3e9f, + 0x3e9f, + 0x3e9f, + 0x3ea0, + 0x3ea0, + 0x3ea0, + 0x3ea0, + 0x3ea0, + 0x3ea0, + 0x3ea0, + 0x3ea0, + 0x3ea0, + 0x3ea0, + 0x3ea0, + 0x3ea0, + 0x3ea0, + 0x3ea0, + 0x3ea1, + 0x3ea1, + 0x3ea1, + 0x3ea1, + 0x3ea1, + 0x3ea1, + 0x3ea1, + 0x3ea1, + 0x3ea1, + 0x3ea1, + 0x3ea2, + 0x3ea2, + 0x3ea2, + 0x3ea2, + 0x3ea2, + 0x3ea2, + 0x3ea2, + 0x3ea3, + 0x3ea3, + 0x3ea4, + 0x3ea4, + 0x3ea4, + 0x3ea5, + 0x3ea5, + 0x3ea6, + 0x3ea6, + 0x3ea6, + 0x3ea7, + 0x3ea7, + 0x3ea7, + 0x3ea8, + 0x3ea8, + 0x3ea9, + 0x3ea9, + 0x3ea9, + 0x3eaa, + 0x3eaa, + 0x3eaa, + 0x3eab, + 0x3eab, +};
+
+// (the two declarations below were lost in extraction: the helper's
+// behavior is inferred from the DBG print, which reports each exponent
+// table entry as 2^(exp/2), and gen_sqrt()'s signature is inferred from
+// its call site and the parallel gen_sqrt_mantissa())
+static double _gen_sqrt(int base, float exp) {
+  return pow(base, exp / 2);
+}
+
+static void gen_sqrt(u16 *OUT table_data, u64 table_size) {
+  int half = table_size / channel / 2; // 128
+  int exp_start = -62;
+  u64 idx = 0;
+
+  // 0, exp from 0 -62 -61 .. 62 63
+  for (int i = 0; i < half; i++) {
+    //float exp = round((exp_start + i) / 2) * 2;
+    int shift = (exp_start + i);
+    bool is_odd = (shift % 2);
+    float exp = shift;
+    if (is_odd) {
+      exp = exp - 1; // fold an odd exponent down to the next even one
+    }
+
+    double s = _gen_sqrt(2, exp);
+    table_data[idx] = convert_fp32_bf16(s);
+#ifdef DBG
+    printf("t [%" PRIu64 "] is %f [idx:%f][2^%f(%f)] bf %x\n", idx,
+           convert_bf16_fp32(table_data[idx]),
+           float(exp_start + i), exp/2, (exp_start + i) / 2.0,
+           table_data[idx]);
+#endif
+    idx++;
+  }
+
+  //// idx = 127 don't care
+#if 0
+  s = _gen_sqrt(2, -0);
+  table_data[idx] = convert_fp32_bf16(s);
+#if 1
+  printf("t [%" PRIu64 "] is %f[%d] bf %x\n", idx, convert_bf16_fp32(table_data[idx]), 0, table_data[idx]);
+#endif
+  idx++;
+
+  for (int i = 1; i < half; i++) {
+    float exp = exp_start + i;
+    double s = _gen_sqrt(-2, exp);
+    table_data[idx] = convert_fp32_bf16(s);
+#ifdef DBG
+    printf("t [%" PRIu64 "] is %f(%e - %.8lf)[(-2)^%f] bf %x\n", idx, convert_bf16_fp32(table_data[idx]), s, s, exp, table_data[idx]);
+#endif
+    idx++;
+  }
+
+  // idx = 255 don't care
+  //s = _gen_sqrt(2, 0);
+  //table_data[idx] = convert_fp32_bf16(s);
+  //printf("t [%" PRIu64 "] is %f[%d]\n", idx, convert_bf16_fp32(table_data[idx]), 0);
+  //idx++;
+#endif
+
+  // duplicate channel #1 to #31
+  //TODO: tensor copy
+  for (u32 i = 1; i < channel; i++) {
+    memcpy(&table_data[i * table_hw], &table_data[0], sizeof(u16) * table_hw);
+  }
+}
+
+static void gen_sqrt_mantissa(u16 IN *table_data, u16* OUT table_mantissa, u64 table_size) {
+
+  u32 half = table_size / channel / 2;
+  assert(half == 128);
+  assert(table_data);
+
+  int idx = 0;
+  double d;
+  for (u32 i = 0; i < half; i++) {
+    d = 1 + i * 1 / 128.0;
+    d = (double) pow(d, 0.5);
+    table_mantissa[128+idx] = convert_fp32_bf16(d);
+#ifdef DBG
+    //printf(", [%u] is %lf\n", i+128, d);
+#endif /* ifdef DBG */
+
+    // 13 = 2^3 x 1.625 = (2^2) x (2^1 x 1.625): an odd exponent leaves a
+    // factor of 2 with the mantissa, so this half stores sqrt(2*m)
+    d = 2 * (1 + i * 1 / 128.0);
+    d = (double) pow(d, 0.5);
+    table_mantissa[idx] = convert_fp32_bf16(d);
+#ifdef DBG
+    //printf("mantissa [%u] is %lf", i, d);
+#endif /* ifdef DBG */
+    idx++;
+  }
+#ifdef DBG
+  for (u32 i = 0; i < 2 * half; i++) {
+    printf("mantissa [%u] is %lf, 0x%x\n", i, convert_bf16_fp32(table_mantissa[i]),
+           table_mantissa[i]);
+  }
+#endif /* ifdef DBG */
+
+  // duplicate channel #1 to #31
+  //TODO: tensor copy
+  for (u64 i = 1; i < channel; i++) {
+    memcpy(&table_mantissa[table_hw * i], &table_mantissa[0], sizeof(u16) * table_hw);
+  }
+}
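+
+// Layout note, as implied by the duplication loops above: each lookup
+// table is one 32x8 = 256-entry tile, generated once in lane 0 and then
+// memcpy'd to the other 31 lanes, because each NPU lane can only index
+// its own local tile.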
+
+static bool verify(u16 *ofmap_data, u16 *ref_data, u16* ifmap, u64 ifmap_size) {
+  u64 size = ifmap_size;
+
+  for (u64 i = 0; i < size; i++) {
+    bool is_close;
+    u16 ref;
+    u16 ofmap_data_bf16;
+    float ref_f;
+    float ofmap_data_f;
+    u32 shift;
+
+    if (mode == GEN_POW_20_DATA_MAX_ERROR_U8) {
+      // two u8 results are packed into each u16 output slot; pick the
+      // byte that corresponds to element i
+      shift = (i%2)*8;
+      ref = ref_data[i];
+      ofmap_data_bf16 = (u16)ofmap_data[i/2];
+      ofmap_data_f = (float)(ofmap_data[i/2] >> shift);
+      ref_f = (float)(ref);
+
+      is_close = ((u8)(ofmap_data[i/2] >> shift)) == (u8)ref;
+
+      //printf("[%" PRIu64 "] of is %x ref is %x\n", i, (u8)(ofmap_data[i/2] >> shift), (u8)ref);
+    }
+    else {
+      ref = ref_data[i];
+      ref_f = convert_bf16_fp32(ref);
+      ofmap_data_f = convert_bf16_fp32(ofmap_data[i]);
+      ofmap_data_bf16 = ofmap_data[i];
+
+      if (mode == PRE_DATA_COMPARE_FIX) {
+        is_close = ofmap_data[i] == ref;
+      }
+      else {
+        //is_close = almost_equal(ref_f, ofmap_data_f, 1); // ULP check, superseded by the absolute bound below
+        is_close = fabs(ref_f-ofmap_data_f) < 0.001;
+      }
+    }
+
+#if 0
+    if (i == 0) {
+      fprintf(stderr,
+              "input, ofmap, ref, diff, diff / ref_f\n");
+    }
+
+    fprintf(stderr,
+            "%.16f, %f, %lf, %lf, %lf\n",
+            convert_bf16_fp32(ifmap[i]),
+            ofmap_data_f, ref_f, fabs(ref_f - ofmap_data_f), fabs(ref_f - ofmap_data_f) / ref_f);
+    //if (ofmap_data[i] != ref && fabs(ref_f-ofmap_data_f) > 0.07)
+    //if (ofmap_data[i] != ref && AlmostEqual2sComplement(ref_f, ofmap_data_f, 1))
+    //if (ofmap_data[i] != ref && AlmostEqual(ref_f, ofmap_data_f, FLT_EPSILON))
+#endif
+    if (!is_close) {
+      fprintf(stderr,
+              "comparing failed at ofmap_data[%" PRIu64 "](input:%e), got %x, exp %x, fp32: got %e exp %e\n",
+              i, convert_bf16_fp32(ifmap[i]),
+              ofmap_data_bf16, ref, ofmap_data_f, ref_f);
+      exit(-1);
+    }
+  }
+
+  return true;
+}
+
+static void test_tl_int8_lut_bf16(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk)
+{
+  // TODO: check more shape / align
+  tl_shape_t ifmap_shape;
+  if (mode == PRE_DATA_COMPARE_FIX) {
+    ifmap_shape = {1, channel, 4, 8};
+  }
+  else {
+    ifmap_shape = {1, channel, 16, 16};
+  }
+
+  tl_shape_t table_shape = {1, channel, table_h, table_w}; // hard code for hw, hw:32x8
+  tl_shape_t ofmap_shape = ifmap_shape;
+
+  u64 ifmap_size = tl_shape_size(&ifmap_shape);
+  u64 table_size = tl_shape_size(&table_shape);
+  u64 ofmap_size = tl_shape_size(&ofmap_shape);
+
+  fmt_t fmt = FMT_BF16;
+
+  int data_type_size = bytesize_of_fmt(fmt);
+  u64 ifmap_bytesize = ifmap_size * data_type_size;
+  u64 table_bytesize = table_size * data_type_size;
+  u64 ofmap_bytesize = ofmap_size * data_type_size;
+
+  // hw only supports int8 indices
+  u16 *ifmap = (u16 *)xmalloc(ifmap_bytesize);
+  memset(ifmap, 0x00, ifmap_bytesize);
+
+  u16 *ifmap_mantissa = (u16 *)xmalloc(ifmap_bytesize);
+  memset(ifmap_mantissa, 0x00, ifmap_bytesize);
+
+  if (mode == PRE_DATA_COMPARE_FIX) {
+    memcpy(ifmap, &test_pattern, sizeof(test_pattern));
+  }
+  else {
+    // seed once, outside the generation loop
+    srand(static_cast<unsigned>(time(0)));
+    std::random_device rd;
+    std::mt19937 e2(rd()); // only feeds the commented-out distribution below
+    float LO = pow(2, -10);
+    float HI = pow(2, 10);
+    //std::uniform_real_distribution<> dist(pow(2,-62), pow(2,63));
+    for (u64 i = 0; i < ifmap_size; i++) {
+      //float r3 = dist(e2);
+      float r3 = LO + static_cast<float>(rand()) / (static_cast<float>(RAND_MAX/(HI-LO)));
+      ifmap[i] = convert_fp32_bf16(r3);
+    }
+  }
+
+#ifdef DBG
+  for (u64 i = 0; i < ifmap_size; i++) {
+    printf("source if[%" PRIu64 "] bf16 %f 0x%x, log2f is %f\n", i, convert_bf16_fp32(ifmap[i]), ifmap[i], floor(log2((convert_bf16_fp32(ifmap[i])))));
+  }
+#endif /* ifdef DBG */
+
+  u16 *table_data = (u16 *)xmalloc(table_bytesize);
+  gen_sqrt(table_data, table_size);
+
+  u16 *table_data_mantissa = (u16 *)xmalloc(table_bytesize);
+  gen_sqrt_mantissa(table_data, table_data_mantissa, table_size);
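+
+  // How the two tables combine (a sketch of the intended math, not taken
+  // verbatim from the source): write x = 2^e * m with 1 <= m < 2, so
+  // sqrt(x) = sqrt(2^e) * sqrt(m). An odd e leaves a factor 2 with the
+  // mantissa, e.g. 13 = 2^2 * (2 * 1.625), and then
+  //   sqrt(13) = 2 * sqrt(3.25) ~= 2 * 1.8028 ~= 3.6056.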
+
+  u16 *ref_data = (u16 *)xmalloc(ofmap_bytesize);
+  tl_lut_ref(ref_data, ifmap, ifmap_shape);
+
+  tl_t *tl_ifmap =
+    alloc_tl(bmk, ifmap_shape, fmt, /*align*/1);
+  tl_t *tl_table_answer =
+    alloc_tl(bmk, table_shape, fmt, /*align*/1);
+  tl_t *tl_table_answer_mantissa =
+    alloc_tl(bmk, table_shape, fmt, /*align*/1);
+
+  tl_t *tl_ofmap_exp =
+    alloc_tl(bmk, ofmap_shape, fmt, /*align*/1);
+  tl_t *tl_ofmap_mantissa =
+    alloc_tl(bmk, ofmap_shape, fmt, /*align*/1);
+  tl_t *tl_ofmap_exp_val =
+    alloc_tl(bmk, ofmap_shape, fmt, /*align*/1);
+
+  tl_t *tl_ofmap_exp_val_u8 = nullptr;
+  tl_t *out = tl_ofmap_exp_val;
+
+  if (mode == GEN_POW_20_DATA_MAX_ERROR_U8) {
+    tl_ofmap_exp_val_u8 =
+      alloc_tl(bmk, ofmap_shape, FMT_U8, /*align*/1);
+  }
+
+  // the table loads and the exp/mantissa lookup emission that belong here
+  // were lost from this diff; see test_1880v2_bf16_sqrt_kernel.cpp below
+  // for the equivalent put_bf16_tensor_g2l / emit / test_submit sequence.
+  u16 *ofmap_data = (u16 *)get_bf16_tensor_l2g(ctx, bmk, out, fmt);
+  verify(ofmap_data, ref_data, ifmap, ifmap_size);
+
+  if (tl_ofmap_exp_val_u8) {
+    free_tl(bmk, tl_ofmap_exp_val_u8);
+  }
+  free_tl(bmk, tl_ofmap_exp_val);
+  free_tl(bmk, tl_ofmap_mantissa);
+  free_tl(bmk, tl_ofmap_exp);
+  free_tl(bmk, tl_table_answer_mantissa);
+  free_tl(bmk, tl_table_answer);
+  free_tl(bmk, tl_ifmap);
+
+  free(ifmap);
+  free(ifmap_mantissa);
+  free(table_data);
+  free(table_data_mantissa);
+  free(ref_data);
+  free(ofmap_data);
+}
+
+int main()
+{
+  CVI_RT_HANDLE ctx;
+  bmk_ctx_t *bmk;
+  int round_mode;
+
+  round_mode = set_store_feround();
+
+  test_init(&ctx, &bmk);
+
+  for (int i = PRE_DATA_COMPARE_FIX; i < TEST_MODE_MAX; i++) {
+    //for (int i = PRE_DATA_COMPARE_FIX; i < GEN_POW_20_DATA_MAX_ERROR_U8; i++) {
+    //for (int i = GEN_POW_20_DATA_MAX_ERROR_U8; i < TEST_MODE_MAX; i++) {
+    mode = static_cast<TEST_MODE>(i);
+    printf("test mode %d...\n", mode);
+    test_tl_int8_lut_bf16(&ctx, bmk);
+  }
+
+  test_exit(&ctx);
+  restore_feround(round_mode);
+  return 0;
+}
diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_sqrt_kernel.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_sqrt_kernel.cpp
new file mode 100644
index 000000000..5ecadcc76
--- /dev/null
+++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_sqrt_kernel.cpp
@@ -0,0 +1,442 @@
+/**
+ */
+#include "../1880v2_test_util.h"
+
+// NOTE: the bracketed header names were lost in extraction; this set is
+// inferred from what the file uses.
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <random>
+#include <limits>
+//#define DBG
+
+using namespace std;
+
+/**
+ * pre_data means we test a fixed pattern; it should be the same as the lut test
+ */
+enum TEST_MODE {
+  PRE_DATA_COMPARE_FIX = 0, // pre-data + fix compare
+  GEN_POW_20_DATA_MAX_ERROR, // generate 2^-20 ~ 2^20 values and check epsilon
+  TEST_MODE_MAX,
+};
+
+static TEST_MODE mode;
+
+static u16 test_pattern[] = {
+ 0x0000, 0x38D2, 0x3952, 0x399D, 0x39D2, 0x3A03, 0x3A1D, 0x3A38, 0x3A52,
+ 0x3A6C, 0x3A83, 0x3A90, 0x3A9D, 0x3AAA, 0x3AB8, 0x3AC5, 0x3AD2, 0x3ADF,
+ 0x3AEC, 0x3AF9, 0x3B03, 0x3B0A, 0x3B10, 0x3B17, 0x3B1D, 0x3B24, 0x3B2A,
+ 0x3B31, 0x3B38, 0x3B3E, 0x3B45, 0x3B4B, 0x3B52, 0x3B58, 0x3B5F, 0x3B65,
+ 0x3B6C, 0x3B72, 0x3B79, 0x3B80, 0x3B83, 0x3B86, 0x3B8A, 0x3B8D, 0x3B90,
+ 0x3B93, 0x3B97, 0x3B9A, 0x3B9D, 0x3BA1, 0x3BA4, 0x3BA7, 0x3BAA, 0x3BAE,
+ 0x3BB1, 0x3BB4, 0x3BB8, 0x3BBB, 0x3BBE, 0x3BC1, 0x3BC5, 0x3BC8, 0x3BCB,
+ 0x3BCE, 0x3BD2, 0x3BD5, 0x3BD8, 0x3BDC, 0x3BDF, 0x3BE2, 0x3BE5, 0x3BE9,
+ 0x3BEC, 0x3BEF, 0x3BF2, 0x3BF6, 0x3BF9, 0x3BFC, 0x3C00, 0x3C01, 0x3C03,
+ 0x3C05, 0x3C06, 0x3C08, 0x3C0A, 0x3C0B, 0x3C0D, 0x3C0F, 0x3C10, 0x3C12,
+ 0x3C13, 0x3C15, 0x3C17, 0x3C18, 0x3C1A, 0x3C1C, 0x3C1D, 0x3C1F, 0x3C21,
+ 0x3C22, 0x3C24, 0x3C25, 0x3C27, 0x3C29, 0x3C2A, 0x3C2C, 0x3C2E, 0x3C2F,
+ 0x3C31, 0x3C33, 0x3C34, 0x3C36, 0x3C38, 0x3C39, 0x3C3B, 0x3C3C, 0x3C3E,
+ 0x3C40, 0x3C41, 0x3C43, 0x3C45, 0x3C46, 0x3C48, 0x3C4A, 0x3C4B, 0x3C4D,
+ 0x3C4E, 0x3C50, 0x3C52, 0x3C53,
0x3C55, 0x3C57, 0x3C58, 0x3C5A, 0x3C5C, + 0x3C5D, 0x3C5F, 0x3C60, 0x3C62, 0x3C64, 0x3C65, 0x3C67, 0x3C69, 0x3C6A, + 0x3C6C, 0x3C6E, 0x3C6F, 0x3C71, 0x3C72, 0x3C74, 0x3C76, 0x3C77, 0x3C79, + 0x3C7B, 0x3C7C, 0x3C7E, 0x3C80, 0x3C81, 0x3C81, 0x3C82, 0x3C83, 0x3C84, + 0x3C85, 0x3C86, 0x3C86, 0x3C87, 0x3C88, 0x3C89, 0x3C8A, 0x3C8A, 0x3C8B, + 0x3C8C, 0x3C8D, 0x3C8E, 0x3C8F, 0x3C8F, 0x3C90, 0x3C91, 0x3C92, 0x3C93, + 0x3C93, 0x3C94, 0x3C95, 0x3C96, 0x3C97, 0x3C98, 0x3C98, 0x3C99, 0x3C9A, + 0x3C9B, 0x3C9C, 0x3C9C, 0x3C9D, 0x3C9E, 0x3C9F, 0x3CA0, 0x3CA1, 0x3CA1, + 0x3CA2, 0x3CA3, 0x3CA4, 0x3CA5, 0x3CA5, 0x3CA6, 0x3CA7, 0x3CA8, 0x3CA9, + 0x3CAA, 0x3CAA, 0x3CAB, 0x3CAC, 0x3CAD, 0x3CAE, 0x3CAE, 0x3CAF, 0x3CB0, + 0x3CB1, 0x3CB2, 0x3CB3, 0x3CB3, 0x3CB4, 0x3CB5, 0x3CB6, 0x3CB7, 0x3CB8, + 0x3CB8, 0x3CB9, 0x3CBA, 0x3CBB, 0x3CBC, 0x3CBC, 0x3CBD, 0x3CBE, 0x3CBF, + 0x3CC0, 0x3CC1, 0x3CC1, 0x3CC2, 0x3CC3, 0x3CC4, 0x3CC5, 0x3CC5, 0x3CC6, + 0x3CC7, 0x3CC8, 0x3CC9, 0x3CCA, 0x3CCA, 0x3CCB, 0x3CCC, 0x3CCD, 0x3CCE, + 0x3CCE, 0x3CCF, 0x3CD0, 0x3CD1, 0x3CD2, 0x3CD3, 0x3CD3, 0x3CD4, 0x3CD5, + 0x3CD6, 0x3CD7, 0x3CD7, 0x3CD8, 0x3CD9, 0x3CDA, 0x3CDB, 0x3CDC, 0x3CDC, + 0x3CDD, 0x3CDE, 0x3CDF, 0x3CE0, 0x3CE0, 0x3CE1, 0x3CE2, 0x3CE3, 0x3CE4, + 0x3CE5, 0x3CE5, 0x3CE6, 0x3CE7, 0x3CE8, 0x3CE9, 0x3CE9, 0x3CEA, 0x3CEB, + 0x3CEC, 0x3CED, 0x3CEE, 0x3CEE, 0x3CEF, 0x3CF0, 0x3CF1, 0x3CF2, 0x3CF2, + 0x3CF3, 0x3CF4, 0x3CF5, 0x3CF6, 0x3CF7, 0x3CF7, 0x3CF8, 0x3CF9, 0x3CFA, + 0x3CFB, 0x3CFB, 0x3CFC, 0x3CFD, 0x3CFE, 0x3CFF, 0x3D00, 0x3D00, 0x3D01, + 0x3D01, 0x3D01, 0x3D02, 0x3D02, 0x3D03, 0x3D03, 0x3D03, 0x3D04, 0x3D04, + 0x3D05, 0x3D05, 0x3D06, 0x3D06, 0x3D06, 0x3D07, 0x3D07, 0x3D08, 0x3D08, + 0x3D08, 0x3D09, 0x3D09, 0x3D0A, 0x3D0A, 0x3D0A, 0x3D0B, 0x3D0B, 0x3D0C, + 0x3D0C, 0x3D0C, 0x3D0D, 0x3D0D, 0x3D0E, 0x3D0E, 0x3D0F, 0x3D0F, 0x3D0F, + 0x3D10, 0x3D10, 0x3D11, 0x3D11, 0x3D11, 0x3D12, 0x3D12, 0x3D13, 0x3D13, + 0x3D13, 0x3D14, 0x3D14, 0x3D15, 0x3D15, 0x3D16, 0x3D16, 0x3D16, 0x3D17, + 0x3D17, 0x3D18, 0x3D18, 0x3D18, 0x3D19, 0x3D19, 0x3D1A, 0x3D1A, 0x3D1A, + 0x3D1B, 0x3D1B, 0x3D1C, 0x3D1C, 0x3D1C, 0x3D1D, 0x3D1D, 0x3D1E, 0x3D1E, + 0x3D1F, 0x3D1F, 0x3D1F, 0x3D20, 0x3D20, 0x3D21, 0x3D21, 0x3D21, 0x3D22, + 0x3D22, 0x3D23, 0x3D23, 0x3D23, 0x3D24, 0x3D24, 0x3D25, 0x3D25, 0x3D25, + 0x3D26, 0x3D26, 0x3D27, 0x3D27, 0x3D28, 0x3D28, 0x3D28, 0x3D29, 0x3D29, + 0x3D2A, 0x3D2A, 0x3D2A, 0x3D2B, 0x3D2B, 0x3D2C, 0x3D2C, 0x3D2C, 0x3D2D, + 0x3D2D, 0x3D2E, 0x3D2E, 0x3D2E, 0x3D2F, 0x3D2F, 0x3D30, 0x3D30, 0x3D31, + 0x3D31, 0x3D31, 0x3D32, 0x3D32, 0x3D33, 0x3D33, 0x3D33, 0x3D34, 0x3D34, + 0x3D35, 0x3D35, 0x3D35, 0x3D36, 0x3D36, 0x3D37, 0x3D37, 0x3D38, 0x3D38, + 0x3D38, 0x3D39, 0x3D39, 0x3D3A, 0x3D3A, 0x3D3A, 0x3D3B, 0x3D3B, 0x3D3C, + 0x3D3C, 0x3D3C, 0x3D3D, 0x3D3D, 0x3D3E, 0x3D3E, 0x3D3E, 0x3D3F, 0x3D3F, + 0x3D40, 0x3D40, 0x3D41, 0x3D41, 0x3D41, 0x3D42, 0x3D42, 0x3D43, 0x3D43, + 0x3D43, 0x3D44, 0x3D44, 0x3D45, 0x3D45, 0x3D45, 0x3D46, 0x3D46, 0x3D47, + 0x3D47, 0x3D47, 0x3D48, 0x3D48, 0x3D49, 0x3D49, 0x3D4A, 0x3D4A, 0x3D4A, + 0x3D4B, 0x3D4B, 0x3D4C, 0x3D4C, 0x3D4C, 0x3D4D, 0x3D4D, 0x3D4E, 0x3D4E, + 0x3D4E, 0x3D4F, 0x3D4F, 0x3D50, 0x3D50, 0x3D50, 0x3D51, 0x3D51, 0x3D52, + 0x3D52, 0x3D53, 0x3D53, 0x3D53, 0x3D54, 0x3D54, 0x3D55, 0x3D55, 0x3D55, + 0x3D56, 0x3D56, 0x3D57, 0x3D57, 0x3D57, 0x3D58, 0x3D58, 0x3D59, 0x3D59, + 0x3D59, 0x3D5A, 0x3D5A, 0x3D5B, 0x3D5B, 0x3D5C, 0x3D5C, 0x3D5C, 0x3D5D, + 0x3D5D, 0x3D5E, 0x3D5E, 0x3D5E, 0x3D5F, 0x3D5F, 0x3D60, 0x3D60, 0x3D60, + 0x3D61, 0x3D61, 0x3D62, 0x3D62, 0x3D63, 0x3D63, 0x3D63, 0x3D64, 0x3D64, + 0x3D65, 0x3D65, 0x3D65, 0x3D66, 
0x3D66, 0x3D67, 0x3D67, 0x3D67, 0x3D68, + 0x3D68, 0x3D69, 0x3D69, 0x3D69, 0x3D6A, 0x3D6A, 0x3D6B, 0x3D6B, 0x3D6C, + 0x3D6C, 0x3D6C, 0x3D6D, 0x3D6D, 0x3D6E, 0x3D6E, 0x3D6E, 0x3D6F, 0x3D6F, + 0x3D70, 0x3D70, 0x3D70, 0x3D71, 0x3D71, 0x3D72, 0x3D72, 0x3D72, 0x3D73, + 0x3D73, 0x3D74, 0x3D74, 0x3D75, 0x3D75, 0x3D75, 0x3D76, 0x3D76, 0x3D77, + 0x3D77, 0x3D77, 0x3D78, 0x3D78, 0x3D79, 0x3D79, 0x3D79, 0x3D7A, 0x3D7A, + 0x3D7B, 0x3D7B, 0x3D7B, 0x3D7C, 0x3D7C, 0x3D7D, 0x3D7D, 0x3D7E, 0x3D7E, + 0x3D7E, 0x3D7F, 0x3D7F, 0x3D80, 0x3D80, 0x3D80, 0x3D80, 0x3D81, 0x3D81, + 0x3D81, 0x3D81, 0x3D81, 0x3D82, 0x3D82, 0x3D82, 0x3D82, 0x3D82, 0x3D83, + 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D84, 0x3D84, 0x3D84, 0x3D84, 0x3D85, + 0x3D85, 0x3D85, 0x3D85, 0x3D85, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D86, + 0x3D87, 0x3D87, 0x3D87, 0x3D87, 0x3D87, 0x3D88, 0x3D88, 0x3D88, 0x3D88, + 0x3D88, 0x3D89, 0x3D89, 0x3D89, 0x3D89, 0x3D89, 0x3D8A, 0x3D8A, 0x3D8A, + 0x3D8A, 0x3D8A, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8C, 0x3D8C, + 0x3D8C, 0x3D8C, 0x3D8C, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8E, 0x3D8E, + 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D90, + 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D91, 0x3D91, 0x3D91, 0x3D91, 0x3D91, + 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D93, 0x3D93, 0x3D93, 0x3D93, + 0x3D93, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D95, 0x3D95, 0x3D95, + 0x3D95, 0x3D96, 0x3D96, 0x3D96, 0x3D96, 0x3D96, 0x3D97, 0x3D97, 0x3D97, + 0x3D97, 0x3D97, 0x3D98, 0x3D98, 0x3D98, 0x3D98, 0x3D98, 0x3D99, 0x3D99, + 0x3D99, 0x3D99, 0x3D99, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9B, + 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, + 0x3D9D, 0x3D9D, 0x3D9D, 0x3D9D, 0x3D9D, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9E, + 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA0, + 0x3DA0, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA2, 0x3DA2, 0x3DA2, + 0x3DA2, 0x3DA2, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA4, 0x3DA4, + 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA6, + 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA7, 0x3DA7, 0x3DA7, 0x3DA7, 0x3DA7, 0x3DA8, + 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, + 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, + 0x3DAB, 0x3DAC, 0x3DAC, 0x3DAC, 0x3DAC, 0x3DAC, 0x3DAD, 0x3DAD, 0x3DAD, + 0x3DAD, 0x3DAD, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAF, 0x3DAF, + 0x3DAF, 0x3DAF, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB1, 0x3DB1, + 0x3DB1, 0x3DB1, 0x3DB1, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB3, + 0x3DB3, 0x3DB3, 0x3DB3, 0x3DB3, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, + 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB6, + 0x3DB6, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB8, 0x3DB8, 0x3DB8, 0x3DB8, + 0x3DB8, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DBA, 0x3DBA, 0x3DBA, + 0x3DBA, 0x3DBA, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBC, 0x3DBC, + 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBD, 0x3DBD, 0x3DBD, 0x3DBD, 0x3DBD, 0x3DBE, + 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, + 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, + 0x3DC2, 0x3DC2, 0x3DC2, 0x3DC2, 0x3DC2, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, + 0x3DC3, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC5, 0x3DC5, 0x3DC5, + 0x3DC5, 0x3DC5, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC7, 0x3DC7, + 0x3DC7, 0x3DC7, 0x3DC7, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC9, + 0x3DC9, 0x3DC9, 0x3DC9, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCB, + 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 
0x3DCC, 0x3DCC, 0x3DCC, 0x3DCC, 0x3DCC, + 0x3DCD, 0x3DCE, 0x3DCF, 0x3DD0, 0x3DD1, 0x3DD2, 0x3DD3, 0x3DD4, 0x3DD5, + 0x3DD6, 0x3DD7, 0x3DD8, 0x3DD9, 0x3DDA, 0x3DDB, 0x3DDC, 0x3DDD, 0x3DDE, + 0x3DDF, 0x3DE0, 0x3DE1, 0x3DE2, 0x3DE3, 0x3DE4, 0x3DE5, +}; + +static u16 test_pattern_ref[] = { + 0x0, 0x3c24, 0x3c68, 0x3c8e, 0x3ca4, 0x3cb7, 0x3cc8, 0x3cd9, 0x3ce8, + 0x3cf6, 0x3d01, 0x3d08, 0x3d0e, 0x3d14, 0x3d19, 0x3d1f, 0x3d24, 0x3d29, + 0x3d2e, 0x3d33, 0x3d37, 0x3d3c, 0x3d40, 0x3d45, 0x3d48, 0x3d4d, 0x3d51, + 0x3d55, 0x3d59, 0x3d5d, 0x3d61, 0x3d64, 0x3d68, 0x3d6b, 0x3d6f, 0x3d72, + 0x3d76, 0x3d79, 0x3d7c, 0x3d80, 0x3d81, 0x3d83, 0x3d85, 0x3d86, 0x3d88, + 0x3d89, 0x3d8b, 0x3d8c, 0x3d8e, 0x3d90, 0x3d91, 0x3d92, 0x3d94, 0x3d95, + 0x3d97, 0x3d98, 0x3d99, 0x3d9b, 0x3d9c, 0x3d9d, 0x3d9f, 0x3da0, 0x3da1, + 0x3da2, 0x3da4, 0x3da5, 0x3da6, 0x3da8, 0x3da9, 0x3daa, 0x3dab, 0x3dad, + 0x3dae, 0x3daf, 0x3db0, 0x3db1, 0x3db3, 0x3db4, 0x3db5, 0x3db6, 0x3db7, + 0x3db9, 0x3db9, 0x3dbb, 0x3dbc, 0x3dbd, 0x3dbe, 0x3dbf, 0x3dc0, 0x3dc1, + 0x3dc2, 0x3dc3, 0x3dc5, 0x3dc5, 0x3dc7, 0x3dc8, 0x3dc8, 0x3dca, 0x3dcb, + 0x3dcc, 0x3dcd, 0x3dce, 0x3dcf, 0x3dd0, 0x3dd1, 0x3dd2, 0x3dd3, 0x3dd4, + 0x3dd5, 0x3dd6, 0x3dd7, 0x3dd8, 0x3dd9, 0x3dda, 0x3ddb, 0x3ddb, 0x3ddd, + 0x3dde, 0x3dde, 0x3ddf, 0x3de1, 0x3de1, 0x3de2, 0x3de3, 0x3de4, 0x3de5, + 0x3de6, 0x3de7, 0x3de8, 0x3de8, 0x3dea, 0x3deb, 0x3deb, 0x3dec, 0x3ded, + 0x3dee, 0x3def, 0x3def, 0x3df1, 0x3df2, 0x3df2, 0x3df3, 0x3df4, 0x3df5, + 0x3df6, 0x3df7, 0x3df7, 0x3df8, 0x3df9, 0x3dfa, 0x3dfb, 0x3dfb, 0x3dfc, + 0x3dfd, 0x3dfe, 0x3dff, 0x3e00, 0x3e00, 0x3e00, 0x3e01, 0x3e01, 0x3e02, + 0x3e02, 0x3e03, 0x3e03, 0x3e03, 0x3e04, 0x3e04, 0x3e05, 0x3e05, 0x3e05, + 0x3e06, 0x3e06, 0x3e07, 0x3e07, 0x3e07, 0x3e08, 0x3e08, 0x3e09, 0x3e09, + 0x3e09, 0x3e0a, 0x3e0a, 0x3e0b, 0x3e0b, 0x3e0b, 0x3e0b, 0x3e0c, 0x3e0c, + 0x3e0d, 0x3e0d, 0x3e0d, 0x3e0e, 0x3e0e, 0x3e0f, 0x3e0f, 0x3e10, 0x3e10, + 0x3e10, 0x3e10, 0x3e11, 0x3e11, 0x3e11, 0x3e12, 0x3e12, 0x3e13, 0x3e13, + 0x3e14, 0x3e14, 0x3e14, 0x3e14, 0x3e15, 0x3e15, 0x3e15, 0x3e16, 0x3e16, + 0x3e17, 0x3e17, 0x3e17, 0x3e17, 0x3e18, 0x3e18, 0x3e19, 0x3e19, 0x3e19, + 0x3e19, 0x3e1a, 0x3e1a, 0x3e1b, 0x3e1b, 0x3e1b, 0x3e1c, 0x3e1c, 0x3e1c, + 0x3e1d, 0x3e1d, 0x3e1d, 0x3e1e, 0x3e1e, 0x3e1e, 0x3e1f, 0x3e1f, 0x3e1f, + 0x3e20, 0x3e20, 0x3e20, 0x3e21, 0x3e21, 0x3e21, 0x3e22, 0x3e22, 0x3e22, + 0x3e22, 0x3e23, 0x3e23, 0x3e24, 0x3e24, 0x3e24, 0x3e24, 0x3e25, 0x3e25, + 0x3e26, 0x3e26, 0x3e26, 0x3e26, 0x3e27, 0x3e27, 0x3e27, 0x3e28, 0x3e28, + 0x3e28, 0x3e29, 0x3e29, 0x3e29, 0x3e29, 0x3e2a, 0x3e2a, 0x3e2a, 0x3e2b, + 0x3e2b, 0x3e2b, 0x3e2c, 0x3e2c, 0x3e2c, 0x3e2d, 0x3e2d, 0x3e2d, 0x3e2d, + 0x3e2e, 0x3e2e, 0x3e2f, 0x3e2f, 0x3e2f, 0x3e2f, 0x3e30, 0x3e30, 0x3e30, + 0x3e30, 0x3e31, 0x3e31, 0x3e31, 0x3e32, 0x3e32, 0x3e32, 0x3e33, 0x3e33, + 0x3e33, 0x3e33, 0x3e34, 0x3e34, 0x3e34, 0x3e35, 0x3e35, 0x3e35, 0x3e36, + 0x3e36, 0x3e36, 0x3e36, 0x3e36, 0x3e37, 0x3e37, 0x3e37, 0x3e38, 0x3e38, + 0x3e39, 0x3e39, 0x3e39, 0x3e39, 0x3e39, 0x3e3a, 0x3e3a, 0x3e3b, 0x3e3b, + 0x3e3b, 0x3e3b, 0x3e3b, 0x3e3c, 0x3e3c, 0x3e3c, 0x3e3d, 0x3e3d, 0x3e3d, + 0x3e3d, 0x3e3d, 0x3e3e, 0x3e3e, 0x3e3f, 0x3e3f, 0x3e3f, 0x3e3f, 0x3e3f, + 0x3e40, 0x3e40, 0x3e41, 0x3e41, 0x3e41, 0x3e41, 0x3e41, 0x3e42, 0x3e42, + 0x3e42, 0x3e43, 0x3e43, 0x3e43, 0x3e43, 0x3e44, 0x3e44, 0x3e44, 0x3e45, + 0x3e45, 0x3e45, 0x3e45, 0x3e45, 0x3e46, 0x3e46, 0x3e47, 0x3e47, 0x3e47, + 0x3e47, 0x3e47, 0x3e48, 0x3e48, 0x3e48, 0x3e48, 0x3e48, 0x3e49, 0x3e49, + 0x3e4a, 0x3e4a, 0x3e4a, 0x3e4a, 0x3e4a, 0x3e4b, 0x3e4b, 0x3e4b, 0x3e4c, + 0x3e4c, 
0x3e4c, 0x3e4c, 0x3e4c, 0x3e4d, 0x3e4d, 0x3e4e, 0x3e4e, 0x3e4e, + 0x3e4e, 0x3e4e, 0x3e4f, 0x3e4f, 0x3e4f, 0x3e4f, 0x3e4f, 0x3e50, 0x3e50, + 0x3e51, 0x3e51, 0x3e51, 0x3e51, 0x3e51, 0x3e52, 0x3e52, 0x3e52, 0x3e52, + 0x3e52, 0x3e53, 0x3e53, 0x3e53, 0x3e54, 0x3e54, 0x3e54, 0x3e54, 0x3e55, + 0x3e55, 0x3e55, 0x3e55, 0x3e55, 0x3e56, 0x3e56, 0x3e56, 0x3e57, 0x3e57, + 0x3e57, 0x3e57, 0x3e57, 0x3e58, 0x3e58, 0x3e58, 0x3e58, 0x3e59, 0x3e59, + 0x3e59, 0x3e5a, 0x3e5a, 0x3e5a, 0x3e5a, 0x3e5a, 0x3e5b, 0x3e5b, 0x3e5b, + 0x3e5b, 0x3e5b, 0x3e5c, 0x3e5c, 0x3e5d, 0x3e5d, 0x3e5d, 0x3e5d, 0x3e5d, + 0x3e5e, 0x3e5e, 0x3e5e, 0x3e5e, 0x3e5e, 0x3e5f, 0x3e5f, 0x3e5f, 0x3e5f, + 0x3e5f, 0x3e60, 0x3e60, 0x3e61, 0x3e61, 0x3e61, 0x3e61, 0x3e61, 0x3e62, + 0x3e62, 0x3e62, 0x3e62, 0x3e62, 0x3e63, 0x3e63, 0x3e63, 0x3e63, 0x3e63, + 0x3e64, 0x3e64, 0x3e65, 0x3e65, 0x3e65, 0x3e65, 0x3e65, 0x3e66, 0x3e66, + 0x3e66, 0x3e66, 0x3e66, 0x3e67, 0x3e67, 0x3e67, 0x3e67, 0x3e67, 0x3e68, + 0x3e68, 0x3e68, 0x3e68, 0x3e68, 0x3e69, 0x3e69, 0x3e6a, 0x3e6a, 0x3e6a, + 0x3e6a, 0x3e6a, 0x3e6b, 0x3e6b, 0x3e6b, 0x3e6b, 0x3e6b, 0x3e6c, 0x3e6c, + 0x3e6c, 0x3e6c, 0x3e6c, 0x3e6d, 0x3e6d, 0x3e6d, 0x3e6d, 0x3e6d, 0x3e6e, + 0x3e6e, 0x3e6e, 0x3e6e, 0x3e6e, 0x3e6f, 0x3e6f, 0x3e6f, 0x3e6f, 0x3e6f, + 0x3e70, 0x3e70, 0x3e71, 0x3e71, 0x3e71, 0x3e71, 0x3e71, 0x3e72, 0x3e72, + 0x3e72, 0x3e72, 0x3e72, 0x3e73, 0x3e73, 0x3e73, 0x3e73, 0x3e73, 0x3e74, + 0x3e74, 0x3e74, 0x3e74, 0x3e74, 0x3e75, 0x3e75, 0x3e75, 0x3e75, 0x3e76, + 0x3e76, 0x3e76, 0x3e76, 0x3e76, 0x3e77, 0x3e77, 0x3e77, 0x3e77, 0x3e77, + 0x3e78, 0x3e78, 0x3e78, 0x3e78, 0x3e78, 0x3e79, 0x3e79, 0x3e79, 0x3e79, + 0x3e79, 0x3e7a, 0x3e7a, 0x3e7a, 0x3e7a, 0x3e7a, 0x3e7b, 0x3e7b, 0x3e7b, + 0x3e7b, 0x3e7b, 0x3e7c, 0x3e7c, 0x3e7c, 0x3e7c, 0x3e7c, 0x3e7d, 0x3e7d, + 0x3e7d, 0x3e7d, 0x3e7d, 0x3e7e, 0x3e7e, 0x3e7e, 0x3e7e, 0x3e7f, 0x3e7f, + 0x3e7f, 0x3e7f, 0x3e7f, 0x3e80, 0x3e80, 0x3e80, 0x3e80, 0x3e80, 0x3e80, + 0x3e80, 0x3e80, 0x3e80, 0x3e81, 0x3e81, 0x3e81, 0x3e81, 0x3e81, 0x3e81, + 0x3e81, 0x3e81, 0x3e81, 0x3e81, 0x3e82, 0x3e82, 0x3e82, 0x3e82, 0x3e82, + 0x3e82, 0x3e82, 0x3e82, 0x3e82, 0x3e83, 0x3e83, 0x3e83, 0x3e83, 0x3e83, + 0x3e83, 0x3e83, 0x3e83, 0x3e83, 0x3e83, 0x3e84, 0x3e84, 0x3e84, 0x3e84, + 0x3e84, 0x3e84, 0x3e84, 0x3e84, 0x3e84, 0x3e84, 0x3e85, 0x3e85, 0x3e85, + 0x3e85, 0x3e85, 0x3e85, 0x3e85, 0x3e85, 0x3e85, 0x3e85, 0x3e86, 0x3e86, + 0x3e86, 0x3e86, 0x3e86, 0x3e86, 0x3e86, 0x3e86, 0x3e86, 0x3e87, 0x3e87, + 0x3e87, 0x3e87, 0x3e87, 0x3e87, 0x3e87, 0x3e87, 0x3e87, 0x3e87, 0x3e88, + 0x3e88, 0x3e88, 0x3e88, 0x3e88, 0x3e88, 0x3e88, 0x3e88, 0x3e88, 0x3e88, + 0x3e89, 0x3e89, 0x3e89, 0x3e89, 0x3e89, 0x3e89, 0x3e89, 0x3e89, 0x3e89, + 0x3e89, 0x3e8a, 0x3e8a, 0x3e8a, 0x3e8a, 0x3e8a, 0x3e8a, 0x3e8a, 0x3e8a, + 0x3e8a, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, + 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8c, 0x3e8c, + 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8d, + 0x3e8d, 0x3e8d, 0x3e8d, 0x3e8d, 0x3e8d, 0x3e8d, 0x3e8d, 0x3e8d, 0x3e8d, + 0x3e8e, 0x3e8e, 0x3e8e, 0x3e8e, 0x3e8e, 0x3e8e, 0x3e8e, 0x3e8e, 0x3e8e, + 0x3e8f, 0x3e8f, 0x3e8f, 0x3e8f, 0x3e8f, 0x3e8f, 0x3e8f, 0x3e8f, 0x3e8f, + 0x3e8f, 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e90, + 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e91, 0x3e91, + 0x3e91, 0x3e91, 0x3e91, 0x3e91, 0x3e91, 0x3e91, 0x3e91, 0x3e91, 0x3e92, + 0x3e92, 0x3e92, 0x3e92, 0x3e92, 0x3e92, 0x3e92, 0x3e92, 0x3e92, 0x3e93, + 0x3e93, 0x3e93, 0x3e93, 0x3e93, 0x3e93, 0x3e93, 0x3e93, 0x3e93, 0x3e93, + 0x3e94, 
0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94,
+ 0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e95, 0x3e95, 0x3e95,
+ 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e96, 0x3e96,
+ 0x3e96, 0x3e96, 0x3e96, 0x3e96, 0x3e96, 0x3e96, 0x3e96, 0x3e97, 0x3e97,
+ 0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e97,
+ 0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e98, 0x3e98, 0x3e98, 0x3e98, 0x3e98,
+ 0x3e98, 0x3e98, 0x3e98, 0x3e98, 0x3e98, 0x3e99, 0x3e99, 0x3e99, 0x3e99,
+ 0x3e99, 0x3e99, 0x3e99, 0x3e99, 0x3e99, 0x3e99, 0x3e99, 0x3e99, 0x3e99,
+ 0x3e99, 0x3e9a, 0x3e9a, 0x3e9a, 0x3e9a, 0x3e9a, 0x3e9a, 0x3e9a, 0x3e9a,
+ 0x3e9a, 0x3e9a, 0x3e9b, 0x3e9b, 0x3e9b, 0x3e9b, 0x3e9b, 0x3e9b, 0x3e9b,
+ 0x3e9b, 0x3e9b, 0x3e9b, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c,
+ 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c,
+ 0x3e9d, 0x3e9d, 0x3e9d, 0x3e9d, 0x3e9d, 0x3e9d, 0x3e9d, 0x3e9d, 0x3e9d,
+ 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e,
+ 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9f, 0x3e9f, 0x3e9f,
+ 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3ea0, 0x3ea0,
+ 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0,
+ 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea1, 0x3ea1, 0x3ea1, 0x3ea1, 0x3ea1, 0x3ea1,
+ 0x3ea1, 0x3ea1, 0x3ea1, 0x3ea1, 0x3ea2, 0x3ea2, 0x3ea2, 0x3ea2, 0x3ea2,
+ 0x3ea2, 0x3ea2, 0x3ea3, 0x3ea3, 0x3ea4, 0x3ea4, 0x3ea4, 0x3ea5, 0x3ea5,
+ 0x3ea6, 0x3ea6, 0x3ea6, 0x3ea7, 0x3ea7, 0x3ea7, 0x3ea8, 0x3ea8, 0x3ea9,
+ 0x3ea9, 0x3ea9, 0x3eaa, 0x3eaa, 0x3eaa, 0x3eab, 0x3eab,
+};
+
+static void tl_lut_ref(u16 *ofmap, u16 *ifmap, tl_shape_t ifmap_shape)
+{
+  for (u32 i = 0; i < tl_shape_size(&ifmap_shape); i++) {
+    if (mode == PRE_DATA_COMPARE_FIX) {
+      ofmap[i] = test_pattern_ref[i];
+    } else {
+      ofmap[i] = convert_fp32_bf16(pow(convert_bf16_fp32(ifmap[i]), 0.5));
+    }
+  }
+}
+
+static bool verify(u16 *ofmap_data, u16 *ref_data, u16 *ifmap,
+                   u64 ifmap_shape_size, TEST_MODE mode)
+{
+  u64 size = ifmap_shape_size;
+
+  for (u64 i = 0; i < size; i++) {
+    bool is_close;
+    u16 ref;
+    u16 ofmap_data_bf16;
+    float ref_f;
+    float ofmap_data_f;
+
+    ref = ref_data[i];
+    ref_f = convert_bf16_fp32(ref);
+    ofmap_data_f = convert_bf16_fp32(ofmap_data[i]);
+    ofmap_data_bf16 = ofmap_data[i];
+
+    if (mode == PRE_DATA_COMPARE_FIX) {
+      is_close = ofmap_data[i] == ref;
+    } else {
+      is_close = fabs(ref_f - ofmap_data_f) < 0.001;
+    }
+
+    if (!is_close) {
+      fprintf(stderr,
+              "comparing failed at ofmap_data[%" PRIu64 "](input:%e), got %x, exp %x, "
+              "fp32: got %e exp %e\n",
+              i, convert_bf16_fp32(ifmap[i]), ofmap_data_bf16, ref,
+              ofmap_data_f, ref_f);
+      exit(-1);
+    }
+  }
+
+  return true;
+}
+
+static void gen_input(u16 *ifmap, u64 ifmap_shape_size)
+{
+  if (mode == PRE_DATA_COMPARE_FIX) {
+    memcpy(ifmap, &test_pattern, sizeof(test_pattern));
+  } else {
+    // seed once, outside the generation loop
+    srand(static_cast<unsigned>(time(0)));
+    std::random_device rd;
+    std::mt19937 e2(rd()); // only feeds the commented-out distribution below
+    float LO = pow(2, -10);
+    float HI = pow(2, 10);
+    // std::uniform_real_distribution<> dist(pow(2,-62), pow(2,63));
+    for (u64 i = 0; i < ifmap_shape_size; i++) {
+      // float r3 = dist(e2);
+      float r3 = LO + static_cast<float>(rand()) /
+                 (static_cast<float>(RAND_MAX / (HI - LO)));
+      ifmap[i] = convert_fp32_bf16(r3);
+    }
+  }
+
+#ifdef DBG
+  for (u64 i = 0; i < ifmap_shape_size; i++) {
+    printf("source if[%" PRIu64 "] bf16 %f 0x%x, log2f is %f\n", i,
+           convert_bf16_fp32(ifmap[i]), ifmap[i],
+           floor(log2((convert_bf16_fp32(ifmap[i])))));
+  }
+#endif /* ifdef DBG */
+}
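+
+// Assumed semantics of the conversion helpers used throughout these tests
+// (not restated anywhere in this diff): convert_fp32_bf16() keeps the upper
+// 16 bits of the fp32 encoding (with rounding) and convert_bf16_fp32()
+// widens by appending 16 zero bits, so e.g. 1.0f (0x3F800000) <-> bf16
+// 0x3F80 round-trips exactly.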
+
+static void testbench(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, u32 input_n, u32 input_c,
+                      u32 input_h, u32 input_w)
+{
+  fmt_t fmt = FMT_BF16;
+
+  // TODO: check more shape / align
+  tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w};
+  tl_shape_t ofmap_shape = ifmap_shape;
+  tl_shape_t table_shape;
+  bf16_table_shape(bmk, &table_shape);
+
+  u64 ifmap_shape_size = tl_shape_size(&ifmap_shape);
+  u64 ofmap_size = tl_shape_size(&ofmap_shape);
+  u64 table_size = tl_shape_size(&table_shape);
+
+  // prepare input data with size
+  int data_type_size = bytesize_of_fmt(fmt);
+  u64 ifmap_bytesize = ifmap_shape_size * data_type_size;
+  u64 ofmap_bytesize = ofmap_size * data_type_size;
+  u64 table_bytesize = table_size * data_type_size;
+
+  u16 *ifmap = (u16 *)xmalloc(ifmap_bytesize);
+  u16 *ref_data = (u16 *)xmalloc(ofmap_bytesize);
+  u16 *table_data = (u16 *)xmalloc(table_bytesize);
+  u16 *table_data_mantissa = (u16 *)xmalloc(table_bytesize);
+
+  // alloc lmem
+  tl_t *tl_ifmap = alloc_tl(bmk, ifmap_shape, fmt, /*align*/1);
+  tl_t *tl_ofmap_bf16 = alloc_tl(bmk, ofmap_shape, fmt, /*align*/1);
+  tl_t *tl_buf = tl_ifmap ? alloc_tl(bmk, tl_ifmap->shape, fmt, /*align*/1) : nullptr;
+  tl_t *tl_table_answer = alloc_tl(bmk, table_shape, fmt, /*align*/1);
+  tl_t *tl_table_answer_mantissa =
+    alloc_tl(bmk, table_shape, fmt, /*align*/1);
+
+  // generate testbench
+  gen_input(ifmap, ifmap_shape_size);
+  tl_lut_ref(ref_data, ifmap, ifmap_shape);
+
+  // prepare table
+  bf16_sqrt_tbl(table_data, table_data_mantissa, &table_shape);
+
+  // sys->lmem
+  put_bf16_tensor_g2l(ctx, bmk, tl_ifmap, (u16 *)ifmap, FMT_BF16);
+  put_bf16_tensor_g2l(ctx, bmk, tl_table_answer, (u16 *)table_data, FMT_BF16);
+  put_bf16_tensor_g2l(ctx, bmk, tl_table_answer_mantissa,
+                      (u16 *)table_data_mantissa, FMT_BF16);
+
+  bf16_emit_sqrt(bmk, tl_ifmap, tl_buf, tl_table_answer,
+                 tl_table_answer_mantissa, tl_ofmap_bf16);
+
+  // issue cmd
+  test_submit(ctx);
+
+  // get output from lmem->sys
+  u16 *ofmap_data =
+    (u16 *)get_bf16_tensor_l2g(ctx, bmk, tl_ofmap_bf16, tl_ofmap_bf16->fmt);
+
+  verify(ofmap_data, ref_data, ifmap, ifmap_shape_size, mode);
+
+  free_tl(bmk, tl_table_answer_mantissa);
+  free_tl(bmk, tl_table_answer);
+  free_tl(bmk, tl_buf);
+  free_tl(bmk, tl_ofmap_bf16);
+  free_tl(bmk, tl_ifmap);
+
+  free(ifmap);
+  free(ref_data);
+  free(ofmap_data);
+  free(table_data);
+  free(table_data_mantissa);
+}
+
+int main()
+{
+  CVI_RT_HANDLE ctx;
+  bmk_ctx_t *bmk;
+  int round_mode;
+
+  round_mode = set_store_feround();
+
+  test_init(&ctx, &bmk);
+
+  for (int i = PRE_DATA_COMPARE_FIX; i < TEST_MODE_MAX; i++) {
+    mode = static_cast<TEST_MODE>(i);
+    printf("test mode %d...\n", mode);
+
+    int input_n = 1;
+    int input_c = 32;
+    int input_h = 1;
+    int input_w = 1;
+
+    if (mode == PRE_DATA_COMPARE_FIX) {
+      input_h = 4;
+      input_w = 8;
+    } else {
+      input_h = input_w = 16;
+    }
+
+    testbench(&ctx, bmk, input_n, input_c, input_h, input_w);
+  }
+
+  test_exit(&ctx);
+  restore_feround(round_mode);
+  return 0;
+}
diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tdma_l2l_tensor_lrn_shift.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tdma_l2l_tensor_lrn_shift.cpp
new file mode 100644
index 000000000..3b3add499
--- /dev/null
+++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tdma_l2l_tensor_lrn_shift.cpp
@@ -0,0 +1,196 @@
+#include "../1880v2_test_util.h"
+#include "1880v2_bf16_util.h"
+
+
+typedef bmk1880v2_tdma_l2l_tensor_lrn_shift_param_t param_t;
+
+static void __print_param(const char *tag, FILE *f, param_t *p)
+{
+  fprintf(
+    f,
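+    // output reads "<func>: (src n,c,h,w) <-k (dst n,c,h,w)" for a left
+    // shift by k lrn steps, and "(src ...) k-> (dst ...)" for a right shift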
"%s: (%u, %u, %u, %u) %s%u%s (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + (p->right_shift? "": "<-"), + p->lrn_step, + (p->right_shift? "->": ""), + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + u32 n; + u32 c; + u32 src_h; + u32 src_w; + u32 dst_h; + u32 dst_w; +} case_t; + +static case_t g_cases[] = { + { 0, 0, 1, 1, 1, 1 }, + { 0, 0, 3, 7, 7, 3 }, + { 0, 0, 4, 4, 2, 8 }, + { 0, 0, 7, 7, 1, 49 }, + { 0, 0, 7, 8, 14, 4 }, + { 0, 0, 14, 6, 12, 7 }, +}; + +static void destroy_param(bmk_ctx_t *bmk, param_t *p) +{ + free_tl(bmk, p->dst); + free_tl(bmk, p->src); +} + +static void lrn_left_shift_ref(param_t *p, u16 ref_data[], u16 src_data[]) +{ + u32 n = p->src->shape.n; + u32 c = p->src->shape.c; + u32 hw = p->src->shape.h * p->src->shape.w; + u64 size = tl_shape_size(&p->src->shape); + + for (u64 i = 0; i < size; i++) + ref_data[i] = convert_int8_bf16(0, 1); + + for (u32 ni = 0; ni < n; ni++) { + for (u32 ci = p->lrn_step; ci < c; ci++) { + for (u32 hwi = 0; hwi < hw; hwi++) { + u32 src_i = (ni * c + ci) * hw + hwi; + u32 dst_i = src_i - p->lrn_step * hw; + ref_data[dst_i] = src_data[src_i]; + } + } + } +} + +static void lrn_right_shift_ref(param_t *p, u16 ref_data[], u16 src_data[]) +{ + u32 n = p->src->shape.n; + u32 c = p->src->shape.c; + u32 hw = p->src->shape.h * p->src->shape.w; + u64 size = tl_shape_size(&p->src->shape); + + for (u64 i = 0; i < size; i++) + ref_data[i] = convert_int8_bf16(0, 1); + + for (u32 ni = 0; ni < n; ni++) { + for (u32 ci = 0; ci < c - p->lrn_step; ci++) { + for (u32 hwi = 0; hwi < hw; hwi++) { + u32 src_i = (ni * c + ci) * hw + hwi; + u32 dst_i = src_i + p->lrn_step * hw; + ref_data[dst_i] = src_data[src_i]; + } + } + } +} + +static void l2l_tensor_lrn_shift_ref( + param_t *p, u16 ref_data[], u16 src_data[]) +{ + if (p->right_shift) + return lrn_right_shift_ref(p, ref_data, src_data); + else + return lrn_left_shift_ref(p, ref_data, src_data); +} + +static void test_param(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->src->shape); + + u16 *src_data = (u16 *)malloc(sizeof(u16) * size); + for (u64 i = 0; i < size; i++) { + src_data[i] = convert_int8_bf16(/*200 + */i, 1); +// printf(" src_data[%ld] %d(%d)\n", +// i, src_data[i], convert_bf16_s8(src_data[i])); + } + + put_bf16_tensor_g2l(ctx, bmk, p->src, src_data, FMT_BF16); + bmk1880v2_tdma_l2l_tensor_lrn_shift(bmk, p); + u16 *dst_data = (u16 *)get_bf16_tensor_l2g(ctx, bmk, p->dst, FMT_BF16); + + u16 *ref_data = (u16 *)malloc(sizeof(u16) * size); + l2l_tensor_lrn_shift_ref(p, ref_data, src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d(%d), exp %d(%d)\n", + i, + dst_data[i], convert_bf16_s8(dst_data[i]), + ref_data[i], convert_bf16_s8(ref_data[i])); + exit(-1); + } + } + + free(src_data); + free(dst_data); + free(ref_data); +} + +static void execute_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + static const u32 steps[] = { 1, 2, 4, 7 }; // less than npu_num/2 + u32 nr_steps = sizeof(steps) / sizeof(steps[0]); + + for (int src_align = 0; src_align < 2; src_align++) { + for (int dst_align = 0; dst_align < 2; dst_align++) { + tl_shape_t src_shape, dst_shape; + src_shape.n = c->n; + src_shape.c = c->c; + src_shape.h = c->src_h; + src_shape.w = c->src_w; + dst_shape.n = c->n; + dst_shape.c = 
c->c; + dst_shape.h = c->dst_h; + dst_shape.w = c->dst_w; + + param_t p; + memset(&p, 0, sizeof(p)); + p.src = alloc_tl(bmk, src_shape, FMT_BF16, src_align); + p.dst = alloc_tl(bmk, dst_shape, FMT_BF16, dst_align); + + for (u32 i = 0; i < nr_steps; i++) { + if (steps[i] >= p.src->shape.c) + break; + p.lrn_step = steps[i]; + + p.right_shift = 0; + test_param(ctx, bmk, &p); + + p.right_shift = 1; + test_param(ctx, bmk, &p); + } + + destroy_param(bmk, &p); + } + } +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *ca) +{ + for (u32 n = 1; n < 8; n += 2) { + ca->n = n; + for (u32 c = 1; c < 36; c += 3) { + ca->c = c; + execute_case(ctx, bmk, ca); + } + for (u32 c = 36; c < 66; c += 7) { + ca->c = c; + execute_case(ctx, bmk, ca); + } + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_add.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_add.cpp new file mode 100644 index 000000000..6348ede8b --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_add.cpp @@ -0,0 +1,106 @@ +#include "../1880v2_test_util.h" + +static void tl_add_ref( + u16 *ref_low, + u16 *a_low, + u16 *b_low, + u64 size, int relu_enable) +{ + for (u64 i = 0; i < size; i++) { + float ta = convert_bf16_fp32(a_low[i]); + float tb = convert_bf16_fp32(b_low[i]); + float res = ta + tb; + if(relu_enable && res <0) + res = 0; + ref_low[i] = convert_fp32_bf16(res); + } +} + +static void test_tl_add(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + int rshift_bits; + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + fmt_t fmt_type = FMT_BF16; + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + u64 size = n * c * h * w; + u64 data_size = size * (fmt_type == FMT_BF16 ? 
2 : 1);
+    u16 *a_low_data = (u16 *)xmalloc(data_size);
+    u16 *b_low_data = (u16 *)xmalloc(data_size);
+
+    for (u64 i = 0; i < size; i++) {
+      a_low_data[i] = convert_fp32_bf16(i);
+      b_low_data[i] = convert_fp32_bf16(i);
+    }
+    rshift_bits = 0;
+
+    u16 *ref_low_data = (u16 *)xmalloc(data_size);
+
+    tl_add_ref(ref_low_data,
+               a_low_data,
+               b_low_data,
+               size, relu_enable);
+
+    tl_t *tl_a_low = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align);
+    tl_t *tl_b_low = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align);
+    tl_t *tl_res_low = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align);
+
+    put_bf16_tensor_g2l(ctx, bk_ctx, tl_a_low, (u16*)a_low_data, fmt_type);
+    put_bf16_tensor_g2l(ctx, bk_ctx, tl_b_low, (u16*)b_low_data, fmt_type);
+    bmk1880v2_tiu_element_wise_add_param_t p4;
+    memset(&p4, 0, sizeof(p4));
+    p4.res_high = 0;
+    p4.res_low = tl_res_low;
+    p4.a_high = 0;
+    p4.a_low = tl_a_low;
+    p4.b_is_const = 0;
+    p4.b_high = 0;
+    p4.b_low = tl_b_low;
+    p4.rshift_bits = rshift_bits;
+    p4.relu_enable = relu_enable;
+    bmk1880v2_tiu_element_wise_add(bk_ctx, &p4);
+    u16 *res_low_data = (u16*)get_bf16_tensor_l2g(ctx, bk_ctx, tl_res_low, fmt_type);
+
+    for (u64 i = 0; i < size; i++) {
+      if (res_low_data[i] != ref_low_data[i]) {
+        fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %x, exp %x\n",
+                i, res_low_data[i], ref_low_data[i]);
+        exit(-1);
+      }
+    }
+
+    free_tl(bk_ctx, tl_res_low);
+    free_tl(bk_ctx, tl_b_low);
+    free_tl(bk_ctx, tl_a_low);
+
+    free(a_low_data);
+    free(b_low_data);
+    free(ref_low_data);
+    free(res_low_data);
+  }
+}
+
+int main()
+{
+  CVI_RT_HANDLE ctx;
+  bmk_ctx_t *bk_ctx;
+  test_init(&ctx, &bk_ctx);
+  int round_mode;
+  round_mode = set_store_feround();
+
+  test_tl_add(&ctx, bk_ctx, 0);
+  test_tl_add(&ctx, bk_ctx, 1);
+
+  restore_feround(round_mode);
+  test_exit(&ctx);
+  return 0;
+}
diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_add_const.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_add_const.cpp
new file mode 100644
index 000000000..b4d2b1f99
--- /dev/null
+++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_add_const.cpp
@@ -0,0 +1,96 @@
+#include "../1880v2_test_util.h"
+
+static void tl_add_const_ref(
+    u16 *ref_low,
+    u16 *a_low,
+    u16 b,
+    u64 size, int relu_enable)
+{
+  for (u64 i = 0; i < size; i++) {
+    float ta = convert_bf16_fp32(a_low[i]);
+    float tb = convert_bf16_fp32(b);
+    float res = ta + tb;
+    if (relu_enable && res < 0)
+      res = 0;
+    ref_low[i] = convert_fp32_bf16(res);
+  }
+}
+
+static void test_tl_add_const(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align)
+{
+  int n = 3;
+  int c = 39;
+  int h = 7;
+  int w = 37;
+  tl_shape_t tl_shape;
+  tl_shape.n = n;
+  tl_shape.c = c;
+  tl_shape.h = h;
+  tl_shape.w = w;
+
+  fmt_t fmt_type = FMT_BF16;
+  for (int relu_enable = 0; relu_enable < 2; relu_enable++) {
+    u64 size = n * c * h * w;
+    u64 data_size = size * (fmt_type == FMT_BF16 ? 
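+    /*
+     * b_is_const = 1 below broadcasts one bf16 scalar across the whole
+     * tensor, so the reference converts b once per element:
+     *   ref[i] = bf16(fp32(a[i]) + fp32(b))
+     */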
2 : 1); + + u16 *a_low_data = (u16 *)xmalloc(data_size); + u16 b = convert_fp32_bf16(-3); + + for (u64 i = 0; i < size; i++) { + a_low_data[i] = convert_fp32_bf16(i); + } + + u16 *ref_low_data = (u16 *)xmalloc(data_size); + tl_add_const_ref(ref_low_data, + a_low_data, + b, size,relu_enable); + + tl_t *tl_a_low = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_res_low = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + + put_bf16_tensor_g2l(ctx, bk_ctx, tl_a_low, (u16*) a_low_data, fmt_type); + + bmk1880v2_tiu_element_wise_add_param_t p4; + memset(&p4, 0, sizeof(p4)); + p4.res_high = 0; + p4.res_low = tl_res_low; + p4.a_high = 0; + p4.a_low = tl_a_low; + p4.b_is_const = 1; + p4.b_const.val = b; +// p4.b_const.is_signed = b_is_signed; +// p4.rshift_bits = rshift_bits; + p4.relu_enable = relu_enable; + bmk1880v2_tiu_element_wise_add(bk_ctx, &p4); + +// u8 *res_high_data = get_tensor_l2g(ctx, bk_ctx, tl_res_high); + u16 *res_low_data = (u16 *) get_bf16_tensor_l2g(ctx, bk_ctx, tl_res_low, fmt_type); + for (u64 i = 0; i < size; i++) { + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res_low); + free_tl(bk_ctx, tl_a_low); + + free(a_low_data); + free(ref_low_data); + free(res_low_data); + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_add_const(&ctx, bk_ctx, 0); + test_tl_add_const(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_copy.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_copy.cpp new file mode 100644 index 000000000..30ef5711f --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_copy.cpp @@ -0,0 +1,77 @@ +#include "../1880v2_test_util.h" + +static void tl_copy_ref(u16 *a, u16 *res, u64 size, fmt_t fmt_type) +{ + if(fmt_type == FMT_BF16) { + for (u64 i = 0; i < size; i++) + res[i] = a[i]; + } else { + u8* u8res = (u8*) res; + u8* u8a = (u8*) a; + for (u64 i = 0; i < size; i++) + u8res[i] = u8a[i]; + } +} + +static void test_tl_copy(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + fmt_t fmt_type = FMT_BF16; + u64 size = n * c * h * w; + u64 data_size = size * (fmt_type == FMT_BF16 ? 
2 : 1); + u16 *a_data = (u16 *)xmalloc(data_size); + for (u64 i = 0; i < size; i++) + a_data[i] = convert_fp32_bf16(rand()); + + u16 *ref_data = (u16 *)xmalloc(data_size); + tl_copy_ref(a_data, ref_data, size, fmt_type); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_res = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + + put_bf16_tensor_g2l(ctx, bk_ctx, tl_a, (u16 *)a_data, fmt_type); + bmk1880v2_tiu_element_wise_copy_param_t p10; + memset(&p10, 0, sizeof(p10)); + p10.dst = tl_res; + p10.src = tl_a; + bmk1880v2_tiu_element_wise_copy(bk_ctx, &p10); + u16 *res_data = (u16 *)get_bf16_tensor_l2g(ctx, bk_ctx, tl_res, fmt_type); + + for (u64 i = 0; i < size; i++) { + if (res_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, res_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(ref_data); + free(res_data); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_copy(&ctx, bk_ctx, 0); + test_tl_copy(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_copy_with_stride.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_copy_with_stride.cpp new file mode 100644 index 000000000..4ce671b02 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_copy_with_stride.cpp @@ -0,0 +1,214 @@ +#include "../1880v2_test_util.h" +#include "1880v2_bf16_util.h" + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_I8}, +}; + +static int npu_num = 32; + +static u64 shape_size(tl_shape_t s) +{ + return s.n * s.c * s.h * s.w; +} + +static tl_shape_t shape_of_stride( + tl_shape_t tl_shape, + bmk1880v2_tensor_lmem_stride_t tl_stride, + fmt_t fmt) +{ + tl_shape_t shape; + shape.n = tl_shape.n; + shape.c = npu_num; + shape.h = tl_stride.n / ((fmt == FMT_BF16) ?2:1); + shape.w = 1; + return shape; +} + +static void tl_copy_with_stride_ref( + void *src, + void *dst, + tl_shape_t shape, + bmk1880v2_tensor_lmem_stride_t src_stride, + bmk1880v2_tensor_lmem_stride_t dst_stride, + fmt_t fmt) +{ + int nsrc_byte = ((fmt == FMT_BF16) ? 2 : 1); + int n = shape.n; + int c = shape.c; + int h = shape.h; + int w = shape.w; + + tl_shape_t dst_stride_shape = shape_of_stride(shape, dst_stride, fmt); + + u16 *u16_ref = NULL; + u16 *u16_src = NULL; + u8 *u8_ref = NULL; + u8 *u8_src = NULL; + + int dst_size = + dst_stride_shape.n * + dst_stride_shape.c * + dst_stride_shape.h * + dst_stride_shape.w; + + if (fmt == FMT_BF16) { + u16_ref = (u16 *)dst; + u16_src = (u16 *)src; + } else { + u8_ref = (u8 *)dst; + u8_src = (u8 *)src; + } + + for (int i = 0; i < dst_size; i++) { + if (fmt == FMT_BF16) { + u16_ref[i] = 0x0; + } else { + u8_ref[i] = 0x0; + } + } + + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < c; ci++) { + for (int hi = 0; hi < h; hi++) { + for (int wi = 0; wi < w; wi++) { + int src_i = (ni * npu_num + ci % npu_num) * src_stride.n / nsrc_byte + + ci / npu_num * src_stride.c / nsrc_byte + + hi * src_stride.h / nsrc_byte + + wi; + int dst_i = (ni * npu_num + ci % npu_num) * dst_stride.n / nsrc_byte + + ci / npu_num * dst_stride.c / nsrc_byte + + hi * dst_stride.h / nsrc_byte + + wi; + if (fmt == FMT_BF16) { + u16_ref[dst_i] = u16_src[src_i]; + } else { + u8_ref[dst_i] = u8_src[src_i]; + } + } + } + } + } + + dst = (fmt == FMT_BF16) ? 
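+  /*
+   * This trailing assignment only updates the local copy of the 'dst'
+   * parameter; the reference results were already written through the
+   * u16_ref/u8_ref aliases above, so the caller sees them either way.
+   */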
(void *)u16_ref : (void *)u8_ref; +} + +static void test_tl_copy_with_stride( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + fmt_t fmt, + int eu_align) +{ + int n = 3; + int c = 38; + int h = 2; + int w = 3; + int c_layers = ALIGN(c, npu_num) / npu_num; + int nsrc_byte = ((fmt == FMT_BF16) ? 2 : 1); + + bmk1880v2_tensor_lmem_stride_t src_stride; + src_stride.w = nsrc_byte; + src_stride.h = (w + 3) * nsrc_byte; + src_stride.c = h * src_stride.h + (13 * nsrc_byte); + src_stride.n = c_layers * src_stride.c + (7 * nsrc_byte); + + bmk1880v2_tensor_lmem_stride_t dst_stride; + dst_stride.w = nsrc_byte; + dst_stride.h = (w + 1) * nsrc_byte; + dst_stride.c = h * dst_stride.h + (5 * nsrc_byte); + dst_stride.n = c_layers * dst_stride.c + (19 * nsrc_byte); + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + tl_shape_t src_stride_shape = shape_of_stride(tl_shape, src_stride, fmt); + tl_shape_t dst_stride_shape = shape_of_stride(tl_shape, dst_stride, fmt); + + int src_size = shape_size(src_stride_shape); + int dst_size = shape_size(dst_stride_shape); + + float val = -100; + void *src_data = NULL; + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * src_size); + s8 *s8src_data = (s8 *)malloc(sizeof(s8) * src_size); + void *dst_data = NULL; + u16 *u16dst_data = (u16 *)malloc(sizeof(u16) * dst_size); + s8 *s8dst_data = (s8 *)malloc(sizeof(s8) * dst_size); + void *result_x = NULL; + void *ref_x = NULL; + + // prepare source data + ref_x = (u8 *)xmalloc(dst_size * nsrc_byte); + for (int i = 0; i < src_size; i++) { + if(fmt == FMT_BF16) { + u16src_data[i] = convert_fp32_bf16(val); + val += 0.1; + } else { + s8src_data[i] = i; + } + } + for (int i = 0; i < dst_size; i++) { + u16dst_data[i] = s8dst_data[i] = 0; + } + src_data = (fmt == FMT_BF16) ? (void *)u16src_data : (void *)s8src_data; + dst_data = (fmt == FMT_BF16) ? 
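+  /*
+   * Both local tensors are allocated with shape_of_stride() shapes, i.e.
+   * large enough to cover the padded strides; the copy below reinterprets
+   * them with the logical (n, c, h, w) shape plus the custom strides, so
+   * the zero-initialized gap bytes double as a corruption check.
+   */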
(void *)u16dst_data : (void *)s8dst_data; + + // run tpu operations + tl_t *tl_src = alloc_tl( bk_ctx, src_stride_shape, fmt, eu_align); + tl_t *tl_dst = alloc_tl( bk_ctx, dst_stride_shape, fmt, eu_align); + put_bf16_tensor_g2l(ctx, bk_ctx, tl_src, (u16 *)src_data, fmt); + put_bf16_tensor_g2l(ctx, bk_ctx, tl_dst, (u16 *)dst_data, fmt); + { + tl_t src = *tl_src; + tl_t dst = *tl_dst; + src.shape = dst.shape = tl_shape; + src.stride = src_stride; + dst.stride = dst_stride; + bmk1880v2_tiu_element_wise_copy_param_t p11; + memset(&p11, 0, sizeof(p11)); + p11.dst = &dst; + p11.src = &src; + bmk1880v2_tiu_element_wise_copy(bk_ctx, &p11); + + } + result_x = get_bf16_tensor_l2g(ctx, bk_ctx, tl_dst, fmt); + + tl_copy_with_stride_ref(src_data, ref_x, tl_shape, src_stride, dst_stride, fmt); + + // compare data + if( COMPARE_PASS != compare_result( ref_x, result_x, fmt, dst_size)) + exit(-1); + + // free variables + free_tl(bk_ctx, tl_dst); + free_tl(bk_ctx, tl_src); + free(s8src_data); + free(u16src_data); + free(s8dst_data); + free(u16dst_data); + free(ref_x); + free(result_x); +} + +#define TEST_ALIGNED 1 // 1: test unalign only, 2: test both align/unalign +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + + test_init(&ctx, &bk_ctx); + + for (u32 i = 0; i < nr_fmt; i++) { + for (u8 u8_align = 0; u8_align < TEST_ALIGNED; u8_align++) { + test_tl_copy_with_stride(&ctx, bk_ctx, input_fmt[i].src_fmt, u8_align); + } + } + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_mac.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_mac.cpp new file mode 100644 index 000000000..0930ed31b --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_mac.cpp @@ -0,0 +1,110 @@ +#include "../1880v2_test_util.h" + +static void tl_mac_ref( + u16 *ref, + u16 *a, u16 *b, u16 *c, + u64 size, int relu_enable) +{ + for (u64 i = 0; i < size; i++) { + float ta = convert_bf16_fp32(a[i]); + float tb = convert_bf16_fp32(b[i]); + float tc = convert_bf16_fp32(c[i]); + float res = ta * tb + tc; + + if(relu_enable) + if(res<0) + res=0; + ref[i] = convert_fp32_bf16(res); + } +} + +static void test_tl_mac(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int lshift_bits = 1; + int rshift_bits = 3; + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + fmt_t fmt_type = FMT_BF16; + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + u64 size = n * c * h * w; + u64 data_size = size * (fmt_type == FMT_BF16 ? 
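+    /*
+     * In bf16 mode the MAC evaluates res = a*b + c in fp32 and rounds the
+     * result back to bf16; lshift_bits/rshift_bits are presumably only
+     * meaningful on the fixed-point (int8) path but are still set here.
+     */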
2 : 1); + u16 *a_data = (u16 *)xmalloc(data_size); + u16 *b_data = (u16 *)xmalloc(data_size); + u16 *c_data = (u16 *)xmalloc(data_size); + + for (u64 i = 0; i < size; i++) { + a_data[i] = convert_fp32_bf16(rand()); + b_data[i] = convert_fp32_bf16(rand()); + c_data[i] = convert_fp32_bf16(rand()); + } + + u16 *ref_data = (u16 *)xmalloc(data_size); + + tl_mac_ref(ref_data, + a_data, b_data, c_data, + size, relu_enable); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_b = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_c = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + + put_bf16_tensor_g2l(ctx, bk_ctx, tl_a, a_data, fmt_type); + put_bf16_tensor_g2l(ctx, bk_ctx, tl_b, b_data, fmt_type); + put_bf16_tensor_g2l(ctx, bk_ctx, tl_c, c_data, fmt_type); + + bmk1880v2_tiu_element_wise_mac_param_t p2; + memset(&p2, 0, sizeof(p2)); + p2.res_high = 0; + p2.res_low = tl_c; + p2.res_is_int8 = relu_enable; + p2.a = tl_a; + p2.b_is_const = 0; + p2.b = tl_b; + p2.lshift_bits = lshift_bits; + p2.rshift_bits = rshift_bits; + p2.relu_enable = relu_enable; + bmk1880v2_tiu_element_wise_mac(bk_ctx, &p2); + u16 *mac_data = (u16 *)get_bf16_tensor_l2g(ctx, bk_ctx, tl_c, fmt_type); + + for (u64 i = 0; i < size; i++) { + if (mac_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at mac_data[%" PRIu64 "], got %d, exp %d\n", + i, mac_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_c); + free_tl(bk_ctx, tl_b); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(b_data); + free(c_data); + free(ref_data); + free(mac_data); + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + int round_mode; + round_mode = set_store_feround(); + test_tl_mac(&ctx, bk_ctx, 0); + test_tl_mac(&ctx, bk_ctx, 1); + restore_feround(round_mode); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_mac_const.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_mac_const.cpp new file mode 100644 index 000000000..815ff0e12 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_mac_const.cpp @@ -0,0 +1,105 @@ +#include "../1880v2_test_util.h" + +static void tl_mac_const_ref( + u16 *ref_low, + u16 *a, u16 b_const, + u16 *c_low, + u64 size, int relu_enable) +{ + for (u64 i = 0; i < size; i++) { + float ta = convert_bf16_fp32(a[i]); + float tb = convert_bf16_fp32(b_const); + float tc = convert_bf16_fp32(c_low[i]); + float res = ta * tb + tc; + + if(relu_enable) + { + if(res<0) + res=0; + } + ref_low[i] = convert_fp32_bf16(res); + } +} + +static void test_tl_mac_const(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + fmt_t fmt_type = FMT_BF16; + u64 size = n * c * h * w; + u64 data_size = size * (fmt_type == FMT_BF16 ? 
2 : 1); + + u16 *a_data = (u16 *)xmalloc(data_size); + u16 *c_low_data = (u16 *)xmalloc(data_size); + for (u64 i = 0; i < size; i++) { + a_data[i] = convert_fp32_bf16(rand() % 256); + c_low_data[i] = convert_fp32_bf16(i); + } + + u16 b_const = convert_fp32_bf16(37); + + u16 *ref_low_data = (u16 *)xmalloc(data_size); + tl_mac_const_ref(ref_low_data, + a_data, b_const, c_low_data, + size, relu_enable); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_c_low = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + + put_bf16_tensor_g2l(ctx, bk_ctx, tl_a, (u16*) a_data, fmt_type); + put_bf16_tensor_g2l(ctx, bk_ctx, tl_c_low, (u16*) c_low_data, fmt_type); + bmk1880v2_tiu_element_wise_mac_param_t p3; + memset(&p3, 0, sizeof(p3)); + p3.res_high = 0; + p3.res_low = tl_c_low; + p3.res_is_int8 = 1;//relu_enable; + p3.a = tl_a; + p3.b_is_const = 1; + p3.b_const.val = b_const; + p3.relu_enable = relu_enable; + + bmk1880v2_tiu_element_wise_mac(bk_ctx, &p3); + u16 *mac_low_data = (u16*) get_bf16_tensor_l2g(ctx, bk_ctx, tl_c_low, fmt_type); + for (u64 i = 0; i < size; i++) { + if (mac_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at mac_low_data[%" PRIu64 "], got %d, exp %d\n", + i, mac_low_data[i], ref_low_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_c_low); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(c_low_data); + free(ref_low_data); + free(mac_low_data); + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + int round_mode; + round_mode = set_store_feround(); + + test_tl_mac_const(&ctx, bk_ctx, 0); + test_tl_mac_const(&ctx, bk_ctx, 1); + + restore_feround(round_mode); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_max.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_max.cpp new file mode 100644 index 000000000..95723e18f --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_max.cpp @@ -0,0 +1,89 @@ +#include "../1880v2_test_util.h" + +static void tl_max_ref(u16 *a, u16 *b, u16 *max, u64 size) +{ + for (u64 i = 0; i < size; i++) { + float fa = convert_bf16_fp32(a[i]); + float fb = convert_bf16_fp32(b[i]); + float fmax; + if (fa > fb) + fmax = fa; + else + fmax = fb; + max[i] = convert_fp32_bf16(fmax); + } +} + +static void test_tl_max(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + fmt_t fmt_type = FMT_BF16; + u64 size = n * c * h * w; + u64 data_size = size * (fmt_type == FMT_BF16 ? 
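+  /*
+   * Exact bit comparison is safe in this test: bf16 -> fp32 widening is
+   * lossless, and converting the larger of two widened bf16 values back
+   * to bf16 reproduces one of the original bit patterns.
+   */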
2 : 1); + u16 *a_data = (u16 *)xmalloc(data_size); + for (u64 i = 0; i < size; i++) + a_data[i] = convert_fp32_bf16((s8)(i % 256)); + + u16 *b_data = (u16 *)xmalloc(data_size); + for (u64 i = 0; i < size; i++) + b_data[i] = convert_fp32_bf16((s8)(100 - i % 256)); + + u16 *ref_data = (u16 *)xmalloc(data_size); + tl_max_ref(a_data, b_data, ref_data, size); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_b = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_max = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + + put_bf16_tensor_g2l(ctx, bk_ctx, tl_a, (u16 *)a_data, fmt_type); + put_bf16_tensor_g2l(ctx, bk_ctx, tl_b, (u16 *)b_data, fmt_type); + + bmk1880v2_tiu_element_wise_max_param_t p; + memset(&p, 0, sizeof(p)); + p.max = tl_max; + p.a = tl_a; + p.b_is_const = 0; + p.b = tl_b; + bmk1880v2_tiu_element_wise_max(bk_ctx, &p); + u16 *max_data = (u16*)get_bf16_tensor_l2g(ctx, bk_ctx, tl_max, fmt_type); + + for (u64 i = 0; i < size; i++) { + if (max_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, max_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_max); + free_tl(bk_ctx, tl_b); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(b_data); + free(ref_data); + free(max_data); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_max(&ctx, bk_ctx, 0); + test_tl_max(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_max_const.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_max_const.cpp new file mode 100644 index 000000000..883ccd9e5 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_max_const.cpp @@ -0,0 +1,82 @@ +#include "../1880v2_test_util.h" + +static void tl_max_const_ref(u16 *a, u16 b, u16 *max, u64 size) +{ + for (u64 i = 0; i < size; i++) { + if (convert_bf16_fp32(a[i]) > convert_bf16_fp32(b)) + max[i] = a[i]; + else + max[i] = b; + } +} + +static void test_tl_max_const(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + fmt_t fmt_type = FMT_BF16; + u64 size = n * c * h * w; + u64 data_size = size * (fmt_type == FMT_BF16 ? 
2 : 1); + + u16 *a_data = (u16 *)xmalloc(data_size); + for (u64 i = 0; i < size; i++) + a_data[i] = convert_fp32_bf16(i); + //a_data[i] = convert_fp32_bf16(rand()%100 - 50); + + u16 b = convert_fp32_bf16(20); + u16 *ref_data = (u16 *)xmalloc(data_size); + tl_max_const_ref(a_data, b, ref_data, size); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_max = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + + put_bf16_tensor_g2l(ctx, bk_ctx, tl_a, (u16 *)a_data, fmt_type); + bmk1880v2_tiu_element_wise_max_param_t p; + memset(&p, 0, sizeof(p)); + p.max = tl_max; + p.a = tl_a; + p.b_is_const = 1; + p.b_const.val = b; + p.b_const.is_signed = 1; + + bmk1880v2_tiu_element_wise_max(bk_ctx, &p); + + u16 *max_data = (u16*) get_bf16_tensor_l2g(ctx, bk_ctx, tl_max, fmt_type); + + for (u64 i = 0; i < size; i++) { + if (max_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, max_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_max); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(ref_data); + free(max_data); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_max_const(&ctx, bk_ctx, 0); + test_tl_max_const(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_min.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_min.cpp new file mode 100644 index 000000000..e103008d0 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_min.cpp @@ -0,0 +1,88 @@ +#include "../1880v2_test_util.h" + +static void tl_min_ref(u16 *a, u16 *b, u16 *max, u64 size) +{ + for (u64 i = 0; i < size; i++) { + float fa = convert_bf16_fp32(a[i]); + float fb = convert_bf16_fp32(b[i]); + float fmax; + if (fa > fb) + fmax = fb; + else + fmax = fa; + max[i] = convert_fp32_bf16(fmax); + } +} + +static void test_tl_min(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + fmt_t fmt_type = FMT_BF16; + u64 size = n * c * h * w; + u64 data_size = size * (fmt_type == FMT_BF16 ? 
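+  /*
+   * Note: tl_min_ref() above reuses the 'fmax'/'max' names from the max
+   * test, but it does select the smaller of the two operands.
+   */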
2 : 1); + + u16 *a_data = (u16 *)xmalloc(data_size); + for (u64 i = 0; i < size; i++) + a_data[i] = convert_fp32_bf16(rand()); + + u16 *b_data = (u16 *)xmalloc(data_size); + for (u64 i = 0; i < size; i++) + b_data[i] = convert_fp32_bf16(rand()/2); + + u16 *ref_data = (u16 *)xmalloc(data_size); + tl_min_ref(a_data, b_data, ref_data, size); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_b = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_min = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + + put_bf16_tensor_g2l(ctx, bk_ctx, tl_a, (u16 *)a_data, fmt_type); + put_bf16_tensor_g2l(ctx, bk_ctx, tl_b, (u16 *)b_data, fmt_type); + bmk1880v2_tiu_element_wise_min_param_t p6; + memset(&p6, 0, sizeof(p6)); + p6.min = tl_min; + p6.a = tl_a; + p6.b_is_const = 0; + p6.b = tl_b; + bmk1880v2_tiu_element_wise_min(bk_ctx, &p6); + u16 *min_data = (u16*)get_bf16_tensor_l2g(ctx, bk_ctx, tl_min, fmt_type); + + for (u64 i = 0; i < size; i++) { + if (min_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, min_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_min); + free_tl(bk_ctx, tl_b); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(b_data); + free(ref_data); + free(min_data); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_min(&ctx, bk_ctx, 0); + test_tl_min(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_min_const.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_min_const.cpp new file mode 100644 index 000000000..7d47dd7b8 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_min_const.cpp @@ -0,0 +1,80 @@ +#include "../1880v2_test_util.h" + +static void tl_min_const_ref(u16 *a, u16 b, u16 *max, u64 size) +{ + for (u64 i = 0; i < size; i++) { + if (convert_bf16_fp32(a[i]) > convert_bf16_fp32(b)) + max[i] = b; + else + max[i] = a[i]; + } +} + +static void test_tl_min_const(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + fmt_t fmt_type = FMT_BF16; + u64 size = n * c * h * w; + u64 data_size = size * (fmt_type == FMT_BF16 ? 
2 : 1); + + u16 *a_data = (u16 *)xmalloc(data_size); + for (u64 i = 0; i < size; i++) + a_data[i] = convert_fp32_bf16(rand() % 100 -50); + + u16 b = convert_fp32_bf16(20); + + u16 *ref_data = (u16 *)xmalloc(data_size); + tl_min_const_ref(a_data, b, ref_data, size); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_min = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + + put_bf16_tensor_g2l(ctx, bk_ctx, tl_a, (u16 *)a_data, fmt_type); + bmk1880v2_tiu_element_wise_min_param_t p7; + memset(&p7, 0, sizeof(p7)); + p7.min = tl_min; + p7.a = tl_a; + p7.b_is_const = 1; + p7.b_const.val = b; + p7.b_const.is_signed = 1; + bmk1880v2_tiu_element_wise_min(bk_ctx, &p7); + u16 *min_data = (u16*) get_bf16_tensor_l2g(ctx, bk_ctx, tl_min, fmt_type); + + for (u64 i = 0; i < size; i++) { + if (min_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, min_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_min); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(ref_data); + free(min_data); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_min_const(&ctx, bk_ctx, 0); + test_tl_min_const(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_mul.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_mul.cpp new file mode 100644 index 000000000..05d7d7456 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_mul.cpp @@ -0,0 +1,113 @@ +#include "../1880v2_test_util.h" + +static void tl_mul_ref(u16 *ofmap, u16 *a, u16 *b, u64 size, int shift_bits, int relu_enable, fmt_t fmt_type) +{ + if(fmt_type == FMT_BF16) { + for (u64 i = 0; i < size; i++) { + float tmp = convert_bf16_fp32(a[i]) * convert_bf16_fp32(b[i]); + if(relu_enable) + if(tmp<0) + tmp=0; + ofmap[i] = convert_fp32_bf16(tmp); + } + } else { + for (u64 i = 0; i < size; i++) { + s32 tmp = a[i] * b[i]; + tmp += 1 << (shift_bits - 1); + tmp >>= shift_bits; + if (tmp > 127) + tmp = 127; + else if (tmp < -128) + tmp = -128; + if(relu_enable) + if(tmp<0) + tmp=0; + ofmap[i] = tmp; + } + } +} + +static void test_tl_mul(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + fmt_t fmt_type = FMT_BF16; + u64 size = n * c * h * w; + u64 data_size = size * (fmt_type == FMT_BF16 ? 
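+  /*
+   * tl_mul_ref() above keeps both paths: bf16 multiplies in fp32 and
+   * ignores the shift, while the int8 branch applies a rounding right
+   * shift and saturates to [-128, 127] before the optional relu.
+   */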
2 : 1); + int shift_bits = 1; + + for (u32 relu_enable = 0; relu_enable < 2; relu_enable++) + { + u16 *a_data = (u16 *)xmalloc(data_size); + u16 *b_data = (u16 *)xmalloc(data_size); + for (u64 i = 0; i < size; i++) { + a_data[i] = convert_fp32_bf16(random()%0x10); + b_data[i] = convert_fp32_bf16(random()); + } + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_b = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_res_low = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + + put_bf16_tensor_g2l(ctx, bk_ctx, tl_a, (u16 *)a_data, fmt_type); + put_bf16_tensor_g2l(ctx, bk_ctx, tl_b, (u16 *)b_data, fmt_type); + + bmk1880v2_tiu_element_wise_mul_param_t p1; + memset(&p1, 0, sizeof(p1)); + p1.res_high = NULL; + p1.res_low = tl_res_low; + p1.a = tl_a; + p1.b_is_const = 0; + p1.b = tl_b; + p1.rshift_bits = shift_bits; + p1.relu_enable = relu_enable; + bmk1880v2_tiu_element_wise_mul(bk_ctx, &p1); + + u16 *res_low_data = (u16 *)get_bf16_tensor_l2g(ctx, bk_ctx, tl_res_low, fmt_type); + + u16 *ref_data = (u16 *)xmalloc(data_size); + tl_mul_ref(ref_data, a_data, b_data, size, shift_bits, relu_enable, fmt_type); + + for (u64 i = 0; i < size; i++) { + if (res_low_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %x, exp %x\n", + i, res_low_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res_low); + free_tl(bk_ctx, tl_b); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(b_data); + free(ref_data); + free(res_low_data); + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + int round_mode; + round_mode = set_store_feround(); + + test_tl_mul(&ctx, bk_ctx, 0); + test_tl_mul(&ctx, bk_ctx, 1); + + restore_feround(round_mode); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_mul_const.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_mul_const.cpp new file mode 100644 index 000000000..117db6905 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_mul_const.cpp @@ -0,0 +1,107 @@ +#include "../1880v2_test_util.h" + +static void tl_mul_const_ref( + u16 *ofmap, u16 *ifmap, u64 size, u16 mul_const, int shift_bits, int relu_enable, fmt_t fmt_type) +{ + + if(fmt_type == FMT_BF16) { + for (u64 i = 0; i < size; i++) { + float tmp = convert_bf16_fp32(ifmap[i]) * convert_bf16_fp32(mul_const); + if(relu_enable) + if(tmp<0) + tmp=0; + ofmap[i] = convert_fp32_bf16(tmp); + } + } else { + for (u64 i = 0; i < size; i++) { + s32 tmp = ifmap[i] * (s16) mul_const; + tmp += 1 << (shift_bits - 1); + tmp >>= shift_bits; + if (tmp > 127) + tmp = 127; + else if (tmp < -128) + tmp = -128; + if(relu_enable) + if(tmp<0) + tmp=0; + ofmap[i] = tmp; + } + } +} + +static void test_tl_mul_const(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + fmt_t fmt_type = FMT_BF16; + u64 data_size = size * (fmt_type == FMT_BF16 ? 
2 : 1); + int shift_bits = 1; + + for (u32 relu_enable = 0; relu_enable < 2; relu_enable++) + { + u16 *ifmap_data = (u16 *)xmalloc(data_size); + for (u64 i = 0; i < size; i++) + ifmap_data[i] = convert_fp32_bf16(random() % 256); + + u16 mul_const = convert_fp32_bf16(20); + + u16 *ref_data = (u16 *)xmalloc(data_size); + tl_mul_const_ref(ref_data, ifmap_data, size, mul_const, shift_bits, relu_enable, fmt_type); + + tl_t *tl_ifmap = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_ofmap = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + + put_bf16_tensor_g2l(ctx, bk_ctx, tl_ifmap, (u16 *)ifmap_data, fmt_type); + + bmk1880v2_tiu_element_wise_mul_param_t p; + memset(&p, 0, sizeof(p)); + p.res_high = NULL; + p.res_low = tl_ofmap; + p.a = tl_ifmap; + p.b_is_const = 1; + p.b_const.val = mul_const; + p.relu_enable = relu_enable; + + bmk1880v2_tiu_element_wise_mul(bk_ctx, &p); + + u16 *ofmap_data = (u16*) get_bf16_tensor_l2g(ctx, bk_ctx, tl_ofmap, fmt_type); + + for (u64 i = 0; i < size; i++) { + if (ofmap_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, ofmap_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_ofmap); + free_tl(bk_ctx, tl_ifmap); + + free(ifmap_data); + free(ref_data); + free(ofmap_data); + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_mul_const(&ctx, bk_ctx, 0); + test_tl_mul_const(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_sub.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_sub.cpp new file mode 100644 index 000000000..9aaf7f8eb --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_sub.cpp @@ -0,0 +1,96 @@ +#include "../1880v2_test_util.h" + +static void tl_sub_ref( + u16 *ref_low, + u16 *a_low, + u16 *b_low, + u64 size) +{ + for (u64 i = 0; i < size; i++) { + float ta = convert_bf16_fp32(a_low[i]); + float tb = convert_bf16_fp32(b_low[i]); + float res = ta - tb; + + ref_low[i] = convert_fp32_bf16(res); + } +} + +static void test_tl_sub(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + fmt_t fmt_type = FMT_BF16; + u64 size = n * c * h * w; + u64 data_size = size * (fmt_type == FMT_BF16 ? 
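+  /*
+   * Subtraction has no relu/rshift variant in this test: the reference is
+   * simply ref[i] = bf16(fp32(a[i]) - fp32(b[i])), and the hardware result
+   * must match bit-for-bit.
+   */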
2 : 1); + u16 *a_low_data = (u16 *)xmalloc(data_size); + u16 *b_low_data = (u16 *)xmalloc(data_size); + for (u64 i = 0; i < size; i++) { + a_low_data[i] = convert_fp32_bf16(rand()); + b_low_data[i] = convert_fp32_bf16(rand()); + } + + u16 *ref_low_data = (u16 *)xmalloc(data_size); + tl_sub_ref(ref_low_data, + a_low_data, + b_low_data, + size); + + tl_t *tl_a_low = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_b_low = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + tl_t *tl_res_low = alloc_tl(bk_ctx, tl_shape, fmt_type, eu_align); + + put_bf16_tensor_g2l(ctx, bk_ctx, tl_a_low, a_low_data, fmt_type); + put_bf16_tensor_g2l(ctx, bk_ctx, tl_b_low, b_low_data, fmt_type); + bmk1880v2_tiu_element_wise_sub_param_t p5; + memset(&p5, 0, sizeof(p5)); + p5.res_high = 0; + p5.res_low = tl_res_low; + p5.a_high = 0; + p5.a_low = tl_a_low; + p5.b_high = 0; + p5.b_low = tl_b_low; + p5.rshift_bits = 0; + bmk1880v2_tiu_element_wise_sub(bk_ctx, &p5); + u16 *res_low_data = (u16*)get_bf16_tensor_l2g(ctx, bk_ctx, tl_res_low, fmt_type); + + for (u64 i = 0; i < size ; i++) { + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res_low); + free_tl(bk_ctx, tl_b_low); + free_tl(bk_ctx, tl_a_low); + + free(a_low_data); + free(b_low_data); + free(ref_low_data); + free(res_low_data); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + int round_mode; + round_mode = set_store_feround(); + + test_tl_sub(&ctx, bk_ctx, 0); + test_tl_sub(&ctx, bk_ctx, 1); + + restore_feround(round_mode); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_transfer.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_transfer.cpp new file mode 100644 index 000000000..a72211f9e --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_bf16_tensor_transfer.cpp @@ -0,0 +1,133 @@ +#include "../1880v2_test_util.h" +#include "1880v2_bf16_util.h" + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_I8}, +}; + +static void test_put_and_get_tensor_l2g( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + fmt_t fmt) +{ + int n = 2; + int c = 66; + int h = 3; + int w = 15; + u64 size = n * c * h * w; + s8 *s8data_x = (s8 *)malloc(sizeof(s8) * size); + s8 *s8data_y = (s8 *)malloc(sizeof(s8) * size); + u16 *u16data_x = (u16 *)malloc(sizeof(u16) * size); + u16 *u16data_y = (u16 *)malloc(sizeof(u16) * size); + u8 *u8src_data_x; + u8 *u8src_data_y; + + if(fmt == FMT_BF16) { + /* bf16*/ + float val = -100; + for(u64 i = 0; i < size; i++) { + u16data_x[i] = generate_bf16_corner_val(val); + u16data_y[i] = generate_bf16_corner_val(val); + val += 0.1; + } + u8src_data_x = (u8 *)u16data_x; + u8src_data_y = (u8 *)u16data_y; + } else { + /* int8 -> bf16*/ + for(u64 i = 0; i < size; i++) { + s8data_x[i] = i-100; + s8data_y[i] = -i; + } + u8src_data_x = (u8 *)s8data_x; + u8src_data_y = (u8 *)s8data_y; + } + /* + * Interleave two tensors in case the same devmem is reused between + * put_tensor_g2l() and get_tensor_l2g(), in which case the content of + * devmem is already what is expected before bmk1880v2_gdma_store(bk_ctx, ). 
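+ * Reading both tensors back twice below, in the two possible orders,
+ * also rules out a pass that only happens because of read-back ordering.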
+ */ + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + tg_shape_t ts_shape; + ts_shape.n = n; + ts_shape.c = c; + ts_shape.h = h; + ts_shape.w = w; + + tl_t *tl_x = alloc_tl( bk_ctx, tl_shape, fmt, 1); + tl_t *tl_y = alloc_tl( bk_ctx, tl_shape, fmt, 1); + + tg_t ts_x; + ts_x.base_reg_index = 0; + ts_x.start_address = 0; + ts_x.shape = ts_shape; + ts_x.stride = bmk1880v2_tensor_tgmem_default_stride(ts_shape, fmt); + + put_bf16_tensor_g2l( ctx, bk_ctx, tl_x, (u16 *)u8src_data_x, fmt); + put_bf16_tensor_g2l( ctx, bk_ctx, tl_y, (u16 *)u8src_data_y, fmt); + + u8 *result_x = get_bf16_tensor_l2g( ctx, bk_ctx, tl_x, fmt); + u8 *result_y = get_bf16_tensor_l2g( ctx, bk_ctx, tl_y, fmt); + + for (u64 i = 0; i < size; i++) { + if (result_x[i] != u8src_data_x[i]) { + printf("compare 1 failed at result_x[%d]\n", (int)i); + exit(-1); + } + if (result_y[i] != u8src_data_y[i]) { + printf("compare 1 failed at result_y[%d]\n", (int)i); + exit(-1); + } + } + free(result_x); + free(result_y); + + /* + * Get result_y before result_x. + */ + + + result_y = get_bf16_tensor_l2g(ctx, bk_ctx, tl_y, fmt); + result_x = get_bf16_tensor_l2g(ctx, bk_ctx, tl_x, fmt); + for (u64 i = 0; i < size; i++) { + if (result_x[i] != u8src_data_x[i]) { + printf("compare 2 failed at result_x[%d]\n", (int)i); + exit(-1); + } + if (result_y[i] != u8src_data_y[i]) { + printf("compare 2 failed at result_y[%d]\n", (int)i); + exit(-1); + } + } + free(result_x); + free(result_y); + + free_tl(bk_ctx, tl_y); + free_tl(bk_ctx, tl_x); + + free(s8data_x); + free(s8data_y); + free(u16data_x); + free(u16data_y); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + + test_init(&ctx, &bk_ctx); + + for (u32 i = 0; i < nr_fmt; i++) { + test_put_and_get_tensor_l2g(&ctx, bk_ctx, input_fmt[i].src_fmt); + } + test_exit(&ctx); + + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_g2g_bf16_tensor_copy.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_g2g_bf16_tensor_copy.cpp new file mode 100644 index 000000000..f2963f073 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_g2g_bf16_tensor_copy.cpp @@ -0,0 +1,120 @@ +#include "../1880v2_test_util.h" +#include "1880v2_bf16_util.h" + +typedef bmk1880v2_tdma_tg2tg_tensor_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tg_shape_t src_shape; + tg_stride_t src_stride; + tg_shape_t dst_shape; + tg_stride_t dst_stride; +} case_t; + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_I8}, +}; + +static case_t g_cases[] = { + { + {1, 3, 3, 3}, {27, 9, 3}, + {1, 3, 3, 3}, {27, 9, 3}, + }, + { + // YOLOv2 concat layer + {1, 256, 19, 19}, {92416, 361, 19}, + {1, 256, 19, 19}, {462080, 361, 19}, + } +}; + +static void test_param_g2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + + u64 size = p->src->shape.c * p->src->shape.h * p->src->shape.w; + + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * size); + u8 *u8src_data = (u8 *)malloc(sizeof(u8) * size); + u8 *src_data; + + if(p->src->fmt == FMT_BF16) { + /* bf16*/ + float val = -100; + for(u64 i = 0; i < size; i++) { + u16src_data[i] = 
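+      /*
+       * 'size' above omits the n dimension; both entries in g_cases use
+       * n == 1, so c*h*w covers the whole tensor. generate_bf16_corner_val()
+       * presumably folds the sweeping 'val' toward bf16 corner-case patterns.
+       */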
generate_bf16_corner_val(val); + val += 0.1; + } + src_data = (u8*)u16src_data; + } else { + /* int8 -> bf16*/ + for(u64 i = 0; i < size; i++) { + u8src_data[i] = 200 + i; + } + src_data = u8src_data; + } + + put_tg_bf16_gmem(ctx, p->src, src_data); + + bmk1880v2_tdma_tg2tg_bf16_tensor_copy(bmk, p); + + test_submit(ctx); + + u8 *dst_data = get_tg_bf16_gmem(ctx, p->dst); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != src_data[i]) { + fprintf(stderr, "comparing(%d->%d) failed at dst[%" PRIu64 "], got %x, exp %x\n", + p->src->fmt, p->dst->fmt, i, dst_data[i], src_data[i]); + exit(-1); + } + } + + free(u8src_data); + free(u16src_data); + free(dst_data); +} + +static void destroy_param_g2g(CVI_RT_HANDLE *ctx, param_t *p) +{ + free_tg_gmem(ctx, p->src); + free_tg_gmem(ctx, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + + for (u32 i = 0; i < nr_fmt; i++) { + param_t p; + memset(&p, 0, sizeof(p)); + p.src = alloc_tg_bf16_gmem(ctx, c->src_shape, input_fmt[i].src_fmt); + p.dst = alloc_tg_bf16_gmem(ctx, c->dst_shape, input_fmt[i].dst_fmt); + test_param_g2g(ctx, bmk, &p); + destroy_param_g2g(ctx, &p); + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} \ No newline at end of file diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_get_bf16_matrix_stride.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_get_bf16_matrix_stride.cpp new file mode 100644 index 000000000..13ea3b972 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_get_bf16_matrix_stride.cpp @@ -0,0 +1,185 @@ +#include "../1880v2_test_util.h" +#include "1880v2_bf16_util.h" + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_I8}, +}; + +static void get_matrix_l2g_stride_ref( + void *ref, + void *a, + ml_shape_t ml_shape, + bmk1880v2_matrix_tgmem_stride_t gmem_stride, + fmt_t fmt) +{ + int row = ml_shape.n; + int col = ml_shape.col; + int row_stride = gmem_stride.row / ((fmt == FMT_BF16) ?2:1); + int stride_size = row * row_stride; + u16 *u16_ref = NULL; + u16 *u16_src = NULL; + u8 *u8_ref = NULL; + u8 *u8_src = NULL; + + if (fmt == FMT_BF16) { + u16_ref = (u16 *)ref; + u16_src = (u16 *)a; + for (int i = 0; i < stride_size; i++) + u16_ref[i] = 0xaf; + } else { + u8_ref = (u8 *)ref; + u8_src = (u8 *)a; + for (int i = 0; i < stride_size; i++) + u8_ref[i] = 0xaf; + } + + for (int ri = 0; ri < row; ri++) { + for (int ci = 0; ci < col; ci++) { + if (fmt == FMT_BF16) { + u16_ref[ri * row_stride + ci] = u16_src[ri * col + ci]; + } else { + u8_ref[ri * row_stride + ci] = u8_src[ri * col + ci]; + } + } + } + + if (fmt == FMT_BF16) { + ref = (void *)u16_ref; + } else { + ref = (void *)u8_ref; + } +} + +static u8 * get_matrix_l2g_stride( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + ml_t *ml, + bmk1880v2_matrix_tgmem_stride_t mg_stride, + fmt_t fmt) +{ + int row = ml->shape.n; + int row_stride = mg_stride.row; + int col = ml->shape.col; + int stride_size = row * row_stride; + + u8 *data = NULL; + u8 *u8data = (u8 *)malloc(sizeof(u8) * stride_size); + u16 *u16data = (u16 *)malloc(sizeof(u16) * stride_size); + if (!u8data || !u16data) { + free(u8data); + free(u16data); + return NULL; + } + + for (int i = 0; i < stride_size; i++) + { + if(fmt == FMT_BF16) { + u16data[i] = 0xaf; + } else { + u8data[i] = 0xaf; + } + 
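+    /*
+     * 0xaf is a sentinel: bytes the TDMA copy must not touch keep this
+     * value, so any stride/padding mistake shows up in the compare step.
+     */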
} + + if (fmt == FMT_BF16) { + data = (u8 *)u16data; + free(u8data); + } else { + data = u8data; + free(u16data); + } + + bmshape_t bms = BM_TENSOR_WITH_FMT( row, row_stride, 1, 1, + (fmt == FMT_BF16)? BM_FMT_BF16 : BM_FMT_INT8); + CVI_RT_MEM devmem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + int ret = CVI_RT_MemCopyS2D(*ctx, devmem, data); + assert(ret == BM_SUCCESS); + + mg_t mg; + mg.base_reg_index = 0; + mg.start_address = CVI_RT_MemGetPAddr(devmem); + mg.shape.row = row; + mg.shape.col = col; + mg.stride = mg_stride; + mg.fmt = fmt; + + bmk1880v2_tdma_l2tg_matrix_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = ml; + p.dst = &mg; + + bmk1880v2_tdma_l2g_bf16_matrix_copy(bk_ctx, &p); + test_submit(ctx); + + ret = CVI_RT_MemCopyD2S(*ctx, data, devmem); + assert(ret == BM_SUCCESS); + + CVI_RT_MemFree(*ctx, devmem); + return data; +} + +static void test_get_matrix_l2g_stride( + CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, fmt_t fmt, int eu_align) +{ + int row = 80; + int col = 70; + float val = -100; + int size = row * col; + int row_stride = col * 2; + int stride_size = row * row_stride; + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * size); + s8 *s8src_data = (s8 *)malloc(sizeof(s8) * size); + void *src_data = NULL; + u8 *result_x = NULL; + void *ref_x = NULL; + + ml_shape_t ml_shape = bmk1880v2_matrix_lmem_default_shape(bk_ctx, row, col, fmt); + bmk1880v2_matrix_tgmem_stride_t gmem_stride; + gmem_stride.row = row_stride * ((fmt == FMT_BF16) ?2:1); + + // prepare source data + for (int i = 0; i < size; i++) { + if(fmt == FMT_BF16) { + u16src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } else { + s8src_data[i] = i; + } + } + ref_x = (u8 *)xmalloc(stride_size * ((fmt == FMT_BF16) ?2:1)); + src_data = (fmt == FMT_BF16) ? (void *)u16src_data : (void *)s8src_data; + + // run tpu operations + ml_t *ml_x = bmk1880v2_lmem_alloc_matrix(bk_ctx,ml_shape, fmt, eu_align); + put_bf16_matrix_g2l(ctx, bk_ctx, ml_x, (u8 *)src_data, fmt); + result_x = get_matrix_l2g_stride(ctx, bk_ctx, ml_x, gmem_stride, fmt); + get_matrix_l2g_stride_ref(ref_x, src_data, ml_shape, gmem_stride, fmt); + + // compare data + if( COMPARE_PASS != compare_result( ref_x, result_x, fmt, stride_size)) + exit(-1); + + // free variables + bmk1880v2_lmem_free_matrix(bk_ctx, ml_x); + free(s8src_data); + free(u16src_data); + free(ref_x); + free(result_x); +} + +#define TEST_ALIGNED 2 // 1: test unalign only, 2: test both align/unalign +int main (void) +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + + test_init(&ctx, &bk_ctx); + for (u32 i = 0; i < nr_fmt; i++) { + for (u8 u8_align = 0; u8_align < TEST_ALIGNED; u8_align++) { + test_get_matrix_l2g_stride(&ctx, bk_ctx, input_fmt[i].src_fmt, u8_align); + } + } + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_get_bf16_tensor_gl_stride.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_get_bf16_tensor_gl_stride.cpp new file mode 100644 index 000000000..10f0b656a --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_get_bf16_tensor_gl_stride.cpp @@ -0,0 +1,225 @@ +#include "../1880v2_test_util.h" +#include "1880v2_bf16_util.h" + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_I8}, +}; + +static void get_tensor_l2g_stride_ref( + void *ref, void *a, + tl_shape_t tl_shape, + bmk1880v2_tensor_lmem_stride_t tl_stride, + bmk1880v2_tensor_tgmem_stride_t tg_stride, + fmt_t fmt) +{ + int nsrc_byte = 1; + int n = tl_shape.n; + int c = tl_shape.c; + int h = 
tl_shape.h; + int w = tl_shape.w; + u16 *u16_ref = NULL; + u16 *u16_src = NULL; + u8 *u8_ref = NULL; + u8 *u8_src = NULL; + int stride_size = n * tg_stride.n; + + if (fmt == FMT_BF16) { + nsrc_byte = 2; // FMT_BF16 + u16_ref = (u16 *)ref; + u16_src = (u16 *)a; + } else { + nsrc_byte = 1; // FMT_U8, FMT_I8 + u8_ref = (u8 *)ref; + u8_src = (u8 *)a; + } + + int n_str = tg_stride.n / nsrc_byte; + int c_str = tg_stride.c / nsrc_byte; + int h_str = tg_stride.h / nsrc_byte; + int w_str = 1; + + for (int i = 0; i < stride_size; i++) { + if (fmt == FMT_BF16) { + u16_ref[i] = 0xcf; + } else { + u8_ref[i] = 0xcf; + } + } + + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < c; ci++) { + for (int hi = 0; hi < h; hi++) { + for (int wi = 0; wi < w; wi++) { + u64 src_i = (ni * c + ci) * tl_stride.c/nsrc_byte + hi * tl_stride.h/nsrc_byte + wi * 1; + u64 dst_i = ni * n_str + ci * c_str + hi * h_str + wi * w_str; + if (fmt == FMT_BF16) { + u16_ref[dst_i] = u16_src[src_i]; + } else { + u8_ref[dst_i] = u8_src[src_i]; + } + } + } + } + } + if (fmt == FMT_BF16) { + ref = (void *)u16_ref; + } else { + ref = (void *)u8_ref; + } +} + +static inline u8 * get_tensor_l2g_stride( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + tl_t *tl, + bmk1880v2_tensor_tgmem_stride_t tg_stride, + fmt_t fmt) +{ + u8 *data = NULL; + int n = tl->shape.n; + int n_stride = tg_stride.n; + int stride_size = n * n_stride; + u16 *u16_data = (u16 *)malloc(sizeof(u16) * stride_size); + u8 *u8_data = (u8 *)malloc(sizeof(u8) * stride_size); + if (!u16_data || !u8_data) { + free(u16_data); + free(u8_data); + return NULL; + } + + for (int i = 0; i < stride_size; i++) { + if (fmt == FMT_BF16) { + u16_data[i] = 0xcf; + } else { + u8_data[i] = 0xcf; + } + } + + if (fmt == FMT_BF16) { + data = (u8 *)u16_data; + free(u8_data); + } else { + data = u8_data; + free(u16_data); + } + + bmshape_t bms = BM_TENSOR_WITH_FMT(n, n_stride, 1, 1, + (fmt == FMT_BF16)? 
BM_FMT_BF16 : BM_FMT_INT8); + CVI_RT_MEM dev_mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = CVI_RT_MemGetPAddr(dev_mem); + int ret = CVI_RT_MemCopyS2D(*ctx, dev_mem, (u8 *)data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = fmt; + tg.shape.n = tl->shape.n; + tg.shape.c = tl->shape.c; + tg.shape.h = tl->shape.h; + tg.shape.w = tl->shape.w; + tg.stride = tg_stride; + tg.base_reg_index = 0; + + bmk1880v2_tdma_l2tg_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = tl; + p.dst = &tg; + bmk1880v2_tdma_l2g_bf16_tensor_copy(bk_ctx, &p); + + test_submit(ctx); + + ret = CVI_RT_MemCopyD2S(*ctx, (u8 *)data, dev_mem); + assert(ret == BM_SUCCESS); + CVI_RT_MemFree(*ctx, dev_mem); + + return data; +} + +static void test_get_tensor_l2g_gl_stride( + CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, fmt_t fmt, int eu_align) +{ + tl_t *tl_x = NULL; + int n = 2; + int c = 35; + int h = 2; + int w = 3; + + tg_shape_t tg_shape; + tg_shape.n = n; + tg_shape.c = c; + tg_shape.h = h; + tg_shape.w = w; + + bmk1880v2_tensor_tgmem_stride_t tg_stride = + bmk1880v2_tensor_tgmem_default_stride( tg_shape, fmt); + + int stride_size = n * tg_stride.n; + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h * w; + tl_shape.w = 1; + float val = -100; + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * stride_size); + s8 *s8src_data = (s8 *)malloc(sizeof(s8) * stride_size); + void *src_data; + u8 *result_x = NULL; + void *ref_x = NULL; + u8 *u8ref_x = NULL; + u16 *u16ref_x = NULL; + + // prepare source data + ref_x = (u8 *)xmalloc(stride_size * ((fmt == FMT_BF16) ?2:1) ); + for (int i = 0; i < stride_size; i++) { + if(fmt == FMT_BF16) { + u16src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } else { + s8src_data[i] = i; + } + } + src_data = (fmt == FMT_BF16) ? 
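+  /*
+   * The tensor is loaded as (n, c, h*w, 1) and then re-labelled below as
+   * (n, c, h, w) with recomputed local strides before the strided store,
+   * so the same local data is exercised under two shape views.
+   */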
(void *)u16src_data : (void *)s8src_data; + + // run tpu operations + tl_x = alloc_tl( bk_ctx, tl_shape, fmt, eu_align); + put_bf16_tensor_g2l( ctx, bk_ctx, tl_x, (u16 *)src_data, fmt); + tl_x->shape.n = n; + tl_x->shape.c = c; + tl_x->shape.h = h; + tl_x->shape.w = w; + tl_x->stride = bmk1880v2_tensor_lmem_default_stride(bk_ctx, tl_x->shape, fmt, eu_align); + result_x = get_tensor_l2g_stride(ctx, bk_ctx, tl_x, tg_stride, fmt); + get_tensor_l2g_stride_ref( ref_x, src_data, tl_x->shape, tl_x->stride, tg_stride, fmt); + + // compare data + compare_result( ref_x, result_x, fmt, stride_size); + + // free variables + free_tl(bk_ctx, tl_x); + free(result_x); + free(u8ref_x); + free(u16ref_x); + free(s8src_data); + free(u16src_data); + free(ref_x); +} + +#define TEST_ALIGNED 1 // 1: test unalign only, 2: test both align/unalign +int main (void) +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + + test_init(&ctx, &bk_ctx); + for (u32 i = 0; i < nr_fmt; i++) { + for (u8 u8_align = 0; u8_align < TEST_ALIGNED; u8_align++) { + test_get_tensor_l2g_gl_stride(&ctx, bk_ctx, input_fmt[i].src_fmt, u8_align); + } + } + test_exit(&ctx); + + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_get_bf16_tensor_stride.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_get_bf16_tensor_stride.cpp new file mode 100644 index 000000000..4dcf90c49 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_get_bf16_tensor_stride.cpp @@ -0,0 +1,212 @@ +#include "../1880v2_test_util.h" +#include "1880v2_bf16_util.h" + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_I8}, +}; + +static void get_tensor_l2g_stride_ref( + void *ref, + void *a, + tl_shape_t tl_shape, + bmk1880v2_tensor_tgmem_stride_t tg_stride, + fmt_t fmt) +{ + int n = tl_shape.n; + int c = tl_shape.c; + int h = tl_shape.h; + int w = tl_shape.w; + int nsrc_byte = 1; + u16 *u16_ref = NULL; + u16 *u16_src = NULL; + u8 *u8_ref = NULL; + u8 *u8_src = NULL; + int stride_size = n * tg_stride.n; + + if (fmt == FMT_BF16) { + nsrc_byte = 2; // FMT_BF16 + u16_ref = (u16 *)ref; + u16_src = (u16 *)a; + } else { + nsrc_byte = 1; // FMT_U8, FMT_I8 + u8_ref = (u8 *)ref; + u8_src = (u8 *)a; + } + int n_str = tg_stride.n / nsrc_byte; + int c_str = tg_stride.c / nsrc_byte; + int h_str = tg_stride.h / nsrc_byte; + int w_str = 1; + + for (int i = 0; i < stride_size; i++) { + if (fmt == FMT_BF16) { + u16_ref[i] = 0xcf; + } else { + u8_ref[i] = 0xcf; + } + } + + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < c; ci++) { + for (int hi = 0; hi < h; hi++) { + for (int wi = 0; wi < w; wi++) { + u64 src_i = ni * c * h * w + ci * h * w + hi * w + wi * w_str; + u64 dst_i = ni * n_str + ci * c_str + hi * h_str + wi * w_str; + if (fmt == FMT_BF16) { + u16_ref[dst_i] = u16_src[src_i]; + } else { + u8_ref[dst_i] = u8_src[src_i]; + } + } + } + } + } + + if (fmt == FMT_BF16) { + ref = (void *)u16_ref; + } else { + ref = (void *)u8_ref; + } +} + +static inline u8 * get_tensor_l2g_stride( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + tl_t *tl, + bmk1880v2_tensor_tgmem_stride_t tg_stride, + fmt_t fmt) +{ + u8 *data = NULL; + int n = tl->shape.n; + int n_stride = tg_stride.n; + int stride_size = n * n_stride; + uint16_t *u16_data = (uint16_t *)malloc(sizeof(uint16_t) * stride_size); + uint8_t *u8_data = (uint8_t *)malloc(sizeof(uint8_t) * stride_size); + if (!u16_data || !u8_data) { + free(u16_data); + free(u8_data); + return NULL; + } + + for (int i = 0; i < stride_size; i++) { + if (fmt == FMT_BF16) { + 
u16_data[i] = 0xcf; + } else { + u8_data[i] = 0xcf; + } + } + + if (fmt == FMT_BF16) { + data = (u8 *)u16_data; + free(u8_data); + } else { + data = u8_data; + free(u16_data); + } + + bmshape_t bms = BM_TENSOR_WITH_FMT(n, n_stride, 1, 1, + (fmt == FMT_BF16)? BM_FMT_BF16 : BM_FMT_INT8); + CVI_RT_MEM dev_mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = CVI_RT_MemGetPAddr(dev_mem); + int ret = CVI_RT_MemCopyS2D(*ctx, dev_mem, (u8 *)data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = fmt; + tg.shape.n = tl->shape.n; + tg.shape.c = tl->shape.c; + tg.shape.h = tl->shape.h; + tg.shape.w = tl->shape.w; + tg.stride = tg_stride; + + bmk1880v2_tdma_l2tg_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = tl; + p.dst = &tg; + bmk1880v2_tdma_l2g_bf16_tensor_copy(bk_ctx, &p); + + test_submit(ctx); + + ret = CVI_RT_MemCopyD2S(*ctx, (u8 *)data, dev_mem); + assert(ret == BM_SUCCESS); + CVI_RT_MemFree(*ctx, dev_mem); + + return data; +} + +static void test_get_tensor_l2g_stride( + CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, fmt_t fmt, int eu_align) +{ + tl_t *tl_x = NULL; + int n = 2; + int c = 15; + int h = 10; + int w = 8; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + bmk1880v2_tensor_tgmem_stride_t tg_stride; + tg_stride.h = w * 2; + tg_stride.c = tg_stride.h * h * 2; + tg_stride.n = tg_stride.c * c * 2; + int stride_size = n * tg_stride.n; + + float val = -100; + void *src_data = NULL; + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * stride_size); + s8 *s8src_data = (s8 *)malloc(sizeof(s8) * stride_size); + u8 *result_x = NULL; + void *ref_x = NULL; + + // prepare source data + ref_x = (u8 *)xmalloc(stride_size * ((fmt == FMT_BF16) ?2:1) ); + for (int i = 0; i < stride_size; i++) { + if(fmt == FMT_BF16) { + u16src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } else { + s8src_data[i] = i; + } + } + src_data = (fmt == FMT_BF16) ? 
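+  /*
+   * The c and n strides above are twice the packed size, leaving gaps in
+   * global memory; the 0xcf sentinel fill in get_tensor_l2g_stride() must
+   * survive in those gaps for the compare to pass.
+   */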
(void *)u16src_data : (void *)s8src_data; + + // run tpu operations + tl_x = alloc_tl(bk_ctx, tl_shape, fmt, eu_align); + put_bf16_tensor_g2l(ctx, bk_ctx, tl_x, (u16 *)src_data, fmt); + result_x = get_tensor_l2g_stride(ctx, bk_ctx ,tl_x, tg_stride, fmt); + get_tensor_l2g_stride_ref( ref_x, src_data, tl_shape, tg_stride, fmt); + + // compare data + compare_result( ref_x, result_x, fmt, stride_size); + + // free variables + free_tl(bk_ctx, tl_x); + free(ref_x); + free(result_x); + free(s8src_data); + free(u16src_data); +} + +#define TEST_ALIGNED 2 // 1: test unalign only, 2: test both align/unalign +int main (void) +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + + test_init(&ctx, &bk_ctx); + for (u32 i = 0; i < nr_fmt; i++) { + for (u8 u8_align = 0; u8_align < TEST_ALIGNED; u8_align++) { + test_get_tensor_l2g_stride(&ctx, bk_ctx, input_fmt[i].src_fmt, u8_align); + } + } + test_exit(&ctx); + + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_get_bf16_tensor_stride_unalign.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_get_bf16_tensor_stride_unalign.cpp new file mode 100644 index 000000000..0dad47701 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_get_bf16_tensor_stride_unalign.cpp @@ -0,0 +1,234 @@ +#include "../1880v2_test_util.h" +#include "1880v2_bf16_util.h" + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_I8}, +}; + +static void get_tensor_l2g_stride_unalign_ref( + void *ref, + void *a, + tl_shape_t tl_shape, + bmk1880v2_tensor_tgmem_stride_t gmem_stride, + fmt_t fmt) +{ + int n = tl_shape.n; + int c = tl_shape.c; + int h = tl_shape.h; + int w = tl_shape.w; + int nsrc_byte = 1; + int new_n = n * 2; + int new_h = h / 2; + u16 *u16_ref = NULL; + u16 *u16_src = NULL; + u8 *u8_ref = NULL; + u8 *u8_src = NULL; + + if (fmt == FMT_BF16) { + nsrc_byte = 2; // FMT_BF16 + u16_ref = (u16 *)ref; + u16_src = (u16 *)a; + } else { + nsrc_byte = 1; // FMT_U8, FMT_I8 + u8_ref = (u8 *)ref; + u8_src = (u8 *)a; + } + int n_str = gmem_stride.n / nsrc_byte; + int c_str = gmem_stride.c / nsrc_byte; + int h_str = gmem_stride.h / nsrc_byte; + /* + * Same as in get_tensor_l2g_stride_unalign(). 
+ */ + int stride_size = new_n * gmem_stride.n; + for (int i = 0; i < stride_size; i++) { + if (fmt == FMT_BF16) { + u16_ref[i] = 0xcf; + } else { + u8_ref[i] = 0xcf; + } + } + /* + * (n, c, h, w) => (n * 2, c, h / 2, w) + */ + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < c; ci++) { + for (int hi = 0; hi < h; hi++) { + for (int wi = 0; wi < w; wi++) { + u64 src_i = ni * c * h * w + ci * h * w + hi * w + wi; + u64 dst_i = (ni * 2 + hi / new_h) * n_str + + ci * c_str + (hi % new_h) * h_str + wi; + if (fmt == FMT_BF16) { + u16_ref[dst_i] = u16_src[src_i]; + } else { + u8_ref[dst_i] = u8_src[src_i]; + } + } + } + } + } + + if (fmt == FMT_BF16) { + ref = (void *)u16_ref; + } else { + ref = (void *)u8_ref; + } +} + +static inline u8 * get_tensor_l2g_stride( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + tl_t *tl, + bmk1880v2_tensor_tgmem_stride_t tg_stride, + fmt_t fmt) +{ + bmk1880v2_tdma_l2tg_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + int n = tl->shape.n; + int n_stride = tg_stride.n; + int stride_size = n * n_stride; + u8 *data = NULL; + u16 *u16_data = (u16 *)malloc(sizeof(u16) * stride_size); + u8 *u8_data = (u8 *)malloc(sizeof(u8) * stride_size); + if (!u16_data || !u8_data) { + free(u16_data); + free(u8_data); + return NULL; + } + + memset(&p, 0, sizeof(p)); + + for (int i = 0; i < stride_size; i++) { + if (fmt == FMT_BF16) { + u16_data[i] = 0xcf; + } else { + u8_data[i] = 0xcf; + } + } + + if (fmt == FMT_BF16) { + data = (u8 *)u16_data; + free(u8_data); + } else { + data = u8_data; + free(u16_data); + } + + bmshape_t bms = BM_TENSOR_WITH_FMT(n, n_stride, 1, 1, + (fmt == FMT_BF16)? BM_FMT_BF16 : BM_FMT_INT8); + CVI_RT_MEM dev_mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = CVI_RT_MemGetPAddr(dev_mem); + int ret = CVI_RT_MemCopyS2D(*ctx, dev_mem, (u8 *)data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = fmt; + tg.shape.n = tl->shape.n; + tg.shape.c = tl->shape.c; + tg.shape.h = tl->shape.h; + tg.shape.w = tl->shape.w; + tg.stride = tg_stride; + tg.base_reg_index = 0; + + p.src = tl; + p.dst = &tg; + + bmk1880v2_tdma_l2g_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + ret = CVI_RT_MemCopyD2S(*ctx, (u8 *)data, dev_mem); + assert(ret == BM_SUCCESS); + CVI_RT_MemFree(*ctx, dev_mem); + + return data; +} + +static void test_get_tensor_l2g_stride_unalign( + CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, fmt_t fmt) +{ + bmk1880v2_tensor_tgmem_stride_t tg_stride; + /* + * Make sure (h / 2 * w) is not eu-aligned. + */ + int n = 1; + int c = 5; + int h = 18; + int w = 7; + tl_t *tl_x = NULL; + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + int new_n = n * 2; + int new_h = h / 2; + tg_stride.h = w * 2; + tg_stride.c = w * 2 * new_h * 2; + tg_stride.n = w * 2 * new_h * 2 * c * 2; + + float val = -100; + int stride_size = new_n * tg_stride.n; + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * stride_size); + s8 *s8src_data = (s8 *)malloc(sizeof(s8) * stride_size); + void *src_data = NULL; + u8 *result_x = NULL; + void *ref_x = NULL; + u8 *u8ref_x = NULL; + u16 *u16ref_x = NULL; + + // prepare source data + ref_x = (u8 *)xmalloc(stride_size * ((fmt == FMT_BF16) ?2:1) ); + for (int i = 0; i < stride_size; i++) { + if(fmt == FMT_BF16) { + u16src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } else { + s8src_data[i] = i; + } + } + src_data = (fmt == FMT_BF16) ? 
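// The unalign reference above reinterprets each (h, w) plane as two
// half-planes, doubling n: source element (ni, ci, hi, wi) lands in batch
// ni * 2 + hi / new_h at row hi % new_h, where new_h = h / 2 (h is chosen
// even, 18 in this test). A dense sketch of that remap, with the global
// byte strides left out for clarity:
#include <stdint.h>

static void split_h_into_n(uint16_t *dst, const uint16_t *src,
                           int n, int c, int h, int w) {
  int new_h = h / 2;
  for (int ni = 0; ni < n; ni++)
    for (int ci = 0; ci < c; ci++)
      for (int hi = 0; hi < h; hi++)
        for (int wi = 0; wi < w; wi++)
          dst[(((ni * 2 + hi / new_h) * c + ci) * new_h + (hi % new_h)) * w + wi] =
              src[((ni * c + ci) * h + hi) * w + wi];
}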
(void *)u16src_data : (void *)s8src_data; + + // run tpu operations + tl_x = alloc_tl(bk_ctx, tl_shape, fmt, 1); + put_bf16_tensor_g2l(ctx, bk_ctx, tl_x, (u16 *)src_data, fmt); + tl_x->shape.n = new_n; + tl_x->shape.c = c; + tl_x->shape.h = new_h; + tl_x->shape.w = w; + tl_x->stride = bmk1880v2_tensor_lmem_default_stride(bk_ctx, tl_x->shape, fmt, 0); + result_x = get_tensor_l2g_stride(ctx, bk_ctx, tl_x, tg_stride, fmt); + tl_x->shape = tl_shape; + tl_x->stride = bmk1880v2_tensor_lmem_default_stride(bk_ctx, tl_x->shape, fmt, 1); + get_tensor_l2g_stride_unalign_ref(ref_x, (u16 *)src_data, tl_shape, tg_stride, fmt); + + // compare data + compare_result( ref_x, result_x, fmt, stride_size); + + // free variables + free_tl(bk_ctx, tl_x); + free(ref_x); + free(result_x); + free(u8ref_x); + free(u16ref_x); + free(s8src_data); + free(u16src_data); +} + +#define TEST_ALIGNED 1 // 1: test unalign only, 2: test both align/unalign +int main (void) +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + + test_init(&ctx, &bk_ctx); + for (u32 i = 0; i < nr_fmt; i++) { + for (u8 u8_align = 0; u8_align < TEST_ALIGNED; u8_align++) { + test_get_tensor_l2g_stride_unalign(&ctx, bk_ctx, input_fmt[i].src_fmt); + } + } + test_exit(&ctx); + + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_hists_svm_kernel.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_hists_svm_kernel.cpp new file mode 100644 index 000000000..96033c3f0 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_hists_svm_kernel.cpp @@ -0,0 +1,863 @@ +#include +#include "../1880v2_test_util.h" +//#include +//#undef printf +//#define printf(...) {} +#define IS_PRINT_INPUT (0) + +typedef struct { + int random_seed; + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int stride_w; + int output_c; + int using_bias; + int bReLU_EN; + int r_shift_m; + int opd0_sign; + int opd1_sign; + int opd2_sign; + int bf16_enable; + int unit_size; +} conv_param_t; + +static void print_conv_param(const conv_param_t *p); + +static inline void bf16_relu(float *buf, u64 size) { + for (u64 i = 0; i < size; i++) + if (buf[i] < 0) buf[i] = 0; +} + +static int conv_ref(const conv_param_t *p_param, const u16 *ifmap, const u16 *weight, + const u32 *bias, u16 *ofmap) { + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int do_relu = p_param->bReLU_EN; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + float *result = (float *)malloc(sizeof(float) * in * oc * oh * ow); + if (!result) 
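// conv_ref computes its output geometry through calc_dilute_hw and
// calc_output_hw from the shared test utils. Judging by the conv_ih_ext /
// conv_oh helpers later in this file, the arithmetic they implement is the
// usual dilated/padded convolution geometry; a hedged sketch:
static int dilute_hw(int x, int ins, int ins_last, int pad0, int pad1) {
  // x values with `ins` zeros inserted between neighbors, `ins_last`
  // appended, plus padding on both sides
  return (x - 1) * (ins + 1) + ins_last + 1 + pad0 + pad1;
}
static int output_hw(int x_ext, int k_ext, int stride) {
  return (x_ext - k_ext) / stride + 1;
}
// e.g. ih = 5, ins_h = 1, no padding gives ih_ext = 9; kh = 3 with dh = 2
// gives kh_ext = 5; stride 2 then yields oh = (9 - 5) / 2 + 1 = 3.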
+ return BM_ERR_FAILURE; + + memset(result, 0, sizeof(float) * in * oc * oh * ow); + int ret = BM_SUCCESS; + + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + u16 *i_fmap_pad[ic]; + u16 *kernel_pad[ic]; + + for (int iic = 0; iic < ic; ++iic) { + i_fmap_pad[iic] = NULL; + kernel_pad[iic] = NULL; + fill_pad_fmap_bf16((ifmap + n * ic * ih * iw + iic * ih * iw), &i_fmap_pad[iic], + convert_fp32_bf16(0), pad_left, pad_right, pad_top, pad_bot, ins_h, + ins_w, ins_h_last, ins_w_last, ih, iw); + // kernel_dilation( + fill_pad_fmap_bf16((weight + c * ic * kh * kw + iic * kh * kw), &kernel_pad[iic], + convert_fp32_bf16(0), 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, kh, kw); + } + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + float result_val = result[n * oc * oh * ow + c * oh * ow + ph * ow + pw]; + for (int idxh = 0; idxh < kh_ext; ++idxh) { + for (int idxw = 0; idxw < kw_ext; ++idxw) { + for (int iic = 0; iic < ic; ++iic) { + float ifv = convert_bf16_fp32( + i_fmap_pad[iic][(idxh + ph * stride_h) * iw_ext + idxw + pw * stride_w]); + float ikv = convert_bf16_fp32(kernel_pad[iic][idxh * kw_ext + idxw]); + result_val += ifv * ikv; + } + } + } + result[n * oc * oh * ow + c * oh * ow + ph * ow + pw] = result_val; + } + } + + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n * oc * oh * ow + c * oh * ow + ph * ow + pw] += + convert_hex_fp32(bias[c]); // bias+c ; + } + } + } + + if (do_relu) bf16_relu(&result[n * oc * oh * ow + c * oh * ow], oh * ow); + for (int i = 0; i < ic; i++) { + free(i_fmap_pad[i]); + free(kernel_pad[i]); + } + if (ret != BM_SUCCESS) goto error_release; + } // end for (int c = 0; c < oc; ++c) + } // end for (int n = 0; n < in; n++) + + for (int i = 0; i < in * oc * oh * ow; i++) { + ofmap[i] = convert_fp32_bf16(result[i]); + } + +error_release: + free(result); + + return ret; +} + +static u16 *transform_weight(const tl_shape_t *s, u16 before[]) { + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + u32 size = ic * oc * kh * kw; + u16 *after = (u16 *)malloc(sizeof(u16) * size); + + /* + * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) + */ + for (u32 oci = 0; oci < oc; oci++) { + for (u32 ici = 0; ici < ic; ici++) { + for (u32 khi = 0; khi < kh; khi++) { + for (u32 kwi = 0; kwi < kw; kwi++) { + u32 src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + u32 dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static u16 *re_pack_by_slice_col(bmk1880v2_tensor_tgmem_shape_t *s, int unit_size, u16 before[]) { + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + u32 size = ic * oc * kh * kw; + u16 *after = (u16 *)malloc(sizeof(u16) * size); + u32 khw = kh * kw; + u32 chw = ic * kh * kw; + u32 act_khw = kw / unit_size * kh; + + for (u32 oci = 0; oci < oc; oci++) { + for (u32 ici = 0; ici < ic; ici++) { + for (u32 u = 0; u < khw; u++) { + // slice by column + u32 src_i = oci * chw + ici * khw + u; + + u32 row_idx = u % unit_size; + u32 col_idx = u / unit_size; + + u32 dst_i = oci * chw + ici * khw + (act_khw * row_idx) + col_idx; + after[dst_i] = before[src_i]; + } + } + } + + return after; +} + +static u16 *put_conv_weight(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, const tl_t *tl, u16 *data) { + const tl_shape_t *s = &tl->shape; + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw 
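// re_pack_by_slice_col above re-lays each (kh, kw) weight patch out as
// unit_size interleaved rows: patch element u moves to row u % unit_size,
// column u / unit_size, with act_khw = kh * (kw / unit_size) columns per
// row. A standalone demo of the per-patch remap (assumes kw is a multiple
// of unit_size, which the tests guarantee by construction):
static void repack_patch(const unsigned short *src, unsigned short *dst,
                         int kh, int kw, int unit_size) {
  int khw = kh * kw;
  int act_khw = kh * (kw / unit_size);   // columns per interleaved row
  for (int u = 0; u < khw; u++)
    dst[(u % unit_size) * act_khw + (u / unit_size)] = src[u];
}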
* 2); + CVI_RT_MEM dev_mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = CVI_RT_MemGetPAddr(dev_mem); + u16 *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. CVI_RT_MemCopyS2D regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. + */ + + // u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + // CVI_RT_MEM ab_dev_mem = + // bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + // int ret = CVI_RT_MemCopyS2D(*ctx, ab_dev_mem, (u8*)transformed_data); + int ret = CVI_RT_MemCopyS2D(*ctx, dev_mem, (u8 *)transformed_data); + + assert(ret == BM_SUCCESS); + tl_shape_t tdma_shape = {1, oc, kh * kw, ic}; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_BF16; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1880v2_tensor_tgmem_default_stride(tdma_tg.shape, FMT_BF16); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1880v2_tensor_lmem_default_stride(bk_ctx, tdma_shape, FMT_BF16, 0); + + bmk1880v2_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1880v2_tdma_g2l_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + CVI_RT_MemFree(*ctx, dev_mem); + return transformed_data; +} + +static u16 *transform_bias(int oc, u32 before[]) { + u16 *after = (u16 *)malloc(sizeof(u16) * 2 * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++) { + after[i] = (before[i] >> 16) & 0xffff; + after[i + oc] = before[i] & 0xffff; + } + return after; +} + +static void put_conv_bias(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, const tl_t *tl, u32 *data) { + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2 * sizeof(short), oc, 1, 1); + CVI_RT_MEM dev_mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = CVI_RT_MemGetPAddr(dev_mem); + u16 *transformed_data = transform_bias(oc, data); + int ret = CVI_RT_MemCopyS2D(*ctx, dev_mem, (u8 *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_BF16; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1880v2_tensor_tgmem_default_stride(tg.shape, FMT_BF16); + + bmk1880v2_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1880v2_tdma_g2l_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + CVI_RT_MemFree(*ctx, dev_mem); +} + +static int conv_kh_ext(const conv_param_t *p) { return (p->kh - 1) * p->dh + 1; } + +static int conv_kw_ext(const conv_param_t *p) { return (p->kw - 1) * p->dw + 1; } + +static int conv_ih_ext(const conv_param_t *p) { + return (p->input_h - 1) * (p->ins_h + 1) + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) { + return (p->input_w - 1) * (p->ins_w + 1) + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) { + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) { + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) { + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) { 
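// transform_bias above splits each fp32 bias (kept as raw u32 bits) into
// its high and low 16-bit halves; the high halves form a bf16 row and the
// low halves a second row, which is why the bias tensor is shaped
// (2, oc, 1, 1). A minimal illustration of the split and its exact inverse:
#include <stdint.h>

static void split_bias(uint32_t fp32_bits, uint16_t *hi, uint16_t *lo) {
  *hi = (uint16_t)((fp32_bits >> 16) & 0xffff);  /* bf16-like top half */
  *lo = (uint16_t)(fp32_bits & 0xffff);          /* residual low half  */
}
static uint32_t join_bias(uint16_t hi, uint16_t lo) {
  return ((uint32_t)hi << 16) | lo;              /* round-trips exactly */
}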
+ int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) { + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static u16 *alloc_input(const conv_param_t *p) { + int size = conv_input_size(p); + u16 *buf = (u16 *)malloc(sizeof(u16) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX / 2; // 5 ~ -5 + val = (float)(rand() - RAND_MAX2) * 5 / (float)RAND_MAX; + // val = i; + buf[i] = convert_fp32_bf16(val); + } + + if (IS_PRINT_INPUT) { + printf("input:\n"); + int i = 0; + for (int n = 0; n < p->input_n; n++) { + for (int c = 0; c < p->input_c; c++) { + printf("n/c is %d %d\n", n, c); + for (int h = 0; h < p->input_h; h++) { + for (int w = 0; w < p->input_w; w++) { + printf("%f ", convert_bf16_fp32(buf[i])); + i++; + } + printf("\n"); + } + } + } + } + return buf; +} + +static u16 *alloc_weight(const conv_param_t *p) { + int size = conv_weight_size(p); + u16 *buf = (u16 *)malloc(sizeof(u16) * size); + if (!buf) + return NULL; + + for (int i = 0; i < size; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX / 2; // 5 ~ -5 + val = (float)(rand() - RAND_MAX2) * 5 / (float)RAND_MAX; + // val = i * 0.01; + buf[i] = convert_fp32_bf16(val); + } + + return buf; +} + +static u32 *alloc_bias(const conv_param_t *p) { + int oc = p->output_c; + u32 *bias = (u32 *)malloc(sizeof(u32) * oc); + if (!bias) + return NULL; + + for (int i = 0; i < oc; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX / 2; // 5 ~ -5 + val = (float)(rand() - RAND_MAX2) * 5 / (float)RAND_MAX; + bias[i] = convert_fp32_hex(val); + } + + return bias; +} + +static tl_t *conv_ifmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) { + fmt_t fmt = FMT_BF16; // p->opd0_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 1); +} + +static tl_t *conv_weight_tensor(bmk_ctx_t *ctx, const conv_param_t *p) { + fmt_t fmt = FMT_BF16; // p->opd1_sign? 
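// A note on the initializers below: the "5 ~ -5" comments overstate the
// range. rand() - RAND_MAX / 2 spans roughly [-RAND_MAX/2, RAND_MAX/2], so
// after * 5 / RAND_MAX the values land in about [-2.5, 2.5]. If a true
// [-5, 5] range were the intent, one way to get it:
#include <stdlib.h>

static float rand_pm5(void) {
  return ((float)rand() / (float)RAND_MAX) * 10.0f - 5.0f;
}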
FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static tl_t *conv_ofmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) { + tl_shape_t s; + fmt_t fmt = FMT_BF16; + s.n = p->input_n; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 1); +} + +static tl_t *conv_bias_tensor(bmk_ctx_t *ctx, const conv_param_t *p) { + fmt_t fmt = FMT_BF16; + tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) { + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) || (kw_ext > iw_ext) || (kh_ext <= p->pad_top) || (kh_ext <= p->pad_bot) || + (kw_ext <= p->pad_left) || (kw_ext <= p->pad_right) || (p->pad_top >= (1 << 4)) || + (p->pad_bot >= (1 << 4)) || (p->pad_left >= (1 << 4)) || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok(const bmk1880v2_tiu_convolution_param_t *p, + const conv_param_t *param) { + if (!p->ifmap || !p->ofmap || !p->weight) return 0; + + if (param->using_bias) + if (!p->bias) return 0; + + return 1; +} + +static void make_bmk_conv_param(bmk_ctx_t *ctx, + bmk1880v2_tiu_convolution_param_t *dst, + const conv_param_t *p) { + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + + dst->ifmap = conv_ifmap_tensor(ctx, p); + dst->weight = conv_weight_tensor(ctx, p); + dst->ofmap = conv_ofmap_tensor(ctx, p); + dst->bias = NULL; + dst->ps32_mode = 0; + if (p->using_bias) dst->bias = conv_bias_tensor(ctx, p); + dst->w_is_const = 0; +} + +static void free_bmk_conv_param(bmk_ctx_t *ctx, bmk1880v2_tiu_convolution_param_t *r, + const conv_param_t *p) { + if (p->using_bias && r->bias) free_tl(ctx, r->bias); + if (r->ofmap) free_tl(ctx, r->ofmap); + if (r->weight) free_tl(ctx, r->weight); + if (r->ifmap) free_tl(ctx, r->ifmap); +} + +#if 1 +static void print_conv_param(const conv_param_t *p) { + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", "p->input_w = ", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", 
p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} +#endif + +static void init_conv_param(conv_param_t &p, int oc, bool is_basic) { + printf("init_conv_param\n"); + memset(&p, 0, sizeof(p)); + p.random_seed = clock(); + srand(p.random_seed); + +retry: + + p.input_n = rand() % 5 + 1; + p.input_c = rand() % (5 * 32) + 1; + p.kh = rand() % 7 + 1; + p.kw = rand() % 7 + 1; + p.input_h = rand() % 40 + p.kh; + p.input_w = rand() % 40 + p.kw; + p.output_c = rand() % 10 + 3; + p.stride_h = rand() % (p.kh) + 1; + p.stride_w = rand() % (p.kw) + 1; + p.ins_h = rand() % p.kh; + p.ins_w = rand() % p.kw; + p.ins_h_last = rand() % p.kh; + ; + p.ins_w_last = rand() % p.kw; + ; + p.dh = rand() % 3 + 1; + p.dw = rand() % 3 + 1; + int kh_ext = conv_kh_ext(&p); + int kw_ext = conv_kw_ext(&p); + p.pad_top = rand() % kh_ext; + p.pad_bot = rand() % kh_ext; + p.pad_left = rand() % kw_ext; + p.pad_right = rand() % kw_ext; + p.using_bias = rand() % 2; + p.bReLU_EN = rand() % 2; + p.unit_size = 36; + + { + p.input_n = 1; + p.input_c = 1; + p.kh = 15; + p.kw = 7 * p.unit_size; + p.input_h = 89; + p.input_w = 159 * p.unit_size; + p.output_c = 1; + p.output_c = 500; // test + p.output_c = oc; + p.stride_h = 1; + p.stride_w = 1 * p.unit_size; + p.ins_h = 0; + p.ins_w = 0; + p.ins_h_last = 0; + p.ins_w_last = 0; + p.dh = 1; + p.dw = 1; + p.pad_top = 0; + p.pad_bot = 0; + p.pad_left = 0; + p.pad_right = 0; + p.using_bias = 0; + p.bReLU_EN = 0; + kh_ext = conv_kh_ext(&p); + kw_ext = conv_kw_ext(&p); + } + + if (is_basic) { + p.unit_size = 2; + p.output_c = oc; + p.kh = 3; + p.kw = 2 * p.unit_size; + p.input_h = 6; + p.input_w = 4 * p.unit_size; + p.stride_w = 1 * p.unit_size; + kh_ext = conv_kh_ext(&p); + kw_ext = conv_kw_ext(&p); + } + + if (!conv_param_is_ok(&p)) { + assert(0 && "init param fail, plz check input"); + printf("retry init_conv_param\n"); + goto retry; + } + + p.opd0_sign = rand() % 2; + p.opd1_sign = 1; + p.opd2_sign = 1; + + assert(p.opd1_sign == 1 && p.opd2_sign == 1); + + int ih_ext = conv_ih_ext(&p); + int iw_ext = conv_iw_ext(&p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); + // print_conv_param(&p); +} + +static void print_reorder_data(bmk1880v2_tensor_tgmem_shape_t *tmp_shape, u16 *reorder_data) { + if (IS_PRINT_INPUT) { + printf("reorder_data:\n"); + int i = 0; + for (int n = 0; n < (int)tmp_shape->n; n++) { + for (int c = 0; c < (int)tmp_shape->c; c++) { + printf("n/c is %d %d\n", n, c); + for (int h = 0; h < (int)tmp_shape->h; h++) { + for (int w = 0; w < (int)tmp_shape->w; w++) { + printf("%f ", convert_bf16_fp32(reorder_data[i])); + i++; + } + printf("\n"); + } + } + } + } +} + +static void print_transformed_data(tl_shape_t *tl_svm_shape, u16 *transformed_data) { + if (IS_PRINT_INPUT) { + printf("transformed_data:\n"); + int i = 0; + for (int n = 0; n < (int)tl_svm_shape->n; n++) { + for (int c = 0; c < (int)tl_svm_shape->c; c++) { + 
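// One quirk in init_conv_param above: the assert(0 && "init param fail...")
// fires before the retry printf, so in debug builds a bad random draw aborts
// instead of retrying, and the goto retry is effectively unreachable. If
// retrying is the intent, the check would presumably read:
//   if (!conv_param_is_ok(&p)) {
//     printf("retry init_conv_param\n");
//     goto retry;
//   }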
printf("n/c is %d %d\n", n, c); + for (int h = 0; h < (int)tl_svm_shape->h; h++) { + for (int w = 0; w < (int)tl_svm_shape->w; w++) { + printf("%f ", convert_bf16_fp32(transformed_data[i])); + i++; + } + printf("\n"); + } + } + } + } +} + +static int test_conv(conv_param_t &p_param, CVI_RT_HANDLE ctx, bmk_ctx_t *bk_ctx) { + // fake input/weight, dont care bias + u16 *input = alloc_input(&p_param); + u16 *weight = alloc_weight(&p_param); + u32 *bias = alloc_bias(&p_param); + + // print_conv_param(&p_param); + u16 *output_ref = (u16 *)malloc(sizeof(u16) * conv_output_size(&p_param)); + if (!output_ref) + return 0; + + bmk1880v2_tiu_convolution_param_t conv_param; + memset(&conv_param, 0, sizeof(conv_param)); + u16 *output; + long elapsed; + struct timeval t0, t1; + int tl_alloc_success = 1; + int verify_hists_svm = 1; + tg_t *tg_image; + tg_t *tg_nc_image; + tg_t *tg_svm; + tg_t *tg_output; + u16 *transformed_data; + u16 *reorder_data; + + if (verify_hists_svm) { + // input + bmk1880v2_tensor_tgmem_shape_t image_shape, tmp_shape; + tmp_shape.n = p_param.input_n; + tmp_shape.c = p_param.input_c; + tmp_shape.h = p_param.input_h; + tmp_shape.w = p_param.input_w; + + image_shape = tmp_shape; + image_shape.w = p_param.input_w / p_param.unit_size; + + tg_image = alloc_tg_bf16_gmem(&ctx, tmp_shape, FMT_BF16); + // save to gaddr + put_tg_bf16_gmem(&ctx, tg_image, (u8 *)input); + + // after nc + tg_nc_image = alloc_tg_bf16_gmem(&ctx, tmp_shape, FMT_BF16); + + // svm, shape should be (oc, p_param.unit_size, kh, kw) + bmk1880v2_tensor_tgmem_shape_t svm_shape; + svm_shape.n = p_param.output_c; + svm_shape.c = p_param.unit_size; + svm_shape.h = p_param.kh; + svm_shape.w = p_param.kw / p_param.unit_size; + tg_svm = alloc_tg_bf16_gmem(&ctx, svm_shape, FMT_BF16); + + tmp_shape = svm_shape; + tmp_shape.c = p_param.input_c; + tmp_shape.w = p_param.kw; + + // re-order by \unit_size + reorder_data = re_pack_by_slice_col(&tmp_shape, p_param.unit_size, weight); + print_reorder_data(&tmp_shape, reorder_data); + + tl_shape_t tl_svm_shape; + tl_svm_shape.n = svm_shape.c; + tl_svm_shape.c = svm_shape.n; + tl_svm_shape.h = svm_shape.h; + tl_svm_shape.w = svm_shape.w; + // transpose for hw + transformed_data = transform_weight(&tl_svm_shape, reorder_data); + print_transformed_data(&tl_svm_shape, transformed_data); + + // dont care region 0/1 + // save to gaddr + put_tg_bf16_gmem(&ctx, tg_svm, (u8 *)transformed_data); + + // output + tmp_shape.n = p_param.input_n; + tmp_shape.c = p_param.output_c; + tmp_shape.h = conv_oh(&p_param); + tmp_shape.w = conv_ow(&p_param); + tg_output = alloc_tg_bf16_gmem(&ctx, tmp_shape, FMT_BF16); + + // call kernel function + bf16_hists_svm(bk_ctx, tg_image->start_address, tg_nc_image->start_address, image_shape, + tg_svm->start_address, + svm_shape, // (oc, ic, kh, kw) + tg_output->start_address, p_param.unit_size, FMT_BF16); + + gettimeofday(&t0, NULL); + + test_submit(&ctx); + + gettimeofday(&t1, NULL); + elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; + printf("tpu takes %ld us\n", elapsed); + + output = (u16 *)get_tg_bf16_gmem(&ctx, tg_output); + } else { + // dont care, test used + make_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + if (tl_alloc_success) { + put_bf16_tensor_g2l(&ctx, bk_ctx, conv_param.ifmap, (u16 *)input, FMT_BF16); + put_conv_weight(&ctx, bk_ctx, conv_param.weight, (u16 *)weight); + if (p_param.using_bias) put_conv_bias(&ctx, bk_ctx, conv_param.bias, bias); + 
bmk1880v2_tiu_convolution(bk_ctx, &conv_param); + output = (u16 *)get_bf16_tensor_l2g(&ctx, bk_ctx, conv_param.ofmap, FMT_BF16); + } + } + + if (tl_alloc_success) { + // verity data + gettimeofday(&t0, NULL); + + bmerr_t ret = conv_ref(&p_param, input, weight, bias, output_ref); + + gettimeofday(&t1, NULL); + elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; + printf("cpu takes %ld us\n", elapsed); + + assert(ret == BM_SUCCESS); + + int has_error = array_cmp_int8("Comparing results ...\n", (s8 *)output_ref, (s8 *)output, + conv_output_size(&p_param) * 2); + + if (has_error) { + print_conv_param(&p_param); + printf("Comparison FAILED\n"); + exit(-1); + } + free(output); + } + + if (verify_hists_svm) { + free_tg_gmem(&ctx, tg_image); + free_tg_gmem(&ctx, tg_nc_image); + free_tg_gmem(&ctx, tg_svm); + free_tg_gmem(&ctx, tg_output); + free(transformed_data); + free(reorder_data); + // dont care region 0/1 + } else { + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + } + + free(input); + free(weight); + free(output_ref); + free(bias); + return tl_alloc_success ? 1 : 0; +} + +int main() { + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + int round_mode; + round_mode = set_store_feround(); + int test_finished_num = 0; + + for (int i = 0; i < 1; i++) { + printf("random_test_conv iteration: %d\n", i); + conv_param_t test_conv_param; + + // is_basic_test = 0 means HOG settings + for (int is_basic_test = 1; is_basic_test >= 0; is_basic_test--) { + init_conv_param(test_conv_param, 1, is_basic_test); + test_finished_num += test_conv(test_conv_param, ctx, bk_ctx); + + init_conv_param(test_conv_param, 10, is_basic_test); + test_finished_num += test_conv(test_conv_param, ctx, bk_ctx); + + init_conv_param(test_conv_param, 100, is_basic_test); + test_finished_num += test_conv(test_conv_param, ctx, bk_ctx); + } + } + + restore_feround(round_mode); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_put_bf16_matrix_stride.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_put_bf16_matrix_stride.cpp new file mode 100644 index 000000000..cd2f5e850 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_put_bf16_matrix_stride.cpp @@ -0,0 +1,151 @@ +#include "../1880v2_test_util.h" +#include "1880v2_bf16_util.h" + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_I8}, +}; + +static void put_matrix_g2l_stride_ref( + void *ref, + void *a, + ml_shape_t lmem_shape, + bmk1880v2_matrix_tgmem_stride_t gmem_stride, + fmt_t fmt) +{ + int row = lmem_shape.n; + int col = lmem_shape.col; + int row_stride = gmem_stride.row / ((fmt == FMT_BF16) ?2:1); + u16 *u16_ref = NULL; + u16 *u16_src = NULL; + u8 *u8_ref = NULL; + u8 *u8_src = NULL; + + if (fmt == FMT_BF16) { + u16_ref = (u16 *)ref; + u16_src = (u16 *)a; + } else { + u8_ref = (u8 *)ref; + u8_src = (u8 *)a; + } + + for (int ri = 0; ri < row; ri++) { + for (int ci = 0; ci < col; ci++) { + if (fmt == FMT_BF16) { + u16_ref[ri * col + ci] = u16_src[ri * row_stride + ci]; + } else { + u8_ref[ri * col + ci] = u8_src[ri * row_stride + ci]; + } + } + } + + if (fmt == FMT_BF16) { + ref = (void *)u16_ref; + } else { + ref = (void *)u8_ref; + } +} + +static void put_matrix_g2l_stride( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + ml_t *ml, + bmk1880v2_matrix_tgmem_stride_t gmem_stride, + void *data, + fmt_t fmt) +{ + int row = ml->shape.n; + int col = ml->shape.col; + int row_stride = gmem_stride.row; + + bmshape_t bms = BM_MATRIX_INT16(row, row_stride ); + CVI_RT_MEM devmem = 
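// test_conv above times both the TPU submit and the CPU reference with
// gettimeofday, then compares the outputs byte for byte (bitwise equality
// is expected because both paths round through convert_fp32_bf16). The
// microsecond arithmetic it repeats inline, as a small helper:
#include <sys/time.h>

static long elapsed_us(struct timeval t0, struct timeval t1) {
  return (t1.tv_sec - t0.tv_sec) * 1000000L + (t1.tv_usec - t0.tv_usec);
}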
CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + int ret = CVI_RT_MemCopyS2D(*ctx, devmem, (u8 *)data); + assert(ret == BM_SUCCESS); + + gaddr_t gaddr = CVI_RT_MemGetPAddr(devmem); + mg_t mg; + mg.base_reg_index = 0; + mg.start_address = gaddr; + mg.shape.row = row; + mg.shape.col = col; + mg.stride = gmem_stride; + mg.fmt = fmt; + mg.base_reg_index = 0; + + bmk1880v2_tdma_tg2l_matrix_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.dst = ml; + p.src = &mg; + bmk1880v2_tdma_g2l_bf16_matrix_copy(bk_ctx, &p); + test_submit(ctx); + + CVI_RT_MemFree(*ctx, devmem); + return ; +} + +static void test_put_matrix_g2l_stride( + CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, fmt_t fmt, int eu_align) +{ + int row = 80; + int col = 70; + float val = -100; + int size = row * col; + int row_stride = col * 2; + int stride_size = row * row_stride; + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * stride_size); + s8 *s8src_data = (s8 *)malloc(sizeof(s8) * stride_size); + void *src_data = NULL; + void *result_x = NULL; + void *ref_x = NULL; + + ml_shape_t mls = bmk1880v2_matrix_lmem_default_shape(bk_ctx, row, col, fmt); + ml_t *ml = bmk1880v2_lmem_alloc_matrix(bk_ctx, mls, fmt, eu_align); + bmk1880v2_matrix_tgmem_stride_t gmem_stride; + gmem_stride.row = row_stride * ((fmt == FMT_BF16) ?2:1); + + // prepare source data + for (int i = 0; i < stride_size; i++) { + if(fmt == FMT_BF16) { + u16src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } else { + s8src_data[i] = i; + } + } + ref_x = (u8 *)xmalloc(size * ((fmt == FMT_BF16) ?2:1)); + src_data = (fmt == FMT_BF16) ? (void *)u16src_data : (void *)s8src_data; + + // run tpu operations + put_matrix_g2l_stride(ctx, bk_ctx, ml, gmem_stride, src_data, fmt); + result_x = get_bf16_matrix_l2g(ctx, bk_ctx, ml, fmt); + put_matrix_g2l_stride_ref(ref_x, src_data, mls, gmem_stride, fmt); + + // compare data + if( COMPARE_PASS != compare_result( ref_x, result_x, fmt, size)) + exit(-1); + + // free variables + bmk1880v2_lmem_free_matrix(bk_ctx, ml); + free(s8src_data); + free(u16src_data); + free(ref_x); + free(result_x); +} + +#define TEST_ALIGNED 2 // 1: test unalign only, 2: test both align/unalign +int main () +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + + test_init(&ctx, &bk_ctx); + for (u32 i = 0; i < nr_fmt; i++) { + for (u8 u8_align = 0; u8_align < TEST_ALIGNED; u8_align++) { + test_put_matrix_g2l_stride(&ctx, bk_ctx, input_fmt[i].src_fmt, u8_align); + } + } + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_put_bf16_tensor_stride.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_put_bf16_tensor_stride.cpp new file mode 100644 index 000000000..2672b46f8 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_put_bf16_tensor_stride.cpp @@ -0,0 +1,181 @@ +#include "../1880v2_test_util.h" +#include "1880v2_bf16_util.h" + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_I8}, +}; + +static void put_tensor_g2l_stride_ref( + void *ref, + void *a, + tl_shape_t lmem_shape, + bmk1880v2_tensor_tgmem_stride_t gmem_stride, + fmt_t fmt) +{ + int n = lmem_shape.n; + int c = lmem_shape.c; + int h = lmem_shape.h; + int w = lmem_shape.w; + int nsrc_byte = 1; + u16 *u16_ref = NULL; + u16 *u16_src = NULL; + u8 *u8_ref = NULL; + u8 *u8_src = NULL; + if (fmt == FMT_BF16) { + nsrc_byte = 2; // FMT_BF16 + u16_ref = (u16 *)ref; + u16_src = (u16 *)a; + } else { + nsrc_byte = 1; // FMT_U8, FMT_I8 + u8_ref = (u8 *)ref; + u8_src = (u8 *)a; + } + int n_str = gmem_stride.n / 
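// The matrix stride test above gathers a row-strided global buffer into a
// dense (row, col) matrix; gmem_stride.row is kept in bytes, so the
// reference first divides it by the element size. A standalone sketch of
// the bf16/u16 case:
#include <stdint.h>

static void gather_rows_u16(uint16_t *dst, const uint16_t *src,
                            int row, int col, int row_stride_bytes) {
  int row_stride = row_stride_bytes / 2;  /* elements per source row */
  for (int ri = 0; ri < row; ri++)
    for (int ci = 0; ci < col; ci++)
      dst[ri * col + ci] = src[ri * row_stride + ci];
}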
nsrc_byte; + int c_str = gmem_stride.c / nsrc_byte; + int h_str = gmem_stride.h / nsrc_byte; + int w_str = 1; + + /* + * put stride ddr tensor to local memory in default stride. + */ + + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < c; ci++) { + for (int hi = 0; hi < h; hi++) { + for (int wi = 0; wi < w; wi++) { + u64 src_i = ni * n_str + ci * c_str + hi * h_str + wi * w_str; + u64 dst_i = ni * c * h * w + ci * h * w + hi * w + wi; + if (fmt == FMT_BF16) { + u16_ref[dst_i] = u16_src[src_i]; + } else { + u8_ref[dst_i] = u8_src[src_i]; + } + } + } + } + } + + if (fmt == FMT_BF16) { + ref = (void *)u16_ref; + } else { + ref = (void *)u8_ref; + } +} + +static inline void put_tensor_g2l_stride( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + tl_t *tl, + bmk1880v2_tensor_tgmem_stride_t tg_stride, + void *data, + fmt_t fmt) +{ + int n = tl->shape.n; + int n_stride = tg_stride.n; + bmshape_t bms = BM_TENSOR_WITH_FMT(n, n_stride, 1, 1, + (fmt == FMT_BF16)? BM_FMT_BF16 : BM_FMT_INT8); + CVI_RT_MEM devmem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + int ret = CVI_RT_MemCopyS2D(*ctx, devmem, (u8 *)data); + assert(ret == BM_SUCCESS); + + gaddr_t gaddr = CVI_RT_MemGetPAddr(devmem); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = fmt; + tg.shape.n = tl->shape.n; + tg.shape.c = tl->shape.c; + tg.shape.h = tl->shape.h; + tg.shape.w = tl->shape.w; + tg.stride = tg_stride; + tg.base_reg_index = 0; + + bmk1880v2_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1880v2_tdma_g2l_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + CVI_RT_MemFree(*ctx, devmem); +} + +static void test_put_tensor_g2l_stride( + CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, fmt_t fmt, int eu_align) +{ + tl_t *tl_x = NULL; + int n = 2; + int c = 15; + int h = 10; + int w = 8; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + bmk1880v2_tensor_tgmem_stride_t gmem_stride; + gmem_stride.h = w * 2; + gmem_stride.c = gmem_stride.h * h * 2; + gmem_stride.n = gmem_stride.c * c * 2; + + int size = n * c * h * w; + int stride_size = gmem_stride.n * n; + float val = -100; + void *src_data = NULL; + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * stride_size); + s8 *s8src_data = (s8 *)malloc(sizeof(s8) * stride_size); + void *result_x = NULL; + void *ref_x = NULL; + + // prepare source data + ref_x = (u8 *)xmalloc(size * ((fmt == FMT_BF16) ?2:1) ); + for (int i = 0; i < stride_size; i++) { + if(fmt == FMT_BF16) { + u16src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } else { + s8src_data[i] = i; + } + } + src_data = (fmt == FMT_BF16) ? 
(void *)u16src_data : (void *)s8src_data; + + // run tpu operations + tl_x = alloc_tl(bk_ctx, tl_shape, fmt, eu_align); + put_tensor_g2l_stride(ctx, bk_ctx, tl_x, gmem_stride, (u8 *)src_data, fmt); + result_x = get_bf16_tensor_l2g(ctx, bk_ctx, tl_x, fmt); + put_tensor_g2l_stride_ref(ref_x, src_data, tl_shape, gmem_stride, fmt); + + // compare data + if( COMPARE_PASS != compare_result( ref_x, result_x, fmt, size)) + exit(-1); + + // free variables + free_tl(bk_ctx, tl_x); + free(s8src_data); + free(u16src_data); + free(ref_x); + free(result_x); +} + +#define TEST_ALIGNED 2 // 1: test unalign only, 2: test both align/unalign +int main (void) +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + + test_init(&ctx, &bk_ctx); + for (u32 i = 0; i < nr_fmt; i++) { + for (u8 u8_align = 0; u8_align < TEST_ALIGNED; u8_align++) { + test_put_tensor_g2l_stride(&ctx, bk_ctx, input_fmt[i].src_fmt, u8_align); + } + } + test_exit(&ctx); + + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_put_bf16_tensor_stride_unalign.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_put_bf16_tensor_stride_unalign.cpp new file mode 100644 index 000000000..f9efda75d --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_put_bf16_tensor_stride_unalign.cpp @@ -0,0 +1,181 @@ +#include "../1880v2_test_util.h" +#include "1880v2_bf16_util.h" + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_I8}, +}; + +static void put_tensor_g2l_stride_unalign_ref( + void *ref, + void *a, + tl_shape_t tl_shape, + bmk1880v2_tensor_tgmem_stride_t gmem_stride, + fmt_t fmt) +{ + int n = tl_shape.n; + int c = tl_shape.c; + int h = tl_shape.h; + int w = tl_shape.w; + int nsrc_byte = 1; + u16 *u16_ref = NULL; + u16 *u16_src = NULL; + u8 *u8_ref = NULL; + u8 *u8_src = NULL; + if (fmt == FMT_BF16) { + nsrc_byte = 2; // FMT_BF16 + u16_ref = (u16 *)ref; + u16_src = (u16 *)a; + } else { + nsrc_byte = 1; // FMT_U8, FMT_I8 + u8_ref = (u8 *)ref; + u8_src = (u8 *)a; + } + int n_str = gmem_stride.n / nsrc_byte; + int c_str = gmem_stride.c / nsrc_byte; + int h_str = gmem_stride.h / nsrc_byte; + int w_str = 1; + + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < c; ci++) { + for (int hi = 0; hi < h; hi++) { + for (int wi = 0; wi < w; wi++) { + u64 src_i = ni * n_str + ci * c_str + hi * h_str + wi * w_str; + u64 dst_i = ci * n * h * w + ni * h * w + hi * w + wi; + if (fmt == FMT_BF16) { + u16_ref[dst_i] = u16_src[src_i]; + } else { + u8_ref[dst_i] = u8_src[src_i]; + } + } + } + } + } + + if (fmt == FMT_BF16) { + ref = (void *)u16_ref; + } else { + ref = (void *)u8_ref; + } +} + +static inline void put_tensor_g2l_stride( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + tl_t *tl, + bmk1880v2_tensor_tgmem_stride_t tg_stride, + void *data, + fmt_t fmt) +{ + int n = tl->shape.n; + int n_stride = tg_stride.n; + bmshape_t bms = BM_TENSOR_WITH_FMT(n, n_stride, 1, 1, + (fmt == FMT_BF16)? 
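// put_tensor_g2l_stride_unalign_ref above folds n into h on the readback
// side: the local tensor is read back as (1, c, n * h, w), so element
// (ni, ci, hi, wi) of the source ends up at a c-major destination index.
// A dense sketch of that permutation, ignoring the byte strides the
// reference applies on the global source:
#include <stdint.h>

static void fold_n_into_h(uint16_t *dst, const uint16_t *src,
                          int n, int c, int h, int w) {
  for (int ni = 0; ni < n; ni++)
    for (int ci = 0; ci < c; ci++)
      for (int hi = 0; hi < h; hi++)
        for (int wi = 0; wi < w; wi++)
          dst[((ci * n + ni) * h + hi) * w + wi] =
              src[((ni * c + ci) * h + hi) * w + wi];
}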
BM_FMT_BF16 : BM_FMT_INT8); + CVI_RT_MEM devmem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + int ret = CVI_RT_MemCopyS2D(*ctx, devmem, (u8 *)data); + assert(ret == BM_SUCCESS); + + gaddr_t gaddr = CVI_RT_MemGetPAddr(devmem); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = fmt; + tg.shape.n = tl->shape.n; + tg.shape.c = tl->shape.c; + tg.shape.h = tl->shape.h; + tg.shape.w = tl->shape.w; + tg.stride = tg_stride; + tg.base_reg_index = 0; + + bmk1880v2_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1880v2_tdma_g2l_bf16_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + CVI_RT_MemFree(*ctx, devmem); +} + +static void test_put_tensor_g2l_stride_unalign( + CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, fmt_t fmt, int eu_align) +{ + tl_t *tl_x = NULL; + int n = 6; + int c = 9; //just larger than (npu_num/2) + int h = 1; + int w = 8; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + bmk1880v2_tensor_tgmem_stride_t gmem_stride; + gmem_stride.h = w * 2; + gmem_stride.c = gmem_stride.h * h * 2; + gmem_stride.n = gmem_stride.c * c * 2; + + int size = n * c * h * w; + int stride_size = gmem_stride.n * n; + float val = -100; + void *src_data = NULL; + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * stride_size); + s8 *s8src_data = (s8 *)malloc(sizeof(s8) * stride_size); + u8 *result_x = NULL; + void *ref_x = NULL; + + // prepare source data + ref_x = (u8 *)xmalloc(size * ((fmt == FMT_BF16) ?2:1) ); + for (int i = 0; i < stride_size; i++) { + if(fmt == FMT_BF16) { + u16src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } else { + s8src_data[i] = i; + } + } + src_data = (fmt == FMT_BF16) ? (void *)u16src_data : (void *)s8src_data; + + // run tpu operations + tl_x = alloc_tl(bk_ctx, tl_shape, fmt, eu_align); + put_tensor_g2l_stride(ctx, bk_ctx, tl_x, gmem_stride, (u8 *)src_data, fmt); + tl_x->shape.n = 1; + tl_x->shape.c = c; + tl_x->shape.h = n * h; + tl_x->shape.w = w; + result_x = get_bf16_tensor_l2g(ctx, bk_ctx, tl_x, fmt); + put_tensor_g2l_stride_unalign_ref(ref_x, src_data, tl_shape, gmem_stride, fmt); + + // compare data + if( COMPARE_PASS != compare_result( ref_x, result_x, fmt, size)) + exit(-1); + + // free variables + free_tl(bk_ctx, tl_x); + free(ref_x); + free(result_x); + free(s8src_data); + free(u16src_data); +} + +#define TEST_ALIGNED 1 // 1: test unalign only, 2: test both align/unalign +int main (void) +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + + test_init(&ctx, &bk_ctx); + for (u32 i = 0; i < nr_fmt; i++) { + for (u8 u8_align = 0; u8_align < TEST_ALIGNED; u8_align++) { + test_put_tensor_g2l_stride_unalign(&ctx, bk_ctx, input_fmt[i].src_fmt, u8_align); + } + } + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_put_bf16_tensor_tp_unalign.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_put_bf16_tensor_tp_unalign.cpp new file mode 100644 index 000000000..18a4afef4 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_put_bf16_tensor_tp_unalign.cpp @@ -0,0 +1,168 @@ +#include "../1880v2_test_util.h" +#include "1880v2_bf16_util.h" + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_I8}, +}; + +static void put_tensor_g2l_tp_unalign_ref( + void *ref, + void *a, + tl_shape_t tl_shape, + fmt_t fmt) +{ + /* + * (c, n, h, w) => (n, c, h, w) => (1, c, n * h, w) + */ + int n = tl_shape.n; + int c = tl_shape.c; + int h = tl_shape.h; + int w = 
tl_shape.w; + u16 *u16_ref = NULL; + u16 *u16_src = NULL; + u8 *u8_ref = NULL; + u8 *u8_src = NULL; + + int size = n * c * h * w; + + if (fmt == FMT_BF16) { + u16_ref = (u16 *)ref; + u16_src = (u16 *)a; + } else { + u8_ref = (u8 *)ref; + u8_src = (u8 *)a; + } + + for (int i = 0; i < size; i++) + { + if (fmt == FMT_BF16) { + u16_ref[i] = u16_src[i]; + } else { + u8_ref[i] = u8_src[i]; + } + } + + if (fmt == FMT_BF16) { + ref = (void *)u16_ref; + } else { + ref = (void *)u8_ref; + } +} + +static void put_tensor_g2l_tp( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + tl_t *tl, + void *data, + fmt_t fmt) +{ + int n = tl->shape.n; + int c = tl->shape.c; + int h = tl->shape.h; + int w = tl->shape.w; + + bmshape_t bms = BM_TENSOR_WITH_FMT(n, c, h, w, + (fmt == FMT_BF16)? BM_FMT_BF16 : BM_FMT_INT8); + + CVI_RT_MEM devmem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + int ret = CVI_RT_MemCopyS2D(*ctx, devmem, (u8 *)data); + assert(ret == BM_SUCCESS); + + gaddr_t gaddr = CVI_RT_MemGetPAddr(devmem); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = fmt; + tg.shape.n = tl->shape.c; + tg.shape.c = tl->shape.n; + tg.shape.h = tl->shape.h; + tg.shape.w = tl->shape.w; + tg.stride = bmk1880v2_tensor_tgmem_default_stride(tg.shape, fmt); + tg.base_reg_index = 0; + + bmk1880v2_tdma_tg2l_tensor_copy_nc_transposed_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1880v2_tdma_g2l_bf16_tensor_copy_nc_transposed(bk_ctx, &p); + test_submit(ctx); + + CVI_RT_MemFree(*ctx, devmem); +} + +static void test_put_tensor_g2l_tp_unalign( + CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, fmt_t fmt, int eu_align) +{ + tl_t *tl_x = NULL; + int n = 2; + int c = 15; + int h = 1; + int w = 8; + int size = n * c * h * w; + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + float val = -100; + void *src_data = NULL; + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * size); + s8 *s8src_data = (s8 *)malloc(sizeof(s8) * size); + void *result_x = NULL; + void *ref_x = NULL; + + // prepare source data + ref_x = (u8 *)xmalloc(size * ((fmt == FMT_BF16) ?2:1) ); + for (int i = 0; i < size; i++) { + if(fmt == FMT_BF16) { + u16src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } else { + s8src_data[i] = i; + } + } + src_data = (fmt == FMT_BF16) ? 
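// test_put_tensor_g2l_tp_unalign exercises the nc-transposed TDMA path:
// the global tensor is declared with n and c swapped (note tg.shape.n =
// tl->shape.c in put_tensor_g2l_tp above) and the engine restores the
// (n, c, h, w) order on the way into local memory. The source buffer is
// generated directly in the transposed layout, which is why the reference
// copy is the identity; only the shape swap matters:
//   tg.shape = { tl.c, tl.n, tl.h, tl.w };   /* global side, nc swapped   */
//   tl.shape = { n, c, h, w };               /* restored by the copy path */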
(void *)u16src_data : (void *)s8src_data; + + // run tpu operations + tl_x = alloc_tl(bk_ctx, tl_shape, fmt, eu_align); + put_tensor_g2l_tp(ctx, bk_ctx, tl_x, src_data, fmt); + tl_x->shape.n = 1; + tl_x->shape.c = c; + tl_x->shape.h = n * h; + tl_x->shape.w = w; + result_x = get_bf16_tensor_l2g(ctx, bk_ctx, tl_x, fmt); + tl_x->shape = tl_shape; + put_tensor_g2l_tp_unalign_ref( ref_x, src_data, tl_shape, fmt); + + // compare data + compare_result( ref_x, result_x, fmt, size); + + // free variables + free_tl(bk_ctx, tl_x); + free(ref_x); + free(s8src_data); + free(u16src_data); + free(result_x); +} + +#define TEST_ALIGNED 2 // 1: test unalign only, 2: test both align/unalign +int main (void) +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + + test_init(&ctx, &bk_ctx); + for (u32 i = 0; i < nr_fmt; i++) { + for (u8 u8_align = 0; u8_align < TEST_ALIGNED; u8_align++) { + test_put_tensor_g2l_tp_unalign(&ctx, bk_ctx, input_fmt[i].src_fmt, u8_align); + } + } + test_exit(&ctx); + + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_put_bf16_tensor_unalign.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_put_bf16_tensor_unalign.cpp new file mode 100644 index 000000000..6829a30de --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_put_bf16_tensor_unalign.cpp @@ -0,0 +1,131 @@ +#include "../1880v2_test_util.h" +#include "1880v2_bf16_util.h" + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_I8}, +}; + +static void put_tensor_g2l_unalign_ref( + void *ref, + void *a, + tl_shape_t tl_shape, + fmt_t fmt) +{ + int n = tl_shape.n; + int c = tl_shape.c; + int h = tl_shape.h; + int w = tl_shape.w; + u16 *u16_ref = NULL; + u16 *u16_src = NULL; + u8 *u8_ref = NULL; + u8 *u8_src = NULL; + + if (fmt == FMT_BF16) { + u16_ref = (u16 *)ref; + u16_src = (u16 *)a; + } else { + u8_ref = (u8 *)ref; + u8_src = (u8 *)a; + } + /* + * (n, c, h, w) => (1, c, n * h, w) + */ + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < c; ci++) { + for (int hi = 0; hi < h; hi++) { + for (int wi = 0; wi < w; wi++) { + u64 src_i = ni * c * h * w + ci * h * w + hi * w + wi; + u64 dst_i = ci * n * h * w + ni * h * w + hi * w + wi; + if (fmt == FMT_BF16) { + u16_ref[dst_i] = u16_src[src_i]; + } else { + u8_ref[dst_i] = u8_src[src_i]; + } + } + } + } + } + + if (fmt == FMT_BF16) { + ref = (void *)u16_ref; + } else { + ref = (void *)u8_ref; + } +} + +static void test_put_tensor_g2l_unalign( + CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, fmt_t fmt, int eu_align) +{ + tl_t *tl_x = NULL; + int n = 4; + int c = 9; //just larger than (npu_num/2) + int h = 1; + int w = 8; + int size = n * c * h * w; + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + float val = -100; + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * size); + s8 *s8src_data = (s8 *)malloc(sizeof(s8) * size); + void *src_data; + void *result_x = NULL; + void *ref_x = NULL; + u8 *u8ref_x = NULL; + u16 *u16ref_x = NULL; + + // prepare source data + ref_x = (u8 *)xmalloc(size * ((fmt == FMT_BF16) ?2:1) ); + for (int i = 0; i < size; i++) { + if(fmt == FMT_BF16) { + u16src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } else { + s8src_data[i] = i; + } + } + src_data = (fmt == FMT_BF16) ? 
(void *)u16src_data : (void *)s8src_data; + + // run tpu operations + tl_x = alloc_tl(bk_ctx, tl_shape, fmt, eu_align); + put_bf16_tensor_g2l(ctx, bk_ctx, tl_x, (u16 *)src_data, fmt); + tl_x->shape.n = 1; + tl_x->shape.c = c; + tl_x->shape.h = n * h; + tl_x->shape.w = w; + result_x = get_bf16_tensor_l2g(ctx, bk_ctx, tl_x, fmt); + put_tensor_g2l_unalign_ref(ref_x, src_data, tl_shape, fmt); + + // compare data + compare_result( ref_x, result_x, fmt, size); + + // free variables + free_tl(bk_ctx, tl_x); + free(ref_x); + free(u8ref_x); + free(u16ref_x); + free(s8src_data); + free(u16src_data); + free(result_x); +} + +#define TEST_ALIGNED 1 // 1: test unalign only, 2: test both align/unalign +int main (void) +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + + test_init(&ctx, &bk_ctx); + for (u32 i = 0; i < nr_fmt; i++) { + for (u8 u8_align = 0; u8_align < TEST_ALIGNED; u8_align++) { + test_put_tensor_g2l_unalign(&ctx, bk_ctx, input_fmt[i].src_fmt, u8_align); + } + } + test_exit(&ctx); + + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_tdma_bf16_matrix_vlc_decompress_compress.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_bf16_matrix_vlc_decompress_compress.cpp new file mode 100644 index 000000000..0f4e5559e --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_bf16_matrix_vlc_decompress_compress.cpp @@ -0,0 +1,192 @@ +#include "../1880v2_test_util.h" + +typedef bmk1880v2_tdma_tg2l_matrix_copy_decompressed_param_t decompress_param_t; +typedef bmk1880v2_tdma_l2tg_matrix_copy_compressed_param_t compress_param_t; + +typedef struct{ + decompress_param_t dec_p; + compress_param_t com_p; +} param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => %s\n", + tag, + p->dec_p.dst->shape.n, p->dec_p.dst->shape.c, p->dec_p.dst->shape.w, p->dec_p.dst->shape.col, + (p->dec_p.dst->fmt == FMT_I8)? "signed": "unsigned"); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + mg_shape_t src_shape; + ml_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 1 }, + { 0, 1, 1, 1 }, + }, + { + { 0, 2 }, + { 0, 1, 2, 2 }, + }, + { + { 0, 2 }, + { 0, 2, 1, 2 }, + }, + { + { 0, 7 }, + { 0, 1, 7, 7 }, + }, +#ifndef ENABEL_SIMPLE_BMK1880V2_VLC_TEST + { + { 0, 7 }, + { 0, 2, 4, 7 }, + }, + { + { 0, 7 }, + { 0, 7, 1, 7 }, + }, + { + { 0, 17 }, + { 0, 1, 17, 17 }, + }, + { + { 0, 17 }, + { 0, 3, 7, 17 }, + }, + { + { 0, 17 }, + { 0, 17, 1, 17 }, + }, + { + { 0, 60 }, + { 0, 1, 60, 60 }, + }, + { + { 0, 60 }, + { 0, 30, 2, 60 }, + }, + { + { 0, 60 }, + { 0, 60, 1, 60 }, + } +#endif /* ifndef ENABEL_SIMPLE_BMK1880V2_VLC_TEST*/ +}; + +static void test_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p, u16 *src_data, + CommandInfo* cmd_info) +{ + print_param(stderr, p); + u64 size = ml_shape_size(&p->dec_p.dst->shape); + u64 bytesize = size * bytesize_of_fmt(p->dec_p.dst->fmt); + int is_signed = (p->dec_p.dst->fmt == FMT_I8); + + u16 *gmem_data; + size_t bs_size; + size_t data_type = (p->dec_p.dst->fmt == FMT_BF16) ? 1 : 0; + + gmem_data = (u16* ) vlc_compress((u8* )src_data, bytesize, is_signed, data_type, &bs_size, cmd_info, NULL); + + //1. send compressed one to gaddr and decompress from gaddr to local + put_compressed_mg_gmem(ctx, p->dec_p.src, (u8* ) gmem_data, bs_size); + bmk1880v2_tdma_g2l_matrix_copy_decompressed(bmk, &p->dec_p); + test_submit(ctx); + + //2. 
decompress from sram + bmk1880v2_tdma_l2g_matrix_copy_compressed(bmk, &p->com_p); + test_submit(ctx); + + //3. get final data + size_t bs_buf_size = get_out_bs_buf_size(bytesize, data_type); + u16 *dst_data = (u16* )get_compressed_mg_gmem(ctx, p->com_p.dst, bs_buf_size); + + for (u64 i = 0; i < bs_size / 2; i++) { + if (dst_data[i] != gmem_data[i]) { + fprintf(stderr, "vlc compress comparing failed at dst[%" PRIx64 "], got %d, exp %d\n", + i, dst_data[i], gmem_data[i]); + exit(-1); + } + } + + free(dst_data); + free(gmem_data); +} + +static void destroy_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_compressed_mg_gmem(ctx, p->dec_p.src); + free_compressed_mg_gmem(ctx, p->com_p.dst); + free_ml(bmk, p->dec_p.dst); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + fmt_t fmts[] = { FMT_BF16 }; + u8 fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + + for (u32 row = 1; row < 13; row += 2) { + c->src_shape.row = row; + c->dst_shape.n = row; + for (int dst_align = 0; dst_align < 2; dst_align++) { + for (u8 fmt_i = 0; fmt_i < fmts_sz; fmt_i++) { + fmt_t fmt = fmts[fmt_i]; + param_t p; + memset(&p, 0, sizeof(p)); + //put compressed data to gaddr ->decompress to local -> compress to gaddr + + int is_signed = (fmt == FMT_I8); + int data_type = (fmt == FMT_BF16) ? 1 : 0; + + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + cmd_info.signedness = is_signed; + cmd_info.is_bfloat16 = data_type; + cmd_info.bias0 = 127; + + // src_shape, fmt, &cmd_info); + p.dec_p.dst = alloc_ml_bf16(bmk, c->dst_shape, fmt, dst_align); + + u64 size = ml_shape_size(&p.dec_p.dst->shape); + u16 *src_data = (u16 *)malloc(sizeof(u16) * size); + vlc_init_testdata(src_data, size, fmt == FMT_I8, fmt == FMT_BF16); + + assert(p.dec_p.dst); + + //2. alloc compress + p.com_p.src = p.dec_p.dst; //alloc_tl(bmk, c->lmem_shape, fmt, align); + p.com_p.dst = alloc_vlc_compressed_mg_gmem(ctx, c->src_shape, fmt, &cmd_info); + + //3. test: the seqence like below: + //3.1 put compressed data to gaddr + //3.2 decompress to local + //3.3 compress to gaddr + //printf ("row %u is_align %d fmt %d\n", row, dst_align, fmt); + test_param_g2l(ctx, bmk, &p, src_data, &cmd_info); + destroy_param_g2l(ctx, bmk, &p); + free(src_data); + } + } + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_tdma_bf16_tensor_vlc_decompress_compress.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_bf16_tensor_vlc_decompress_compress.cpp new file mode 100644 index 000000000..79d68e41f --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_bf16_tensor_vlc_decompress_compress.cpp @@ -0,0 +1,174 @@ +#include "../1880v2_test_util.h" + +typedef bmk1880v2_tdma_tg2l_tensor_copy_decompressed_param_t decompress_param_t; +typedef bmk1880v2_tdma_l2tg_tensor_copy_compressed_param_t compress_param_t; + +typedef struct{ + decompress_param_t dec_p; + compress_param_t com_p; +} param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => %d-bit %s\n", + tag, + p->dec_p.dst->shape.n, p->dec_p.dst->shape.c, p->dec_p.dst->shape.h, p->dec_p.dst->shape.w, + p->dec_p.src->bit_length, + (p->dec_p.dst->fmt == FMT_I8)? 
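// Both VLC tests follow the same round trip: compress on the host with
// vlc_compress, stage the bitstream in global memory, decompress it into
// local memory with the TDMA engine, compress it back out, and require the
// two bitstreams to match. The matrix test's checking loop, in isolation
// (the tensor variant's comparison appears to have been trimmed):
#include <stdint.h>
#include <stddef.h>

static int bitstreams_match(const uint16_t *got, const uint16_t *exp,
                            size_t bs_size_bytes) {
  for (size_t i = 0; i < bs_size_bytes / 2; i++)
    if (got[i] != exp[i])
      return 0;  /* first mismatch: compression is not bit-exact */
  return 1;
}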
"signed": "unsigned"); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tl_shape_t lmem_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 17, 13 } + }, + { + { 3, 39, 17, 23 } + }, + { + { 5, 39, 17, 23 } + }, + { + { 20, 35, 2, 2 } + }, +#ifndef ENABEL_SIMPLE_BMK1880V2_VLC_TEST + { + { 1, 1, 1, 1 } + }, + { + { 1, 1, 1, 2 } + }, + { + { 1, 1, 7, 2 } + }, + { + { 1, 1, 10, 60 } + }, + { + { 1, 2, 1, 1 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 3, 16, 1, 1 } + }, + { + { 3, 36, 16, 20 } + }, +#endif /* ifndef ENABEL_SIMPLE_BMK1880V2_VLC_TEST*/ +}; + +static void test_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p, compressed_tg_t* dst) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->dec_p.dst->shape); + u64 bytesize = size * bytesize_of_fmt(p->dec_p.dst->fmt); + int is_signed = (p->dec_p.dst->fmt == FMT_I8); + u16 *src_data = (u16 *)malloc(sizeof(u16) * size); + vlc_init_testdata(src_data, size, p->dec_p.dst->fmt == FMT_I8, p->dec_p.dst->fmt == FMT_BF16); + + u8 *gmem_data; + size_t total_size; + size_t data_type = (p->dec_p.dst->fmt == FMT_BF16) ? 1 : 0; + size_t bs_buf_size = get_out_bs_buf_size(bytesize, data_type); + gmem_data = (uint8_t *) malloc(bs_buf_size * sizeof(uint8_t)); + + // command info + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + cmd_info.signedness = is_signed; + cmd_info.is_bfloat16 = data_type; + cmd_info.bias0 = 127; + // TODO: test + //cmd_info.zero_guard_en = 1; + // TODO generate +-inf +-nan, plz refere https://en.wikipedia.org/wiki/Bfloat16_floating-point_format + + // dec_p.src, gmem_data, total_size); + bmk1880v2_tdma_g2l_tensor_copy_decompressed(bmk, &p->dec_p); + test_submit(ctx); + + dst->zero_guard_en = cmd_info.zero_guard_en; + dst->bias0 = cmd_info.bias0; + dst->bias1 = cmd_info.bias1; + p->com_p.dst = dst; + bmk1880v2_tdma_l2g_tensor_copy_compressed(bmk, &p->com_p); + test_submit(ctx); + + u16 *dst_data = (u16* ) get_compressed_tg_gmem(ctx, p->com_p.dst); + u16* ref_data = (u16* ) gmem_data; + + //dec_p.src); + free_compressed_tg_gmem(ctx, p->com_p.dst); + free_tl(bmk, p->dec_p.dst); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + fmt_t fmts[] = { FMT_BF16 }; + u8 fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + + for (int align = 0; align < 2; align++) { + for (u8 fmt_i = 0; fmt_i < fmts_sz; fmt_i++) { + fmt_t fmt = fmts[fmt_i]; + + param_t p; + memset(&p, 0, sizeof(p)); + p.dec_p.src = alloc_vlc_compressed_tg_gmem(ctx, + &c->lmem_shape, fmt); + p.dec_p.dst = alloc_tl(bmk, c->lmem_shape, fmt, align); + assert(p.dec_p.dst); + + p.com_p.src = p.dec_p.dst; //alloc_tl(bmk, c->lmem_shape, fmt, align); + assert(p.com_p.src); + compressed_tg_t* dst = alloc_vlc_compressed_tg_gmem(ctx, + &c->lmem_shape, fmt); + + test_param_g2l(ctx, bmk, &p, dst); + destroy_param_g2l(ctx, bmk, &p); + } + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_tdma_l2l_bf16_tensor_copy.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_l2l_bf16_tensor_copy.cpp new file mode 100644 index 000000000..a946af291 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_l2l_bf16_tensor_copy.cpp @@ -0,0 +1,194 @@ +#include "../1880v2_test_util.h" +#include "1880v2_bf16_util.h" + 
+typedef bmk1880v2_tdma_l2l_tensor_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_BF16, FMT_I8}, + {FMT_BF16, FMT_U8}, + {FMT_I8, FMT_BF16}, + {FMT_U8, FMT_BF16}, + {FMT_U8, FMT_U8}, + {FMT_I8, FMT_I8}, +}; + +typedef struct { + tl_shape_t src_shape; + tl_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 2, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 2, 7 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 13, 17 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 120, 5 }, + }, { + { 1, 2, 1, 1 }, + { 1, 1, 1, 2 }, + }, { + { 2, 17, 1, 4 }, + { 2, 1, 4, 17 }, + }, { + { 2, 17, 1, 4 }, + { 2, 1, 17, 4 }, + }, { + { 3, 16, 1, 1 }, + { 3, 1, 2, 8 }, + }, { + { 3, 39, 17, 23 }, + { 3, 17, 39, 23 }, + }, { + { 3, 36, 16, 20 }, + { 3, 18, 1, 640 }, + }, { + { 5, 39, 17, 23 }, + { 5, 17, 39, 23 }, + }, { + { 20, 35, 2, 2 }, + { 20, 7, 10, 2 }, + } +}; + +static void destroy_param(bmk_ctx_t *bmk, param_t *p) +{ + free_tl(bmk, p->dst); + free_tl(bmk, p->src); +} + +static void l2l_tensor_copy_ref(param_t *p, u16 ref_data[], u16 src_data[]) +{ + u64 size = tl_shape_size(&p->src->shape); + + for (u64 i = 0; i < size; i++) { + if(p->src->fmt == FMT_BF16 && p->dst->fmt == FMT_BF16) { + ref_data[i] = src_data[i]; + } else if(p->src->fmt == FMT_BF16 && (p->dst->fmt == FMT_I8 || p->dst->fmt == FMT_U8)){ + ref_data[i] = (p->dst->fmt == FMT_I8) ? (u8) convert_bf16_s8(src_data[i]) : (u8) convert_bf16_u8(src_data[i]); + } else if(p->dst->fmt == FMT_BF16 && (p->src->fmt == FMT_I8 || p->src->fmt == FMT_U8)){ + u8* u8src_data = (u8*)src_data; + u8 sign = p->src->fmt == FMT_I8 ? 
1 : 0; + ref_data[i] = convert_int8_bf16(u8src_data[i], sign); + } else if(p->dst->fmt == p->src->fmt){ // fix8b -> fix8b + u8* u8src_data; + u8src_data = (u8*) src_data; + ref_data[i] = u8src_data[i]; + } else { + fprintf(stderr, "Error src_fmt_type (%d) or dst_fmttype (%d)", p->src->fmt, p->dst->fmt); + exit(-1); + } + } +} + +static void test_param(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->src->shape); + + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * size); + u8 *u8src_data = (u8 *)malloc(sizeof(u8) * size); + u8 *src_data; + + if(p->src->fmt == FMT_BF16) { + /* bf16*/ + float val = -100; + for(u64 i = 0; i < size; i++) { + u16src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } + src_data = (u8*)u16src_data; + } else { + /* int8 -> bf16*/ + for(u64 i = 0; i < size; i++) { + u8src_data[i] = 200 + i; + } + src_data = u8src_data; + } + + put_bf16_tensor_g2l(ctx, bmk, p->src, (u16*)src_data, p->src->fmt); + bmk1880v2_tdma_l2l_bf16_tensor_copy(bmk, p); + u16 *dst_data = (u16*) get_bf16_tensor_l2g(ctx, bmk, p->dst, p->dst->fmt); + + u16 *ref_data = (u16 *)malloc(sizeof(u16) * size); + l2l_tensor_copy_ref(p, ref_data, (u16*)src_data); + + if(p->dst->fmt == FMT_BF16) { + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing(%d->%d) failed at dst[%" PRIu64 "], got %x, exp %x\n", + p->src->fmt, p->dst->fmt, i, dst_data[i], ref_data[i]); + exit(-1); + } + } + } else if(p->dst->fmt == FMT_U8 || p->dst->fmt == FMT_I8) { + for (u64 i = 0; i < size; i++) { + u32 shift = (i%2)*8; + if ((u8)(dst_data[i/2] >> shift) != (u8)ref_data[i]) { + fprintf(stderr, "comparing(%d->%d) failed at dst[%" PRIu64 "], got %x, exp %x\n", + p->src->fmt, p->dst->fmt, i, (dst_data[i/2] >> shift), ref_data[i]); + exit(-1); + } + } + } else { + fprintf(stderr, "Error compreing type src_fmt_type (%d) or dst_fmttype (%d)", p->src->fmt, p->dst->fmt); + } + + free(u8src_data); + free(u16src_data); + free(dst_data); + free(ref_data); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (u32 i = 0; i < nr_fmt; i++) { + for (int src_align = 0; src_align < 2; src_align++) { + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + p.src = alloc_tl(bmk, c->src_shape, input_fmt[i].src_fmt, src_align); + p.dst = alloc_tl(bmk, c->dst_shape, input_fmt[i].dst_fmt, dst_align); + test_param(ctx, bmk, &p); + destroy_param(bmk, &p); + } + } + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_tdma_l2tg_bf16_general_copy.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_l2tg_bf16_general_copy.cpp new file mode 100644 index 000000000..977a44bf8 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_l2tg_bf16_general_copy.cpp @@ -0,0 +1,92 @@ +#include "../1880v2_test_util.h" +#include "1880v2_bf16_util.h" + +typedef bmk1880v2_tdma_l2tg_bf16_general_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: %u bytes from %x to %u:%lx\n", tag, + p->src_bytes, p->src_address, p->dst_base_reg_index, p->dst_address); +} + +#define print_param(f, p) __print_param(__func__, f, p) + 
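Every case in this and the following tests follows the same pattern: stage a known pattern, run one TDMA operation, read the result back, and compare 16-bit words. The inline compare loops repeated throughout these files could be factored into a single helper, sketched here (hypothetical; the suite keeps the loops inline so each failure message can name the source and destination formats):

/* Hypothetical factoring of the inline u16 compare loops in these tests. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static void compare_u16(const uint16_t *got, const uint16_t *exp,
                        uint64_t n, const char *what)
{
  for (uint64_t i = 0; i < n; i++) {
    if (got[i] != exp[i]) {
      fprintf(stderr, "%s: comparing failed at dst[%" PRIu64 "], got %x, exp %x\n",
              what, i, got[i], exp[i]);
      exit(-1);
    }
  }
}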
+typedef param_t case_t; + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, +}; + +static case_t g_cases[] = { + { 0, 0, 0, 1 * 2, FMT_F32, FMT_F32 }, + { 0, 0, 0, 39 * 2, FMT_F32, FMT_F32 }, + { 0, 0, 0, 4096 * 2, FMT_F32, FMT_F32 }, + { 0, 0, 100, 1 * 2, FMT_F32, FMT_F32 }, + { 0, 0, 200, 39 * 2, FMT_F32, FMT_F32 }, + { 0, 0, 1024, 4096 * 2, FMT_F32, FMT_F32 }, + { 39, 0, 100, 1 * 2, FMT_F32, FMT_F32 }, + { 47, 0, 200, 39 * 2, FMT_F32, FMT_F32 }, + { 2048, 0, 1024, 4096 * 2, FMT_F32, FMT_F32 }, +}; + +static void l2tg_general_copy_ref(param_t *p, u16 ref_data[], u16 src_data[]) +{ + for (u32 i = 0; i < p->src_bytes/2; i++) + ref_data[i] = src_data[i]; +} + +static void test_param_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = p->src_bytes/2 ; + u16 *src_data = (u16 *)malloc(sizeof(u16) * size); + static float val = -100; + for (u64 i = 0; i < size; i++) { + src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } + put_bytes_g2l(ctx, bmk, p->src_address, size * 2, (u8*)src_data); + + bmk1880v2_tdma_l2g_bf16_general_copy(bmk, p); + test_submit(ctx); + //u16 *dst_data = (u16*) get_bytes_gmem(ctx, p->dst_address, size * 2); + u16 *dst_data = (u16*)get_bytes_l2g(ctx, bmk, p->src_address, size * 2); + + u16 *ref_data = (u16 *)malloc(sizeof(u16) * size); + l2tg_general_copy_ref(p, ref_data, src_data); + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + exit(-1); + } + } + free(src_data); + free(dst_data); + free(ref_data); +} + + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + param_t *p = c; + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (u32 i = 0; i < nr_fmt; i++) { + p->src_fmt = input_fmt[i].src_fmt; + p->dst_fmt = input_fmt[i].dst_fmt; + test_param_l2g(ctx, bmk, p); + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_tdma_l2tg_bf16_matrix_copy.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_l2tg_bf16_matrix_copy.cpp new file mode 100644 index 000000000..aca183cfe --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_l2tg_bf16_matrix_copy.cpp @@ -0,0 +1,191 @@ +#include "../1880v2_test_util.h" +#include "1880v2_bf16_util.h" + +typedef bmk1880v2_tdma_l2tg_matrix_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.w, p->src->shape.col, + p->dst->shape.row, p->dst->shape.col); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_BF16, FMT_I8}, + {FMT_BF16, FMT_U8}, + {FMT_U8, FMT_U8}, + {FMT_I8, FMT_I8}, +}; + +typedef struct { + ml_shape_t src_shape; + mg_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 1, 1, 1 }, + { 0, 1 }, + }, { + { 0, 1, 2, 2 }, + { 1, 2 }, + }, { + { 0, 2, 1, 2 }, + { 1, 2 }, + }, { + { 0, 1, 7, 7 }, + { 0, 7 }, + }, { + { 0, 2, 4, 7 }, + { 0, 7 }, + }, { + { 0, 7, 1, 7 }, + { 0, 7 }, + }, { + { 0, 1, 17, 17 }, + { 0, 17 }, + }, { + { 0, 3, 7, 17 }, + { 0, 17 }, + }, { + { 0, 17, 1, 17 }, + { 0, 17 }, + }, { + { 0, 1, 60, 60 }, + { 0, 60 }, + }, { + { 0, 30, 2, 60 }, + { 0, 60 }, + }, { + { 0, 60, 1, 60 }, + { 0, 60 }, + } +}; + 
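The 0 fields in the shapes above are placeholders: test_one_case further down overwrites src_shape.n and dst_shape.row with its row counter before allocating, so each listed case actually runs for every odd row count from 1 to 11, per format and alignment. Condensed, the expansion looks like this (an illustrative restatement of the loop below, not new behavior):

/* Illustrative restatement of the case expansion in test_one_case. */
static void expand_rows(case_t *c)
{
  for (u32 row = 1; row < 13; row += 2) {  /* odd rows: 1, 3, ..., 11 */
    c->src_shape.n   = row;                /* fill the 0 placeholders */
    c->dst_shape.row = row;
    /* ... alloc ml/mg buffers, run one copy per alignment, verify, free ... */
  }
}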
+static void l2tg_matrix_copy_ref(param_t *p, u16 ref_data[], u16 src_data[]) +{ + u64 size = ml_shape_size(&p->src->shape); + + for (u64 i = 0; i < size; i++) { + if(p->src->fmt == FMT_BF16 && p->dst->fmt == FMT_BF16) // bf16 -> bf16 + ref_data[i] = src_data[i]; + else if(p->src->fmt == FMT_BF16 && (p->dst->fmt == FMT_I8 || p->dst->fmt == FMT_U8)){ // i8/u8 -> bf16 + u8 sign = p->dst->fmt == FMT_I8 ? 1 : 0; + u8 val = sign ? (u8) convert_bf16_s8(src_data[i]) : (u8) convert_bf16_u8(src_data[i]); + ref_data[i] = (u16) val; + } else if(p->dst->fmt == p->src->fmt) { // i8/u8 -> i8/u8 + u8* u8src_data; + u8src_data = (u8*) src_data; + ref_data[i] = u8src_data[i]; + } else { + fprintf(stderr, "Error src_fmt_type (%d) or dst_fmttype (%d)", p->src->fmt, p->dst->fmt); + } + } +} + +static void test_param_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = ml_shape_size(&p->src->shape); + + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * size); + u8 *u8src_data = (u8 *)malloc(sizeof(u8) * size); + u8 *src_data; + + if(p->src->fmt == FMT_BF16) { + /* bf16*/ + float val = -100; + for(u64 i = 0; i < size; i++) { + u16src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } + src_data = (u8*)u16src_data; + } else { + /* int8 -> bf16*/ + for(u64 i = 0; i < size; i++) { + u8src_data[i] = 200 + i; + } + src_data = u8src_data; + } + + put_bf16_matrix_g2l(ctx, bmk, p->src, (u8*)src_data, p->src->fmt); + bmk1880v2_tdma_l2g_bf16_matrix_copy(bmk, p); + test_submit(ctx); + u16 *dst_data = (u16*) get_mg_bf16_gmem(ctx, p->dst); + + u16 *ref_data = (u16 *)malloc(sizeof(u16) * size); + l2tg_matrix_copy_ref(p, ref_data, (u16*) src_data); + + if(p->dst->fmt == FMT_BF16 && p->src->fmt == FMT_BF16) { + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing(%d->%d) failed at dst[%" PRIu64 "], got %x, exp %x\n", + p->src->fmt, p->dst->fmt, i, dst_data[i], ref_data[i]); + exit(-1); + } + } + } else if(p->dst->fmt == FMT_U8 || p->dst->fmt == FMT_I8) { + for (u64 i = 0; i < size; i++) { + u32 shift = (i%2)*8; + if ((u8)(dst_data[i/2] >> shift) != (u8)ref_data[i]) { + fprintf(stderr, "comparing(%d->%d) failed at dst[%" PRIu64 "], got %x, exp %x\n", + p->src->fmt, p->dst->fmt, i, (dst_data[i/2] >> shift), ref_data[i]); + exit(-1); + } + } + } else { + fprintf(stderr, "Error compreing type src_fmt_type (%d) or dst_fmttype (%d)", p->src->fmt, p->dst->fmt); + } + + free(u8src_data); + free(u16src_data); + free(dst_data); + free(ref_data); +} + + +static void destroy_param_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_ml(bmk, p->src); + free_mg_gmem(ctx, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (u32 i = 0; i < nr_fmt; i++) { + for (u32 row = 1; row < 13; row += 2) { + c->src_shape.n = row; + c->dst_shape.row = row; + for (int src_align = 0; src_align < 2; src_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_ml_bf16(bmk, c->src_shape, input_fmt[i].src_fmt, src_align); + p.dst = alloc_mg_bf16_gmem(ctx, c->dst_shape, input_fmt[i].dst_fmt); + test_param_l2g(ctx, bmk, &p); + destroy_param_l2g(ctx, bmk, &p); + + } + } + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git 
a/cviruntime/test/1880v2/bf16/test_1880v2_tdma_l2tg_bf16_matrix_vlc_copy_compressed.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_l2tg_bf16_matrix_vlc_copy_compressed.cpp new file mode 100644 index 000000000..abdeac6e5 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_l2tg_bf16_matrix_vlc_copy_compressed.cpp @@ -0,0 +1,167 @@ +#include "../1880v2_test_util.h" + +typedef bmk1880v2_tdma_l2tg_matrix_copy_compressed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.w, p->src->shape.col, + p->dst->m.shape.row, p->dst->m.shape.col); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + ml_shape_t src_shape; + mg_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 2, 4, 7 }, + { 0, 7 }, + }, + { + { 0, 1, 2, 2 }, + { 1, 2 }, + }, + { + { 0, 3, 7, 17 }, + { 0, 17 }, + }, + { + { 0, 17, 1, 17 }, + { 0, 17 }, + }, + { + { 0, 60, 1, 60 }, + { 0, 60 }, + }, +#ifndef ENABEL_SIMPLE_BMK1880V2_VLC_TEST + { + { 0, 1, 1, 1 }, + { 0, 1 }, + }, + { + { 0, 2, 1, 2 }, + { 1, 2 }, + }, + { + { 0, 1, 7, 7 }, + { 0, 7 }, + }, + { + { 0, 7, 1, 7 }, + { 0, 7 }, + }, + { + { 0, 1, 17, 17 }, + { 0, 17 }, + }, + { + { 0, 1, 60, 60 }, + { 0, 60 }, + }, + { + { 0, 30, 2, 60 }, + { 0, 60 }, + }, +#endif /* ifndef ENABEL_SIMPLE_BMK1880V2_VLC_TEST*/ +}; + +static void test_param_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p, u16* src_data, CommandInfo * cmd_info) +{ + print_param(stderr, p); + u64 size = ml_shape_size(&p->src->shape); + u64 bytesize = size * bytesize_of_fmt(p->src->fmt); + + put_bf16_matrix_g2l(ctx, bmk, p->src, (u8*)src_data, p->src->fmt); + bmk1880v2_tdma_l2g_matrix_copy_compressed(bmk, p); + test_submit(ctx); + + int is_signed = (p->src->fmt == FMT_I8); + int data_type = (p->src->fmt == FMT_BF16) ? 1 : 0; + size_t bs_size; + + size_t bs_buf_size = get_out_bs_buf_size(bytesize, data_type); + u16 *ref_data = (u16* ) vlc_compress((u8* )src_data, bytesize, is_signed, data_type, &bs_size, cmd_info, NULL); + u16 *dst_data = (u16* ) get_compressed_mg_gmem(ctx, p->dst, bs_buf_size); + + // src); + free_compressed_mg_gmem(ctx, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + fmt_t fmts[] = { FMT_BF16 }; + u8 fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + + for (u32 row = 1; row < 13; row += 2) { + c->src_shape.n = row; + c->dst_shape.row = row; + for (int src_align = 0; src_align < 2; src_align++) { + for (u8 fmt_i = 0; fmt_i < fmts_sz; fmt_i++) { + fmt_t fmt = fmts[fmt_i]; + param_t p; + memset(&p, 0, sizeof(p)); + p.src = alloc_ml_bf16(bmk, c->src_shape, fmt, src_align); + + u64 size = ml_shape_size(&p.src->shape); + u16 *src_data = (u16 *)malloc(sizeof(u16) * size); + vlc_init_testdata(src_data, size, fmt == FMT_I8, fmt == FMT_BF16); + + //size_t bs_size; + CommandInfo cmd_info; + int is_signed = (p.src->fmt == FMT_I8); + int data_type = (p.src->fmt == FMT_BF16) ? 
1 : 0; + + memset(&cmd_info, 0, sizeof(CommandInfo)); + cmd_info.signedness = is_signed; + cmd_info.is_bfloat16 = data_type; + cmd_info.bias0 = 127; + + // fmt, &bs_size, &cmd_info); + + // dst_shape, p.src->fmt, &cmd_info); + + test_param_l2g(ctx, bmk, &p, src_data, &cmd_info); + destroy_param_l2g(ctx, bmk, &p); + free(src_data); + } + } + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_tdma_l2tg_bf16_tensor_copy.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_l2tg_bf16_tensor_copy.cpp new file mode 100644 index 000000000..b08ab01eb --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_l2tg_bf16_tensor_copy.cpp @@ -0,0 +1,174 @@ +#include "../1880v2_test_util.h" +#include "1880v2_bf16_util.h" + +typedef bmk1880v2_tdma_l2tg_tensor_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_BF16, FMT_I8}, + {FMT_BF16, FMT_U8}, +}; + +typedef struct { + tl_shape_t src_shape; + tg_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 2, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 2, 7 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 13, 17 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 120, 5 }, + }, { + { 1, 2, 1, 1 }, + { 1, 1, 1, 2 }, + }, { + { 2, 17, 1, 4 }, + { 2, 1, 4, 17 }, + }, { + { 2, 17, 1, 4 }, + { 2, 1, 17, 4 }, + }, { + { 3, 16, 1, 1 }, + { 3, 1, 2, 8 }, + }, { + { 3, 39, 17, 23 }, + { 3, 17, 39, 23 }, + }, { + { 3, 36, 16, 20 }, + { 3, 18, 1, 640 }, + }, { + { 5, 39, 17, 23 }, + { 5, 17, 39, 23 }, + }, { + { 20, 35, 2, 2 }, + { 20, 7, 10, 2 }, + } +}; + +static void l2tg_tensor_copy_ref(param_t *p, u16 ref_data[], u16 src_data[]) +{ + u64 size = tl_shape_size(&p->src->shape); + + for (u64 i = 0; i < size; i++) { + if(p->src->fmt == FMT_BF16 && p->dst->fmt == FMT_BF16) + ref_data[i] = src_data[i]; + else if (p->src->fmt == FMT_BF16 && (p->dst->fmt == FMT_I8 || p->dst->fmt == FMT_U8)) { + u8 sign = p->dst->fmt == FMT_I8 ? 1 : 0; + s16 val = sign ? 
(s16) convert_bf16_s8(src_data[i]) : (u16) convert_bf16_u8(src_data[i]); + ref_data[i] = u16 (val); + } else if(p->dst->fmt == p->src->fmt){ //i8->i8 + ref_data[i] = src_data[i]; + } else { + fprintf(stderr, "Error src_fmt_type (%d) or dst_fmttype (%d)", p->src->fmt, p->dst->fmt); + exit(-1); + } + } +} + +static void test_param_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->src->shape); + + u16 *src_data = (u16 *)malloc(sizeof(u16) * size); + float val = -100; + for(u64 i = 0; i < size; i++) { + src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } + + put_bf16_tensor_g2l(ctx, bmk, p->src, src_data, p->src->fmt); + bmk1880v2_tdma_l2g_bf16_tensor_copy(bmk, p); + test_submit(ctx); + u16 *dst_data = (u16*)get_tg_bf16_gmem(ctx, p->dst); + + u16 *ref_data = (u16 *)malloc(sizeof(u16) * size); + l2tg_tensor_copy_ref(p, ref_data, src_data); + + if(p->dst->fmt == FMT_BF16 && p->src->fmt == FMT_BF16) { + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing(%d->%d) failed at dst[%" PRIu64 "], got %x, exp %x\n", + p->src->fmt, p->dst->fmt, i, dst_data[i], ref_data[i]); + exit(-1); + } + } + } else if(p->dst->fmt == FMT_U8 || p->dst->fmt == FMT_I8) { + for (u64 i = 0; i < size; i++) { + u32 shift = (i%2)*8; + if ((u8)(dst_data[i/2] >> shift) != (u8)ref_data[i]) { + fprintf(stderr, "comparing (bf16->i8/u8) failed at dst[%" PRIu64 "], got %x, exp %x\n", + i, (dst_data[i/2] >> shift) , ref_data[i]); + exit(-1); + } + } + } else { + fprintf(stderr, "Error compreing type src_fmt_type (%d) or dst_fmttype (%d)", p->src->fmt, p->dst->fmt); + exit(-1); + } + + free(src_data); + free(dst_data); + free(ref_data); +} + + +static void destroy_param_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_tg_gmem(ctx, p->dst); + free_tl(bmk, p->src); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (u32 i = 0; i < nr_fmt; i++) { + for (int src_align = 0; src_align < 2; src_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_tl(bmk, c->src_shape, input_fmt[i].src_fmt, src_align); + p.dst = alloc_tg_bf16_gmem(ctx, c->dst_shape, input_fmt[i].dst_fmt); + test_param_l2g(ctx, bmk, &p); + destroy_param_l2g(ctx, bmk, &p); + + } + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_tdma_l2tg_bf16_tensor_copy_nc_transposed.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_l2tg_bf16_tensor_copy_nc_transposed.cpp new file mode 100644 index 000000000..470444b2c --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_l2tg_bf16_tensor_copy_nc_transposed.cpp @@ -0,0 +1,263 @@ +#include "../1880v2_test_util.h" +#include "1880v2_bf16_util.h" + +typedef bmk1880v2_tdma_l2tg_tensor_copy_nc_transposed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_BF16, FMT_I8}, + 
{FMT_BF16, FMT_U8}, +}; + +typedef struct { + tl_shape_t src_shape; + tg_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 1, 2 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 2, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 7, 2 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 2, 7 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 17, 13 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 13, 17 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 10, 60 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 2, 300 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 3, 200 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 4, 150 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 5, 120 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 60, 10 }, + }, { + { 1, 1, 120, 5 }, + { 1, 1, 120, 5 }, + }, { + { 1, 2, 1, 1 }, + { 2, 1, 1, 1 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 1, 4 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 2, 2 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 4, 1 }, + }, { + { 17, 2, 2, 2 }, + { 2, 17, 2, 2 }, + }, { + { 17, 2, 4, 1 }, + { 2, 17, 4, 1 }, + }, { + { 3, 16, 1, 1 }, + { 16, 3, 1, 1 }, + }, { + { 3, 39, 23, 17 }, + { 39, 3, 23, 17 }, + }, { + { 3, 39, 17, 23 }, + { 39, 3, 17, 23 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 16, 20 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 2, 160 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 4, 80 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 8, 40 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 20, 16 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 32, 10 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 64, 5 }, + }, { + { 5, 39, 17, 23 }, + { 39, 5, 17, 23 }, + }, { + { 20, 35, 2, 2 }, + { 35, 20, 2, 2 }, + }, { + { 35, 20, 2, 2 }, + { 20, 35, 2, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 160, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 2, 160 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 4, 80 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 8, 40 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 10, 32 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 20, 16 }, + }, { + { 39, 5, 23, 17 }, + { 5, 39, 23, 17 }, + } +}; + +static void l2tg_tensor_copy_nc_transposed_ref( + param_t *p, u16 ref_data[], u16 src_data[]) +{ + tl_shape_t s = p->src->shape; + u32 n = s.n; + u32 c = s.c; + u32 hw = s.h * s.w; + + for (u32 ni = 0; ni < n; ni++) { + for (u32 ci = 0; ci < c; ci++) { + for (u32 hwi = 0; hwi < hw; hwi++) { + u32 src_i = ni * c * hw + ci * hw + hwi; + u32 dst_i = ci * n * hw + ni * hw + hwi; + if(p->src->fmt == FMT_BF16 && p->dst->fmt == FMT_BF16) + ref_data[dst_i] = src_data[src_i]; + else if (p->src->fmt == FMT_BF16 && (p->dst->fmt == FMT_I8 || p->dst->fmt == FMT_U8)) { + u8 sign = p->dst->fmt == FMT_I8 ? 1 : 0; + u8 val = sign ? 
(u8) convert_bf16_s8(src_data[src_i]) : (u8) convert_bf16_u8(src_data[src_i]); + ref_data[dst_i] = u8 (val); + } else if(p->dst->fmt == p->src->fmt){ //i8->i8 + ref_data[dst_i] = src_data[src_i]; + } else { + fprintf(stderr, "Error src_fmt_type (%d) or dst_fmttype (%d)", p->src->fmt, p->dst->fmt); + exit(-1); + } + } + } + } +} + +static void test_param_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->src->shape); + + u16 *src_data = (u16 *)malloc(sizeof(u16) * size); + float val = -100; + for (u64 i = 0; i < size; i++) { + src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } + + put_bf16_tensor_g2l(ctx, bmk, p->src, src_data, p->src->fmt); + bmk1880v2_tdma_l2g_bf16_tensor_copy_nc_transposed(bmk, p); + test_submit(ctx); + u16 *dst_data = (u16*) get_tg_bf16_gmem(ctx, p->dst); + + u16 *ref_data = (u16 *)malloc(sizeof(u16) * size); + l2tg_tensor_copy_nc_transposed_ref(p, ref_data, src_data); + + if(p->dst->fmt == FMT_BF16 && p->src->fmt == FMT_BF16) { + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + } else if(p->dst->fmt == FMT_U8 || p->dst->fmt == FMT_I8) { + for (u64 i = 0; i < size; i++) { + u32 shift = (i%2)*8; + if ((u8)(dst_data[i/2] >> shift) != (u8)ref_data[i]) { + fprintf(stderr, "comparing (bf16->i8/u8) failed at dst[%" PRIu64 "], got %x, exp %x\n", + i,(u8) (dst_data[i/2] >> shift) , ref_data[i]); + exit(-1); + } + } + } else { + fprintf(stderr, "Error compreing type src_fmt_type (%d) or dst_fmttype (%d)", p->src->fmt, p->dst->fmt); + } + + free(src_data); + free(dst_data); + free(ref_data); +} + +static void destroy_param_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_tg_gmem(ctx, p->dst); + free_tl(bmk, p->src); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (u32 i = 0; i < nr_fmt; i++) { + for (int src_align = 0; src_align < 2; src_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_tl(bmk, c->src_shape, input_fmt[i].src_fmt, src_align); + p.dst = alloc_tg_bf16_gmem(ctx, c->dst_shape, input_fmt[i].dst_fmt); + test_param_l2g(ctx, bmk, &p); + destroy_param_l2g(ctx, bmk, &p); + + } + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_tdma_l2tg_bf16_tensor_fill_constant.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_l2tg_bf16_tensor_fill_constant.cpp new file mode 100644 index 000000000..ae7ab80f2 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_l2tg_bf16_tensor_fill_constant.cpp @@ -0,0 +1,141 @@ +#include "../1880v2_test_util.h" +#include "1880v2_bf16_util.h" + +typedef bmk1880v2_tdma_l2tg_tensor_fill_constant_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: %u => (%u, %u, %u, %u)\n", + tag, p->constant, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + float constant; + tg_shape_t dst_shape; +} case_t; + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, +}; + +static case_t g_cases[] = { + 
{ + 37, { 1, 1, 1, 1 } + }, { + 39, { 1, 1, 1, 2 } + }, { + 23, { 1, 1, 2, 1 } + }, { + 19, { 1, 1, 7, 2 } + }, { + 17, { 1, 1, 2, 7 } + }, { + 13, { 1, 1, 17, 13 } + }, { + 11, { 1, 1, 13, 17 } + }, { + 7, { 1, 1, 10, 60 } + }, { + 9, { 1, 1, 120, 5 } + }, { + 2, { 1, 2, 1, 1 } + }, { + 3, { 1, 1, 1, 2 } + }, { + 5, { 2, 17, 1, 4 } + }, { + 41, { 2, 1, 4, 17 } + }, { + 5, { 2, 17, 1, 4 } + }, { + 9, { 2, 1, 17, 4 } + }, { + 17, { 3, 16, 1, 1 } + }, { + 26, { 3, 1, 2, 8 } + }, { + 103, { 3, 39, 17, 23 } + }, { + 255, { 3, 17, 39, 23 } + }, { + 254, { 3, 36, 16, 20 } + }, { + 127, { 3, 18, 1, 640 } + }, { + 128, { 5, 39, 17, 23 } + }, { + 129, { 5, 17, 39, 23 } + }, { + 55, { 20, 35, 2, 2 } + }, { + 1, { 20, 7, 10, 2 } + } +}; + +static void l2tg_tensor_fill_constant_ref(param_t *p, u16 ref_data[]) +{ + u64 size = tg_shape_size(&p->dst->shape); + printf("float =%x\n",p->constant); + for (u64 i = 0; i < size; i++) + ref_data[i] = p->constant; +} + +static void test_param_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tg_shape_size(&p->dst->shape); + + bmk1880v2_tdma_l2g_tensor_fill_constant(bmk, p); + test_submit(ctx); + u16 *dst_data = (u16*)get_tg_bf16_gmem(ctx, p->dst); + + u16 *ref_data = (u16 *)malloc(sizeof(u16) * size); + l2tg_tensor_fill_constant_ref(p, ref_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %x, exp %x\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(dst_data); + free(ref_data); +} + +static void destroy_param_l2g(CVI_RT_HANDLE *ctx, param_t *p) +{ + free_tg_gmem(ctx, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (u32 i = 0; i < nr_fmt; i++) { + param_t p; + memset(&p, 0, sizeof(p)); + p.constant = generate_bf16_corner_val(c->constant); + p.dst = alloc_tg_bf16_gmem(ctx, c->dst_shape, input_fmt[i].src_fmt); + test_param_l2g(ctx, bmk, &p); + destroy_param_l2g(ctx, &p); + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} \ No newline at end of file diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_tdma_l2tg_bf16_tensor_vlc_copy_compressed.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_l2tg_bf16_tensor_vlc_copy_compressed.cpp new file mode 100644 index 000000000..376c05592 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_l2tg_bf16_tensor_vlc_copy_compressed.cpp @@ -0,0 +1,153 @@ +#include "../1880v2_test_util.h" + + +typedef bmk1880v2_tdma_l2tg_tensor_copy_compressed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => %d-bit %s\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->bit_length, + (p->src->fmt == FMT_I8)? 
"signed": "unsigned"); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tl_shape_t lmem_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 17, 13 } + }, + { + { 3, 39, 17, 23 } + }, + { + { 5, 39, 17, 23 } + }, + { + { 20, 35, 2, 2 } + }, +#ifndef ENABEL_SIMPLE_BMK1880V2_VLC_TEST + { + { 1, 1, 1, 1 } + }, + { + { 1, 1, 1, 2 } + }, + { + { 1, 1, 7, 2 } + }, + { + { 1, 1, 10, 60 } + }, + { + { 1, 2, 1, 1 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 3, 16, 1, 1 } + }, + { + { 3, 36, 16, 20 } + }, +#endif /* ifndef ENABEL_SIMPLE_BMK1880V2_VLC_TEST*/ +}; + +static void test_param_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p, CommandInfo* cmd_info, u16 *src_data) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->src->shape); + u64 bytesize = size * bytesize_of_fmt(p->src->fmt); + int is_signed = (p->src->fmt == FMT_I8); + u8 data_type = (p->src->fmt == FMT_BF16) ? 1 : 0; + size_t bs_size = 0; + + put_bf16_tensor_g2l(ctx, bmk, p->src, src_data, p->src->fmt); + bmk1880v2_tdma_l2g_tensor_copy_compressed(bmk, p); + test_submit(ctx); + + u16 *dst_data = (u16* ) get_compressed_tg_gmem(ctx, p->dst); + u16 *ref_data = (u16* ) vlc_compress((u8 *)src_data, bytesize, is_signed, data_type, &bs_size, cmd_info, NULL); + + for (u64 i = 0; i < bs_size / 2 ; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "vlc compress comparing failed at dst[%" PRIx64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + + exit(-1); + } + } + + free(dst_data); + free(ref_data); +} + +static void destroy_param_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_compressed_tg_gmem(ctx, p->dst); + free_tl(bmk, p->src); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + fmt_t fmts[] = { FMT_BF16 }; + u8 fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + + for (int src_align = 0; src_align < 2; src_align++) { + for (u8 fmt_i = 0; fmt_i < fmts_sz; fmt_i++) { + fmt_t fmt = fmts[fmt_i]; + u8 data_type = (fmt == FMT_BF16) ? 
1 : 0; + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_tl(bmk, c->lmem_shape, fmt, src_align); + assert(p.src); + + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + u64 in_size = tl_shape_size(&p.src->shape); + + u16 *src_data = (u16 *)malloc(sizeof(u16) * in_size); + vlc_init_testdata(src_data, in_size, fmt == FMT_I8, fmt == FMT_BF16); + + int is_signed = (p.src->fmt == FMT_I8); + cmd_info.signedness = is_signed; + cmd_info.is_bfloat16 = data_type; + cmd_info.bias0 = 127; + + // lmem_shape, fmt, &cmd_info); + test_param_l2g(ctx, bmk, &p, &cmd_info, src_data); + destroy_param_l2g(ctx, bmk, &p); + + free(src_data); + } + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_tdma_tg2l_bf16_general_copy.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_tg2l_bf16_general_copy.cpp new file mode 100644 index 000000000..760310f7e --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_tg2l_bf16_general_copy.cpp @@ -0,0 +1,104 @@ +#include "../1880v2_test_util.h" +#include "1880v2_bf16_util.h" + +typedef bmk1880v2_tdma_tg2l_bf16_general_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: %u bytes from %u:%lx to %x\n", tag, + p->src_bytes, p->src_base_reg_index, p->src_address, p->dst_address); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef param_t case_t; + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, +}; + +static case_t g_cases[] = { + { 0, 0, 0, 1 * 2 , FMT_F32, FMT_F32 }, + { 0, 0, 0, 39 * 2, FMT_F32, FMT_F32 }, + { 0, 0, 0, 4096 * 2, FMT_F32, FMT_F32 }, + { 0, 1, 0, 1 * 2, FMT_F32, FMT_F32 }, + { 0, 1, 0, 39 * 2, FMT_F32, FMT_F32 }, + { 0, 1, 0, 4096 * 2, FMT_F32, FMT_F32 }, + { 0, 1, 100, 1 * 2, FMT_F32, FMT_F32 }, + { 0, 1, 200, 39 * 2, FMT_F32, FMT_F32 }, + { 0, 1, 4096, 4096 * 2, FMT_F32, FMT_F32 }, + { 0, 257, 100, 1 * 2, FMT_F32, FMT_F32 }, + { 0, 349, 200, 39 * 2, FMT_F32, FMT_F32 }, + { 0, 3356, 4096, 4096 * 2, FMT_F32, FMT_F32 }, +}; + +static void tg2l_general_copy_ref(param_t *p, u16 ref_data[], u16 src_data[]) +{ + for (u32 i = 0; i < p->src_bytes/2; i++) + ref_data[i] = src_data[i]; +} + +static void test_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = p->src_bytes/2; + float val = -100; + u16 *src_data = (u16 *)malloc(sizeof(u16) * size); + for (u64 i = 0; i < size; i++) { + src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } + + CVI_RT_MEM mem = CVI_RT_MemAlloc(*ctx, size * 2); + u64 gmem_addr = CVI_RT_MemGetPAddr(mem); + put_bytes_gmem(ctx, mem, (u8*)src_data); + + p->src_address = gmem_addr; + bmk1880v2_tdma_g2l_bf16_general_copy(bmk, p); + test_submit(ctx); + CVI_RT_MemFree(*ctx, mem); + + + u16 *dst_data = (u16*) get_bytes_l2g(ctx, bmk, p->dst_address, size * 2); + + u16 *ref_data = (u16 *)malloc(sizeof(u16) * size); + tg2l_general_copy_ref(p, ref_data, src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); + free(ref_data); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + param_t *p = c; + u32 
nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (u32 i = 0; i < nr_fmt; i++) { + p->src_fmt = input_fmt[i].src_fmt; + p->dst_fmt = input_fmt[i].dst_fmt; + test_param_g2l(ctx, bmk, p); + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_tdma_tg2l_bf16_matrix_copy.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_tg2l_bf16_matrix_copy.cpp new file mode 100644 index 000000000..7214be924 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_tg2l_bf16_matrix_copy.cpp @@ -0,0 +1,180 @@ +#include "../1880v2_test_util.h" +#include "1880v2_bf16_util.h" + +typedef bmk1880v2_tdma_tg2l_matrix_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.row, p->src->shape.col, + p->dst->shape.n, p->dst->shape.c, + p->dst->shape.w, p->dst->shape.col); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_BF16}, + {FMT_U8, FMT_BF16}, +}; + +typedef struct { + mg_shape_t src_shape; + ml_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 1 }, + { 0, 1, 1, 1 }, + }, { + { 0, 2 }, + { 0, 1, 2, 2 }, + }, { + { 0, 2 }, + { 0, 2, 1, 2 }, + }, { + { 0, 7 }, + { 0, 1, 7, 7 }, + }, { + { 0, 7 }, + { 0, 2, 4, 7 }, + }, { + { 0, 7 }, + { 0, 7, 1, 7 }, + }, { + { 0, 17 }, + { 0, 1, 17, 17 }, + }, { + { 0, 17 }, + { 0, 3, 7, 17 }, + }, { + { 0, 17 }, + { 0, 17, 1, 17 }, + }, { + { 0, 60 }, + { 0, 1, 60, 60 }, + }, { + { 0, 60 }, + { 0, 30, 2, 60 }, + }, { + { 0, 60 }, + { 0, 60, 1, 60 }, + } +}; + +static void tg2l_matrix_copy_ref(param_t *p, u16 ref_data[], u16 src_data[]) +{ + u64 size = ml_shape_size(&p->dst->shape); + + for (u64 i = 0; i < size; i++) { + if(p->src->fmt == FMT_BF16) { + ref_data[i] = src_data[i]; + } else { + u8* u8src_data = (u8*)src_data; + u8 sign = p->src->fmt == FMT_I8 ? 
1 : 0; + ref_data[i] = (u16)convert_int8_bf16(u8src_data[i], sign); + } + } +} + +static void test_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = ml_shape_size(&p->dst->shape); + + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * size); + u8 *u8src_data = (u8 *)malloc(sizeof(u8) * size); + u8 *src_data; + + if(p->src->fmt == FMT_BF16) { + float val = -10; + for(u64 i = 0; i < size; i++) { + u16src_data[i] = generate_bf16_corner_val(val); + val += 0.2; + } + src_data = (u8*)u16src_data; + } else { + for(u64 i = 0; i < size; i++) { + u8src_data[i] = 200 + i; + } + src_data = u8src_data; + } + + put_mg_bf16_gmem(ctx, p->src, (u8*) src_data); + bmk1880v2_tdma_g2l_bf16_matrix_copy(bmk, p); + test_submit(ctx); + u16 *dst_data = (u16*) get_bf16_matrix_l2g(ctx, bmk, p->dst, p->dst->fmt); + + u16 *ref_data = (u16 *)malloc(sizeof(u16) * size); + tg2l_matrix_copy_ref(p, ref_data, (u16*)src_data); + + if(p->dst->fmt == FMT_BF16 && p->src->fmt == FMT_BF16) { + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing(%d->%d) failed at dst[%" PRIu64 "], got %x, exp %x\n", + p->src->fmt, p->dst->fmt, i, dst_data[i], ref_data[i]); + exit(-1); + } + } + } else if(p->src->fmt == FMT_U8 || p->src->fmt == FMT_I8) { + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing(%d->%d) failed at dst[%" PRIu64 "], got %x, exp %x\n", + p->src->fmt, p->dst->fmt, i, dst_data[i] , ref_data[i]); + exit(-1); + } + } + } else { + fprintf(stderr, "Error compreing type src_fmt_type (%d) or dst_fmttype (%d)", p->src->fmt, p->dst->fmt); + exit(-1); + } + + free(u8src_data); + free(u16src_data); + free(dst_data); + free(ref_data); +} + +static void destroy_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_mg_gmem(ctx, p->src); + free_ml(bmk, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (u32 i = 0; i < nr_fmt; i++) { + for (u32 row = 1; row < 13; row += 2) { + c->src_shape.row = row; + c->dst_shape.n = row; + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_mg_bf16_gmem(ctx, c->src_shape, input_fmt[i].src_fmt); + p.dst = alloc_ml_bf16(bmk, c->dst_shape, input_fmt[i].dst_fmt, dst_align); + test_param_g2l(ctx, bmk, &p); + destroy_param_g2l(ctx, bmk, &p); + } + } + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_tdma_tg2l_bf16_matrix_vlc_copy_decompressed.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_tg2l_bf16_matrix_vlc_copy_decompressed.cpp new file mode 100644 index 000000000..fb0349f84 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_tg2l_bf16_matrix_vlc_copy_decompressed.cpp @@ -0,0 +1,184 @@ +#include "../1880v2_test_util.h" + +typedef bmk1880v2_tdma_tg2l_matrix_copy_decompressed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->m.shape.row, p->src->m.shape.col, + p->dst->shape.n, p->dst->shape.c, + p->dst->shape.w, p->dst->shape.col); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + 
mg_shape_t src_shape; + ml_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 17 }, + { 0, 3, 7, 17 }, + }, + { + { 0, 7 }, + { 0, 7, 1, 7 }, + }, + { + { 0, 60 }, + { 0, 30, 2, 60 }, + }, +#ifndef ENABEL_SIMPLE_BMK1880V2_VLC_TEST + { + { 0, 1 }, + { 0, 1, 1, 1 }, + }, + { + { 0, 2 }, + { 0, 1, 2, 2 }, + }, + { + { 0, 2 }, + { 0, 2, 1, 2 }, + }, + { + { 0, 7 }, + { 0, 1, 7, 7 }, + }, + { + { 0, 7 }, + { 0, 2, 4, 7 }, + }, + { + { 0, 17 }, + { 0, 1, 17, 17 }, + }, + { + { 0, 17 }, + { 0, 17, 1, 17 }, + }, + { + { 0, 60 }, + { 0, 1, 60, 60 }, + }, + { + { 0, 60 }, + { 0, 60, 1, 60 }, + } +#endif /* ifndef ENABEL_SIMPLE_BMK1880V2_VLC_TEST*/ +}; + +static void tg2l_matrix_copy_ref(param_t *p, u16 ref_data[], u16 src_data[]) +{ + u64 size = ml_shape_size(&p->dst->shape); + + for (u64 i = 0; i < size; i++) + ref_data[i] = src_data[i]; +} + +static void test_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p, u16 *src_data, CommandInfo* cmd_info) +{ + print_param(stderr, p); + + u64 in_size = ml_shape_size(&p->dst->shape); + size_t bs_size = 0; + int is_signed = (p->dst->fmt == FMT_I8); + size_t data_type = (p->dst->fmt == FMT_BF16) ? 1 : 0; + u64 size = ml_shape_size(&p->dst->shape); + u64 bytesize = size * bytesize_of_fmt(p->dst->fmt); + + u8 *bsbuf = vlc_compress((u8* )src_data, bytesize, is_signed, data_type, &bs_size, cmd_info, NULL); + + put_compressed_mg_gmem(ctx, p->src, bsbuf, bs_size); + free(bsbuf); + bmk1880v2_tdma_g2l_matrix_copy_decompressed(bmk, p); + test_submit(ctx); + u16 *dst_data = (u16*) get_bf16_matrix_l2g(ctx, bmk, p->dst, p->dst->fmt); + + u16 *ref_data = (u16 *)malloc(sizeof(u16) * in_size); + tg2l_matrix_copy_ref(p, ref_data, src_data); + + for (u64 i = 0; i < in_size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(dst_data); + free(ref_data); +} + +static void destroy_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_compressed_mg_gmem(ctx, p->src); + free_ml(bmk, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + fmt_t fmts[] = { FMT_BF16 }; + u8 fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + + for (u32 row = 1; row < 13; row += 2) { + c->src_shape.row = row; + c->dst_shape.n = row; + for (int mode = 0; mode < VLC_CMP_MODE_MAX; mode++) { + for (int dst_align = 0; dst_align < 2; dst_align++) { + for (u8 fmt_i = 0; fmt_i < fmts_sz; fmt_i++) { + fmt_t fmt = fmts[fmt_i]; + param_t p; + int is_signed = (fmt == FMT_I8); + size_t data_type = (fmt == FMT_BF16) ? 
1 : 0;
+          CommandInfo cmd_info;
+
+          memset(&cmd_info, 0, sizeof(CommandInfo));
+          cmd_info.signedness = is_signed;
+          cmd_info.is_bfloat16 = data_type;
+
+          memset(&p, 0, sizeof(p));
+
+          // alloc the plain destination in local memory and the compressed source in gaddr
+          p.dst = alloc_ml_bf16(bmk, c->dst_shape, fmt, dst_align);
+          u64 in_size = ml_shape_size(&p.dst->shape);
+
+          p.src = alloc_vlc_compressed_mg_gmem(ctx, c->src_shape, fmt, &cmd_info);
+
+          u16 *src_data = (u16 *)malloc(sizeof(u16) * in_size);
+          vlc_init_testdata(src_data, in_size, fmt == FMT_I8, fmt == FMT_BF16);
+
+          //printf ("row %u mode %d is_align %d fmt %d\n", row, mode, dst_align, fmt);
+          test_param_g2l(ctx, bmk, &p, src_data, &cmd_info);
+
+          free(src_data);
+          destroy_param_g2l(ctx, bmk, &p);
+        }
+      }
+    }
+  }
+}
+
+int main()
+{
+  CVI_RT_HANDLE ctx;
+  bmk_ctx_t *bmk;
+  test_init(&ctx, &bmk);
+
+  u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]);
+  for (u32 i = 0; i < nr_cases; i++)
+    test_one_case(&ctx, bmk, &g_cases[i]);
+
+  test_exit(&ctx);
+  return 0;
+}
diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_tdma_tg2l_bf16_tensor_copy.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_tg2l_bf16_tensor_copy.cpp
new file mode 100644
index 000000000..62ea62e39
--- /dev/null
+++ b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_tg2l_bf16_tensor_copy.cpp
@@ -0,0 +1,162 @@
+#include "../1880v2_test_util.h"
+#include "1880v2_bf16_util.h"
+
+typedef bmk1880v2_tdma_tg2l_tensor_copy_param_t param_t;
+
+static void __print_param(const char *tag, FILE *f, param_t *p)
+{
+  fprintf(
+      f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n",
+      tag,
+      p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w,
+      p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w);
+}
+
+#define print_param(f, p) __print_param(__func__, f, p)
+
+static fmt_type input_fmt[] = {
+  {FMT_BF16, FMT_BF16},
+  {FMT_I8, FMT_BF16},
+  {FMT_U8, FMT_BF16},
+};
+
+typedef struct {
+  tg_shape_t src_shape;
+  tl_shape_t dst_shape;
+} case_t;
+
+static case_t g_cases[] = {
+  {
+    { 1, 1, 1, 1 },
+    { 1, 1, 1, 1 },
+  }, {
+    { 1, 1, 1, 2 },
+    { 1, 1, 2, 1 },
+  }, {
+    { 1, 1, 7, 2 },
+    { 1, 1, 2, 7 },
+  }, {
+    { 1, 1, 17, 13 },
+    { 1, 1, 13, 17 },
+  }, {
+    { 1, 1, 10, 60 },
+    { 1, 1, 120, 5 },
+  }, {
+    { 1, 2, 1, 1 },
+    { 1, 1, 1, 2 },
+  }, {
+    { 2, 17, 1, 4 },
+    { 2, 1, 4, 17 },
+  }, {
+    { 2, 17, 1, 4 },
+    { 2, 1, 17, 4 },
+  }, {
+    { 3, 16, 1, 1 },
+    { 3, 1, 2, 8 },
+  }, {
+    { 3, 39, 17, 23 },
+    { 3, 17, 39, 23 },
+  }, {
+    { 3, 36, 16, 20 },
+    { 3, 18, 1, 640 },
+  }, {
+    { 5, 39, 17, 23 },
+    { 5, 17, 39, 23 },
+  }, {
+    { 20, 35, 2, 2 },
+    { 20, 7, 10, 2 },
+  }
+};
+
+static void tg2l_tensor_copy_ref(param_t *p, u16 ref_data[], u16 src_data[])
+{
+  u64 size = tl_shape_size(&p->dst->shape);
+  for (u64 i = 0; i < size; i++) {
+    if (p->src->fmt == FMT_BF16) {
+      ref_data[i] = src_data[i];
+    } else {
+      u8* u8src_data = (u8*)src_data;
+      u8 sign = p->src->fmt == FMT_I8 ?
1 : 0; + ref_data[i] = convert_int8_bf16(u8src_data[i], sign); + } + } +} + +static void test_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->dst->shape); + + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * size); + u8 *u8src_data = (u8 *)malloc(sizeof(u8) * size); + u8 *src_data; + if(p->src->fmt == FMT_BF16) { + float val = -100; + for(u64 i = 0; i < size; i++) { + u16src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } + src_data = (u8*)u16src_data; + } else { + for(u64 i = 0; i < size; i++) { + u8src_data[i] = 200 + i; + } + src_data = u8src_data; + } + + put_tg_bf16_gmem(ctx, p->src, (u8*) src_data); + bmk1880v2_tdma_g2l_bf16_tensor_copy(bmk, p); + test_submit(ctx); + u16 *dst_data = (u16*) get_bf16_tensor_l2g(ctx, bmk, p->dst, p->dst->fmt); + + u16 *ref_data = (u16 *)malloc(sizeof(u16) * size); + tg2l_tensor_copy_ref(p, ref_data, (u16*) src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %x, exp %x\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + free(u8src_data); + free(u16src_data); + free(dst_data); + free(ref_data); +} + +static void destroy_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_tg_gmem(ctx, p->src); + free_tl(bmk, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (u32 i = 0; i < nr_fmt; i++) { + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_tg_bf16_gmem(ctx, c->src_shape, input_fmt[i].src_fmt); + p.dst = alloc_tl(bmk, c->dst_shape, input_fmt[i].dst_fmt, dst_align); + test_param_g2l(ctx, bmk, &p); + destroy_param_g2l(ctx, bmk, &p); + } + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_tdma_tg2l_bf16_tensor_copy_nc_transposed.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_tg2l_bf16_tensor_copy_nc_transposed.cpp new file mode 100644 index 000000000..32122e4f9 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_tg2l_bf16_tensor_copy_nc_transposed.cpp @@ -0,0 +1,256 @@ +#include "../1880v2_test_util.h" +#include "1880v2_bf16_util.h" + +typedef bmk1880v2_tdma_tg2l_tensor_copy_nc_transposed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, + {FMT_I8, FMT_BF16}, + {FMT_U8, FMT_BF16}, +}; + +typedef struct { + tg_shape_t src_shape; + tl_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 1, 2 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 2, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 7, 2 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 2, 7 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 17, 13 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 13, 17 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 10, 60 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 
2, 300 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 3, 200 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 4, 150 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 5, 120 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 60, 10 }, + }, { + { 1, 1, 120, 5 }, + { 1, 1, 120, 5 }, + }, { + { 1, 2, 1, 1 }, + { 2, 1, 1, 1 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 1, 4 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 2, 2 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 4, 1 }, + }, { + { 17, 2, 2, 2 }, + { 2, 17, 2, 2 }, + }, { + { 17, 2, 4, 1 }, + { 2, 17, 4, 1 }, + }, { + { 3, 16, 1, 1 }, + { 16, 3, 1, 1 }, + }, { + { 3, 39, 23, 17 }, + { 39, 3, 23, 17 }, + }, { + { 3, 39, 17, 23 }, + { 39, 3, 17, 23 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 16, 20 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 2, 160 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 4, 80 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 8, 40 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 20, 16 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 32, 10 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 64, 5 }, + }, { + { 5, 39, 17, 23 }, + { 39, 5, 17, 23 }, + }, { + { 20, 35, 2, 2 }, + { 35, 20, 2, 2 }, + }, { + { 35, 20, 2, 2 }, + { 20, 35, 2, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 160, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 2, 160 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 4, 80 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 8, 40 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 10, 32 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 20, 16 }, + }, { + { 39, 5, 23, 17 }, + { 5, 39, 23, 17 }, + } +}; + +static void tg2l_tensor_copy_nc_transposed_ref( + param_t *p, u16 ref_data[], u16 src_data[]) +{ + tg_shape_t s = p->src->shape; + u32 n = s.n; + u32 c = s.c; + u32 hw = s.h * s.w; + for (u32 ni = 0; ni < n; ni++) { + for (u32 ci = 0; ci < c; ci++) { + for (u32 hwi = 0; hwi < hw; hwi++) { + u32 src_i = ni * c * hw + ci * hw + hwi; + u32 dst_i = ci * n * hw + ni * hw + hwi; + if(p->src->fmt == FMT_BF16 && p->dst->fmt == FMT_BF16) + ref_data[dst_i] = src_data[src_i]; + else { + u8* u8src_data = (u8*)src_data; + u8 sign = p->src->fmt == FMT_I8 ? 
1 : 0; + ref_data[dst_i] = convert_int8_bf16(u8src_data[src_i], sign); + } + } + } + } +} + +static void test_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->dst->shape); + + u16 *u16src_data = (u16 *)malloc(sizeof(u16) * size); + u8 *u8src_data = (u8 *)malloc(sizeof(u8) * size); + u8 *src_data; + if(p->src->fmt == FMT_BF16) { + float val = -100; + for(u64 i = 0; i < size; i++) { + u16src_data[i] = generate_bf16_corner_val(val); + val += 0.1; + } + src_data = (u8*)u16src_data; + } else { + for(u64 i = 0; i < size; i++) { + u8src_data[i] = 200 + i; + } + src_data = u8src_data; + } + + put_tg_bf16_gmem(ctx, p->src, (u8*) src_data); + bmk1880v2_tdma_g2l_bf16_tensor_copy_nc_transposed(bmk, p); + test_submit(ctx); + u16 *dst_data = (u16 *) get_bf16_tensor_l2g(ctx, bmk, p->dst, p->dst->fmt); + + u16 *ref_data = (u16 *)malloc(sizeof(u16) * size); + tg2l_tensor_copy_nc_transposed_ref(p, ref_data, (u16*) src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %x, exp %x\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(u8src_data); + free(u16src_data); + free(dst_data); + free(ref_data); +} + + +static void destroy_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_tl(bmk, p->dst); + free_tg_gmem(ctx, p->src); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (u32 i = 0; i < nr_fmt; i++) { + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_tg_bf16_gmem(ctx, c->src_shape, input_fmt[i].src_fmt); + p.dst = alloc_tl(bmk, c->dst_shape, input_fmt[i].dst_fmt, dst_align); + test_param_g2l(ctx, bmk, &p); + destroy_param_g2l(ctx, bmk, &p); + + } + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_tdma_tg2l_bf16_tensor_fill_constant.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_tg2l_bf16_tensor_fill_constant.cpp new file mode 100644 index 000000000..811142d20 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_tg2l_bf16_tensor_fill_constant.cpp @@ -0,0 +1,144 @@ +#include "../1880v2_test_util.h" +#include "1880v2_bf16_util.h" + +typedef bmk1880v2_tdma_tg2l_tensor_fill_constant_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: %u => (%u, %u, %u, %u)\n", + tag, p->constant, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + u16 constant; + tl_shape_t dst_shape; +} case_t; + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, +}; + +static case_t g_cases[] = { + { + 37, { 1, 1, 1, 1 } + }, { + 39, { 1, 1, 1, 2 } + }, { + 23, { 1, 1, 2, 1 } + }, { + 19, { 1, 1, 7, 2 } + }, { + 17, { 1, 1, 2, 7 } + }, { + 13, { 1, 1, 17, 13 } + }, { + 11, { 1, 1, 13, 17 } + }, { + 7, { 1, 1, 10, 60 } + }, { + 9, { 1, 1, 120, 5 } + }, { + 2, { 1, 2, 1, 1 } + }, { + 3, { 1, 1, 1, 2 } + }, { + 5, { 2, 17, 1, 4 } + }, { + 41, { 2, 1, 4, 17 } + }, { + 5, { 2, 17, 1, 4 } + }, { + 9, { 2, 1, 17, 4 } + }, { + 17, { 3, 16, 1, 1 } + }, { + 26, { 3, 1, 2, 8 } + 
}, { + 103, { 3, 39, 17, 23 } + }, { + 255, { 3, 17, 39, 23 } + }, { + 254, { 3, 36, 16, 20 } + }, { + 127, { 3, 18, 1, 640 } + }, { + 128, { 5, 39, 17, 23 } + }, { + 129, { 5, 17, 39, 23 } + }, { + 55, { 20, 35, 2, 2 } + }, { + 1, { 20, 7, 10, 2 } + } +}; + +static void tg2l_tensor_fill_constant_ref(param_t *p, u16 ref_data[]) +{ + u64 size = tl_shape_size(&p->dst->shape); + + for (u64 i = 0; i < size; i++) + ref_data[i] = p->constant; +} + +static void test_param_tg2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->dst->shape); + + bmk1880v2_tdma_tg2l_bf16_tensor_fill_constant(bmk, p); + test_submit(ctx); + u16 *dst_data = (u16*) get_bf16_tensor_l2g(ctx, bmk, p->dst, FMT_BF16); + + u16 *ref_data = (u16 *)malloc(sizeof(u16) * size); + tg2l_tensor_fill_constant_ref(p, ref_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(dst_data); + free(ref_data); +} + +static void destroy_param_tg2l(bmk_ctx_t *bmk, param_t *p) +{ + free_tl(bmk, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (u32 i = 0; i < nr_fmt; i++) { + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + p.constant = generate_bf16_corner_val(c->constant); + p.dst = alloc_tl(bmk, c->dst_shape, input_fmt[i].src_fmt, dst_align); + + test_param_tg2l(ctx, bmk, &p); + destroy_param_tg2l(bmk, &p); + } + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_tdma_tg2l_bf16_tensor_vlc_copy_decompressed.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_tg2l_bf16_tensor_vlc_copy_decompressed.cpp new file mode 100644 index 000000000..5ec5cb12a --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_tdma_tg2l_bf16_tensor_vlc_copy_decompressed.cpp @@ -0,0 +1,160 @@ +#include "../1880v2_test_util.h" +#include "../bm_vlc_compress.h" + +typedef bmk1880v2_tdma_tg2l_tensor_copy_decompressed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => fmt(%d) bias0/1/zero is (%u/%u/%u) %s\n", + tag, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w, + p->dst->fmt, + p->src->bias0, p->src->bias1, p->src->zero_guard_en, + (p->dst->fmt == FMT_I8)? 
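+/*
+ * Round-trip structure of this test, sketched from the calls used in this
+ * file (nothing beyond their use here is assumed):
+ *
+ *   host: src_data --vlc_compress()--> bsbuf --bm_vlc_dec_bf16()--> ref_data
+ *   TPU : bsbuf --tdma_g2l_tensor_copy_decompressed--> lmem --l2g--> dst_data
+ *
+ * dst_data is compared against ref_data, so the hardware decompressor is
+ * checked bit-exactly against the host model decoding the same bitstream.
+ * In VLC_CMP_MODE_COMPILER, bm_vlc_est_weight_bias() first estimates
+ * bias0/bias1 from the raw data before compressing.
+ */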
"signed": "unsigned"); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tl_shape_t lmem_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 17, 13 } + }, + { + { 3, 39, 17, 23 } + }, + { + { 5, 39, 17, 23 } + }, + { + { 20, 35, 2, 2 } + }, +#ifndef ENABEL_SIMPLE_BMK1880V2_VLC_TEST + { + { 1, 1, 1, 1 } + }, + { + { 1, 1, 1, 2 } + }, + { + { 1, 1, 7, 2 } + }, + { + { 1, 1, 10, 60 } + }, + { + { 1, 2, 1, 1 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 3, 16, 1, 1 } + }, + { + { 3, 36, 16, 20 } + }, +#endif /* ifndef ENABEL_SIMPLE_BMK1880V2_VLC_TEST*/ +}; + +static void test_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p, u16 *src_data, CommandInfo* cmd_info) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->dst->shape); + size_t bs_size = 0; + int is_signed = (p->dst->fmt == FMT_I8); + u8 data_type = (p->dst->fmt == FMT_BF16) ? 1 : 0; + u64 bytesize = size * bytesize_of_fmt(p->dst->fmt); + + u8 *bsbuf = vlc_compress((u8 *)src_data, bytesize, is_signed, data_type, &bs_size, cmd_info, NULL); + + u16 *ref_data = (u16 *)malloc(sizeof(u16) * size); + bm_vlc_dec_bf16(bsbuf, bytesize, (u16* )ref_data); + + put_compressed_tg_gmem(ctx, p->src, bsbuf, bs_size); + bmk1880v2_tdma_g2l_tensor_copy_decompressed(bmk, p); + test_submit(ctx); + + u16 *dst_data = (u16* )get_bf16_tensor_l2g(ctx, bmk, p->dst, p->dst->fmt); + + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "vlc decompress comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(bsbuf); + free(dst_data); + free(ref_data); +} + +static void destroy_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_compressed_tg_gmem(ctx, p->src); + free_tl(bmk, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + fmt_t fmts[] = { FMT_BF16 }; + u8 fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + + for (int dst_align = 0; dst_align < 2; dst_align++) { + for (int mode = 0; mode < VLC_CMP_MODE_MAX; mode++) { + for (u8 fmt_i = 0; fmt_i < fmts_sz; fmt_i++) { + fmt_t fmt = fmts[fmt_i]; + param_t p; + memset(&p, 0, sizeof(p)); + p.dst = alloc_tl(bmk, c->lmem_shape, fmt, dst_align); + assert(p.dst); + + u64 size = tl_shape_size(&p.dst->shape); + u16 *src_data = (u16 *)malloc(sizeof(u16) * size); + vlc_init_testdata(src_data, size, fmt == FMT_I8, fmt == FMT_BF16); + + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + int is_signed = (fmt == FMT_I8); + u8 data_type = (fmt == FMT_BF16) ? 
1 : 0; + + cmd_info.signedness = is_signed; + cmd_info.is_bfloat16 = data_type; + + if (mode == VLC_CMP_MODE_COMPILER) { + bm_vlc_est_weight_bias((u8* )src_data, size * sizeof(u16), (bool)is_signed, (bool)data_type, &cmd_info); + } + + p.src = _alloc_vlc_compressed_tg_gmem(ctx, &c->lmem_shape, fmt, &cmd_info); + + test_param_g2l(ctx, bmk, &p, src_data, &cmd_info); + + free(src_data); + destroy_param_g2l(ctx, bmk, &p); + } + } + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_tg_copy_bf16_tensor.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_tg_copy_bf16_tensor.cpp new file mode 100644 index 000000000..6784ec927 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_tg_copy_bf16_tensor.cpp @@ -0,0 +1,111 @@ +#include "../1880v2_test_util.h" +#include "1880v2_bf16_util.h" + +typedef bmk1880v2_tdma_tg2tg_tensor_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tg_shape_t src_shape; + tg_stride_t src_stride; + tg_shape_t dst_shape; + tg_stride_t dst_stride; +} case_t; + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, +}; + +static case_t g_cases[] = { + { + {1, 3, 3, 3}, {27*2, 9*2, 3*2}, + {1, 3, 3, 3}, {27*2, 9*2, 3*2}, + }, + { + // YOLOv2 concat layer + {1, 256, 19, 19}, {92416*2, 361*2, 19*2}, + {1, 256, 19, 19}, {462080*2, 361*2, 19*2}, + } +}; + +static void test_param_g2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + + u64 size = p->src->shape.c * p->src->shape.h * p->src->shape.w; + u16 *src_data = (u16 *)malloc(sizeof(u16) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 0x1234 + i; + + put_tg_bf16_gmem(ctx, p->src, (u8*)src_data); + + bmk1880v2_tdma_tg2tg_bf16_tensor_copy(bmk, p); + test_submit(ctx); + + u16 *dst_data = (u16*) get_tg_bf16_gmem(ctx, p->dst); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != src_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %x, exp %x\n", + i, dst_data[i], src_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); +} + +static void destroy_param_g2g(CVI_RT_HANDLE *ctx, param_t *p) +{ + free_tg_gmem(ctx, p->src); + free_tg_gmem(ctx, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (u32 i = 0; i < nr_fmt; i++) { + param_t p; + bmk1880v2_tensor_tgmem_t *src, *dst; + + memset(&p, 0, sizeof(p)); + + src = alloc_tg_bf16_gmem(ctx, c->src_shape, input_fmt[i].src_fmt); + src->stride.n = c->src_stride.n; + src->stride.c = c->src_stride.c; + src->stride.h = c->src_stride.h; + + dst = alloc_tg_bf16_gmem(ctx, c->dst_shape, input_fmt[i].dst_fmt); + dst->stride.n = c->dst_stride.n; + dst->stride.c = c->dst_stride.c; + dst->stride.h = c->dst_stride.h; + p.src = src; + p.dst = dst; + test_param_g2g(ctx, bmk, &p); + destroy_param_g2g(ctx, &p); + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / 
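+/*
+ * Note on the YOLOv2 concat case above, worked from the strides (the
+ * five-input count is an assumption for illustration): the dst n-stride is
+ * 462080*2 bytes = 1280 * 19*19 * 2, while the tensor being copied has only
+ * 256 channels (92416 = 256 * 19*19). The strided copy therefore writes a
+ * (1, 256, 19, 19) tensor into a (1, 1280, 19, 19) buffer; a concat of e.g.
+ * five such inputs would issue five copies, advancing the dst start_address
+ * by 256 * 361 * 2 bytes each time.
+ */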
sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/bf16/test_1880v2_tg_copy_fp32_bf16_tensor.cpp b/cviruntime/test/1880v2/bf16/test_1880v2_tg_copy_fp32_bf16_tensor.cpp new file mode 100644 index 000000000..14585787f --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_1880v2_tg_copy_fp32_bf16_tensor.cpp @@ -0,0 +1,129 @@ +#include "../1880v2_test_util.h" +#include "1880v2_bf16_util.h" +#include <sys/time.h> + +typedef bmk1880v2_tdma_tg2tg_tensor_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) { + fprintf(f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", tag, p->src->shape.n, p->src->shape.c, + p->src->shape.h, p->src->shape.w, p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, + p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tg_shape_t src_shape; + tg_shape_t dst_shape; +} case_t; + +static fmt_type input_fmt[] = { + {FMT_BF16, FMT_BF16}, +}; + +static case_t g_cases[] = { + { + {1, 3, 3, 2}, + {1, 3, 3, 2}, + }, + { + {4, 3, 3, 2}, + {4, 3, 3, 2}, + }, + + //{ + // // YOLOv2 concat layer + // {1, 256, 19, 19}, + // {1, 256, 19, 19}, + //}, + { + {1, 256, 19, 20}, + {1, 256, 19, 20}, + }, + { + {1, 1280, 3, 4}, + {1, 1280, 3, 4}, + }, + { + {1, 159 * 89, 36, 4}, + {1, 159 * 89, 36, 4}, + }, + { + {159, 89, 36, 4}, + {159, 89, 36, 4}, + }, +}; + +static void test_param_g2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) { + print_param(stderr, p); + + // divide by 2: the source holds fp32 values, each occupying two bf16 elements + u64 size = p->src->shape.n * p->src->shape.c * p->src->shape.h * p->src->shape.w / 2; + u32 *src_data = (u32 *)malloc(sizeof(u32) * size); + for (u64 i = 0; i < size; i++) { + src_data[i] = ((0x1234 + i) << 16) + 0x5678 + i; + // printf("src[%" PRIu64 "] 0x%x\n", i, src_data[i]); + } + + put_tg_bf16_gmem(ctx, p->src, (u8 *)src_data); + + bf16_s2s_fp32_bf16(bmk, p->src->start_address, p->src->shape, p->dst->start_address, + p->dst->shape, FMT_BF16); + + long elapsed; + struct timeval t0, t1; + gettimeofday(&t0, NULL); + + test_submit(ctx); + + gettimeofday(&t1, NULL); + elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; + printf("kernel takes %ld us\n", elapsed); + + u16 *dst_data = (u16 *)get_tg_bf16_gmem(ctx, p->dst); + + for (u64 i = 0; i < size; i++) { + u16 _src_data = (src_data[i] >> 16) & 0xffff; // bf16 keeps the upper 16 bits of fp32 + if (dst_data[i] != _src_data) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %x, exp %x\n", i, dst_data[i], _src_data); + exit(-1); + } + } + + free(src_data); + free(dst_data); +} + +static void destroy_param_g2g(CVI_RT_HANDLE *ctx, param_t *p) { + free_tg_gmem(ctx, p->src); + free_tg_gmem(ctx, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) { + u32 nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (u32 i = 0; i < nr_fmt; i++) { + param_t p; + bmk1880v2_tensor_tgmem_t *src, *dst; + + memset(&p, 0, sizeof(p)); + + src = alloc_tg_bf16_gmem(ctx, c->src_shape, input_fmt[i].src_fmt); + dst = alloc_tg_bf16_gmem(ctx, c->dst_shape, input_fmt[i].dst_fmt); + p.src = src; + p.dst = dst; + test_param_g2g(ctx, bmk, &p); + destroy_param_g2g(ctx, &p); + } +} + +int main() { + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git 
a/cviruntime/test/1880v2/bf16/test_cv1880v2_conv3d_bf16.cpp b/cviruntime/test/1880v2/bf16/test_cv1880v2_conv3d_bf16.cpp new file mode 100644 index 000000000..276f8a3c1 --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_cv1880v2_conv3d_bf16.cpp @@ -0,0 +1,3148 @@ +#include <assert.h> +#include <math.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "test_cvikernel_util.h" +#include "test_tf_quant_util.h" +#include "test_native_ref.h" + +//#define DUMP_MSG + +#define TEST_CASE_NAME "test_cv1880v2_conv" + +typedef enum { + NCDHW_N = 0, + NCDHW_C = 1, + NCDHW_D = 2, + NCDHW_H = 3, + NCDHW_W = 4, + NCDHW_MAX_DIMS +} NCDHW_DIMS; + +typedef enum { + SPATIAL_D = 0, + SPATIAL_H = 1, + SPATIAL_W = 2, + SPATIAL_MAX_DIMS +} SPATIAL_DIMS; + +typedef struct { + int input_shapes[5]; + int output_shapes[5]; + int weight_shapes[5]; + int bias_shapes[5]; + int weight_strides[SPATIAL_MAX_DIMS]; + int dilations[SPATIAL_MAX_DIMS]; + int paddings[6]; // depth[2], top, bottom, left, right + void *input_data; + void *output_data; + void *weight_data; + void *bias_data; + void *ref_output_data; + cvk_fmt_t data_format; +} conv3d_test_param_t; + +static void permute5d(float *dst, float *src, int src_shapes[5], int orders[5]) +{ + assert((orders[0] < 5) && (orders[1] < 5) && (orders[2] < 5) && + (orders[3] < 5) && (orders[4] < 5) && "Expect 5d permute"); + + int dst_shapes[5] = { + src_shapes[orders[0]], src_shapes[orders[1]], src_shapes[orders[2]], + src_shapes[orders[3]], src_shapes[orders[4]]}; + + // logical stride, in unit of float + int src_strides[5], dst_strides[5]; + get_strides_from_shapes5d(src_strides, src_shapes, 1); + get_strides_from_shapes5d(dst_strides, dst_shapes, 1); + + for (int i = 0; i < src_shapes[0]; i++) { + for (int j = 0; j < src_shapes[1]; j++) { + for (int z = 0; z < src_shapes[2]; z++) { + for (int y = 0; y < src_shapes[3]; y++) { + for (int x = 0; x < src_shapes[4]; x++) { + int src_poss[5] = {i, j, z, y, x}; + int dst_poss[5] = { + src_poss[orders[0]], src_poss[orders[1]], src_poss[orders[2]], + src_poss[orders[3]], src_poss[orders[4]]}; + int src_offset = get_tensor5d_offset(src_poss, src_strides); + int dst_offset = get_tensor5d_offset(dst_poss, dst_strides); + dst[dst_offset] = src[src_offset]; + } + } + } + } + } +} + +static void convert_ncdhw_to_ndchw(float *dst, float *src, int src_shapes[5]) +{ + // Permute + // 0 1 2 3 4 0 2 1 3 4 + // (n, c, d, h, w) -> (n, d, c, h, w) + int orders[5] = {0, 2, 1, 3, 4}; + permute5d(dst, src, src_shapes, orders); +} + +static void convert_tpu_weight_for_ncdhw( + float *tpu_weight, float cpu_weight[5], + int cpu_shapes[5]) +{ + // 0 1 2 3 4 2 0 3 4 1 + // N C D H W D N H W C + // Permute (oc, ic, kd, kh, kw) -> (kd, oc, kh, kw, ic) + int orders[5] = {2, 0, 3, 4, 1}; + permute5d(tpu_weight, cpu_weight, cpu_shapes, orders); +} + +void dumpFloatData(float *data, int shapes[5]) +{ + int strides[5]; + + // logical stride, in unit of float + get_strides_from_shapes5d(strides, shapes, 1); + + printf("%s (%d, %d, %d, %d, %d)=>\n", + __FUNCTION__, shapes[0], shapes[1], shapes[2], shapes[3], shapes[4]); + + for (int i = 0; i < shapes[NCDHW_N]; i++) { + for (int j = 0; j < shapes[NCDHW_C]; j++) { + for (int z = 0; z < shapes[NCDHW_D]; z++) { + for (int y = 0; y < shapes[NCDHW_H]; y++) { + printf(" [n=%d][c=%d][d=%d][h=%d] ", i, j, z, y); + for (int x = 0; x < shapes[NCDHW_W]; x++) { + int poss[5] = {i, j, z, y, x}; + int offset = get_tensor5d_offset(poss, strides); + printf("%f ", data[offset]); + } + printf("\n"); + } + } + } + } + + printf("<= 
%s\n", __FUNCTION__); + +} + +static uint32_t addr_after_right_shift( + cvk_context_t *cvk_ctx, int addr, uint32_t step, int c_str) +{ + uint32_t npu_num = cvk_ctx->info.npu_num; + uint32_t lmem_size = cvk_ctx->info.lmem_size; + + uint32_t lmem_i = (addr / lmem_size + step) % npu_num; + uint32_t offset = addr % lmem_size + (lmem_i + step) / npu_num * c_str; + return lmem_i * lmem_size + offset; +} + +// input (n, ic, id, ih, iw) +// output (n, oc, od, oh, ow) +// weight (oc, ic, kd, kh, kw), pytorch layout +void conv3d_float_ref_for_ncdhw( + float *input, float *weight, float *bias, float *output, + int batch, int input_c, int input_d, int input_h, int input_w, + int output_c, int output_d, int output_h, int output_w, + int kernel_d, int kernel_h, int kernel_w, + int stride_d, int stride_h, int stride_w, + int dilation_d, int dilation_h, int dilation_w, + int pad_d0, int pad_d1, + int pad_top, int pad_bottom, int pad_left, int pad_right) { + (void)pad_d1; + (void)pad_bottom; + (void)pad_right; + + int input_shapes[5] = {batch, input_c, input_d, input_h, input_w}; + int output_shapes[5] = {batch, output_c, output_d, output_h, output_w}; + int kernel_shapes[5] = {output_c, input_c, kernel_d, kernel_h, kernel_w}; + int input_strides[5]; + int output_strides[5]; + int kernel_strides[5]; + + // input/output shape (n, c, d, h, w) + get_strides_from_shapes5d(input_strides, input_shapes, sizeof(float)); + get_strides_from_shapes5d(output_strides, output_shapes, sizeof(float)); + + // kernel shape (oc, ic, kd, kh, kw), matching the pytorch weight layout + get_strides_from_shapes5d(kernel_strides, kernel_shapes, sizeof(float)); + +#ifdef DUMP_MSG + printf(" %s =>\n", __FUNCTION__); +#endif + + for (int i = 0; i < batch; ++i) { + for (int oc = 0; oc < output_c; oc++) { + for (int oz = 0; oz < output_d; oz++) { + for (int oy = 0; oy < output_h; ++oy) { + for (int ox = 0; ox < output_w; ++ox) { + for (int ic = 0; ic < input_c; ++ic) { + for (int kz = 0; kz < kernel_d; ++kz) { + const int iz = oz * stride_d + kz * dilation_d - pad_d0; + +#ifdef DUMP_MSG + printf(" [i=%d][oc=%d][oz=%d][oy=%d][ox=%d][ic=%d][kz=%d]" \ + "iz= %d = %d(oz) * %d(stride_depth) + "\ + "%d(kz) * %d(dilation_depth) - %d(padding_d_start)\n", + i, oc, oz, oy, ox, ic, kz, + iz, oz, stride_d, kz, dilation_d, + pad_d0); +#endif + + if (iz >= 0 && iz < input_d) { + for (int ky = 0; ky < kernel_h; ++ky) { + const int iy = oy * stride_h + ky * dilation_h - pad_top; + if (iy >= 0 && iy < input_h) { + for (int kx = 0; kx < kernel_w; ++kx) { + const int ix = ox * stride_w + kx * dilation_w - pad_left; + if (ix >= 0 && ix < input_w) { + int input_poss[5] = {i, ic, iz, iy, ix}; + int input_offset = + get_tensor5d_offset(input_poss, input_strides) + / input_strides[5 - 1]; + + // pytorch (Oc=1, Id=1, Ic=1, kh=3, kw=3) + int kernel_poss[5] = {oc, ic, kz, ky, kx}; + + int kernel_offset = + get_tensor5d_offset(kernel_poss, kernel_strides) + / kernel_strides[5 - 1]; + + int output_poss[5] = {i, oc, oz, oy, ox}; + int output_offset = + get_tensor5d_offset(output_poss, output_strides) + / output_strides[5 - 1]; + + output[output_offset] += + input[input_offset] * weight[kernel_offset]; + +#ifdef DUMP_MSG + printf(" [n=%d][oc=%d][oz=%d][oh=%d][ow=%d]" \ + "[ic=%d][kz=%d][ky=%d][kx=%d] output[%d](%f) "\ + "+= input[n=%d][ic=%d][iz=%d][iy=%d][ix=%d]"\ + "[%d](%f) * weight[oc=%d][ic=%d][kz=%d]"\ + "[ky=%d][kx=%d][%d](%f) = %f\n", + i, oc, oz, oy, ox, ic, kz, ky, kx, output_offset, + output[output_offset] - + input[input_offset] * weight[kernel_offset], + i, ic, iz, iy, ix, + input_offset, 
input[input_offset], + oc, ic, kz, ky, kx, + kernel_offset, weight[kernel_offset], + output[output_offset]); +#endif + + } + } + } + } + } + } + } + } + } + } + } + } + + for (int i = 0; i < batch; ++i) { + for (int oy = 0; oy < output_h; ++oy) { + for (int ox = 0; ox < output_w; ++ox) { + for (int oc = 0; oc < output_c; ++oc) { + for (int od = 0; od < output_d; ++od) { + int output_poss[5] = {i, oc, od, oy, ox}; + int output_offset = + get_tensor5d_offset(output_poss, output_strides) + / output_strides[5 - 1]; + output[output_offset] += bias[oc]; + +#ifdef DUMP_MSG + printf(" [n=%d][oy=%d][ox=%d][oc=%d][od=%d] output[%d](%f)" \ + " += bias(%f) = %f\n", + i, oy, ox, oc, od, output_offset, + output[output_offset] - bias[oc], bias[oc], + output[output_offset]); +#endif + + } + } + } + } + } + +#ifdef DUMP_MSG + printf(" <= %s\n", __FUNCTION__); +#endif + +} + +void conv3d_float_ref(conv3d_test_param_t *test_param) +{ + // input + int batch = test_param->input_shapes[NCDHW_N]; + int input_channel = test_param->input_shapes[NCDHW_C]; + int input_depth = test_param->input_shapes[NCDHW_D]; + int input_height = test_param->input_shapes[NCDHW_H]; + int input_width = test_param->input_shapes[NCDHW_W]; + + int padding_d_start = test_param->paddings[0]; + // int padding_d_end = test_param->paddings[1]; + int padding_h_start = test_param->paddings[2]; + // int padding_h_end = test_param->paddings[3]; + int padding_w_start = test_param->paddings[4]; + // int padding_w_end = test_param->paddings[5]; + + // output + int output_depth = test_param->output_shapes[NCDHW_D]; + int output_channel = test_param->output_shapes[NCDHW_C]; + int output_height = test_param->output_shapes[NCDHW_H]; + int output_width = test_param->output_shapes[NCDHW_W]; + +#if 1 + // pytorch weight (oc, ic, kd, kh, kw) + int weight_depth = test_param->weight_shapes[NCDHW_D]; + int weight_height = test_param->weight_shapes[NCDHW_H]; + int weight_width = test_param->weight_shapes[NCDHW_W]; +#else + // weight + // weight (oc=1, id=1, kh=3, kw=3, ic=1) + int weight_height = test_param->weight_shapes[2]; + int weight_width = test_param->weight_shapes[3]; +#endif + + int stride_depth = test_param->weight_strides[SPATIAL_D]; + int stride_height = test_param->weight_strides[SPATIAL_H]; + int stride_width = test_param->weight_strides[SPATIAL_W]; + int dilation_depth = test_param->dilations[SPATIAL_D]; + int dilation_height = test_param->dilations[SPATIAL_H]; + int dilation_width = test_param->dilations[SPATIAL_W]; + + int input_strides[5]; + int output_strides[5]; + int weight_strides[5]; + + float *input_data = (float *)test_param->input_data; + float *output_data = (float *)test_param->output_data; + float *weight_data = (float *)test_param->weight_data; + float *bias_data = (float *)test_param->bias_data; + + // input/output shape (n, c, d, h, w) + get_strides_from_shapes5d(input_strides, test_param->input_shapes, + sizeof(float)); + get_strides_from_shapes5d(output_strides, test_param->output_shapes, + sizeof(float)); + + // weight shape (oc, ic, kd, kh, kw), pytorch layout + get_strides_from_shapes5d(weight_strides, test_param->weight_shapes, + sizeof(float)); + + memset(output_data, 0, + sizeof(float) * batch * output_channel * output_depth * + output_height * output_width); + +#ifdef DUMP_MSG + printf(" %s =>\n", __FUNCTION__); +#endif + + for (int i = 0; i < batch; i++) { + for (int oc = 0; oc < output_channel; oc++) { + for (int oz = 0; oz < output_depth; oz++) { + for (int oy = 0; oy < output_height; oy++) { + for (int ox = 0; ox < output_width; 
ox++) { + for (int ic = 0; ic < input_channel; ic++) { + for (int kz = 0; kz < weight_depth; kz++) { + const int iz = + oz * stride_depth + kz * dilation_depth - padding_d_start; + +#ifdef DUMP_MSG + printf(" [i=%d][oc=%d][oz=%d][oy=%d][ox=%d][ic=%d][kz=%d]" \ + "iz= %d = %d(oz) * %d(stride_depth) + "\ + "%d(kz) * %d(dilation_depth) - %d(padding_d_start)\n", + i, oc, oz, oy, ox, ic, kz, + iz, oz, stride_depth, kz, dilation_depth, + padding_d_start); +#endif + + if (iz >= 0 && iz < input_depth) { + for (int ky = 0; ky < weight_height; ky++) { + const int iy = + oy * stride_height + ky * dilation_height - padding_h_start; + if (iy >= 0 && iy < input_height) { + for (int kx = 0; kx < weight_width; kx++) { + const int ix = + ox * stride_width + kx * dilation_width - padding_w_start; + if (ix >= 0 && ix < input_width) { + int input_poss[5] = {i, ic, iz, iy, ix}; + int input_offset = + get_tensor5d_offset(input_poss, input_strides) + / input_strides[5 - 1]; + + // pytorch (Oc=1, Id=1, Ic=1, kh=3, kw=3) + int weight_poss[5] = { + oc, ic, kz, ky, kx}; + + int weight_offset = + get_tensor5d_offset(weight_poss, weight_strides) + / weight_strides[5 - 1]; + + int output_poss[5] = {i, oc, oz, oy, ox}; + int output_offset = + get_tensor5d_offset(output_poss, output_strides) + / output_strides[5 - 1]; + + output_data[output_offset] += + input_data[input_offset] * weight_data[weight_offset]; + +#ifdef DUMP_MSG + printf(" [n=%d][oc=%d][oz=%d][oh=%d][ow=%d]" \ + "[ic=%d][kz=%d][ky=%d][kx=%d] output[%d](%f) "\ + "+= input[n=%d][ic=%d][iz=%d][iy=%d][ix=%d]"\ + "[%d](%f) * weight[oc=%d][ic=%d][kz=%d]"\ + "[ky=%d][kx=%d][%d](%f) = %f\n", + i, oc, oz, oy, ox, ic, kz, ky, kx, output_offset, + output_data[output_offset] - + input_data[input_offset] * weight_data[weight_offset], + i, ic, iz, iy, ix, + input_offset, input_data[input_offset], + oc, ic, kz, ky, kx, + weight_offset, weight_data[weight_offset], + output_data[output_offset]); +#endif + + } // if (ix >= 0 && ix < input_width) + } // for (int kx = 0; kx < weight_width; kx++) + } // if (iy >= 0 && iy < input_height) + } // for (int ky = 0; ky < weight_height; ky++) + } // if (iz >= 0 && iz < input_depth) + + } // for (int kz = 0; kz < weight_depth; kz++) + } // for (int ic = 0; ic < input_channel; ic++) + } // for (int ox = 0; ox < output_width; ox++) + } // for (int oy = 0; oy < output_height; oy++) + } // for (int oz = 0; oz < output_depth; oz++) + } // for (int oc = 0; oc < output_channel; oc++) + } // for (int i = 0; i < batch; i++) + + for (int i = 0; i < batch; ++i) { + for (int oy = 0; oy < output_height; oy++) { + for (int ox = 0; ox < output_width; ox++) { + for (int oc = 0; oc < output_channel; oc++) { + for (int od = 0; od < output_depth; od++) { + int output_poss[5] = {i, oc, od, oy, ox}; + int output_offset = + get_tensor5d_offset(output_poss, output_strides) + / output_strides[5 - 1]; + output_data[output_offset] += bias_data[oc]; + +#ifdef DUMP_MSG + printf(" [n=%d][oy=%d][ox=%d][oc=%d][od=%d] output[%d](%f)" \ + " += bias(%f) = %f\n", + i, oy, ox, oc, od, output_offset, + output_data[output_offset] - bias_data[oc], bias_data[oc], + output_data[output_offset]); +#endif + + } // for (int od = 0; od < output_depth; od++) + } // for (int oc = 0; oc < output_channel; oc++) + } // for (int ox = 0; ox < output_width; ox++) + } // for (int oy = 0; oy < output_height; oy++) + } // for (int i = 0; i < batch; ++i) + +#ifdef DUMP_MSG + printf(" <= %s\n", __FUNCTION__); +#endif + +} + +static void load_bias(cvk_context_t *cvk_ctx, + uint64_t ga_bias, + cvk_tl_t *tl_bias_al) +{ + cvk_fmt_t fmt = tl_bias_al->fmt; + 
cvk_tg_shape_t gm_bias_shape = { + tl_bias_al->shape.n, tl_bias_al->shape.c, tl_bias_al->shape.h, + tl_bias_al->shape.w}; + cvk_tg_t gm_bias; + cvk_ctx->ops->gmem_init_tensor(cvk_ctx, &gm_bias, gm_bias_shape, fmt); + gm_bias.start_address = ga_bias; + + cvk_tdma_g2l_tensor_copy_param_t param; + memset(&param, 0, sizeof(param)); + param.src = &gm_bias; + param.dst = tl_bias_al; + cvk_ctx->ops->tdma_g2l_bf16_tensor_copy(cvk_ctx, &param); +} + +// Input (n, ic, id, ih, iw) +static void load_input(cvk_context_t *cvk_ctx, + int n, int ic, int id, int ih, int iw, + int idi, + uint64_t ga_input, + cvk_tl_t *tl_input_al) +{ + // reshape (n, ic, id, ih, iw) => (n, ic, id, ih*iw) + cvk_fmt_t fmt = tl_input_al->fmt; + cvk_tl_shape_t tl_shape = {(uint32_t)n, (uint32_t)ic, 1, (uint32_t)(ih*iw)}; + cvk_tl_t tl_input; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tl_input, tl_shape, fmt, + tl_input_al->eu_align); + tl_input.start_address = tl_input_al->start_address; + + uint32_t ds = (fmt == CVK_FMT_BF16) ? 2 : 1; + cvk_tg_shape_t gm_input_shape = { + (uint32_t)n, (uint32_t)ic, 1, (uint32_t)(ih*iw)}; + cvk_tg_stride_t gm_input_stride = {ic*id*ih*iw*ds, id*ih*iw*ds, ih*iw*ds, ds}; + + if (idi >= 0 && idi < id) { + cvk_tg_t gm_input; + cvk_ctx->ops->gmem_init_tensor(cvk_ctx, &gm_input, gm_input_shape, fmt); + gm_input.start_address = ga_input + gm_input_stride.h * idi; + gm_input.stride = gm_input_stride; + + cvk_tdma_g2l_tensor_copy_param_t param; + memset(&param, 0, sizeof(param)); + param.src = &gm_input; + param.dst = &tl_input; + cvk_ctx->ops->tdma_g2l_bf16_tensor_copy(cvk_ctx, &param); + } else { + uint32_t elt_size = fmt == CVK_FMT_BF16 ? 2 : 1; + + cvk_tl_shape_t tl_pad_shape = { + (uint32_t)n, (uint32_t)ic, 1, (uint32_t)(ih*iw*elt_size)}; + cvk_tl_t tl_pad; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tl_pad, tl_pad_shape, + CVK_FMT_I8, /*eu_align=*/1); + tl_pad.start_address = tl_input_al->start_address; + + cvk_tiu_xor_int8_param_t param; + memset(&param, 0, sizeof(param)); + param.res = &tl_pad; + param.a = &tl_pad; + param.b = &tl_pad; + cvk_ctx->ops->tiu_xor_int8(cvk_ctx, &param); + } +} + +// TPU weight (kd, oc, kh*kw, ic) +static void load_weight(cvk_context_t *cvk_ctx, + int oc, int ic, + int kh, int kw, int kdi, + uint64_t ga_weight, + cvk_tl_t *tl_weight_al) +{ + cvk_fmt_t fmt = tl_weight_al->fmt; + uint32_t ds = (fmt == CVK_FMT_BF16) ? 
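+/*
+ * Why the out-of-range branch in load_input() above zero-fills with
+ * tiu_xor_int8: x ^ x == 0 for every byte, so a self-XOR (res = a = b)
+ * clears the slice inside local memory without any DMA traffic. The tile is
+ * re-declared as INT8 with w = ih*iw*elt_size so the byte footprint of a
+ * bf16 row is covered exactly. The idiom in isolation:
+ *
+ *   cvk_tiu_xor_int8_param_t q;
+ *   memset(&q, 0, sizeof(q));
+ *   q.res = &tl_pad;
+ *   q.a = &tl_pad;
+ *   q.b = &tl_pad;
+ *   cvk_ctx->ops->tiu_xor_int8(cvk_ctx, &q); // tl_pad ^= tl_pad, i.e. all zeros
+ */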
2 : 1; + cvk_tg_shape_t gm_weight_shape = { + 1, (uint32_t)oc, (uint32_t)(kh*kw), (uint32_t)ic}; + cvk_tg_stride_t gm_weight_stride = {oc*kh*kw*ic*ds, kh*kw*ic*ds, ic*ds, ds}; + cvk_tg_t gm_weight; + cvk_ctx->ops->gmem_init_tensor(cvk_ctx, &gm_weight, gm_weight_shape, fmt); + gm_weight.start_address = ga_weight + gm_weight_stride.n * kdi; + + cvk_tdma_g2l_tensor_copy_param_t param; + memset(&param, 0, sizeof(param)); + param.src = &gm_weight; + param.dst = tl_weight_al; + cvk_ctx->ops->tdma_g2l_bf16_tensor_copy(cvk_ctx, &param); +} + +static int get_ps32_mode(int kdi, int kd) { + if (kd == 1) + return 0; + + if (kdi == 0) + return 2; // [1]: write + else if (kdi == (kd - 1)) + return 1; // [0]: read + + return 3; // [1]: write, [0]: read +} + +static void compute(cvk_context_t *cvk_ctx, + int n, int ic, + int kh, int kw, + int pad_top, int pad_bot, + int pad_left, int pad_right, + int oc, int oh, int ow, + int ps32_mode, + cvk_tl_t *tl_input_al, + cvk_tl_t *tl_weight_al, + cvk_tl_t *tl_bias_al, + cvk_tl_t *tl_output_al) +{ + cvk_fmt_t fmt = tl_weight_al->fmt; + cvk_tl_shape_t tl_output_shape = { + (uint32_t)n, (uint32_t)oc, (uint32_t)oh, (uint32_t)ow}; + cvk_tl_t tl_output; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tl_output, tl_output_shape, fmt, + /*eu_align=*/1); + tl_output.start_address = tl_output_al->start_address; + cvk_tl_t *tl_input = tl_input_al; + cvk_tl_shape_t tl_weight_shape = { + (uint32_t)ic, (uint32_t)oc, (uint32_t)kh, (uint32_t)kw}; + cvk_tl_t tl_weight; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tl_weight, tl_weight_shape, fmt, + /*eu_align=*/0); + tl_weight.start_address = tl_weight_al->start_address; + + cvk_tl_shape_t tl_bias_shape = {2, (uint32_t)oc, 1, 1}; + cvk_tl_t tl_bias; + if (tl_bias_al) { + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tl_bias, tl_bias_shape, fmt, + /*eu_align=*/0); + tl_bias.start_address = tl_bias_al->start_address; + } + + cvk_tiu_pt_convolution_param_t param; + memset(&param, 0, sizeof(param)); + param.ifmap = tl_input; + param.ofmap = &tl_output; + param.weight = &tl_weight; + param.bias = (tl_bias_al && ps32_mode == 1) ? &tl_bias : NULL; + param.pad_top = (uint8_t)pad_top; + param.pad_bottom = (uint8_t)pad_bot; + param.pad_left = (uint8_t)pad_left; + param.pad_right = (uint8_t)pad_right; + param.stride_h = 1; + param.stride_w = 1; + param.dilation_h = 1; + param.dilation_w = 1; + param.ps32_mode = ps32_mode; + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &param); +} + +static void store_output(cvk_context_t *cvk_ctx, + int oc, int od, int oh, int ow, + int odi, + uint64_t ga_res, + cvk_tl_t *tl_res) +{ + cvk_fmt_t fmt = tl_res->fmt; + uint32_t ds = (fmt == CVK_FMT_BF16) ? 
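+/*
+ * Worked example of get_ps32_mode() above for a depth-3 kernel (kd = 3):
+ *
+ *   kdi = 0 -> mode 2: first depth slice, write 32-bit partial sums
+ *   kdi = 1 -> mode 3: middle slice, read partial sums, accumulate, write back
+ *   kdi = 2 -> mode 1: last slice, read partial sums and finalize the result
+ *
+ * kd == 1 degenerates to mode 0, a plain single-pass convolution. compute()
+ * passes the bias only when ps32_mode == 1, so the bias is added exactly
+ * once, on the finalizing pass.
+ */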
2 : 1; + + // Global memory shape (n, oc, od, oh, ow) + cvk_tg_shape_t tg_res_shape = { + tl_res->shape.n, tl_res->shape.c, tl_res->shape.h, tl_res->shape.w}; + cvk_tg_stride_t tg_stride = { + oc * od * oh * ow * ds, od * oh * ow * ds, ow * ds, ds}; + uint32_t od_stride = oh * ow * ds; + + cvk_tg_t gm_res; + cvk_ctx->ops->gmem_init_tensor(cvk_ctx, &gm_res, tg_res_shape, fmt); + gm_res.start_address = ga_res + od_stride * odi; + gm_res.stride = tg_stride; + + cvk_tdma_l2g_tensor_copy_param_t param; + memset(&param, 0, sizeof(param)); + param.src = tl_res; + param.dst = &gm_res; + cvk_ctx->ops->tdma_l2g_bf16_tensor_copy(cvk_ctx, &param); +} + +// +// N IC ID IH IW +// input (1, 2, 4, 3, 3) +// +// OC IC KD KH KW +// kernel (4, 2, 2, 2, 2) +// +// N OC OD OH OW +// output (1, 4, 3, 2, 2) +// +// pytorch: +// import torch +// import torch.nn as nn +// m = nn.Conv3d(2, 4, [2, 2, 2], stride=1) +// input = torch.rand(1, 2, 4, 3, 3) +// output = m(input) +// +static int conv3d_test(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ +#ifdef DUMP_MSG + printf("===================================\n\n"); + printf("%s =>\n", __FUNCTION__); +#endif + + int ret = 0; + + // input (N=1, IC=2, ID=4, IH=3, IW=3) + int n = 1, ic = 2, id = 4, ih = 3, iw = 3; + float input_data[] = { + // IC=0 + 0.6762, 0.9451, 0.9486, // ic = 0, id = 0, ih = 0 + 0.1077, 0.6062, 0.1011, // ic = 0, id = 0, ih = 1 + 0.1065, 0.9864, 0.8988, // ic = 0, id = 0, ih = 2 + + 0.1986, 0.6289, 0.9028, + 0.6754, 0.3942, 0.3231, + 0.4473, 0.9430, 0.1674, + + 0.8915, 0.2300, 0.2834, + 0.8005, 0.9905, 0.5067, + 0.5892, 0.3737, 0.1197, + + 0.2946, 0.8567, 0.7306, + 0.6123, 0.9854, 0.4904, + 0.9217, 0.8343, 0.0686, + + // IC=1 + 0.7664, 0.0755, 0.4231, // ic = 1, id = 0, ih = 0 + 0.4695, 0.5165, 0.9785, // ic = 1, id = 0, ih = 1 + 0.6668, 0.4878, 0.5354, // ic = 1, id = 0, ih = 2 + + 0.1907, 0.7196, 0.7503, + 0.9623, 0.4420, 0.1084, + 0.5654, 0.9658, 0.8150, + + 0.3203, 0.6839, 0.4136, + 0.3514, 0.4005, 0.4281, + 0.1185, 0.0036, 0.1968, + + 0.8295, 0.1635, 0.6517, + 0.0113, 0.9510, 0.4708, + 0.0686, 0.1143, 0.6780 + }; + + // pytorch weight (Oc=4, Ic=2, kd=2, kh=2, kw=2) + int oc = 4, kd = 2, kh = 2, kw = 2; + int weight_shapes[5] = {oc, ic, kd, kh, kw}; + float weight_data[] = { + // OC=0 + 0.1715, 0.1906, // ic = 0, kd = 0, kh = 0 + 0.0437, -0.0401, // ic = 0, kd = 0, kh = 1 + + -0.2442, 0.1911, // ic = 0, kd = 1, kh = 0 + -0.0082, -0.0663, // ic = 0, kd = 1, kh = 1 + + -0.1137, -0.0246, // ic = 1, kd = 0, kh = 0 + 0.2495, -0.0684, // ic = 1, kd = 0, kh = 1 + + -0.0456, 0.0776, // ic = 1, kd = 1, kh = 0 + 0.1798, 0.1516, // ic = 1, kd = 1, kh = 1 + + // OC=1 + 0.0527, 0.2034, + -0.1434, -0.1642, + + -0.0797, 0.0839, + -0.0746, 0.1446, + + 0.1706, 0.1556, + 0.0149, 0.1610, + + 0.0890, 0.0433, + 0.0363, 0.2293, + + // OC=2 + 0.2052, 0.0489, + -0.1775, 0.0486, + + 0.1524, 0.0386, + 0.1624, 0.0692, + + 0.1914, 0.0774, + -0.1583, 0.1109, + + 0.2034, -0.1709, + 0.1521, -0.1975, + + // OC=3 + 0.1881, 0.1785, + 0.0584, -0.0217, + + 0.1191, 0.2206, + 0.1310, -0.0952, + + -0.1424, 0.1071, + 0.0292, -0.1104, + + 0.1335, 0.1561, + -0.1034, -0.2354 + }; + + // tpu weight shape (kd=2, oc=4, kh(2)*kw(2), ic=2) + // int weight_shapes_tpu[5] = {kd, oc, kh, kw, ic}; + float weight_data_tpu[kd * oc * kh * kw * ic]; + convert_tpu_weight_for_ncdhw(weight_data_tpu, weight_data, weight_shapes); + +#if 0 + float weight_data_tpu_ref[] = { + 0.171500, -0.113700, // kd = 0, oc = 0, kh = 0, kw = 0 + 0.190600, -0.024600, // kd = 0, oc = 0, kh = 0, kw = 1 + 0.043700, 0.249500, 
// kd = 0, oc = 0, kh = 1, kw = 0 + -0.040100, -0.068400, // kd = 0, oc = 0, kh = 1, kw = 1 + + 0.052700, 0.170600, // kd = 0, oc = 1 + 0.203400, 0.155600, + -0.143400, 0.014900, + -0.164200, 0.161000, + + 0.205200, 0.191400, // kd = 0, oc = 2 + 0.048900, 0.077400, + -0.177500, -0.158300, + 0.048600, 0.110900, + + 0.188100, -0.142400, // kd = 0, oc = 3 + 0.178500, 0.107100, + 0.058400, 0.029200, + -0.021700, -0.110400, + + -0.244200, -0.045600, // kd = 1, oc = 0 + 0.191100, 0.077600, + -0.008200, 0.179800, + -0.066300, 0.151600, + + -0.079700, 0.089000, // kd = 1, oc = 1 + 0.083900, 0.043300, + -0.074600, 0.036300, + 0.144600, 0.229300, + + 0.152400, 0.203400, // kd = 1, oc = 2 + 0.038600, -0.170900, + 0.162400, 0.152100, + 0.069200, -0.197500, + + 0.119100, 0.133500, // kd = 1, oc = 3 + 0.220600, 0.156100, + 0.131000, -0.103400, + -0.095200, -0.235400, + }; +#endif + + // dumpFloatData(weight_data, weight_shapes); + // dumpFloatData(weight_data_tpu, weight_shapes_tpu); + + // bias (4) + float bias_data[] = { + 0.1204, -0.1286, -0.0339, -0.1120 + }; + + // output (N=1, Oc=4, Od=3, Oh=2, Ow=2) + int od = 3, oh = 2, ow = 2; + float ref_output_data[] = { + // OC=0 + 0.7170, 0.6444, + 0.3692, 0.4852, + + 0.3749, 0.5013, + 0.2489, 0.3058, + + 0.4620, 0.3949, + 0.5157, 0.2879, + + // OC=1 + 0.4449, 0.4349, + 0.5010, 0.1843, + + 0.2726, 0.4384, + 0.2482, 0.0854, + + 0.3631, 0.1475, + 0.2504, 0.2950, + + // OC=2 + 0.4633, 0.4587, + 0.3968, 0.4154, + + 0.1917, 0.5096, + 0.6285, 0.1435, + + 0.3697, 0.3493, + 0.3388, 0.5705, + + // OC=3 + 0.1802, 0.6468, + 0.0031, 0.0546, + + 0.2840, 0.3474, + 0.3630, 0.2990, + + 0.2374, 0.2000, + 0.6851, 0.5085 + }; + + // dilation = (depth=1, height=1, width=1) + int dilation_d = 1, dilation_h = 1, dilation_w = 1; + + // stride = (depth=1, height=1, width=1) + int stride_d = 1, stride_h = 1, stride_w = 1; + + // zero padding + int pad_top = 0, pad_bot = 0, pad_left = 0, pad_right = 0; + + float output_data_cpu[sizeof(ref_output_data)/sizeof(float)] = {0.0}; + conv3d_float_ref_for_ncdhw(input_data, weight_data, bias_data, output_data_cpu, + n, ic, id, ih, iw, + oc, od, oh, ow, + kd, kh, kw, + stride_d, stride_h, stride_w, + dilation_d, dilation_h, dilation_w, + 0, 0, 0, 0, 0, 0 + ); + + printf(" %s: compare ref\n", __FUNCTION__); + const float precision = 0.0002; + for (size_t i = 0; i < sizeof(output_data_cpu)/sizeof(float); i++) + { + if (fabs(output_data_cpu[i] - ref_output_data[i]) > precision) { + printf(" [%d] Error ! val %f, expected %f\n", + (int)i, output_data_cpu[i], ref_output_data[i]); + ret = -1; + } + } + printf(" %s: compare ref %s\n", __FUNCTION__, ret ? "fail" : "pass"); + + // Partial sum + // oc[0]od[0] = id[0]ic[0] * oc[0]kd[0]ic[0] + id[0]ic[1] * oc[0]kd[0]ic[1] + // oc[0]od[0] += id[1]ic[0] * oc[0]kd[1]ic[0] + id[1]ic[1] * oc[0]kd[1]ic[1] + // + // oc[0]od[1] = id[1]ic[0] * oc[0]kd[0]ic[0] + id[1]ic[1] * oc[0]kd[0]ic[1] + // oc[0]od[1] += ic[2]ic[0] * oc[0]kd[1]ic[0] + id[2]ic[1] * oc[0]kd[1]ic[1] + // + // oc[0]od[2] = id[2]ic[0] * oc[0]kd[0]ic[0] + id[2]ic[1] * oc[0]kd[0]ic[1] + // oc[0]od[2] += ic[3]ic[0] * oc[0]kd[1]ic[0] + id[3]ic[1] * oc[0]kd[1]ic[1] + // + // ... 
+ // + // oc[3]od[0] = id[0]ic[0] * oc[3]kd[0]ic[0] + id[0]ic[1] * oc[3]kd[0]ic[1] + // oc[3]od[0] += id[1]ic[0] * oc[3]kd[1]ic[0] + id[1]ic[1] * oc[3]kd[1]ic[1] + // + // oc[3]od[1] = id[1]ic[0] * oc[3]kd[0]ic[0] + id[1]ic[1] * oc[3]kd[0]ic[1] + // oc[3]od[1] += ic[2]ic[0] * oc[3]kd[1]ic[0] + id[2]ic[1] * oc[3]kd[1]ic[1] + // + // oc[3]od[2] = id[2]ic[0] * oc[3]kd[0]ic[0] + id[2]ic[1] * oc[3]kd[0]ic[1] + // oc[3]od[2] += ic[3]ic[0] * oc[3]kd[1]ic[0] + id[3]ic[1] * oc[3]kd[1]ic[1] + + cvk_fmt_t fmt = CVK_FMT_BF16; + cvk_tl_t *tl_input_al = NULL; + cvk_tl_t *tl_output_al = NULL; + cvk_tl_t *tl_weight_al = NULL; + cvk_tl_t *tl_bias_al = NULL; + + // Allocate ps32 output + { + cvk_tl_shape_t shape = { + 4 * (uint32_t)n, (uint32_t)oc, (uint32_t)oh, (uint32_t)ow}; // 4x + tl_output_al = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, shape, fmt, + /*eu_align=*/1); + } + + // Allocate input + { + cvk_tl_shape_t shape = { + (uint32_t)n, (uint32_t)ic, (uint32_t)ih, (uint32_t)iw}; + tl_input_al = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, shape, fmt, + /*eu_align=*/1); + } + + // Allocate weight + { + cvk_tl_shape_t shape = { + 1, (uint32_t)oc, (uint32_t)(kh*kw), (uint32_t)ic}; + tl_weight_al = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, shape, fmt, + /*eu_align=*/0); + } + + // Allocate bias + // bias + { + cvk_tl_shape_t shape = {2, (uint32_t)oc, 1, 1}; + tl_bias_al = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, shape, fmt, + /*eu_align=*/0); + } + + assert(tl_output_al && tl_input_al && tl_weight_al && tl_bias_al && + "Expect all allocated"); + + CVI_RT_MEM gm_input_dev_mem = NULL; + CVI_RT_MEM gm_weight_dev_mem = NULL; + CVI_RT_MEM gm_bias_dev_mem = NULL; + CVI_RT_MEM gm_output_dev_mem = NULL; + uint64_t ga_input = 0; + uint64_t ga_weight = 0; + uint64_t ga_bias = 0; + uint64_t ga_output = 0; + + // Allocate device memory of input + { + // shape (1, ic=2, id=4, ih=3, iw=3) + // reshape (1, ic=2, id=4, ih=3, iw=3) -> (1, 2, 4, 3x3) + int total_len = 1 * ic * id * ih * iw; + uint16_t input_bf16_data[total_len]; + convert_fp32_to_bf16_data(cvk_ctx, input_bf16_data, input_data, total_len); + + gm_input_dev_mem = CVI_RT_MemAlloc(rt_handle, total_len * sizeof(uint16_t)); + + // Copy from system memory to device memory + CVI_RT_MemCopyS2D(rt_handle, gm_input_dev_mem, (uint8_t *)input_bf16_data); + + ga_input = CVI_RT_MemGetPAddr(gm_input_dev_mem); + } + + // Allocate device memory of weight + { + int len = kd * oc * kh * kw * ic; + uint16_t weight_bf16_data_tpu[len]; + convert_fp32_to_bf16_data(cvk_ctx, weight_bf16_data_tpu, weight_data_tpu, + len); + + gm_weight_dev_mem = CVI_RT_MemAlloc(rt_handle, len * sizeof(uint16_t)); + + // Copy from system memory to device memory + CVI_RT_MemCopyS2D(rt_handle, gm_weight_dev_mem, + (uint8_t *)weight_bf16_data_tpu); + + ga_weight = CVI_RT_MemGetPAddr(gm_weight_dev_mem); + } + + // Allocate device memory of bias + { + int len = oc; + uint16_t bias_bf16_data_tpu[len]; + convert_fp32_to_bf16_data(cvk_ctx, bias_bf16_data_tpu, bias_data, len); + + gm_bias_dev_mem = CVI_RT_MemAlloc(rt_handle, len * sizeof(uint16_t)); + + // Copy from system memory to device memory + CVI_RT_MemCopyS2D(rt_handle, gm_bias_dev_mem, + (uint8_t *)bias_bf16_data_tpu); + + ga_bias = CVI_RT_MemGetPAddr(gm_bias_dev_mem); + } + + // Allocate device memory of output + { + int len = n * oc * od * oh * ow; + gm_output_dev_mem = CVI_RT_MemAlloc(rt_handle, len * sizeof(uint16_t)); + ga_output = CVI_RT_MemGetPAddr(gm_output_dev_mem); + } + + assert(gm_input_dev_mem && gm_output_dev_mem && gm_weight_dev_mem && + 
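+/*
+ * In loop form, the partial-sum schedule spelled out above is (an
+ * illustrative sketch of the calls issued below; no depth padding in this
+ * first test):
+ *
+ *   for (odi = 0; odi < od; odi++) {      // one output-depth slice at a time
+ *     for (kdi = 0; kdi < kd; kdi++) {    // accumulate across kernel depth
+ *       idi = odi + kdi;                  // contributing input-depth slice
+ *       load_input(..., idi, ...);
+ *       load_weight(..., kdi, ...);
+ *       compute(..., get_ps32_mode(kdi, kd), ...);
+ *     }
+ *     store_output(..., odi, ...);        // finalized bf16 slice to gmem
+ *   }
+ */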
gm_bias_dev_mem && "Expect valid gm dev mem"); + assert(ga_input && ga_output && ga_weight && ga_bias && "Expect valid gaddr"); + + load_bias(cvk_ctx, ga_bias, tl_bias_al); + + for (int odi = 0; odi < od; odi++) { + int id_start = odi; // not support padding + + for (int kdi = 0; kdi < kd; kdi++) { + int idi = id_start + kdi; + int ps32_mode = get_ps32_mode(kdi, kd); + + load_input(cvk_ctx, n, ic, id, ih, iw, idi, ga_input, tl_input_al); + load_weight(cvk_ctx, oc, ic, kh, kw, kdi, ga_weight, tl_weight_al); + compute(cvk_ctx, n, ic, kh, kw, + pad_top, pad_bot, pad_left, pad_right, + oc, oh, ow, ps32_mode, tl_input_al, + tl_weight_al, tl_bias_al, tl_output_al); + } + store_output(cvk_ctx, oc, od, oh, ow, odi, ga_output, tl_output_al); + } + + CVI_RT_Submit(cvk_ctx); + + // copy from device memory to system memory + int output_len = n * oc * od * oh * ow; + + uint16_t ref_output_bf16_data[output_len]; + convert_fp32_to_bf16_data(cvk_ctx, ref_output_bf16_data, ref_output_data, + output_len); + + uint16_t output_bf16_data_tpu[output_len]; + CVI_RT_MemCopyD2S(rt_handle, (uint8_t *) output_bf16_data_tpu, + gm_output_dev_mem); + + printf(" %s: compare tpu\n", __FUNCTION__); + const float tpu_precision = 0.01; + for (int i = 0; i < output_len; i++) { + float tpu_data = cvk_convert_bf16_fp32(output_bf16_data_tpu[i]); + if (fabs(tpu_data - ref_output_data[i]) > tpu_precision) { + printf(" [%d] Error ! val %f(0x%x), expected %f(0x%x)\n", + (int)i, tpu_data, output_bf16_data_tpu[i], ref_output_data[i], + ref_output_bf16_data[i]); + ret = -1; + } + } + printf(" %s: compare tpu %s\n", __FUNCTION__, ret ? "fail" : "pass"); + + // Reverse order + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_bias_al); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_weight_al); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input_al); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output_al); + + CVI_RT_MemFree(rt_handle, gm_input_dev_mem); + CVI_RT_MemFree(rt_handle, gm_weight_dev_mem); + CVI_RT_MemFree(rt_handle, gm_bias_dev_mem); + CVI_RT_MemFree(rt_handle, gm_output_dev_mem); + +#ifdef DUMP_MSG + printf("<= %s\n", __FUNCTION__); + printf("===================================\n\n"); +#endif + + return ret; +} + +// # pytorch +// # +// # N IC ID IH IW +// # input (1, 2, 4, 3, 3) +// # +// # OC IC KD KH KW +// # kernel (4, 2, 2, 2, 2) +// # +// # N OC OD OH OW +// # output (1, 4, 5, 4, 4) +// # +// # IC OC KD KH KW +// m = nn.Conv3d(2, 4, [2, 2, 2], stride=(1, 1, 1), padding=(1, 1, 1)) +// input = torch.rand(1, 2, 4, 3, 3) +// output = m(input) +// +static int conv3d_test_ncdhw_pad_dhw(CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx) +{ +#ifdef DUMP_MSG + printf("===================================\n\n"); + printf("%s =>\n", __FUNCTION__); +#endif + + int ret = 0; + + // input (N=1, IC=2, ID=4, IH=3, IW=3) + int n = 1, ic = 2, id = 4, ih = 3, iw = 3; + float input_data[] = { + // IC=0 + 0.3307, 0.6577, 0.3520, // ic = 0, id = 0, ih = 0 + 0.5691, 0.1531, 0.6240, // ic = 0, id = 0, ih = 1 + 0.4324, 0.9731, 0.4587, // ic = 0, id = 0, ih = 2 + + 0.6121, 0.5937, 0.8512, + 0.7932, 0.3473, 0.4032, + 0.0156, 0.6799, 0.8587, + + 0.9278, 0.1046, 0.2478, + 0.4399, 0.2543, 0.8906, + 0.0275, 0.0450, 0.1212, + + 0.5655, 0.6741, 0.3396, + 0.6126, 0.6385, 0.5160, + 0.9062, 0.5286, 0.7064, + + // IC=1 + 0.0512, 0.9951, 0.8289, // ic = 1, id = 0, ih = 0 + 0.9011, 0.0602, 0.5583, // ic = 1, id = 0, ih = 1 + 0.5176, 0.9857, 0.8772, // ic = 1, id = 0, ih = 2 + + 0.8971, 0.5207, 0.1500, + 0.8408, 0.2034, 0.7618, + 0.7618, 0.0702, 0.9254, + + 0.2110, 
0.1366, 0.5222, + 0.0626, 0.9902, 0.2842, + 0.0101, 0.6390, 0.0038, + + 0.7045, 0.3892, 0.7232, + 0.7224, 0.8458, 0.6474, + 0.0602, 0.9074, 0.4171 + }; + + + // pytorch weight (Oc=4, Ic=2, kd=2, kh=2, kw=2) + int oc = 4, kd = 2, kh = 2, kw = 2; + int weight_shapes[5] = {oc, ic, kd, kh, kw}; + float weight_data[] = { + // OC=0 + -0.2046, -0.2492, // ic = 0, kd = 0, kh = 0 + -0.0783, 0.1082, // ic = 0, kd = 0, kh = 1 + + 0.1393, -0.1803, // ic = 0, kd = 1, kh = 0 + -0.0110, -0.1141, // ic = 0, kd = 1, kh = 1 + + 0.0606, 0.1902, // ic = 1, kd = 0, kh = 0 + 0.1254, 0.1572, // ic = 1, kd = 0, kh = 1 + + 0.0887, -0.0336, // ic = 1, kd = 1, kh = 0 + 0.0918, -0.1099, // ic = 1, kd = 1, kh = 1 + + // OC=1 + -0.0181, -0.2228, + -0.0575, -0.2464, + + -0.0757, -0.0122, + -0.1896, 0.1301, + + -0.0215, 0.0568, + -0.1381, -0.1621, + + -0.1247, -0.0738, + -0.0146, 0.0719, + + // OC=2 + 0.0960, -0.1865, + -0.2124, -0.0125, + + 0.0159, 0.1148, + 0.1430, 0.1978, + + 0.0292, -0.2130, + 0.2055, 0.1678, + + 0.2236, -0.0215, + -0.2171, 0.1709, + + // OC=3 + 0.2186, 0.1488, + 0.1558, 0.0359, + + 0.1822, -0.0433, + 0.0960, 0.1791, + + -0.0060, 0.0006, + 0.0400, 0.1488, + + 0.1811, -0.1055, + 0.1138, -0.0898 + }; + + // tpu weight shape (kd=2, oc=4, kh(2)*kw(2), ic=2) + // int weight_shapes_tpu[5] = {kd, oc, kh, kw, ic}; + float weight_data_tpu[kd * oc * kh * kw * ic]; + convert_tpu_weight_for_ncdhw(weight_data_tpu, weight_data, weight_shapes); + + // bias (4) + float bias_data[] = { + -0.2107, -0.1894, -0.0108, 0.1728 + }; + + // output (N=1, Oc=4, Od=5, Oh=4, Ow=4) + int od = 5, oh = 4, ow = 4; + float ref_output_data[] = { + // OC=0 + -2.5403e-01, -3.9400e-01, -2.5784e-01, -1.3846e-01, + -4.3596e-01, -2.5972e-01, -2.5080e-01, -4.3702e-02, + -4.4977e-01, -2.5769e-01, -3.8422e-01, 1.2387e-03, + -3.0602e-01, -3.1306e-01, -9.9820e-02, -6.8943e-02, + + -3.3526e-01, -5.1864e-02, -4.1363e-02, -1.2992e-01, + -4.0344e-01, -1.0866e-01, -2.0857e-01, -1.3983e-02, + -3.0966e-01, 9.2221e-02, -2.8528e-01, -3.1210e-02, + -2.4841e-01, -3.7795e-01, -3.8244e-01, -4.9618e-02, + + -1.3243e-01, -1.7816e-02, -1.5046e-01, -2.1334e-01, + -2.0596e-01, -2.3001e-01, -4.0274e-01, -2.1468e-01, + -2.1257e-01, -2.7799e-01, -3.3916e-02, -4.9950e-02, + -7.4998e-02, -3.4861e-01, -3.4250e-01, -3.1303e-01, + + -2.1902e-01, -2.8536e-01, -1.8272e-01, -1.0197e-01, + -6.1921e-01, -3.3074e-01, 4.2541e-02, -9.8628e-02, + -5.4856e-01, -2.2603e-01, -2.8005e-01, -2.2485e-01, + -3.8100e-01, -9.9595e-02, -1.9782e-01, -9.9829e-02, + + -3.8680e-02, -3.2488e-02, -6.4191e-02, -1.4660e-01, + -3.7711e-02, -1.3294e-01, -5.8401e-02, -1.9556e-01, + -1.1838e-01, -1.5398e-01, -8.1088e-02, -2.8004e-01, + -4.2503e-01, -3.5157e-01, -3.6049e-01, -3.2990e-01, + + // OC=1 + -1.4269e-01, -9.5723e-02, -2.2320e-01, -2.6823e-01, + -5.8374e-02, -3.9911e-01, -3.3735e-01, -4.4587e-01, + -1.6939e-01, -2.4322e-01, -3.3348e-01, -4.0600e-01, + -2.3289e-01, -3.7133e-01, -4.5634e-01, -3.3350e-01, + + -1.3504e-01, -5.5332e-01, -5.8443e-01, -4.8773e-01, + -4.5654e-01, -7.9793e-01, -6.0842e-01, -4.9727e-01, + -4.7046e-01, -8.5047e-01, -8.1262e-01, -6.6208e-01, + -3.1279e-01, -4.7892e-01, -4.1965e-01, -3.9698e-01, + + -3.4979e-01, -7.3478e-01, -4.8156e-01, -3.1361e-01, + -5.7180e-01, -6.9073e-01, -6.5631e-01, -5.9334e-01, + -4.5140e-01, -6.4365e-01, -8.3343e-01, -5.1614e-01, + -1.5070e-01, -4.0468e-01, -4.2686e-01, -2.3451e-01, + + -3.2802e-01, -3.2160e-01, -3.9730e-01, -3.5070e-01, + -4.2998e-01, -6.3385e-01, -8.1355e-01, -5.1874e-01, + -2.3089e-01, -5.6220e-01, -7.1846e-01, -4.7895e-01, + 
-2.1047e-01, -3.1337e-01, -4.2336e-01, -2.9714e-01, + + -4.4296e-01, -5.4842e-01, -4.8282e-01, -3.0881e-01, + -5.4348e-01, -7.7235e-01, -6.3022e-01, -3.3020e-01, + -5.1796e-01, -6.4802e-01, -6.9482e-01, -3.1089e-01, + -3.8791e-01, -2.7335e-01, -3.5223e-01, -2.1117e-01, + + // OC=2 + 6.3343e-02, 3.2551e-01, 7.8462e-02, -1.4047e-01, + 2.9263e-01, -1.3682e-02, 4.7238e-01, 1.4810e-01, + 2.0917e-01, 5.2640e-01, 2.3049e-01, -9.6724e-04, + 2.7707e-02, 2.0233e-01, 2.5884e-01, 1.9259e-01, + + 2.6804e-01, 1.8736e-01, 3.5448e-01, 1.7387e-01, + 4.1227e-01, 6.1802e-02, 3.4067e-01, -3.1375e-02, + -2.1211e-02, 4.1589e-01, 3.9848e-01, 2.4676e-01, + -2.1633e-01, -9.8574e-02, -5.5862e-02, 2.7933e-01, + + 3.5165e-01, 2.5434e-01, 1.0813e-01, -2.3880e-01, + 1.4803e-02, 2.2636e-01, 5.6942e-02, 3.3249e-01, + -1.5394e-01, 2.8699e-01, 1.9381e-02, 1.5203e-01, + -1.7307e-01, -1.3476e-01, -1.4338e-01, 1.0136e-01, + + 2.4526e-01, -1.5181e-02, 2.8220e-01, -6.4634e-02, + 7.0619e-02, 5.5526e-01, 2.7332e-01, -2.2993e-03, + 1.3952e-01, 4.8027e-01, 2.7088e-01, 2.2137e-01, + 8.4666e-02, -8.3372e-02, 2.7218e-01, 1.0539e-01, + + 1.0030e-01, 7.0672e-02, 4.3040e-02, 6.5646e-02, + -1.5283e-01, 7.5928e-03, -1.1840e-02, 6.6282e-02, + -2.8021e-01, -2.6473e-01, -2.3691e-02, -6.7624e-03, + -1.9269e-01, -2.1400e-01, -1.5423e-01, 6.9144e-02, + + // OC=3 + 2.2745e-01, 2.3889e-01, 3.3790e-01, 3.0098e-01, + 1.7417e-01, 2.8818e-01, 4.5343e-01, 5.1056e-01, + 8.4155e-02, 6.1303e-01, 3.3481e-01, 5.3153e-01, + 9.9508e-02, 1.9928e-01, 4.1631e-01, 4.1526e-01, + + 2.2143e-01, 6.1859e-01, 7.0634e-01, 3.5959e-01, + 3.2209e-01, 8.9161e-01, 7.0540e-01, 6.7204e-01, + 1.6198e-01, 1.0484e+00, 7.8346e-01, 8.1161e-01, + 1.5643e-01, 5.1360e-01, 4.5026e-01, 5.9188e-01, + + 4.7556e-01, 5.2241e-01, 3.6209e-01, 3.9465e-01, + 4.2880e-01, 7.8418e-01, 8.6553e-01, 7.0884e-01, + 3.8365e-01, 3.9124e-01, 8.4084e-01, 6.5296e-01, + 1.7330e-01, 2.1039e-01, 5.6763e-01, 3.7774e-01, + + 2.7558e-01, 5.7020e-01, 3.8613e-01, 3.4723e-01, + 2.8224e-01, 9.5750e-01, 6.7976e-01, 6.9007e-01, + 2.9504e-01, 6.4116e-01, 8.1472e-01, 7.1143e-01, + 1.3136e-01, 2.4329e-01, 3.8296e-01, 4.0355e-01, + + 2.9795e-01, 3.7119e-01, 4.1321e-01, 2.5462e-01, + 3.8687e-01, 6.6586e-01, 6.1692e-01, 3.4898e-01, + 3.0586e-01, 6.9549e-01, 5.9051e-01, 4.0845e-01, + 3.0771e-01, 4.4973e-01, 3.8828e-01, 3.2473e-01, + }; + + // dilation = (depth=1, height=1, width=1) + int dilation_d = 1, dilation_h = 1, dilation_w = 1; + + // stride = (depth=1, height=1, width=1) + int stride_d = 1, stride_h = 1, stride_w = 1; + + // padding = (1, 1, 1) + int pad_d0 = 1, pad_d1 = 1; + int pad_top = 1, pad_bot = 1, pad_left = 1, pad_right = 1; + + float output_data_cpu[sizeof(ref_output_data)/sizeof(float)] = {0.0}; + conv3d_float_ref_for_ncdhw( + input_data, weight_data, bias_data, output_data_cpu, + n, ic, id, ih, iw, + oc, od, oh, ow, + kd, kh, kw, + stride_d, stride_h, stride_w, + dilation_d, dilation_h, dilation_w, + pad_d0, pad_d1, + pad_top, pad_bot, pad_left, pad_right); + + printf(" %s: compare ref\n", __FUNCTION__); + const float precision = 0.0002; + for (size_t i = 0; i < sizeof(output_data_cpu)/sizeof(float); i++) + { + if (fabs(output_data_cpu[i] - ref_output_data[i]) > precision) { + printf(" [%d] Error ! val %f, expected %f\n", + (int)i, output_data_cpu[i], ref_output_data[i]); + ret = -1; + } + } + printf(" %s: compare ref %s\n", __FUNCTION__, ret ? 
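+/*
+ * Depth indexing with padding, worked for this case (od = 5, kd = 2,
+ * stride_d = 1, dilation_d = 1, pad_d0 = 1), so idi = odi + kdi - 1:
+ *
+ *   odi = 0 -> idi in {-1, 0}: slice -1 is zero-filled by load_input()
+ *   odi = 4 -> idi in { 3, 4}: slice 4 == id is zero-filled as well
+ *
+ * Out-of-range depth slices therefore contribute zeros, exactly mirroring
+ * spatial zero padding.
+ */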
"fail" : "pass"); + + if (ret) + return ret; + + cvk_fmt_t fmt = CVK_FMT_BF16; + cvk_tl_t *tl_input_al = NULL; + cvk_tl_t *tl_output_al = NULL; + cvk_tl_t *tl_weight_al = NULL; + cvk_tl_t *tl_bias_al = NULL; + + // Allocate ps32 output + { + cvk_tl_shape_t shape = { + 4 * (uint32_t)n, (uint32_t)oc, (uint32_t)oh, (uint32_t)ow}; // 4x + tl_output_al = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, shape, fmt, + /*eu_align=*/1); + } + + // Allocate input + { + cvk_tl_shape_t shape = { + (uint32_t)n, (uint32_t)ic, (uint32_t)ih, (uint32_t)iw}; + tl_input_al = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, shape, fmt, + /*eu_align=*/1); + } + + // Allocate weight + { + cvk_tl_shape_t shape = { + 1, (uint32_t)oc, (uint32_t)(kh*kw), (uint32_t)ic}; + tl_weight_al = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, shape, fmt, + /*eu_align=*/0); + } + + // Allocate bias + // bias + { + cvk_tl_shape_t shape = {2, (uint32_t)oc, 1, 1}; + tl_bias_al = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, shape, fmt, + /*eu_align=*/0); + } + + assert(tl_output_al && tl_input_al && tl_weight_al && tl_bias_al && + "Expect all allocated"); + + CVI_RT_MEM gm_input_dev_mem = NULL; + CVI_RT_MEM gm_weight_dev_mem = NULL; + CVI_RT_MEM gm_bias_dev_mem = NULL; + CVI_RT_MEM gm_output_dev_mem = NULL; + uint64_t ga_input = 0; + uint64_t ga_weight = 0; + uint64_t ga_bias = 0; + uint64_t ga_output = 0; + + // Allocate device memory of input + { + // shape (1, ic=2, id=4, ih=3, iw=3) + // reshape (1, ic=2, id=4, ih=3, iw=3) -> (1, 2, 4, 3x3) + int total_len = 1 * ic * id * ih * iw; + uint16_t input_bf16_data[total_len]; + convert_fp32_to_bf16_data(cvk_ctx, input_bf16_data, input_data, total_len); + + gm_input_dev_mem = CVI_RT_MemAlloc(rt_handle, total_len * sizeof(uint16_t)); + + // Copy from system memory to device memory + CVI_RT_MemCopyS2D(rt_handle, gm_input_dev_mem, (uint8_t *)input_bf16_data); + + ga_input = CVI_RT_MemGetPAddr(gm_input_dev_mem); + } + + // Allocate device memory of weight + { + int len = kd * oc * kh * kw * ic; + uint16_t weight_bf16_data_tpu[len]; + convert_fp32_to_bf16_data(cvk_ctx, weight_bf16_data_tpu, weight_data_tpu, + len); + + gm_weight_dev_mem = CVI_RT_MemAlloc(rt_handle, len * sizeof(uint16_t)); + + // Copy from system memory to device memory + CVI_RT_MemCopyS2D(rt_handle, gm_weight_dev_mem, + (uint8_t *)weight_bf16_data_tpu); + + ga_weight = CVI_RT_MemGetPAddr(gm_weight_dev_mem); + } + + // Allocate device memory of bias + { + int len = oc; + uint16_t bias_bf16_data_tpu[len]; + convert_fp32_to_bf16_data(cvk_ctx, bias_bf16_data_tpu, bias_data, len); + + gm_bias_dev_mem = CVI_RT_MemAlloc(rt_handle, len * sizeof(uint16_t)); + + // Copy from system memory to device memory + CVI_RT_MemCopyS2D(rt_handle, gm_bias_dev_mem, + (uint8_t *)bias_bf16_data_tpu); + + ga_bias = CVI_RT_MemGetPAddr(gm_bias_dev_mem); + } + + // Allocate device memory of output + { + int len = n * oc * od * oh * ow; + gm_output_dev_mem = CVI_RT_MemAlloc(rt_handle, len * sizeof(uint16_t)); + ga_output = CVI_RT_MemGetPAddr(gm_output_dev_mem); + } + + assert(gm_input_dev_mem && gm_output_dev_mem && gm_weight_dev_mem && + gm_bias_dev_mem && "Expect valid gm dev mem"); + assert(ga_input && ga_output && ga_weight && ga_bias && "Expect valid gaddr"); + + load_bias(cvk_ctx, ga_bias, tl_bias_al); + + for (int odi = 0; odi < od; odi++) { + for (int kdi = 0; kdi < kd; kdi++) { + int idi = odi * stride_d + kdi * dilation_d - pad_d0; + int ps32_mode = get_ps32_mode(kdi, kd); + + load_input(cvk_ctx, n, ic, id, ih, iw, idi, ga_input, tl_input_al); + load_weight(cvk_ctx, 
oc, ic, kh, kw, kdi, ga_weight, tl_weight_al); + compute(cvk_ctx, n, ic, kh, kw, + pad_top, pad_bot, pad_left, pad_right, + oc, oh, ow, + ps32_mode, tl_input_al, + tl_weight_al, tl_bias_al, tl_output_al); + } + store_output(cvk_ctx, oc, od, oh, ow, odi, ga_output, tl_output_al); + } + + CVI_RT_Submit(cvk_ctx); + + // copy from device memory to system memory + int output_len = n * oc * od * oh * ow; + + uint16_t ref_output_bf16_data[output_len]; + convert_fp32_to_bf16_data(cvk_ctx, ref_output_bf16_data, ref_output_data, + output_len); + + uint16_t output_bf16_data_tpu[output_len]; + CVI_RT_MemCopyD2S(rt_handle, (uint8_t *) output_bf16_data_tpu, + gm_output_dev_mem); + + printf(" %s: compare tpu\n", __FUNCTION__); + const float tpu_precision = 0.01; + for (int i = 0; i < output_len; i++) { + float tpu_data = cvk_convert_bf16_fp32(output_bf16_data_tpu[i]); + if (fabs(tpu_data - ref_output_data[i]) > tpu_precision) { + printf(" [%d] Error ! val %f(0x%x), expected %f(0x%x)\n", + (int)i, tpu_data, output_bf16_data_tpu[i], ref_output_data[i], + ref_output_bf16_data[i]); + ret = -1; + } + } + printf(" %s: compare tpu %s\n", __FUNCTION__, ret ? "fail" : "pass"); + + // Reverse order + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_bias_al); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_weight_al); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input_al); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output_al); + + CVI_RT_MemFree(rt_handle, gm_input_dev_mem); + CVI_RT_MemFree(rt_handle, gm_weight_dev_mem); + CVI_RT_MemFree(rt_handle, gm_bias_dev_mem); + CVI_RT_MemFree(rt_handle, gm_output_dev_mem); + +#ifdef DUMP_MSG + printf("<= %s\n", __FUNCTION__); + printf("===================================\n\n"); +#endif + + return ret; +} + +// Input (n, id, ic, ih, iw) +static void load_input_for_ndchw(cvk_context_t *cvk_ctx, + int n, int ic, int id, int ih, int iw, + int pad_d0, int pad_d1, + uint64_t ga_input, + cvk_tl_t *tl_input_al) +{ + cvk_fmt_t fmt = tl_input_al->fmt; + uint32_t start_addr = tl_input_al->start_address; + + // Fill (n, pad0, ic, ih, iw) + if (pad_d0) { + uint32_t elt_size = fmt == CVK_FMT_BF16 ? 
2 : 1; + + cvk_tl_shape_t tl_shape = { + (uint32_t)n, (uint32_t)(pad_d0 * ic), (uint32_t)ih, + (uint32_t)iw * elt_size}; + cvk_tl_t tl_pad; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tl_pad, tl_shape, CVK_FMT_I8, + /*eu_align=*/1); + tl_pad.start_address = start_addr; + + cvk_tiu_xor_int8_param_t param; + memset(&param, 0, sizeof(param)); + param.res = &tl_pad; + param.a = &tl_pad; + param.b = &tl_pad; + cvk_ctx->ops->tiu_xor_int8(cvk_ctx, &param); + + start_addr = addr_after_right_shift(cvk_ctx, start_addr, + (uint32_t)(pad_d0 * ic), + tl_pad.stride.c); + } + + // reshape (n, id, ic, ih, iw) -> (n, id*ic, ih, iw) + cvk_tl_shape_t tl_shape = { + (uint32_t)n, (uint32_t)id*ic, (uint32_t)ih, (uint32_t)iw}; + cvk_tl_t tl_input; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tl_input, tl_shape, fmt, + tl_input_al->eu_align); + tl_input.start_address = start_addr; + + cvk_tg_shape_t gm_input_shape = { + (uint32_t)n, (uint32_t)id*ic, (uint32_t)ih, (uint32_t)iw}; + + cvk_tg_t gm_input; + cvk_ctx->ops->gmem_init_tensor(cvk_ctx, &gm_input, gm_input_shape, fmt); + gm_input.start_address = ga_input; + + cvk_tdma_g2l_tensor_copy_param_t param; + memset(&param, 0, sizeof(param)); + param.src = &gm_input; + param.dst = &tl_input; + cvk_ctx->ops->tdma_g2l_bf16_tensor_copy(cvk_ctx, &param); + + if (pad_d1) { + start_addr = addr_after_right_shift(cvk_ctx, start_addr, + (uint32_t)(id * ic), + tl_input.stride.c); + cvk_tl_shape_t tl_shape = { + (uint32_t)n, (uint32_t)(pad_d1 * ic), (uint32_t)ih, (uint32_t)iw}; + cvk_tl_t tl_pad; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tl_pad, tl_shape, fmt, + /*eu_align=*/1); + tl_pad.start_address = start_addr; + + cvk_tdma_g2l_tensor_fill_constant_param_t param; + memset(&param, 0, sizeof(param)); + param.constant = cvk_ctx->misc_ops->float_to_bfloat16(cvk_ctx, 0.0); + param.dst = &tl_pad; + cvk_ctx->ops->tdma_g2l_bf16_tensor_fill_constant(cvk_ctx, &param); + } +} + +static void convert_tpu_weight_for_ndchw( + float *tpu_weight, float cpu_weight[5], int cpu_shapes[5]) +{ + // Permute + // 0 1 2 3 4 0 3 4 2 1 + // (oc, ic, kd, kh, kw) -> (oc, kh, kw, kd, ic) + int orders[5] = {0, 3, 4, 2, 1}; + permute5d(tpu_weight, cpu_weight, cpu_shapes, orders); +} + +// TPU weight (1, oc, kh*kw, kd*ic) +static void load_weight_for_ndchw(cvk_context_t *cvk_ctx, + int oc, int ic, int kd, + int kh, int kw, + uint64_t ga_weight, + cvk_tl_t *tl_weight_al) +{ + cvk_fmt_t fmt = tl_weight_al->fmt; + cvk_tg_shape_t gm_weight_shape = { + 1, (uint32_t)oc, (uint32_t)kh*kw, (uint32_t)kd*ic}; + cvk_tg_t gm_weight; + cvk_ctx->ops->gmem_init_tensor(cvk_ctx, &gm_weight, gm_weight_shape, fmt); + gm_weight.start_address = ga_weight; + + cvk_tdma_g2l_tensor_copy_param_t param; + memset(&param, 0, sizeof(param)); + param.src = &gm_weight; + param.dst = tl_weight_al; + cvk_ctx->ops->tdma_g2l_bf16_tensor_copy(cvk_ctx, &param); +} + +static void store_output_for_ndchw(cvk_context_t *cvk_ctx, + int oc, int od, int oh, int ow, + int odi, + uint64_t ga_output, + cvk_tl_t *tl_output) +{ + assert(odi < od); + + cvk_fmt_t fmt = tl_output->fmt; + uint32_t ds = (fmt == CVK_FMT_BF16) ?
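+ /* ds: element size in bytes (bf16 = 2, int8 = 1), used for the global-memory strides below */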
2 : 1; + + // Global memory shape (n, od, oc, oh, ow) + cvk_tg_shape_t tg_output_shape = { + tl_output->shape.n, tl_output->shape.c, tl_output->shape.h, + tl_output->shape.w}; + cvk_tg_stride_t tg_stride = { + oc * oh * ow * ds, oh * ow * ds, ow * ds, ds}; + + cvk_tg_t gm_output; + cvk_ctx->ops->gmem_init_tensor(cvk_ctx, &gm_output, tg_output_shape, fmt); + gm_output.start_address = ga_output + tg_stride.n * odi; + + cvk_tdma_l2g_tensor_copy_param_t param; + memset(&param, 0, sizeof(param)); + param.src = tl_output; + param.dst = &gm_output; + cvk_ctx->ops->tdma_l2g_bf16_tensor_copy(cvk_ctx, &param); +} + +static void compute_for_ndchw(cvk_context_t *cvk_ctx, + int ic, int id, int ih, int iw, + int kd, int kh, int kw, + int sd, int sh, int sw, + int pad_top, int pad_bot, + int pad_left, int pad_right, + int oc, int oh, int ow, + int odi, + cvk_tl_t *tl_input_al, + cvk_tl_t *tl_weight_al, + cvk_tl_t *tl_bias_al, + cvk_tl_t *tl_output_al) +{ + (void)id; + + cvk_fmt_t fmt = tl_weight_al->fmt; + cvk_tl_shape_t tl_input_shape = { + 1, (uint32_t)kd*ic, (uint32_t)ih, (uint32_t)iw}; + cvk_tl_t tl_input; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tl_input, tl_input_shape, fmt, + /*eu_align=*/1); + tl_input.start_address = addr_after_right_shift(cvk_ctx, + tl_input_al->start_address, + (uint32_t)(odi * sd * kd), + tl_input.stride.c); + + cvk_tl_shape_t tl_output_shape = { + 1, (uint32_t)oc, (uint32_t)oh, (uint32_t)ow}; + cvk_tl_t tl_output; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tl_output, tl_output_shape, fmt, + /*eu_align=*/1); + tl_output.start_address = tl_output_al->start_address; + + cvk_tl_shape_t tl_weight_shape = { + (uint32_t)kd*ic, (uint32_t)oc, (uint32_t)kh, (uint32_t)kw}; + cvk_tl_t tl_weight; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tl_weight, tl_weight_shape, fmt, + /*eu_align=*/0); + tl_weight.start_address = tl_weight_al->start_address; + + cvk_tl_t tl_bias; + if (tl_bias_al) { + cvk_tl_shape_t tl_bias_shape = {2, (uint32_t)oc, 1, 1}; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tl_bias, tl_bias_shape, fmt, + /*eu_align=*/0); + tl_bias.start_address = tl_bias_al->start_address; + } + + cvk_tiu_pt_convolution_param_t param; + memset(&param, 0, sizeof(param)); + param.ifmap = &tl_input; + param.ofmap = &tl_output; + param.weight = &tl_weight; + param.bias = tl_bias_al ?
&tl_bias : NULL; + param.pad_top = (uint8_t)pad_top; + param.pad_bottom = (uint8_t)pad_bot; + param.pad_left = (uint8_t)pad_left; + param.pad_right = (uint8_t)pad_right; + param.stride_h = sh; + param.stride_w = sw; + param.dilation_h = 1; + param.dilation_w = 1; + cvk_ctx->ops->tiu_pt_convolution(cvk_ctx, &param); +} + +static int conv3d_test_ndchw(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ +#ifdef DUMP_MSG + printf("===================================\n\n"); + printf("%s =>\n", __FUNCTION__); +#endif + + int ret = 0; + + // input (N=1, IC=2, ID=4, IH=3, IW=3) + int n = 1, ic = 2, id = 4, ih = 3, iw = 3; + int input_shapes[5] = {n, ic, id, ih, iw}; + float input_data[] = { + // IC=0 + 0.6762, 0.9451, 0.9486, // ic = 0, id = 0, ih = 0 + 0.1077, 0.6062, 0.1011, // ic = 0, id = 0, ih = 1 + 0.1065, 0.9864, 0.8988, // ic = 0, id = 0, ih = 2 + + 0.1986, 0.6289, 0.9028, + 0.6754, 0.3942, 0.3231, + 0.4473, 0.9430, 0.1674, + + 0.8915, 0.2300, 0.2834, + 0.8005, 0.9905, 0.5067, + 0.5892, 0.3737, 0.1197, + + 0.2946, 0.8567, 0.7306, + 0.6123, 0.9854, 0.4904, + 0.9217, 0.8343, 0.0686, + + // IC=1 + 0.7664, 0.0755, 0.4231, // ic = 1, id = 0, ih = 0 + 0.4695, 0.5165, 0.9785, // ic = 1, id = 0, ih = 1 + 0.6668, 0.4878, 0.5354, // ic = 1, id = 0, ih = 2 + + 0.1907, 0.7196, 0.7503, + 0.9623, 0.4420, 0.1084, + 0.5654, 0.9658, 0.8150, + + 0.3203, 0.6839, 0.4136, + 0.3514, 0.4005, 0.4281, + 0.1185, 0.0036, 0.1968, + + 0.8295, 0.1635, 0.6517, + 0.0113, 0.9510, 0.4708, + 0.0686, 0.1143, 0.6780 + }; + + // tpu input shape (n=1, id=4, ic=2, ih=3, iw=3) + float input_data_tpu[n * id * ic * ih * iw]; + convert_ncdhw_to_ndchw(input_data_tpu, input_data, input_shapes); + +#if 0 + int input_shapes_tpu[5] = {n, id, ic, ih, iw}; + dumpFloatData(input_data_tpu, input_shapes_tpu); + + float input_data_tpu_ref[] = { + 0.676200, 0.945100, 0.948600, // id=0, ic=0, ih=0, iw=0 + 0.107700, 0.606200, 0.101100, + 0.106500, 0.986400, 0.898800, + + 0.766400, 0.075500, 0.423100, // id=0, ic=1, ih=0, iw=0 + 0.469500, 0.516500, 0.978500, + 0.666800, 0.487800, 0.535400, + + 0.198600, 0.628900, 0.902800, // id=1, ic=0, ih=0, iw=0 + 0.675400, 0.394200, 0.323100, + 0.447300, 0.943000, 0.167400, + + 0.190700, 0.719600, 0.750300, // id=1, ic=1, ih=0, iw=0 + 0.962300, 0.442000, 0.108400, + 0.565400, 0.965800, 0.815000, + + 0.891500, 0.230000, 0.283400, // id=2, ic=0, ih=0, iw=0 + 0.800500, 0.990500, 0.506700, + 0.589200, 0.373700, 0.119700, + + 0.320300, 0.683900, 0.413600, // id=2, ic=1, ih=0, iw=0 + 0.351400, 0.400500, 0.428100, + 0.118500, 0.003600, 0.196800, + + 0.294600, 0.856700, 0.730600, // id=3, ic=0, ih=0, iw=0 + 0.612300, 0.985400, 0.490400, + 0.921700, 0.834300, 0.068600, + + 0.829500, 0.163500, 0.651700, // id=3, ic=1, ih=0, iw=0 + 0.011300, 0.951000, 0.470800, + 0.068600, 0.114300, 0.678000, + }; +#endif + + // pytorch weight (Oc=4, Ic=2, kd=2, kh=2, kw=2) + int oc = 4, kd = 2, kh = 2, kw = 2; + int weight_shapes[5] = {oc, ic, kd, kh, kw}; + float weight_data[] = { + // OC=0 + 0.1715, 0.1906, // ic = 0, kd = 0, kh = 0 + 0.0437, -0.0401, // ic = 0, kd = 0, kh = 1 + + -0.2442, 0.1911, // ic = 0, kd = 1, kh = 0 + -0.0082, -0.0663, // ic = 0, kd = 1, kh = 1 + + -0.1137, -0.0246, // ic = 1, kd = 0, kh = 0 + 0.2495, -0.0684, // ic = 1, kd = 0, kh = 1 + + -0.0456, 0.0776, // ic = 1, kd = 1, kh = 0 + 0.1798, 0.1516, // ic = 1, kd = 1, kh = 1 + + // OC=1 + 0.0527, 0.2034, + -0.1434, -0.1642, + + -0.0797, 0.0839, + -0.0746, 0.1446, + + 0.1706, 0.1556, + 0.0149, 0.1610, + + 0.0890, 0.0433, + 0.0363, 0.2293, + + // OC=2 + 0.2052,
0.0489, + -0.1775, 0.0486, + + 0.1524, 0.0386, + 0.1624, 0.0692, + + 0.1914, 0.0774, + -0.1583, 0.1109, + + 0.2034, -0.1709, + 0.1521, -0.1975, + + // OC=3 + 0.1881, 0.1785, + 0.0584, -0.0217, + + 0.1191, 0.2206, + 0.1310, -0.0952, + + -0.1424, 0.1071, + 0.0292, -0.1104, + + 0.1335, 0.1561, + -0.1034, -0.2354 + }; + + // tpu weight shape (1, oc=4, kh(2)*kw(2), kd(2)*ic(2)) + float weight_data_tpu[oc * kh * kw * kd * ic]; + convert_tpu_weight_for_ndchw(weight_data_tpu, weight_data, weight_shapes); + +#if 0 + int weight_shapes_tpu[5] = {oc, kh, kw, kd, ic}; + dumpFloatData(weight_data_tpu, weight_shapes_tpu); + + float weight_data_tpu_ref[] = { + 0.171500, -0.113700, // oc=0, kh=0, kw=0, kd=0, ic=0 + -0.244200, -0.045600, + + 0.190600, -0.024600, // oc=0, kh=0, kw=1, kd=0, ic=0 + 0.191100, 0.077600, + + 0.043700, 0.249500, // oc=0, kh=1, kw=0, kd=0, ic=0 + -0.008200, 0.179800, + + -0.040100, -0.068400, // oc=0, kh=1, kw=1, kd=0, ic=0 + -0.066300, 0.151600, + + 0.052700, 0.170600, // oc=1, kh=0, kw=0, kd=0, ic=0 + -0.079700, 0.089000, + + 0.203400, 0.155600, + 0.083900, 0.043300, + + -0.143400, 0.014900, + -0.074600, 0.036300, + + -0.164200, 0.161000, + 0.144600, 0.229300, + + 0.205200, 0.191400, + 0.152400, 0.203400, + + 0.048900, 0.077400, + 0.038600, -0.170900, + + -0.177500, -0.158300, + 0.162400, 0.152100, + + 0.048600, 0.110900, + 0.069200, -0.197500, + + 0.188100, -0.142400, + 0.119100, 0.133500, + + 0.178500, 0.107100, + 0.220600, 0.156100, + + 0.058400, 0.029200, + 0.131000, -0.103400, + + -0.021700, -0.110400, + -0.095200, -0.235400, + }; +#endif + + // bias (4) + float bias_data[] = { + 0.1204, -0.1286, -0.0339, -0.1120 + }; + + // output (N=1, Oc=4, Od=3, Oh=2, Ow=2) + int od = 3, oh = 2, ow = 2; + int output_shapes[5] = {n, oc, od, oh, ow}; + float ref_output_data[] = { + // OC=0 + 0.7170, 0.6444, + 0.3692, 0.4852, + + 0.3749, 0.5013, + 0.2489, 0.3058, + + 0.4620, 0.3949, + 0.5157, 0.2879, + + // OC=1 + 0.4449, 0.4349, + 0.5010, 0.1843, + + 0.2726, 0.4384, + 0.2482, 0.0854, + + 0.3631, 0.1475, + 0.2504, 0.2950, + + // OC=2 + 0.4633, 0.4587, + 0.3968, 0.4154, + + 0.1917, 0.5096, + 0.6285, 0.1435, + + 0.3697, 0.3493, + 0.3388, 0.5705, + + // OC=3 + 0.1802, 0.6468, + 0.0031, 0.0546, + + 0.2840, 0.3474, + 0.3630, 0.2990, + + 0.2374, 0.2000, + 0.6851, 0.5085 + }; + + // tpu output (n, od, oc, oh, ow) + float ref_output_data_ndchw[n * od * oc * oh * ow]; + convert_ncdhw_to_ndchw(ref_output_data_ndchw, ref_output_data, output_shapes); + + // dilation = (depth=1, height=1, width=1) + int dilation_d = 1, dilation_h = 1, dilation_w = 1; + + // stride = (depth=1, height=1, width=1) + int stride_d = 1, stride_h = 1, stride_w = 1; + + // zero padding + int pad_d0 = 0, pad_d1 = 0; + int pad_top = 0, pad_bot = 0, pad_left = 0, pad_right = 0; + + float output_data_cpu[sizeof(ref_output_data)/sizeof(float)] = {0.0}; + conv3d_float_ref_for_ncdhw( + input_data, weight_data, bias_data, output_data_cpu, + n, ic, id, ih, iw, + oc, od, oh, ow, + kd, kh, kw, + stride_d, stride_h, stride_w, + dilation_d, dilation_h, dilation_w, + pad_d0, pad_d1, + pad_top, pad_bot, pad_left, pad_right); + + printf(" %s: compare ref\n", __FUNCTION__); + const float precision = 0.0002; + for (size_t i = 0; i < sizeof(output_data_cpu)/sizeof(float); i++) + { + if (fabs(output_data_cpu[i] - ref_output_data[i]) > precision) { + printf(" [%d] Error ! val %f, expected %f\n", + (int)i, output_data_cpu[i], ref_output_data[i]); + ret = -1; + } + } + printf(" %s: compare ref %s\n", __FUNCTION__, ret ?
"fail" : "pass"); + + if (ret) + return ret; + + cvk_fmt_t fmt = CVK_FMT_BF16; + cvk_tl_t *tl_input_al = NULL; + cvk_tl_t *tl_output_al = NULL; + cvk_tl_t *tl_weight_al = NULL; + cvk_tl_t *tl_bias_al = NULL; + + // Allocate output + { + cvk_tl_shape_t shape = { + (uint32_t)n, (uint32_t)oc, (uint32_t)oh, (uint32_t)ow}; + tl_output_al = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, shape, fmt, + /*eu_align=*/1); + } + + // Allocate input + { + cvk_tl_shape_t shape = { + (uint32_t)n, (uint32_t)id*ic, (uint32_t)ih, (uint32_t)iw}; + tl_input_al = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, shape, fmt, + /*eu_align=*/1); + } + + // Allocate weight + { + cvk_tl_shape_t shape = { + 1, (uint32_t)oc, (uint32_t)(kh*kw), (uint32_t)kd*ic}; + tl_weight_al = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, shape, fmt, + /*eu_align=*/0); + } + + // Allocate bias + // bias + { + cvk_tl_shape_t shape = {2, (uint32_t)oc, 1, 1}; + tl_bias_al = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, shape, fmt, + /*eu_align=*/0); + } + + assert(tl_output_al && tl_input_al && tl_weight_al && tl_bias_al && + "Expect all allocated"); + + CVI_RT_MEM gm_input_dev_mem = NULL; + CVI_RT_MEM gm_weight_dev_mem = NULL; + CVI_RT_MEM gm_bias_dev_mem = NULL; + CVI_RT_MEM gm_output_dev_mem = NULL; + uint64_t ga_input = 0; + uint64_t ga_weight = 0; + uint64_t ga_bias = 0; + uint64_t ga_output = 0; + + // Allocate device memory of input + { + // shape (1, id=4, ic=2, ih=3, iw=3) + // reshape (1, id=4, ic=2, ih=3, iw=3) -> (1, 4, 2, 3x3) + int total_len = 1 * id * ic * ih * iw; + uint16_t input_bf16_data_tpu[total_len]; + convert_fp32_to_bf16_data(cvk_ctx, input_bf16_data_tpu, input_data_tpu, + total_len); + + gm_input_dev_mem = CVI_RT_MemAlloc(rt_handle, total_len * sizeof(uint16_t)); + + // Copy from system memory to device memory + CVI_RT_MemCopyS2D(rt_handle, gm_input_dev_mem, + (uint8_t *)input_bf16_data_tpu); + + ga_input = CVI_RT_MemGetPAddr(gm_input_dev_mem); + } + + // Allocate device memory of weight + { + int len = oc * kh * kw * kd * ic; + uint16_t weight_bf16_data_tpu[len]; + convert_fp32_to_bf16_data(cvk_ctx, weight_bf16_data_tpu, weight_data_tpu, + len); + + gm_weight_dev_mem = CVI_RT_MemAlloc(rt_handle, len * sizeof(uint16_t)); + + // Copy from system memory to device memory + CVI_RT_MemCopyS2D(rt_handle, gm_weight_dev_mem, + (uint8_t *)weight_bf16_data_tpu); + + ga_weight = CVI_RT_MemGetPAddr(gm_weight_dev_mem); + } + + // Allocate device memory of bias + { + int len = oc; + uint16_t bias_bf16_data_tpu[len]; + convert_fp32_to_bf16_data(cvk_ctx, bias_bf16_data_tpu, bias_data, len); + + gm_bias_dev_mem = CVI_RT_MemAlloc(rt_handle, len * sizeof(uint16_t)); + + // Copy from system memory to device memory + CVI_RT_MemCopyS2D(rt_handle, gm_bias_dev_mem, + (uint8_t *)bias_bf16_data_tpu); + + ga_bias = CVI_RT_MemGetPAddr(gm_bias_dev_mem); + } + + // Allocate device memory of output + { + int len = n * od * oc * oh * ow; + gm_output_dev_mem = CVI_RT_MemAlloc(rt_handle, len * sizeof(uint16_t)); + ga_output = CVI_RT_MemGetPAddr(gm_output_dev_mem); + } + + assert(gm_input_dev_mem && gm_output_dev_mem && gm_weight_dev_mem && + gm_bias_dev_mem && "Expect valid gm dev mem"); + assert(ga_input && ga_output && ga_weight && ga_bias && "Expect valid gaddr"); + + load_bias(cvk_ctx, ga_bias, tl_bias_al); + load_weight_for_ndchw(cvk_ctx, oc, ic, kd, kh, kw, ga_weight, tl_weight_al); + load_input_for_ndchw(cvk_ctx, + n, ic, id, ih, iw, + pad_d0, pad_d1, + ga_input, tl_input_al); + + for (int odi = 0; odi < od; odi++) { + compute_for_ndchw(cvk_ctx, + ic, id, ih, 
iw, + kd, kh, kw, + stride_d, stride_h, stride_w, + pad_top, pad_bot, pad_left, pad_right, + oc, oh, ow, + odi, + tl_input_al, + tl_weight_al, + tl_bias_al, + tl_output_al); + store_output_for_ndchw(cvk_ctx, + oc, od, oh, ow, + odi, + ga_output, + tl_output_al); + } + + CVI_RT_Submit(cvk_ctx); + + // copy from device memory to system memory + int output_len = n * od * oc * oh * ow; + + uint16_t ref_output_bf16_data[output_len]; + convert_fp32_to_bf16_data(cvk_ctx, ref_output_bf16_data, ref_output_data, + output_len); + + uint16_t output_bf16_data_tpu[output_len]; + CVI_RT_MemCopyD2S(rt_handle, (uint8_t *) output_bf16_data_tpu, + gm_output_dev_mem); + + printf(" %s: compare tpu\n", __FUNCTION__); + const float tpu_precision = 0.01; + for (int i = 0; i < output_len; i++) { + float tpu_data = cvk_convert_bf16_fp32(output_bf16_data_tpu[i]); + if (fabs(tpu_data - ref_output_data_ndchw[i]) > tpu_precision) { + printf(" [%d] Error ! val %f(0x%x), expected %f(0x%x)\n", + (int)i, tpu_data, output_bf16_data_tpu[i], + ref_output_data_ndchw[i], ref_output_bf16_data[i]); + ret = -1; + } + } + printf(" %s: compare tpu %s\n", __FUNCTION__, ret ? "fail" : "pass"); + + // Reverse order + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_bias_al); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_weight_al); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input_al); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output_al); + + CVI_RT_MemFree(rt_handle, gm_input_dev_mem); + CVI_RT_MemFree(rt_handle, gm_weight_dev_mem); + CVI_RT_MemFree(rt_handle, gm_bias_dev_mem); + CVI_RT_MemFree(rt_handle, gm_output_dev_mem); + +#ifdef DUMP_MSG + printf("<= %s\n", __FUNCTION__); + printf("===================================\n\n"); +#endif + + return ret; +} + +// # pytorch +// # +// # N IC ID IH IW +// # input (1, 2, 4, 3, 3) +// # +// # OC IC KD KH KW +// # kernel (4, 2, 2, 2, 2) +// # +// # N OC OD OH OW +// # output (1, 4, 3, 4, 4) +// # +// # IC OC KD KH KW +// m = nn.Conv3d(2, 4, [2, 2, 2], stride=(1, 1, 1), padding=(0, 1, 1)) +// input = torch.rand(1, 2, 4, 3, 3) +// output = m(input) +// +static int conv3d_test_ndchw_pad_hw(CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx) +{ +#ifdef DUMP_MSG + printf("===================================\n\n"); + printf("%s =>\n", __FUNCTION__); +#endif + + int ret = 0; + + // input (N=1, IC=2, ID=4, IH=3, IW=3) + int n = 1, ic = 2, id = 4, ih = 3, iw = 3; + int input_shapes[5] = {n, ic, id, ih, iw}; + float input_data[] = { + // IC=0 + 0.3519, 0.4651, 0.9173, // ic = 0, id = 0, ih = 0 + 0.5175, 0.1068, 0.8725, // ic = 0, id = 0, ih = 1 + 0.3365, 0.7358, 0.0661, // ic = 0, id = 0, ih = 2 + + 0.7301, 0.5292, 0.8247, + 0.3301, 0.4827, 0.9843, + 0.8852, 0.3967, 0.9667, + + 0.2098, 0.7486, 0.9042, + 0.6317, 0.7464, 0.0149, + 0.0671, 0.2331, 0.2570, + + 0.0870, 0.1124, 0.6606, + 0.3854, 0.9084, 0.2225, + 0.6355, 0.1670, 0.8032, + + // IC=1 + 0.8309, 0.8918, 0.3241, // ic = 1, id = 0, ih = 0 + 0.1311, 0.1492, 0.8990, // ic = 1, id = 0, ih = 1 + 0.4111, 0.4590, 0.4945, // ic = 1, id = 0, ih = 2 + + 0.4529, 0.5284, 0.3906, + 0.4197, 0.4409, 0.8277, + 0.4973, 0.2427, 0.5823, + + 0.3362, 0.3004, 0.5242, + 0.9657, 0.9594, 0.0147, + 0.6477, 0.6943, 0.2552, + + 0.8387, 0.4657, 0.5361, + 0.4703, 0.6478, 0.8114, + 0.6255, 0.6939, 0.2478 + }; + + // tpu input shape (n=1, id=4, ic=2, ih=3, iw=3) + float input_data_tpu[n * id * ic * ih * iw]; + convert_ncdhw_to_ndchw(input_data_tpu, input_data, input_shapes); + + // pytorch weight (Oc=4, Ic=2, kd=2, kh=2, kw=2) + int oc = 4, kd = 2, kh = 2, kw = 2; + int 
weight_shapes[5] = {oc, ic, kd, kh, kw}; + float weight_data[] = { + // OC=0 + 0.0405, -0.1043, // ic = 0, kd = 0, kh = 0 + 0.1978, 0.0733, // ic = 0, kd = 0, kh = 1 + + -0.1659, -0.1932, // ic = 0, kd = 1, kh = 0 + 0.0010, -0.2304, // ic = 0, kd = 1, kh = 1 + + -0.2116, -0.0438, // ic = 1, kd = 0, kh = 0 + -0.1418, -0.1537, // ic = 1, kd = 0, kh = 1 + + 0.0897, 0.0856, // ic = 1, kd = 1, kh = 0 + -0.0758, 0.1184, // ic = 1, kd = 1, kh = 1 + + // OC=1 + -0.1565, -0.0361, + 0.0632, 0.1512, + + -0.2068, 0.0470, + 0.1306, 0.1910, + + -0.2418, -0.0457, + 0.1569, 0.0661, + + 0.0540, 0.0379, + 0.2167, -0.0688, + + // OC=2 + -0.0279, -0.1743, + -0.2304, -0.0677, + + 0.0108, -0.0891, + -0.0099, -0.0792, + + 0.1598, 0.0055, + -0.2395, -0.0643, + + -0.0918, 0.0547, + 0.0517, -0.0688, + + // OC=3 + 0.0506, 0.1193, + -0.0683, -0.1807, + + -0.1150, -0.0912, + -0.0225, -0.1393, + + 0.0520, 0.1461, + 0.1875, -0.2178, + + 0.0830, 0.1741, + -0.0341, 0.0611 + }; + + // tpu weight shape (1, oc=4, kh(2)*kw(2), kd(2)*ic(2)) + float weight_data_tpu[oc * kh * kw * kd * ic]; + convert_tpu_weight_for_ndchw(weight_data_tpu, weight_data, weight_shapes); + + // bias (4) + float bias_data[] = { + 0.1053, -0.0710, 0.0016, -0.1625 + }; + + // output (N=1, Oc=4, Od=3, Oh=4, Ow=4) + int od = 3, oh = 4, ow = 4; + int output_shapes[5] = {n, oc, od, oh, ow}; + float ref_output_data[] = { + // OC=0 + -1.1119e-01, -1.3884e-01, -9.5057e-02, 2.1200e-01, + -7.8709e-02, -3.0321e-01, -5.7672e-01, -4.4566e-02, + -1.6587e-01, -9.9638e-02, -3.7471e-01, -2.3886e-01, + -7.6292e-02, -2.2308e-01, -1.7159e-01, -1.0482e-01, + + 8.0675e-02, -1.9141e-02, -3.2850e-02, 1.7421e-01, + -7.3950e-02, -3.2045e-01, -4.1114e-01, 2.9242e-02, + 6.2759e-02, -4.4301e-02, -2.0282e-01, 5.8408e-02, + 3.3655e-02, 4.5276e-02, -6.0553e-02, 1.4932e-03, + + 1.4830e-01, 7.3535e-02, 7.2533e-02, 1.6984e-01, + -1.1557e-02, -2.4226e-01, -9.6795e-02, -9.0924e-02, + -2.0409e-01, -5.0648e-01, -4.1673e-01, 1.3535e-01, + 6.8198e-04, -1.0596e-01, -1.6961e-01, -4.9334e-02, + + // OC=1 + 1.4539e-01, 4.6912e-01, 5.7270e-01, 2.3017e-01, + 5.0830e-02, -1.9102e-01, 7.6440e-02, 6.1754e-02, + 1.4862e-01, 3.0930e-01, 2.1541e-01, -2.4954e-01, + -4.1520e-02, -3.9902e-01, -3.2361e-01, -3.6940e-01, + + 8.6251e-02, 3.8368e-01, 4.9539e-01, 2.7409e-01, + 3.6369e-02, 2.4048e-01, 2.0503e-01, -2.5604e-01, + 9.9149e-02, 8.7076e-02, 3.2400e-02, -1.8619e-01, + -9.8027e-02, -2.9687e-01, -2.4229e-01, -4.0250e-01, + + -5.8173e-02, 3.1059e-01, 3.9969e-01, 2.7085e-01, + 1.4254e-01, 4.7329e-01, 1.8244e-01, -2.3881e-01, + 2.9225e-02, -7.1563e-02, -4.4758e-02, 1.3586e-01, + -4.9485e-02, -3.4177e-01, -2.4624e-01, -3.2567e-01, + + // OC=2 + -1.6466e-01, -4.2935e-01, -4.7211e-01, -2.7528e-01, + -1.9397e-01, -2.2121e-01, -4.1602e-01, -3.8243e-01, + -2.4789e-01, -3.5001e-01, -6.2618e-01, -5.7601e-02, + -1.0650e-01, -1.2604e-01, -2.6756e-02, 3.5703e-02, + + -1.1669e-01, -4.0951e-01, -4.2731e-01, -2.6376e-01, + -2.8931e-01, -4.3372e-01, -4.3838e-01, -4.2177e-01, + -1.9895e-01, -5.1715e-01, -4.4574e-01, -2.4636e-01, + -1.2053e-01, -5.3043e-02, -2.0617e-01, 4.6975e-02, + + -9.8851e-02, -1.9570e-01, -4.0398e-01, -3.1105e-01, + -1.6280e-01, -7.2503e-01, -6.4973e-01, 5.0791e-02, + -2.5145e-01, -3.3708e-01, -1.9385e-01, -1.8400e-01, + -2.8993e-02, 3.8838e-02, -5.7331e-02, 2.1099e-02, + + // OC=3 + -4.8111e-01, -3.8231e-01, -3.8434e-01, -1.9630e-01, + -1.2935e-01, -4.2991e-02, -4.0529e-01, -1.0304e-01, + -2.8196e-01, -3.2151e-01, -7.8678e-02, -6.9456e-02, + -5.6521e-02, -2.3751e-02, -3.3585e-02, -1.9628e-01, + + 
-4.0179e-01, -4.4029e-01, -4.5469e-01, -1.8383e-01, + -1.4998e-01, -1.9349e-01, -3.6435e-01, -7.3818e-02, + -1.8945e-01, -9.8586e-04, -2.1411e-01, -4.1524e-02, + 1.2230e-01, 1.3655e-01, 1.2234e-01, -9.1697e-02, + + -2.3454e-01, -3.3230e-01, -5.1257e-01, -1.5916e-01, + -2.9983e-01, -1.8840e-01, 2.3329e-01, -1.5189e-01, + -1.0291e-01, 8.0501e-02, -1.1257e-01, -1.1544e-01, + -9.0000e-03, 8.8157e-02, -3.8429e-02, -2.0804e-01, + }; + + // tpu output (n, od, oc, oh, ow) + float ref_output_data_ndchw[n * od * oc * oh * ow]; + convert_ncdhw_to_ndchw(ref_output_data_ndchw, ref_output_data, output_shapes); + + // dilation = (depth=1, height=1, width=1) + int dilation_d = 1, dilation_h = 1, dilation_w = 1; + + // stride = (depth=1, height=1, width=1) + int stride_d = 1, stride_h = 1, stride_w = 1; + + // padding = (1, 1, 1) + int pad_d0 = 0, pad_d1 = 0; + int pad_top = 1, pad_bot = 1, pad_left = 1, pad_right = 1; + + float output_data_cpu[sizeof(ref_output_data)/sizeof(float)] = {0.0}; + conv3d_float_ref_for_ncdhw( + input_data, weight_data, bias_data, output_data_cpu, + n, ic, id, ih, iw, + oc, od, oh, ow, + kd, kh, kw, + stride_d, stride_h, stride_w, + dilation_d, dilation_h, dilation_w, + pad_d0, pad_d1, + pad_top, pad_bot, pad_left, pad_right); + + printf(" %s: compare ref\n", __FUNCTION__); + const float precision = 0.0002; + for (size_t i = 0; i < sizeof(output_data_cpu)/sizeof(float); i++) + { + if (fabs(output_data_cpu[i] - ref_output_data[i]) > precision) { + printf(" [%d] Error ! val %f, expected %f\n", + (int)i, output_data_cpu[i], ref_output_data[i]); + ret = -1; + } + } + printf(" %s: compare ref %s\n", __FUNCTION__, ret ? "fail" : "pass"); + if (ret) + return ret; + + cvk_fmt_t fmt = CVK_FMT_BF16; + cvk_tl_t *tl_input_al = NULL; + cvk_tl_t *tl_output_al = NULL; + cvk_tl_t *tl_weight_al = NULL; + cvk_tl_t *tl_bias_al = NULL; + + // Allocate output + { + cvk_tl_shape_t shape = { + (uint32_t)n, (uint32_t)oc, (uint32_t)oh, (uint32_t)ow}; + tl_output_al = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, shape, fmt, + /*eu_align=*/1); + } + + // Allocate input + { + cvk_tl_shape_t shape = { + (uint32_t)n, (uint32_t)id*ic, (uint32_t)ih, (uint32_t)iw}; + tl_input_al = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, shape, fmt, + /*eu_align=*/1); + } + + // Allocate weight + { + cvk_tl_shape_t shape = { + 1, (uint32_t)oc, (uint32_t)(kh*kw), (uint32_t)kd*ic}; + tl_weight_al = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, shape, fmt, + /*eu_align=*/0); + } + + // Allocate bias + // bias + { + cvk_tl_shape_t shape = {2, (uint32_t)oc, 1, 1}; + tl_bias_al = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, shape, fmt, + /*eu_align=*/0); + } + + assert(tl_output_al && tl_input_al && tl_weight_al && tl_bias_al && + "Expect all allocated"); + + CVI_RT_MEM gm_input_dev_mem = NULL; + CVI_RT_MEM gm_weight_dev_mem = NULL; + CVI_RT_MEM gm_bias_dev_mem = NULL; + CVI_RT_MEM gm_output_dev_mem = NULL; + uint64_t ga_input = 0; + uint64_t ga_weight = 0; + uint64_t ga_bias = 0; + uint64_t ga_output = 0; + + // Allocate device memory of input + { + // shape (1, id=4, ic=2, ih=3, iw=3) + // reshape (1, id=4, ic=2, ih=3, iw=3) -> (1, 4, 2, 3x3) + int total_len = 1 * id * ic * ih * iw; + uint16_t input_bf16_data_tpu[total_len]; + convert_fp32_to_bf16_data(cvk_ctx, input_bf16_data_tpu, input_data_tpu, + total_len); + + gm_input_dev_mem = CVI_RT_MemAlloc(rt_handle, total_len * sizeof(uint16_t)); + + // Copy from system memory to device memory + CVI_RT_MemCopyS2D(rt_handle, gm_input_dev_mem, + (uint8_t *)input_bf16_data_tpu); + + ga_input = 
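+ /* TDMA descriptors address device memory by physical address; keep the CVI_RT_MEM handle for freeing and use the physical address for tensor start addresses */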
CVI_RT_MemGetPAddr(gm_input_dev_mem); + } + + // Allocate device memory of weight + { + int len = oc * kh * kw * kd * ic; + uint16_t weight_bf16_data_tpu[len]; + convert_fp32_to_bf16_data(cvk_ctx, weight_bf16_data_tpu, weight_data_tpu, + len); + + gm_weight_dev_mem = CVI_RT_MemAlloc(rt_handle, len * sizeof(uint16_t)); + + // Copy from system memory to device memory + CVI_RT_MemCopyS2D(rt_handle, gm_weight_dev_mem, + (uint8_t *)weight_bf16_data_tpu); + + ga_weight = CVI_RT_MemGetPAddr(gm_weight_dev_mem); + } + + // Allocate device memory of bias + { + int len = oc; + uint16_t bias_bf16_data_tpu[len]; + convert_fp32_to_bf16_data(cvk_ctx, bias_bf16_data_tpu, bias_data, len); + + gm_bias_dev_mem = CVI_RT_MemAlloc(rt_handle, len * sizeof(uint16_t)); + + // Copy from system memory to device memory + CVI_RT_MemCopyS2D(rt_handle, gm_bias_dev_mem, + (uint8_t *)bias_bf16_data_tpu); + + ga_bias = CVI_RT_MemGetPAddr(gm_bias_dev_mem); + } + + // Allocate device memory of output + { + int len = n * od * oc * oh * ow; + gm_output_dev_mem = CVI_RT_MemAlloc(rt_handle, len * sizeof(uint16_t)); + ga_output = CVI_RT_MemGetPAddr(gm_output_dev_mem); + } + + assert(gm_input_dev_mem && gm_output_dev_mem && gm_weight_dev_mem && + gm_bias_dev_mem && "Expect valid gm dev mem"); + assert(ga_input && ga_output && ga_weight && ga_bias && "Expect valid gaddr"); + + load_bias(cvk_ctx, ga_bias, tl_bias_al); + load_weight_for_ndchw(cvk_ctx, oc, ic, kd, kh, kw, ga_weight, tl_weight_al); + load_input_for_ndchw(cvk_ctx, + n, ic, id, ih, iw, + pad_d0, pad_d1, + ga_input, tl_input_al); + + for (int odi = 0; odi < od; odi++) { + compute_for_ndchw(cvk_ctx, + ic, id, ih, iw, + kd, kh, kw, + stride_d, stride_h, stride_w, + pad_top, pad_bot, pad_left, pad_right, + oc, oh, ow, + odi, + tl_input_al, + tl_weight_al, + tl_bias_al, + tl_output_al); + store_output_for_ndchw(cvk_ctx, + oc, od, oh, ow, + odi, + ga_output, + tl_output_al); + } + + CVI_RT_Submit(cvk_ctx); + + // copy from device memory to system memory + int output_len = n * od * oc * oh * ow; + + uint16_t ref_output_bf16_data[output_len]; + convert_fp32_to_bf16_data(cvk_ctx, ref_output_bf16_data, ref_output_data, + output_len); + + uint16_t output_bf16_data_tpu[output_len]; + CVI_RT_MemCopyD2S(rt_handle, (uint8_t *) output_bf16_data_tpu, + gm_output_dev_mem); + + printf(" %s: compare tpu\n", __FUNCTION__); + const float tpu_precision = 0.01; + for (int i = 0; i < output_len; i++) { + float tpu_data = cvk_convert_bf16_fp32(output_bf16_data_tpu[i]); + if (fabs(tpu_data - ref_output_data_ndchw[i]) > tpu_precision) { + printf(" [%d] Error ! val %f(0x%x), expected %f(0x%x)\n", + (int)i, tpu_data, output_bf16_data_tpu[i], + ref_output_data_ndchw[i], ref_output_bf16_data[i]); + ret = -1; + } + } + printf(" %s: compare tpu %s\n", __FUNCTION__, ret ? 
"fail" : "pass"); + + // Reverse order + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_bias_al); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_weight_al); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input_al); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output_al); + + CVI_RT_MemFree(rt_handle, gm_input_dev_mem); + CVI_RT_MemFree(rt_handle, gm_weight_dev_mem); + CVI_RT_MemFree(rt_handle, gm_bias_dev_mem); + CVI_RT_MemFree(rt_handle, gm_output_dev_mem); + +#ifdef DUMP_MSG + printf("<= %s\n", __FUNCTION__); + printf("===================================\n\n"); +#endif + + return ret; +} + +// # pytorch +// # +// # N IC ID IH IW +// # input (1, 2, 4, 3, 3) +// # +// # OC IC KD KH KW +// # kernel (4, 2, 2, 2, 2) +// # +// # N OC OD OH OW +// # output (1, 4, 5, 4, 4) +// # +// # IC OC KD KH KW +// m = nn.Conv3d(2, 4, [2, 2, 2], stride=(1, 1, 1), padding=(1, 1, 1)) +// input = torch.rand(1, 2, 4, 3, 3) +// output = m(input) +// +static int conv3d_test_ndchw_pad_dhw(CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx) +{ +#ifdef DUMP_MSG + printf("===================================\n\n"); + printf("%s =>\n", __FUNCTION__); +#endif + + int ret = 0; + + // input (N=1, IC=2, ID=4, IH=3, IW=3) + int n = 1, ic = 2, id = 4, ih = 3, iw = 3; + int input_shapes[5] = {n, ic, id, ih, iw}; + float input_data[] = { + // IC=0 + 0.3307, 0.6577, 0.3520, // ic = 0, id = 0, ih = 0 + 0.5691, 0.1531, 0.6240, // ic = 0, id = 0, ih = 1 + 0.4324, 0.9731, 0.4587, // ic = 0, id = 0, ih = 2 + + 0.6121, 0.5937, 0.8512, + 0.7932, 0.3473, 0.4032, + 0.0156, 0.6799, 0.8587, + + 0.9278, 0.1046, 0.2478, + 0.4399, 0.2543, 0.8906, + 0.0275, 0.0450, 0.1212, + + 0.5655, 0.6741, 0.3396, + 0.6126, 0.6385, 0.5160, + 0.9062, 0.5286, 0.7064, + + // IC=1 + 0.0512, 0.9951, 0.8289, // ic = 1, id = 0, ih = 0 + 0.9011, 0.0602, 0.5583, // ic = 1, id = 0, ih = 1 + 0.5176, 0.9857, 0.8772, // ic = 1, id = 0, ih = 2 + + 0.8971, 0.5207, 0.1500, + 0.8408, 0.2034, 0.7618, + 0.7618, 0.0702, 0.9254, + + 0.2110, 0.1366, 0.5222, + 0.0626, 0.9902, 0.2842, + 0.0101, 0.6390, 0.0038, + + 0.7045, 0.3892, 0.7232, + 0.7224, 0.8458, 0.6474, + 0.0602, 0.9074, 0.4171 + }; + + // tpu input shape (n=1, id=4, ic=2, ih=3, iw=3) + float input_data_tpu[n * id * ic * ih * iw]; + convert_ncdhw_to_ndchw(input_data_tpu, input_data, input_shapes); + + // pytorch weight (Oc=4, Ic=2, kd=2, kh=2, kw=2) + int oc = 4, kd = 2, kh = 2, kw = 2; + int weight_shapes[5] = {oc, ic, kd, kh, kw}; + float weight_data[] = { + // OC=0 + -0.2046, -0.2492, // ic = 0, kd = 0, kh = 0 + -0.0783, 0.1082, // ic = 0, kd = 0, kh = 1 + + 0.1393, -0.1803, // ic = 0, kd = 1, kh = 0 + -0.0110, -0.1141, // ic = 0, kd = 1, kh = 1 + + 0.0606, 0.1902, // ic = 1, kd = 0, kh = 0 + 0.1254, 0.1572, // ic = 1, kd = 0, kh = 1 + + 0.0887, -0.0336, // ic = 1, kd = 1, kh = 0 + 0.0918, -0.1099, // ic = 1, kd = 1, kh = 1 + + // OC=1 + -0.0181, -0.2228, + -0.0575, -0.2464, + + -0.0757, -0.0122, + -0.1896, 0.1301, + + -0.0215, 0.0568, + -0.1381, -0.1621, + + -0.1247, -0.0738, + -0.0146, 0.0719, + + // OC=2 + 0.0960, -0.1865, + -0.2124, -0.0125, + + 0.0159, 0.1148, + 0.1430, 0.1978, + + 0.0292, -0.2130, + 0.2055, 0.1678, + + 0.2236, -0.0215, + -0.2171, 0.1709, + + // OC=3 + 0.2186, 0.1488, + 0.1558, 0.0359, + + 0.1822, -0.0433, + 0.0960, 0.1791, + + -0.0060, 0.0006, + 0.0400, 0.1488, + + 0.1811, -0.1055, + 0.1138, -0.0898 + }; + + // tpu weight shape (1, oc=4, kh(2)*kw(2), kd(2)*ic(2)) + float weight_data_tpu[oc * kh * kw * kd * ic]; + convert_tpu_weight_for_ndchw(weight_data_tpu, weight_data, weight_shapes); + + // bias (4) 
+ float bias_data[] = { + -0.2107, -0.1894, -0.0108, 0.1728 + }; + + // output (N=1, Oc=4, Od=5, Oh=4, Ow=4) + int od = 5, oh = 4, ow = 4; + int output_shapes[5] = {n, oc, od, oh, ow}; + float ref_output_data[] = { + // OC=0 + -2.5403e-01, -3.9400e-01, -2.5784e-01, -1.3846e-01, + -4.3596e-01, -2.5972e-01, -2.5080e-01, -4.3702e-02, + -4.4977e-01, -2.5769e-01, -3.8422e-01, 1.2387e-03, + -3.0602e-01, -3.1306e-01, -9.9820e-02, -6.8943e-02, + + -3.3526e-01, -5.1864e-02, -4.1363e-02, -1.2992e-01, + -4.0344e-01, -1.0866e-01, -2.0857e-01, -1.3983e-02, + -3.0966e-01, 9.2221e-02, -2.8528e-01, -3.1210e-02, + -2.4841e-01, -3.7795e-01, -3.8244e-01, -4.9618e-02, + + -1.3243e-01, -1.7816e-02, -1.5046e-01, -2.1334e-01, + -2.0596e-01, -2.3001e-01, -4.0274e-01, -2.1468e-01, + -2.1257e-01, -2.7799e-01, -3.3916e-02, -4.9950e-02, + -7.4998e-02, -3.4861e-01, -3.4250e-01, -3.1303e-01, + + -2.1902e-01, -2.8536e-01, -1.8272e-01, -1.0197e-01, + -6.1921e-01, -3.3074e-01, 4.2541e-02, -9.8628e-02, + -5.4856e-01, -2.2603e-01, -2.8005e-01, -2.2485e-01, + -3.8100e-01, -9.9595e-02, -1.9782e-01, -9.9829e-02, + + -3.8680e-02, -3.2488e-02, -6.4191e-02, -1.4660e-01, + -3.7711e-02, -1.3294e-01, -5.8401e-02, -1.9556e-01, + -1.1838e-01, -1.5398e-01, -8.1088e-02, -2.8004e-01, + -4.2503e-01, -3.5157e-01, -3.6049e-01, -3.2990e-01, + + // OC=1 + -1.4269e-01, -9.5723e-02, -2.2320e-01, -2.6823e-01, + -5.8374e-02, -3.9911e-01, -3.3735e-01, -4.4587e-01, + -1.6939e-01, -2.4322e-01, -3.3348e-01, -4.0600e-01, + -2.3289e-01, -3.7133e-01, -4.5634e-01, -3.3350e-01, + + -1.3504e-01, -5.5332e-01, -5.8443e-01, -4.8773e-01, + -4.5654e-01, -7.9793e-01, -6.0842e-01, -4.9727e-01, + -4.7046e-01, -8.5047e-01, -8.1262e-01, -6.6208e-01, + -3.1279e-01, -4.7892e-01, -4.1965e-01, -3.9698e-01, + + -3.4979e-01, -7.3478e-01, -4.8156e-01, -3.1361e-01, + -5.7180e-01, -6.9073e-01, -6.5631e-01, -5.9334e-01, + -4.5140e-01, -6.4365e-01, -8.3343e-01, -5.1614e-01, + -1.5070e-01, -4.0468e-01, -4.2686e-01, -2.3451e-01, + + -3.2802e-01, -3.2160e-01, -3.9730e-01, -3.5070e-01, + -4.2998e-01, -6.3385e-01, -8.1355e-01, -5.1874e-01, + -2.3089e-01, -5.6220e-01, -7.1846e-01, -4.7895e-01, + -2.1047e-01, -3.1337e-01, -4.2336e-01, -2.9714e-01, + + -4.4296e-01, -5.4842e-01, -4.8282e-01, -3.0881e-01, + -5.4348e-01, -7.7235e-01, -6.3022e-01, -3.3020e-01, + -5.1796e-01, -6.4802e-01, -6.9482e-01, -3.1089e-01, + -3.8791e-01, -2.7335e-01, -3.5223e-01, -2.1117e-01, + + // OC=2 + 6.3343e-02, 3.2551e-01, 7.8462e-02, -1.4047e-01, + 2.9263e-01, -1.3682e-02, 4.7238e-01, 1.4810e-01, + 2.0917e-01, 5.2640e-01, 2.3049e-01, -9.6724e-04, + 2.7707e-02, 2.0233e-01, 2.5884e-01, 1.9259e-01, + + 2.6804e-01, 1.8736e-01, 3.5448e-01, 1.7387e-01, + 4.1227e-01, 6.1802e-02, 3.4067e-01, -3.1375e-02, + -2.1211e-02, 4.1589e-01, 3.9848e-01, 2.4676e-01, + -2.1633e-01, -9.8574e-02, -5.5862e-02, 2.7933e-01, + + 3.5165e-01, 2.5434e-01, 1.0813e-01, -2.3880e-01, + 1.4803e-02, 2.2636e-01, 5.6942e-02, 3.3249e-01, + -1.5394e-01, 2.8699e-01, 1.9381e-02, 1.5203e-01, + -1.7307e-01, -1.3476e-01, -1.4338e-01, 1.0136e-01, + + 2.4526e-01, -1.5181e-02, 2.8220e-01, -6.4634e-02, + 7.0619e-02, 5.5526e-01, 2.7332e-01, -2.2993e-03, + 1.3952e-01, 4.8027e-01, 2.7088e-01, 2.2137e-01, + 8.4666e-02, -8.3372e-02, 2.7218e-01, 1.0539e-01, + + 1.0030e-01, 7.0672e-02, 4.3040e-02, 6.5646e-02, + -1.5283e-01, 7.5928e-03, -1.1840e-02, 6.6282e-02, + -2.8021e-01, -2.6473e-01, -2.3691e-02, -6.7624e-03, + -1.9269e-01, -2.1400e-01, -1.5423e-01, 6.9144e-02, + + // OC=3 + 2.2745e-01, 2.3889e-01, 3.3790e-01, 3.0098e-01, + 1.7417e-01, 2.8818e-01, 
4.5343e-01, 5.1056e-01, + 8.4155e-02, 6.1303e-01, 3.3481e-01, 5.3153e-01, + 9.9508e-02, 1.9928e-01, 4.1631e-01, 4.1526e-01, + + 2.2143e-01, 6.1859e-01, 7.0634e-01, 3.5959e-01, + 3.2209e-01, 8.9161e-01, 7.0540e-01, 6.7204e-01, + 1.6198e-01, 1.0484e+00, 7.8346e-01, 8.1161e-01, + 1.5643e-01, 5.1360e-01, 4.5026e-01, 5.9188e-01, + + 4.7556e-01, 5.2241e-01, 3.6209e-01, 3.9465e-01, + 4.2880e-01, 7.8418e-01, 8.6553e-01, 7.0884e-01, + 3.8365e-01, 3.9124e-01, 8.4084e-01, 6.5296e-01, + 1.7330e-01, 2.1039e-01, 5.6763e-01, 3.7774e-01, + + 2.7558e-01, 5.7020e-01, 3.8613e-01, 3.4723e-01, + 2.8224e-01, 9.5750e-01, 6.7976e-01, 6.9007e-01, + 2.9504e-01, 6.4116e-01, 8.1472e-01, 7.1143e-01, + 1.3136e-01, 2.4329e-01, 3.8296e-01, 4.0355e-01, + + 2.9795e-01, 3.7119e-01, 4.1321e-01, 2.5462e-01, + 3.8687e-01, 6.6586e-01, 6.1692e-01, 3.4898e-01, + 3.0586e-01, 6.9549e-01, 5.9051e-01, 4.0845e-01, + 3.0771e-01, 4.4973e-01, 3.8828e-01, 3.2473e-01, + }; + + // tpu output (n, od, oc, oh, ow) + float ref_output_data_ndchw[n * od * oc * oh * ow]; + convert_ncdhw_to_ndchw(ref_output_data_ndchw, ref_output_data, output_shapes); + + // dilation = (depth=1, height=1, width=1) + int dilation_d = 1, dilation_h = 1, dilation_w = 1; + + // stride = (depth=1, height=1, width=1) + int stride_d = 1, stride_h = 1, stride_w = 1; + + // padding = (1, 1, 1) + int pad_d0 = 1, pad_d1 = 1; + int pad_top = 1, pad_bot = 1, pad_left = 1, pad_right = 1; + + float output_data_cpu[sizeof(ref_output_data)/sizeof(float)] = {0.0}; + conv3d_float_ref_for_ncdhw( + input_data, weight_data, bias_data, output_data_cpu, + n, ic, id, ih, iw, + oc, od, oh, ow, + kd, kh, kw, + stride_d, stride_h, stride_w, + dilation_d, dilation_h, dilation_w, + pad_d0, pad_d1, + pad_top, pad_bot, pad_left, pad_right); + + printf(" %s: compare ref\n", __FUNCTION__); + const float precision = 0.0002; + for (size_t i = 0; i < sizeof(output_data_cpu)/sizeof(float); i++) + { + if (fabs(output_data_cpu[i] - ref_output_data[i]) > precision) { + printf(" [%d] Error ! val %f, expected %f\n", + (int)i, output_data_cpu[i], ref_output_data[i]); + ret = -1; + } + } + printf(" %s: compare ref %s\n", __FUNCTION__, ret ? 
"fail" : "pass"); + + if (ret) + return ret; + + cvk_fmt_t fmt = CVK_FMT_BF16; + cvk_tl_t *tl_input_al = NULL; + cvk_tl_t *tl_output_al = NULL; + cvk_tl_t *tl_weight_al = NULL; + cvk_tl_t *tl_bias_al = NULL; + + // Allocate output + { + cvk_tl_shape_t shape = { + (uint32_t)n, (uint32_t)oc, (uint32_t)oh, (uint32_t)ow}; + tl_output_al = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, shape, fmt, + /*eu_align=*/1); + } + + // Allocate input + { + cvk_tl_shape_t shape = { + (uint32_t)n, (uint32_t)(id + pad_d0 + pad_d1) * ic, (uint32_t)ih, + (uint32_t)iw}; + tl_input_al = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, shape, fmt, + /*eu_align=*/1); + } + + // Allocate weight + { + cvk_tl_shape_t shape = { + 1, (uint32_t)oc, (uint32_t)(kh*kw), (uint32_t)kd*ic}; + tl_weight_al = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, shape, fmt, + /*eu_align=*/0); + } + + // Allocate bias + // bias + { + cvk_tl_shape_t shape = {2, (uint32_t)oc, 1, 1}; + tl_bias_al = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, shape, fmt, + /*eu_align=*/0); + } + + assert(tl_output_al && tl_input_al && tl_weight_al && tl_bias_al && + "Expect all allocated"); + + CVI_RT_MEM gm_input_dev_mem = NULL; + CVI_RT_MEM gm_weight_dev_mem = NULL; + CVI_RT_MEM gm_bias_dev_mem = NULL; + CVI_RT_MEM gm_output_dev_mem = NULL; + uint64_t ga_input = 0; + uint64_t ga_weight = 0; + uint64_t ga_bias = 0; + uint64_t ga_output = 0; + + // Allocate device memory of input + { + // shape (1, id=4, ic=2, ih=3, iw=3) + // reshape (1, id=4, ic=2, ih=3, iw=3) -> (1, 4, 2, 3x3) + int total_len = 1 * id * ic * ih * iw; + uint16_t input_bf16_data_tpu[total_len]; + convert_fp32_to_bf16_data(cvk_ctx, input_bf16_data_tpu, input_data_tpu, + total_len); + + gm_input_dev_mem = CVI_RT_MemAlloc(rt_handle, total_len * sizeof(uint16_t)); + + // Copy from system memory to device memory + CVI_RT_MemCopyS2D(rt_handle, gm_input_dev_mem, + (uint8_t *)input_bf16_data_tpu); + + ga_input = CVI_RT_MemGetPAddr(gm_input_dev_mem); + } + + // Allocate device memory of weight + { + int len = oc * kh * kw * kd * ic; + uint16_t weight_bf16_data_tpu[len]; + convert_fp32_to_bf16_data(cvk_ctx, weight_bf16_data_tpu, weight_data_tpu, + len); + + gm_weight_dev_mem = CVI_RT_MemAlloc(rt_handle, len * sizeof(uint16_t)); + + // Copy from system memory to device memory + CVI_RT_MemCopyS2D(rt_handle, gm_weight_dev_mem, + (uint8_t *)weight_bf16_data_tpu); + + ga_weight = CVI_RT_MemGetPAddr(gm_weight_dev_mem); + } + + // Allocate device memory of bias + { + int len = oc; + uint16_t bias_bf16_data_tpu[len]; + convert_fp32_to_bf16_data(cvk_ctx, bias_bf16_data_tpu, bias_data, len); + + gm_bias_dev_mem = CVI_RT_MemAlloc(rt_handle, len * sizeof(uint16_t)); + + // Copy from system memory to device memory + CVI_RT_MemCopyS2D(rt_handle, gm_bias_dev_mem, + (uint8_t *)bias_bf16_data_tpu); + + ga_bias = CVI_RT_MemGetPAddr(gm_bias_dev_mem); + } + + // Allocate device memory of output + { + int len = n * od * oc * oh * ow; + gm_output_dev_mem = CVI_RT_MemAlloc(rt_handle, len * sizeof(uint16_t)); + ga_output = CVI_RT_MemGetPAddr(gm_output_dev_mem); + } + + assert(gm_input_dev_mem && gm_output_dev_mem && gm_weight_dev_mem && + gm_bias_dev_mem && "Expect valid gm dev mem"); + assert(ga_input && ga_output && ga_weight && ga_bias && "Expect valid gaddr"); + + load_bias(cvk_ctx, ga_bias, tl_bias_al); + load_weight_for_ndchw(cvk_ctx, oc, ic, kd, kh, kw, ga_weight, tl_weight_al); + load_input_for_ndchw(cvk_ctx, + n, ic, id, ih, iw, + pad_d0, pad_d1, + ga_input, tl_input_al); + + for (int odi = 0; odi < od; odi++) { + 
compute_for_ndchw(cvk_ctx, + ic, id, ih, iw, + kd, kh, kw, + stride_d, stride_h, stride_w, + pad_top, pad_bot, pad_left, pad_right, + oc, oh, ow, + odi, + tl_input_al, + tl_weight_al, + tl_bias_al, + tl_output_al); + store_output_for_ndchw(cvk_ctx, + oc, od, oh, ow, + odi, + ga_output, + tl_output_al); + } + + CVI_RT_Submit(cvk_ctx); + + // copy from device memory to system memory + int output_len = n * od * oc * oh * ow; + + uint16_t ref_output_bf16_data[output_len]; + convert_fp32_to_bf16_data(cvk_ctx, ref_output_bf16_data, ref_output_data, + output_len); + + uint16_t output_bf16_data_tpu[output_len]; + CVI_RT_MemCopyD2S(rt_handle, (uint8_t *) output_bf16_data_tpu, + gm_output_dev_mem); + + printf(" %s: compare tpu\n", __FUNCTION__); + const float tpu_precision = 0.01; + for (int i = 0; i < output_len; i++) { + float tpu_data = cvk_convert_bf16_fp32(output_bf16_data_tpu[i]); + if (fabs(tpu_data - ref_output_data_ndchw[i]) > tpu_precision) { + printf(" [%d] Error ! val %f(0x%x), expected %f(0x%x)\n", + (int)i, tpu_data, output_bf16_data_tpu[i], + ref_output_data_ndchw[i], ref_output_bf16_data[i]); + ret = -1; + } + } + printf(" %s: compare tpu %s\n", __FUNCTION__, ret ? "fail" : "pass"); + + // Reverse order + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_bias_al); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_weight_al); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input_al); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output_al); + + CVI_RT_MemFree(rt_handle, gm_input_dev_mem); + CVI_RT_MemFree(rt_handle, gm_weight_dev_mem); + CVI_RT_MemFree(rt_handle, gm_bias_dev_mem); + CVI_RT_MemFree(rt_handle, gm_output_dev_mem); + +#ifdef DUMP_MSG + printf("<= %s\n", __FUNCTION__); + printf("===================================\n\n"); +#endif + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_HANDLE rt_handle; + cvk_context_t *cvk_ctx = NULL; + + CVI_RT_Init(&rt_handle); + cvk_ctx = (cvk_context_t *)CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + + ret |= conv3d_test_ncdhw_pad_dhw(rt_handle, cvk_ctx); + if (ret) + goto end; + + ret |= conv3d_test_ndchw_pad_dhw(rt_handle, cvk_ctx); + if (ret) + goto end; + + ret |= conv3d_test_ndchw_pad_hw(rt_handle, cvk_ctx); + if (ret) + goto end; + + ret |= conv3d_test_ndchw(rt_handle, cvk_ctx); + if (ret) + goto end; + + ret |= conv3d_test(rt_handle, cvk_ctx); + +end: + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + if (!ret) + printf("%s pass\n", __FILENAME__); + else + printf("%s fail\n", __FILENAME__); + + return ret; +} diff --git a/cviruntime/test/1880v2/bf16/test_cv1880v2_pool3d_bf16.cpp b/cviruntime/test/1880v2/bf16/test_cv1880v2_pool3d_bf16.cpp new file mode 100644 index 000000000..f24e46e1c --- /dev/null +++ b/cviruntime/test/1880v2/bf16/test_cv1880v2_pool3d_bf16.cpp @@ -0,0 +1,697 @@ +#include <assert.h> +#include <inttypes.h> +#include <limits> +#include <math.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <cmath> +#include "test_cvikernel_util.h" +#include "test_tf_quant_util.h" +#include "test_native_ref.h" + +// #define DUMP_MSG + +#define TEST_CASE_NAME "test_cv1880v2_pool3d_bf16" + +// input (n, c, id, ih, iw) +// weight (kd, kh, kw), stride (sd, sh, sw) +// output (n, c, od, oh, ow) +void pool3d_float_ref(float *input, float *output, + int input_n, int input_c, int input_d, int input_h, int input_w, + int output_d, int output_h, int output_w, + int kernel_d, int kernel_h, int kernel_w, + int stride_d, int stride_h, int stride_w, + int pad_d0, int pad_d1, + int pad_top, int
pad_bot, int pad_left, int pad_right) +{ + (void)pad_d1; + (void)pad_bot; + (void)pad_right; + +#ifdef DUMP_MSG + printf(" %s =>\n", __FUNCTION__); +#endif + + int input_shapes[5] = {input_n, input_c, input_d, input_h, input_w}; + int input_strides[5]; + + int output_shapes[5] = {input_n, input_c, output_d, output_h, output_w}; + int output_strides[5]; + + // logical stride, in unit of float + get_strides_from_shapes5d(input_strides, input_shapes, 1); + get_strides_from_shapes5d(output_strides, output_shapes, 1); + + for (int i = 0; i < input_n; i++) { + for (int c = 0; c < input_c; c++) { + for (int oz = 0; oz < output_d; oz++) { + for (int oy = 0; oy < output_h; oy++) { + for (int ox = 0; ox < output_w; ox++) { + float max_value = -std::numeric_limits<float>::infinity(); + + for (int pd = 0; pd < kernel_d; pd++) { + int iz = oz * stride_d + pd - pad_d0; + for (int py = 0; py < kernel_h; py++) { + int iy = oy * stride_h + py - pad_top; + for (int px = 0; px < kernel_w; px++) { + int ix = ox * stride_w + px - pad_left; + // skip padded positions on both sides of each axis + if (iz >= 0 && iy >= 0 && ix >= 0 && + iz < input_d && iy < input_h && ix < input_w) { + int poss[5] = {i, c, iz, iy, ix}; + int input_offset = get_tensor5d_offset(poss, input_strides); + max_value = std::fmax(max_value, input[input_offset]); + +#ifdef DUMP_MSG + printf(" [i=%d][c=%d][oz=%d][oy=%d][ox=%d]" \ + "[pd=%d][py=%d][px=%d] input(%d, %d, %d, %d, %d)"\ + "(offset=%d)=%f, max=%f\n", + i, c, oz, oy, ox, + pd, py, px, + poss[0], poss[1], poss[2], poss[3], poss[4], + input_offset, + input[input_offset], + max_value); +#endif + + } + } + } + } + + int output_poss[5] = {i, c, oz, oy, ox}; + int output_offset = get_tensor5d_offset(output_poss, output_strides); + output[output_offset] = max_value; + +#ifdef DUMP_MSG + printf(" output(%d, %d, %d, %d, %d)(offset=%d)=%f\n", + output_poss[0], output_poss[1], output_poss[2], + output_poss[3], output_poss[4], + output_offset, max_value); +#endif + + } + } + } + } + } + +#ifdef DUMP_MSG + printf(" <= %s\n", __FUNCTION__); +#endif + +} + +// Input +// global memory (n, ic, id, ih, iw) +// local memory (n*id, ic, ih, iw), align eu +static void loadInput( + cvk_context_t *cvk_ctx, + int n, int ic, int id, int ih, int iw, + uint64_t ga_input, cvk_tl_t *tl_input_al) +{ + assert(n == 1 && "Only handle batch 1"); + int eu_num = (int)cvk_ctx->info.eu_num; + + // local memory layout (id, ic, ih, iw) + cvk_fmt_t fmt = tl_input_al->fmt; + cvk_tl_shape_t tl_shape = {1, (uint32_t)ic, (uint32_t)id, (uint32_t)ih*iw}; + cvk_tl_t tl_input; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tl_input, tl_shape, fmt, + tl_input_al->eu_align); + tl_input.start_address = tl_input_al->start_address; + tl_input.stride.h = ((tl_input.stride.h + eu_num - 1) / eu_num) * eu_num; + tl_input.stride.n = tl_input.stride.h * id; + + // global memory (1, ic, id, ih*iw) + cvk_tg_shape_t tg_shape = {1, (uint32_t)ic, (uint32_t)id, (uint32_t)ih*iw}; + cvk_tg_t gm_input; + cvk_ctx->ops->gmem_init_tensor(cvk_ctx, &gm_input, tg_shape, fmt); + gm_input.start_address = ga_input; + + cvk_tdma_g2l_tensor_copy_param_t param; + memset(&param, 0, sizeof(param)); + param.src = &gm_input; + param.dst = &tl_input; + +#ifdef DUMP_MSG + printf(" loadInput:\n" \ + " src %" PRIu64 ", (%" PRIu32 ", %" PRIu32 ", %" PRIu32 ", "\ + "%" PRIu32 "), stride(%" PRIu32 ", %" PRIu32 ", "\ + "%" PRIu32 ", %" PRIu32 ")\n" \ + " dst %" PRIu32 ", (%" PRIu32 ", %" PRIu32 ", %" PRIu32 ", "\ + "%" PRIu32 "), stride(%" PRIu32 ", %" PRIu32 ", "\ + "%" PRIu32 ", %" PRIu32 ")\n", + gm_input.start_address, + gm_input.shape.n, gm_input.shape.c, gm_input.shape.h, gm_input.shape.w, + gm_input.stride.n, gm_input.stride.c, gm_input.stride.h, + gm_input.stride.w, + tl_input.start_address, + tl_input.shape.n, tl_input.shape.c, tl_input.shape.h, tl_input.shape.w, + tl_input.stride.n, tl_input.stride.c, tl_input.stride.h, + tl_input.stride.w); +#endif + + cvk_ctx->ops->tdma_g2l_bf16_tensor_copy(cvk_ctx, &param); +} + +static void compute( + cvk_context_t *cvk_ctx, + int n, int ic, int id, int ih, int iw, + int od, int oh, int ow, + int pad_d0, int pad_d1, + int pad_top, int pad_bot, int pad_left, int pad_right, + int kd, int kh, int kw, + int stride_d, int stride_h, int stride_w, + cvk_tl_t *tl_input_al, cvk_tl_t *tl_work, cvk_tl_t *tl_output_al) +{ + (void)pad_d1; + (void)pad_bot; + (void)pad_right; + assert(n == 1 && "Only support batch 1"); + + cvk_fmt_t fmt = tl_input_al->fmt; + + // Apply 2d max pool to each input depth + { + cvk_tl_shape_t input_shape = { + (uint32_t)id, (uint32_t)ic, (uint32_t)ih, (uint32_t)iw}; + cvk_tl_t tl_input; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tl_input, input_shape, fmt, + /*eu_align=*/1); + tl_input.start_address = tl_input_al->start_address; + + cvk_tl_shape_t output_shape = { + (uint32_t)id, (uint32_t)ic, (uint32_t)oh, (uint32_t)ow}; + cvk_tl_t tl_output; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tl_output, output_shape, fmt, + /*eu_align=*/1); + tl_output.start_address = tl_output_al->start_address; + + cvk_tiu_max_pooling_param_t param; + memset(&param, 0, sizeof(param)); + param.ofmap = &tl_output; + param.ifmap = &tl_input; + param.kh = kh; + param.kw = kw; + param.pad_top = (uint8_t)pad_top; + param.pad_bottom = (uint8_t)pad_bot; + param.pad_left = (uint8_t)pad_left; + param.pad_right = (uint8_t)pad_right; + param.stride_h = (uint8_t)stride_h; + param.stride_w = (uint8_t)stride_w; + +#ifdef DUMP_MSG + printf(" compute, max pool:\n" \ + " ifmap %" PRIu32 ", shape(%" PRIu32 ", %" PRIu32 ", "\ + "%" PRIu32 ", %" PRIu32 "), stride (%" PRIu32 ", %" PRIu32 ", "\ + "%" PRIu32 ", %" PRIu32 ")\n" \ + " ofmap %" PRIu32 ", shape(%" PRIu32 ", %" PRIu32 ", "\ + "%" PRIu32 ", %" PRIu32 "), stride (%" PRIu32 ", %" PRIu32 ", "\ + "%" PRIu32 ", %" PRIu32 ")\n", + tl_input.start_address, + tl_input.shape.n, tl_input.shape.c, tl_input.shape.h, + tl_input.shape.w, + tl_input.stride.n, tl_input.stride.c, tl_input.stride.h, + tl_input.stride.w, + tl_output.start_address, + tl_output.shape.n, tl_output.shape.c, tl_output.shape.h, + tl_output.shape.w, + tl_output.stride.n, tl_output.stride.c, tl_output.stride.h, + tl_output.stride.w); +#endif + + cvk_ctx->ops->tiu_max_pooling(cvk_ctx, &param); + } + + // TIU copy (od, ic, oh, ow) -> (1, ic, od, oh*ow) + // from eu-aligned to contiguous + { + cvk_tl_shape_t tl_input_shape = { + 1, (uint32_t)ic, (uint32_t)oh, (uint32_t)ow}; + + cvk_tl_stride_t aligned_stride = + cvk_ctx->ops->tl_default_stride(cvk_ctx, tl_input_shape, fmt, 1); + cvk_tl_stride_t unaligned_stride = + cvk_ctx->ops->tl_default_stride(cvk_ctx, tl_input_shape, fmt, 0); + + for (int oz = 0; oz < od; ++oz) { + for (int pd = 0; pd < kd; pd++) { + int iz = oz * stride_d + pd - pad_d0; + + cvk_tl_t tl_input; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tl_input, tl_input_shape, fmt, + /*eu_align=*/1); + tl_input.start_address = tl_output_al->start_address + + aligned_stride.n * iz; + + cvk_tl_shape_t tl_output_shape = { + 1, (uint32_t)ic, (uint32_t)oh, (uint32_t)ow}; + cvk_tl_t tl_output; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tl_output, tl_output_shape, + fmt, /*eu_align=*/0); + tl_output.start_address =
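+ /* destination inside the work buffer: each oz group spans kd aligned slice pitches; within a group the pd-th slice is packed at the unaligned (contiguous) pitch so the kd depth taps sit back to back for the depth-wise pool */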
tl_work->start_address + + aligned_stride.n * oz * kd + + unaligned_stride.n * pd; + + cvk_tiu_copy_param_t param; + memset(&param, 0, sizeof(param)); + param.src = &tl_input; + param.dst = &tl_output; + +#ifdef DUMP_MSG + printf(" [oz=%d][pd=%d] tiu_copy:\n" \ + " src %" PRIu32 ", (%" PRIu32 ", %" PRIu32 ", "\ + "%" PRIu32 ", %" PRIu32 "), stride(%" PRIu32 ", "\ + "%" PRIu32 ", %" PRIu32 ", %" PRIu32 ")\n" \ + " dst %" PRIu32 ", (%" PRIu32 ", %" PRIu32 ", "\ + "%" PRIu32 ", %" PRIu32 "), stride(%" PRIu32 ", "\ + "%" PRIu32 ", %" PRIu32 ", %" PRIu32 ")\n", + oz, pd, + tl_input.start_address, + tl_input.shape.n, tl_input.shape.c, tl_input.shape.h, + tl_input.shape.w, + tl_input.stride.n, tl_input.stride.c, tl_input.stride.h, + tl_input.stride.w, + tl_output.start_address, + tl_output.shape.n, tl_output.shape.c, tl_output.shape.h, + tl_output.shape.w, + tl_output.stride.n, tl_output.stride.c, tl_output.stride.h, + tl_output.stride.w); +#endif + + cvk_ctx->ops->tiu_copy(cvk_ctx, &param); + } + } + } + + // Apply 2d max pool to input depth + // input (od, ic, kd, oh*ow) + // kernel (kd, 1) + // output (od, ic, 1, oh*ow) + { + cvk_tl_shape_t tiu_copy_input_shape = { + 1, (uint32_t)ic, (uint32_t)oh, (uint32_t)ow}; + + cvk_tl_stride_t tiu_copy_aligned_stride = + cvk_ctx->ops->tl_default_stride(cvk_ctx, tiu_copy_input_shape, fmt, 1); + + for (int oz = 0; oz < od; ++oz) { + cvk_tl_shape_t tl_input_shape = { + 1, (uint32_t)ic, (uint32_t)kd, (uint32_t)oh*ow}; + cvk_tl_t tl_input; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tl_input, tl_input_shape, fmt, + /*eu_align=*/1); + tl_input.start_address = tl_work->start_address + + tiu_copy_aligned_stride.n * oz * kd; + + cvk_tl_shape_t tl_output_shape = { + 1, (uint32_t)ic, 1, (uint32_t)oh*ow}; + cvk_tl_t tl_output; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tl_output, tl_output_shape, fmt, + /*eu_align=*/1); + tl_output.start_address = tl_output_al->start_address + + tl_output.stride.n * oz; + + cvk_tiu_max_pooling_param_t param; + memset(&param, 0, sizeof(param)); + param.ofmap = &tl_output; + param.ifmap = &tl_input; + param.kh = kd; + param.kw = 1; + param.pad_top = 0; + param.pad_bottom = 0; + param.pad_left = 0; + param.pad_right = 0; + param.stride_h = 1; + param.stride_w = 1; + +#ifdef DUMP_MSG + printf(" [%d] max pool:\n" \ + " ifmap %" PRIu32 ", shape(%" PRIu32 ", %" PRIu32 ", "\ + "%" PRIu32 ", %" PRIu32 "), stride (%" PRIu32 ", %" PRIu32 ", "\ + "%" PRIu32 ", %" PRIu32 ")\n" \ + " ofmap %" PRIu32 ", shape(%" PRIu32 ", %" PRIu32 ", "\ + "%" PRIu32 ", %" PRIu32 "), stride (%" PRIu32 ", %" PRIu32 ", "\ + "%" PRIu32 ", %" PRIu32 ")\n", + oz, + tl_input.start_address, + tl_input.shape.n, tl_input.shape.c, tl_input.shape.h, + tl_input.shape.w, + tl_input.stride.n, tl_input.stride.c, tl_input.stride.h, + tl_input.stride.w, + tl_output.start_address, + tl_output.shape.n, tl_output.shape.c, tl_output.shape.h, + tl_output.shape.w, + tl_output.stride.n, tl_output.stride.c, tl_output.stride.h, + tl_output.stride.w); +#endif + + cvk_ctx->ops->tiu_max_pooling(cvk_ctx, &param); + } + + } +} + +static void storeOutput( + cvk_context_t *cvk_ctx, + int n, int ic, int od, int oh, int ow, + uint64_t ga_output, cvk_tl_t *tl_output_al) { + + assert(n == 1 && "Only support batch 1"); + int eu_num = (int)cvk_ctx->info.eu_num; + + cvk_fmt_t fmt = tl_output_al->fmt; + cvk_tl_shape_t tl_output_shape = { + 1, (uint32_t)ic, (uint32_t)od, (uint32_t)oh*ow}; + cvk_tl_t tl_output; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tl_output, tl_output_shape, fmt, + /*eu_align=*/1); +
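+ /* the per-depth results were written at EU-aligned offsets, so round the h stride (one output depth per h step) up to a multiple of eu_num before the L2G copy */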
+static void storeOutput(
+    cvk_context_t *cvk_ctx,
+    int n, int ic, int od, int oh, int ow,
+    uint64_t ga_output, cvk_tl_t *tl_output_al) {
+
+  assert(n == 1 && "Only support batch 1");
+  int eu_num = (int)cvk_ctx->info.eu_num;
+
+  cvk_fmt_t fmt = tl_output_al->fmt;
+  cvk_tl_shape_t tl_output_shape = {
+      1, (uint32_t)ic, (uint32_t)od, (uint32_t)oh*ow};
+  cvk_tl_t tl_output;
+  cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tl_output, tl_output_shape, fmt,
+                                 /*eu_align=*/1);
+  tl_output.start_address = tl_output_al->start_address;
+  // Round the h stride up to a multiple of eu_num (EU alignment).
+  tl_output.stride.h = ((tl_output.stride.h + eu_num - 1) / eu_num) * eu_num;
+
+  cvk_tg_shape_t tg_output_shape = {
+      1, (uint32_t)ic, (uint32_t)od, (uint32_t)oh*ow};
+  cvk_tg_t tg_output;
+  cvk_ctx->ops->gmem_init_tensor(cvk_ctx, &tg_output, tg_output_shape, fmt);
+  tg_output.start_address = ga_output;
+
+  cvk_tdma_l2g_tensor_copy_param_t param;
+  memset(&param, 0, sizeof(param));
+  param.src = &tl_output;
+  param.dst = &tg_output;
+
+#ifdef DUMP_MSG
+  printf("  storeOutput:\n" \
+         "    src %" PRIu32 ", (%" PRIu32 ", %" PRIu32 ", %" PRIu32 ", "\
+         "%" PRIu32 "), stride(%" PRIu32 ", %" PRIu32 ", "\
+         "%" PRIu32 ", %" PRIu32 ")\n" \
+         "    dst %" PRIu64 ", (%" PRIu32 ", %" PRIu32 ", %" PRIu32 ", "\
+         "%" PRIu32 "), stride(%" PRIu32 ", %" PRIu32 ", "\
+         "%" PRIu32 ", %" PRIu32 ")\n",
+         tl_output.start_address,
+         tl_output.shape.n, tl_output.shape.c, tl_output.shape.h,
+         tl_output.shape.w,
+         tl_output.stride.n, tl_output.stride.c, tl_output.stride.h,
+         tl_output.stride.w,
+         tg_output.start_address,
+         tg_output.shape.n, tg_output.shape.c, tg_output.shape.h,
+         tg_output.shape.w,
+         tg_output.stride.n, tg_output.stride.c, tg_output.stride.h,
+         tg_output.stride.w);
+#endif
+
+  cvk_ctx->ops->tdma_l2g_bf16_tensor_copy(cvk_ctx, &param);
+}
+
+//
+//         N  IC  ID  IH  IW
+// input  (1,  2,  4,  6,  6)
+//
+//         KD  KH  KW
+// kernel (2,  2,  2), stride (2, 2, 2)
+//
+//         N  OC  OD  OH  OW
+// output (1,  2,  2,  3,  3)
+//
+// pytorch:
+//   import torch
+//   import torch.nn as nn
+//   m = nn.MaxPool3d(kernel_size=(2, 2, 2), stride=(2, 2, 2))
+//   input = torch.rand(1, 2, 4, 6, 6)
+//   output = m(input)
+//
+static int pool3d_test(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx)
+{
+#ifdef DUMP_MSG
+  printf("===================================\n\n");
+  printf("%s =>\n", __FUNCTION__);
+#endif
+
+  (void)rt_handle;
+  (void)cvk_ctx;
+
+  int ret = 0;
+
+  // input (n=1, c=2, d=4, h=6, w=6)
+  int n = 1, ic = 2, id = 4, ih = 6, iw = 6;
+  //int input_shapes[5] = {n, ic, id, ih, iw};
+  float input_data[] = {
+    // IC=0
+    0.6907, 0.9408, 0.5792, 0.0155, 0.3657, 0.3555, // ic=0, id=0, ih=0
+    0.0933, 0.1023, 0.1212, 0.6805, 0.7696, 0.9585,
+    0.9876, 0.8971, 0.9734, 0.7682, 0.1532, 0.7841,
+    0.5079, 0.0060, 0.6073, 0.7310, 0.2100, 0.6355,
+    0.6528, 0.4982, 0.6783, 0.9851, 0.5486, 0.4885,
+    0.9682, 0.8595, 0.0752, 0.1718, 0.4142, 0.3268,
+
+    0.5709, 0.1998, 0.0288, 0.7744, 0.8626, 0.9731, // ic=0, id=1, ih=0
+    0.7898, 0.9141, 0.5941, 0.2131, 0.4342, 0.4791,
+    0.5463, 0.5508, 0.7898, 0.6795, 0.2314, 0.4887,
+    0.2611, 0.0303, 0.1486, 0.1386, 0.7260, 0.7107,
+    0.2266, 0.4536, 0.0832, 0.7075, 0.4564, 0.3716,
+    0.1076, 0.3894, 0.2450, 0.7306, 0.5887, 0.9756,
+
+    0.1618, 0.7542, 0.9378, 0.3864, 0.5607, 0.7753, // ic=0, id=2, ih=0
+    0.1143, 0.7555, 0.9824, 0.4149, 0.4235, 0.4298,
+    0.8252, 0.7352, 0.7124, 0.3989, 0.1546, 0.8114,
+    0.9839, 0.5569, 0.5464, 0.3193, 0.7710, 0.5774,
+    0.9209, 0.2261, 0.9562, 0.0164, 0.4955, 0.6177,
+    0.5790, 0.1055, 0.2765, 0.5313, 0.9878, 0.4712,
+
+    0.1046, 0.4042, 0.8381, 0.2987, 0.8166, 0.0809, // ic=0, id=3, ih=0
+    0.1088, 0.7503, 0.1620, 0.0757, 0.0527, 0.4848,
+    0.6198, 0.3235, 0.7783, 0.1561, 0.4574, 0.3427,
+    0.3435, 0.2180, 0.8079, 0.2090, 0.8582, 0.0381,
+    0.6562, 0.7126, 0.5397, 0.5042, 0.0537, 0.7438,
+    0.4731, 0.8265, 0.6840, 0.2583, 0.2385, 0.9646,
+
+    // IC=1
+    0.4381, 0.0173, 0.8579, 0.0612, 0.1047, 0.3874, // ic=1, id=0, ih=0
+    0.6478, 0.1154, 0.2030, 0.8241, 0.8321, 0.3291,
+    0.1108, 0.0253, 0.1421, 0.8958, 0.4012, 0.7565,
+    0.7778, 0.5838, 0.3576, 0.3304, 0.8876,
0.8442, + 0.6297, 0.9765, 0.5933, 0.0829, 0.3601, 0.1416, + 0.6709, 0.5021, 0.3242, 0.0080, 0.6258, 0.3447, + + 0.8346, 0.7683, 0.5717, 0.1758, 0.7923, 0.1612, // ic=1, id=1, ih=0 + 0.2289, 0.5588, 0.2911, 0.7938, 0.9405, 0.6662, + 0.0654, 0.1096, 0.9990, 0.6394, 0.6518, 0.0435, + 0.8786, 0.6171, 0.6420, 0.8809, 0.8736, 0.6752, + 0.2551, 0.6027, 0.6452, 0.9325, 0.4888, 0.3601, + 0.3799, 0.9947, 0.1809, 0.9057, 0.5961, 0.3890, + + 0.1345, 0.0214, 0.9583, 0.3609, 0.8325, 0.9346, // ic=1, id=2, ih=0 + 0.5194, 0.0346, 0.4887, 0.3495, 0.6093, 0.1503, + 0.9276, 0.0147, 0.7693, 0.2042, 0.0840, 0.0885, + 0.5881, 0.0846, 0.4413, 0.1325, 0.2781, 0.6833, + 0.1373, 0.1428, 0.4761, 0.3221, 0.6378, 0.2035, + 0.1556, 0.6974, 0.0709, 0.1990, 0.5579, 0.5456, + + 0.4670, 0.7720, 0.8733, 0.6276, 0.4938, 0.8457, // ic=1, id=3, ih=0 + 0.2115, 0.0147, 0.0665, 0.3245, 0.9537, 0.6126, + 0.9876, 0.5670, 0.8435, 0.2570, 0.8289, 0.4260, + 0.2659, 0.3500, 0.6909, 0.2132, 0.8072, 0.6591, + 0.0285, 0.8939, 0.1296, 0.2531, 0.1313, 0.8860, + 0.5761, 0.5247, 0.6818, 0.0679, 0.9456, 0.3174 + }; + + // weight (2, 2, 2), stride (2, 2, 2) + int kd = 2, kh = 2, kw = 2; + int stride_d = 2, stride_h = 2, stride_w = 2; + + // output (n=1, oc=2, od=2, oh=3, ow=3) + int pad_d0=0, pad_d1=0, pad_top=0, pad_bot=0, pad_left=0, pad_right=0; + int od = 2, oh = 3, ow = 3; + float ref_output_data[] = { + // oc=0 + 0.9408, 0.7744, 0.9731, // oc=0, od=0, oh=0 + 0.9876, 0.9734, 0.7841, + 0.9682, 0.9851, 0.9756, + + 0.7555, 0.9824, 0.8166, // oc=0, od=1, oh=0 + 0.9839, 0.8079, 0.8582, + 0.9209, 0.9562, 0.9878, + + // oc=1 + 0.8346, 0.8579, 0.9405, // oc=1, od=0, oh=0 + 0.8786, 0.9990, 0.8876, + 0.9947, 0.9325, 0.6258, + + 0.7720, 0.9583, 0.9537, // oc=1, od=1, oh=0 + 0.9876, 0.8435, 0.8289, + 0.8939, 0.6818, 0.9456 + }; + + float output_data_cpu[sizeof(ref_output_data)/sizeof(float)]; + pool3d_float_ref(input_data, output_data_cpu, + n, ic, id, ih, iw, + od, oh, ow, + kd, kh, kw, + stride_d, stride_h, stride_w, + pad_d0, pad_d1, + pad_top, pad_bot, pad_left, pad_right); + + printf(" pool3d_test: compare ref\n"); + const float precision = 0.0002; + for (size_t i = 0; i < sizeof(output_data_cpu)/sizeof(float); i++) + { + if (fabs(output_data_cpu[i] - ref_output_data[i]) > precision) { + printf(" [%d] Error ! val %f, expected %f\n", + (int)i, output_data_cpu[i], ref_output_data[i]); + ret = -1; + } + } + printf(" pool3d_test: compare ref %s\n", ret ? "fail" : "pass"); + + cvk_fmt_t fmt = CVK_FMT_BF16; + cvk_tl_t *tl_input_al = NULL; + cvk_tl_t *tl_output_al = NULL; + cvk_tl_t *tl_work_al = NULL; + + // Allocate input (n*id, ic, ih, iw) + // treat input depth as batch, do 2d max pool for each input depth + // align EU for tiu max pool. 
+ { + cvk_tl_shape_t shape = { + (uint32_t)n*id, (uint32_t)ic, (uint32_t)ih, (uint32_t)iw}; + tl_input_al = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, shape, fmt, + /*eu_align=*/1); + } + + // Allocate work (n=od*kd, ic, oh, ow) + { + cvk_tl_shape_t shape = { + (uint32_t)od*kd, (uint32_t)ic, (uint32_t)oh, (uint32_t)ow}; + tl_work_al = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, shape, fmt, + /*eu_align=*/1); + } + + // Allocate output (n=1, ic, od, oh*ow) + { + cvk_tl_shape_t shape = { + (uint32_t)n, (uint32_t)ic, (uint32_t)od, (uint32_t)oh*ow}; + tl_output_al = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, shape, fmt, + /*eu_align=*/0); + } + + CVI_RT_MEM gm_input_dev_mem = NULL; + CVI_RT_MEM gm_output_dev_mem = NULL; + uint64_t ga_input = 0; + uint64_t ga_output = 0; + + // Allocate device memory of input + { + // shape (1, ic=2, id=4, ih=6, iw=6) + int total_len = 1 * ic * id * ih * iw; + uint16_t input_bf16_data[total_len]; + convert_fp32_to_bf16_data(cvk_ctx, input_bf16_data, input_data, total_len); + + gm_input_dev_mem = CVI_RT_MemAlloc(rt_handle, total_len * sizeof(uint16_t)); + + // Copy from system memory to device memory + CVI_RT_MemCopyS2D(rt_handle, gm_input_dev_mem, (uint8_t *)input_bf16_data); + + ga_input = CVI_RT_MemGetPAddr(gm_input_dev_mem); + } + + // Allocate device memory of output + { + int len = n * ic * od * oh * ow; + gm_output_dev_mem = CVI_RT_MemAlloc(rt_handle, len * sizeof(uint16_t)); + ga_output = CVI_RT_MemGetPAddr(gm_output_dev_mem); + } + + assert(gm_input_dev_mem && gm_output_dev_mem && "Expect valid gm dev mem"); + assert(ga_input && ga_output && "Expect valid gaddr"); + + // load input + loadInput(cvk_ctx, + n, ic, id, ih, iw, + ga_input, tl_input_al); + + // 3d max pool + compute(cvk_ctx, + n, ic, id, ih, iw, + od, oh, ow, + pad_d0, pad_d1, + pad_top, pad_bot, pad_left, pad_right, + kd, kh, kw, + stride_d, stride_h, stride_w, + tl_input_al, tl_work_al, tl_output_al); + + // store output + storeOutput(cvk_ctx, + n, ic, od, oh, ow, + ga_output, tl_output_al); + + CVI_RT_Submit(cvk_ctx); + + // copy from device memory to system memory + int output_len = n * ic * od * oh * ow; + + uint16_t ref_output_bf16_data[output_len]; + convert_fp32_to_bf16_data(cvk_ctx, ref_output_bf16_data, ref_output_data, + output_len); + + uint16_t output_bf16_data_tpu[output_len]; + CVI_RT_MemCopyD2S(rt_handle, (uint8_t *) output_bf16_data_tpu, + gm_output_dev_mem); + + printf(" pool3d_test: compare tpu\n"); + const float tpu_precision = 0.01; + for (int i = 0; i < output_len; i++) { + float tpu_data = cvk_convert_bf16_fp32(output_bf16_data_tpu[i]); + if (fabs(tpu_data - ref_output_data[i]) > tpu_precision) { + printf(" [%d] Error ! val %f(0x%x), expected %f(0x%x)\n", + (int)i, tpu_data, output_bf16_data_tpu[i], ref_output_data[i], + ref_output_bf16_data[i]); + ret = -1; + } + } + printf(" pool3d_test: compare tpu %s\n", ret ? 
"fail" : "pass"); + + // Reverse order + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output_al); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_work_al); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input_al); + + CVI_RT_MemFree(rt_handle, gm_input_dev_mem); + CVI_RT_MemFree(rt_handle, gm_output_dev_mem); + +#ifdef DUMP_MSG + printf("<= %s\n", __FUNCTION__); + printf("===================================\n\n"); +#endif + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_HANDLE rt_handle; + cvk_context_t *cvk_ctx = NULL; + + CVI_RT_Init(&rt_handle); + cvk_ctx = (cvk_context_t *)CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + + ret = pool3d_test(rt_handle, cvk_ctx); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + if (!ret) + printf("%s pass\n", __FILENAME__); + else + printf("%s fail\n", __FILENAME__); + + return ret; +} diff --git a/cviruntime/test/1880v2/bm_vlc_compress.h b/cviruntime/test/1880v2/bm_vlc_compress.h new file mode 100644 index 000000000..e82a96abd --- /dev/null +++ b/cviruntime/test/1880v2/bm_vlc_compress.h @@ -0,0 +1,672 @@ +#ifndef __BM_VLC_COMPRESS_H__ +#define __BM_VLC_COMPRESS_H__ +#include +#include +#ifdef __cplusplus +extern "C" +{ +#endif + +#define MAX_UNARY_FIELD_SIZE 47 +#define MAX_ORDER_K 5 + + /** + * \data_type 0 means 8bit, 1 means 16bit + */ + static inline size_t get_out_bs_buf_size(u64 in_size, u8 data_type) { + size_t blk_num = (data_type) ? ((in_size + 31) >> 5) : ((in_size + 15) >> 4); + size_t in_size_pad = blk_num << (4 + data_type); + size_t bs_buf_size = in_size_pad + (ceiling_func(blk_num, 16) << 4) + 16; + return bs_buf_size; + } + + typedef struct + { + bool signedness; + bool is_bfloat16; + uint8_t bias0; + uint8_t bias1; + bool zero_guard_en; + } CommandInfo; + typedef struct + { + uint8_t *stream; // stream buffer pointer + int bit_pos; // current pointer (in bit) + int buf_size; // in byte + } StreamBuffer; + +static inline int8_t two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1); +static inline int8_t inv_two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1); +static inline uint8_t center_shift(uint8_t val, uint8_t bias, bool zero_guard); +static inline uint8_t inv_center_shift(uint8_t val, uint8_t bias, bool zero_guard); + +static inline void init_stream(StreamBuffer *bs, const uint8_t *buf, int buf_size, bool read_only); + +static inline void bm_vlc_est_weight_bias(const uint8_t *ibuf, size_t isz, bool signedness, bool isBfloat16, CommandInfo *cmd_info); +static inline void bm_vlc_enc_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info); +static inline void bm_vlc_dec_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf); +static inline void bm_vlc_enc_bf16(const uint16_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info); + static inline void bm_vlc_dec_bf16(const uint8_t *ibuf, size_t isz, uint16_t *obuf); + +static inline uint8_t get_bit_val(uint8_t *buf, int byte_idx, int bit_idx) + { + return (buf[byte_idx] >> bit_idx) & 0x1; + } + +static inline uint8_t sign_to_unsign(uint8_t val) + { + uint8_t sign_i = (val >> 7) & 0x1; + int abs_data_i = abs(((int8_t)val)); + return ((abs_data_i << 1) - sign_i); + } + +static inline int8_t unsign_to_sign(uint8_t val) + { + uint8_t sign_i = val & 0x1; + int abs_data_i = (((int)val) + 1) >> 1; + return (uint8_t)((sign_i == 1) ? 
+static inline void dispatch_bf16_data(const uint16_t *bf16_in, uint8_t *exp, uint8_t *frac, size_t isz)
+{
+  for (size_t i = 0; i < isz; i++)
+  {
+    exp[i] = (uint8_t)((bf16_in[i] >> 7) & 0xFF);
+    frac[i] = (uint8_t)(((bf16_in[i] >> 15) << 7) | (bf16_in[i] & 0x7F));
+  }
+}
+
+static inline void merge_bf16_data(const uint8_t *exp_in, const uint8_t *frac_in, uint16_t *bf16_out, size_t isz)
+{
+  memset(bf16_out, 0, sizeof(uint16_t) * isz);
+  for (size_t i = 0; i < isz; i++)
+  {
+    bf16_out[i] = ((frac_in[i] >> 7) << 15) | (exp_in[i] << 7) | (frac_in[i] & 0x7F);
+  }
+}
+
+// -- streaming operation handler --
+static inline void init_stream(StreamBuffer *bs, const uint8_t *buf, int buf_size, bool read_only)
+{
+  bs->bit_pos = 0;
+  bs->stream = (uint8_t *)buf;
+  bs->buf_size = buf_size;
+  if (!read_only)
+    memset((uint8_t *)buf, 0, sizeof(uint8_t) * buf_size);
+}
+
+static inline void write_stream(StreamBuffer *bs, uint8_t *src, int bit_len)
+{
+  for (int bit = 0; bit < bit_len; bit++)
+  {
+    int src_byte_i = bit / 8;
+    int src_bit_i = bit % 8;
+    int dest_byte_i = (bs->bit_pos + bit) / 8;
+    int dest_bit_i = (bs->bit_pos + bit) % 8;
+    bs->stream[dest_byte_i] |= (get_bit_val(src, src_byte_i, src_bit_i) << dest_bit_i);
+  }
+  bs->bit_pos += bit_len;
+}
+
+static inline void move_stream_ptr(StreamBuffer *bs, int bit_len)
+{
+  bs->bit_pos += bit_len;
+}
+
+static inline void parse_stream(StreamBuffer *bs, uint8_t *dest, int bit_len)
+{
+  memset(dest, 0, sizeof(uint8_t) * ((bit_len + 7) >> 3));
+  for (int bit = 0; bit < bit_len; bit++)
+  {
+    int dest_byte_i = bit / 8;
+    int dest_bit_i = bit % 8;
+    int bs_byte_i = (bs->bit_pos + bit) / 8;
+    int bs_bit_i = (bs->bit_pos + bit) % 8;
+    dest[dest_byte_i] |= (get_bit_val(bs->stream, bs_byte_i, bs_bit_i) << dest_bit_i);
+  }
+  bs->bit_pos += bit_len;
+}
+
+// -- header read/write operation handler --
+static inline void vlc_enc_header(StreamBuffer *bs_header, CommandInfo *cmd_info, size_t blk_bs_size)
+{
+  write_stream(bs_header, (uint8_t *)&blk_bs_size, 24);            // bit[23:0] compressed block stream size
+  move_stream_ptr(bs_header, 4);                                   // bit[27:24] reserved
+  write_stream(bs_header, (uint8_t *)&cmd_info->signedness, 1);    // bit[28] signedness
+  write_stream(bs_header, (uint8_t *)&cmd_info->is_bfloat16, 1);   // bit[29] data type
+  move_stream_ptr(bs_header, 2);                                   // bit[31:30] bit depth
+  write_stream(bs_header, (uint8_t *)&cmd_info->bias0, 8);         // bit[39:32] bias0 for symbol remapping
+  write_stream(bs_header, (uint8_t *)&cmd_info->bias1, 7);         // bit[46:40] bias1 for symbol remapping
+  write_stream(bs_header, (uint8_t *)&cmd_info->zero_guard_en, 1); // bit[47] zero guard
+}
+
+static inline void vlc_dec_header(StreamBuffer *bs_header, CommandInfo *cmd_info)
+{
+  move_stream_ptr(bs_header, 28);                                  // skip bit[23:0] stream size and bit[27:24] reserved
+  parse_stream(bs_header, (uint8_t *)&cmd_info->signedness, 1);    // bit[28] signedness
+  parse_stream(bs_header, (uint8_t *)&cmd_info->is_bfloat16, 1);   // bit[29] data type
+  move_stream_ptr(bs_header, 2);                                   // bit[31:30] bit depth
+  parse_stream(bs_header, (uint8_t *)&cmd_info->bias0, 8);         // bit[39:32] bias0 for symbol remapping
+  parse_stream(bs_header, (uint8_t *)&cmd_info->bias1, 7);         // bit[46:40] bias1 for symbol remapping
+  parse_stream(bs_header, (uint8_t *)&cmd_info->zero_guard_en, 1); // bit[47] zero guard
+}
+
+// -- symbol remapping handler --
+static inline uint8_t center_shift(uint8_t val, uint8_t bias, bool zero_guard)
+{
+  if (val == 0 && zero_guard)
+    return 0;
+
+  int16_t shift_data_i = val - bias;
+  uint8_t range = (bias <= 128) ?
bias : 255 - bias; + if (bias <= 128) + { + return (val >= (range << 1)) ? val : sign_to_unsign(shift_data_i) + zero_guard; + } + else + { + return (val < (bias - range)) ? (range + bias - val + zero_guard) : (sign_to_unsign(shift_data_i) + zero_guard); + } +} + +static inline uint8_t inv_center_shift(uint8_t val, uint8_t bias, bool zero_guard) +{ + if (val == 0 && zero_guard) + return 0; + + uint8_t unsign_data_i = val - zero_guard; + uint8_t range = (bias <= 128) ? bias : 255 - bias; + if (bias <= 128) + { + return (val >= (range << 1)) ? val : unsign_to_sign(unsign_data_i) + bias; + } + else + { + return (unsign_data_i > (range << 1)) ? (range + bias - val + zero_guard) : unsign_to_sign(unsign_data_i) + bias; + } +} + +static inline int8_t two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1) +{ + if (val == 0) + return 0; + + bool sign = (val < 0) ? true : false; + int32_t abs_val = abs(val); + abs_val -= (sign) ? bias1 : bias0; + abs_val += (abs_val <= 0) ? (127 + sign) : 0; + return (sign) ? -abs_val : abs_val; +} + +static inline int8_t inv_two_side_circular_shift(int8_t val, uint8_t bias0, uint8_t bias1) +{ + if (val == 0) + return 0; + + bool sign = (val < 0) ? true : false; + uint32_t abs_val = abs(val); + abs_val += (sign) ? bias1 : bias0; + int32_t abs_val_minus = abs_val - (127 + sign); + uint8_t abs_val_lsb = ((abs_val_minus <= 0) + ? abs_val + : abs_val_minus) & + 0xFF; + return (sign) ? -abs_val_lsb : abs_val_lsb; +} + +static inline void symbol_remapping(uint8_t *blk_in, uint8_t *blk_out, uint8_t bias0, uint8_t bias1, bool signedness, bool is_bf16_exp, bool zero_guard) +{ + if (is_bf16_exp == false && signedness == false) + { + // remapping bypass + memcpy(blk_out, blk_in, sizeof(uint8_t) * 16); + return; + } + + if (is_bf16_exp == true) + { + // center circular shift + for (int i = 0; i < 16; i++) + { + blk_out[i] = center_shift(blk_in[i], bias0, zero_guard); + } + } + else + { + // two-side circular shift + for (int i = 0; i < 16; i++) + { + int8_t shift_data_i = two_side_circular_shift((int8_t)blk_in[i], bias0, bias1); + blk_out[i] = sign_to_unsign(shift_data_i); + } + } +} + +static inline void inv_symbol_remapping(uint8_t *blk_in, uint8_t *blk_out, uint8_t bias0, uint8_t bias1, bool signedness, bool is_bf16_exp, bool zero_guard) +{ + if (is_bf16_exp == false && signedness == false) + { + // remapping bypass + memcpy(blk_out, blk_in, sizeof(uint8_t) * 16); + return; + } + + if (is_bf16_exp == true) + { + // center circular shift + for (int i = 0; i < 16; i++) + { + blk_out[i] = inv_center_shift(blk_in[i], bias0, zero_guard); + } + } + else + { + // two-side circular shift + for (int i = 0; i < 16; i++) + { + int8_t sign_data_i = unsign_to_sign(blk_in[i]); + blk_out[i] = (uint8_t)inv_two_side_circular_shift(sign_data_i, bias0, bias1); + } + } +} + +static inline int vlc_estimate_block_order(uint8_t *blk_in, bool bf16_zvc_en) +{ + int best_k = 0; + int best_bs_size = 0x7FFFFFFF; + + for (int k = 0; k <= (int)MAX_ORDER_K; k++) + { + uint8_t remain_field_size = k << 4; + int unary_field_len = 0; + for (int i = 0; i < 16; i++) + { + uint8_t group_idx = blk_in[i] >> k; + unary_field_len += (group_idx + 1); + } + int znum_bit = (bf16_zvc_en && k > 0) ? 4 : 0; + int blk_size = (unary_field_len <= MAX_UNARY_FIELD_SIZE) + ? remain_field_size + unary_field_len + znum_bit + : 255; + if (blk_size < best_bs_size) + { + best_k = k; + best_bs_size = blk_size; + } + } + + best_k = (best_bs_size > 128) ? 
-1 : best_k;
+  return best_k;
+}
+
+// -- vlc block parallel GR encode/decode --
+static inline uint8_t vlc_gr_enc_block_data(uint8_t *blk_in, StreamBuffer *bs, int order_k, bool bf16_zvc_en)
+{
+  // uncompressed mode
+  if (order_k == -1)
+  {
+    write_stream(bs, blk_in, 128);
+    return 128;
+  }
+
+  // remain field
+  uint8_t remain_field[16] = {0};
+  uint8_t unary_field[8] = {0};
+  uint8_t sym_end_pos[16] = {0};
+  uint8_t unary_field_len = 0;
+  int sym_end_pos_accum = -1;
+
+  // bit plane encode for remain field
+  for (int k = 0; k < order_k; k++)
+  {
+    uint8_t bit_plane0 = 0, bit_plane1 = 0;
+    for (int i = 0; i < 8; i++)
+    {
+      bit_plane0 |= (get_bit_val(blk_in, i, k) << i);
+      bit_plane1 |= (get_bit_val(blk_in, i + 8, k) << i);
+    }
+    remain_field[k << 1] = bit_plane0;
+    remain_field[(k << 1) + 1] = bit_plane1;
+  }
+  write_stream(bs, remain_field, order_k << 4);
+
+  if (bf16_zvc_en && order_k > 0)
+  {
+    int zero_num = 0;
+    for (int i = 0; i < 16; i++)
+    {
+      if (blk_in[i] == 0)
+        zero_num++;
+    }
+    assert(zero_num < 16);
+    write_stream(bs, (uint8_t *)&zero_num, 4);
+  }
+
+  // unary encode for unary field
+  for (int i = 0; i < 16; i++)
+  {
+    int group_idx = blk_in[i] >> order_k;
+    sym_end_pos_accum += (group_idx + 1);
+    sym_end_pos[i] = sym_end_pos_accum;
+    int byte_idx = sym_end_pos[i] / 8;
+    int bit_idx = sym_end_pos[i] % 8;
+    unary_field[byte_idx] |= (1 << (bit_idx));
+  }
+  unary_field_len = sym_end_pos[15] + 1;
+  assert(unary_field_len <= MAX_UNARY_FIELD_SIZE);
+  uint8_t ulen = (unary_field_len - 16) & 0x1F;
+  write_stream(bs, unary_field, unary_field_len);
+
+  return ulen;
+}
+
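+// Worked cost check for the order-k search above (illustrative, assuming
+// bf16_zvc_en == false so znum_bit == 0). A block of sixteen values all
+// equal to 5 costs 16*k remainder bits plus sum((val >> k) + 1) unary bits:
+//   k=0: 0 remainder  + 96 unary -> unary exceeds MAX_UNARY_FIELD_SIZE (47)
+//   k=1: 16 remainder + 48 unary -> unary still exceeds 47, rejected
+//   k=2: 32 remainder + 32 unary -> 64 bits, chosen (first k at the minimum)
+//   k=3: 48 remainder + 16 unary -> 64 bits
+// A best size above 128 bits falls back to uncompressed mode (k == -1).
+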
+static inline void vlc_gr_dec_block_data(StreamBuffer *bs, uint8_t bs_size, uint8_t *rec, int order_k, bool bf16_zvc_en)
+{
+  assert(bs_size <= 128);
+  // uncompressed mode
+  if (order_k == -1)
+  {
+    parse_stream(bs, rec, 128);
+    return;
+  }
+
+  // remain field
+  uint8_t remain_data[16] = {0};
+  uint8_t remain_bs[16] = {0};
+  uint8_t unary_field[8] = {0};
+  uint8_t sym_end_pos[16] = {0};
+  uint8_t unary_sym[16] = {0};
+  uint8_t remain_field_size = order_k << 4;
+
+  parse_stream(bs, remain_bs, remain_field_size);
+  // bit plane decode for remain field
+  for (int k = 0; k < order_k; k++)
+  {
+    for (int i = 0; i < 8; i++)
+    {
+      remain_data[i] |= (get_bit_val(remain_bs, k << 1, i) << k);
+      remain_data[i + 8] |= (get_bit_val(remain_bs, (k << 1) + 1, i) << k);
+    }
+  }
+
+  // zero number info, consumed to advance the stream; value not otherwise
+  // used by this reference decoder
+  int znum_bit = (bf16_zvc_en && order_k > 0) ? 4 : 0;
+  uint8_t znum = 0;
+  parse_stream(bs, &znum, znum_bit);
+
+  // unary decode for unary field
+  uint8_t unary_field_len = bs_size - remain_field_size - znum_bit;
+  parse_stream(bs, unary_field, unary_field_len);
+
+  int sym_cnt = 0;
+  for (uint8_t ubit_i = 0; ubit_i < unary_field_len; ubit_i++)
+  {
+    int byte_idx = ubit_i / 8;
+    int bit_idx = ubit_i % 8;
+    if (get_bit_val(unary_field, byte_idx, bit_idx) == 1)
+    {
+      sym_end_pos[sym_cnt] = ubit_i;
+      sym_cnt++;
+    }
+  }
+  unary_sym[0] = sym_end_pos[0];
+  for (int i = 1; i < 16; i++)
+  {
+    unary_sym[i] = sym_end_pos[i] - sym_end_pos[i - 1] - 1;
+  }
+  for (int i = 0; i < 16; i++)
+  {
+    rec[i] = (unary_sym[i] << order_k) + remain_data[i];
+  }
+}
+
+// -- vlc encode int8 entry function --
+static inline void bm_vlc_enc_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info)
+{
+  StreamBuffer bs_header, bs_kmap, bs_data;
+  size_t blk_num = (isz + 15) >> 4;
+  size_t header_size = 16;
+  size_t kmap_size = ceiling_func(blk_num, 16) << 4;
+  size_t bs_buf_size = header_size + kmap_size + (blk_num << 4);
+  uint8_t *bsbuf = (uint8_t *)calloc(bs_buf_size, sizeof(uint8_t));
+
+  // block encode
+  init_stream(&bs_kmap, bsbuf + header_size, kmap_size, false);
+  init_stream(&bs_data, bsbuf + header_size + kmap_size, blk_num << 4, false);
+
+  for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++)
+  {
+    uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0};
+    size_t in_size = (blk_idx == (blk_num - 1)) ? isz - (blk_idx << 4) : 16;
+    memcpy(blk_data, &ibuf[blk_idx << 4], sizeof(uint8_t) * in_size);
+
+    symbol_remapping(blk_data, blk_sr_data, cmd_info->bias0, cmd_info->bias1, cmd_info->signedness, false, false);
+
+    int k = vlc_estimate_block_order(blk_sr_data, false);
+    uint8_t ulen = vlc_gr_enc_block_data(blk_sr_data, &bs_data, k, false);
+    uint8_t k_info = (k == -1) ? 0xE0 : (k << 5) + ulen;
+    write_stream(&bs_kmap, &k_info, 8);
+  }
+
+  int blk_bs_size = ceiling_func(((bs_data.bit_pos + 7) >> 3), 16) << 4; // 16 byte align
+  *osz = header_size + kmap_size + blk_bs_size;
+
+  // write header
+  init_stream(&bs_header, bsbuf, header_size, false);
+  vlc_enc_header(&bs_header, cmd_info, blk_bs_size);
+
+  memcpy(obuf, bsbuf, (*osz) * sizeof(uint8_t));
+  free(bsbuf);
+}
+
+// -- vlc decode int8 entry function --
+static inline void bm_vlc_dec_int8(const uint8_t *ibuf, size_t isz, uint8_t *obuf)
+{
+  StreamBuffer bs_header, bs_kmap, bs_data;
+  CommandInfo cmd_info;
+  memset(&cmd_info, 0, sizeof(CommandInfo));
+
+  size_t blk_num = (isz + 15) >> 4;
+  int header_size = 16;
+  int kmap_size = ceiling_func(blk_num, 16) << 4;
+
+  // parse header
+  init_stream(&bs_header, ibuf, header_size, true);
+  vlc_dec_header(&bs_header, &cmd_info);
+
+  // block decode
+  init_stream(&bs_kmap, ibuf + header_size, kmap_size, true);
+  init_stream(&bs_data, ibuf + header_size + kmap_size, blk_num << 4, true);
+
+  for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++)
+  {
+    uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0};
+    uint8_t k_info = 0;
+    parse_stream(&bs_kmap, &k_info, 8);
+    uint8_t ulen = k_info & 0x1F;
+    int k = (k_info >> 5 == 7) ? -1 : k_info >> 5;
+    int blk_bs_size = (k == -1) ? 128 : (k << 4) + ulen + 16;
+    vlc_gr_dec_block_data(&bs_data, blk_bs_size, blk_data, k, false);
+
+    inv_symbol_remapping(blk_data, blk_sr_data, cmd_info.bias0, cmd_info.bias1, cmd_info.signedness, false, false);
+
+    int out_size = (blk_idx == (blk_num - 1)) ? isz - (blk_idx << 4) : 16;
+    memcpy(&obuf[blk_idx << 4], blk_sr_data, sizeof(uint8_t) * out_size);
+  }
+}
+
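+// Minimal int8 round trip for the two entry points above (illustrative
+// sketch only; `raw` and `len` are hypothetical caller data, and u8/u64 and
+// ceiling_func come from the surrounding test utilities):
+//
+//   CommandInfo cmd_info;
+//   memset(&cmd_info, 0, sizeof(cmd_info));
+//   bm_vlc_est_weight_bias(raw, len, /*signedness=*/true,
+//                          /*isBfloat16=*/false, &cmd_info);
+//
+//   size_t bs_size = 0;
+//   uint8_t *bs = (uint8_t *)malloc(get_out_bs_buf_size(len, /*data_type=*/0));
+//   bm_vlc_enc_int8(raw, len, bs, &bs_size, &cmd_info);
+//
+//   uint8_t *rec = (uint8_t *)malloc(len);
+//   bm_vlc_dec_int8(bs, len, rec); // isz here is the *decompressed* length
+//   // raw and rec should now compare equal over len bytes
+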
+// -- vlc encode bfloat16 entry function --
+static inline void bm_vlc_enc_bf16(const uint16_t *ibuf, size_t isz, uint8_t *obuf, size_t *osz, CommandInfo *cmd_info)
+{
+  StreamBuffer bs_header, bs_kmap, bs_data;
+  size_t blk_num = (isz + 31) >> 5; // 32 bytes per block
+  size_t header_size = 16;
+  size_t kmap_size = ceiling_func(blk_num, 16) << 4;
+  size_t bs_buf_size = header_size + kmap_size + (blk_num << 5);
+  uint8_t *bsbuf = (uint8_t *)calloc(bs_buf_size, sizeof(uint8_t));
+
+  // block encode
+  init_stream(&bs_kmap, bsbuf + header_size, kmap_size, false);
+  init_stream(&bs_data, bsbuf + header_size + kmap_size, blk_num << 5, false);
+
+  for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++)
+  {
+    uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0}, blk_data_frac[16] = {0};
+    size_t in_num = (blk_idx == (blk_num - 1)) ? ((isz >> 1) - (blk_idx << 4)) : 16;
+    dispatch_bf16_data(&ibuf[blk_idx << 4], blk_data, blk_data_frac, in_num);
+
+    // exp: BGR encode
+    symbol_remapping(blk_data, blk_sr_data, cmd_info->bias0, cmd_info->bias1, false, true, cmd_info->zero_guard_en);
+
+    int k = vlc_estimate_block_order(blk_sr_data, cmd_info->zero_guard_en);
+    uint8_t ulen = vlc_gr_enc_block_data(blk_sr_data, &bs_data, k, cmd_info->zero_guard_en);
+    uint8_t k_info = (k == -1) ? 0xE0 : (k << 5) + ulen;
+    write_stream(&bs_kmap, &k_info, 8);
+
+    // frac: implicit zero compression
+    for (size_t i = 0; i < 16; i++)
+    {
+      if (!cmd_info->zero_guard_en || blk_data[i] != 0)
+      {
+        write_stream(&bs_data, &blk_data_frac[i], 8);
+      }
+    }
+  }
+
+  int blk_bs_size = ceiling_func(((bs_data.bit_pos + 7) >> 3), 16) << 4; // 16 byte align
+  *osz = header_size + kmap_size + blk_bs_size;
+
+  // write header
+  init_stream(&bs_header, bsbuf, header_size, false);
+  vlc_enc_header(&bs_header, cmd_info, blk_bs_size);
+
+  memcpy(obuf, bsbuf, (*osz) * sizeof(uint8_t));
+  free(bsbuf);
+}
+
+// -- vlc decode bfloat16 entry function --
+static inline void bm_vlc_dec_bf16(const uint8_t *ibuf, size_t isz, uint16_t *obuf)
+{
+  StreamBuffer bs_header, bs_kmap, bs_data;
+  CommandInfo cmd_info;
+  memset(&cmd_info, 0, sizeof(CommandInfo));
+
+  size_t blk_num = (isz + 31) >> 5; // 32 bytes per block
+  int header_size = 16;
+  int kmap_size = ceiling_func(blk_num, 16) << 4;
+
+  // parse header
+  init_stream(&bs_header, ibuf, header_size, true);
+  vlc_dec_header(&bs_header, &cmd_info);
+
+  // block decode
+  init_stream(&bs_kmap, ibuf + header_size, kmap_size, true);
+  init_stream(&bs_data, ibuf + header_size + kmap_size, blk_num << 5, true);
+
+  for (size_t blk_idx = 0; blk_idx < blk_num; blk_idx++)
+  {
+    uint8_t blk_data[16] = {0}, blk_sr_data[16] = {0}, blk_data_frac[16] = {0};
+    uint8_t k_info = 0;
+    parse_stream(&bs_kmap, &k_info, 8);
+    uint8_t ulen = k_info & 0x1F;
+    int k = (k_info >> 5 == 7) ? -1 : k_info >> 5;
+    int znum_bit = (cmd_info.zero_guard_en && k > 0) ? 4 : 0;
+    uint8_t blk_bs_size = (k == -1) ? 128 : (k << 4) + ulen + 16 + znum_bit;
+
+    // exp: BGR decode
+    vlc_gr_dec_block_data(&bs_data, blk_bs_size, blk_data, k, cmd_info.zero_guard_en);
+
+    inv_symbol_remapping(blk_data, blk_sr_data, cmd_info.bias0, cmd_info.bias1, false, true, cmd_info.zero_guard_en);
+
+    size_t out_num = (blk_idx == (blk_num - 1)) ? ((isz >> 1) - (blk_idx << 4)) : 16;
+
+    // frac: implicit zero compression
+    for (size_t i = 0; i < out_num; i++)
+    {
+      if (!cmd_info.zero_guard_en || blk_sr_data[i] != 0)
+      {
+        parse_stream(&bs_data, &blk_data_frac[i], 8);
+      }
+    }
+    merge_bf16_data(blk_sr_data, blk_data_frac, &obuf[blk_idx << 4], out_num);
+  }
+}
+
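+// How the exp/frac split above carves up one bfloat16 word (illustrative,
+// not part of the original header). bf16 is [sign | 8-bit exp | 7-bit frac];
+// dispatch_bf16_data sends the exponents through center_shift + Golomb-Rice,
+// while the sign+frac bytes are stored raw, except that a zero exponent
+// elides its frac byte when zero_guard_en is set.
+//
+// e.g. 0x4049 (~3.14): exp  = (0x4049 >> 7) & 0xFF         = 0x80
+//                      frac = ((0x4049 >> 15) << 7) | 0x49 = 0x49
+// and merge_bf16_data reassembles 0x4049 from the pair (0x80, 0x49).
+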
+// -- offline estimate model weight params --
+static inline void bm_vlc_est_weight_bias(const uint8_t *ibuf, size_t isz, bool signedness, bool isBfloat16, CommandInfo *cmd_info)
+{
+  assert(!(isBfloat16 && signedness)); // WARNING: signedness MUST be 0 when isBfloat16 == true
+
+  cmd_info->is_bfloat16 = isBfloat16;
+  if (isBfloat16 == false && signedness == true)
+  {
+    // two-side circular shift
+    int hist[256] = {0};
+    for (size_t i = 0; i < isz; i++)
+    {
+      hist[ibuf[i]]++;
+    }
+
+    int8_t pos_v = 1;
+    //while (pos_v < 128)
+    // comparison is always true due to limited range of data type [-Werror=type-limits]
+    while (true)
+    {
+      if (hist[((uint8_t)pos_v)] == 0)
+      {
+        pos_v++;
+      }
+      else
+      {
+        break;
+      }
+    }
+    //cmd_info->bias0 = (pos_v > 1 && pos_v < 128) ? (pos_v - 1) : 0;
+    // comparison is always true due to limited range of data type [-Werror=type-limits]
+    cmd_info->bias0 = (pos_v > 1) ? (pos_v - 1) : 0;
+    int8_t neg_v = -1;
+    //while (neg_v >= (-128)) // comparison is always true due to limited range of data type [-Werror=type-limits]
+    while (true)
+    {
+      if (hist[(uint8_t)neg_v] == 0)
+      {
+        neg_v--;
+      }
+      else
+      {
+        break;
+      }
+    }
+    //cmd_info->bias1 = (neg_v < -1 && neg_v >= -128) ? abs(neg_v + 1) : 0;
+    // comparison is always true due to limited range of data type [-Werror=type-limits]
+    cmd_info->bias1 = (neg_v < -1) ? abs(neg_v + 1) : 0;
+    cmd_info->signedness = true;
+  }
+
+  if (isBfloat16 == true)
+  {
+    // center shift
+    int64_t exp_accum = 0;
+    uint16_t *bf16_in = (uint16_t *)ibuf;
+    size_t inum = (isz >> 1), cnt = 0;
+    for (size_t i = 0; i < inum; i++)
+    {
+      uint8_t exp = ((bf16_in[i] >> 7) & 0xFF);
+      if (exp != 0)
+      {
+        exp_accum += exp;
+        cnt++;
+      }
+    }
+    if (cnt > 0)
+    {
+      cmd_info->bias0 = (uint8_t)((exp_accum / (float)cnt) + 0.5);
+    }
+    cmd_info->zero_guard_en = (inum == cnt) ?
false : true; + cmd_info->signedness = false; + } +} + #ifdef __cplusplus +} +#endif + +#endif /* __BM_VLC_COMPRESS_H__ */ diff --git a/cviruntime/test/1880v2/compression.h b/cviruntime/test/1880v2/compression.h new file mode 100644 index 000000000..10452c738 --- /dev/null +++ b/cviruntime/test/1880v2/compression.h @@ -0,0 +1,367 @@ +#ifndef COMPRESSION_H +#define COMPRESSION_H + +typedef struct { + u32 compress_md; + u32 bit_length; + int is_signed; + + u64 total_data_num; + u32 non_zero_data_num; + + u64 header_bytes; + u64 map_bytes; + u64 data_bytes; + u64 total_bytes; + + int compressed_min; + int compressed_max; +} compression_info_t; + +typedef struct { + u64 header_offset; + u64 header_size; + u64 map_offset; + u64 map_size; + u64 data_offset; + u64 data_size; + u64 total_size; +} compress_addr_info; + +static u64 compression_map_bytes(u64 total_data_num) +{ + u64 bit_alignment = 16 * 8; + u64 bits = total_data_num; + + return ceiling_func(bits, bit_alignment)*16; +} + +static u64 compression_map_clear_bytes(u64 total_data_num) +{ + u64 bit_alignment = 2 * 8; + u64 bits = total_data_num; + + return ceiling_func(bits, bit_alignment)*2; +} + + +static u64 compression_data_bytes(u64 non_zero_data_num, u32 bit_length) +{ + if (bit_length == 1) + return 0; + + u64 bit_alignment = 8; + u64 bits = non_zero_data_num * bit_length; + + return ceiling_func(bits, bit_alignment); +} + +static inline u32 compression_bit_length(u32 compress_md) +{ + switch (compress_md) { + case 0: + return 8; + case 1: + return 4; + case 2: + return 2; + case 3: + return 1; + default: + assert(0); + } +} + +static inline void compute_compressed_range( + u32 bit_length, int is_signed, int *min, int *max) +{ + if (is_signed) { + switch (bit_length) { + case 1: + *min = -1; + *max = 0; + return; + case 2: + *min = -2; + *max = 1; + return; + case 4: + *min = -8; + *max = 7; + return; + case 8: + *min = -128; + *max = 127; + return; + } + } else { + *min = 0; + switch (bit_length) { + case 1: + *max = 1; + return; + case 2: + *max = 3; + return; + case 4: + *max = 15; + return; + case 8: + *max = 255; + return; + } + } + assert(0); +} + +static inline int saturate(int val, int max, int min) +{ + if (val < min) + return min; + else if (val > max) + return max; + else + return val; +} + +static inline u64 count_non_zero_results( + u8 buf[], u64 size, int is_signed, int max, int min) +{ + u64 n = 0; + + for (u64 i = 0; i < size; i++) { + int val = is_signed? 
(s8)buf[i]: buf[i]; + int res = saturate(val, max, min); + if (res != 0) + n++; + } + + return n; +} + +static inline void set_map_bit(u8 map[], u64 i) +{ + u64 byte_i = i / 8; + u64 bit_i = i % 8; + + map[byte_i] |= (1 << bit_i); +} + +static inline u8 read_map_bit(u8 map[], u64 i) +{ + u64 byte_i = i / 8; + u64 bit_i = i % 8; + + return (map[byte_i] >> bit_i) & 1; +} + +static inline void parse_header( + u32 header, int *is_signed, u32 *compress_md, u32 *nz_num) +{ + *is_signed = (header >> 29) & 1; + *compress_md = (header >> 24) & 0b11; + *nz_num = header & 0xffffff; +} + +static inline void fill_header(u32 *hdr, compression_info_t *info) +{ + if(compression_bit_length(info->compress_md)!=1) + { + *hdr = (info->is_signed << 29) | (1 << 28) | + (info->compress_md << 24) | + info->non_zero_data_num; + }else + { + *hdr = (info->is_signed << 29) | (1 << 28) | + (info->compress_md << 24); + } +} + +static inline void fill_map(u8 map[], u8 buf[], compression_info_t *info) +{ + int min = info->compressed_min; + int max = info->compressed_max; + + u64 clear_map = compression_map_clear_bytes(info->total_data_num); + for (u64 i = 0; i < clear_map; i++) + map[i] = 0; + + for (u64 i = 0; i < info->total_data_num; i++) { + int val = info->is_signed? (s8)buf[i]: buf[i]; + int res = saturate(val, max, min); + if (res != 0) + set_map_bit(map, i); + } +} + +static inline void compress_one_data( + u8 data[], u64 i, u8 val, compression_info_t *info) +{ + u32 bit_len = info->bit_length; + u32 data_per_byte = 8 / bit_len; + + u32 byte_i = i / data_per_byte; + u32 bit_i = (i % data_per_byte) * bit_len; + u8 mask = (1 << bit_len) - 1; + + data[byte_i] |= (val & mask) << bit_i; +} + +static inline u8 sign_extend(u8 val, u32 bit_len) +{ + int shift = 8 - bit_len; + return (s8)(val << shift) >> shift; +} + +static inline u8 decompress_one_data( + u8 data[], u64 i, compression_info_t *info) +{ + u32 bit_len = info->bit_length; + u32 data_per_byte = 8 / bit_len; + + u32 byte_i = i / data_per_byte; + u32 bit_i = (i % data_per_byte) * bit_len; + u8 mask = (1 << bit_len) - 1; + + u8 val = (data[byte_i] >> bit_i) & mask; + if (info->is_signed) + val = sign_extend(val, bit_len); + + return val; +} + +static inline void fill_data(u8 data[], u8 buf[], compression_info_t *info) +{ + int min = info->compressed_min; + int max = info->compressed_max; + + for (u64 i = 0; i < info->data_bytes; i++) + data[i] = 0; + + u64 nz_i = 0; + for (u64 i = 0; i < info->total_data_num; i++) { + int val = info->is_signed? 
(s8)buf[i]: buf[i];
+    int res = saturate(val, max, min);
+    if (res != 0) {
+      compress_one_data(data, nz_i, res, info);
+      nz_i++;
+    }
+  }
+}
+
+static inline compression_info_t make_compression_info(
+    u8 buf[], u64 size, u32 compress_md, int is_signed)
+{
+  u32 bit_length = compression_bit_length(compress_md);
+
+  int min, max;
+  compute_compressed_range(bit_length, is_signed, &min, &max);
+
+  u32 nz_num = count_non_zero_results(buf, size, is_signed, max, min);
+  assert(nz_num <= 0xffffff);
+
+  compression_info_t info;
+  info.compress_md = compress_md;
+  info.bit_length = bit_length;
+  info.is_signed = is_signed;
+  info.total_data_num = size;
+  info.non_zero_data_num = nz_num;
+  info.header_bytes = 16;
+  info.map_bytes = compression_map_bytes(size);
+  info.data_bytes = compression_data_bytes(nz_num, bit_length);
+  info.total_bytes = info.header_bytes + info.map_bytes + info.data_bytes;
+  info.compressed_min = min;
+  info.compressed_max = max;
+  return info;
+}
+
+static inline compression_info_t parse_compression_info(
+    u8 compressed_buf[], u64 max_size, u64 total_data_num)
+{
+  u64 header_bytes = 16;
+  assert(header_bytes <= max_size);
+
+  int is_signed;
+  u32 compress_md, nz_num;
+  parse_header(*(u32 *)compressed_buf, &is_signed, &compress_md, &nz_num);
+
+  u32 bit_length = compression_bit_length(compress_md);
+  int min, max;
+  compute_compressed_range(bit_length, is_signed, &min, &max);
+
+  compression_info_t info;
+  info.compress_md = compress_md;
+  info.bit_length = compression_bit_length(compress_md);
+  info.is_signed = is_signed;
+  info.total_data_num = total_data_num;
+  info.non_zero_data_num = nz_num;
+  info.header_bytes = header_bytes;
+  info.map_bytes = compression_map_bytes(total_data_num);
+  info.data_bytes = compression_data_bytes(nz_num, info.bit_length);
+  info.total_bytes = header_bytes + info.map_bytes + info.data_bytes;
+  info.compressed_min = min;
+  info.compressed_max = max;
+
+  assert(info.total_bytes <= max_size);
+
+  return info;
+}
+
+static inline u8 * compress(
+    u8 buf[], u64 size, u32 compress_md, int is_signed, compress_addr_info *compressed_data)
+{
+  compression_info_t info =
+      make_compression_info(buf, size, compress_md, is_signed);
+
+  assert(info.total_bytes < 0x100000);
+  // Note: a single static 1 MiB scratch buffer is reused across calls, so
+  // the returned pointer is only valid until the next call and must not be
+  // freed by the caller.
+  static u8 *result = (u8 *)malloc(sizeof(u8) * 0x100000);
+  u32 *hdr = (u32 *)result;
+  u8 *map = &result[info.header_bytes];
+  u8 *data = &map[info.map_bytes];
+
+  fill_header(hdr, &info);
+  fill_map(map, buf, &info);
+  if (info.bit_length != 1)
+    fill_data(data, buf, &info);
+
+  compressed_data->header_offset = 0;
+  compressed_data->header_size = 4;
+  compressed_data->map_offset = info.header_bytes;
+  compressed_data->map_size = compression_map_clear_bytes(info.total_data_num);
+  compressed_data->data_offset = info.map_bytes + info.header_bytes;
+  compressed_data->data_size = info.data_bytes;
+  compressed_data->total_size = info.total_bytes;
+
+  return result;
+}
+
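+// Illustrative round trip for the helpers above (not part of the original
+// header; `raw` and `size` are hypothetical caller data):
+//
+//   compress_addr_info ci;
+//   u8 *bs = compress(raw, size, /*compress_md=*/0, /*is_signed=*/1, &ci);
+//   u8 *rec = (u8 *)malloc(size);
+//   decompress(rec, size, bs, ci.total_size);
+//   // rec holds raw with every value saturated to the compress_md range;
+//   // do not free(bs), it points into compress()'s reused static buffer.
+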
+static inline void decompress(
+    u8 buf[], u64 size, u8 compressed_buf[], u64 max_size)
+{
+  compression_info_t info =
+      parse_compression_info(compressed_buf, max_size, size);
+  assert(info.total_bytes <= max_size);
+  assert(info.total_data_num == size);
+
+  u8 *map = &compressed_buf[info.header_bytes];
+  if (info.bit_length == 1) {
+    for (u64 i = 0; i < size; i++) {
+      u8 val = read_map_bit(map, i);
+      buf[i] = info.is_signed? sign_extend(val, 1): val;
+    }
+  } else {
+    u8 *data = &map[info.map_bytes];
+    u64 data_i = 0;
+    for (u64 i = 0; i < size; i++) {
+      u8 val = read_map_bit(map, i);
+      if (val == 0) {
+        buf[i] = 0;
+      } else {
+        buf[i] = decompress_one_data(data, data_i, &info);
+        data_i++;
+      }
+    }
+  }
+}
+
+#endif /* COMPRESSION_H */
diff --git a/cviruntime/test/1880v2/test_1880v2_avg_pooling.cpp b/cviruntime/test/1880v2/test_1880v2_avg_pooling.cpp
new file mode 100644
index 000000000..9453cea30
--- /dev/null
+++ b/cviruntime/test/1880v2/test_1880v2_avg_pooling.cpp
@@ -0,0 +1,216 @@
+#include "1880v2_test_util.h"
+
+typedef bmk1880v2_tiu_average_pooling_param_t param_t;
+
+static void print_pooling_param(const param_t *p)
+{
+  int in = p->ifmap->shape.n;
+  int ic = p->ifmap->shape.c;
+  int ih = p->ifmap->shape.h;
+  int iw = p->ifmap->shape.w;
+
+  printf("  Pooling parameters:\n");
+  printf("    ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw);
+  printf("    opd0_sign = %d\n", p->ifmap->fmt == FMT_I8);
+  printf("    weight = (%d, %d)\n", p->kh, p->kw);
+  printf("    padding = (%d, %d, %d, %d)\n",
+         p->pad_top, p->pad_bottom, p->pad_left, p->pad_right);
+  printf("    stride = (%d, %d)\n", p->stride_h, p->stride_w);
+  printf("    ins0 = (%d, %d, %d, %d)\n",
+         p->ins_h, p->ins_last_h, p->ins_w, p->ins_last_w);
+  printf("    avg_pooling_const = %d\n", p->avg_pooling_const);
+  printf("    rshift_bits = %d\n", p->rshift_bits);
+}
+
+static s8 *alloc_input(param_t *p)
+{
+  u64 size = tl_shape_size(&p->ifmap->shape);
+  s8 *data = (s8 *)xmalloc(size);
+  if (!data)
+    return NULL;
+
+  for (u64 i = 0; i < size; i++)
+    data[i] = rand() % 256 - 128;
+  return data;
+}
+
+static s8 *alloc_output(param_t *p)
+{
+  u64 size = tl_shape_size(&p->ofmap->shape);
+  return (s8 *)xmalloc(size);
+}
+
+// Extended input size after data insertion (ins/ins_last) and padding.
+static int pooling_ih_ext(param_t *p, int ih)
+{
+  int ins = p->ins_h;
+  int ins_last = p->ins_last_h;
+  int pad = p->pad_top + p->pad_bottom;
+  return (ih - 1) * (ins + 1) + ins_last + 1 + pad;
+}
+
+static int pooling_iw_ext(param_t *p, int iw)
+{
+  int ins = p->ins_w;
+  int ins_last = p->ins_last_w;
+  int pad = p->pad_left + p->pad_right;
+  return (iw - 1) * (ins + 1) + ins_last + 1 + pad;
+}
+
+static int pooling_oh(param_t *p, int ih)
+{
+  int ih_ext = pooling_ih_ext(p, ih);
+  return (ih_ext - p->kh) / p->stride_h + 1;
+}
+
+static int pooling_ow(param_t *p, int iw)
+{
+  int iw_ext = pooling_iw_ext(p, iw);
+  return (iw_ext - p->kw) / p->stride_w + 1;
+}
+
+static void free_pooling_param(
+    bmk_ctx_t *ctx,
+    param_t *p)
+{
+  if (p->ifmap)
+    free_tl(ctx, p->ifmap);
+  if (p->ofmap)
+    free_tl(ctx, p->ofmap);
+}
+
+static param_t random_pooling_param(bmk_ctx_t *ctx)
+{
+  srand(clock());
+  param_t p;
+
+  memset(&p, 0, sizeof(p));
+
+retry:
+  int in = rand() % 5 + 1;
+  int ic = rand() % (3 * BM1880V2_HW_NPU_NUM) + 1;
+  int ih = rand() % 30 + 3;
+  int iw = rand() % 30 + 6;
+  int opd0_sign = rand() % 2;
+
+  p.kh = rand() % 7 + 1;
+  p.kw = rand() % 7 + 1;
+  p.stride_h = rand() % p.kh + 1;
+  p.stride_w = rand() % p.kw + 1;
+  p.ins_h = rand() % p.kh;
+  p.ins_w = rand() % p.kw;
+  p.ins_last_h = rand() % p.kh;
+  p.ins_last_w = rand() % p.kw;
+  p.pad_top = rand() % p.kh;
+  p.pad_bottom = rand() % p.kh;
+  p.pad_left = rand() % p.kw;
+  p.pad_right = rand() % p.kw;
+  p.avg_pooling_const = rand() % 256;
+  p.rshift_bits = rand() % 32;
+
+  tl_shape_t ifmap_shape;
+  ifmap_shape.n = in;
+  ifmap_shape.c = ic;
+  ifmap_shape.h = ih;
+  ifmap_shape.w = iw;
+
+  int on = in;
+  int oc = ic;
+  int oh = pooling_oh(&p, ih);
+  int ow = pooling_ow(&p, iw);
+  tl_shape_t ofmap_shape;
+  ofmap_shape.n =
on; + ofmap_shape.c = oc; + ofmap_shape.h = oh; + ofmap_shape.w = ow; + + fmt_t fmt = opd0_sign? FMT_I8: FMT_U8; + p.ofmap = bmk1880v2_lmem_alloc_tensor(ctx, ofmap_shape, FMT_I8, 1); + p.ifmap = bmk1880v2_lmem_alloc_tensor(ctx, ifmap_shape, fmt, 1); + + if ((p.kh > pooling_ih_ext(&p, ih)) + || (p.kw > pooling_iw_ext(&p, iw)) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || !p.ofmap + || !p.ifmap) { + printf("retry init_pooling_param\n"); + free_pooling_param(ctx, &p); + goto retry; + } + + return p; +} + +static void compare_results( + param_t *p, + s8 input[], + s8 output[]) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int opd0_sign = (p->ifmap->fmt == FMT_I8); + + s8 *output_ref = alloc_output(p); + bmerr_t ret = native_pooling_ave_int8( + input, &p->avg_pooling_const, NULL, output_ref, + in, ic, ih, iw, p->kh, p->kw, + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right, + p->stride_h, p->stride_w, + p->ins_h, p->ins_w, p->ins_last_h, p->ins_last_w, + opd0_sign, opd0_sign, p->rshift_bits, 1); + assert(ret == BM_SUCCESS); + + int cmp_res = array_cmp_int8( + "Comparing results ...\n", output_ref, output, + tl_shape_size(&p->ofmap->shape)); + + if (cmp_res != 0) { + printf("Comparison FAILED!!!\n"); + print_pooling_param(p); + exit(-1); + } + + free(output_ref); +} + +static int test_pooling(CVI_RT_HANDLE ctx, bmk_ctx_t *bk_ctx) +{ + param_t p = random_pooling_param(bk_ctx); + s8 *input = alloc_input(&p); + + put_tensor_g2l(&ctx, bk_ctx, p.ifmap, (u8 *)input); + bmk1880v2_tiu_average_pooling(bk_ctx, &p); + s8 *output = (s8 *)get_tensor_l2g(&ctx, bk_ctx, p.ofmap); + + compare_results(&p, input, output); + + free_pooling_param(bk_ctx, &p); + free(output); + free(input); + + return 1; +} + +static void test_avg_pooling(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx) +{ + int test_finished_num = 0; + for (u64 i = 0; i < 16; i++) + test_finished_num += test_pooling(*ctx, bk_ctx); + printf("Test finished %d\n", test_finished_num); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_avg_pooling(&ctx, bk_ctx); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_base_reg_selection.cpp b/cviruntime/test/1880v2/test_1880v2_base_reg_selection.cpp new file mode 100644 index 000000000..aa6a3e767 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_base_reg_selection.cpp @@ -0,0 +1,285 @@ +#include "1880v2_test_util.h" + +typedef struct { + long index; + long offset; +}Base_reg; + +#define BASE_REG_LIMIT 0x900000 +Base_reg base_reg[]={ + {0, 0x000000 }, + {1, 0x100000 }, + {2, 0x200000 }, + {3, 0x300000 }, + {4, 0x400000 }, + {5, 0x500000 }, + {6, 0x600000 }, + {7, 0x700000 }, +}; + +static void test_tensor_base_selection( + CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, u32 reg_index, int offset) +{ + int n = 2; + int c = 66; + int h = 3; + int w = 15; + + int size = n * c * h * w; + u8 *data_x = (u8 *)xmalloc(size); + + for (int i = 0; i < size; i++) + data_x[i] = i - 100; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + tg_shape_t ts_shape; + ts_shape.n = n; + ts_shape.c = c; + ts_shape.h = h; + ts_shape.w = w; + + tl_t *tl_x = alloc_tl(bk_ctx, tl_shape, FMT_I8, 1); + + /* + * Copy test data to the fixed address.(gaddr + offset) + */ + bmshape_t bms = BM_TENSOR_INT8((int)n, (int)c, (int)h, (int)w); + CVI_RT_MEM dev_mem = 
CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms) + BASE_REG_LIMIT); + gaddr_t gaddr = CVI_RT_MemGetPAddr(dev_mem); + //CVI_RT_MEM ab_dev_mem = bmmem_device_prealloc(*ctx, NULL, gaddr + offset, &bms); + CVI_RT_MEM ab_dev_mem = CVI_RT_MemPreAlloc(dev_mem, offset, bmshape_get_size(&bms)); + + int ret = CVI_RT_MemCopyS2D(*ctx, ab_dev_mem, data_x); + assert(ret == BM_SUCCESS); + + /* + * tensor transfer + * g2l array base = offset, index = reg_index + * l2g array base = 0, index = 0 + */ + CVI_RT_SetBaseReg(*ctx, reg_index, offset); + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape = ts_shape; + tg.stride = bmk1880v2_tensor_tgmem_default_stride(ts_shape, tg.fmt); + tg.base_reg_index = reg_index; + + bmk1880v2_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl_x; + + bmk1880v2_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + CVI_RT_SetBaseReg(*ctx, 0, 0); + u8 *result_x = get_tensor_l2g(ctx, bk_ctx, tl_x); + for (int i = 0; i < size; i++) { + if (result_x[i] != data_x[i]) { + fprintf(stderr, "compare failed at result_x[%d]\n", i); + exit(-1); + } + } + free(result_x); + + /* + * tensor transfer + * g2l array base = 0, index = reg_index + * l2g array base = 0, index = 0 + */ + CVI_RT_SetBaseReg(*ctx, reg_index, 0); + tg.start_address = gaddr + offset; + bmk1880v2_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + CVI_RT_SetBaseReg(*ctx, 0, 0); + result_x = get_tensor_l2g(ctx, bk_ctx, tl_x); + for (int i = 0; i < size; i++) { + if (result_x[i] != data_x[i]) { + fprintf(stderr, "compare failed at result_x[%d]\n", i); + exit(-1); + } + } + + /* + * tensor transfer + * g2l, array base = offset, index = reg_index + * l2g, array_base = offset, index = reg_index + */ + CVI_RT_SetBaseReg(*ctx, reg_index, offset); + tg.start_address = gaddr; + tg.base_reg_index = reg_index; + bmk1880v2_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + CVI_RT_SetBaseReg(*ctx, reg_index, offset); + tg.start_address = gaddr; + bmk1880v2_tdma_l2tg_tensor_copy_param_t l2g_p; + memset(&l2g_p, 0, sizeof(l2g_p)); + l2g_p.src = tl_x; + l2g_p.dst = &tg; + bmk1880v2_tdma_l2g_tensor_copy(bk_ctx, &l2g_p); + test_submit(ctx); + ret = CVI_RT_MemCopyD2S(*ctx, result_x,ab_dev_mem); + assert(ret == BM_SUCCESS); + + for (int i = 0; i < size; i++) { + if (result_x[i] != data_x[i]) { + fprintf(stderr, "compare failed at result_x[%d]\n", i); + exit(-1); + } + } + + CVI_RT_SetBaseReg(*ctx, 0, 0); + CVI_RT_SetBaseReg(*ctx, 1, 0); + free(result_x); + free_tl(bk_ctx, tl_x); + CVI_RT_MemFree(*ctx, dev_mem); + CVI_RT_MemFree(*ctx, ab_dev_mem); + free(data_x); +} +static void test_matrix_base_selection( + CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, u32 reg_index, int offset) +{ + int row = 5; + int col = 16 * 5 + 2; + int size = row * col; + + u8 *data_x = (u8 *)xmalloc(size); + + for (int i = 0; i < size; i++) + data_x[i] = i - 100; + + ml_shape_t ml_shape = + bmk1880v2_matrix_lmem_default_shape(bk_ctx, row, col, FMT_I8); + mg_shape_t mg_shape; + mg_shape.row = row; + mg_shape.col = col; + + ml_t *ml = + bmk1880v2_lmem_alloc_matrix(bk_ctx, ml_shape, FMT_I8, 1); + + /* + * Copy test data to the specified offset address. 
+ */ + + bmshape_t bms = BM_MATRIX_INT8(row,col); + CVI_RT_MEM dev_mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms) + BASE_REG_LIMIT); + gaddr_t gaddr = CVI_RT_MemGetPAddr(dev_mem); + //CVI_RT_MEM ab_dev_mem = bmmem_device_prealloc(*ctx, NULL, gaddr + offset, &bms); + CVI_RT_MEM ab_dev_mem = CVI_RT_MemPreAlloc(dev_mem, offset, bmshape_get_size(&bms)); + + int ret = CVI_RT_MemCopyS2D(*ctx, ab_dev_mem, data_x); + assert(ret == BM_SUCCESS); + + /* + * matrix transfer + * g2l array base = offset, index = reg_index + * l2g array base = 0, index = 0 + */ + CVI_RT_SetBaseReg(*ctx, reg_index, offset); + mg_t mg; + mg.base_reg_index = 0; + mg.start_address = gaddr; + mg.shape = mg_shape; + mg.stride.row = mg_shape.col; + mg.base_reg_index = reg_index; + + bmk1880v2_tdma_tg2l_matrix_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &mg; + p.dst = ml; + + bmk1880v2_tdma_g2l_matrix_copy(bk_ctx, &p); + test_submit(ctx); + + CVI_RT_SetBaseReg(*ctx, 0, 0); + u8 *result_x = get_matrix_l2g(ctx, bk_ctx, ml); + for (int i = 0; i < size; i++) { + if (result_x[i] != data_x[i]) { + fprintf(stderr, "compare failed at result_x[%d]\n", i); + exit(-1); + } + } + free(result_x); + + /* + * matrix transfer + * g2l array base = 0, index = reg_index + * l2g array base = 0, index = 0 + */ + CVI_RT_SetBaseReg(*ctx, reg_index, 0); + mg.start_address = gaddr + offset; + bmk1880v2_tdma_g2l_matrix_copy(bk_ctx, &p); + test_submit(ctx); + + CVI_RT_SetBaseReg(*ctx, 0, 0); + result_x = get_matrix_l2g(ctx, bk_ctx, ml); + for (int i = 0; i < size; i++) { + if (result_x[i] != data_x[i]) { + fprintf(stderr, "compare failed at result_x[%d]\n", i); + exit(-1); + } + } + + /* + * Matrix transfer + * g2l, array base = offset, index = reg_index + * l2g, array_base = offset, index = reg_index + */ + CVI_RT_SetBaseReg(*ctx, reg_index, offset); + mg.start_address = gaddr; + mg.base_reg_index = reg_index; + bmk1880v2_tdma_g2l_matrix_copy(bk_ctx, &p); + test_submit(ctx); + + mg.start_address = gaddr; + bmk1880v2_tdma_l2tg_matrix_copy_param_t l2g_p; + memset(&l2g_p, 0, sizeof(l2g_p)); + l2g_p.src = ml; + l2g_p.dst = &mg; + + CVI_RT_SetBaseReg(*ctx, reg_index, offset); + + bmk1880v2_tdma_l2g_matrix_copy(bk_ctx, &l2g_p); + test_submit(ctx); + + ret = CVI_RT_MemCopyD2S(*ctx, result_x,ab_dev_mem); + assert(ret == BM_SUCCESS); + + for (int i = 0; i < size; i++) { + if (result_x[i] != data_x[i]) { + fprintf(stderr, "compare failed at result_x[%d]\n", i); + exit(-1); + } + } + free(result_x); + + CVI_RT_SetBaseReg(*ctx, 0, 0); + CVI_RT_SetBaseReg(*ctx, 1, 0); + bmk1880v2_lmem_free_matrix(bk_ctx, ml); + CVI_RT_MemFree(*ctx, dev_mem); + CVI_RT_MemFree(*ctx, ab_dev_mem); + free(data_x); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + for(int i=0; i<8; i ++) + { + test_matrix_base_selection(&ctx, bk_ctx, base_reg[i].index, base_reg[i].offset ); + test_tensor_base_selection(&ctx, bk_ctx, base_reg[i].index, base_reg[i].offset); + } + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_conv.cpp b/cviruntime/test/1880v2/test_1880v2_conv.cpp new file mode 100644 index 000000000..9e36948c9 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_conv.cpp @@ -0,0 +1,738 @@ +#include "1880v2_test_util.h" + +typedef struct { + int random_seed; + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int 
stride_w;
+  int output_c;
+  int using_bias;
+  int bReLU_EN;
+  int r_shift_m;
+  int opd0_sign;
+  int opd1_sign;
+  int opd2_sign;
+} conv_param_t;
+
+static int index_get(int h, int w1, int w2)
+{
+  return h * w1 + w2;
+}
+
+static int matrix_dot_mult(
+    s8 *A, s8 *B, int dim_n, int dim_m,
+    int opd0_sign)
+{
+  int sum = 0;
+  for (int i = 0; i < dim_n; i++) {
+    for (int j = 0; j < dim_m; j++) {
+      int idx = index_get(i, dim_m, j);
+      int a = opd0_sign ? (int)A[idx] : (int)((u8)A[idx]);
+      int b = (int)B[idx];
+      sum += a * b;
+    }
+  }
+  return sum;
+}
+
+static int conv_ref(
+    const conv_param_t *p_param,
+    s8 *ifmap,
+    s8 *weight,
+    s16 *bias,
+    s8 *ofmap)
+{
+  int in = p_param->input_n;
+  int ic = p_param->input_c;
+  int ih = p_param->input_h;
+  int iw = p_param->input_w;
+  int oc = p_param->output_c;
+  int kh = p_param->kh;
+  int kw = p_param->kw;
+  int dh = p_param->dh;
+  int dw = p_param->dw;
+  int stride_h = p_param->stride_h;
+  int stride_w = p_param->stride_w;
+  int pad_top = p_param->pad_top;
+  int pad_bot = p_param->pad_bot;
+  int pad_left = p_param->pad_left;
+  int pad_right = p_param->pad_right;
+  int ins_h = p_param->ins_h;
+  int ins_h_last = p_param->ins_h_last;
+  int ins_w = p_param->ins_w;
+  int ins_w_last = p_param->ins_w_last;
+  int input_sign = p_param->opd0_sign;
+  int r_shift_bits = p_param->r_shift_m;
+  int do_relu = p_param->bReLU_EN;
+
+  int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot);
+  int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right);
+  int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0);
+  int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0);
+
+  int oh = calc_output_hw(ih_ext, kh_ext, stride_h);
+  int ow = calc_output_hw(iw_ext, kw_ext, stride_w);
+
+  int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow);
+  s8 *i_fmap_pad_ker = (s8 *)malloc(kh_ext * kw_ext);
+  if (!result || !i_fmap_pad_ker) {
+    free(result);
+    free(i_fmap_pad_ker);
+    return BM_ERR_FAILURE;
+  }
+
+  memset(result, 0, sizeof(int) * in * oc * oh * ow);
+
+  int ret = BM_SUCCESS;
+
+  s8 *i_fmap_pad = NULL;
+  s8 *kernel_after = NULL;
+  for (int n = 0; n < in; ++n) {
+    for (int c = 0; c < oc; ++c) {
+      for (int cc = 0; cc < ic; ++cc) {
+        fill_pad_fmap_int8(
+            (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0,
+            pad_left, pad_right, pad_top, pad_bot,
+            ins_h, ins_w, ins_h_last, ins_w_last, ih, iw);
+
+        //kernel_dilation(
+        fill_pad_fmap_int8(
+            (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0,
+            0, 0, 0, 0, // no padding
+            dh - 1, dw - 1, 0, 0,
+            kh, kw);
+
+        for (int ph = 0; ph < oh; ++ph) {
+          for (int pw = 0; pw < ow; ++pw) {
+            for (int idxh = 0; idxh < kh_ext; ++idxh)
+              for (int idxw = 0; idxw < kw_ext; ++idxw){
+                i_fmap_pad_ker[idxh * kw_ext + idxw] =
+                    i_fmap_pad[(idxh+ph*stride_h) * iw_ext +
+                               idxw + pw*stride_w];
+              }
+            result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] +=
+                matrix_dot_mult(i_fmap_pad_ker, kernel_after,
+                                kh_ext, kw_ext, input_sign);
+          }
+        }
+      }
+
+      if (p_param->using_bias) {
+        for (int ph = 0; ph < oh; ++ph) {
+          for (int pw = 0; pw < ow; ++pw) {
+            result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c]; //bias+c ;
+          }
+        }
+      }
+
+      if (do_relu)
+        relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow);
+
+      // ofmap is s8, signed
+      ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow,
+          &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1,
+          /*sign_unsign=*/1);
+
+      if (ret != BM_SUCCESS)
+        goto error_release;
+    } //end for (int c = 0; c < oc; ++c)
+  } //end for (int n = 0; n < in; n++)
+
+error_release:
+  free(i_fmap_pad);
+  free(kernel_after);
+  free(i_fmap_pad_ker);
+  free(result);
+
+  return ret;
+}
+
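+// Worked size example for the reference above (illustrative only). Insertion
+// dilates the data and dilation dilates the kernel:
+//   ih_ext = (ih - 1) * (ins_h + 1) + ins_h_last + 1 + pad_top + pad_bot
+//   kh_ext = (kh - 1) * dh + 1
+// With ih = 8, ins_h = 1, ins_h_last = 0, pad_top = pad_bot = 1, kh = 3,
+// dh = 2, stride_h = 2:
+//   ih_ext = 7*2 + 1 + 2 = 17, kh_ext = 5, oh = (17 - 5) / 2 + 1 = 7.
+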
+static u8 * transform_weight(const tl_shape_t *s, u8 before[])
+{
+  u32 ic = s->n;
+  u32 oc = s->c;
+  u32 kh = s->h;
+  u32 kw = s->w;
+
+  u32 size = ic * oc * kh * kw;
+  u8 *after = (u8 *)malloc(sizeof(u8) * size);
+
+  /*
+   * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic)
+   */
+  for (u32 oci = 0; oci < oc; oci++) {
+    for (u32 ici = 0; ici < ic; ici++) {
+      for (u32 khi = 0; khi < kh; khi++) {
+        for (u32 kwi = 0; kwi < kw; kwi++) {
+          u32 src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi;
+          u32 dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici;
+          after[dst_i] = before[src_i];
+        }
+      }
+    }
+  }
+
+  return after;
+}
+
+static void put_conv_weight(
+    CVI_RT_HANDLE *ctx,
+    bmk_ctx_t *bk_ctx,
+    const tl_t *tl,
+    u8 *data)
+{
+  const tl_shape_t *s = &tl->shape;
+  u32 ic = s->n;
+  u32 oc = s->c;
+  u32 kh = s->h;
+  u32 kw = s->w;
+
+  bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw);
+  CVI_RT_MEM dev_mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms));
+  gaddr_t gaddr = CVI_RT_MemGetPAddr(dev_mem);
+  u8 *transformed_data = transform_weight(s, data);
+
+  /* We put the weight in region 1. CVI_RT_MemCopyS2D treats dev_mem as an
+   * absolute address, so we should pass the absolute address to copy the
+   * weight to the right place.
+   */
+
+  //u64 ab_addr = bm_device_read_base_reg(*ctx, 1);
+  //CVI_RT_MEM ab_dev_mem =
+  //    bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms);
+
+  //int ret = CVI_RT_MemCopyS2D(*ctx, ab_dev_mem, transformed_data);
+  int ret = CVI_RT_MemCopyS2D(*ctx, dev_mem, transformed_data);
+  assert(ret == BM_SUCCESS);
+
+  tl_shape_t tdma_shape = { 1, oc, kh * kw, ic };
+
+  tg_t tdma_tg;
+  tdma_tg.base_reg_index = 0;
+  tdma_tg.start_address = gaddr;
+  tdma_tg.fmt = FMT_I8;
+  tdma_tg.shape.n = tdma_shape.n;
+  tdma_tg.shape.c = tdma_shape.c;
+  tdma_tg.shape.h = tdma_shape.h;
+  tdma_tg.shape.w = tdma_shape.w;
+  tdma_tg.stride = bmk1880v2_tensor_tgmem_default_stride(tdma_tg.shape, tdma_tg.fmt);
+  tdma_tg.base_reg_index = 1;
+
+  tl_t tdma_tl = *tl;
+  tdma_tl.shape = tdma_shape;
+  tdma_tl.stride = bmk1880v2_tensor_lmem_default_stride(bk_ctx, tdma_shape, FMT_I8, 0);
+
+  bmk1880v2_tdma_tg2l_tensor_copy_param_t p;
+  memset(&p, 0, sizeof(p));
+  p.src = &tdma_tg;
+  p.dst = &tdma_tl;
+
+  bmk1880v2_tdma_g2l_tensor_copy(bk_ctx, &p);
+  test_submit(ctx);
+  CVI_RT_MemFree(*ctx, dev_mem);
+  free(transformed_data);
+}
+
+static s8 * transform_bias(int oc, s16 before[])
+{
+  s8 *after = (s8 *)malloc(sizeof(s8) * 2 * oc);
+  if (!after)
+    return NULL;
+
+  for (int i = 0; i < oc; i++){
+    after[i] = before[i] & 0xff;
+    after[i + oc] = (before[i] >> 8) & 0xff;
+  }
+  return after;
+}
+
+static void put_conv_bias(
+    CVI_RT_HANDLE *ctx,
+    bmk_ctx_t *bk_ctx,
+    const tl_t *tl,
+    s16 *data)
+{
+  int oc = tl->shape.c;
+
+  bmshape_t bms = BM_TENSOR_INT8(2, oc, 1, 1);
+  CVI_RT_MEM dev_mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms));
+  gaddr_t gaddr = CVI_RT_MemGetPAddr(dev_mem);
+  s8 *transformed_data = transform_bias(oc, data);
+  int ret = CVI_RT_MemCopyS2D(*ctx, dev_mem, (u8 *)transformed_data);
+  assert(ret == BM_SUCCESS);
+
+  tg_t tg;
+  tg.base_reg_index = 0;
+  tg.start_address = gaddr;
+  tg.fmt = FMT_I8;
+  tg.shape.n = 2;
+  tg.shape.c = oc;
+  tg.shape.h = 1;
+  tg.shape.w = 1;
+  tg.stride = bmk1880v2_tensor_tgmem_default_stride(tg.shape, tg.fmt);
+
+  bmk1880v2_tdma_tg2l_tensor_copy_param_t p;
+  memset(&p, 0, sizeof(p));
+  p.src = &tg;
+  p.dst = tl;
+
+  bmk1880v2_tdma_g2l_tensor_copy(bk_ctx, &p);
+  test_submit(ctx);
+
+  CVI_RT_MemFree(*ctx, dev_mem);
+  free(transformed_data);
+}
+
+static int conv_kh_ext(const conv_param_t *p)
+{
+  return (p->kh - 1) * p->dh + 1;
+}
+
+static int conv_kw_ext(const conv_param_t *p)
+{
+  return (p->kw - 1) * p->dw + 1;
+}
+
+static int conv_ih_ext(const conv_param_t *p)
+{
+  return (p->input_h - 1) * (p->ins_h + 1) +
+      p->ins_h_last + 1 + p->pad_top +
p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static s8 * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + + s8 *buf = (s8 *)malloc(sizeof(s8) * size); + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static s8 * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + s8 *buf = (s8 *)malloc(sizeof(s8) * size); + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static s16 * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + s16 *bias = (s16 *)malloc(sizeof(s16) * oc); + for (int i = 0; i < oc; i++) + bias[i] = rand() % 65536 - 32768; + + return bias; +} + +static tl_t * conv_ifmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd0_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 1); +} + +static tl_t * conv_weight_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd1_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static tl_t * conv_ofmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + tl_shape_t s; + s.n = p->input_n; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return bmk1880v2_lmem_alloc_tensor(ctx, s, FMT_I8, 1); +} + +static tl_t * conv_bias_tensor( + bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd2_sign? 
FMT_I8: FMT_U8; + tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const bmk1880v2_tiu_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1880v2_tiu_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + + dst->ifmap = conv_ifmap_tensor(ctx, p); + dst->weight = conv_weight_tensor(ctx, p); + dst->ofmap = conv_ofmap_tensor(ctx, p); + dst->bias = NULL; + dst->ps32_mode = 0; + if (p->using_bias) + dst->bias = conv_bias_tensor(ctx, p); + + dst->w_is_const = 0; +} + +static void free_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1880v2_tiu_convolution_param_t *r, + const conv_param_t *p) +{ + if (p->using_bias && r->bias) + free_tl(ctx, r->bias); + if (r->ofmap) + free_tl(ctx, r->ofmap); + if (r->weight) + free_tl(ctx, r->weight); + if (r->ifmap) + free_tl(ctx, r->ifmap); +} + +static void init_conv_param(conv_param_t &p) +{ + printf("init_conv_param\n"); + memset(&p, 0, sizeof(p)); + p.random_seed = clock(); + srand(p.random_seed); + +retry: + p.input_n = rand() % 5 + 1; + p.input_c = rand() % (5 * 32) + 1; + p.kh = rand() % 7 + 1; + p.kw = rand() % 7 + 1; + p.input_h = rand() % 40 + p.kh; + p.input_w = rand() % 40 + p.kw; + p.output_c = rand() % 10 + 3; + p.stride_h = rand() % (p.kh) + 1; + p.stride_w = rand() % (p.kw) + 1; + p.ins_h = rand() % p.kh; + p.ins_w = rand() % p.kw; + p.ins_h_last = rand() % p.kh;; + p.ins_w_last = rand() % p.kw;; + p.dh = rand() % 3 + 1; + p.dw = rand() % 3 + 1; + + int kh_ext = conv_kh_ext(&p); + int kw_ext = conv_kw_ext(&p); + p.pad_top = rand() % kh_ext; + p.pad_bot = rand() % kh_ext; + p.pad_left = rand() % kw_ext; + p.pad_right = rand() % kw_ext; + + if (!conv_param_is_ok(&p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p.using_bias = rand() % 2; + p.r_shift_m = rand() % 8; + p.bReLU_EN = rand() % 2; + + p.opd0_sign = rand() % 2; + p.opd1_sign = 1; + p.opd2_sign = 1; + + assert(p.opd1_sign == 1 && p.opd2_sign == 1); + + int ih_ext = conv_ih_ext(&p); + int iw_ext = conv_iw_ext(&p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); +} + +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + 
printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", "p->input_w = ", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} + +// Calculate the right shift value, m +// Steps: +// 1. Get the abs() of each weight; +// 2. Summary all the abs() in one kernel; +// 3. Get Log2 of each sum; +// 4. Downward rounding; +// After every r_shift value got, sort and find the middle one. +static int calc_rshift_m(const conv_param_t *p, s8* weight) +{ + int kernel_cnt = p->output_c * p->input_c; + int kernel_size = p->kh * p->kw; + int *kernel_shifts = (int *)malloc(sizeof(int) * kernel_cnt); + + // Tscan does not recognize C++ zero-initialized. + memset(kernel_shifts, 0, sizeof(int) * kernel_cnt); + + // Part 1: + // Get right shift value for each kernel + int sum = 0; + for (int i = 0; i < kernel_cnt; i++) { + // Step 1 & 2: Get the sum of abs() + for (int j = 0; j < kernel_size; j++) { + sum += (int)(*weight < 0 ? 
-(*weight) : (*weight)); + weight++; + } + // Step 3 & 4: log2 and downward rounding + sum >>= 1; + while (sum) { + sum >>= 1; + kernel_shifts[i]++; + } + } + + // Part 2: + // Find the middle of all the values + int tag[32] = {0}; + for (int cnt = 0; cnt < kernel_cnt; cnt++) { + tag[kernel_shifts[cnt]]++; + } + + int rshift_m = 0; + int mid = 0; + do { + mid += tag[rshift_m++]; + } while(mid < (kernel_cnt - 1) >> 1); + + free(kernel_shifts); + + return rshift_m - 1; +} + +static int test_conv( + conv_param_t &p_param, CVI_RT_HANDLE ctx, bmk_ctx_t *bk_ctx) +{ + s8 *input = alloc_input(&p_param); + s8 *weight = alloc_weight(&p_param); + s16 *bias = alloc_bias(&p_param); + p_param.r_shift_m = calc_rshift_m(&p_param, weight); + + s8 *output_ref = (s8 *)malloc(sizeof(s8) * conv_output_size(&p_param)); + if (!output_ref) + return BM_ERR_FAILURE; + + bmerr_t ret = conv_ref(&p_param, input, weight, bias, output_ref); + assert(ret == BM_SUCCESS); + + bmk1880v2_tiu_convolution_param_t conv_param; + make_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + if (tl_alloc_success) { + put_tensor_g2l(&ctx, bk_ctx, conv_param.ifmap, (u8 *)input); + put_conv_weight(&ctx, bk_ctx, conv_param.weight, (u8 *)weight); + if (p_param.using_bias) + put_conv_bias(&ctx, bk_ctx, conv_param.bias, bias); + bmk1880v2_tiu_convolution(bk_ctx, &conv_param); + u8 *output = get_tensor_l2g(&ctx, bk_ctx, conv_param.ofmap); + + int has_error = array_cmp_int8( + "Comparing results ...\n", + output_ref, (s8 *)output, conv_output_size(&p_param)); + + if (has_error) { + print_conv_param(&p_param); + printf("Comparison FAILED\n"); + exit(-1); + } + free(output); + } + + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + free(input); + free(weight); + free(output_ref); + free(bias); + + return tl_alloc_success ? 
1 : 0; +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + int test_finished_num = 0; + for (int i = 0; i < 5; i++) { + printf("random_test_conv iteration: %d\n", i); + conv_param_t test_conv_param; + init_conv_param(test_conv_param); + test_finished_num += test_conv(test_conv_param, ctx, bk_ctx); + if (!test_conv_param.using_bias) + test_conv_param.using_bias = 1; + if (test_conv_param.output_c <= 32) + test_conv_param.output_c += 32; + test_finished_num += test_conv(test_conv_param, ctx, bk_ctx); + } + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_conv_max_power.cpp b/cviruntime/test/1880v2/test_1880v2_conv_max_power.cpp new file mode 100644 index 000000000..012900b68 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_conv_max_power.cpp @@ -0,0 +1,1068 @@ +#include "1880v2_test_util.h" + +typedef bmk1880v2_tdma_l2tg_tensor_copy_cw_transposed_param_t l2tg_cw_param_t; +typedef bmk1880v2_tdma_tg2l_matrix_copy_row_col_transposed_param_t tg2l_matrix_param_t; +typedef bmk1880v2_tdma_l2l_tensor_copy_param_t l2l_tensor_copy_param_t; + +typedef struct{ + s8 *conv_input; + s8 *conv_weight; + s16 *conv_bias; + u8 *conv_output; + s8 *conv_output_ref; + u8 *l2g_cw_src; + u8 *l2g_cw_output; + u8 *l2g_cw_output_ref; + u8 *g2l_matrix_src; + u8 *g2l_matrix_output; + u8 *g2l_matrix_output_ref; + u8 *l2l_tensor_src; + u8 *l2l_tensor_output; + u8 *l2l_tensor_output_ref; +}s_test_data; + +typedef struct { + int random_seed; + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int stride_w; + int output_c; + int using_bias; + int bReLU_EN; + int r_shift_m; + int opd0_sign; + int opd1_sign; + int opd2_sign; +} conv_param_t; + +conv_param_t conv_param; +l2tg_cw_param_t l2tg_cw_param; +tg2l_matrix_param_t tg2l_matrix_param; +l2l_tensor_copy_param_t l2l_tensor_copy_param; +s_test_data s8_test_data; +bmk1880v2_tiu_convolution_param_t bmk_conv_param; + +bmk1880v2_tensor_lmem_t *skip_tensor_lmem[10]; +u32 skip_tensor_num=0; + +/* need to make sure the free order of alloc_tl for skip_tensor_lmem*/ +void skip_tensor_lmem_size(bmk_ctx_t *bmk, const bmk1880v2_tensor_lmem_t *p) +{ + u32 needed = align_up(p->shape.n * p->stride.n, BM1880V2_HW_EU_NUM); + u32 start_addr = p->start_address + needed; //src_shape.n*src_shape.c*src_shape.h*src_shape.w/32; + u32 remain_size = start_addr % BM1880V2_HW_LMEM_BANK_SIZE ? (BM1880V2_HW_LMEM_BANK_SIZE - start_addr % BM1880V2_HW_LMEM_BANK_SIZE) : 0; // remain size for each lane + if(remain_size) + { + tl_shape_t src_shape2 = {1, BM1880V2_HW_NPU_NUM, 1, remain_size}; + skip_tensor_lmem[skip_tensor_num] = alloc_tl(bmk, src_shape2, FMT_I8, 1); // skip the lmem size and next tl can alignment to bank size + } + skip_tensor_num++; +} + +void skip_matrix_lmem_size(bmk_ctx_t *bmk, const bmk1880v2_matrix_lmem_t *p) +{ + u32 needed = align_up(p->shape.n * p->stride.n, BM1880V2_HW_EU_NUM); + u32 start_addr = p->start_address + needed; //src_shape.n*src_shape.c*src_shape.h*src_shape.w/32; + u32 remain_size = start_addr % BM1880V2_HW_LMEM_BANK_SIZE ? 
(BM1880V2_HW_LMEM_BANK_SIZE - start_addr % BM1880V2_HW_LMEM_BANK_SIZE) : 0; // remain size for each lane + if(remain_size) + { + tl_shape_t src_shape2 = {1, BM1880V2_HW_NPU_NUM, 1, remain_size}; + skip_tensor_lmem[skip_tensor_num] = alloc_tl(bmk, src_shape2, FMT_I8, 1); // skip the lmem size and next tl can alignment to bank size + } + skip_tensor_num++; +} + +void free_skip_tensor_lmem(bmk_ctx_t *ctx) +{ + if(skip_tensor_lmem[--skip_tensor_num]!=NULL) + free_tl(ctx, skip_tensor_lmem[skip_tensor_num]); +} + +static int index_get(int h, int w1, int w2) +{ + return h * w1 + w2; +} + +static int matrix_dot_mult( + s8 *A, s8 *B, int dim_n, int dim_m, + int opd0_sign) +{ + int sum = 0; + for (int i = 0; i < dim_n; i++) { + for (int j = 0; j < dim_m; j++) { + int idx = index_get(i, dim_m, j); + int a = opd0_sign ? (int)A[idx] : (int)(u8)A[idx]; + int b = (int)B[idx]; + sum += a * b; + } + } + return sum; +} + +static int conv_ref( + const conv_param_t *p_param, + const s8 *ifmap, + const s8 *weight, + const s16 *bias, + s8 *ofmap) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + int r_shift_bits = p_param->r_shift_m; + int do_relu = p_param->bReLU_EN; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + s8 *i_fmap_pad_ker = (s8 *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return BM_ERR_FAILURE; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = BM_SUCCESS; + + s8 *i_fmap_pad = NULL; + s8 *kernel_after = NULL; + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c]; //bias+c ; + } + } + } + + if (do_relu) + relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + + // ofmap is s8, signed + ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow, + &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1, + /*sign_unsign=*/1); + + if (ret != BM_SUCCESS) + goto error_release; + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + +error_release: + free(i_fmap_pad); + 
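/* Note: i_fmap_pad and kernel_after are allocated and filled by fill_pad_fmap_int8(); on the early-error path they may still be NULL, and free(NULL) is a no-op, so this cleanup is safe in both cases. */ +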
free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static u8 * transform_weight(const tl_shape_t *s, u8 before[]) +{ + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + u32 size = ic * oc * kh * kw; + u8 *after = (u8 *)malloc(sizeof(u8) * size); + + /* + * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) + */ + for (u32 oci = 0; oci < oc; oci++) { + for (u32 ici = 0; ici < ic; ici++) { + for (u32 khi = 0; khi < kh; khi++) { + for (u32 kwi = 0; kwi < kw; kwi++) { + u32 src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + u32 dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + u8 *data) +{ + const tl_shape_t *s = &tl->shape; + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw); + CVI_RT_MEM dev_mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = CVI_RT_MemGetPAddr(dev_mem); + u8 *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. CVI_RT_MemCopyS2D regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. + */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //CVI_RT_MEM ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = CVI_RT_MemCopyS2D(*ctx, ab_dev_mem, transformed_data); + int ret = CVI_RT_MemCopyS2D(*ctx, dev_mem, transformed_data); + + assert(ret == BM_SUCCESS); + + tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_I8; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1880v2_tensor_tgmem_default_stride(tdma_tg.shape, tdma_tg.fmt); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1880v2_tensor_lmem_default_stride(bk_ctx, tdma_shape, FMT_I8, 0); + + bmk1880v2_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1880v2_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + CVI_RT_MemFree(*ctx, dev_mem); + free(transformed_data); +} + +static s8 * transform_bias(int oc, s16 before[]) +{ + uint32_t sz = 2 * oc; + s8 *after = (s8 *)malloc(sizeof(s8) * sz); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = before[i] & 0xff; + after[i + oc] = (before[i] >> 8) & 0xff; + } + return after; +} + +static void put_conv_bias( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + s16 *data) +{ + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2, oc, 1, 1); + CVI_RT_MEM dev_mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = CVI_RT_MemGetPAddr(dev_mem); + s8 *transformed_data = transform_bias(oc, data); + int ret = CVI_RT_MemCopyS2D(*ctx, dev_mem, (u8 *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1880v2_tensor_tgmem_default_stride(tg.shape, tg.fmt); + + bmk1880v2_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1880v2_tdma_g2l_tensor_copy(bk_ctx, &p); + 
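/* The g2l copy above is only queued in the command buffer; test_submit() below flushes it so the packed bias is resident in local memory before any TIU op consumes it. */ +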
test_submit(ctx); + + CVI_RT_MemFree(*ctx, dev_mem); + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static s8 * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + + s8 *buf = (s8 *)malloc(sizeof(s8) * size); + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static s8 * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + s8 *buf = (s8 *)malloc(sizeof(s8) * size); + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static s16 * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + s16 *bias = (s16 *)malloc(sizeof(s16) * oc); + for (int i = 0; i < oc; i++) + bias[i] = rand() % 65536 - 32768; + + return bias; +} + +static tl_t * conv_ifmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd0_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 1); +} + +static tl_t * conv_weight_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd1_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static tl_t * conv_ofmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + tl_shape_t s; + s.n = p->input_n; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return bmk1880v2_lmem_alloc_tensor(ctx, s, FMT_I8, 1); +} + +static tl_t * conv_bias_tensor( + bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd2_sign? 
FMT_I8: FMT_U8; + tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const bmk1880v2_tiu_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1880v2_tiu_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + + dst->ifmap = conv_ifmap_tensor(ctx, p); + skip_tensor_lmem_size(ctx, dst->ifmap); + dst->weight = conv_weight_tensor(ctx, p); + skip_tensor_lmem_size(ctx, dst->weight); + dst->ofmap = conv_ofmap_tensor(ctx, p); + skip_tensor_lmem_size(ctx, dst->ofmap); + dst->bias = NULL; + dst->ps32_mode = 0; + if (p->using_bias) + { + dst->bias = conv_bias_tensor(ctx, p); + skip_tensor_lmem_size(ctx, dst->bias); + } + dst->w_is_const = 0; +} + +static void free_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1880v2_tiu_convolution_param_t *r, + const conv_param_t *p) +{ + if (p->using_bias && r->bias) + { + free_skip_tensor_lmem(ctx); + free_tl(ctx, r->bias); + } + if (r->ofmap) + { + free_skip_tensor_lmem(ctx); + free_tl(ctx, r->ofmap); + } + if (r->weight) + { + free_skip_tensor_lmem(ctx); + free_tl(ctx, r->weight); + } + if (r->ifmap) + { + free_skip_tensor_lmem(ctx); + free_tl(ctx, r->ifmap); + } +} + +static void init_conv_param(conv_param_t &p) +{ +retry: + p.input_n = 1; + p.input_c = 64; + p.input_h = 2; + p.input_w = 600; + + p.kh = 2; + p.kw = 16; + p.output_c = 64; + + p.stride_h = 1; + p.stride_w = 15; + p.ins_h = 0; + p.ins_w = 0; + p.ins_h_last = 0;; + p.ins_w_last = 0;; + p.dh = 1; + p.dw = 1; + + int kh_ext = conv_kh_ext(&p); + int kw_ext = conv_kw_ext(&p); + p.pad_top = 1; + p.pad_bot = 0; + p.pad_left = 0; + p.pad_right = 0; + + if (!conv_param_is_ok(&p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p.using_bias = 0; + p.r_shift_m = 7; + p.bReLU_EN = 1; + + p.opd0_sign = 0; + p.opd1_sign = 1; + p.opd2_sign = 1; + + assert(p.opd1_sign == 1 && p.opd2_sign == 1); + + int ih_ext = conv_ih_ext(&p); + int iw_ext = conv_iw_ext(&p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); + +} + +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", "p->input_w = 
", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} + +// Calculate the right shift value, m +// Steps: +// 1. Get the abs() of each weight; +// 2. Summary all the abs() in one kernel; +// 3. Get Log2 of each sum; +// 4. Downward rounding; +// After every r_shift value got, sort and find the middle one. +static int calc_rshift_m(const conv_param_t *p, s8* weight) +{ + int kernel_cnt = p->output_c * p->input_c; + int kernel_size = p->kh * p->kw; + int *kernel_shifts = (int *)malloc(sizeof(int) * kernel_cnt); + + // Tscan does not recognize C++ zero-initialized. + memset(kernel_shifts, 0, sizeof(int) * kernel_cnt); + + // Part 1: + // Get right shift value for each kernel + int sum = 0; + for (int i = 0; i < kernel_cnt; i++) { + // Step 1 & 2: Get the sum of abs() + for (int j = 0; j < kernel_size; j++) { + sum += (int)(*weight < 0 ? 
-(*weight) : (*weight)); + weight++; + } + // Step 3 & 4: log2 and downward rounding + sum >>= 1; + while (sum) { + sum >>= 1; + kernel_shifts[i]++; + } + } + + // Part 2: + // Find the middle of all the values + int tag[32] = {0}; + for (int cnt = 0; cnt < kernel_cnt; cnt++) { + tag[kernel_shifts[cnt]]++; + } + + int rshift_m = 0; + int mid = 0; + do { + mid += tag[rshift_m++]; + } while(mid < (kernel_cnt - 1) >> 1); + + free(kernel_shifts); + + return rshift_m - 1; +} + + +static void l2tg_tensor_copy_cw_transposed_ref( + l2tg_cw_param_t *p, u8 ref_data[], u8 src_data[]) +{ + tl_shape_t s = p->src->shape; + u32 n = s.n; + u32 c = s.c; + u32 h = s.h; + u32 w = s.w; + + for (u32 ni = 0; ni < n; ni++) { + for (u32 ci = 0; ci < c; ci++) { + for (u32 hi = 0; hi < h; hi++) { + for (u32 wi = 0; wi < w; wi++) { + u32 src_i = ni * c * h * w + ci * h * w + hi * w + wi; + u32 dst_i = ni * c * h * w + wi * h * c + hi * c + ci; + ref_data[dst_i] = src_data[src_i]; + } + } + } + } +} + +static void test_param_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, l2tg_cw_param_t *p) +{ + u64 size = tl_shape_size(&p->src->shape); + + s8_test_data.l2g_cw_src = (u8 *)malloc(sizeof(u8) * size); + if (!s8_test_data.l2g_cw_src) + return; + + for (u64 i = 0; i < size; i++) + s8_test_data.l2g_cw_src[i] = rand()%0x100; + + s8_test_data.l2g_cw_output_ref = (u8 *)malloc(sizeof(u8) * size); + if (!s8_test_data.l2g_cw_output_ref) + return; + + l2tg_tensor_copy_cw_transposed_ref(p, s8_test_data.l2g_cw_output_ref, s8_test_data.l2g_cw_src); + + put_tensor_g2l(ctx, bmk, p->src, s8_test_data.l2g_cw_src); +} + +static void destroy_param_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, l2tg_cw_param_t *p) +{ + free_tg_gmem(ctx, p->dst); + free_skip_tensor_lmem(bmk); + free_tl(bmk, p->src); +} + +static void test_l2tg_cw_transpose(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, l2tg_cw_param_t *p) +{ + tl_shape_t src_shape = {1, 0x100, 1, 0x080}; + tg_shape_t dst_shape = {1, 0x080, 1, 0x100}; + + p->src = alloc_tl(bmk, src_shape, FMT_I8, 1); + p->dst = alloc_tg_gmem(ctx, dst_shape, FMT_I8); + skip_tensor_lmem_size(bmk, p->src); + test_param_l2g(ctx, bmk, p); +} + +static void tg2l_matrix_copy_row_col_transposed_ref( + tg2l_matrix_param_t *p, u8 ref_data[], u8 src_data[]) +{ + u64 row = p->src->shape.row; + u64 col = p->src->shape.col; + + for (u64 ri = 0; ri < row; ri++) { + for (u64 ci = 0; ci < col; ci++) { + u64 src_i = ri * col + ci; + u64 dst_i = ci * row + ri; + ref_data[dst_i] = src_data[src_i]; + } + } +} + +static void test_param_g2l(CVI_RT_HANDLE *ctx, tg2l_matrix_param_t *p) +{ + u64 size = ml_shape_size(&p->dst->shape); + + s8_test_data.g2l_matrix_src = (u8 *)malloc(sizeof(u8) * size); + if (!s8_test_data.g2l_matrix_src) + return; + + for (u64 i = 0; i < size; i++) + s8_test_data.g2l_matrix_src[i] = rand()%0x100; + + s8_test_data.g2l_matrix_output_ref = (u8 *)malloc(sizeof(u8) * size); + if (!s8_test_data.g2l_matrix_output_ref) + return; + + tg2l_matrix_copy_row_col_transposed_ref(p, s8_test_data.g2l_matrix_output_ref, s8_test_data.g2l_matrix_src); + + put_mg_gmem(ctx, p->src, s8_test_data.g2l_matrix_src); +} + +static void destroy_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, tg2l_matrix_param_t *p) +{ + free_mg_gmem(ctx, p->src); + free_skip_tensor_lmem(bmk); + free_ml(bmk, p->dst); +} + + +static void test_tg2l_matrix_transpose(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, tg2l_matrix_param_t *p) +{ + //tg2l_matrix_param_t p; + /* + * Matrix transpose must be n/c stride alignment + * for TDMA limitation + */ + mg_shape_t src_shape={0x100, 0x80}; + 
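/* dst is the row/col transpose of src: 0x80 rows by 0x100 columns, with each row spread across 0x10 lanes of width 0x10 (0x10 * 0x10 == 0x100). */ +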
ml_shape_t dst_shape={0x80, 0x10, 0x10, 0x100}; + + int dst_align = 1; + + p->src = alloc_mg_gmem(ctx, src_shape); + p->dst = alloc_ml(bmk, dst_shape, dst_align); + skip_matrix_lmem_size(bmk, p->dst); + test_param_g2l(ctx, p); +} + +static void l2l_tensor_copy_ref(l2l_tensor_copy_param_t *p, u8 ref_data[], u8 src_data[]) +{ + u64 size = tl_shape_size(&p->src->shape); + + for (u64 i = 0; i < size; i++) + ref_data[i] = src_data[i]; +} + +static void test_l2l_param(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, l2l_tensor_copy_param_t *p) +{ + u64 size = tl_shape_size(&p->src->shape); + + s8_test_data.l2l_tensor_src = (u8 *)malloc(sizeof(u8) * size); + if (!s8_test_data.l2l_tensor_src) + return; + + for (u64 i = 0; i < size; i++) + s8_test_data.l2l_tensor_src[i] = rand()%0x100; + + s8_test_data.l2l_tensor_output_ref = (u8 *)malloc(sizeof(u8) * size); + if (!s8_test_data.l2l_tensor_output_ref) + return; + + l2l_tensor_copy_ref(p, s8_test_data.l2l_tensor_output_ref, s8_test_data.l2l_tensor_src); + + put_tensor_g2l(ctx, bmk, p->src, s8_test_data.l2l_tensor_src); +} + +static void destroy_param_l2l(bmk_ctx_t *bmk, l2l_tensor_copy_param_t *p) +{ + free_skip_tensor_lmem(bmk); + free_tl(bmk, p->dst); + free_skip_tensor_lmem(bmk); + free_tl(bmk, p->src); +} + +static void test_l2l_tensor_copy(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, l2l_tensor_copy_param_t *p) +{ + tl_shape_t src_shape = {1, 0x10, 0x1, 0x400}; + tl_shape_t dst_shape = {1, 0x10, 0x1, 0x400}; + + p->src = alloc_tl(bmk, src_shape, FMT_I8, 1); + skip_tensor_lmem_size(bmk, p->src); + p->dst = alloc_tl(bmk, dst_shape, FMT_I8, 1); + skip_tensor_lmem_size(bmk, p->dst); + test_l2l_param(ctx, bmk, p); +} + +static int setup_conv( + conv_param_t &p_param, CVI_RT_HANDLE ctx, bmk_ctx_t *bk_ctx) +{ + s8_test_data.conv_input = alloc_input(&p_param); + s8_test_data.conv_weight = alloc_weight(&p_param); + s8_test_data.conv_bias = alloc_bias(&p_param); + p_param.r_shift_m = calc_rshift_m(&p_param, s8_test_data.conv_weight); + s8_test_data.conv_output_ref = (s8 *)malloc(sizeof(s8) * conv_output_size(&p_param)); + if (!s8_test_data.conv_output_ref) + return BM_ERR_FAILURE; + + bmerr_t ret = conv_ref(&p_param, s8_test_data.conv_input, s8_test_data.conv_weight, s8_test_data.conv_bias, s8_test_data.conv_output_ref); + assert(ret == BM_SUCCESS); + make_bmk_conv_param(bk_ctx, &bmk_conv_param, &p_param); + + bmk_conv_param_alloc_ok(&bmk_conv_param, &p_param); + + put_tensor_g2l(&ctx, bk_ctx, bmk_conv_param.ifmap, (u8 *)s8_test_data.conv_input); + put_conv_weight(&ctx, bk_ctx, bmk_conv_param.weight, (u8 *)s8_test_data.conv_weight); + if (p_param.using_bias) + put_conv_bias(&ctx, bk_ctx, bmk_conv_param.bias, s8_test_data.conv_bias); + + return 1; +} + +void get_result(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk) +{ + s8_test_data.conv_output = get_tensor_l2g(ctx, bmk, bmk_conv_param.ofmap); + s8_test_data.l2g_cw_output = get_tg_gmem(ctx, l2tg_cw_param.dst); + s8_test_data.g2l_matrix_output = get_matrix_l2g(ctx, bmk, tg2l_matrix_param.dst); + s8_test_data.l2l_tensor_output = get_tensor_l2g(ctx, bmk, l2l_tensor_copy_param.dst); +} + +void check_result() +{ + int has_error = array_cmp_int8( + "conv Comparing results ...\n", + s8_test_data.conv_output_ref, (s8 *)s8_test_data.conv_output, conv_output_size(&conv_param)); + + if (has_error) { + print_conv_param(&conv_param); + printf("Comparison FAILED\n"); + exit(-1); + } + + for (u64 i = 0; i < tl_shape_size(&l2tg_cw_param.src->shape); i++) { + if (s8_test_data.l2g_cw_output[i] != s8_test_data.l2g_cw_output_ref[i]) { + fprintf(stderr, 
"l2g_cw comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, s8_test_data.l2g_cw_output[i], s8_test_data.l2g_cw_output_ref[i]); + exit(-1); + } + } + for (u64 i = 0; i < ml_shape_size(&tg2l_matrix_param.dst->shape); i++) { + if (s8_test_data.g2l_matrix_output[i] != s8_test_data.g2l_matrix_output_ref[i]) { + fprintf(stderr, "g2l_matrix comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, s8_test_data.g2l_matrix_output[i], s8_test_data.g2l_matrix_output_ref[i]); + exit(-1); + } + } + + for (u64 i = 0; i < tl_shape_size(&l2l_tensor_copy_param.src->shape); i++) { + if (s8_test_data.l2l_tensor_output[i] != s8_test_data.l2l_tensor_output_ref[i]) { + fprintf(stderr, "l2l_tensor comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, s8_test_data.l2l_tensor_output[i], s8_test_data.l2l_tensor_output_ref[i]); + exit(-1); + } + } + + +} + +void trigger_max_power(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk) +{ + bmk1880v2_parallel_enable(bmk); + bmk1880v2_tdma_l2g_tensor_copy_cw_transposed(bmk, &l2tg_cw_param); + bmk1880v2_tdma_g2l_matrix_copy_row_col_transposed(bmk, &tg2l_matrix_param); + bmk1880v2_tdma_l2l_tensor_copy(bmk, &l2l_tensor_copy_param); + bmk1880v2_tiu_convolution(bmk, &bmk_conv_param); + bmk1880v2_parallel_disable(bmk); + bmk1880v2_parallel_enable(bmk); + bmk1880v2_tdma_l2g_tensor_copy_cw_transposed(bmk, &l2tg_cw_param); + bmk1880v2_tdma_g2l_matrix_copy_row_col_transposed(bmk, &tg2l_matrix_param); + bmk1880v2_tdma_l2l_tensor_copy(bmk, &l2l_tensor_copy_param); + bmk1880v2_tiu_convolution(bmk, &bmk_conv_param); + bmk1880v2_parallel_disable(bmk); + test_submit(ctx); +} + +void free_s8_data() +{ + free(s8_test_data.conv_input); + free(s8_test_data.conv_weight); + free(s8_test_data.conv_bias); + free(s8_test_data.conv_output); + free(s8_test_data.conv_output_ref); + free(s8_test_data.l2g_cw_src); + free(s8_test_data.l2g_cw_output); + free(s8_test_data.l2g_cw_output_ref); + free(s8_test_data.g2l_matrix_src); + free(s8_test_data.g2l_matrix_output); + free(s8_test_data.g2l_matrix_output_ref); + free(s8_test_data.l2l_tensor_src); + free(s8_test_data.l2l_tensor_output); + free(s8_test_data.l2l_tensor_output_ref); +} + +int main(int argc, char **argv) +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + int loop_cnt = 0, i = 0; + + switch (argc) { + case 1: + loop_cnt = 1; + break; + + case 2: + loop_cnt = atoi(argv[1]); + break; + + default: + printf("input parameter incorrect\n"); + return -1; + } + + for (i = 0; i < loop_cnt; i ++) { + test_init(&ctx, &bk_ctx); + + printf("conv max_power test loop_cnt=%d\n", i); + init_conv_param(conv_param); + setup_conv(conv_param, ctx, bk_ctx); + + test_l2tg_cw_transpose(&ctx, bk_ctx, &l2tg_cw_param); + test_tg2l_matrix_transpose(&ctx, bk_ctx, &tg2l_matrix_param); + test_l2l_tensor_copy(&ctx, bk_ctx, &l2l_tensor_copy_param); + + trigger_max_power(&ctx, bk_ctx); + get_result(&ctx, bk_ctx); + check_result(); + + destroy_param_l2l(bk_ctx,&l2l_tensor_copy_param); + destroy_param_g2l(&ctx, bk_ctx, &tg2l_matrix_param); + destroy_param_l2g(&ctx, bk_ctx, &l2tg_cw_param); + free_bmk_conv_param(bk_ctx, &bmk_conv_param, &conv_param); + free_s8_data(); + test_exit(&ctx); + } + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_conv_ps32.cpp b/cviruntime/test/1880v2/test_1880v2_conv_ps32.cpp new file mode 100644 index 000000000..c85707a60 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_conv_ps32.cpp @@ -0,0 +1,1521 @@ +#include "1880v2_test_util.h" + +typedef struct { + int random_seed; + int input_n; + int input_c; + int input_h; + int input_w; 
+ int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int stride_w; + int output_c; + int using_bias; + int bReLU_EN; + int r_shift_m; + int opd0_sign; + int opd1_sign; + int opd2_sign; +} conv_param_t; + +static int index_get(int h, int w1, int w2) +{ + return h * w1 + w2; +} + +static int matrix_dot_mult( + s8 *A, s8 *B, int dim_n, int dim_m, + int opd0_sign) +{ + int sum = 0; + for (int i = 0; i < dim_n; i++) { + for (int j = 0; j < dim_m; j++) { + int idx = index_get(i, dim_m, j); + int a = opd0_sign ? (int)A[idx] : (int)(u8)A[idx]; + int b = (int)B[idx]; + sum += a * b; + } + } + return sum; +} + +static int ps32_m2_conv_ref( + const conv_param_t *p_param, + const s8 *ifmap, + const s8 *weight, + s8 *ofmap) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + s8 *i_fmap_pad_ker = (s8 *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return BM_ERR_FAILURE; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = BM_SUCCESS; + + s8 *i_fmap_pad = NULL; + s8 *kernel_after = NULL; + u32 bstride = in * oc * oh * ow; + + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[i] = result[i]; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[bstride + i] = result[i] >> 8; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[2 * bstride + i] = result[i] >> 16; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[3 * bstride + i] = result[i] >> 24; + + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static int ps32_m1_conv_ref( + const conv_param_t *p_param, + const s8 *ifmap, + const s8 *weight, + const s16 *bias, + s8 *ofmap) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = 
p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + int r_shift_bits = p_param->r_shift_m; + int do_relu = p_param->bReLU_EN; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + s8 *i_fmap_pad_ker = (s8 *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return BM_ERR_FAILURE; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = BM_SUCCESS; + + s8 *i_fmap_pad = NULL; + s8 *kernel_after = NULL; + + u32 bstride = in * oc * oh * ow; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] = (u8)ofmap[i]; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] |= (u8)ofmap[bstride + i] << 8; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] |= (u8)ofmap[2 * bstride + i] << 16; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] |= (u8)ofmap[3 * bstride + i] << 24; + + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c]; //bias+c ; + } + } + } + + if (do_relu) + relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + + // ofmap is s8, signed + ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow, + &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1, + /*sign_unsign=*/1); + + if (ret != BM_SUCCESS) + goto error_release; + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + +error_release: + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + + +static int ps32_m3_conv_ref( + const conv_param_t *p_param, + const s8 *ifmap, + const s8 *weight, + s8 *ofmap) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = 
p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + s8 *i_fmap_pad_ker = (s8 *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return BM_ERR_FAILURE; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = BM_SUCCESS; + + s8 *i_fmap_pad = NULL; + s8 *kernel_after = NULL; + + u32 bstride = in * oc * oh * ow; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] = (u8)ofmap[i]; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] |= (u8)ofmap[bstride + i] << 8; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] |= (u8)ofmap[2 * bstride + i] << 16; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] |= (u8)ofmap[3 * bstride + i] << 24; + + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[i] = result[i]; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[bstride + i] = result[i] >> 8; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[2 * bstride + i] = result[i] >> 16; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[3 * bstride + i] = result[i] >> 24; + + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static int conv_ref( + const conv_param_t *p_param, + const s8 *ifmap, + const s8 *weight, + const s16 *bias, + s8 *ofmap) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int 
ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + int r_shift_bits = p_param->r_shift_m; + int do_relu = p_param->bReLU_EN; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + s8 *i_fmap_pad_ker = (s8 *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return BM_ERR_FAILURE; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = BM_SUCCESS; + + s8 *i_fmap_pad = NULL; + s8 *kernel_after = NULL; + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c]; //bias+c ; + } + } + } + + if (do_relu) + relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + + // ofmap is s8, signed + ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow, + &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1, + /*sign_unsign=*/1); + + if (ret != BM_SUCCESS) + goto error_release; + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + + neuron_dump ( + "test_code:conv_ref:pure result + bias", + (u32)in, + (u32)oc, + (u32)oh, + (u32)ow, + (s32 *)result); + + +error_release: + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static u8 * transform_weight(const tl_shape_t *s, u8 before[]) +{ + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + u32 size = ic * oc * kh * kw; + u8 *after = (u8 *)malloc(sizeof(u8) * size); + + /* + * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) + */ + for (u32 oci = 0; oci < oc; oci++) { + for (u32 ici = 0; ici < ic; ici++) { + for (u32 khi = 0; khi < kh; khi++) { + for (u32 kwi = 0; kwi < kw; kwi++) { + u32 src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + u32 dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + u8 *data) +{ + const tl_shape_t *s = &tl->shape; + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw); + CVI_RT_MEM dev_mem = CVI_RT_MemAlloc(*ctx, 
bmshape_get_size(&bms)); + gaddr_t gaddr = CVI_RT_MemGetPAddr(dev_mem); + u8 *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. CVI_RT_MemCopyS2D regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. + */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //CVI_RT_MEM ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = CVI_RT_MemCopyS2D(*ctx, ab_dev_mem, transformed_data); + int ret = CVI_RT_MemCopyS2D(*ctx, dev_mem, transformed_data); + assert(ret == BM_SUCCESS); + + tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_I8; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1880v2_tensor_tgmem_default_stride(tdma_tg.shape, tdma_tg.fmt); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1880v2_tensor_lmem_default_stride(bk_ctx, tdma_shape, FMT_I8, 0); + + bmk1880v2_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1880v2_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + CVI_RT_MemFree(*ctx, dev_mem); + free(transformed_data); +} + +static s8 * transform_bias(int oc, s16 before[]) +{ + s8 *after = (s8 *)malloc(sizeof(s8) * 2 * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = before[i] & 0xff; + after[i + oc] = (before[i] >> 8) & 0xff; + } + return after; +} + +static void put_conv_bias( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + s16 *data) +{ + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2, oc, 1, 1); + CVI_RT_MEM dev_mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = CVI_RT_MemGetPAddr(dev_mem); + s8 *transformed_data = transform_bias(oc, data); + int ret = CVI_RT_MemCopyS2D(*ctx, dev_mem, (u8 *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1880v2_tensor_tgmem_default_stride(tg.shape, tg.fmt); + + bmk1880v2_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1880v2_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + CVI_RT_MemFree(*ctx, dev_mem); + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + 
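/* logical output element count (n * oc * oh * ow); oh/ow are derived from the dilated and padded extents */ +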
int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static s8 * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + + s8 *buf = (s8 *)malloc(sizeof(s8) * size); + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static s8 * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + s8 *buf = (s8 *)malloc(sizeof(s8) * size); + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static s16 * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + s16 *bias = (s16 *)malloc(sizeof(s16) * oc); + for (int i = 0; i < oc; i++) + bias[i] = rand() % 65536 - 32768; + + return bias; +} + +static tl_t * conv_ifmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd0_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 1); +} + +static uint32_t conv_ifmap_tensor_size(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd0_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return bmk1880v2_lmem_tensor_to_size(ctx, s, fmt, 1); +} + +static tl_t * conv_weight_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd1_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static uint32_t conv_weight_tensor_size(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd1_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return bmk1880v2_lmem_tensor_to_size(ctx, s, fmt, 0); +} + +static tl_t * conv_ofmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + tl_shape_t s; + s.n = p->input_n; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return bmk1880v2_lmem_alloc_ps32_tensor(ctx, s, FMT_I8, 1); +} + +static uint32_t conv_ofmap_tensor_size(bmk_ctx_t *ctx, const conv_param_t *p) +{ + tl_shape_t s; + s.n = p->input_n * sizeof(u32) / sizeof(u8); + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return bmk1880v2_lmem_tensor_to_size(ctx, s, FMT_I8, 1); +} + +static tl_t * conv_bias_tensor( + bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd2_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static uint32_t conv_bias_tensor_size( + bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd2_sign? 
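+  // n = 2 below mirrors transform_bias() above: the 16-bit bias is stored
+  // as two 8-bit planes, low byte first, then high byte.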
FMT_I8 : FMT_U8;
+  tl_shape_t s;
+  s.n = 2;
+  s.c = p->output_c;
+  s.h = 1;
+  s.w = 1;
+  return bmk1880v2_lmem_tensor_to_size(ctx, s, fmt, 0);
+}
+
+static int conv_param_is_ok(const conv_param_t *p)
+{
+  int kh_ext = conv_kh_ext(p);
+  int kw_ext = conv_kw_ext(p);
+  int ih_ext = conv_ih_ext(p);
+  int iw_ext = conv_iw_ext(p);
+
+  if ((kh_ext > ih_ext)
+      || (kw_ext > iw_ext)
+      || (kh_ext <= p->pad_top)
+      || (kh_ext <= p->pad_bot)
+      || (kw_ext <= p->pad_left)
+      || (kw_ext <= p->pad_right)
+      || (p->pad_top >= (1 << 4))
+      || (p->pad_bot >= (1 << 4))
+      || (p->pad_left >= (1 << 4))
+      || (p->pad_right >= (1 << 4))) {
+    return 0;
+  }
+
+  return 1;
+}
+
+static int bmk_conv_param_alloc_ok(
+    const bmk1880v2_tiu_convolution_param_t *p,
+    const conv_param_t *param)
+{
+  if (!p->ifmap || !p->ofmap || !p->weight)
+    return 0;
+  // Only end mode (ps32_mode == 1) consumes the bias tensor.
+  if (p->ps32_mode == 1 && param->using_bias && !p->bias)
+    return 0;
+
+  return 1;
+}
+
+static void make_bmk_conv_param_ps32(
+    bmk_ctx_t *ctx,
+    bmk1880v2_tiu_convolution_param_t *dst,
+    const conv_param_t *p, u32 ps32_mode)
+{
+  memset(dst, 0, sizeof(*dst));
+
+  dst->ins_h = p->ins_h;
+  dst->ins_last_h = p->ins_h_last;
+  dst->ins_w = p->ins_w;
+  dst->ins_last_w = p->ins_w_last;
+  dst->pad_top = p->pad_top;
+  dst->pad_bottom = p->pad_bot;
+  dst->pad_left = p->pad_left;
+  dst->pad_right = p->pad_right;
+  dst->stride_h = p->stride_h;
+  dst->stride_w = p->stride_w;
+  dst->dilation_h = p->dh;
+  dst->dilation_w = p->dw;
+
+  if (ps32_mode == 2)
+  {
+    u32 ifmap_size = conv_ifmap_tensor_size(ctx, p);
+    u32 weight_size = conv_weight_tensor_size(ctx, p);
+    u32 ofmap_size = conv_ofmap_tensor_size(ctx, p);
+    u32 bias_size = p->using_bias ? conv_bias_tensor_size(ctx, p) : 0;
+    u32 total_size = ifmap_size + weight_size + ofmap_size + bias_size;
+
+    // Allocate only when everything fits in local memory;
+    // bmk1880v2_lmem_alloc_ps32_tensor() asserts otherwise.
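+    // ps32 mode summary, as exercised by test_ps32_ut() below: mode 2
+    // ("begin") writes 32-bit partial sums, mode 3 ("intermediate")
+    // accumulates onto them, mode 1 ("end") applies bias/relu/rshift and
+    // emits s8, and mode 0 is a plain single-pass convolution.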
+ bmk1880v2_chip_info_t chip_info = bmk1880v2_chip_info(); + if (total_size <= chip_info.lmem_size) { + dst->ifmap = conv_ifmap_tensor(ctx, p); + dst->weight = conv_weight_tensor(ctx, p); + dst->ofmap = conv_ofmap_tensor(ctx, p); + } else { + dst->ifmap = nullptr; + dst->weight = nullptr; + dst->ofmap = nullptr; + } + } + + dst->ps32_mode = ps32_mode; + dst->bias = NULL; + dst->relu_enable = 0; + dst->rshift_bits = 0; + if(ps32_mode==1) + { + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + // only mode=1 can use bias + if (p->using_bias) + dst->bias = conv_bias_tensor(ctx, p); + } +} + +static void make_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1880v2_tiu_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + dst->ifmap = conv_ifmap_tensor(ctx, p); + dst->weight = conv_weight_tensor(ctx, p); + dst->ofmap = conv_ofmap_tensor(ctx, p); + dst->bias = NULL; + dst->ps32_mode = 0; + if (p->using_bias) + dst->bias = conv_bias_tensor(ctx, p); + +} + +static void free_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1880v2_tiu_convolution_param_t *r, + const conv_param_t *p) +{ + if (p->using_bias && r->bias) { + free_tl(ctx, r->bias); + r->bias = nullptr; + } + + if (r->ofmap) { + free_tl(ctx, r->ofmap); + r->ofmap = nullptr; + } + + if (r->weight) { + free_tl(ctx, r->weight); + r->weight = nullptr; + } + + if (r->ifmap) { + free_tl(ctx, r->ifmap); + r->ifmap = nullptr; + } + +} + +static void init_conv_param(conv_param_t &p) +{ + printf("init_conv_param\n"); + + memset(&p, 0, sizeof(p)); + + p.random_seed = clock(); + srand(p.random_seed); + +retry: + p.input_n = 1; + p.input_c = rand() % (10) + 2; + p.kh = rand() % 7 + 1; + p.kw = rand() % 7 + 1; + p.input_h = rand() % 10 + p.kh; + p.input_w = rand() % 10 + p.kw; + p.output_c = rand() % 10 + 3; + p.stride_h = rand() % (p.kh) + 1; + p.stride_w = rand() % (p.kw) + 1; + p.ins_h = rand() % p.kh; + p.ins_w = rand() % p.kw; + p.ins_h_last = rand() % p.kh;; + p.ins_w_last = rand() % p.kw;; + p.dh = rand() % 3 + 1; + p.dw = rand() % 3 + 1; + + int kh_ext = conv_kh_ext(&p); + int kw_ext = conv_kw_ext(&p); + p.pad_top = rand() % kh_ext; + p.pad_bot = rand() % kh_ext; + p.pad_left = rand() % kw_ext; + p.pad_right = rand() % kw_ext; + + if (!conv_param_is_ok(&p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p.using_bias = rand() % 2; + p.r_shift_m = rand() % 8; + p.bReLU_EN = rand() % 2; + + p.opd0_sign = rand() % 2; + p.opd1_sign = 1; + p.opd2_sign = 1; + + assert(p.opd1_sign == 1 && p.opd2_sign == 1); + + int ih_ext = conv_ih_ext(&p); + int iw_ext = conv_iw_ext(&p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); +} + +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", "p->input_w = ", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + 
printf(" %s%d;\n", "p->kw = ", p->kw);
+  printf(" %s%d;\n", "p->dh = ", p->dh);
+  printf(" %s%d;\n", "p->dw = ", p->dw);
+  printf(" %s%d;\n", "p->pad_top = ", p->pad_top);
+  printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot);
+  printf(" %s%d;\n", "p->pad_left = ", p->pad_left);
+  printf(" %s%d;\n", "p->pad_right = ", p->pad_right);
+  printf(" %s%d;\n", "p->stride_h = ", p->stride_h);
+  printf(" %s%d;\n", "p->stride_w = ", p->stride_w);
+  printf(" %s%d;\n", "p->ins_w = ", p->ins_w);
+  printf(" %s%d;\n", "p->ins_h = ", p->ins_h);
+  printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last);
+  printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last);
+
+  printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m);
+  printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign);
+  printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign);
+  printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign);
+  printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN);
+  printf(" %s%d;\n", "p->using_bias = ", p->using_bias);
+
+  printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p));
+  printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p));
+  printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p));
+  printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p));
+  printf(" %s%d\n", "output_h = ", conv_oh(p));
+  printf(" %s%d\n", "output_w = ", conv_ow(p));
+}
+
+/* Calculate the right shift value, m.
+ * Steps:
+ *   1. Take the abs() of each weight;
+ *   2. Sum the abs() values within one kernel;
+ *   3. Take log2 of each sum;
+ *   4. Round downward.
+ * Once a shift value has been computed for every kernel, take the median.
+ */
+
+static int calc_rshift_m(const conv_param_t *p, s8* weight)
+{
+  int kernel_cnt = p->output_c * p->input_c;
+  int kernel_size = p->kh * p->kw;
+  int *kernel_shifts = (int *)malloc(sizeof(int) * kernel_cnt);
+
+  // Zero-initialize explicitly; the Tscan analyzer does not recognize
+  // C++ value-initialization.
+  memset(kernel_shifts, 0, sizeof(int) * kernel_cnt);
+
+  // Part 1:
+  // Get the right shift value for each kernel
+  int sum = 0;
+  for (int i = 0; i < kernel_cnt; i++) {
+    // Step 1 & 2: get the sum of abs()
+    for (int j = 0; j < kernel_size; j++) {
+      sum += (int)(*weight < 0 ?
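+      // e.g. a kernel whose |weight| sum is 100 gets
+      // floor(log2(100)) = 6 from the shift loop below.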
-(*weight) : (*weight)); + weight++; + } + // Step 3 & 4: log2 and downward rounding + sum >>= 1; + while (sum) { + sum >>= 1; + kernel_shifts[i]++; + } + } + + // Part 2: + // Find the middle of all the values + int tag[32] = {0}; + for (int cnt = 0; cnt < kernel_cnt; cnt++) { + tag[kernel_shifts[cnt]]++; + } + + int rshift_m = 0; + int mid = 0; + do { + mid += tag[rshift_m++]; + } while(mid < (kernel_cnt - 1) >> 1); + + free(kernel_shifts); + + return rshift_m - 1; +} + +static int test_ps32_ut( + conv_param_t &p_param, CVI_RT_HANDLE ctx, bmk_ctx_t *bk_ctx) +{ + printf(" test_ps32_ut\n"); + s8 *input = alloc_input(&p_param); + s8 *weight = alloc_weight(&p_param); + s16 *bias = alloc_bias(&p_param); + p_param.r_shift_m = calc_rshift_m(&p_param, weight); + s8 *output_ref = (s8 *)malloc(sizeof(s8) * conv_output_size(&p_param) * sizeof(int)); + if (!output_ref) + return BM_ERR_FAILURE; + + bmerr_t ret = ps32_m2_conv_ref(&p_param, input, weight, output_ref); + assert(ret == BM_SUCCESS); + + bmk1880v2_tiu_convolution_param_t conv_param; + make_bmk_conv_param_ps32(bk_ctx, &conv_param, &p_param, 2); + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + + if (tl_alloc_success) { + + put_tensor_g2l(&ctx, bk_ctx, conv_param.ifmap, (u8 *)input); + put_conv_weight(&ctx, bk_ctx, conv_param.weight, (u8 *)weight); + bmk1880v2_tiu_convolution(bk_ctx, &conv_param); + + bmk1880v2_tensor_lmem_t ps32_ofmap; + ps32_ofmap = *conv_param.ofmap; + ps32_ofmap.shape.n = ps32_ofmap.shape.n * sizeof(int); + u8 *output = get_tensor_l2g(&ctx, bk_ctx, &ps32_ofmap); + + int has_error = array_cmp_int8( + " Comparing begin_mode results ...\n", + output_ref, (s8 *)output, conv_output_size(&p_param) * sizeof(int)); + + if (has_error) { + print_conv_param(&p_param); + printf(" Comparison FAILED\n"); + exit(-1); + } + + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + free(output); + } + + printf(" test_ps32_intermediate_mode\n"); + for (int i=0; i < conv_input_size(&p_param); i++) + input[i] = rand() % 256 - 128; + + for (int i=0; i < conv_weight_size(&p_param); i++) + weight[i] = rand() % 256 - 128; + + ret = ps32_m3_conv_ref(&p_param, input, weight, output_ref); + assert(ret == BM_SUCCESS); + + make_bmk_conv_param_ps32(bk_ctx, &conv_param, &p_param, 3); + tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + + if (tl_alloc_success) { + put_tensor_g2l(&ctx, bk_ctx, conv_param.ifmap, (u8 *)input); + put_conv_weight(&ctx, bk_ctx, conv_param.weight, (u8 *)weight); + + bmk1880v2_tiu_convolution(bk_ctx, &conv_param); + + bmk1880v2_tensor_lmem_t ps32_ofmap; + ps32_ofmap = *conv_param.ofmap; + ps32_ofmap.shape.n = ps32_ofmap.shape.n * sizeof(int); + + u8 *output = get_tensor_l2g(&ctx, bk_ctx, &ps32_ofmap); + + int has_error = array_cmp_int8( + " Comparing intermediate results ...\n", + output_ref, (s8 *)output, conv_output_size(&p_param) * sizeof(int)); + + if (has_error) { + print_conv_param(&p_param); + printf(" Comparison FAILED\n"); + exit(-1); + } + + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + free(output); + } + + printf(" test_ps32_end_mode\n"); + for (int i=0; i < conv_input_size(&p_param); i++) + input[i] = rand() % 256 - 128; + + for (int i=0; i < conv_weight_size(&p_param); i++) + weight[i] = rand() % 256 - 128; + + ret = ps32_m1_conv_ref(&p_param, input, weight, bias, output_ref); + assert(ret == BM_SUCCESS); + + make_bmk_conv_param_ps32(bk_ctx, &conv_param, &p_param, 1); + + tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + + if 
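+  // End mode (ps32_mode = 1): rshift/relu/bias are applied and the ofmap
+  // is plain s8, so the comparison below uses conv_output_size() without
+  // the sizeof(int) factor needed for the partial-sum modes.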
(tl_alloc_success) { + + put_tensor_g2l(&ctx, bk_ctx, conv_param.ifmap, (u8 *)input); + put_conv_weight(&ctx, bk_ctx, conv_param.weight, (u8 *)weight); + if (p_param.using_bias) { + put_conv_bias(&ctx, bk_ctx, conv_param.bias, bias); + } + bmk1880v2_tiu_convolution(bk_ctx, &conv_param); + u8 *output = get_tensor_l2g(&ctx, bk_ctx, conv_param.ofmap); + + int has_error = array_cmp_int8( + " Comparing end results ...\n", + output_ref, (s8 *)output, conv_output_size(&p_param)); + + if (has_error) { + print_conv_param(&p_param); + printf(" Comparison FAILED\n"); + exit(-1); + } + free(output); + } + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + free(input); + free(weight); + free(bias); + free(output_ref); + + return tl_alloc_success ? 1 : 0; +} + +static int test_ic_tiling_conv( + conv_param_t &p_param, CVI_RT_HANDLE ctx, bmk_ctx_t *bk_ctx) +{ + printf(" test tiled ps32 conv\n"); + s8 *input = alloc_input(&p_param); + s8 *weight = alloc_weight(&p_param); + s16 *bias = alloc_bias(&p_param); + p_param.r_shift_m = calc_rshift_m(&p_param, weight); + s8 *output_ref = (s8 *)malloc(sizeof(s8) * conv_output_size(&p_param)); + if (!output_ref) + return BM_ERR_FAILURE; + + bmerr_t ret = conv_ref(&p_param, input, weight, bias, output_ref); + assert(ret == BM_SUCCESS); + bmk1880v2_tiu_convolution_param_t conv_tmp_param; + bmk1880v2_tiu_convolution_param_t conv_param; + make_bmk_conv_param(bk_ctx, &conv_param, &p_param); + memset(&conv_tmp_param, 0, sizeof(conv_tmp_param)); + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + if (tl_alloc_success) { + if (p_param.using_bias) { + conv_tmp_param.bias = conv_param.bias; + put_conv_bias(&ctx, bk_ctx, conv_param.bias, bias); + neuron_dump ( + " test_ic_tiling_conv: bias", + 1, + conv_param.bias->shape.c, + conv_param.bias->shape.h, + conv_param.bias->shape.w, + (s16 *)bias); + } + // for ps32_md[1] = 1, relu_enable & rshift_bits need to set to 0 + // so we store those parameters to conv_tmp_para + conv_tmp_param.relu_enable = conv_param.relu_enable; + conv_tmp_param.rshift_bits = conv_param.rshift_bits; + conv_tmp_param.bias = conv_param.bias; + + u32 ic_step = 1; + u32 n_step = 1; + tl_t ifmap = *conv_param.ifmap; + tl_t ofmap = *conv_param.ofmap; + tg_shape_t s; + s.n = conv_param.ifmap->shape.n; + s.c = conv_param.ifmap->shape.c; + s.h = conv_param.ifmap->shape.h; + s.w = conv_param.ifmap->shape.w; + tg_t *tg_ifmap = alloc_tg_gmem(&ctx, s, FMT_I8); + put_tg_gmem(&ctx, tg_ifmap, (u8 *)input); + + s.n = conv_param.weight->shape.n; + s.c = conv_param.weight->shape.c; + s.h = conv_param.weight->shape.h; + s.w = conv_param.weight->shape.w; + u8 *transformed_weight = + transform_weight(&conv_param.weight->shape, (u8 *)weight); + tg_t *tg_weight = alloc_tg_gmem(&ctx, s, FMT_I8); + put_tg_gmem(&ctx, tg_weight, (u8 *)transformed_weight); + + neuron_dump ( + " test_ic_tiling_conv: input", + p_param.input_n, + p_param.input_c, + p_param.input_h, + p_param.input_w, + (s8 *)input); + + neuron_dump ( + " test_ic_tiling_conv: kernel", + 1, + conv_param.weight->shape.c, + conv_param.weight->shape.h * conv_param.weight->shape.w, + conv_param.weight->shape.n, + (s8 *)transformed_weight); + free(transformed_weight); + + tl_shape_t cur_tl_ifmap_shape = { + n_step, + ic_step, + ifmap.shape.h, + ifmap.shape.w + }; + + tg_shape_t cur_tg_ifmap_shape = { + cur_tl_ifmap_shape.n, + cur_tl_ifmap_shape.c, + cur_tl_ifmap_shape.h, + cur_tl_ifmap_shape.w + }; + + tg_stride_t cur_tg_ifmap_stride = { + tg_ifmap->stride.n, + tg_ifmap->stride.c, + 
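+    // (n/c/h strides only; w is assumed contiguous in global memory)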
tg_ifmap->stride.h, + }; + + tg_t cur_tg_ifmap; + cur_tg_ifmap.base_reg_index = 0; + cur_tg_ifmap.start_address = tg_ifmap->start_address; + cur_tg_ifmap.shape = cur_tg_ifmap_shape; + cur_tg_ifmap.stride = cur_tg_ifmap_stride; + cur_tg_ifmap.fmt = FMT_I8; + + tl_t cur_tl_ifmap; + cur_tl_ifmap.shape = cur_tl_ifmap_shape; + cur_tl_ifmap.stride = + bmk1880v2_tensor_lmem_default_stride(bk_ctx, cur_tl_ifmap_shape, FMT_I8, 1); + cur_tl_ifmap.start_address = ifmap.start_address; + cur_tl_ifmap.fmt = ifmap.fmt; + + tl_t cur_tl_ofmap; + cur_tl_ofmap.start_address = ofmap.start_address; + cur_tl_ofmap.shape = ofmap.shape; + cur_tl_ofmap.shape.n = n_step; + cur_tl_ofmap.stride = + bmk1880v2_tensor_lmem_default_stride(bk_ctx, cur_tl_ofmap.shape, FMT_I8, 1); + cur_tl_ofmap.fmt = ofmap.fmt; + + tl_t cur_tl_weight; + cur_tl_weight.start_address = conv_param.weight->start_address; + cur_tl_weight.shape = conv_param.weight->shape; + cur_tl_weight.shape.n = ic_step; + cur_tl_weight.stride = { + 1, + cur_tl_weight.shape.n * cur_tl_weight.shape.h * cur_tl_weight.shape.w, + cur_tl_weight.shape.n * cur_tl_weight.shape.w, + cur_tl_weight.shape.n + }; + cur_tl_weight.fmt = conv_param.weight->fmt; + + const tl_t *saved_tl_weight = conv_param.weight; + const tl_t *saved_tl_ifmap = conv_param.ifmap; + for (u32 ci = 0; ci < ifmap.shape.c; ci += ic_step) { + { + u32 ic = tg_weight->shape.n; + u32 oc = tg_weight->shape.c; + u32 kh = tg_weight->shape.h; + u32 kw = tg_weight->shape.w; + + tg_t cur_tdma_tg_weight; + cur_tdma_tg_weight.base_reg_index = tg_weight->base_reg_index; + cur_tdma_tg_weight.start_address = tg_weight->start_address + ci; + cur_tdma_tg_weight.fmt = tg_weight->fmt; + cur_tdma_tg_weight.shape = {1, oc, kh * kw, ic}; + cur_tdma_tg_weight.stride = + bmk1880v2_tensor_tgmem_default_stride(cur_tdma_tg_weight.shape, cur_tdma_tg_weight.fmt); + cur_tdma_tg_weight.shape = {1, oc, kh * kw, ic_step}; + + tl_t cur_tdma_tl_weight; + cur_tdma_tl_weight = cur_tl_weight; + cur_tdma_tl_weight.shape.n = cur_tdma_tg_weight.shape.n; + cur_tdma_tl_weight.shape.c = cur_tdma_tg_weight.shape.c; + cur_tdma_tl_weight.shape.h = cur_tdma_tg_weight.shape.h; + cur_tdma_tl_weight.shape.w = cur_tdma_tg_weight.shape.w; + cur_tdma_tl_weight.stride = bmk1880v2_tensor_lmem_default_stride( + bk_ctx, cur_tdma_tl_weight.shape, FMT_I8, 0); + + bmk1880v2_tdma_tg2l_tensor_copy_param_t p1; + memset(&p1, 0, sizeof(p1)); + p1.src = &cur_tdma_tg_weight; + p1.dst = &cur_tdma_tl_weight; + bmk1880v2_tdma_g2l_tensor_copy(bk_ctx, &p1); + test_submit(&ctx); + } + { + bmk1880v2_tdma_tg2l_tensor_copy_param_t p2; + memset(&p2, 0, sizeof(p2)); + cur_tg_ifmap.start_address = + tg_ifmap->start_address + ci * tg_ifmap->stride.c; + p2.src = &cur_tg_ifmap; + p2.dst = &cur_tl_ifmap; + bmk1880v2_tdma_g2l_tensor_copy(bk_ctx, &p2); + test_submit(&ctx); + } + + conv_param.ifmap = &cur_tl_ifmap; + conv_param.weight = &cur_tl_weight; + + // for ps32_md[1] = 1, relu_enable & rshift_bits need to set to 0 + // so we store those parameters to conv_tmp_para + if (ci == (ifmap.shape.c - 1)) + { + conv_param.relu_enable = conv_tmp_param.relu_enable; + conv_param.rshift_bits = conv_tmp_param.rshift_bits; + conv_param.bias = conv_tmp_param.bias; + conv_param.ps32_mode = 1; + } + else if (ci == 0) + { + conv_param.relu_enable = 0; + conv_param.rshift_bits = 0; + conv_param.bias = 0;; + conv_param.ps32_mode = 2; + } + else + { + conv_param.relu_enable = 0; + conv_param.rshift_bits = 0; + conv_param.bias = 0;; + conv_param.ps32_mode = 3; + } + bmk1880v2_tiu_convolution(bk_ctx, 
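+      // The mode sequence chosen above walks the ic tiles (ic_step = 1):
+      // the first tile opens the 32-bit accumulator (mode 2), middle tiles
+      // accumulate (mode 3), and the last tile finalizes with
+      // bias/relu/rshift (mode 1); e.g. ifmap.shape.c = 4 runs 2, 3, 3, 1.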
&conv_param); + conv_param.weight = saved_tl_weight; + conv_param.ifmap = saved_tl_ifmap; + } + + u8 *output = get_tensor_l2g(&ctx, bk_ctx, conv_param.ofmap); + + free_tg_gmem(&ctx, tg_ifmap); + free_tg_gmem(&ctx, tg_weight); + int has_error = array_cmp_int8( + " Comparing results ...\n", + output_ref, (s8 *)output, conv_output_size(&p_param)); + + if (has_error) { + print_conv_param(&p_param); + printf(" Comparison FAILED\n"); + exit(-1); + } + free(output); + } + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + free(input); + free(weight); + free(output_ref); + free(bias); + + return tl_alloc_success ? 1 : 0; +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + int test_finished_num = 0; + for (int i = 0; i < 5; i++) { + printf("random_test_conv iteration: %d\n", i); + conv_param_t test_conv_param; + init_conv_param(test_conv_param); + test_finished_num += test_ic_tiling_conv(test_conv_param, ctx, bk_ctx); + test_finished_num += test_ps32_ut(test_conv_param, ctx, bk_ctx); + if (!test_conv_param.using_bias) + test_conv_param.using_bias = 1; + if (test_conv_param.output_c <= 9) + test_conv_param.output_c += 3; + test_finished_num += test_ic_tiling_conv(test_conv_param, ctx, bk_ctx); + test_finished_num += test_ps32_ut(test_conv_param, ctx, bk_ctx); + } + printf("test_finished_num: %d\n", test_finished_num); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_conv_qdm.cpp b/cviruntime/test/1880v2/test_1880v2_conv_qdm.cpp new file mode 100644 index 000000000..e17667b2e --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_conv_qdm.cpp @@ -0,0 +1,1645 @@ +#include +#include "1880v2_test_util.h" +#include "test_tf_quant_util.h" + +// #define ENABLE_DEBUG_MSG +// #define ENABLE_FULL_REGRESSION +// #define ENABLE_TV_GEN_PATTERN + +#define MIN_EXEC_TESTS 20 + +typedef struct { + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int stride_w; + int output_c; + int output_h; + int output_w; + int has_bias; + int relu_enable; + s8 *input_data; + s8 *filter_data; + s8 *output_data; + s32 *bias_data; + u32 *multiplier_data; + s8 *shift_data; + float float_multiplier; + int retry_cnt; +} conv_test_param_t; + +inline int Offset(tl_shape_t shape, int n, int c, int h, int w) +{ + return n * (shape.c * shape.h * shape.w) + c * (shape.h * shape.w) + + h * shape.w + w; +} + +void conv_per_channel_ref(conv_test_param_t *p_param) +{ + const int stride_width = p_param->stride_w; + const int stride_height = p_param->stride_h; + const int dilation_width_factor = 1; + const int dilation_height_factor = 1; + const int pad_width = p_param->pad_left; + const int pad_height = p_param->pad_top; + + const s32 output_activation_min = -128; + const s32 output_activation_max = 127; + + const int batches = p_param->input_n; + const int input_depth = p_param->input_c; + const int output_depth = p_param->output_c; + + const int input_height = p_param->input_h; + const int input_width = p_param->input_w; + const int filter_height = p_param->kh; + const int filter_width = p_param->kw; + const int output_height = p_param->output_h; + const int output_width = p_param->output_w; + s8 *input_data = p_param->input_data; + s8 *filter_data = p_param->filter_data; + s8 *output_data = p_param->output_data; + s32 *bias_data = p_param->has_bias ? 
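+  // Reference pipeline per output element: acc = sum(w * x) (+ bias),
+  // then a per-channel requantize via MultiplyByQuantizedMultiplier(),
+  // then clamp to [-128, 127]. Assuming the TFLite-style semantics of
+  // test_tf_quant_util.h, acc = 1000 with multiplier 2^30 (0.5 in Q31)
+  // and right shift 1 yields roughly round(1000 * 0.5) >> 1 = 250.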
p_param->bias_data : nullptr; + u32 *output_multiplier = p_param->multiplier_data; + s8 *output_rshift = p_param->shift_data; + + tl_shape_t input_shape = { + static_cast(batches), static_cast(input_depth), + static_cast(input_height), static_cast(input_width)}; + tl_shape_t filter_shape = { + static_cast(output_depth), static_cast(filter_height), + static_cast(filter_width), static_cast(input_depth)}; + tl_shape_t output_shape = { + static_cast(batches), static_cast(output_depth), + static_cast(output_height), static_cast(output_width)}; + +#ifdef ENABLE_DEBUG_MSG + printf("conv_per_channel_ref: \n" + " input (n=%d, ic=%d, h=%d, w=%d)\n" + " kernel (oc=%d, kh=%d, kw=%d, ic=%d)\n", + batches, input_depth, input_height, input_width, output_depth, + filter_height, filter_width, input_depth); +#endif + + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + s32 acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + if (is_point_inside_image) { + s32 input_val = input_data[Offset(input_shape, batch, + in_channel, in_y, in_x)]; + // s32 filter_val = filter_data[Offset(filter_shape, + // out_channel, in_channel, + // filter_y, filter_x)]; + s32 filter_val = + filter_data[Offset(filter_shape, out_channel, filter_y, + filter_x, in_channel)]; + acc += filter_val * input_val; + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d]" + "[filter_y=%d][filter_x=%d][in_channel=%d] acc(%d) += " + "%d * %d = %d\n", + batch, out_channel, out_y, out_x, filter_y, filter_x, + in_channel, acc - filter_val * input_val, filter_val, + input_val, acc); +#endif + } + } + } + } + + if (bias_data) { + acc += bias_data[out_channel]; + } + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d] acc = %d, " + "bias %d\n", + batch, out_channel, out_y, out_x, acc, + bias_data ? 
bias_data[out_channel] : 0); +#endif + + acc = MultiplyByQuantizedMultiplier( + acc, output_multiplier[out_channel], output_rshift[out_channel]); + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d] acc = %d, " + "multiplier %d, shift %d\n", + batch, out_channel, out_y, out_x, acc, + output_multiplier[out_channel], output_rshift[out_channel]); +#endif + + acc = MAX(acc, output_activation_min); + acc = MIN(acc, output_activation_max); + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d] acc = %d\n", + batch, out_channel, out_y, out_x, acc); +#endif + + output_data[Offset(output_shape, batch, out_channel, out_y, out_x)] = + static_cast(acc); + } + } + } + } +} + +void calc_conv_float_multiplier(conv_test_param_t *p_param) +{ + const int stride_width = p_param->stride_w; + const int stride_height = p_param->stride_h; + const int dilation_width_factor = 1; + const int dilation_height_factor = 1; + const int pad_width = p_param->pad_left; + const int pad_height = p_param->pad_top; + + const int batches = p_param->input_n; + const int input_depth = p_param->input_c; + const int output_depth = p_param->output_c; + + const int input_height = p_param->input_h; + const int input_width = p_param->input_w; + const int filter_height = p_param->kh; + const int filter_width = p_param->kw; + const int output_height = p_param->output_h; + const int output_width = p_param->output_w; + s8 *input_data = p_param->input_data; + s8 *filter_data = p_param->filter_data; + s32 *bias_data = p_param->has_bias ? p_param->bias_data : nullptr; + + tl_shape_t input_shape = { + static_cast(batches), static_cast(input_depth), + static_cast(input_height), static_cast(input_width)}; + tl_shape_t filter_shape = { + static_cast(output_depth), static_cast(filter_height), + static_cast(filter_width), static_cast(input_depth)}; + + int output_accu_min = INT_MAX; + int output_accu_max = INT_MIN; + +#ifdef ENABLE_DEBUG_MSG + printf("calc_conv_float_multiplier =>\n"); +#endif + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + s32 acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + if (is_point_inside_image) { + s32 input_val = input_data[Offset(input_shape, batch, + in_channel, in_y, in_x)]; + // s32 filter_val = filter_data[Offset(filter_shape, + // out_channel, in_channel, + // filter_y, filter_x)]; + s32 filter_val = + filter_data[Offset(filter_shape, out_channel, filter_y, + filter_x, in_channel)]; + acc += filter_val * input_val; + + // printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d]" + // "[filter_y=%d][filter_x=%d][in_channel=%d] acc(%d) + // += %d * %d = %d\n", batch, out_channel, out_y, + // out_x, filter_y, filter_x, in_channel, acc - + // filter_val * input_val, filter_val, input_val, acc); + } + } + } + } + + if (bias_data) { + acc += 
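+          // (bias is included so the min/max scan below reflects the full
+          // accumulator range the multiplier must squeeze into int8)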
bias_data[out_channel]; + } + + output_accu_max = MAX(acc, output_accu_max); + output_accu_min = MIN(acc, output_accu_min); + } + } + } + } + + // Since int8 ranges from -128 to 127, we need to squeeze the accumulator + // MIN/MAX fit in those ranges correspondingly as much as possible. + if (abs(output_accu_max) > abs(output_accu_min)) { + p_param->float_multiplier = 127.0f / abs(output_accu_max); + } else { + p_param->float_multiplier = 128.0f / abs(output_accu_min); + } + +#ifdef ENABLE_DEBUG_MSG + printf(" output_accu_min %d, output_accu_max %d, output_multiplier %f\n", + output_accu_min, output_accu_max, p_param->float_multiplier); +#endif + +#ifdef ENABLE_DEBUG_MSG + printf("<= calc_dw_conv_float_multiplier\n"); +#endif +} + +int simple_test(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk) +{ + int ret = 0; + + const int batches = 1; + const int input_depth = 2; + const int input_height = 2; + const int input_width = 3; + tl_shape_t input_shape = {batches, input_depth, input_height, input_width}; + s8 input_data[12] = { + 9, 1, -11, // ic = 0, h = 0 + 13, 5, -15, // ic = 0, h = 1 + 5, -7, -15, // ic = 1, h = 0 + 9, -11, -19 // ic = 1, h = 1 + }; + + const int output_depth = 2; + const int kernel_height = 2; + const int kernel_width = 2; + tl_shape_t filter_shape = {output_depth, input_depth, kernel_height, + kernel_width}; + + // TIU weight layout (1, oc, hw*kc, ic) + tl_shape_t filter_shape_for_dma = {1, output_depth, + kernel_height * kernel_width, input_depth}; + s8 filter_data_for_dma[16] = { + 2, 4, 6, 8, 6, 8, 10, 12, // oc = 0 + 28, 32, 20, 24, 12, 16, 4, 8 // oc = 1 + }; + + s32 bias_data[2] = {12, -16}; + + const int output_height = 1; + const int output_width = 2; + tl_shape_t output_shape = {1, output_depth, output_height, output_width}; + // zero_point = 0 + s8 ref_output_data[4] = { + 17, -128, // oc = 0 + 60, -128, // oc = 1 + }; + + u32 output_multiplier[] = {1073741824, 1073741824}; + s8 output_rshift[2] = {1, 2}; // changed to right shift + + s8 output_data[4]; + + conv_test_param_t params; + memset(¶ms, 0, sizeof(params)); + + params.input_n = batches; + params.input_c = input_depth; + params.input_h = input_height; + params.input_w = input_width; + params.kh = kernel_height; + params.kw = kernel_width; + params.output_c = output_depth; + params.output_h = output_height; + params.output_w = output_width; + params.stride_w = 1; + params.stride_h = 1; + params.input_data = input_data; + params.filter_data = filter_data_for_dma; + params.output_data = output_data; + params.has_bias = 1; + params.bias_data = bias_data; + params.multiplier_data = output_multiplier; + params.shift_data = output_rshift; + conv_per_channel_ref(¶ms); + + printf("Compare ref and golden\n"); + for (int i = 0; i < 4; i++) { + if (output_data[i] != ref_output_data[i]) { + printf("Error ! 
output[%d]=%d != ref_output_data[%d]=%d\n", i, + output_data[i], i, ref_output_data[i]); + ret = -1; + } + } + + // tl_shape_t per_channel_cal_shape = {1, /*oc=*/2, 1, 9}; + u8 per_channel_cal_data[18]; + pack_chl_quan_param(2, /*has_bias=*/true, bias_data, output_multiplier, + output_rshift, per_channel_cal_data); + + bmk1880v2_tensor_lmem_t *tl_per_channel_cal = + bmk1880v2_lmem_alloc_tensor(bmk, {1, 2, 1, 9}, FMT_U8, + /*eu_align*/ 0); + + bmk1880v2_tensor_lmem_t *tl_input = + bmk1880v2_lmem_alloc_tensor(bmk, input_shape, FMT_I8, /*eu_aign=*/1); + + bmk1880v2_tensor_lmem_t *tl_filter = bmk1880v2_lmem_alloc_tensor( + bmk, filter_shape_for_dma, FMT_I8, /*eu_align=*/1); + + bmk1880v2_tensor_lmem_t *tl_output = + bmk1880v2_lmem_alloc_tensor(bmk, output_shape, FMT_I8, /*eu_align=*/1); + + put_tensor_g2l(ctx, bmk, tl_per_channel_cal, per_channel_cal_data); + put_tensor_g2l(ctx, bmk, tl_input, reinterpret_cast(input_data)); + put_tensor_g2l(ctx, bmk, tl_filter, + reinterpret_cast(filter_data_for_dma)); + + // Restore filter shape for tiu operation + tl_filter->shape = filter_shape; + tl_filter->stride = bmk1880v2_tensor_lmem_default_stride( + bmk, tl_filter->shape, FMT_I8, /*eu_align=*/1); + + { + // Reshape per channel quantization data + tl_per_channel_cal->shape = {1, 2, 1, 1}; + tl_per_channel_cal->stride = bmk1880v2_tensor_lmem_default_stride( + bmk, tl_per_channel_cal->shape, FMT_I8, /*eu_align=*/0); + + bmk1880v2_tiu_convolution_qdm_param_t param; + memset(¶m, 0, sizeof(param)); + param.ofmap = tl_output; + param.ifmap = tl_input; + param.weight = tl_filter; + param.chl_quan_param = tl_per_channel_cal; + param.stride_h = 1; + param.stride_w = 1; + param.dilation_h = 1; + param.dilation_w = 1; + param.has_bias = 1; + bmk1880v2_tiu_convolution_qdm(bmk, ¶m); + } + + test_submit(ctx); + + printf("Compare tiu and golden\n"); + s8 *conv_output_data = + reinterpret_cast(get_tensor_l2g(ctx, bmk, tl_output)); + for (int i = 0; i < static_cast(sizeof(ref_output_data)); i++) { + if (conv_output_data[i] != ref_output_data[i]) { + printf("output_data[%d] %d != %d\n", i, conv_output_data[i], + ref_output_data[i]); + ret = -1; + } + } + + free(conv_output_data); + + // Reverse order + bmk1880v2_lmem_free_tensor(bmk, tl_output); + bmk1880v2_lmem_free_tensor(bmk, tl_filter); + bmk1880v2_lmem_free_tensor(bmk, tl_input); + bmk1880v2_lmem_free_tensor(bmk, tl_per_channel_cal); + + return ret; +} + +void fill_random_data_s8(s8 *input_data, int size) +{ + for (int i = 0; i < size; ++i) { + int is_satured = ((rand() % 1000) == 1) ? 1 : 0; + int is_sign = rand() % 2 ? 1 : -1; + + if (is_satured && is_sign) { + input_data[i] = -128; + } else if (is_satured) { + input_data[i] = 127; + } else { + input_data[i] = is_sign * rand() % 128; + } + } +} + +void fill_random_data_s32(s32 *input_data, int size) +{ + for (int i = 0; i < size; ++i) { + int is_satured = ((rand() % 1000) == 1) ? 1 : 0; + int is_sign = rand() % 2 ? 
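+    // note: is_sign is 1 or -1, both truthy, so "is_satured && is_sign"
+    // below always picks the negative extreme; the INT_MAX arm is
+    // effectively dead (fill_random_data_s8 above behaves the same way).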
1 : -1; + + if (is_satured && is_sign) { + input_data[i] = INT_MIN; + } else if (is_satured) { + input_data[i] = INT_MAX; + } else { + input_data[i] = is_sign * rand() % 128; + } + } +} + +bool check_valid_test_param(bmk_ctx_t *bk_ctx, conv_test_param_t *p_param) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int oh = p_param->output_h; + int ow = p_param->output_w; + int kh = p_param->kh; + int kw = p_param->kw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int per_chan_cal_data_size = + p_param->has_bias ? 9 : 5; // bias(4) + multiplier(4) + shift(1) + + // Skip invalid shape + if ((kh > ih) || (kw > iw) || (stride_h > ih) || (stride_w > iw)) { + return false; + } + + // muliply random-choosen value may exceeded than s32 + u32 input_size = in * ic * ih * iw; + u32 kernel_size = oc * ic * kh * kw; + u32 output_size = in * oc * oh * ow; + + bmk1880v2_chip_info_t chip_info = bmk1880v2_chip_info(); + u32 lmem_size_per_lane = chip_info.lmem_size; + u32 total_lmem_size = chip_info.lmem_size * chip_info.npu_num; + + u32 total_needed_size = input_size + kernel_size + output_size + + per_chan_cal_data_size * chip_info.npu_num; + if (total_needed_size > total_lmem_size) { + return false; + } + + tl_shape_t input_shape = {static_cast(in), static_cast(ic), + static_cast(ih), static_cast(iw)}; + tl_shape_t filter_shape = {1, static_cast(oc), + static_cast(kh) * static_cast(kw), + static_cast(ic)}; + tl_shape_t output_shape = {static_cast(in), static_cast(oc), + static_cast(oh), static_cast(ow)}; + tl_shape_t cal_shape = {1, static_cast(oc), 1, + static_cast(per_chan_cal_data_size)}; + + u32 needed_size = + bmk1880v2_lmem_tensor_to_size(bk_ctx, input_shape, FMT_I8, /*eu_align=*/1) + + bmk1880v2_lmem_tensor_to_size(bk_ctx, filter_shape, FMT_I8, /*eu_align=*/0) + + bmk1880v2_lmem_tensor_to_size(bk_ctx, output_shape, FMT_I8, /*eu_align=*/1) + + bmk1880v2_lmem_tensor_to_size(bk_ctx, cal_shape, FMT_I8, /*eu_align=*/0); + + // Skip invalid shape + if (needed_size > lmem_size_per_lane) { + return false; + } + + return true; +} + +int choose_from_range(int table[], int size, int index) +{ + if (index >= size) { + return 0; + } + + int val = table[index]; + if (index < (size - 1)) { + int range = MAX(table[index + 1] - table[index] - 1, 1); + val += rand() % range; + } + + return val; +} + +void dump_test_param(conv_test_param_t *p_param, bool dump_content) +{ + printf("Dump test parameter:\n"); + printf(" input_n %d\n", p_param->input_n); + printf(" input_c %d\n", p_param->input_c); + printf(" input_h %d\n", p_param->input_h); + printf(" input_w %d\n", p_param->input_w); + printf(" kw %d\n", p_param->kw); + printf(" kh %d\n", p_param->kh); + printf(" dh %d\n", p_param->dh); + printf(" dw %d\n", p_param->dw); + printf(" pad_top %d\n", p_param->pad_top); + printf(" pad_bot %d\n", p_param->pad_bot); + printf(" pad_left %d\n", p_param->pad_left); + printf(" pad_right %d\n", p_param->pad_right); + printf(" ins_h %d\n", p_param->ins_h); + printf(" ins_h_last %d\n", p_param->ins_h_last); + printf(" ins_w %d\n", p_param->ins_w); + printf(" ins_w_last %d\n", p_param->ins_w_last); + printf(" stride_h %d\n", p_param->stride_h); + printf(" stride_w %d\n", p_param->stride_w); + printf(" output_c %d\n", p_param->output_c); + printf(" output_h %d\n", p_param->output_h); + printf(" output_w %d\n", p_param->output_w); + printf(" has_bias %d\n", p_param->has_bias); + printf(" relu_enable %d\n", 
p_param->relu_enable);
+
+  if (dump_content) {
+    printf("input_data(%d, %d, %d, %d) :\n", p_param->input_n, p_param->input_c,
+           p_param->input_h, p_param->input_w);
+    int in = p_param->input_n;
+    int ic = p_param->input_c;
+    int ih = p_param->input_h;
+    int iw = p_param->input_w;
+    for (int i = 0; i < in; ++i) {
+      for (int j = 0; j < ic; ++j) {
+        for (int k = 0; k < ih; ++k) {
+          for (int l = 0; l < iw; ++l) {
+            int offset = i * (ic * ih * iw) + j * (ih * iw) + k * iw + l;
+            printf("%d, ", p_param->input_data[offset]);
+          }
+          printf("\n");
+        }
+      }
+    }
+    printf("\n\n");
+
+    printf("kernel_data (oc=%d, kh=%d, kw=%d, ic=%d)\n", p_param->output_c,
+           p_param->kh, p_param->kw, p_param->input_c);
+    int oc = p_param->output_c;
+    int kh = p_param->kh;
+    int kw = p_param->kw;
+    for (int i = 0; i < oc; ++i) {
+      for (int j = 0; j < kh; ++j) {
+        for (int k = 0; k < kw; ++k) {
+          for (int l = 0; l < ic; ++l) {
+            int offset = i * (kh * kw * ic) + j * (kw * ic) + k * ic + l;
+            printf("%d, ", p_param->filter_data[offset]);
+          }
+          printf("\n");
+        }
+      }
+    }
+    printf("\n\n");
+
+    if (p_param->has_bias) {
+      printf("bias_data:\n");
+      for (int i = 0; i < oc; ++i) {
+        printf("%d, ", p_param->bias_data[i]);
+      }
+      printf("\n\n");
+    }
+
+    printf("multiplier_data:\n");
+    for (int i = 0; i < oc; ++i) {
+      printf("%d, ", p_param->multiplier_data[i]);
+    }
+    printf("\n\n");
+
+    printf("shift_data:\n");
+    for (int i = 0; i < oc; ++i) {
+      printf("%d, ", p_param->shift_data[i]);
+    }
+    printf("\n\n");
+  }
+}
+
+static conv_test_param_t keepFailParam;
+static s8 *keep_input_data = NULL;
+
+static int keep_kernel_size = 0;
+static s8 *keep_kernel_data = NULL;
+
+static int keep_output_size = 0;
+static s8 *keep_output_data = NULL;
+
+static s32 *keep_bias_data = NULL;
+static u32 *keep_multiplier_data = NULL;
+static s8 *keep_shift_data = NULL;
+
+int keep_fail_param(conv_test_param_t *p_param)
+{
+  int in = p_param->input_n;
+  int ic = p_param->input_c;
+  int ih = p_param->input_h;
+  int iw = p_param->input_w;
+  int oc = p_param->output_c;
+  int oh = p_param->output_h;
+  int ow = p_param->output_w;
+  int kh = p_param->kh;
+  int kw = p_param->kw;
+  int has_bias = p_param->has_bias;
+
+  memcpy(&keepFailParam, p_param, sizeof(conv_test_param_t));
+
+  int input_size = in * ic * iw * ih;
+  keep_input_data = (s8 *)malloc(input_size);
+  memcpy(keep_input_data, p_param->input_data, input_size);
+
+  keep_kernel_size = oc * ic * kh * kw;
+  keep_kernel_data = (s8 *)malloc(keep_kernel_size);
+  memcpy(keep_kernel_data, p_param->filter_data, keep_kernel_size);
+
+  keep_output_size = in * oc * oh * ow;
+  keep_output_data = (s8 *)malloc(keep_output_size);
+  memcpy(keep_output_data, p_param->output_data, keep_output_size);
+
+  keep_bias_data = (s32 *)malloc(sizeof(s32) * oc);
+  memcpy(keep_bias_data, p_param->bias_data, sizeof(s32) * oc);
+
+  keep_multiplier_data = (u32 *)malloc(sizeof(u32) * oc);
+  memcpy(keep_multiplier_data, p_param->multiplier_data, sizeof(u32) * oc);
+
+  keep_shift_data = (s8 *)malloc(oc);
+  memcpy(keep_shift_data, p_param->shift_data, oc);
+
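+  // Re-point the saved copy at the deep-copied buffers so that
+  // run2_compare_conv() can replay the failing configuration after the
+  // caller's buffers are freed.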
keepFailParam.input_data = keep_input_data; + keepFailParam.filter_data = keep_kernel_data; + keepFailParam.output_data = keep_output_data; + keepFailParam.has_bias = has_bias; + keepFailParam.bias_data = keep_bias_data; + keepFailParam.multiplier_data = keep_multiplier_data; + keepFailParam.shift_data = keep_shift_data; + + return 0; +} + + +void dump2_test_param(conv_test_param_t *p_param) +{ + printf("dump2_test_param:\n"); + printf(" input_n %d\n", p_param->input_n); + printf(" input_c %d\n", p_param->input_c); + printf(" input_h %d\n", p_param->input_h); + printf(" input_w %d\n", p_param->input_w); + printf(" kw %d\n", p_param->kw); + printf(" kh %d\n", p_param->kh); + printf(" dh %d\n", p_param->dh); + printf(" dw %d\n", p_param->dw); + printf(" pad_top %d\n", p_param->pad_top); + printf(" pad_bot %d\n", p_param->pad_bot); + printf(" pad_left %d\n", p_param->pad_left); + printf(" pad_right %d\n", p_param->pad_right); + printf(" ins_h %d\n", p_param->ins_h); + printf(" ins_h_last %d\n", p_param->ins_h_last); + printf(" ins_w %d\n", p_param->ins_w); + printf(" ins_w_last %d\n", p_param->ins_w_last); + printf(" stride_h %d\n", p_param->stride_h); + printf(" stride_w %d\n", p_param->stride_w); + printf(" output_c %d\n", p_param->output_c); + printf(" output_h %d\n", p_param->output_h); + printf(" output_w %d\n", p_param->output_w); + printf(" has_bias %d\n", p_param->has_bias); + printf(" relu_enable %d\n", p_param->relu_enable); + + keep_fail_param(p_param); + printf("dump2_test_param\n\n"); + assert(0); +} + +int run_compare_conv(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, + conv_test_param_t *p_param) +{ + int ret = 0; + + if (ctx == nullptr || bk_ctx == nullptr) { + return -1; + } + + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int oh = p_param->output_h; + int ow = p_param->output_w; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_last_h = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_last_w = p_param->ins_w_last; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int has_bias = p_param->has_bias; + int relu_enable = p_param->relu_enable; + + int input_size = in * ic * iw * ih; + s8 *input_data = (s8 *)malloc(input_size); + + int kernel_size = oc * ic * kh * kw; + s8 *kernel_data = (s8 *)malloc(kernel_size); + + int output_size = in * oc * oh * ow; + s8 *output_data = (s8 *)malloc(output_size); + if (!input_data || !kernel_data || !output_data) { + free(input_data); + free(kernel_data); + free(output_data); + return -1; + } + + memset(output_data, 0, output_size); + + s32 *bias_data = (s32 *) malloc(sizeof(s32) * oc); + u32 *multiplier_data = (u32 *) malloc(sizeof(u32) * oc); + s8 *shift_data = (s8 *)malloc(oc); + + p_param->input_data = input_data; + p_param->filter_data = kernel_data; + p_param->output_data = output_data; + p_param->has_bias = has_bias; + p_param->bias_data = bias_data; + p_param->multiplier_data = multiplier_data; + p_param->shift_data = shift_data; + +#ifdef ENABLE_DEBUG_MSG + printf(" run_compare_conv =>\n"); + printf(" input (n=%d, ic=%d, h=%d, w=%d), kernel (oc=%d, ic=%d, h=%d, " + "w=%d), output (, c=%d, h=%d, w=%d), has_bias %d\n", + in, ic, ih, iw, oc, ic, kh, kw, oc, oh, ow, has_bias); +#endif + + 
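+  // Regenerate random data until calibration yields a float multiplier in
+  // (0, 1): the hardware path only encodes multipliers below 1.0 paired
+  // with a right shift (see QuantizeMultiplierSmallerThanOne below).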
int retry_cnt = p_param->retry_cnt; + do { + fill_random_data_s8(input_data, input_size); + fill_random_data_s8(kernel_data, kernel_size); + if (has_bias) { + fill_random_data_s32(bias_data, oc); + } + + p_param->float_multiplier = 100.0; // should be < 1.0 + calc_conv_float_multiplier(p_param); + + if (p_param->float_multiplier > 0.f && p_param->float_multiplier < 1.0) { + break; + } + + } while (--retry_cnt); + + if (p_param->float_multiplier >= 1.0) { + printf(" run_compare_dw_conv: unable to find valid multiplier\n"); + free(input_data); + free(kernel_data); + free(output_data); + free(bias_data); + free(multiplier_data); + free(shift_data); + + return -1; + } + + u32 base_multiplier = 0; + int base_shift = 0; + QuantizeMultiplierSmallerThanOne(p_param->float_multiplier, &base_multiplier, + &base_shift); + + for (int i = 0; i < oc; ++i) { + // multipliers typically range in [2^30 ; 2^31 - 1]. + // Values in [0, 2^30 - 1] are normally unused, but harmless. + // Thus a good way to randomize multipliers is to subtract from them + // a random value smaller than 2^30 but still significant compared to it. + p_param->multiplier_data[i] = base_multiplier - (rand() % (1 << 26)); + + // Our H/W only supports right shift + int right_shift = base_shift - 1 + (rand() % 4); + p_param->shift_data[i] = right_shift > 0 ? right_shift : 0; + +#ifdef ENABLE_DEBUG_MSG + printf(" [oc=%d] multiplier_data %d, shift_data %d\n", i, + p_param->multiplier_data[i], p_param->shift_data[i]); +#endif + } + + conv_per_channel_ref(p_param); + + const int per_chan_cal_data_size = + p_param->has_bias ? 9 : 5; // bias(4) + multiplier(4) + shift(1) + const int cal_data_size = oc * per_chan_cal_data_size; + u8 *cal_data = (u8 *) malloc(cal_data_size); + pack_chl_quan_param(oc, p_param->has_bias, p_param->bias_data, + p_param->multiplier_data, p_param->shift_data, + cal_data); + + tl_shape_t input_shape = {static_cast(in), static_cast(ic), + static_cast(ih), static_cast(iw)}; + tl_shape_t filter_shape = {1, static_cast(oc), + static_cast(kh) * static_cast(kw), + static_cast(ic)}; + tl_shape_t output_shape = {static_cast(in), static_cast(oc), + static_cast(oh), static_cast(ow)}; + tl_shape_t cal_shape = {1, static_cast(oc), 1, + static_cast(per_chan_cal_data_size)}; + + bmk1880v2_tensor_lmem_t *tl_input = + bmk1880v2_lmem_alloc_tensor(bk_ctx, input_shape, FMT_I8, /*eu_aign=*/1); + + bmk1880v2_tensor_lmem_t *tl_filter = + bmk1880v2_lmem_alloc_tensor(bk_ctx, filter_shape, FMT_I8, /*eu_align=*/0); + + bmk1880v2_tensor_lmem_t *tl_output = + bmk1880v2_lmem_alloc_tensor(bk_ctx, output_shape, FMT_I8, /*eu_align=*/1); + + // Shape for TDMA load + bmk1880v2_tensor_lmem_t *tl_cal_data = + bmk1880v2_lmem_alloc_tensor(bk_ctx, cal_shape, FMT_U8, /*eu_align*/ 0); + + if (!tl_input || !tl_filter || !tl_output || !tl_cal_data) { + if (tl_input == nullptr) { + printf(" fail to alloc tl_input (%d, %d, %d, %d)\n", input_shape.n, + input_shape.c, input_shape.h, input_shape.w); + } + if (tl_filter == nullptr) { + printf(" fail to alloc tl_filter (%d, %d, %d, %d)\n", filter_shape.n, + filter_shape.c, filter_shape.h, filter_shape.w); + } + if (tl_output == nullptr) { + printf(" fail to alloc tl_output (%d, %d, %d, %d)\n", output_shape.n, + output_shape.c, output_shape.h, output_shape.w); + } + if (tl_cal_data == nullptr) { + printf(" fail to alloc tl_cal_data (%d, %d ,%d, %d)\n", cal_shape.n, + cal_shape.c, cal_shape.h, cal_shape.w); + } + + // Reverse order + if (tl_cal_data) + bmk1880v2_lmem_free_tensor(bk_ctx, tl_cal_data); + if (tl_output) + 
bmk1880v2_lmem_free_tensor(bk_ctx, tl_output); + if (tl_filter) + bmk1880v2_lmem_free_tensor(bk_ctx, tl_filter); + if (tl_input) + bmk1880v2_lmem_free_tensor(bk_ctx, tl_input); + + return -1; + } + + put_tensor_g2l(ctx, bk_ctx, tl_cal_data, cal_data); + put_tensor_g2l(ctx, bk_ctx, tl_input, reinterpret_cast(input_data)); + put_tensor_g2l(ctx, bk_ctx, tl_filter, reinterpret_cast(kernel_data)); + + { + // Reshape per channel quantization data for TIU + tl_cal_data->shape = {1, static_cast(oc), 1, 1}; + tl_cal_data->stride = bmk1880v2_tensor_lmem_default_stride( + bk_ctx, tl_cal_data->shape, FMT_I8, /*eu_align=*/0); + + // Reshape weight for TIU + tl_filter->shape = {static_cast(ic), static_cast(oc), + static_cast(kh), static_cast(kw)}; + + bmk1880v2_tiu_convolution_qdm_param_t param; + memset(¶m, 0, sizeof(param)); + param.ofmap = tl_output; + param.ifmap = tl_input; + param.weight = tl_filter; + param.chl_quan_param = tl_cal_data; + param.ins_h = ins_h; + param.ins_last_h = ins_last_h; + param.ins_w = ins_w; + param.ins_last_w = ins_last_w; + param.stride_h = stride_h; + param.stride_w = stride_w; + param.dilation_h = dh; + param.dilation_w = dw; + param.pad_top = pad_top; + param.pad_bottom = pad_bot; + param.pad_left = pad_left; + param.pad_right = pad_right; + param.has_bias = has_bias; + param.relu_enable = relu_enable; + +#ifdef ENABLE_DEBUG_MSG + printf(" tiu_conv_qdm:\n"); + printf(" ifmap shape (%d, %d, %d, %d)\n", param.ifmap->shape.n, + param.ifmap->shape.c, param.ifmap->shape.h, param.ifmap->shape.w); + printf(" weight shape (%d, %d, %d, %d)\n", param.weight->shape.n, + param.weight->shape.c, param.weight->shape.h, param.weight->shape.w); + printf(" ofmap shape (%d, %d, %d, %d)\n", param.ofmap->shape.n, + param.ofmap->shape.c, param.ofmap->shape.h, param.ofmap->shape.w); +#endif + + bmk1880v2_tiu_convolution_qdm(bk_ctx, ¶m); + } + + test_submit(ctx); + +#ifdef ENABLE_DEBUG_MSG + printf(" compare result:\n"); +#endif + s8 *conv_output_data = + reinterpret_cast(get_tensor_l2g(ctx, bk_ctx, tl_output)); + for (int i = 0; i < in; ++i) { + for (int j = 0; j < oc; ++j) { + for (int k = 0; k < oh; ++k) { + for (int l = 0; l < ow; ++l) { + int offset = i * (oc * oh * ow) + j * (oh * ow) + k * ow + l; + if (conv_output_data[offset] != output_data[offset]) { + printf(" [ni=%d][oci=%d][ohi=%d][owi=%d] output %d(tiu) != " + "%d(ref)\n", + i, j, k, l, conv_output_data[offset], output_data[offset]); + ret = -1; + break; + } + } + } + } + } + + if (ret) { + //dump_test_param(p_param, /*dump_content=*/true); + dump2_test_param(p_param); + } + + // Reverse order + bmk1880v2_lmem_free_tensor(bk_ctx, tl_cal_data); + bmk1880v2_lmem_free_tensor(bk_ctx, tl_output); + bmk1880v2_lmem_free_tensor(bk_ctx, tl_filter); + bmk1880v2_lmem_free_tensor(bk_ctx, tl_input); + + free(conv_output_data); + + free(input_data); + free(kernel_data); + free(output_data); + free(bias_data); + free(multiplier_data); + free(shift_data); + free(cal_data); + +#ifdef ENABLE_DEBUG_MSG + printf(" <= run_compare_conv\n"); +#endif + + return ret; +} + + + + +int run2_compare_conv(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx) +{ + int ret = 0; + + if (ctx == nullptr || bk_ctx == nullptr) { + return -1; + } + + conv_test_param_t *p_param = &keepFailParam; + + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int oh = p_param->output_h; + int ow = p_param->output_w; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = 
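+  // Replay path: p_param aliases keepFailParam captured by
+  // keep_fail_param(). Fresh buffers are allocated below, but the saved
+  // pointer fields are deliberately left untouched (the reassignments are
+  // commented out).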
p_param->dw; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_last_h = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_last_w = p_param->ins_w_last; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int has_bias = p_param->has_bias; + int relu_enable = p_param->relu_enable; + + int input_size = in * ic * iw * ih; + s8 *input_data = (s8 *)malloc(input_size); + + int kernel_size = oc * ic * kh * kw; + s8 *kernel_data = (s8 *)malloc(kernel_size); + + int output_size = in * oc * oh * ow; + s8 *output_data = (s8 *)malloc(output_size); + if (!input_data || !kernel_data || !output_data) { + free(input_data); + free(kernel_data); + free(output_data); + return -1; + } + + memset(output_data, 0, output_size); + + s32 *bias_data = (s32 *) malloc(sizeof(s32) * oc); + u32 *multiplier_data = (u32 *) malloc(sizeof(u32) * oc); + s8 *shift_data = (s8 *)malloc(oc); + + //p_param->input_data = input_data; + //p_param->filter_data = kernel_data; + //p_param->output_data = output_data; + //p_param->has_bias = has_bias; + //p_param->bias_data = bias_data; + //p_param->multiplier_data = multiplier_data; + //p_param->shift_data = shift_data; + +#ifdef ENABLE_DEBUG_MSG + printf(" run_compare_conv =>\n"); + printf(" input (n=%d, ic=%d, h=%d, w=%d), kernel (oc=%d, ic=%d, h=%d, " + "w=%d), output (, c=%d, h=%d, w=%d), has_bias %d\n", + in, ic, ih, iw, oc, ic, kh, kw, oc, oh, ow, has_bias); +#endif + + int retry_cnt = p_param->retry_cnt; + do { + fill_random_data_s8(input_data, input_size); + fill_random_data_s8(kernel_data, kernel_size); + if (has_bias) { + fill_random_data_s32(bias_data, oc); + } + + p_param->float_multiplier = 100.0; // should be < 1.0 + calc_conv_float_multiplier(p_param); + + if (p_param->float_multiplier > 0.f && p_param->float_multiplier < 1.0) { + break; + } + + } while (--retry_cnt); + + if (p_param->float_multiplier >= 1.0) { + printf(" run_compare_dw_conv: unable to find valid multiplier\n"); + free(input_data); + free(kernel_data); + free(output_data); + free(bias_data); + free(multiplier_data); + free(shift_data); + + return -1; + } + + u32 base_multiplier = 0; + int base_shift = 0; + QuantizeMultiplierSmallerThanOne(p_param->float_multiplier, &base_multiplier, + &base_shift); + + for (int i = 0; i < oc; ++i) { + // multipliers typically range in [2^30 ; 2^31 - 1]. + // Values in [0, 2^30 - 1] are normally unused, but harmless. + // Thus a good way to randomize multipliers is to subtract from them + // a random value smaller than 2^30 but still significant compared to it. + p_param->multiplier_data[i] = base_multiplier - (rand() % (1 << 26)); + + // Our H/W only supports right shift + int right_shift = base_shift - 1 + (rand() % 4); + p_param->shift_data[i] = right_shift > 0 ? right_shift : 0; + +#ifdef ENABLE_DEBUG_MSG + printf(" [oc=%d] multiplier_data %d, shift_data %d\n", i, + p_param->multiplier_data[i], p_param->shift_data[i]); +#endif + } + + conv_per_channel_ref(p_param); + + const int per_chan_cal_data_size = + p_param->has_bias ? 
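+  // Packed per-channel layout, matching simple_test() above: with bias,
+  // each output channel contributes 9 bytes (s32 bias + u32 multiplier +
+  // s8 shift), so oc = 2 packs into 18 bytes.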
9 : 5; // bias(4) + multiplier(4) + shift(1) + const int cal_data_size = oc * per_chan_cal_data_size; + u8 *cal_data = (u8 *) malloc(cal_data_size); + pack_chl_quan_param(oc, p_param->has_bias, p_param->bias_data, + p_param->multiplier_data, p_param->shift_data, + cal_data); + + tl_shape_t input_shape = {static_cast(in), static_cast(ic), + static_cast(ih), static_cast(iw)}; + tl_shape_t filter_shape = {1, static_cast(oc), + static_cast(kh) * static_cast(kw), + static_cast(ic)}; + tl_shape_t output_shape = {static_cast(in), static_cast(oc), + static_cast(oh), static_cast(ow)}; + tl_shape_t cal_shape = {1, static_cast(oc), 1, + static_cast(per_chan_cal_data_size)}; + + bmk1880v2_tensor_lmem_t *tl_input = + bmk1880v2_lmem_alloc_tensor(bk_ctx, input_shape, FMT_I8, /*eu_aign=*/1); + + bmk1880v2_tensor_lmem_t *tl_filter = + bmk1880v2_lmem_alloc_tensor(bk_ctx, filter_shape, FMT_I8, /*eu_align=*/0); + + bmk1880v2_tensor_lmem_t *tl_output = + bmk1880v2_lmem_alloc_tensor(bk_ctx, output_shape, FMT_I8, /*eu_align=*/1); + + // Shape for TDMA load + bmk1880v2_tensor_lmem_t *tl_cal_data = + bmk1880v2_lmem_alloc_tensor(bk_ctx, cal_shape, FMT_U8, /*eu_align*/ 0); + + if (!tl_input || !tl_filter || !tl_output || !tl_cal_data) { + if (tl_input == nullptr) { + printf(" fail to alloc tl_input (%d, %d, %d, %d)\n", input_shape.n, + input_shape.c, input_shape.h, input_shape.w); + } + if (tl_filter == nullptr) { + printf(" fail to alloc tl_filter (%d, %d, %d, %d)\n", filter_shape.n, + filter_shape.c, filter_shape.h, filter_shape.w); + } + if (tl_output == nullptr) { + printf(" fail to alloc tl_output (%d, %d, %d, %d)\n", output_shape.n, + output_shape.c, output_shape.h, output_shape.w); + } + if (tl_cal_data == nullptr) { + printf(" fail to alloc tl_cal_data (%d, %d ,%d, %d)\n", cal_shape.n, + cal_shape.c, cal_shape.h, cal_shape.w); + } + + // Reverse order + if (tl_cal_data) + bmk1880v2_lmem_free_tensor(bk_ctx, tl_cal_data); + if (tl_output) + bmk1880v2_lmem_free_tensor(bk_ctx, tl_output); + if (tl_filter) + bmk1880v2_lmem_free_tensor(bk_ctx, tl_filter); + if (tl_input) + bmk1880v2_lmem_free_tensor(bk_ctx, tl_input); + + return -1; + } + + put_tensor_g2l(ctx, bk_ctx, tl_cal_data, cal_data); + put_tensor_g2l(ctx, bk_ctx, tl_input, reinterpret_cast(input_data)); + put_tensor_g2l(ctx, bk_ctx, tl_filter, reinterpret_cast(kernel_data)); + + { + // Reshape per channel quantization data for TIU + tl_cal_data->shape = {1, static_cast(oc), 1, 1}; + tl_cal_data->stride = bmk1880v2_tensor_lmem_default_stride( + bk_ctx, tl_cal_data->shape, FMT_I8, /*eu_align=*/0); + + // Reshape weight for TIU + tl_filter->shape = {static_cast(ic), static_cast(oc), + static_cast(kh), static_cast(kw)}; + + bmk1880v2_tiu_convolution_qdm_param_t param; + memset(¶m, 0, sizeof(param)); + param.ofmap = tl_output; + param.ifmap = tl_input; + param.weight = tl_filter; + param.chl_quan_param = tl_cal_data; + param.ins_h = ins_h; + param.ins_last_h = ins_last_h; + param.ins_w = ins_w; + param.ins_last_w = ins_last_w; + param.stride_h = stride_h; + param.stride_w = stride_w; + param.dilation_h = dh; + param.dilation_w = dw; + param.pad_top = pad_top; + param.pad_bottom = pad_bot; + param.pad_left = pad_left; + param.pad_right = pad_right; + param.has_bias = has_bias; + param.relu_enable = relu_enable; + +#ifdef ENABLE_DEBUG_MSG + printf(" tiu_conv_qdm:\n"); + printf(" ifmap shape (%d, %d, %d, %d)\n", param.ifmap->shape.n, + param.ifmap->shape.c, param.ifmap->shape.h, param.ifmap->shape.w); + printf(" weight shape (%d, %d, %d, %d)\n", 
param.weight->shape.n, + param.weight->shape.c, param.weight->shape.h, param.weight->shape.w); + printf(" ofmap shape (%d, %d, %d, %d)\n", param.ofmap->shape.n, + param.ofmap->shape.c, param.ofmap->shape.h, param.ofmap->shape.w); +#endif + + bmk1880v2_tiu_convolution_qdm(bk_ctx, ¶m); + } + + test_submit(ctx); + +#ifdef ENABLE_DEBUG_MSG + printf(" compare result:\n"); +#endif + s8 *conv_output_data = + reinterpret_cast(get_tensor_l2g(ctx, bk_ctx, tl_output)); + for (int i = 0; i < in; ++i) { + for (int j = 0; j < oc; ++j) { + for (int k = 0; k < oh; ++k) { + for (int l = 0; l < ow; ++l) { + int offset = i * (oc * oh * ow) + j * (oh * ow) + k * ow + l; + if (conv_output_data[offset] != output_data[offset]) { + printf(" [ni=%d][oci=%d][ohi=%d][owi=%d] output %d(tiu) != " + "%d(ref)\n", + i, j, k, l, conv_output_data[offset], output_data[offset]); + ret = -1; + break; + } + } + } + } + } + + if (ret) { + //dump_test_param(p_param, /*dump_content=*/true); + dump2_test_param(p_param); + } + + // Reverse order + bmk1880v2_lmem_free_tensor(bk_ctx, tl_cal_data); + bmk1880v2_lmem_free_tensor(bk_ctx, tl_output); + bmk1880v2_lmem_free_tensor(bk_ctx, tl_filter); + bmk1880v2_lmem_free_tensor(bk_ctx, tl_input); + + free(conv_output_data); + + free(input_data); + free(kernel_data); + free(output_data); + free(bias_data); + free(multiplier_data); + free(shift_data); + free(cal_data); + +#ifdef ENABLE_DEBUG_MSG + printf(" <= run_compare_conv\n"); +#endif + + return ret; +} + +int random_test(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx) +{ + int ret = 0; + +#ifndef ENABLE_FULL_REGRESSION +#ifndef ENABLE_TV_GEN_PATTERN + // n: 12b, c: 12b, h: 12b(4095-32), wb: 12b(4095-32) + int batch_range[] = {1, 1, 2, 4095 - 32}; + int input_height_range[] = {1, 512, 1024, 4095 - 32}; + int input_width_range[] = {1, 512, 1024, 4095 - 32}; + int input_depth_range[] = {1, 16, 32, 4095}; + int output_depth_range[] = {1, 16, 32, 4095}; + + // h: 12b, w: 12b + // stride_h: 4b, strid_w: 4b + int kernel_height_range[] = {1, 11, 4095}; + int kernel_width_range[] = {1, 11, 4095}; + int kernel_stride_height_range[] = {1, 5, 15}; + int kernel_stride_width_range[] = {1, 5, 15}; +#else + // TV_GEN pattern + // Random Test, total 19683, skipped 118066, executed 32, failed 0, ret 0 + + // n: 12b, c: 12b, h: 12b(4095-32), wb: 12b(4095-32) + int batch_range[] = {1, 1, 32}; + int input_height_range[] = {1, 512, 4095 - 32}; + int input_width_range[] = {1, 512, 4095 - 32}; + int input_depth_range[] = {1, 16, 4095}; + int output_depth_range[] = {1, 16, 4095}; + + // h: 12b, w: 12b + // stride_h: 4b, strid_w: 4b + int kernel_height_range[] = {1, 11, 4095}; + int kernel_width_range[] = {1, 11, 4095}; + int kernel_stride_height_range[] = {1, 5, 15}; + int kernel_stride_width_range[] = {1, 5, 15}; + +#endif //ENABLE_TV_GEN_PATTERN +#else +#if 0 + // Input with same range size + int batch_range[] = {1}; + int input_height_range[] = {1}; + int input_width_range[] = {1}; + int input_depth_range[] = {1}; + const int input_range_size = sizeof(input_height_range)/sizeof(input_height_range[0]); + + // Kernel with same range size + int kernel_height_range[] = {1}; + int kernel_width_range[] = {1}; + int kernel_stride_height_range[] = {1}; + int kernel_stride_width_range[] = {1}; + int output_depth_range[] = {1}; + const int kernel_range_size = sizeof(kernel_height_range)/sizeof(kernel_height_range[0]); +#else + // 10/21/2019 overnight + // total 20480000, skipped 20301713, executed 178287, failed 0 + + // n: 12b, c: 12b, h: 12b(4095-32), wb: 12b(4095-32) + int 
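+  /* Each *_range[] array lists bucket edges: for index i a value is drawn
+   * uniformly from [range[i], range[i+1]] via choose_from_range(), so tiny,
+   * typical and boundary sizes (e.g. 4095 - 32) are all exercised without
+   * enumerating every possible dimension. */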
batch_range[] = {1, 2, 4, 8, 16, 32, 64, 4095 - 32};
+  int input_height_range[] = {1, 3, 11, 128, 512, 1024, 2048, 4095 - 32};
+  int input_width_range[] = {1, 3, 11, 128, 512, 1024, 2048, 4095 - 32};
+  int input_depth_range[] = {1, 3, 11, 32, 64, 1024, 2048, 4095};
+  int output_depth_range[] = {1, 3, 11, 32, 64, 1024, 2048, 4095};
+
+  // h: 12b, w: 12b
+  // stride_h: 4b, stride_w: 4b
+  int kernel_height_range[] = {1, 3, 11, 511, 4095};
+  int kernel_width_range[] = {1, 3, 11, 511, 4095};
+  int kernel_stride_height_range[] = {1, 3, 5, 7, 15};
+  int kernel_stride_width_range[] = {1, 3, 5, 7, 15};
+#endif
+#endif /* ENABLE_FULL_REGRESSION */
+
+  const int batch_range_size = sizeof(batch_range) / sizeof(batch_range[0]);
+  const int input_height_range_size =
+      sizeof(input_height_range) / sizeof(input_height_range[0]);
+  const int input_width_range_size =
+      sizeof(input_width_range) / sizeof(input_width_range[0]);
+  const int input_depth_range_size =
+      sizeof(input_depth_range) / sizeof(input_depth_range[0]);
+  const int output_depth_range_size =
+      sizeof(output_depth_range) / sizeof(output_depth_range[0]);
+
+  const int kernel_height_range_size =
+      sizeof(kernel_height_range) / sizeof(kernel_height_range[0]);
+  const int kernel_width_range_size =
+      sizeof(kernel_width_range) / sizeof(kernel_width_range[0]);
+  const int kernel_stride_height_range_size =
+      sizeof(kernel_stride_height_range) /
+      sizeof(kernel_stride_height_range[0]);
+  const int kernel_stride_width_range_size =
+      sizeof(kernel_stride_width_range) / sizeof(kernel_stride_width_range[0]);
+
+  int random_seed = clock();
+  srand(random_seed);
+
+  const int retry_test_count = 100;
+
+  bool stop_at_first_error = true;
+
+  int total_tests = batch_range_size * input_depth_range_size *
+                    input_height_range_size * input_width_range_size *
+                    output_depth_range_size * kernel_height_range_size *
+                    kernel_width_range_size * kernel_stride_height_range_size *
+                    kernel_stride_width_range_size;
+  int skipped_tests = 0;
+  int executed_tests = 0;
+  int failed_tests = 0;
+  int current_test = 0;
+
+  printf("Random Test =>\n");
+  // outer retry loop uses 't' so it is not shadowed by the kernel-height
+  // index 'm' below
+  for (int t = 0; t < retry_test_count; ++t) {
+    for (int i = 0; i < batch_range_size; ++i) {
+      // randomly chosen from [range[i] : range[i+1]]
+      int batch = choose_from_range(batch_range, batch_range_size, i);
+
+      for (int j = 0; j < input_height_range_size; ++j) {
+        int input_height =
+            choose_from_range(input_height_range, input_height_range_size, j);
+
+        for (int k = 0; k < input_width_range_size; ++k) {
+          int input_width =
+              choose_from_range(input_width_range, input_width_range_size, k);
+
+          for (int l = 0; l < input_depth_range_size; ++l) {
+            int input_depth =
+                choose_from_range(input_depth_range, input_depth_range_size, l);
+
+            for (int m = 0; m < kernel_height_range_size; ++m) {
+              int kernel_height = choose_from_range(
+                  kernel_height_range, kernel_height_range_size, m);
+
+              for (int n = 0; n < kernel_width_range_size; ++n) {
+                int kernel_width = choose_from_range(
+                    kernel_width_range, kernel_width_range_size, n);
+
+                for (int x = 0; x < kernel_stride_height_range_size; ++x) {
+                  int kernel_stride_height =
+                      choose_from_range(kernel_stride_height_range,
+                                        kernel_stride_height_range_size, x);
+
+                  for (int y = 0; y < kernel_stride_width_range_size; ++y) {
+                    int kernel_stride_width =
+                        choose_from_range(kernel_stride_width_range,
+                                          kernel_stride_width_range_size, y);
+
+                    for (int z = 0; z < output_depth_range_size; ++z) {
+                      int output_depth = choose_from_range(
+                          output_depth_range, output_depth_range_size, z);
+
+#ifdef
ENABLE_DEBUG_MSG + printf( + " [%d/%d/%d] random test: input shape(%d, %d, %d, %d)", + executed_tests, current_test, total_tests, + batch, input_depth, input_height, input_width); + printf(", kernel shape (oc=%d, ic=%d, kh=%d, kw=%d), " + "stride_h %d, stride_w %d\n", + output_depth, input_depth, kernel_height, + kernel_width, kernel_stride_height, + kernel_stride_width); +#else + if ((current_test % 10000) == 0) { + printf(" [%d/%d/%d] random test: input shape(%d, %d, %d, " + "%d)", + executed_tests, current_test, total_tests, + batch, input_depth, input_height, input_width); + printf(", kernel shape (oc=%d, ic=%d, kh=%d, kw=%d), " + "stride_h %d, stride_w %d\n", + input_depth, input_depth, kernel_height, + kernel_width, kernel_stride_height, + kernel_stride_width); + } +#endif + + current_test++; + + int has_bias = rand() % 2; + int dh = 1; + int dw = 1; + int ins_h = 0; + int ins_h_last = 0; + int ins_w = 0; + int ins_w_last = 0; + int pad_top = 0; + int pad_bot = 0; + int pad_left = 0; + int pad_right = 0; + + int ih_ext = calc_dilute_hw(input_height, ins_h, + ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw( + input_width, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = + calc_dilute_hw(kernel_height, dh - 1, 0, 0, 0); + int kw_ext = + calc_dilute_hw(kernel_width, dw - 1, 0, 0, 0); + + int oh = + calc_output_hw(ih_ext, kh_ext, kernel_stride_height); + int ow = + calc_output_hw(iw_ext, kw_ext, kernel_stride_width); + + conv_test_param_t test_param; + memset(&test_param, 0, sizeof(test_param)); + test_param.input_n = batch; + test_param.input_c = input_depth; + test_param.input_h = input_height; + test_param.input_w = input_width; + test_param.kh = kernel_height; + test_param.kw = kernel_width; + test_param.dh = dh; + test_param.dw = dw; + test_param.pad_top = pad_top; + test_param.pad_bot = pad_bot; + test_param.pad_left = pad_left; + test_param.pad_right = pad_right; + test_param.ins_h = ins_h; + test_param.ins_h_last = ins_h_last; + test_param.ins_w = ins_w; + test_param.ins_w_last = ins_w_last; + test_param.stride_h = kernel_stride_height; + test_param.stride_w = kernel_stride_width; + test_param.output_c = output_depth; + test_param.output_h = oh; + test_param.output_w = ow; + test_param.has_bias = has_bias; + test_param.retry_cnt = 5; + + bool is_valid_param = + check_valid_test_param(bk_ctx, &test_param); + if (is_valid_param == false) { + skipped_tests++; +#ifdef ENABLE_DEBUG_MSG + printf( + " [%d/%d] random test: invalid parameter, skip\n", + current_test, total_tests); +#endif + continue; + } + + int ret2 = run_compare_conv(ctx, bk_ctx, &test_param); + failed_tests = ret2 ? 
failed_tests + 1 : failed_tests;
+                      ret |= ret2;
+                      executed_tests++;
+
+#ifdef ENABLE_DEBUG_MSG
+                      printf(
+                          "    [%d/%d] random test: input shape(%d, %d, %d, %d)",
+                          current_test, total_tests, batch, input_depth,
+                          input_height, input_width);
+                      printf(", kernel shape (%d, %d, %d, %d), result %d\n",
+                             output_depth, input_depth, kernel_height,
+                             kernel_width, ret2);
+#endif
+
+                      // Stop at first error
+                      if (ret && stop_at_first_error) {
+                        break;
+                      }
+                    }
+
+                    // Stop at first error
+                    if (ret && stop_at_first_error) {
+                      break;
+                    }
+                  }
+
+                  // Stop at first error
+                  if (ret && stop_at_first_error) {
+                    break;
+                  }
+                }
+
+                // Stop at first error
+                if (ret && stop_at_first_error) {
+                  break;
+                }
+              }
+
+              // Stop at first error
+              if (ret && stop_at_first_error) {
+                break;
+              }
+            }
+
+            // Stop at first error
+            if (ret && stop_at_first_error) {
+              break;
+            }
+          }
+
+          // Stop at first error
+          if (ret && stop_at_first_error) {
+            break;
+          }
+        }
+
+        // Stop at first error
+        if (ret && stop_at_first_error) {
+          break;
+        }
+      }
+
+      // Stop at first error
+      if (ret && stop_at_first_error) {
+        break;
+      }
+    }
+    // Stop at first error
+    if (ret && stop_at_first_error) {
+      break;
+    }
+
+    if (executed_tests >= MIN_EXEC_TESTS) {
+      break;
+    }
+  }
+
+  printf(
+      "<= Random Test, total %d, skipped %d, executed %d, failed %d, ret %d\n",
+      total_tests, skipped_tests, executed_tests, failed_tests, ret);
+
+  return ret;
+}
+
+int main()
+{
+  int ret = 0;
+  CVI_RT_HANDLE ctx;
+  bmk_ctx_t *bk_ctx;
+  test_init(&ctx, &bk_ctx);
+
+  ret = simple_test(&ctx, bk_ctx);
+  ret |= random_test(&ctx, bk_ctx); // keep the simple_test result too
+
+  test_exit(&ctx);
+
+  return ret;
+}
diff --git a/cviruntime/test/1880v2/test_1880v2_conv_wtiling.cpp b/cviruntime/test/1880v2/test_1880v2_conv_wtiling.cpp
new file mode 100644
index 000000000..b24592997
--- /dev/null
+++ b/cviruntime/test/1880v2/test_1880v2_conv_wtiling.cpp
@@ -0,0 +1,883 @@
+#include "1880v2_test_util.h"
+#include
+typedef struct {
+  int random_seed;
+  int input_n;
+  int input_c;
+  int input_h;
+  int input_w;
+  int kw;
+  int kh;
+  int dh;
+  int dw;
+  int pad_top;
+  int pad_bot;
+  int pad_left;
+  int pad_right;
+  int ins_h;
+  int ins_h_last;
+  int ins_w;
+  int ins_w_last;
+  int stride_h;
+  int stride_w;
+  int output_c;
+  int using_bias;
+  int bReLU_EN;
+  int r_shift_m;
+  int opd0_sign;
+  int opd1_sign;
+  int opd2_sign;
+} conv_param_t;
+
+typedef struct {
+  u32 n;
+  u32 c;
+  u32 h;
+  u32 w;
+} slice_t;
+
+static int index_get(int h, int w1, int w2)
+{
+  return h * w1 + w2;
+}
+
+static int matrix_dot_mult(
+    s8 *A, s8 *B, int dim_n, int dim_m,
+    int opd0_sign)
+{
+  // Dot product over a (dim_n x dim_m) window; A is read as u8 when
+  // opd0_sign == 0, B (the kernel) is always signed.
+  int sum = 0;
+  for (int i = 0; i < dim_n; i++) {
+    for (int j = 0; j < dim_m; j++) {
+      int idx = index_get(i, dim_m, j);
+      int a = opd0_sign ? (int)A[idx] : (int)(u8)A[idx];
+      sum += a * (int)B[idx];
+    }
+  }
+  return sum;
+}
+
+static bmerr_t conv_ref(
+    const conv_param_t *p_param,
+    s8 *ifmap, s8 *weight, s16 *bias, s8 *ofmap)
+{
+  int in = p_param->input_n;
+  int ic = p_param->input_c;
+  int ih = p_param->input_h;
+  int iw = p_param->input_w;
+  int oc = p_param->output_c;
+  int kh = p_param->kh;
+  int kw = p_param->kw;
+  int dh = p_param->dh;
+  int dw = p_param->dw;
+  int stride_h = p_param->stride_h;
+  int stride_w = p_param->stride_w;
+  int pad_top = p_param->pad_top;
+  int pad_bot = p_param->pad_bot;
+  int pad_left = p_param->pad_left;
+  int pad_right = p_param->pad_right;
+  int ins_h = p_param->ins_h;
+  int ins_h_last = p_param->ins_h_last;
+  int ins_w = p_param->ins_w;
+  int ins_w_last = p_param->ins_w_last;
+  int input_sign = p_param->opd0_sign;
+  int r_shift_bits = p_param->r_shift_m;
+  int do_relu = p_param->bReLU_EN;
+
+  int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot);
+  int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right);
+  int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0);
+  int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0);
+
+  int
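+  /* Output extent follows the usual dilated/padded convolution formula:
+   * ih_ext = (ih - 1) * (ins_h + 1) + ins_h_last + 1 + pad_top + pad_bot,
+   * kh_ext = (kh - 1) * dh + 1, and oh = (ih_ext - kh_ext) / stride_h + 1.
+   * Example: ih = 7, kh = 3, dh = 1, stride_h = 1, no pad/insertion gives
+   * ih_ext = 7, kh_ext = 3, oh = 5. */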
oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + s8 *i_fmap_pad_ker = (s8 *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return BM_ERR_FAILURE; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = BM_SUCCESS; + + s8 *i_fmap_pad = NULL; + s8 *kernel_after = NULL; + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c]; //bias+c ; + } + } + } + + if (do_relu) + relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + + // ofmap is s8, signed + ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow, + &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1, + /*sign_unsign=*/1); + + if (ret != BM_SUCCESS) + goto error_release; + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + + neuron_dump ( + "test_w_tiling_code:conv_ref:pure result + bias", + (u32)in, + (u32)oc, + (u32)oh, + (u32)ow, + (s32 *)result); + + neuron_dump ( + "test_w_tiling_code:conv_ref:final result", + (u32)in, + (u32)oc, + (u32)oh, + (u32)ow, + (s8 *)ofmap); + +error_release: + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static u8 * transform_weight(const tl_shape_t *s, u8 before[]) +{ + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + u32 size = ic * oc * kh * kw; + u8 *after = (u8 *)malloc(sizeof(u8) * size); + + /* + * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) + */ + for (u32 oci = 0; oci < oc; oci++) { + for (u32 ici = 0; ici < ic; ici++) { + for (u32 khi = 0; khi < kh; khi++) { + for (u32 kwi = 0; kwi < kw; kwi++) { + u32 src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + u32 dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + u8 *data) +{ + const tl_shape_t *s = &tl->shape; + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw); + CVI_RT_MEM dev_mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = CVI_RT_MemGetPAddr(dev_mem); + u8 *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. CVI_RT_MemCopyS2D regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. 
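+   * (The commented-out bm_device_read_base_reg()/bmmem_device_prealloc()
+   * lines below are the absolute-address variant; here base_reg_index = 1
+   * on the TDMA descriptor is what selects the weight region.)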
+ */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //CVI_RT_MEM ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = CVI_RT_MemCopyS2D(*ctx, ab_dev_mem, transformed_data); + int ret = CVI_RT_MemCopyS2D(*ctx, dev_mem, transformed_data); + assert(ret == BM_SUCCESS); + + tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_I8; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1880v2_tensor_tgmem_default_stride(tdma_tg.shape, tdma_tg.fmt); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1880v2_tensor_lmem_default_stride(bk_ctx, tdma_shape, FMT_I8, 0); + + bmk1880v2_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1880v2_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + neuron_dump ( + "test_ic_tiling_conv: kernel", + tdma_tg.shape.n, + tdma_tg.shape.c, + tdma_tg.shape.h, + tdma_tg.shape.w, + (s8 *)transformed_data); + + CVI_RT_MemFree(*ctx, dev_mem); + free(transformed_data); +} + +static s8 * transform_bias(int oc, s16 before[]) +{ + s8 *after = (s8 *)malloc(sizeof(s8) * 2 * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = before[i] & 0xff; + after[i + oc] = (before[i] >> 8) & 0xff; + } + return after; +} + +static void put_conv_bias( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + s16 *data) +{ + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2, oc, 1, 1); + CVI_RT_MEM dev_mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = CVI_RT_MemGetPAddr(dev_mem); + s8 *transformed_data = transform_bias(oc, data); + int ret = CVI_RT_MemCopyS2D(*ctx, dev_mem, (u8 *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1880v2_tensor_tgmem_default_stride(tg.shape, tg.fmt); + + bmk1880v2_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1880v2_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + CVI_RT_MemFree(*ctx, dev_mem); + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t 
*p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static s8 * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + + s8 *buf = (s8 *)malloc(sizeof(s8) * size); + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static s8 * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + s8 *buf = (s8 *)malloc(sizeof(s8) * size); + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static s16 * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + s16 *bias = (s16 *)malloc(sizeof(s16) * oc); + for (int i = 0; i < oc; i++) + bias[i] = rand() % 65536 - 32768; + + return bias; +} + +static tl_t * conv_ifmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd0_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 1); +} + +static tl_t * conv_weight_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd1_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static tl_t * conv_ofmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + tl_shape_t s; + s.n = p->input_n; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return bmk1880v2_lmem_alloc_ps32_tensor(ctx, s, FMT_I8, 1); +} + +static tl_t * conv_bias_tensor( + bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd2_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const bmk1880v2_tiu_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1880v2_tiu_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + dst->ifmap = conv_ifmap_tensor(ctx, p); + dst->weight = conv_weight_tensor(ctx, p); + dst->ofmap = conv_ofmap_tensor(ctx, p); + dst->bias = NULL; + dst->ps32_mode = 0; + if (p->using_bias) + dst->bias = conv_bias_tensor(ctx, p); + + dst->w_is_const = 0; +} + +static void free_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1880v2_tiu_convolution_param_t *r, + const conv_param_t *p) +{ + if (p->using_bias && r->bias) + 
free_tl(ctx, r->bias); + if (r->ofmap) + free_tl(ctx, r->ofmap); + if (r->weight) + free_tl(ctx, r->weight); + if (r->ifmap) + free_tl(ctx, r->ifmap); +} + +static void init_conv_param(conv_param_t &p) +{ + printf("init_conv_param\n"); + p.random_seed = clock(); + srand(p.random_seed); + +retry: + p.input_n = 1; + p.input_c = 1; + p.kh = 3; + p.kw = 3; + p.input_h = 4 + p.kh; + p.input_w = 4 + p.kw ; + p.output_c = 1; + p.stride_h = 1; + p.stride_w = 1; + p.ins_h = 0; + p.ins_w = 0; + p.ins_h_last = 0; + p.ins_w_last = 0; + p.dh = 1; + p.dw = 1; + + int kh_ext = conv_kh_ext(&p); + int kw_ext = conv_kw_ext(&p); + p.pad_top = 0; + p.pad_bot = 0; + p.pad_left = 0; + p.pad_right = 0; + + if (!conv_param_is_ok(&p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p.using_bias = 1; + p.r_shift_m = rand() % 8; + p.bReLU_EN = rand() % 2; + p.opd0_sign = rand() % 2; + + p.opd1_sign = 1; + p.opd2_sign = 1; + + assert(p.opd1_sign == 1 && p.opd2_sign == 1); + + int ih_ext = conv_ih_ext(&p); + int iw_ext = conv_iw_ext(&p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); +} + +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", "p->input_w = ", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} + +/* Calculate the right shift value, m + * Steps: + * 1. Get the abs() of each weight; + * 2. Summary all the abs() in one kernel; + * 3. Get Log2 of each sum; + * 4. Downward rounding; + * After every r_shift value got, sort and find the middle one. + */ + +static int calc_rshift_m(const conv_param_t *p, s8* weight) +{ + int kernel_cnt = p->output_c * p->input_c; + int kernel_size = p->kh * p->kw; + int *kernel_shifts = (int *)malloc(sizeof(int) * kernel_cnt); + + // Tscan does not recognize C++ zero-initialized. 
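+  /* Selection sketch: per (oc x ic) kernel take floor(log2(sum(|w|))) as a
+   * candidate right shift, histogram the candidates into tag[], then walk
+   * the histogram until half the kernels are covered - i.e. pick roughly
+   * the median shift, which keeps most accumulators in s8 range after the
+   * final >> rshift_m. */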
+ memset(kernel_shifts, 0, sizeof(int) * kernel_cnt); + + // Part 1: + // Get right shift value for each kernel + int sum = 0; + for (int i = 0; i < kernel_cnt; i++) { + // Step 1 & 2: Get the sum of abs() + for (int j = 0; j < kernel_size; j++) { + sum += (int)(*weight < 0 ? -(*weight) : (*weight)); + weight++; + } + // Step 3 & 4: log2 and downward rounding + sum >>= 1; + while (sum) { + sum >>= 1; + kernel_shifts[i]++; + } + } + + // Part 2: + // Find the middle of all the values + int tag[32] = {0}; + for (int cnt = 0; cnt < kernel_cnt; cnt++) { + tag[kernel_shifts[cnt]]++; + } + + int rshift_m = 0; + int mid = 0; + do { + mid += tag[rshift_m++]; + } while(mid < (kernel_cnt - 1) >> 1); + + free(kernel_shifts); + + return rshift_m - 1; +} + +static int test_w_tiling_conv( + conv_param_t &p_param, CVI_RT_HANDLE ctx, bmk_ctx_t *bk_ctx) +{ + printf("test w tiled conv\n"); + s8 *input = alloc_input(&p_param); + s8 *weight = alloc_weight(&p_param); + s16 *bias = alloc_bias(&p_param); + p_param.r_shift_m = calc_rshift_m(&p_param, weight); + s8 *output_ref = (s8 *)malloc(sizeof(s8) * conv_output_size(&p_param)); + if (!output_ref) + return BM_ERR_FAILURE; + + bmerr_t ret = conv_ref(&p_param, input, weight, bias, output_ref); + assert(ret == BM_SUCCESS); + + bmk1880v2_tiu_convolution_param_t conv_param; + make_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + /*We tile the finest granule to test w tiling*/ + u32 ow_step = 1; + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + if (tl_alloc_success) { + put_conv_weight(&ctx, bk_ctx, conv_param.weight, (u8 *)weight); + if (p_param.using_bias) { + put_conv_bias(&ctx, bk_ctx, conv_param.bias, bias); + neuron_dump ( + "test_w_tiling_conv: bias", + 1, + conv_param.bias->shape.c, + conv_param.bias->shape.h, + conv_param.bias->shape.w, + (s16 *)bias); + } + + tl_t tl_ifmap = *conv_param.ifmap; + tl_t tl_ofmap = *conv_param.ofmap; + + tg_shape_t s; + s.n = tl_ifmap.shape.n; + s.c = tl_ifmap.shape.c; + s.h = tl_ifmap.shape.h; + s.w = tl_ifmap.shape.w; + tg_t *ts_ifmap = alloc_tg_gmem(&ctx, s, FMT_I8); + put_tg_gmem(&ctx, ts_ifmap, (u8 *)input); + + s.n = tl_ofmap.shape.n; + s.c = tl_ofmap.shape.c; + s.h = tl_ofmap.shape.h; + s.w = tl_ofmap.shape.w; + tg_t *ts_ofmap = alloc_tg_gmem(&ctx, s, FMT_I8); + + neuron_dump ( + "test_w_tiling_conv: input", + conv_param.ifmap->shape.n, + conv_param.ifmap->shape.c, + conv_param.ifmap->shape.h, + conv_param.ifmap->shape.w, + (s8 *)input); + + for (u32 ow_pos = 0; ow_pos < tl_ofmap.shape.w; ow_pos += ow_step) { + u32 cur_ow = math_min(tl_ofmap.shape.w - ow_pos, ow_step); + + tg_t ts_cur_ofmap; + ts_cur_ofmap.shape.n = ts_ofmap->shape.n; + ts_cur_ofmap.shape.c = ts_ofmap->shape.c; + ts_cur_ofmap.shape.h = ts_ofmap->shape.h; + ts_cur_ofmap.shape.w = cur_ow; + ts_cur_ofmap.stride = ts_ofmap->stride; + ts_cur_ofmap.start_address = ts_ofmap->start_address + ow_pos; + ts_cur_ofmap.fmt = ts_ofmap->fmt; + ts_cur_ofmap.base_reg_index = ts_ofmap->base_reg_index; + + tl_t tl_cur_ofmap; + tl_cur_ofmap.shape.n = tl_ofmap.shape.n; + tl_cur_ofmap.shape.c = tl_ofmap.shape.c; + tl_cur_ofmap.shape.h = tl_ofmap.shape.h; + tl_cur_ofmap.shape.w = cur_ow; + tl_cur_ofmap.stride = bmk1880v2_tensor_lmem_default_stride(bk_ctx, tl_cur_ofmap.shape, FMT_I8, 1); + tl_cur_ofmap.fmt = tl_ofmap.fmt; + tl_cur_ofmap.start_address = tl_ofmap.start_address; + + tg_t ts_cur_ifmap; + ts_cur_ifmap.shape.n = ts_ifmap->shape.n; + ts_cur_ifmap.shape.c = ts_ifmap->shape.c; + ts_cur_ifmap.shape.h = ts_ifmap->shape.h; + 
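+      /* Input slice for this W-tile: producing cur_ow output columns needs
+       * (cur_ow - 1) * stride_w + kw_ext input columns; e.g. cur_ow = 1,
+       * stride_w = 1, kw_ext = 3 reads a 3-column window per step. */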
ts_cur_ifmap.shape.w = (cur_ow - 1) * conv_param.stride_w + conv_kw_ext(&p_param);
+      ts_cur_ifmap.stride = ts_ifmap->stride;
+      ts_cur_ifmap.start_address = ts_ifmap->start_address + ow_pos * conv_param.stride_w;
+      ts_cur_ifmap.fmt = ts_ifmap->fmt;
+      ts_cur_ifmap.base_reg_index = ts_ifmap->base_reg_index;
+
+      tl_t tl_cur_ifmap;
+      tl_cur_ifmap.shape.n = tl_ifmap.shape.n;
+      tl_cur_ifmap.shape.c = tl_ifmap.shape.c;
+      tl_cur_ifmap.shape.h = tl_ifmap.shape.h;
+      tl_cur_ifmap.shape.w = (cur_ow - 1) * conv_param.stride_w + conv_kw_ext(&p_param);
+      tl_cur_ifmap.stride = bmk1880v2_tensor_lmem_default_stride(bk_ctx, tl_cur_ifmap.shape, FMT_I8, 1);
+      tl_cur_ifmap.fmt = tl_ifmap.fmt;
+      tl_cur_ifmap.start_address = tl_ifmap.start_address;
+
+      {
+        bmk1880v2_tdma_tg2l_tensor_copy_param_t p;
+        memset(&p, 0, sizeof(p));
+        p.src = &ts_cur_ifmap;
+        p.dst = &tl_cur_ifmap;
+        bmk1880v2_tdma_g2l_tensor_copy(bk_ctx, &p);
+        test_submit(&ctx);
+      }
+      {
+        bmk1880v2_tiu_convolution_param_t p;
+        memset(&p, 0, sizeof(p));
+        p = conv_param;
+        p.ifmap = &tl_cur_ifmap;
+        p.ofmap = &tl_cur_ofmap;
+        if (p_param.ins_w_last == 1 && (ow_pos + ow_step) >= tl_ofmap.shape.w)
+          p.ins_last_w = 1;
+        else
+          p.ins_last_w = 0;
+
+        bmk1880v2_tiu_convolution(bk_ctx, &p);
+      }
+      {
+        bmk1880v2_tdma_l2tg_tensor_copy_param_t p;
+        memset(&p, 0, sizeof(p));
+        p.src = &tl_cur_ofmap;
+        p.dst = &ts_cur_ofmap;
+        bmk1880v2_tdma_l2g_tensor_copy(bk_ctx, &p);
+        test_submit(&ctx);
+      }
+    }
+    u8 *output = get_tg_gmem(&ctx, ts_ofmap);
+    free_tg_gmem(&ctx, ts_ifmap);
+    free_tg_gmem(&ctx, ts_ofmap);
+
+    neuron_dump (
+      "test_w_tiling_conv: output",
+      conv_param.ofmap->shape.n,
+      conv_param.ofmap->shape.c,
+      conv_param.ofmap->shape.h,
+      conv_param.ofmap->shape.w,
+      (s8 *)output);
+
+    int has_error = array_cmp_int8(
+        "Comparing results ...\n",
+        output_ref, (s8 *)output, conv_output_size(&p_param));
+
+    if (has_error) {
+      print_conv_param(&p_param);
+      printf("Comparison FAILED\n");
+      exit(-1);
+    }
+
+    // 'output' must stay alive until array_cmp_int8() has consumed it.
+    free(output);
+  }
+
+  free_bmk_conv_param(bk_ctx, &conv_param, &p_param);
+
+  free(input);
+  free(weight);
+  free(output_ref);
+  free(bias);
+  return tl_alloc_success ?
1 : 0; +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + int test_finished_num = 0; + for (int i = 0; i < 1; i++) { + printf("random_test_conv iteration: %d\n", i); + conv_param_t test_conv_param; + init_conv_param(test_conv_param); + test_finished_num += test_w_tiling_conv(test_conv_param, ctx, bk_ctx); + if (!test_conv_param.using_bias) + test_conv_param.using_bias = 1; + if (test_conv_param.output_c <= 9) + test_conv_param.output_c += 3; + test_finished_num += test_w_tiling_conv(test_conv_param, ctx, bk_ctx); + } + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_depthwise.cpp b/cviruntime/test/1880v2/test_1880v2_depthwise.cpp new file mode 100644 index 000000000..320ef61c8 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_depthwise.cpp @@ -0,0 +1,314 @@ +#include "1880v2_test_util.h" + +typedef bmk1880v2_tiu_depthwise_convolution_param_t param_t; + +static void print_pooling_param(param_t *p) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int kh = p->weight->shape.h; + int kw = p->weight->shape.w; + + printf(" Pooling parameters:\n"); + printf(" ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw); + printf(" opd0_sign = %d\n", p->ifmap->fmt == FMT_I8); + printf(" weight = (%d, %d)\n", kh, kw); + printf(" padding = (%d, %d, %d, %d)\n", + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right); + printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w); + printf(" ins0 = (%d, %d, %d, %d)\n", + p->ins_h, p->ins_last_h, p->ins_w, p->ins_last_w); + printf(" rshift_bits = %d\n", p->rshift_bits); + printf(" relu_enable = %d\n", p->relu_enable); + printf(" res0_sign = %d\n", p->ofmap->fmt == FMT_I8); +} + +static s8 *alloc_input(param_t *p) +{ + u64 size = tl_shape_size(&p->ifmap->shape); + s8 *data = (s8 *)xmalloc(size); + if (!data) + return NULL; + + for (u64 i = 0; i < size; i++) + data[i] = rand() % 256 - 128; + return data; +} + +static s8 *alloc_weight(param_t *p) +{ + int size = tl_shape_size(&p->weight->shape); + s8 *data = (s8 *)xmalloc(size); + if (!data) + return NULL; + + for (int i = 0; i < size; i++) + data[i] = rand() % 256 - 128; + return data; +} + +static s16 *alloc_bias(param_t *p) +{ + int c = p->bias->shape.c; + s16 *bias = (s16 *)malloc(sizeof(s16) * c); + if (!bias) + return NULL; + + for (int i = 0; i < c; i++) + bias[i] = rand() % 65536 - 32768; + return bias; +} + +static s8 *alloc_output(param_t *p) +{ + u64 size = tl_shape_size(&p->ofmap->shape); + return (s8 *)xmalloc(size); +} + +static inline void relu8(s8 *buf, u64 size) +{ + for (u64 i = 0; i < size; i++) + if (buf[i] < 0) + buf[i] = 0; +} + + +static void compare_results( + param_t *p, + s8 input[], + s8 weight[], + s16 bias[], + s8 output[]) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int kh = p->weight->shape.h; + int kw = p->weight->shape.w; + int opd0_sign = (p->ifmap->fmt == FMT_I8); + int res0_sign = (p->ofmap->fmt == FMT_I8); + s8 *output_ref = alloc_output(p); + bmerr_t ret = native_pooling_ave_int8( + input, weight, p->bias ? 
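+      /* The depthwise reference reuses native_pooling_ave_int8(): depthwise
+       * convolution is "average pooling" whose window weights are the
+       * per-channel kernel, so the same sliding-window routine applies. */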
bias : NULL, output_ref, + in, ic, ih, iw, kh, kw, + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right, + p->stride_h, p->stride_w, + p->ins_h, p->ins_w, p->ins_last_h, p->ins_last_w, + opd0_sign, res0_sign, p->rshift_bits, 0); + assert(ret == BM_SUCCESS); + + if(p->relu_enable ) + relu8(output_ref, tl_shape_size(&p->ofmap->shape)); + + int cmp_res = array_cmp_int8( + "Comparing results ...\n", output_ref, output, + tl_shape_size(&p->ofmap->shape)); + + if (cmp_res != 0) { + printf("Comparison FAILED!!!\n"); + print_pooling_param(p); + exit(-1); + } + + free(output_ref); +} + +static int pooling_ih_ext(param_t *p, int ih) +{ + int ins = p->ins_h; + int ins_last = p->ins_last_h; + int pad = p->pad_top + p->pad_bottom; + return (ih - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_iw_ext(param_t *p, int iw) +{ + int ins = p->ins_w; + int ins_last = p->ins_last_w; + int pad = p->pad_left + p->pad_right; + return (iw - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_oh(param_t *p, int ih, int kh) +{ + int ih_ext = pooling_ih_ext(p, ih); + return (ih_ext - kh) / p->stride_h + 1; +} + +static int pooling_ow(param_t *p, int iw, int kw) +{ + int iw_ext = pooling_iw_ext(p, iw); + return (iw_ext - kw) / p->stride_w + 1; +} + +static void free_depthwise_param( + bmk_ctx_t *ctx, + param_t *p) +{ + if (p->bias) + free_tl(ctx, p->bias); + + if (p->weight) + free_tl(ctx, p->weight); + + if (p->ifmap) + free_tl(ctx, p->ifmap); + + if (p->ofmap) + free_tl(ctx, p->ofmap); +} + +static param_t random_depthwise_param(bmk_ctx_t *ctx) +{ + srand(clock()); + param_t p; + + memset(&p, 0, sizeof(p)); + +retry: + int using_bias = rand() % 2; + int n = rand() % 5 + 1; + int c = rand() % (3 * BM1880V2_HW_NPU_NUM) + 1; + int ih = rand() % 30 + 3; + int iw = rand() % 30 + 6; + int kh = rand() % 7 + 1; + int kw = rand() % 7 + 1; + int opd0_sign = rand() % 2; + + p.ins_h = rand() % kh; + p.ins_w = rand() % kw; + p.ins_last_h = rand() % kh; + p.ins_last_w = rand() % kw; + p.stride_h = rand() % kh + 1; + p.stride_w = rand() % kw + 1; + p.pad_top = rand() % kh; + p.pad_bottom = rand() % kh; + p.pad_left = rand() % kw; + p.pad_right = rand() % kw; + p.rshift_bits = rand() % 32; + + int oh = pooling_oh(&p, ih, kh); + int ow = pooling_ow(&p, iw, kw); + tl_shape_t ofmap_shape; + ofmap_shape.n = n; + ofmap_shape.c = c; + ofmap_shape.h = oh; + ofmap_shape.w = ow; + tl_shape_t ifmap_shape; + ifmap_shape.n = n; + ifmap_shape.c = c; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + tl_shape_t weight_shape; + weight_shape.n = 1; + weight_shape.c = c; + weight_shape.h = kh; + weight_shape.w = kw; + tl_shape_t bias_shape; + bias_shape.n = 2; + bias_shape.c = c; + bias_shape.h = 1; + bias_shape.w = 1; + p.relu_enable = rand()%2; + /*test case ref does not support dilation !=1*/ + p.dilation_h = 1; + p.dilation_w = 1; + fmt_t ifmt = opd0_sign ? 
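+  /* opd0_sign only selects how ifmap bytes are interpreted (FMT_I8 vs
+   * FMT_U8); the weight and ofmap tensors below stay signed FMT_I8. */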
FMT_I8: FMT_U8; + + p.ofmap = bmk1880v2_lmem_alloc_tensor(ctx, ofmap_shape, FMT_I8, 1); + p.ifmap = bmk1880v2_lmem_alloc_tensor(ctx, ifmap_shape, ifmt, 1); + p.weight = bmk1880v2_lmem_alloc_tensor(ctx, weight_shape, FMT_I8, 1); + p.bias = NULL; + if (using_bias) + p.bias = bmk1880v2_lmem_alloc_tensor(ctx, bias_shape, FMT_I8, 0); + + if ((kh > pooling_ih_ext(&p, ih)) + || (kw > pooling_iw_ext(&p, iw)) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || !p.ofmap + || !p.ifmap + || !p.weight + || (using_bias && !p.bias)) { + printf("retry init_pooling_param\n"); + free_depthwise_param(ctx, &p); + goto retry; + } + return p; +} + +static void put_bias_tensor( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + s16 data[]) +{ + int c = tl->shape.c; + + u8 *lo_hi = (u8 *)xmalloc(2 * c); + if (!lo_hi) + return; + + for (int i = 0; i < c; i++) { + lo_hi[i] = data[i] & 0xff; + lo_hi[i + c] = (data[i] >> 8) & 0xff; + } + + put_tensor_g2l(ctx, bk_ctx, tl, (u8 *)lo_hi); + + free(lo_hi); +} + +static int test_pooling(CVI_RT_HANDLE ctx, bmk_ctx_t *bk_ctx) +{ + param_t param = random_depthwise_param(bk_ctx); + + s8 *input = alloc_input(¶m); + s8 *weight = alloc_weight(¶m); + s16 *bias = NULL; + if (param.bias) + bias = alloc_bias(¶m); + + put_tensor_g2l(&ctx, bk_ctx, param.ifmap, (u8 *)input); + put_tensor_g2l(&ctx, bk_ctx, param.weight, (u8 *)weight); + if (param.bias) + put_bias_tensor(&ctx, bk_ctx, param.bias, bias); + + bmk1880v2_tiu_depthwise_convolution(bk_ctx, ¶m); + s8 *output = (s8 *)get_tensor_l2g(&ctx, bk_ctx, param.ofmap); + + compare_results(¶m, input, weight, bias, output); + + free_depthwise_param(bk_ctx, ¶m); + free(input); + free(weight); + free(bias); + free(output); + + return 1; +} + +static void test_depthwise_pooling(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx) +{ + int test_finished_num = 0; + for (u64 i = 0; i < 16; i++) + test_finished_num += test_pooling(*ctx, bk_ctx); + printf("Test finished %d\n", test_finished_num); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_depthwise_pooling(&ctx, bk_ctx); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_depthwise_conv_qdm.cpp b/cviruntime/test/1880v2/test_1880v2_depthwise_conv_qdm.cpp new file mode 100644 index 000000000..483dd63fb --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_depthwise_conv_qdm.cpp @@ -0,0 +1,1603 @@ +#include +#include "1880v2_test_util.h" +#include "test_tf_quant_util.h" + +// #define ENABLE_DEBUG_MSG +// #define ENABLE_FULL_REGRESSION +// #define ENABLE_TV_GEN_PATTERN + +#define MIN_EXEC_TESTS 20 + +typedef struct { + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int stride_w; + int output_c; + int output_h; + int output_w; + int has_bias; + int relu_enable; + s8 *input_data; + s8 *filter_data; + s8 *output_data; + s32 *bias_data; + u32 *multiplier_data; + s8 *shift_data; + float float_multiplier; + int retry_cnt; +} dw_conv_test_param_t; + +static inline int Offset(tl_shape_t shape, int i0, int i1, int i2, int i3) +{ + // return n * (shape.c * shape.h * shape.w) + c * (shape.h * shape.w) + h * + // shape.w + w; + int dims_data[4] = {static_cast(shape.n), static_cast(shape.c), + static_cast(shape.h), static_cast(shape.w)}; + return ((i0 * dims_data[1] 
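+      /* Row-major offset: ((i0 * C + i1) * H + i2) * W + i3, with C/H/W taken
+       * from dims_data in whatever layout the shape was built; e.g. shape
+       * {1, 5, 6, 8} and indices (0, 2, 3, 4) give ((0*5+2)*6+3)*8+4 = 124. */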
+ i1) * dims_data[2] + i2) * dims_data[3] + i3; +} + +void fill_random_data_s8(s8 *input_data, int size) +{ + for (int i = 0; i < size; ++i) { + int is_satured = ((rand() % 1000) == 1) ? 1 : 0; + int is_sign = rand() % 2 ? 1 : -1; + + if (is_satured && is_sign) { + input_data[i] = -128; + } else if (is_satured) { + input_data[i] = 127; + } else { + input_data[i] = is_sign * rand() % 128; + } + } +} + +void fill_random_data_s32(s32 *input_data, int size) +{ + for (int i = 0; i < size; ++i) { + int is_satured = ((rand() % 1000) == 1) ? 1 : 0; + int is_sign = rand() % 2 ? 1 : -1; + + if (is_satured && is_sign) { + input_data[i] = INT_MIN; + } else if (is_satured) { + input_data[i] = INT_MAX; + } else { + input_data[i] = is_sign * rand() % 128; + } + } +} + +void convert_nhwc_to_nchw(tl_shape_t tl_shape, s8 *src, s8 *dst) +{ + // NHWC + u32 src_shape_n = tl_shape.n; + u32 src_shape_h = tl_shape.c; + u32 src_shape_w = tl_shape.h; + u32 src_shape_c = tl_shape.w; + u32 src_stride_c = 1; + u32 src_stride_w = src_shape_c * src_stride_c; + u32 src_stride_h = src_shape_w * src_stride_w; + u32 src_stride_n = src_shape_h * src_stride_h; + + // NCHW + // u32 dst_shape_n = src_shape_n; + u32 dst_shape_c = src_shape_c; + u32 dst_shape_h = src_shape_h; + u32 dst_shape_w = src_shape_w; + u32 dst_stride_w = 1; + u32 dst_stride_h = dst_shape_w * dst_stride_w; + u32 dst_stride_c = dst_shape_h * dst_stride_h; + u32 dst_stride_n = dst_shape_c * dst_stride_c; + + printf("convert_nhwc_to_nchw:\n"); + printf(" src shape (%d, %d, %d, %d), stride (%d, %d, %d, %d)\n", src_shape_n, + src_shape_c, src_shape_h, src_shape_w, src_stride_n, src_stride_c, + src_stride_h, src_stride_w); + printf(" dst shape (%d, %d, %d, %d), stride (%d, %d, %d, %d)\n", src_shape_n, + dst_shape_c, dst_shape_h, dst_shape_w, dst_stride_n, dst_stride_c, + dst_stride_h, dst_stride_w); + + for (u32 i = 0; i < src_shape_n; ++i) { + for (u32 j = 0; j < src_shape_h; ++j) { + for (u32 k = 0; k < src_shape_w; ++k) { + for (u32 l = 0; l < src_shape_c; ++l) { + u32 src_offset = i * src_stride_n + j * src_stride_h + + k * src_stride_w + l * src_stride_c; + u32 dst_offset = i * dst_stride_n + j * dst_stride_h + + k * dst_stride_w + l * dst_stride_c; + dst[dst_offset] = src[src_offset]; + } + } + } + } +} + +int test_nhwc_to_nchw() +{ + int ret = 0; + + tl_shape_t shape = {2, 2, 2, 2}; + int size = shape.n * shape.c * shape.h * shape.w; + + s8 src[2 * 2 * 2 * 2] = {1, 5, 2, 6, 3, 7, 4, 8, + 11, 15, 12, 16, 13, 17, 14, 18}; + + s8 dst[2 * 2 * 2 * 2] = {0}; + s8 ref_dst[2 * 2 * 2 * 2] = {1, 2, 3, 4, 5, 6, 7, 8, + 11, 12, 13, 14, 15, 16, 17, 18}; + + convert_nhwc_to_nchw(shape, src, dst); + for (int i = 0; i < size; ++i) { + if (dst[i] != ref_dst[i]) { + printf("Error ! 
dst[%d] %d != %d(expected)\n", i, dst[i], ref_dst[i]); + ret = -1; + } + } + + tl_shape_t input_shape = {/*n=*/1, /*h=*/5, /*w=*/6, /*c=*/8}; + int input_size = + input_shape.n * input_shape.c * input_shape.h * input_shape.w; + s8 nhwc_input_data[240] = { + 103, 85, -96, 120, 105, -72, 33, -50, -104, 12, -57, -80, + 12, 126, 117, 127, 119, 119, -88, 57, 120, 123, 117, -100, + -4, 76, 76, -52, -92, -127, -21, -100, 106, 35, 74, 96, + 117, 0, 39, 76, -119, -36, 89, -74, 111, 46, 45, -26, + 65, 61, 62, -7, -28, -20, 39, -84, -85, -51, 52, 76, + -120, -47, -58, 95, -117, -90, -104, 126, 82, 82, 49, -96, + -47, 67, 115, -3, -120, 41, -16, -96, -31, -75, 67, -115, + 75, -119, -81, -24, -3, -11, -14, -4, 37, 75, 53, 107, + 65, 78, -58, 52, 46, -128, 39, 53, -87, 36, -98, -12, + -1, 70, 117, 18, -41, 96, 21, 78, -71, -124, 64, 82, + -63, 82, 1, 112, 50, -23, 100, -20, 117, 20, 12, -88, + -93, 67, -90, -70, -63, 79, 87, 125, -63, -43, 80, -52, + -66, -125, 109, -73, -39, 104, -78, 89, -64, 116, 29, 71, + -7, 124, -38, -111, 84, 75, 21, 24, 12, 59, 106, 49, + -55, 46, 65, -28, 64, 15, -31, -75, 17, 7, -109, -25, + -115, -38, 7, 23, 71, -37, 111, 119, -95, -89, 17, -27, + -8, -29, -125, 58, -42, -29, -87, 109, 75, -17, -49, 92, + 7, 30, -86, -98, 26, -8, -61, -41, 39, 7, 48, 55, + 63, 125, -13, 56, -107, 105, -70, 1, 105, 14, -89, 0, + 83, -10, 9, 11, 127, -14, -108, 90, -15, 26, -101, -1}; + s8 input_data[240]; + convert_nhwc_to_nchw(input_shape, nhwc_input_data, input_data); + printf("NCHW input_data[%d] = {\n", input_size); + for (int i = 0; i < input_size; ++i) { + printf("%d, ", input_data[i]); + if (i && ((i % 16) == 0)) { + printf("\n"); + } + } + printf("};\n\n"); + + tl_shape_t filter_shape = {1, 3, 3, 8}; + int filter_size = + filter_shape.n * filter_shape.c * filter_shape.h * filter_shape.w; + s8 nhwc_filter_data[72] = { + 103, 85, -96, 120, 105, -72, 33, -50, -104, 12, -57, -80, + 12, 126, 117, 127, 119, 119, -88, 57, 120, 123, 117, -100, + -4, 76, 76, -52, -92, -127, -21, -100, 106, 35, 74, 96, + 117, 0, 39, 76, -119, -36, 89, -74, 111, 46, 45, -26, + 65, 61, 62, -7, -28, -20, 39, -84, -85, -51, 52, 76, + -120, -47, -58, 95, -117, -90, -104, 126, 82, 82, 49, -96}; + s8 filter_data[72]; + convert_nhwc_to_nchw(filter_shape, nhwc_filter_data, filter_data); + printf("NCHW filter_data[%d] = {\n", filter_size); + for (int i = 0; i < filter_size; ++i) { + printf("%d, ", filter_data[i]); + if (i && ((i % 16) == 0)) { + printf("\n"); + } + } + printf("}\n\n"); + + tl_shape_t output_shape = {1, 3, 4, 8}; + int output_size = + output_shape.n * output_shape.c * output_shape.h * output_shape.w; + s8 nhwc_output_data[96] = { + 127, 127, 69, 34, 36, 127, 127, 127, -101, -65, 39, 13, + 26, 6, 127, -67, 60, 123, 31, 17, 3, -128, -58, -64, + -128, 26, -128, -21, 72, 55, 127, 94, -46, -128, -37, 1, + -6, 109, 98, -14, -11, 48, -128, -3, -50, 37, -20, 79, + -94, -36, 127, 19, 3, -18, -40, -115, 24, 124, -128, -1, + -52, -123, -54, -1, -62, 95, 127, 24, 10, -74, 127, -128, + -2, 111, 106, 4, 3, -128, 127, 127, -30, 98, -21, -1, + -11, -12, 58, -72, -128, 127, 30, 32, -85, -11, -35, 34}; + s8 output_data[96] = {0}; + convert_nhwc_to_nchw(output_shape, nhwc_output_data, output_data); + printf("NCHW output_data[%d] = {\n", output_size); + for (int i = 0; i < output_size; ++i) { + printf("%d, ", output_data[i]); + if (i && ((i % 16) == 0)) { + printf("\n"); + } + } + printf("};\n\n"); + + return ret; +} + +int simple_nhwc_dw_conv_test(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk) +{ + int ret = 0; + + const int 
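+  /* A TFLite-style NHWC depthwise check with hand-verified data: symmetric
+   * quantization (input/output offset 0), depth_multiplier 1, unit stride
+   * and dilation, and no padding. */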
stride_width = 1; + const int stride_height = 1; + const int dilation_width_factor = 1; + const int dilation_height_factor = 1; + const int pad_width = 0; + const int pad_height = 0; + const int depth_multiplier = 1; + const int input_offset = 0; // symmetric + const int output_offset = 0; // symmetric + const int output_activation_min = -128; + const int output_activation_max = 127; + + if (ctx == nullptr) { + return -1; + } + if (bmk == nullptr) { + return -1; + } + + tl_shape_t input_shape = {/*n=*/1, /*h=*/5, /*w=*/6, /*c=*/8}; + s8 input_data[240] = { + 103, 85, -96, 120, 105, -72, 33, -50, -104, 12, -57, -80, + 12, 126, 117, 127, 119, 119, -88, 57, 120, 123, 117, -100, + -4, 76, 76, -52, -92, -127, -21, -100, 106, 35, 74, 96, + 117, 0, 39, 76, -119, -36, 89, -74, 111, 46, 45, -26, + 65, 61, 62, -7, -28, -20, 39, -84, -85, -51, 52, 76, + -120, -47, -58, 95, -117, -90, -104, 126, 82, 82, 49, -96, + -47, 67, 115, -3, -120, 41, -16, -96, -31, -75, 67, -115, + 75, -119, -81, -24, -3, -11, -14, -4, 37, 75, 53, 107, + 65, 78, -58, 52, 46, -128, 39, 53, -87, 36, -98, -12, + -1, 70, 117, 18, -41, 96, 21, 78, -71, -124, 64, 82, + -63, 82, 1, 112, 50, -23, 100, -20, 117, 20, 12, -88, + -93, 67, -90, -70, -63, 79, 87, 125, -63, -43, 80, -52, + -66, -125, 109, -73, -39, 104, -78, 89, -64, 116, 29, 71, + -7, 124, -38, -111, 84, 75, 21, 24, 12, 59, 106, 49, + -55, 46, 65, -28, 64, 15, -31, -75, 17, 7, -109, -25, + -115, -38, 7, 23, 71, -37, 111, 119, -95, -89, 17, -27, + -8, -29, -125, 58, -42, -29, -87, 109, 75, -17, -49, 92, + 7, 30, -86, -98, 26, -8, -61, -41, 39, 7, 48, 55, + 63, 125, -13, 56, -107, 105, -70, 1, 105, 14, -89, 0, + 83, -10, 9, 11, 127, -14, -108, 90, -15, 26, -101, -1}; + + tl_shape_t filter_shape = {1, 3, 3, 8}; + s8 filter_data[72] = { + 103, 85, -96, 120, 105, -72, 33, -50, -104, 12, -57, -80, + 12, 126, 117, 127, 119, 119, -88, 57, 120, 123, 117, -100, + -4, 76, 76, -52, -92, -127, -21, -100, 106, 35, 74, 96, + 117, 0, 39, 76, -119, -36, 89, -74, 111, 46, 45, -26, + 65, 61, 62, -7, -28, -20, 39, -84, -85, -51, 52, 76, + -120, -47, -58, 95, -117, -90, -104, 126, 82, 82, 49, -96}; + + s32 bias_data[8] = {812, 670, -746, 938, 827, -558, 265, -384}; + + u32 output_multiplier[8] = {1155460505, 1210948247, 1203328687, 1166122678, + 1155273687, 1196350022, 1169748238, 1183287581}; + + s8 output_rshift[8] = {-7, -6, -6, -9, -8, -6, -6, -7}; + + tl_shape_t output_shape = {1, 3, 4, 8}; + s8 output_data[96] = {0}; + s8 ref_output_data[96] = { + 127, 127, 69, 34, 36, 127, 127, 127, -101, -65, 39, 13, + 26, 6, 127, -67, 60, 123, 31, 17, 3, -128, -58, -64, + -128, 26, -128, -21, 72, 55, 127, 94, -46, -128, -37, 1, + -6, 109, 98, -14, -11, 48, -128, -3, -50, 37, -20, 79, + -94, -36, 127, 19, 3, -18, -40, -115, 24, 124, -128, -1, + -52, -123, -54, -1, -62, 95, 127, 24, 10, -74, 127, -128, + -2, 111, 106, 4, 3, -128, 127, 127, -30, 98, -21, -1, + -11, -12, 58, -72, -128, 127, 30, 32, -85, -11, -35, 34}; + + const int batches = input_shape.n; + // const int output_depth = 8; + const int input_height = input_shape.c; + const int input_width = input_shape.h; + const int input_depth = input_shape.w; + const int filter_height = filter_shape.c; + const int filter_width = filter_shape.h; + const int output_height = output_shape.c; + const int output_width = output_shape.h; + + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) 
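+        /* depth_multiplier expands each input channel into
+         * output_channel = m + in_channel * depth_multiplier outputs;
+         * with depth_multiplier = 1 the mapping is one-to-one. */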
{ + for (int m = 0; m < depth_multiplier; ++m) { + const int output_channel = m + in_channel * depth_multiplier; + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + s32 acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; + // Zero padding by omitting the areas outside the image. + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + if (is_point_inside_image) { + s32 input_val = input_data[Offset(input_shape, batch, in_y, + in_x, in_channel)]; + s32 filter_val = filter_data[Offset( + filter_shape, 0, filter_y, filter_x, output_channel)]; + acc += filter_val * (input_val + input_offset); + + printf(" [batch=%d][out_y=%d][out_x=%d][in_channel=%d][m=%d]" + "[filter_y=%d][filter_x=%d] acc(%d) += %d * (%d + %d) " + "= %d\n", + batch, out_y, out_x, in_channel, m, filter_y, filter_x, + acc - filter_val * (input_val + input_offset), + filter_val, input_val, input_offset, acc); + } + } + } + if (1 /*bias_data*/) { + acc += bias_data[output_channel]; + } + + printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = " + "%d, bias %d\n", + batch, out_y, out_x, output_channel, acc, + bias_data[output_channel]); + + acc = MultiplyByQuantizedMultiplier( + acc, output_multiplier[output_channel], + output_rshift[output_channel]); + + printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = " + "%d, multiplier %d, shift %d\n", + batch, out_y, out_x, output_channel, acc, + output_multiplier[output_channel], + output_rshift[output_channel]); + + acc += output_offset; + acc = MAX(acc, output_activation_min); + acc = MIN(acc, output_activation_max); + + printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = " + "%d\n", + batch, out_y, out_x, output_channel, acc); + + { + int x = Offset(output_shape, batch, out_y, out_x, output_channel); + if (x >= 96) { + printf("Error ! 
shape=(%d, %d, %d, %d), batch %d, out_y %d, "
+                   "out_x %d, output_channel %d, offset %d\n",
+                   output_shape.n, output_shape.c, output_shape.h,
+                   output_shape.w, batch, out_y, out_x, output_channel, x);
+            }
+          }
+
+          output_data[Offset(output_shape, batch, out_y, out_x,
+                             output_channel)] = static_cast<s8>(acc);
+        }
+      }
+    }
+  }
+
+  int output_size =
+      output_shape.n * output_shape.c * output_shape.h * output_shape.w;
+  for (int i = 0; i < output_size; ++i) {
+    if (output_data[i] != ref_output_data[i]) {
+      printf("  output_data[%d] = %d != %d\n", i, output_data[i],
+             ref_output_data[i]);
+      ret = -1;
+    }
+  }
+
+  return ret;
+}
+
+typedef struct {
+  int stride_width;
+  int stride_height;
+  int dilation_width_factor;
+  int dilation_height_factor;
+  int padding_width;
+  int padding_height;
+  int depth_multiplier;
+} DwConvParams;
+
+void dw_conv_per_channel_ref(const dw_conv_test_param_t *p_param)
+{
+  const int input_offset = 0;   // symmetric
+  const int output_offset = 0;  // symmetric
+  const int output_activation_min = -128;
+  const int output_activation_max = 127;
+
+  const int stride_width = p_param->stride_w;
+  const int stride_height = p_param->stride_h;
+  const int dilation_width_factor = 1;   // params.dilation_width_factor;
+  const int dilation_height_factor = 1;  // params.dilation_height_factor;
+  const int pad_width = p_param->pad_left;
+  const int pad_height = p_param->pad_top;
+  const int depth_multiplier = 1;  // params.depth_multiplier;
+
+  const int batches = p_param->input_n;
+  const int input_height = p_param->input_h;
+  const int input_width = p_param->input_w;
+  const int input_depth = p_param->input_c;
+  const int filter_height = p_param->kh;
+  const int filter_width = p_param->kw;
+  const int output_depth = p_param->output_c;
+  const int output_height = p_param->output_h;
+  const int output_width = p_param->output_w;
+  s8 *input_data = p_param->input_data;
+  s8 *filter_data = p_param->filter_data;
+  s8 *output_data = p_param->output_data;
+  s32 *bias_data = p_param->has_bias ?
p_param->bias_data : nullptr;
+  u32 *output_multiplier = p_param->multiplier_data;
+  s8 *output_rshift = p_param->shift_data;
+
+  tl_shape_t input_shape = {
+      static_cast<u32>(batches), static_cast<u32>(input_depth),
+      static_cast<u32>(input_height), static_cast<u32>(input_width)};
+  tl_shape_t filter_shape = {
+      static_cast<u32>(output_depth), static_cast<u32>(input_depth),
+      static_cast<u32>(filter_height), static_cast<u32>(filter_width)};
+  tl_shape_t output_shape = {
+      static_cast<u32>(batches), static_cast<u32>(output_depth),
+      static_cast<u32>(output_height), static_cast<u32>(output_width)};
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("dw_conv_per_channel_ref =>\n");
+  printf("  input shape (n=%d, c=%d, h=%d, w=%d)\n", batches, input_depth,
+         input_height, input_width);
+  // printf("  filter shape (oc=%d, kh=%d, kw=%d\n",
+  //        );
+  printf("  output shape (n=%d, c=%d, h=%d, w=%d)\n", batches, output_depth,
+         output_height, output_width);
+  printf("  stride_h %d, stride_w %d\n", stride_height, stride_width);
+#endif
+
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+          for (int m = 0; m < depth_multiplier; ++m) {
+            const int output_channel = m + in_channel * depth_multiplier;
+            const int in_x_origin = (out_x * stride_width) - pad_width;
+            const int in_y_origin = (out_y * stride_height) - pad_height;
+            s32 acc = 0;
+            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
+                // Zero padding by omitting the areas outside the image.
+                const bool is_point_inside_image =
+                    (in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
+                    (in_y < input_height);
+                if (is_point_inside_image) {
+                  s32 input_val = input_data[Offset(input_shape, batch,
+                                                    in_channel, in_y, in_x)];
+                  s32 filter_val = filter_data[Offset(
+                      filter_shape, 0, output_channel, filter_y, filter_x)];
+                  acc += filter_val * (input_val + input_offset);
+
+#ifdef ENABLE_DEBUG_MSG
+                  printf(" [batch=%d][out_y=%d][out_x=%d][in_channel=%d][m=%d]"
+                         "[filter_y=%d][filter_x=%d] acc(%d) += %d * (%d + %d) "
+                         "= %d, in_x_origin %d, in_x %d\n",
+                         batch, out_y, out_x, in_channel, m, filter_y, filter_x,
+                         acc - filter_val * (input_val + input_offset),
+                         filter_val, input_val, input_offset, acc, in_x_origin,
+                         in_x);
+#endif
+                }
+              }
+            }
+            if (bias_data) {
+              acc += bias_data[output_channel];
+            }
+
+#ifdef ENABLE_DEBUG_MSG
+            printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = "
+                   "%d, bias %d\n",
+                   batch, out_y, out_x, output_channel, acc,
+                   bias_data ?
bias_data[output_channel] : 0);
+#endif
+
+            acc = MultiplyByQuantizedMultiplier(
+                acc, output_multiplier[output_channel],
+                static_cast<int>(output_rshift[output_channel]));
+
+#ifdef ENABLE_DEBUG_MSG
+            printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = "
+                   "%d, multiplier %d, shift %d\n",
+                   batch, out_y, out_x, output_channel, acc,
+                   output_multiplier[output_channel],
+                   output_rshift[output_channel]);
+#endif
+
+            acc += output_offset;
+            acc = MAX(acc, output_activation_min);
+            acc = MIN(acc, output_activation_max);
+
+#ifdef ENABLE_DEBUG_MSG
+            printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = "
+                   "%d\n",
+                   batch, out_y, out_x, output_channel, acc);
+#endif
+
+            output_data[Offset(output_shape, batch, output_channel, out_y,
+                               out_x)] = static_cast<s8>(acc);
+          }
+        }
+      }
+    }
+  }
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("<= dw_conv_per_channel_ref\n");
+#endif
+}
+
+void calc_dw_conv_float_multiplier(dw_conv_test_param_t *p_param)
+{
+  const int input_offset = 0;  // symmetric
+
+  const int stride_width = p_param->stride_w;
+  const int stride_height = p_param->stride_h;
+  const int dilation_width_factor = 1;   // params.dilation_width_factor;
+  const int dilation_height_factor = 1;  // params.dilation_height_factor;
+  const int pad_width = p_param->pad_left;
+  const int pad_height = p_param->pad_top;
+  const int depth_multiplier = 1;  // params.depth_multiplier;
+
+  const int batches = p_param->input_n;
+  const int input_height = p_param->input_h;
+  const int input_width = p_param->input_w;
+  const int input_depth = p_param->input_c;
+  const int filter_height = p_param->kh;
+  const int filter_width = p_param->kw;
+  const int output_depth = p_param->output_c;
+  const int output_height = p_param->output_h;
+  const int output_width = p_param->output_w;
+  s8 *input_data = p_param->input_data;
+  s8 *filter_data = p_param->filter_data;
+  s32 *bias_data = p_param->has_bias ? p_param->bias_data : nullptr;
+
+  tl_shape_t input_shape = {
+      static_cast<u32>(batches), static_cast<u32>(input_depth),
+      static_cast<u32>(input_height), static_cast<u32>(input_width)};
+  tl_shape_t filter_shape = {
+      static_cast<u32>(output_depth), static_cast<u32>(input_depth),
+      static_cast<u32>(filter_height), static_cast<u32>(filter_width)};
+
+  int output_accu_min = INT_MAX;
+  int output_accu_max = INT_MIN;
+
+  // printf("calc_dw_conv_float_multiplier =>\n");
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
+          for (int m = 0; m < depth_multiplier; ++m) {
+            const int output_channel = m + in_channel * depth_multiplier;
+            const int in_x_origin = (out_x * stride_width) - pad_width;
+            const int in_y_origin = (out_y * stride_height) - pad_height;
+            s32 acc = 0;
+            for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+              for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+                const int in_x = in_x_origin + dilation_width_factor * filter_x;
+                const int in_y =
+                    in_y_origin + dilation_height_factor * filter_y;
+                // Zero padding by omitting the areas outside the image.
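+                // Example: with pad_left = 1, stride_w = 1 and out_x = 0,
+                // in_x_origin is -1, so the filter_x = 0 tap falls outside
+                // the image and contributes nothing to the accumulator.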
+ const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + if (is_point_inside_image) { + s32 input_val = input_data[Offset(input_shape, batch, + in_channel, in_y, in_x)]; + s32 filter_val = filter_data[Offset( + filter_shape, 0, output_channel, filter_y, filter_x)]; + acc += filter_val * (input_val + input_offset); + + // printf(" + // [batch=%d][out_y=%d][out_x=%d][in_channel=%d][m=%d]" + // "[filter_y=%d][filter_x=%d] acc(%d) += %d * (%d + + // %d) = %d\n", + // batch, out_y, out_x, in_channel, m, filter_y, + // filter_x, acc - filter_val * (input_val + + // input_offset), filter_val, input_val, input_offset, + // acc); + } + } + } + if (bias_data) { + acc += bias_data[output_channel]; + } + + output_accu_max = MAX(acc, output_accu_max); + output_accu_min = MIN(acc, output_accu_min); + + // printf(" [batch=%d][out_y=%d][out_x=%d][output_channel=%d] acc = + // %d, MIN = %d, MAX = %d\n", + // batch, out_y, out_x, output_channel, acc, + // output_accu_min, output_accu_max); + } + } + } + } + } + + // Since int8 ranges from -128 to 127, we need to squeeze the accumulator + // MIN/MAX fit in those ranges correspondingly as much as possible. + if (abs(output_accu_max) > abs(output_accu_min)) { + p_param->float_multiplier = 127.0f / abs(output_accu_max); + } else { + p_param->float_multiplier = 128.0f / abs(output_accu_min); + } + +#ifdef ENABLE_DEBUG_MSG + printf(" output_accu_min %d, output_accu_max %d, output_multiplier %f\n", + output_accu_min, output_accu_max, p_param->float_multiplier); +#endif + + // printf("<= calc_dw_conv_float_multiplier\n"); +} + +int simple_dw_conv_test(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk) +{ + int ret = 0; + + if (ctx == nullptr) { + return -1; + } + if (bmk == nullptr) { + return -1; + } + + const int batches = 1; + const int input_depth = 8; + const int input_height = 5; + const int input_width = 6; + tl_shape_t input_shape = {batches, input_depth, input_height, input_width}; + s8 input_data[240] = { + /* ic = 0 */ + 103, -104, 119, -4, 106, -119, 65, -85, -117, -47, -31, -3, 65, -87, -41, + -63, 117, -63, -66, -64, 84, -55, 17, 71, -8, 75, 26, 63, 105, 127, + + /* ic = 1 */ + 85, 12, 119, 76, 35, -36, 61, -51, -90, 67, -75, -11, 78, 36, 96, 82, 20, + 79, -125, 116, 75, 46, 7, -37, -29, -17, -8, 125, 14, -14, + + /* ic = 2 */ + -96, -57, -88, 76, 74, 89, 62, 52, -104, 115, 67, -14, -58, -98, 21, 1, + 12, 87, 109, 29, 21, 65, -109, 111, -125, -49, -61, -13, -89, -108, + + /* ic = 3 */ + 120, -80, 57, -52, 96, -74, -7, 76, 126, -3, -115, -4, 52, -12, 78, 112, + -88, 125, -73, 71, 24, -28, -25, 119, 58, 92, -41, 56, 0, 90, + + /* ic = 4 */ + 105, 12, 120, -92, 117, 111, -28, -120, 82, -120, 75, 37, 46, -1, -71, 50, + -93, -63, -39, -7, 12, 64, -115, -95, -42, 7, 39, -107, 83, -15, + + /* ic = 5 */ + -72, 126, 123, -127, 0, 46, -20, -47, 82, 41, -119, 75, -128, 70, -124, + -23, 67, -43, 104, 124, 59, 15, -38, -89, -29, 30, 7, 105, -10, 26, + + /* ic = 6 */ + 33, 117, 117, -21, 39, 45, 39, -58, 49, -16, -81, 53, 39, 117, 64, 100, + -90, 80, -78, -38, 106, -31, 7, 17, -87, -86, 48, -70, 9, -101, + + /* ic = 7 */ + -50, 127, -100, -100, 76, -26, -84, 95, -96, -96, -24, 107, 53, 18, 82, + -20, -70, -52, 89, -111, 49, -75, 23, -27, 109, -98, 55, 1, 11, -1}; + + const int kernel_height = 3; + const int kernel_width = 3; + tl_shape_t filter_shape = {1, input_depth, kernel_height, kernel_width}; + // Global memory layout: OcKhKw + s8 filter_data[72] = { + 103, -104, 119, -4, 106, -119, 65, -85, -117, 85, 12, 119, + 
76, 35, -36, 61, -51, -90, -96, -57, -88, 76, 74, 89,
+      62, 52, -104, 120, -80, 57, -52, 96, -74, -7, 76, 126,
+      105, 12, 120, -92, 117, 111, -28, -120, 82, -72, 126, 123,
+      -127, 0, 46, -20, -47, 82, 33, 117, 117, -21, 39, 45,
+      39, -58, 49, -50, 127, -100, -100, 76, -26, -84, 95, -96};
+
+  s32 bias_data[8] = {812, 670, -746, 938, 827, -558, 265, -384};
+
+  u32 output_multiplier[8] = {1155460505, 1210948247, 1203328687, 1166122678,
+                              1155273687, 1196350022, 1169748238, 1183287581};
+
+  // Change to right shift
+  s8 output_rshift[8] = {7, 6, 6, 9, 8, 6, 6, 7};
+
+  u8 per_channel_cal_data[8 * 4 + 8 * 4 + 8];
+  pack_chl_quan_param(8, /*has_bias=*/true, bias_data, output_multiplier,
+                      output_rshift, per_channel_cal_data);
+
+  const int output_height = 3;
+  const int output_width = 4;
+  tl_shape_t output_shape = {batches, input_depth, output_height, output_width};
+  s8 ref_output_data[96] = {
+      /* oc = 0 */
+      127, -101, 60, -128, -46, -11, -94, 24, -62, -2, -30, -128,
+
+      /* oc = 1 */
+      127, -65, 123, 26, -128, 48, -36, 124, 95, 111, 98, 127,
+
+      /* oc = 2 */
+      69, 39, 31, -128, -37, -128, 127, -128, 127, 106, -21, 30,
+
+      /* oc = 3 */
+      34, 13, 17, -21, 1, -3, 19, -1, 24, 4, -1, 32,
+
+      /* oc = 4 */
+      36, 26, 3, 72, -6, -50, 3, -52, 10, 3, -11, -85,
+
+      /* oc = 5 */
+      127, 6, -128, 55, 109, 37, -18, -123, -74, -128, -12, -11,
+
+      /* oc = 6 */
+      127, 127, -58, 127, 98, -20, -40, -54, 127, 127, 58, -35,
+
+      /* oc = 7 */
+      127, -67, -64, 94, -14, 79, -115, -1, -128, 127, -72, 34};
+
+  bmk1880v2_tensor_lmem_t *tl_input =
+      bmk1880v2_lmem_alloc_tensor(bmk, input_shape, FMT_I8, /*eu_align=*/1);
+
+  bmk1880v2_tensor_lmem_t *tl_filter =
+      bmk1880v2_lmem_alloc_tensor(bmk, filter_shape, FMT_I8, /*eu_align=*/1);
+
+  bmk1880v2_tensor_lmem_t *tl_output =
+      bmk1880v2_lmem_alloc_tensor(bmk, output_shape, FMT_I8, /*eu_align=*/1);
+
+  bmk1880v2_tensor_lmem_t *tl_per_channel_cal =
+      bmk1880v2_lmem_alloc_tensor(bmk, {1, 8, 1, 9}, FMT_U8,
+                                  /*eu_align*/ 0);
+
+  put_tensor_g2l(ctx, bmk, tl_per_channel_cal, per_channel_cal_data);
+  put_tensor_g2l(ctx, bmk, tl_input, reinterpret_cast<u8 *>(input_data));
+  put_tensor_g2l(ctx, bmk, tl_filter, reinterpret_cast<u8 *>(filter_data));
+
+  {
+    // Reshape per channel quantization data
+    tl_per_channel_cal->shape = {1, 8, 1, 1};
+    tl_per_channel_cal->stride = bmk1880v2_tensor_lmem_default_stride(
+        bmk, tl_per_channel_cal->shape, FMT_I8, /*eu_align=*/0);
+
+    bmk1880v2_tiu_depthwise_convolution_qdm_param_t param;
+    memset(&param, 0, sizeof(param));
+    param.ofmap = tl_output;
+    param.ifmap = tl_input;
+    param.weight = tl_filter;
+    param.chl_quan_param = tl_per_channel_cal;
+    param.dilation_h = 1;
+    param.dilation_w = 1;
+    param.stride_h = 1;
+    param.stride_w = 1;
+    param.has_bias = 1;
+    bmk1880v2_tiu_depthwise_convolution_qdm(bmk, &param);
+  }
+
+  test_submit(ctx);
+
+  printf("Compare tiu and golden\n");
+  s8 *conv_output_data =
+      reinterpret_cast<s8 *>(get_tensor_l2g(ctx, bmk, tl_output));
+  for (int i = 0; i < static_cast<int>(sizeof(ref_output_data)); i++) {
+    if (conv_output_data[i] != ref_output_data[i]) {
+      printf("output_data[%d] %d != %d\n", i, conv_output_data[i],
+             ref_output_data[i]);
+      ret = -1;
+    }
+  }
+
+  free(conv_output_data);
+
+  s8 output_data[96] = {0};
+  memset(output_data, 0, sizeof(output_data));
+
+  dw_conv_test_param_t params;
+  memset(&params, 0, sizeof(params));
+  params.input_n = batches;
+  params.input_c = input_depth;
+  params.input_h = input_height;
+  params.input_w = input_width;
+  params.kh = kernel_height;
+  params.kw = kernel_width;
+  params.output_c = input_depth;
+  params.output_h = output_height;
+  params.output_w = output_width;
+  params.stride_w = 1;
+  params.stride_h = 1;
+  params.input_data = input_data;
+  params.filter_data = filter_data;
+  params.output_data = output_data;
+  params.has_bias = 1;
+  params.bias_data = bias_data;
+  params.multiplier_data = output_multiplier;
+  params.shift_data = output_rshift;
+
+  dw_conv_per_channel_ref(&params);
+
+  printf("Compare ref and golden\n");
+  int output_size =
+      output_shape.n * output_shape.c * output_shape.h * output_shape.w;
+  for (int i = 0; i < output_size; ++i) {
+    if (output_data[i] != ref_output_data[i]) {
+      printf("  output_data[%d] = %d != %d\n", i, output_data[i],
+             ref_output_data[i]);
+      ret = -1;
+    }
+  }
+
+  // Reverse order
+  bmk1880v2_lmem_free_tensor(bmk, tl_per_channel_cal);
+  bmk1880v2_lmem_free_tensor(bmk, tl_output);
+  bmk1880v2_lmem_free_tensor(bmk, tl_filter);
+  bmk1880v2_lmem_free_tensor(bmk, tl_input);
+
+  return ret;
+}
+
+int choose_from_range(int table[], int size, int index)
+{
+  if (index >= size) {
+    return 0;
+  }
+
+  int val = table[index];
+  if (index < (size - 1)) {
+    int range = MAX(table[index + 1] - table[index] - 1, 1);
+    val += rand() % range;
+  }
+
+  return val;
+}
+
+void dump_test_param(dw_conv_test_param_t *p_param, bool dump_content)
+{
+  printf("Dump test parameter:\n");
+  printf("  input_n %d\n", p_param->input_n);
+  printf("  input_c %d\n", p_param->input_c);
+  printf("  input_h %d\n", p_param->input_h);
+  printf("  input_w %d\n", p_param->input_w);
+  printf("  kw %d\n", p_param->kw);
+  printf("  kh %d\n", p_param->kh);
+  printf("  dh %d\n", p_param->dh);
+  printf("  dw %d\n", p_param->dw);
+  printf("  pad_top %d\n", p_param->pad_top);
+  printf("  pad_bot %d\n", p_param->pad_bot);
+  printf("  pad_left %d\n", p_param->pad_left);
+  printf("  pad_right %d\n", p_param->pad_right);
+  printf("  ins_h %d\n", p_param->ins_h);
+  printf("  ins_h_last %d\n", p_param->ins_h_last);
+  printf("  ins_w %d\n", p_param->ins_w);
+  printf("  ins_w_last %d\n", p_param->ins_w_last);
+  printf("  stride_h %d\n", p_param->stride_h);
+  printf("  stride_w %d\n", p_param->stride_w);
+  printf("  output_c %d\n", p_param->output_c);
+  printf("  output_h %d\n", p_param->output_h);
+  printf("  output_w %d\n", p_param->output_w);
+  printf("  has_bias %d\n", p_param->has_bias);
+  printf("  relu_enable %d\n", p_param->relu_enable);
+
+  if (dump_content) {
+    printf("input_data(%d, %d, %d, %d) :\n", p_param->input_n, p_param->input_c,
+           p_param->input_h, p_param->input_w);
+    int in = p_param->input_n;
+    int ic = p_param->input_c;
+    int ih = p_param->input_h;
+    int iw = p_param->input_w;
+    for (int i = 0; i < in; ++i) {
+      for (int j = 0; j < ic; ++j) {
+        for (int k = 0; k < ih; ++k) {
+          for (int l = 0; l < iw; ++l) {
+            int offset = i * (ic * ih * iw) + j * (ih * iw) + k * iw + l;
+            printf("%d, ", p_param->input_data[offset]);
+          }
+          printf("\n");
+        }
+      }
+    }
+    printf("\n\n");
+
+    printf("kernel_data (%d, %d, %d)\n", p_param->output_c, p_param->kh,
+           p_param->kw);
+    int kh = p_param->kh;
+    int kw = p_param->kw;
+    for (int i = 0; i < ic; ++i) {
+      for (int j = 0; j < kh; ++j) {
+        for (int k = 0; k < kw; ++k) {
+          int offset = i * (kh * kw) + j * kw + k;
+          printf("%d, ", p_param->filter_data[offset]);
+        }
+      }
+      printf("\n");
+    }
+    printf("\n\n");
+
+    if (p_param->has_bias) {
+      printf("bias_data:\n");
+      for (int i = 0; i < ic; ++i) {
+        printf("%d, ", p_param->bias_data[i]);
+      }
+      printf("\n\n");
+    }
+
+    printf("multiplier_data:\n");
+    for (int i = 0; i < ic; ++i) {
+      printf("%d, ", p_param->multiplier_data[i]);
+    }
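+    // Note: multiplier_data holds the per-channel quantized multipliers and
+    // shift_data (dumped below) the matching arithmetic right shifts applied
+    // after the 32-bit multiply in MultiplyByQuantizedMultiplier().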
printf("\n\n"); + + printf("shift_data:\n"); + for (int i = 0; i < ic; ++i) { + printf("%d, ", p_param->shift_data[i]); + } + printf("\n\n"); + } +} + +int run_compare_dw_conv(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, + dw_conv_test_param_t *p_param) +{ + int ret = 0; + + if (ctx == nullptr || bk_ctx == nullptr) { + return -1; + } + + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int oh = p_param->output_h; + int ow = p_param->output_w; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_last_h = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_last_w = p_param->ins_w_last; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int has_bias = p_param->has_bias; + int relu_enable = p_param->relu_enable; + + int input_size = in * ic * iw * ih; + s8 *input_data = (s8 *)malloc(input_size); + + int kernel_size = oc * ic * kh * kw; + s8 *kernel_data = (s8 *)malloc(kernel_size); + + int output_size = in * oc * oh * ow; + s8 *output_data = (s8 *)malloc(output_size); + if (!input_data || !kernel_data || !output_data) { + free(input_data); + free(kernel_data); + free(output_data); + return -1; + } + + memset(output_data, 0, output_size); + + s32 *bias_data = (s32 *)malloc(sizeof(s32) * oc); + u32 *multiplier_data = (u32 *)malloc(sizeof(u32) * oc); + s8 *shift_data = (s8 *)malloc(oc); + + p_param->input_data = input_data; + p_param->filter_data = kernel_data; + p_param->output_data = output_data; + p_param->has_bias = has_bias; + p_param->bias_data = bias_data; + p_param->multiplier_data = multiplier_data; + p_param->shift_data = shift_data; + +#ifdef ENABLE_DEBUG_MSG + printf(" run_compare_dw_conv =>\n"); + printf(" input (n=%d, ic=%d, h=%d, w=%d), kernel (oc=%d, ic=%d, h=%d, " + "w=%d), output (, c=%d, h=%d, w=%d), has_bias %d\n", + in, ic, ih, iw, oc, ic, kh, kw, oc, oh, ow, has_bias); +#endif + + int retry_cnt = p_param->retry_cnt; + do { + fill_random_data_s8(input_data, input_size); + fill_random_data_s8(kernel_data, kernel_size); + if (has_bias) { + fill_random_data_s32(bias_data, oc); + } + + p_param->float_multiplier = 100.0; // should be < 1.0 + calc_dw_conv_float_multiplier(p_param); + + if (p_param->float_multiplier > 0.f && p_param->float_multiplier < 1.0) { + break; + } + + } while (--retry_cnt); + + if (p_param->float_multiplier >= 1.0) { + printf(" run_compare_dw_conv: unable to find valid multiplier\n"); + free(input_data); + free(kernel_data); + free(output_data); + free(bias_data); + free(multiplier_data); + free(shift_data); + return -1; + } + + u32 base_multiplier = 0; + int base_shift = 0; + QuantizeMultiplierSmallerThanOne(p_param->float_multiplier, &base_multiplier, + &base_shift); + + for (int i = 0; i < oc; ++i) { + // multipliers typically range in [2^30 ; 2^31 - 1]. + // Values in [0, 2^30 - 1] are normally unused, but harmless. + // Thus a good way to randomize multipliers is to subtract from them + // a random value smaller than 2^30 but still significant compared to it. + p_param->multiplier_data[i] = base_multiplier - (rand() % (1 << 26)); + + // Our H/W only supports right shift + int right_shift = base_shift - 1 + (rand() % 4); + p_param->shift_data[i] = right_shift > 0 ? 
right_shift : 0;
+
+#ifdef ENABLE_DEBUG_MSG
+    printf("  [oc=%d] multiplier_data %d, shift_data %d\n", i,
+           p_param->multiplier_data[i], p_param->shift_data[i]);
+#endif
+  }
+
+  dw_conv_per_channel_ref(p_param);
+
+  const int per_chan_cal_data_size =
+      p_param->has_bias ? 9 : 5;  // bias(4) + multiplier(4) + shift(1)
+  const int cal_data_size = oc * per_chan_cal_data_size;
+  u8 *cal_data = (u8 *)malloc(cal_data_size);
+  pack_chl_quan_param(oc, p_param->has_bias, p_param->bias_data,
+                      p_param->multiplier_data, p_param->shift_data,
+                      cal_data);
+
+  tl_shape_t input_shape = {static_cast<u32>(in), static_cast<u32>(ic),
+                            static_cast<u32>(ih), static_cast<u32>(iw)};
+  tl_shape_t filter_shape = {1, static_cast<u32>(oc), static_cast<u32>(kh),
+                             static_cast<u32>(kw)};
+  tl_shape_t output_shape = {static_cast<u32>(in), static_cast<u32>(oc),
+                             static_cast<u32>(oh), static_cast<u32>(ow)};
+  tl_shape_t cal_shape = {1, static_cast<u32>(oc), 1,
+                          static_cast<u32>(per_chan_cal_data_size)};
+
+  bmk1880v2_tensor_lmem_t *tl_input =
+      bmk1880v2_lmem_alloc_tensor(bk_ctx, input_shape, FMT_I8, /*eu_align=*/1);
+
+  bmk1880v2_tensor_lmem_t *tl_filter =
+      bmk1880v2_lmem_alloc_tensor(bk_ctx, filter_shape, FMT_I8, /*eu_align=*/1);
+
+  bmk1880v2_tensor_lmem_t *tl_output =
+      bmk1880v2_lmem_alloc_tensor(bk_ctx, output_shape, FMT_I8, /*eu_align=*/1);
+
+  // Shape for TDMA load
+  bmk1880v2_tensor_lmem_t *tl_cal_data =
+      bmk1880v2_lmem_alloc_tensor(bk_ctx, cal_shape, FMT_U8, /*eu_align*/ 0);
+
+  if (!tl_input || !tl_filter || !tl_output || !tl_cal_data) {
+    if (tl_input == nullptr) {
+      printf("    fail to alloc tl_input (%d, %d, %d, %d)\n", input_shape.n,
+             input_shape.c, input_shape.h, input_shape.w);
+    }
+    if (tl_filter == nullptr) {
+      printf("    fail to alloc tl_filter (%d, %d, %d, %d)\n", filter_shape.n,
+             filter_shape.c, filter_shape.h, filter_shape.w);
+    }
+    if (tl_output == nullptr) {
+      printf("    fail to alloc tl_output (%d, %d, %d, %d)\n", output_shape.n,
+             output_shape.c, output_shape.h, output_shape.w);
+    }
+    if (tl_cal_data == nullptr) {
+      printf("    fail to alloc tl_cal_data (%d, %d, %d, %d)\n", cal_shape.n,
+             cal_shape.c, cal_shape.h, cal_shape.w);
+    }
+
+    // Reverse order
+    if (tl_cal_data)
+      bmk1880v2_lmem_free_tensor(bk_ctx, tl_cal_data);
+    if (tl_output)
+      bmk1880v2_lmem_free_tensor(bk_ctx, tl_output);
+    if (tl_filter)
+      bmk1880v2_lmem_free_tensor(bk_ctx, tl_filter);
+    if (tl_input)
+      bmk1880v2_lmem_free_tensor(bk_ctx, tl_input);
+
+    return -1;
+  }
+
+  put_tensor_g2l(ctx, bk_ctx, tl_cal_data, cal_data);
+  put_tensor_g2l(ctx, bk_ctx, tl_input, reinterpret_cast<u8 *>(input_data));
+  put_tensor_g2l(ctx, bk_ctx, tl_filter, reinterpret_cast<u8 *>(kernel_data));
+
+  {
+    // Reshape per channel quantization data for TIU
+    tl_cal_data->shape = {1, static_cast<u32>(oc), 1, 1};
+    tl_cal_data->stride = bmk1880v2_tensor_lmem_default_stride(
+        bk_ctx, tl_cal_data->shape, FMT_I8, /*eu_align=*/0);
+
+    bmk1880v2_tiu_depthwise_convolution_qdm_param_t param;
+    memset(&param, 0, sizeof(param));
+    param.ofmap = tl_output;
+    param.ifmap = tl_input;
+    param.weight = tl_filter;
+    param.chl_quan_param = tl_cal_data;
+    param.ins_h = ins_h;
+    param.ins_last_h = ins_last_h;
+    param.ins_w = ins_w;
+    param.ins_last_w = ins_last_w;
+    param.stride_h = stride_h;
+    param.stride_w = stride_w;
+    param.dilation_h = dh;
+    param.dilation_w = dw;
+    param.pad_top = pad_top;
+    param.pad_bottom = pad_bot;
+    param.pad_left = pad_left;
+    param.pad_right = pad_right;
+    param.has_bias = has_bias;
+    param.relu_enable = relu_enable;
+
+#ifdef ENABLE_DEBUG_MSG
+    printf("    tiu_dw_conv_qdm:\n");
+    printf("    ifmap shape (%d, %d, %d, %d)\n",
param.ifmap->shape.n,
+           param.ifmap->shape.c, param.ifmap->shape.h, param.ifmap->shape.w);
+    printf("    weight shape (%d, %d, %d, %d)\n", param.weight->shape.n,
+           param.weight->shape.c, param.weight->shape.h, param.weight->shape.w);
+    printf("    ofmap shape (%d, %d, %d, %d)\n", param.ofmap->shape.n,
+           param.ofmap->shape.c, param.ofmap->shape.h, param.ofmap->shape.w);
+#endif
+
+    bmk1880v2_tiu_depthwise_convolution_qdm(bk_ctx, &param);
+  }
+
+  test_submit(ctx);
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("    compare result:\n");
+#endif
+  s8 *conv_output_data =
+      reinterpret_cast<s8 *>(get_tensor_l2g(ctx, bk_ctx, tl_output));
+  for (int i = 0; i < output_size; i++) {
+    if (conv_output_data[i] != output_data[i]) {
+      printf("    output_data[%d] %d(tiu) != %d(ref)\n", i,
+             conv_output_data[i], output_data[i]);
+      ret = -1;
+      break;
+    }
+  }
+
+  if (ret) {
+    dump_test_param(p_param, /*dump_content=*/true);
+  }
+
+  // Reverse order
+  bmk1880v2_lmem_free_tensor(bk_ctx, tl_cal_data);
+  bmk1880v2_lmem_free_tensor(bk_ctx, tl_output);
+  bmk1880v2_lmem_free_tensor(bk_ctx, tl_filter);
+  bmk1880v2_lmem_free_tensor(bk_ctx, tl_input);
+
+  free(conv_output_data);
+
+  free(input_data);
+  free(kernel_data);
+  free(output_data);
+  free(bias_data);
+  free(multiplier_data);
+  free(shift_data);
+  free(cal_data);
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("  <= run_compare_dw_conv\n");
+#endif
+
+  return ret;
+}
+
+bool check_valid_test_param(bmk_ctx_t *bk_ctx, dw_conv_test_param_t *p_param)
+{
+  int in = p_param->input_n;
+  int ic = p_param->input_c;
+  int ih = p_param->input_h;
+  int iw = p_param->input_w;
+  int oc = p_param->output_c;
+  int oh = p_param->output_h;
+  int ow = p_param->output_w;
+  int kh = p_param->kh;
+  int kw = p_param->kw;
+  int stride_h = p_param->stride_h;
+  int stride_w = p_param->stride_w;
+  int per_chan_cal_data_size =
+      p_param->has_bias ?
9 : 5;  // bias(4) + multiplier(4) + shift(1)
+
+  // Skip invalid shape
+  if ((kh > ih) || (kw > iw) || (stride_h > ih) || (stride_w > iw)) {
+    return false;
+  }
+
+  // Multiplying randomly chosen values may exceed the s32 range.
+  u32 input_size = in * ic * ih * iw;
+  u32 kernel_size = ic * kh * kw;  // no oc
+  u32 output_size = in * oc * oh * ow;
+
+  bmk1880v2_chip_info_t chip_info = bmk1880v2_chip_info();
+  u32 lmem_size_per_lane = chip_info.lmem_size;
+  u32 total_lmem_size = chip_info.lmem_size * chip_info.npu_num;
+
+  u32 total_needed_size = input_size + kernel_size + output_size +
+                          per_chan_cal_data_size * chip_info.npu_num;
+  if (total_needed_size > total_lmem_size) {
+    return false;
+  }
+
+  tl_shape_t input_shape = {static_cast<u32>(in), static_cast<u32>(ic),
+                            static_cast<u32>(ih), static_cast<u32>(iw)};
+  tl_shape_t filter_shape = {1, static_cast<u32>(oc), static_cast<u32>(kh),
+                             static_cast<u32>(kw)};
+  tl_shape_t output_shape = {static_cast<u32>(in), static_cast<u32>(oc),
+                             static_cast<u32>(oh), static_cast<u32>(ow)};
+  tl_shape_t cal_shape = {1, static_cast<u32>(oc), 1,
+                          static_cast<u32>(per_chan_cal_data_size)};
+
+  u32 needed_size =
+      bmk1880v2_lmem_tensor_to_size(bk_ctx, input_shape, FMT_I8, /*eu_align=*/1)
+      + bmk1880v2_lmem_tensor_to_size(bk_ctx, filter_shape, FMT_I8, /*eu_align=*/1)
+      + bmk1880v2_lmem_tensor_to_size(bk_ctx, output_shape, FMT_I8, /*eu_align=*/1)
+      + bmk1880v2_lmem_tensor_to_size(bk_ctx, cal_shape, FMT_I8, /*eu_align=*/0);
+
+  // Skip invalid shape
+  if (needed_size > lmem_size_per_lane) {
+    return false;
+  }
+
+  return true;
+}
+
+int random_test(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx)
+{
+  int ret = 0;
+
+  if (ctx == nullptr || bk_ctx == nullptr) {
+    return -1;
+  }
+
+#ifndef ENABLE_FULL_REGRESSION
+#ifndef ENABLE_TV_GEN_PATTERN
+  // Input with same range size
+  // n: 12b, c: 12b, h: 12b(4095-32), w: 12b(4095-32)
+  int batch_range[] = {1, 1, 2, 4095 - 32};
+  int input_height_range[] = {1, 512, 1024, 4095 - 32};
+  int input_width_range[] = {1, 512, 1024, 4095 - 32};
+  int input_depth_range[] = {1, 16, 32, 4095 - 32};
+
+  // Kernel with same range size
+  // h: 12b, w: 12b
+  // stride_h: 4b, stride_w: 4b
+  int kernel_height_range[] = {1, 11, 4095};
+  int kernel_width_range[] = {1, 11, 4095};
+  int kernel_stride_height_range[] = {1, 5, 15};
+  int kernel_stride_width_range[] = {1, 5, 15};
+#else
+  // TV_GEN pattern
+  // Random Test, total 2187, skipped 13095, executed 27, failed 0, ret 0
+
+  // Input with same range size
+  // n: 12b, c: 12b, h: 12b(4095-32), w: 12b(4095-32)
+  int batch_range[] = {1, 1, 3232};
+  int input_height_range[] = {1, 512, 4095 - 32};
+  int input_width_range[] = {1, 512, 4095 - 32};
+  int input_depth_range[] = {1, 16, 4095 - 32};
+
+  // Kernel with same range size
+  // h: 12b, w: 12b
+  // stride_h: 4b, stride_w: 4b
+  int kernel_height_range[] = {1, 11, 4095};
+  int kernel_width_range[] = {1, 11, 4095};
+  int kernel_stride_height_range[] = {1, 5, 15};
+  int kernel_stride_width_range[] = {1, 5, 15};
+#endif // ENABLE_TV_GEN_PATTERN
+#else
+#if 0
+  // Input with same range size
+  int batch_range[] = {1};
+  int input_height_range[] = {1};
+  int input_width_range[] = {1};
+  int input_depth_range[] = {1};
+
+  // Kernel with same range size
+  int kernel_height_range[] = {1};
+  int kernel_width_range[] = {1};
+  int kernel_stride_height_range[] = {1};
+  int kernel_stride_width_range[] = {1};
+  int output_depth_range[] = {1};
+#else
+  // 10/21/2019
+  // Random Test, total 512000, skipped 2535629, executed 24371
+
+  // Input with same range size
+  // n: 12b, c: 12b, h: 12b(4095-32), w: 12b(4095-32)
+  int batch_range[] =
{1, 2, 4, 8, 16, 32, 64, 4095 - 32};
+  int input_height_range[] = {1, 3, 11, 128, 512, 1024, 2048, 4095 - 32};
+  int input_width_range[] = {1, 3, 11, 128, 512, 1024, 2048, 4095 - 32};
+  int input_depth_range[] = {1, 3, 11, 32, 64, 1024, 2048, 4095 - 32};
+
+  // Kernel with same range size
+  // h: 12b, w: 12b
+  // stride_h: 4b, stride_w: 4b
+  int kernel_height_range[] = {1, 3, 11, 511, 4095};
+  int kernel_width_range[] = {1, 3, 11, 511, 4095};
+  int kernel_stride_height_range[] = {1, 3, 5, 7, 15};
+  int kernel_stride_width_range[] = {1, 3, 5, 7, 15};
+#endif
+#endif /* ENABLE_FULL_REGRESSION */
+
+  const int input_range_size =
+      sizeof(input_height_range) / sizeof(input_height_range[0]);
+  const int kernel_range_size =
+      sizeof(kernel_height_range) / sizeof(kernel_height_range[0]);
+
+  int random_seed = clock();
+  srand(random_seed);
+
+  const int retry_test_count = 100;
+  bool stop_at_first_error = true;
+
+  int total_tests = input_range_size * input_range_size *
+                    input_range_size * input_range_size * kernel_range_size *
+                    kernel_range_size * kernel_range_size;
+  int skipped_tests = 0;
+  int executed_tests = 0;
+  int failed_tests = 0;
+  int current_test = 0;
+
+  printf("Random Test =>\n");
+  for (int m = 0; m < retry_test_count; ++m) {
+    for (int i = 0; i < input_range_size; ++i) {
+      // randomly chosen from [range[i] : range[i+1]]
+      int batch = choose_from_range(batch_range, input_range_size, i);
+
+      for (int j = 0; j < input_range_size; ++j) {
+        int input_height =
+            choose_from_range(input_height_range, input_range_size, j);
+
+        for (int k = 0; k < input_range_size; ++k) {
+          int input_width =
+              choose_from_range(input_width_range, input_range_size, k);
+
+          for (int l = 0; l < input_range_size; ++l) {
+            int input_depth =
+                choose_from_range(input_depth_range, input_range_size, l);
+
+            for (int m = 0; m < kernel_range_size; ++m) {
+              int kernel_height =
+                  choose_from_range(kernel_height_range, kernel_range_size, m);
+
+              for (int n = 0; n < kernel_range_size; ++n) {
+                int kernel_width =
+                    choose_from_range(kernel_width_range, kernel_range_size, n);
+
+                for (int x = 0; x < kernel_range_size; ++x) {
+                  int kernel_stride_height = choose_from_range(
+                      kernel_stride_height_range, kernel_range_size, x);
+
+                  for (int y = 0; y < kernel_range_size; ++y) {
+                    int kernel_stride_width = choose_from_range(
+                        kernel_stride_width_range, kernel_range_size, y);
+
+#ifdef ENABLE_DEBUG_MSG
+                    printf("  [%d/%d] random test: input shape(%d, %d, %d, %d)",
+                           current_test, total_tests, batch, input_depth,
+                           input_height, input_width);
+                    printf(", kernel shape (oc=%d, ic=%d, kh=%d, kw=%d), "
+                           "stride_h %d, stride_w %d\n",
+                           input_depth, input_depth, kernel_height,
+                           kernel_width, kernel_stride_height,
+                           kernel_stride_width);
+#else
+                    if ((current_test % 10000) == 0) {
+                      printf(
+                          "  [%d/%d] random test: input shape(%d, %d, %d, %d)",
+                          current_test, total_tests, batch, input_depth,
+                          input_height, input_width);
+                      printf(", kernel shape (oc=%d, ic=%d, kh=%d, kw=%d), "
+                             "stride_h %d, stride_w %d\n",
+                             input_depth, input_depth, kernel_height,
+                             kernel_width, kernel_stride_height,
+                             kernel_stride_width);
+                    }
+
+#endif
+
+                    current_test++;
+
+                    int has_bias = rand() % 2;
+                    int dh = 1;
+                    int dw = 1;
+                    int ins_h = 0;
+                    int ins_h_last = 0;
+                    int ins_w = 0;
+                    int ins_w_last = 0;
+                    int pad_top = 0;
+                    int pad_bot = 0;
+                    int pad_left = 0;
+                    int pad_right = 0;
+
+                    int ih_ext = calc_dilute_hw(input_height, ins_h, ins_h_last,
+                                                pad_top, pad_bot);
+                    int iw_ext = calc_dilute_hw(input_width, ins_w, ins_w_last,
+                                                pad_left, pad_right);
+                    int
kh_ext = calc_dilute_hw(kernel_height, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kernel_width, dw - 1, 0, 0, 0); + + int oh = + calc_output_hw(ih_ext, kh_ext, kernel_stride_height); + int ow = + calc_output_hw(iw_ext, kw_ext, kernel_stride_width); + + // depthwise, input depth == output depth + int output_depth = input_depth; + + dw_conv_test_param_t test_param; + memset(&test_param, 0, sizeof(test_param)); + test_param.input_n = batch; + test_param.input_c = input_depth; + test_param.input_h = input_height; + test_param.input_w = input_width; + test_param.kh = kernel_height; + test_param.kw = kernel_width; + test_param.dh = dh; + test_param.dw = dw; + test_param.pad_top = pad_top; + test_param.pad_bot = pad_bot; + test_param.pad_left = pad_left; + test_param.pad_right = pad_right; + test_param.ins_h = ins_h; + test_param.ins_h_last = ins_h_last; + test_param.ins_w = ins_w; + test_param.ins_w_last = ins_w_last; + test_param.stride_h = kernel_stride_height; + test_param.stride_w = kernel_stride_width; + test_param.output_c = output_depth; + test_param.output_h = oh; + test_param.output_w = ow; + test_param.has_bias = has_bias; + test_param.retry_cnt = 5; + + bool is_valid_param = + check_valid_test_param(bk_ctx, &test_param); + if (is_valid_param == false) { + skipped_tests++; +#ifdef ENABLE_DEBUG_MSG + printf(" [%d] random test: invalid parameter, skip\n", + m); +#endif + continue; + } + + int ret2 = run_compare_dw_conv(ctx, bk_ctx, &test_param); + failed_tests = ret2 ? failed_tests + 1 : failed_tests; + ret |= ret2; + executed_tests++; + +#ifdef ENABLE_DEBUG_MSG + printf(" [%d/%d] random test: input shape(%d, %d, %d, %d)", + current_test, total_tests, batch, input_depth, + input_height, input_width); + printf(", kernel shape (%d, %d, %d, %d), result %d\n", + output_depth, input_depth, kernel_height, + kernel_width, ret); +#endif + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + + if (executed_tests >= MIN_EXEC_TESTS) { + break; + } + } + + printf( + "<= Random Test, total %d, skipped %d, executed %d, failed %d, ret %d\n", + total_tests, skipped_tests, executed_tests, failed_tests, ret); + + return ret; +} + +int main() +{ + int ret = 0; + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + // ret = simple_nhwc_dw_conv_test(&ctx, bk_ctx); + // ret |= test_nhwc_to_nchw(); + ret |= simple_dw_conv_test(&ctx, bk_ctx); + ret |= random_test(&ctx, bk_ctx); + + test_exit(&ctx); + return ret; +} diff --git a/cviruntime/test/1880v2/test_1880v2_depthwise_max_power.cpp b/cviruntime/test/1880v2/test_1880v2_depthwise_max_power.cpp new file mode 100644 index 000000000..1d3277640 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_depthwise_max_power.cpp @@ -0,0 +1,597 @@ +#include "1880v2_test_util.h" + +typedef bmk1880v2_tiu_depthwise_convolution_param_t depthwise_conv_param_t; + +typedef 
bmk1880v2_tdma_l2tg_tensor_copy_cw_transposed_param_t l2tg_cw_param_t;
+typedef bmk1880v2_tdma_tg2l_matrix_copy_row_col_transposed_param_t tg2l_matrix_param_t;
+typedef bmk1880v2_tdma_l2l_tensor_copy_param_t l2l_tensor_copy_param_t;
+
+typedef struct {
+  s8 *depthwise_conv_input;
+  s8 *depthwise_conv_weight;
+  s16 *depthwise_conv_bias;
+  u8 *depthwise_conv_output;
+  s8 *depthwise_conv_output_ref;
+  u8 *l2g_cw_src;
+  u8 *l2g_cw_output;
+  u8 *l2g_cw_output_ref;
+  u8 *g2l_matrix_src;
+  u8 *g2l_matrix_output;
+  u8 *g2l_matrix_output_ref;
+  u8 *l2l_tensor_src;
+  u8 *l2l_tensor_output;
+  u8 *l2l_tensor_output_ref;
+} s_test_data;
+
+depthwise_conv_param_t depthwise_conv_param;
+l2tg_cw_param_t l2tg_cw_param;
+tg2l_matrix_param_t tg2l_matrix_param;
+l2l_tensor_copy_param_t l2l_tensor_copy_param;
+s_test_data s8_test_data;
+
+bmk1880v2_tensor_lmem_t *skip_tensor_lmem[10];
+u32 skip_tensor_num = 0;
+
+void skip_tensor_lmem_size(bmk_ctx_t *bmk, const bmk1880v2_tensor_lmem_t *p)
+{
+  if (!p)
+    return;
+
+  u32 needed = align_up(p->shape.n * p->stride.n, BM1880V2_HW_EU_NUM);
+  u32 start_addr = p->start_address + needed;
+  u32 remain_size = start_addr % BM1880V2_HW_LMEM_BANK_SIZE ?
+      (BM1880V2_HW_LMEM_BANK_SIZE - start_addr % BM1880V2_HW_LMEM_BANK_SIZE) : 0;  // remaining size in the current bank, per lane
+  if (remain_size)
+  {
+    tl_shape_t src_shape2 = {1, BM1880V2_HW_NPU_NUM, 1, remain_size};
+    skip_tensor_lmem[skip_tensor_num] = alloc_tl(bmk, src_shape2, FMT_I8, 1);  // skip the remaining lmem so the next tl is aligned to the bank size
+  }
+  skip_tensor_num++;
+}
+
+void skip_matrix_lmem_size(bmk_ctx_t *bmk, const bmk1880v2_matrix_lmem_t *p)
+{
+  u32 needed = align_up(p->shape.n * p->stride.n, BM1880V2_HW_EU_NUM);
+
+  u32 start_addr = p->start_address + needed;  //src_shape.n*src_shape.c*src_shape.h*src_shape.w/32;
+  u32 remain_size = start_addr % BM1880V2_HW_LMEM_BANK_SIZE ?
(BM1880V2_HW_LMEM_BANK_SIZE - start_addr % BM1880V2_HW_LMEM_BANK_SIZE) : 0;  // remaining size in the current bank, per lane
+  if (remain_size)
+  {
+    tl_shape_t src_shape2 = {1, BM1880V2_HW_NPU_NUM, 1, remain_size};
+    skip_tensor_lmem[skip_tensor_num] = alloc_tl(bmk, src_shape2, FMT_I8, 1);  // skip the remaining lmem so the next tl is aligned to the bank size
+  }
+  skip_tensor_num++;
+}
+
+void free_skip_tensor_lmem(bmk_ctx_t *ctx)
+{
+  if (skip_tensor_lmem[--skip_tensor_num] != NULL)
+    free_tl(ctx, skip_tensor_lmem[skip_tensor_num]);
+}
+
+static s8 * alloc_input(const depthwise_conv_param_t *p)
+{
+  u64 size = tl_shape_size(&p->ifmap->shape);
+  s8 *buf = (s8 *)malloc(sizeof(s8) * size);
+
+  for (u64 i = 0; i < size; i++)
+    buf[i] = rand() % 256 - 128;
+
+  return buf;
+}
+
+static s8 * alloc_weight(const depthwise_conv_param_t *p)
+{
+  int size = tl_shape_size(&p->weight->shape);
+  s8 *buf = (s8 *)malloc(sizeof(s8) * size);
+  for (int i = 0; i < size; i++)
+    buf[i] = rand() % 256 - 128;
+
+  return buf;
+}
+
+static s16 * alloc_bias(const depthwise_conv_param_t *p)
+{
+  int c = p->bias->shape.c;
+  s16 *bias = (s16 *)malloc(sizeof(s16) * c);
+
+  for (int i = 0; i < c; i++)
+    bias[i] = rand() % 65536 - 32768;
+
+  return bias;
+}
+
+static s8 *alloc_output(depthwise_conv_param_t *p)
+{
+  u64 size = tl_shape_size(&p->ofmap->shape);
+  s8 *output = (s8 *)malloc(sizeof(s8) * size);
+  return output;
+}
+
+static inline void relu8(s8 *buf, u64 size)
+{
+  for (u64 i = 0; i < size; i++)
+    if (buf[i] < 0)
+      buf[i] = 0;
+}
+
+static void generate_results(
+    depthwise_conv_param_t *p,
+    s8 input[],
+    s8 weight[],
+    s16 bias[]
+    )
+{
+  int in = p->ifmap->shape.n;
+  int ic = p->ifmap->shape.c;
+  int ih = p->ifmap->shape.h;
+  int iw = p->ifmap->shape.w;
+  int kh = p->weight->shape.h;
+  int kw = p->weight->shape.w;
+  int opd0_sign = (p->ifmap->fmt == FMT_I8);
+  int res0_sign = (p->ofmap->fmt == FMT_I8);
+  s8_test_data.depthwise_conv_output_ref = alloc_output(p);
+
+  bmerr_t ret = native_pooling_ave_int8(
+      input, weight, p->bias ?
bias : NULL, s8_test_data.depthwise_conv_output_ref, + in, ic, ih, iw, kh, kw, + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right, + p->stride_h, p->stride_w, + p->ins_h, p->ins_w, p->ins_last_h, p->ins_last_w, + opd0_sign, res0_sign, p->rshift_bits, 0); + assert(ret == BM_SUCCESS); + + if(p->relu_enable ) + relu8(s8_test_data.depthwise_conv_output_ref, tl_shape_size(&p->ofmap->shape)); +} + +static int pooling_ih_ext(depthwise_conv_param_t *p, int ih) +{ + int ins = p->ins_h; + int ins_last = p->ins_last_h; + int pad = p->pad_top + p->pad_bottom; + return (ih - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_iw_ext(depthwise_conv_param_t *p, int iw) +{ + int ins = p->ins_w; + int ins_last = p->ins_last_w; + int pad = p->pad_left + p->pad_right; + return (iw - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_oh(depthwise_conv_param_t *p, int ih, int kh) +{ + int ih_ext = pooling_ih_ext(p, ih); + return (ih_ext - kh) / p->stride_h + 1; +} + +static int pooling_ow(depthwise_conv_param_t *p, int iw, int kw) +{ + int iw_ext = pooling_iw_ext(p, iw); + return (iw_ext - kw) / p->stride_w + 1; +} + +static void free_depthwise_param( + bmk_ctx_t *ctx, + depthwise_conv_param_t *p) +{ + if (p->bias) + { + free_skip_tensor_lmem(ctx); + free_tl(ctx, p->bias); + } + if (p->weight) + { + free_skip_tensor_lmem(ctx); + free_tl(ctx, p->weight); + } + if (p->ifmap) + { + free_skip_tensor_lmem(ctx); + free_tl(ctx, p->ifmap); + } + if (p->ofmap) + { + free_skip_tensor_lmem(ctx); + free_tl(ctx, p->ofmap); + } +} + +static void put_bias_tensor( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + s16 data[]) +{ + int c = tl->shape.c; + + u8 *lo_hi = (u8 *)xmalloc(2 * c); + if (!lo_hi) + return; + + for (int i = 0; i < c; i++) { + lo_hi[i] = data[i] & 0xff; + lo_hi[i + c] = (data[i] >> 8) & 0xff; + } + + put_tensor_g2l(ctx, bk_ctx, tl, (u8 *)lo_hi); + + free(lo_hi); +} + +static depthwise_conv_param_t random_depthwise_param(bmk_ctx_t *ctx) +{ + srand(clock()); + depthwise_conv_param_t p; + + memset(&p, 0, sizeof(p)); + +retry: + int using_bias = 0; + int n = 1; + int c = 4000; + int ih = 2; + int iw = 8; + int kh = 1; + int kw = 1; + int opd0_sign = 0; + + p.ins_h = rand() % kh; + p.ins_w = rand() % kw; + p.ins_last_h = rand() % kh; + p.ins_last_w = rand() % kw; + p.stride_h = rand() % kh + 1; + p.stride_w = rand() % kw + 1; + p.pad_top = 0; + p.pad_bottom = 0; + p.pad_left = 0; + p.pad_right = 0; + p.rshift_bits = 2; + int oh = pooling_oh(&p, ih, kh); + int ow = pooling_ow(&p, iw, kw); + tl_shape_t ofmap_shape; + ofmap_shape.n = n; + ofmap_shape.c = c; + ofmap_shape.h = oh; + ofmap_shape.w = ow; + tl_shape_t ifmap_shape; + ifmap_shape.n = n; + ifmap_shape.c = c; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + tl_shape_t weight_shape; + weight_shape.n = 1; + weight_shape.c = c; + weight_shape.h = kh; + weight_shape.w = kw; + tl_shape_t bias_shape; + bias_shape.n = 2; + bias_shape.c = c; + bias_shape.h = 1; + bias_shape.w = 1; + p.relu_enable = 1; + /*test case ref does not support dilation !=1*/ + p.dilation_w = 1; + p.dilation_h = 1; + fmt_t ifmt = opd0_sign ? 
FMT_I8: FMT_U8; + + p.ofmap = bmk1880v2_lmem_alloc_tensor(ctx, ofmap_shape, FMT_I8, 1); + skip_tensor_lmem_size(ctx, p.ofmap); + p.ifmap = bmk1880v2_lmem_alloc_tensor(ctx, ifmap_shape, ifmt, 1); + skip_tensor_lmem_size(ctx, p.ifmap); + p.weight = bmk1880v2_lmem_alloc_tensor(ctx, weight_shape, FMT_I8, 1); + skip_tensor_lmem_size(ctx, p.weight); + p.bias = NULL; + if (using_bias) + { + p.bias = bmk1880v2_lmem_alloc_tensor(ctx, bias_shape, FMT_I8, 0); + skip_tensor_lmem_size(ctx, p.bias); + } + if ((kh > pooling_ih_ext(&p, ih)) + || (kw > pooling_iw_ext(&p, iw)) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || !p.ofmap + || !p.ifmap + || !p.weight + || (using_bias && !p.bias)) { + printf("retry init_pooling_param\n"); + free_depthwise_param(ctx, &p); + goto retry; + } + return p; +} + + +static int test_pooling(CVI_RT_HANDLE ctx, bmk_ctx_t *bk_ctx) +{ + depthwise_conv_param = random_depthwise_param(bk_ctx); + + s8 *input = alloc_input(&depthwise_conv_param); + s8 *weight = alloc_weight(&depthwise_conv_param); + s16 *bias = NULL; + if (depthwise_conv_param.bias) + bias = alloc_bias(&depthwise_conv_param); + + put_tensor_g2l(&ctx, bk_ctx, depthwise_conv_param.ifmap, (u8 *)input); + put_tensor_g2l(&ctx, bk_ctx, depthwise_conv_param.weight, (u8 *)weight); + if (depthwise_conv_param.bias) + put_bias_tensor(&ctx, bk_ctx, depthwise_conv_param.bias, bias); + + generate_results(&depthwise_conv_param, input, weight, bias); + + free(input); + free(weight); + free(bias); + + return 1; +} + +static void l2tg_tensor_copy_cw_transposed_ref( + l2tg_cw_param_t *p, u8 ref_data[], u8 src_data[]) +{ + tl_shape_t s = p->src->shape; + u32 n = s.n; + u32 c = s.c; + u32 h = s.h; + u32 w = s.w; + + for (u32 ni = 0; ni < n; ni++) { + for (u32 ci = 0; ci < c; ci++) { + for (u32 hi = 0; hi < h; hi++) { + for (u32 wi = 0; wi < w; wi++) { + u32 src_i = ni * c * h * w + ci * h * w + hi * w + wi; + u32 dst_i = ni * c * h * w + wi * h * c + hi * c + ci; + ref_data[dst_i] = src_data[src_i]; + } + } + } + } +} + +static void test_param_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, l2tg_cw_param_t *p) +{ + u64 size = tl_shape_size(&p->src->shape); + + s8_test_data.l2g_cw_src = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + s8_test_data.l2g_cw_src[i] = rand()%0x100; + + s8_test_data.l2g_cw_output_ref = (u8 *)malloc(sizeof(u8) * size); + if (!s8_test_data.l2g_cw_output_ref) + return; + + l2tg_tensor_copy_cw_transposed_ref(p, s8_test_data.l2g_cw_output_ref, s8_test_data.l2g_cw_src); + + put_tensor_g2l(ctx, bmk, p->src, s8_test_data.l2g_cw_src); +} + +static void destroy_param_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, l2tg_cw_param_t *p) +{ + free_tg_gmem(ctx, p->dst); + free_skip_tensor_lmem(bmk); + free_tl(bmk, p->src); +} + +static void test_l2tg_cw_transpose(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, l2tg_cw_param_t *p) +{ + tl_shape_t src_shape = {1, 0x100, 1, 0x080}; + tg_shape_t dst_shape = {1, 0x080, 1, 0x100}; + + p->src = alloc_tl(bmk, src_shape, FMT_I8, 1); + p->dst = alloc_tg_gmem(ctx, dst_shape, FMT_I8); + skip_tensor_lmem_size(bmk, p->src); + test_param_l2g(ctx, bmk, p); +} + +static void tg2l_matrix_copy_row_col_transposed_ref( + tg2l_matrix_param_t *p, u8 ref_data[], u8 src_data[]) +{ + u64 row = p->src->shape.row; + u64 col = p->src->shape.col; + + for (u64 ri = 0; ri < row; ri++) { + for (u64 ci = 0; ci < col; ci++) { + u64 src_i = ri * col + ci; + u64 dst_i = ci * row + ri; + ref_data[dst_i] = src_data[src_i]; + } + } +} + +static 
void test_param_g2l(CVI_RT_HANDLE *ctx, tg2l_matrix_param_t *p) +{ + u64 size = ml_shape_size(&p->dst->shape); + + s8_test_data.g2l_matrix_src = (u8 *)malloc(sizeof(u8) * size); + if (!s8_test_data.g2l_matrix_src) + return; + + for (u64 i = 0; i < size; i++) + s8_test_data.g2l_matrix_src[i] = rand()%0x100; + + s8_test_data.g2l_matrix_output_ref = (u8 *)malloc(sizeof(u8) * size); + if (!s8_test_data.g2l_matrix_output_ref) + return; + + tg2l_matrix_copy_row_col_transposed_ref(p, s8_test_data.g2l_matrix_output_ref, s8_test_data.g2l_matrix_src); + + put_mg_gmem(ctx, p->src, s8_test_data.g2l_matrix_src); +} + +static void destroy_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, tg2l_matrix_param_t *p) +{ + free_mg_gmem(ctx, p->src); + free_skip_tensor_lmem(bmk); + free_ml(bmk, p->dst); +} + + +static void test_tg2l_matrix_transpose(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, tg2l_matrix_param_t *p) +{ + //tg2l_matrix_param_t p; + /* + * Matrix transpose must be n/c stride alignment + * for TDMA limitation + */ + mg_shape_t src_shape={0x100, 0x80}; + ml_shape_t dst_shape={0x80, 0x10, 0x10, 0x100}; + + int dst_align = 1; + + p->src = alloc_mg_gmem(ctx, src_shape); + p->dst = alloc_ml(bmk, dst_shape, dst_align); + skip_matrix_lmem_size(bmk, p->dst); + test_param_g2l(ctx, p); +} + +static void l2l_tensor_copy_ref(l2l_tensor_copy_param_t *p, u8 ref_data[], u8 src_data[]) +{ + u64 size = tl_shape_size(&p->src->shape); + + for (u64 i = 0; i < size; i++) + ref_data[i] = src_data[i]; +} + +static void test_l2l_param(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, l2l_tensor_copy_param_t *p) +{ + u64 size = tl_shape_size(&p->src->shape); + + s8_test_data.l2l_tensor_src = (u8 *)malloc(sizeof(u8) * size); + if (!s8_test_data.l2l_tensor_src) + return; + + for (u64 i = 0; i < size; i++) + s8_test_data.l2l_tensor_src[i] = rand()%0x100; + + s8_test_data.l2l_tensor_output_ref = (u8 *)malloc(sizeof(u8) * size); + if (!s8_test_data.l2l_tensor_output_ref) + return; + + l2l_tensor_copy_ref(p, s8_test_data.l2l_tensor_output_ref, s8_test_data.l2l_tensor_src); + + put_tensor_g2l(ctx, bmk, p->src, s8_test_data.l2l_tensor_src); +} + +static void destroy_param_l2l(bmk_ctx_t *bmk, l2l_tensor_copy_param_t *p) +{ + free_skip_tensor_lmem(bmk); + free_tl(bmk, p->dst); + free_skip_tensor_lmem(bmk); + free_tl(bmk, p->src); +} + +static void test_l2l_tensor_copy(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, l2l_tensor_copy_param_t *p) +{ + tl_shape_t src_shape = {1, 0x10, 0x1, 0x400}; + tl_shape_t dst_shape = {1, 0x10, 0x1, 0x400}; + + p->src = alloc_tl(bmk, src_shape, FMT_I8, 1); + skip_tensor_lmem_size(bmk, p->src); + p->dst = alloc_tl(bmk, dst_shape, FMT_I8, 1); + skip_tensor_lmem_size(bmk, p->dst); + test_l2l_param(ctx, bmk, p); +} + +void get_result(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk) +{ + s8_test_data.depthwise_conv_output = get_tensor_l2g(ctx, bmk, depthwise_conv_param.ofmap); + s8_test_data.l2g_cw_output = get_tg_gmem(ctx, l2tg_cw_param.dst); + s8_test_data.g2l_matrix_output = get_matrix_l2g(ctx, bmk, tg2l_matrix_param.dst); + s8_test_data.l2l_tensor_output = get_tensor_l2g(ctx, bmk, l2l_tensor_copy_param.dst); +} + +void check_result() +{ + + int cmp_res = array_cmp_int8( + "Comparing results ...\n", s8_test_data.depthwise_conv_output_ref, (s8 *)s8_test_data.depthwise_conv_output, + tl_shape_size(&depthwise_conv_param.ofmap->shape)); + if (cmp_res != 0) { + printf("Comparison FAILED!!!\n"); + exit(-1); + } + + for (u64 i = 0; i < tl_shape_size(&l2tg_cw_param.src->shape); i++) { + if (s8_test_data.l2g_cw_output[i] != s8_test_data.l2g_cw_output_ref[i]) { 
+      fprintf(stderr, "l2g_cw comparing failed at dst[%" PRIu64 "], got %d, exp %d\n",
+              i, s8_test_data.l2g_cw_output[i], s8_test_data.l2g_cw_output_ref[i]);
+      exit(-1);
+    }
+  }
+  for (u64 i = 0; i < ml_shape_size(&tg2l_matrix_param.dst->shape); i++) {
+    if (s8_test_data.g2l_matrix_output[i] != s8_test_data.g2l_matrix_output_ref[i]) {
+      fprintf(stderr, "g2l_matrix comparing failed at dst[%" PRIu64 "], got %d, exp %d\n",
+              i, s8_test_data.g2l_matrix_output[i], s8_test_data.g2l_matrix_output_ref[i]);
+      exit(-1);
+    }
+  }
+
+  for (u64 i = 0; i < tl_shape_size(&l2l_tensor_copy_param.src->shape); i++) {
+    if (s8_test_data.l2l_tensor_output[i] != s8_test_data.l2l_tensor_output_ref[i]) {
+      fprintf(stderr, "l2l_tensor comparing failed at dst[%" PRIu64 "], got %d, exp %d\n",
+              i, s8_test_data.l2l_tensor_output[i], s8_test_data.l2l_tensor_output_ref[i]);
+      exit(-1);
+    }
+  }
+}
+
+void trigger_max_power(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk)
+{
+  bmk1880v2_parallel_enable(bmk);
+  bmk1880v2_tdma_l2g_tensor_copy_cw_transposed(bmk, &l2tg_cw_param);
+  bmk1880v2_tdma_g2l_matrix_copy_row_col_transposed(bmk, &tg2l_matrix_param);
+  bmk1880v2_tdma_l2l_tensor_copy(bmk, &l2l_tensor_copy_param);
+  bmk1880v2_tiu_depthwise_convolution(bmk, &depthwise_conv_param);
+  bmk1880v2_parallel_disable(bmk);
+  test_submit(ctx);
+}
+
+void free_s8_data()
+{
+  free(s8_test_data.depthwise_conv_input);
+  free(s8_test_data.depthwise_conv_weight);
+  free(s8_test_data.depthwise_conv_bias);
+  free(s8_test_data.depthwise_conv_output);
+  free(s8_test_data.depthwise_conv_output_ref);
+  free(s8_test_data.l2g_cw_src);
+  free(s8_test_data.l2g_cw_output);
+  free(s8_test_data.l2g_cw_output_ref);
+  free(s8_test_data.g2l_matrix_src);
+  free(s8_test_data.g2l_matrix_output);
+  free(s8_test_data.g2l_matrix_output_ref);
+  free(s8_test_data.l2l_tensor_src);
+  free(s8_test_data.l2l_tensor_output);
+  free(s8_test_data.l2l_tensor_output_ref);
+}
+
+int main()
+{
+  CVI_RT_HANDLE ctx;
+  bmk_ctx_t *bk_ctx;
+  test_init(&ctx, &bk_ctx);
+
+  printf("depthwise max_power test\n");
+
+  test_pooling(ctx, bk_ctx);
+  test_l2tg_cw_transpose(&ctx, bk_ctx, &l2tg_cw_param);
+  test_tg2l_matrix_transpose(&ctx, bk_ctx, &tg2l_matrix_param);
+  test_l2l_tensor_copy(&ctx, bk_ctx, &l2l_tensor_copy_param);
+
+  trigger_max_power(&ctx, bk_ctx);
+  get_result(&ctx, bk_ctx);
+  check_result();
+
+  destroy_param_l2l(bk_ctx, &l2l_tensor_copy_param);
+  destroy_param_g2l(&ctx, bk_ctx, &tg2l_matrix_param);
+  destroy_param_l2g(&ctx, bk_ctx, &l2tg_cw_param);
+  free_depthwise_param(bk_ctx, &depthwise_conv_param);
+  free_s8_data();
+  test_exit(&ctx);
+  return 0;
+}
diff --git a/cviruntime/test/1880v2/test_1880v2_double_conv.cpp b/cviruntime/test/1880v2/test_1880v2_double_conv.cpp
new file mode 100644
index 000000000..3bcb5bfe4
--- /dev/null
+++ b/cviruntime/test/1880v2/test_1880v2_double_conv.cpp
@@ -0,0 +1,739 @@
+#include "1880v2_test_util.h"
+
+typedef struct {
+  int random_seed;
+  int input_n;
+  int input_c;
+  int input_h;
+  int input_w;
+  int kw;
+  int kh;
+  int dh;
+  int dw;
+  int pad_top;
+  int pad_bot;
+  int pad_left;
+  int pad_right;
+  int ins_h;
+  int ins_h_last;
+  int ins_w;
+  int ins_w_last;
+  int stride_h;
+  int stride_w;
+  int output_c;
+  int using_bias;
+  int bReLU_EN;
+  int r_shift_m;
+  int opd0_sign;
+  int opd1_sign;
+  int opd2_sign;
+} conv_param_t;
+
+static int index_get(int h, int w1, int w2)
+{
+  return h * w1 + w2;
+}
+
+static int matrix_dot_mult(
+    s8 *A, s8 *B, int dim_n, int dim_m,
+    int opd0_sign)
+{
+  int sum = 0;
+  for (int i = 0; i < dim_n; i++) {
+    for (int j = 0; j < dim_m; j++) {
+      int a = opd0_sign ? A[index_get(i, dim_m, j)]
+                        : (int)(u8)A[index_get(i, dim_m, j)];
+      int b = B[index_get(i, dim_m, j)];
+      sum += a * b;
+    }
+  }
+  return sum;
+}
+
+static int conv_ref(
+    const conv_param_t *p_param,
+    const s8 *ifmap,
+    const s8 *weight,
+    const s16 *bias,
+    s8 *ofmap)
+{
+  int in = p_param->input_n;
+  int ic =
p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + int r_shift_bits = p_param->r_shift_m; + int do_relu = p_param->bReLU_EN; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + s8 *i_fmap_pad_ker = (s8 *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return BM_ERR_FAILURE; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = BM_SUCCESS; + + s8 *i_fmap_pad = NULL; + s8 *kernel_after = NULL; + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c]; //bias+c ; + } + } + } + + if (do_relu) + relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + + // ofmap is s8, signed + ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow, + &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1, + /*sign_unsign=*/1); + + if (ret != BM_SUCCESS) + goto error_release; + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + +error_release: + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static u8 * transform_weight(const tl_shape_t *s, u8 before[]) +{ + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + u32 size = ic * oc * kh * kw; + u8 *after = (u8 *)malloc(sizeof(u8) * size); + + /* + * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) + */ + for (u32 oci = 0; oci < oc; oci++) { + for (u32 ici = 0; ici < ic; ici++) { + for (u32 khi = 0; khi < kh; khi++) { + for (u32 kwi = 0; kwi < kw; kwi++) { + u32 src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + u32 dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void 
put_conv_weight( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + u8 *data) +{ + const tl_shape_t *s = &tl->shape; + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw); + CVI_RT_MEM dev_mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = CVI_RT_MemGetPAddr(dev_mem); + u8 *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. CVI_RT_MemCopyS2D regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. + */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //CVI_RT_MEM ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = CVI_RT_MemCopyS2D(*ctx, ab_dev_mem, transformed_data); + int ret = CVI_RT_MemCopyS2D(*ctx, dev_mem, transformed_data); + assert(ret == BM_SUCCESS); + + tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_I8; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1880v2_tensor_tgmem_default_stride(tdma_tg.shape, tdma_tg.fmt); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1880v2_tensor_lmem_default_stride(bk_ctx, tdma_shape, FMT_I8, 0); + + bmk1880v2_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1880v2_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + CVI_RT_MemFree(*ctx, dev_mem); + free(transformed_data); +} + +static s8 * transform_bias(int oc, s16 before[]) +{ + s8 *after = (s8 *)malloc(sizeof(s8) * 2 * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = before[i] & 0xff; + after[i + oc] = (before[i] >> 8) & 0xff; + } + return after; +} + +static void put_conv_bias( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + s16 *data) +{ + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2, oc, 1, 1); + CVI_RT_MEM dev_mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = CVI_RT_MemGetPAddr(dev_mem); + s8 *transformed_data = transform_bias(oc, data); + int ret = CVI_RT_MemCopyS2D(*ctx, dev_mem, (u8 *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1880v2_tensor_tgmem_default_stride(tg.shape, tg.fmt); + + bmk1880v2_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1880v2_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + CVI_RT_MemFree(*ctx, dev_mem); + free(transformed_data); +} + +static int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - 
conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static s8 * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + + s8 *buf = (s8 *)malloc(sizeof(s8) * size); + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static s8 * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + s8 *buf = (s8 *)malloc(sizeof(s8) * size); + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static s16 * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + s16 *bias = (s16 *)malloc(sizeof(s16) * oc); + for (int i = 0; i < oc; i++) + bias[i] = rand() % 65536 - 32768; + + return bias; +} + +static tl_t * conv_ifmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd0_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 1); +} + +static tl_t * conv_weight_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd1_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static tl_t * conv_ofmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + tl_shape_t s; + s.n = p->input_n; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return bmk1880v2_lmem_alloc_tensor(ctx, s, FMT_I8, 1); +} + +static tl_t * conv_bias_tensor( + bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd2_sign? 
FMT_I8: FMT_U8; + tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const bmk1880v2_tiu_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1880v2_tiu_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + + dst->ifmap = conv_ifmap_tensor(ctx, p); + dst->weight = conv_weight_tensor(ctx, p); + dst->ofmap = conv_ofmap_tensor(ctx, p); + dst->bias = NULL; + dst->ps32_mode = 0; + if (p->using_bias) + dst->bias = conv_bias_tensor(ctx, p); + + dst-> w_is_const = 0; + +} + +static void free_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1880v2_tiu_convolution_param_t *r, + const conv_param_t *p) +{ + if (p->using_bias && r->bias) + free_tl(ctx, r->bias); + if (r->ofmap) + free_tl(ctx, r->ofmap); + if (r->weight) + free_tl(ctx, r->weight); + if (r->ifmap) + free_tl(ctx, r->ifmap); +} + +static void init_conv_param(conv_param_t &p) +{ + printf("init_conv_param\n"); + memset(&p, 0, sizeof(p)); + p.random_seed = clock(); + srand(p.random_seed); + +retry: + p.input_n = rand() % 5 + 1; + p.input_c = (rand() % (5 * 32)/2)*2 + 8; + p.kh = rand() % 7 + 1; + p.kw = rand() % 7 + 1; + p.input_h = rand() % 40 + p.kh; + p.input_w = rand() % 40 + p.kw; + p.output_c = rand() % 10 + 3; + p.stride_h = rand() % (p.kh) + 1; + p.stride_w = rand() % (p.kw) + 1; + p.ins_h = rand() % p.kh; + p.ins_w = rand() % p.kw; + p.ins_h_last = rand() % p.kh;; + p.ins_w_last = rand() % p.kw;; + p.dh = rand() % 3 + 1; + p.dw = rand() % 3 + 1; + + int kh_ext = conv_kh_ext(&p); + int kw_ext = conv_kw_ext(&p); + p.pad_top = rand() % kh_ext; + p.pad_bot = rand() % kh_ext; + p.pad_left = rand() % kw_ext; + p.pad_right = rand() % kw_ext; + + if (!conv_param_is_ok(&p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p.using_bias = rand() % 2; + p.r_shift_m = rand() % 8; + p.bReLU_EN = rand() % 2; + + p.opd0_sign = rand() % 2; + p.opd1_sign = 1; + p.opd2_sign = 1; + + assert(p.opd1_sign == 1 && p.opd2_sign == 1); + + int ih_ext = conv_ih_ext(&p); + int iw_ext = conv_iw_ext(&p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); +} + +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", 
p->input_c);
+  printf(" %s%d;\n", "p->input_h = ", p->input_h);
+  printf(" %s%d;\n", "p->input_w = ", p->input_w);
+  printf(" %s%d;\n", "p->output_c = ", p->output_c);
+
+  printf(" %s%d;\n", "p->kh = ", p->kh);
+  printf(" %s%d;\n", "p->kw = ", p->kw);
+  printf(" %s%d;\n", "p->dh = ", p->dh);
+  printf(" %s%d;\n", "p->dw = ", p->dw);
+  printf(" %s%d;\n", "p->pad_top = ", p->pad_top);
+  printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot);
+  printf(" %s%d;\n", "p->pad_left = ", p->pad_left);
+  printf(" %s%d;\n", "p->pad_right = ", p->pad_right);
+  printf(" %s%d;\n", "p->stride_h = ", p->stride_h);
+  printf(" %s%d;\n", "p->stride_w = ", p->stride_w);
+  printf(" %s%d;\n", "p->ins_w = ", p->ins_w);
+  printf(" %s%d;\n", "p->ins_h = ", p->ins_h);
+  printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last);
+  printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last);
+
+  printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m);
+  printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign);
+  printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign);
+  printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign);
+  printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN);
+  printf(" %s%d;\n", "p->using_bias = ", p->using_bias);
+
+  printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p));
+  printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p));
+  printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p));
+  printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p));
+  printf(" %s%d\n", "output_h = ", conv_oh(p));
+  printf(" %s%d\n", "output_w = ", conv_ow(p));
+}
+
+// Calculate the right shift value, m.
+// Steps:
+//   1. Take the abs() of each weight;
+//   2. Sum the abs() values within one kernel;
+//   3. Take log2 of each sum;
+//   4. Round downward.
+// Once every r_shift value is computed, take the median of them all.
+static int calc_rshift_m(const conv_param_t *p, s8* weight)
+{
+  int kernel_cnt = p->output_c * p->input_c;
+  int kernel_size = p->kh * p->kw;
+  int *kernel_shifts = (int *)malloc(sizeof(int) * kernel_cnt);
+
+  // Tscan does not recognize C++ zero-initialization, so memset explicitly.
+  memset(kernel_shifts, 0, sizeof(int) * kernel_cnt);
+
+  // Part 1:
+  // Get the right shift value for each kernel. Note that the while loop
+  // below drains sum back to 0, so it restarts from 0 for every kernel.
+  int sum = 0;
+  for (int i = 0; i < kernel_cnt; i++) {
+    // Steps 1 & 2: get the sum of abs()
+    for (int j = 0; j < kernel_size; j++) {
+      sum += (int)(*weight < 0 ?
-(*weight) : (*weight)); + weight++; + } + // Step 3 & 4: log2 and downward rounding + sum >>= 1; + while (sum) { + sum >>= 1; + kernel_shifts[i]++; + } + } + + // Part 2: + // Find the middle of all the values + int tag[32] = {0}; + for (int cnt = 0; cnt < kernel_cnt; cnt++) { + tag[kernel_shifts[cnt]]++; + } + + int rshift_m = 0; + int mid = 0; + do { + mid += tag[rshift_m++]; + } while(mid < (kernel_cnt - 1) >> 1); + + free(kernel_shifts); + + return rshift_m - 1; +} + +static int test_conv( + conv_param_t &p_param, CVI_RT_HANDLE ctx, bmk_ctx_t *bk_ctx) +{ + s8 *input = alloc_input(&p_param); + s8 *weight = alloc_weight(&p_param); + s16 *bias = alloc_bias(&p_param); + p_param.r_shift_m = calc_rshift_m(&p_param, weight); + + s8 *output_ref = (s8 *)malloc(sizeof(s8) * conv_output_size(&p_param)); + if (!output_ref) + return BM_ERR_FAILURE; + + bmerr_t ret = conv_ref(&p_param, input, weight, bias, output_ref); + assert(ret == BM_SUCCESS); + + bmk1880v2_tiu_convolution_param_t conv_param; + make_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + if (tl_alloc_success) { + put_tensor_g2l(&ctx, bk_ctx, conv_param.ifmap, (u8 *)input); + put_conv_weight(&ctx, bk_ctx, conv_param.weight, (u8 *)weight); + if (p_param.using_bias) + put_conv_bias(&ctx, bk_ctx, conv_param.bias, bias); + bmk1880v2_tiu_convolution(bk_ctx, &conv_param); + u8 *output = get_tensor_l2g(&ctx, bk_ctx, conv_param.ofmap); + + int has_error = array_cmp_int8( + "Comparing results ...\n", + output_ref, (s8 *)output, conv_output_size(&p_param)); + + if (has_error) { + print_conv_param(&p_param); + printf("Comparison FAILED\n"); + exit(-1); + } + free(output); + } + + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + free(input); + free(weight); + free(output_ref); + free(bias); + + return tl_alloc_success ? 
1 : 0;
+}
+
+int main()
+{
+  CVI_RT_HANDLE ctx;
+  bmk_ctx_t *bk_ctx;
+  test_init(&ctx, &bk_ctx);
+
+  int test_finished_num = 0;
+  for (int i = 0; i < 5; i++) {
+    printf("random_test_conv iteration: %d\n", i);
+    conv_param_t test_conv_param;
+    init_conv_param(test_conv_param);
+    test_finished_num += test_conv(test_conv_param, ctx, bk_ctx);
+    if (!test_conv_param.using_bias)
+      test_conv_param.using_bias = 1;
+    if (test_conv_param.output_c <= 32)
+      test_conv_param.output_c += 32;
+    test_finished_num += test_conv(test_conv_param, ctx, bk_ctx);
+  }
+
+  test_exit(&ctx);
+  return 0;
+}
diff --git a/cviruntime/test/1880v2/test_1880v2_double_conv_ps32.cpp b/cviruntime/test/1880v2/test_1880v2_double_conv_ps32.cpp
new file mode 100644
index 000000000..e451fd4f1
--- /dev/null
+++ b/cviruntime/test/1880v2/test_1880v2_double_conv_ps32.cpp
@@ -0,0 +1,1453 @@
+#include "1880v2_test_util.h"
+
+typedef struct {
+  int random_seed;
+  int input_n;
+  int input_c;
+  int input_h;
+  int input_w;
+  int kw;
+  int kh;
+  int dh;
+  int dw;
+  int pad_top;
+  int pad_bot;
+  int pad_left;
+  int pad_right;
+  int ins_h;
+  int ins_h_last;
+  int ins_w;
+  int ins_w_last;
+  int stride_h;
+  int stride_w;
+  int output_c;
+  int using_bias;
+  int bReLU_EN;
+  int r_shift_m;
+  int opd0_sign;
+  int opd1_sign;
+  int opd2_sign;
+} conv_param_t;
+
+static int index_get(int h, int w1, int w2)
+{
+  return h * w1 + w2;
+}
+
+static int matrix_dot_mult(
+    s8 *A, s8 *B, int dim_n, int dim_m,
+    int opd0_sign)
+{
+  int sum = 0;
+  for (int i = 0; i < dim_n; i++) {
+    for (int j = 0; j < dim_m; j++) {
+      int idx = index_get(i, dim_m, j);
+      // Operand 0 is signed or unsigned per opd0_sign; the kernel is signed.
+      int a = opd0_sign ? (int)A[idx] : (int)(u8)A[idx];
+      sum += a * (int)B[idx];
+    }
+  }
+  return sum;
+}
+
+static int ps32_m2_conv_ref(
+    const conv_param_t *p_param,
+    const s8 *ifmap,
+    const s8 *weight,
+    s8 *ofmap)
+{
+  int in = p_param->input_n;
+  int ic = p_param->input_c;
+  int ih = p_param->input_h;
+  int iw = p_param->input_w;
+  int oc = p_param->output_c;
+  int kh = p_param->kh;
+  int kw = p_param->kw;
+  int dh = p_param->dh;
+  int dw = p_param->dw;
+  int stride_h = p_param->stride_h;
+  int stride_w = p_param->stride_w;
+  int pad_top = p_param->pad_top;
+  int pad_bot = p_param->pad_bot;
+  int pad_left = p_param->pad_left;
+  int pad_right = p_param->pad_right;
+  int ins_h = p_param->ins_h;
+  int ins_h_last = p_param->ins_h_last;
+  int ins_w = p_param->ins_w;
+  int ins_w_last = p_param->ins_w_last;
+  int input_sign = p_param->opd0_sign;
+
+  int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot);
+  int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right);
+  int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0);
+  int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0);
+
+  int oh = calc_output_hw(ih_ext, kh_ext, stride_h);
+  int ow = calc_output_hw(iw_ext, kw_ext, stride_w);
+
+  int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow);
+  s8 *i_fmap_pad_ker = (s8 *)malloc(kh_ext * kw_ext);
+  if (!result || !i_fmap_pad_ker) {
+    free(result);
+    free(i_fmap_pad_ker);
+    return BM_ERR_FAILURE;
+  }
+
+  memset(result, 0, sizeof(int) * in * oc * oh * ow);
+
+  int ret = BM_SUCCESS;
+
+  s8 *i_fmap_pad = NULL;
+  s8 *kernel_after = NULL;
+  u32 bstride = in * oc * oh * ow;
+
+  for (int n = 0; n < in; ++n) {
+    for (int c = 0; c < oc; ++c) {
+      for (int cc = 0; cc < ic; ++cc) {
+        fill_pad_fmap_int8(
+            (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0,
+            pad_left, pad_right, pad_top, pad_bot,
+            ins_h, ins_w, ins_h_last, ins_w_last, ih, iw);
+
+        //kernel_dilation(
+        fill_pad_fmap_int8(
+            (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0,
+            0, 0, 0, 0, // no padding
+            dh - 1, dw - 1, 0, 0,
+            kh, kw);
+
+        for (int ph = 0; ph < oh; ++ph) {
+          for (int pw = 0; pw < ow; ++pw) {
+            for (int idxh = 0; idxh < kh_ext; ++idxh)
+              for (int idxw = 0; idxw < kw_ext; ++idxw){
+                i_fmap_pad_ker[idxh * kw_ext + idxw] =
i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[i] = result[i]; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[bstride + i] = result[i] >> 8; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[2 * bstride + i] = result[i] >> 16; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[3 * bstride + i] = result[i] >> 24; + + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static int ps32_m1_conv_ref( + const conv_param_t *p_param, + const s8 *ifmap, + const s8 *weight, + const s16 *bias, + s8 *ofmap) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + int r_shift_bits = p_param->r_shift_m; + int do_relu = p_param->bReLU_EN; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + s8 *i_fmap_pad_ker = (s8 *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return BM_ERR_FAILURE; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = BM_SUCCESS; + + s8 *i_fmap_pad = NULL; + s8 *kernel_after = NULL; + + u32 bstride = in * oc * oh * ow; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] = (u8)ofmap[i]; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] |= (u8)ofmap[bstride + i] << 8; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] |= (u8)ofmap[2 * bstride + i] << 16; + + for (int i = 0; i < in * oc * oh * ow; i ++) + result[i] |= (u8)ofmap[3 * bstride + i] << 24; + + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } 
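+      // End (ps32 mode 1) finalization: result[] now holds the full 32-bit
+      // sum over every input channel, so add the per-output-channel s16
+      // bias to each (ph, pw) position, optionally apply ReLU, then round
+      // and saturate down to s8 with r_shift_m, i.e. roughly
+      //   ofmap = sat8((acc + bias[c]) >> r_shift_m).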
+ + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c]; //bias+c ; + } + } + } + + if (do_relu) + relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + + // ofmap is s8, signed + ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow, + &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1, + /*sign_unsign=*/1); + + if (ret != BM_SUCCESS) + goto error_release; + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + +error_release: + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + + +static int ps32_m3_conv_ref( + const conv_param_t *p_param, + const s8 *ifmap, + const s8 *weight, + s8 *ofmap) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + s8 *i_fmap_pad_ker = (s8 *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return BM_ERR_FAILURE; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = BM_SUCCESS; + + s8 *i_fmap_pad = NULL; + s8 *kernel_after = NULL; + + u32 bstride = in * oc * oh * ow; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] = (u8)ofmap[i]; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] |= (u8)ofmap[bstride + i] << 8; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] |= (u8)ofmap[2 * bstride + i] << 16; + + for (int i = 0; i < in * oc * oh * ow; i++) + result[i] |= (u8)ofmap[3 * bstride + i] << 24; + + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[i] = result[i]; + + for (int 
i = 0; i < in * oc * oh * ow; i ++) + ofmap[bstride + i] = result[i] >> 8; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[2 * bstride + i] = result[i] >> 16; + + for (int i = 0; i < in * oc * oh * ow; i ++) + ofmap[3 * bstride + i] = result[i] >> 24; + + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + return ret; +} + +static int conv_ref( + const conv_param_t *p_param, + const s8 *ifmap, + const s8 *weight, + const s16 *bias, + s8 *ofmap) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_h_last = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_w_last = p_param->ins_w_last; + int input_sign = p_param->opd0_sign; + int r_shift_bits = p_param->r_shift_m; + int do_relu = p_param->bReLU_EN; + + int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int oh = calc_output_hw(ih_ext, kh_ext, stride_h); + int ow = calc_output_hw(iw_ext, kw_ext, stride_w); + + int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow); + s8 *i_fmap_pad_ker = (s8 *)malloc(kh_ext * kw_ext); + if (!result || !i_fmap_pad_ker) { + free(result); + free(i_fmap_pad_ker); + return BM_ERR_FAILURE; + } + + memset(result, 0, sizeof(int) * in * oc * oh * ow); + + int ret = BM_SUCCESS; + + s8 *i_fmap_pad = NULL; + s8 *kernel_after = NULL; + for (int n = 0; n < in; ++n) { + for (int c = 0; c < oc; ++c) { + for (int cc = 0; cc < ic; ++cc) { + fill_pad_fmap_int8( + (int8_t*)ifmap + n*ic*ih*iw + cc*ih*iw, &i_fmap_pad, 0, + pad_left, pad_right, pad_top, pad_bot, + ins_h, ins_w, ins_h_last, ins_w_last, ih, iw); + + //kernel_dilation( + fill_pad_fmap_int8( + (weight + c*ic*kh*kw + cc*kh*kw), &kernel_after, 0, + 0, 0, 0, 0, // no padding + dh - 1, dw - 1, 0, 0, + kh, kw); + + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + for (int idxh = 0; idxh < kh_ext; ++idxh) + for (int idxw = 0; idxw < kw_ext; ++idxw){ + i_fmap_pad_ker[idxh * kw_ext + idxw] = + i_fmap_pad[(idxh+ph*stride_h) * iw_ext + + idxw + pw*stride_w]; + } + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += + matrix_dot_mult(i_fmap_pad_ker, kernel_after, + kh_ext, kw_ext, input_sign); + } + } + } + + if (p_param->using_bias) { + for (int ph = 0; ph < oh; ++ph) { + for (int pw = 0; pw < ow; ++pw) { + result[n*oc*oh*ow + c*oh*ow + ph*ow + pw] += bias[c]; //bias+c ; + } + } + } + + if (do_relu) + relu(&result[n*oc*oh*ow + c*oh*ow], oh * ow); + + // ofmap is s8, signed + ret = satu_2_8bit(&result[n*oc*oh*ow + c*oh*ow], oh * ow, + &ofmap[n*oc*oh*ow + c*oh*ow], r_shift_bits, /*round_floor=*/1, + /*sign_unsign=*/1); + + if (ret != BM_SUCCESS) + goto error_release; + } //end for (int c = 0; c < oc; ++c) + } //end for (int n = 0; n < in; n++) + + neuron_dump ( + "test_code:conv_ref:pure result + bias", + (u32)in, + (u32)oc, + (u32)oh, + (u32)ow, + (s32 *)result); + + +error_release: + free(i_fmap_pad); + free(kernel_after); + free(i_fmap_pad_ker); + free(result); + + 
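// Note: error_release is also reached on the success path, and free(NULL)
+  // is well defined, so the lazily allocated pad/dilation buffers above can
+  // be released unconditionally before returning.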
return ret; +} + +static u8 * transform_weight(const tl_shape_t *s, u8 before[]) +{ + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + u32 size = ic * oc * kh * kw; + u8 *after = (u8 *)malloc(sizeof(u8) * size); + + /* + * (oc, ic, kh, kw) -> (1, oc, kh * kw, ic) + */ + for (u32 oci = 0; oci < oc; oci++) { + for (u32 ici = 0; ici < ic; ici++) { + for (u32 khi = 0; khi < kh; khi++) { + for (u32 kwi = 0; kwi < kw; kwi++) { + u32 src_i = oci * ic * kh * kw + ici * kh * kw + khi * kw + kwi; + u32 dst_i = oci * kh * kw * ic + khi * kw * ic + kwi * ic + ici; + after[dst_i] = before[src_i]; + } + } + } + } + + return after; +} + +static void put_conv_weight( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + u8 *data) +{ + const tl_shape_t *s = &tl->shape; + u32 ic = s->n; + u32 oc = s->c; + u32 kh = s->h; + u32 kw = s->w; + + bmshape_t bms = BM_TENSOR_INT8((int)oc, (int)ic, (int)kh, (int)kw); + CVI_RT_MEM dev_mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = CVI_RT_MemGetPAddr(dev_mem); + u8 *transformed_data = transform_weight(s, data); + + /* we put weight to region 1. CVI_RT_MemCopyS2D regard dev_mem as + * absolute address, so we should pass abs address to copy weight + * to right place. + */ + + //u64 ab_addr = bm_device_read_base_reg(*ctx, 1); + //CVI_RT_MEM ab_dev_mem = + //bmmem_device_prealloc(*ctx, NULL, ab_addr + gaddr, &bms); + + //int ret = CVI_RT_MemCopyS2D(*ctx, ab_dev_mem, transformed_data); + int ret = CVI_RT_MemCopyS2D(*ctx, dev_mem, transformed_data); + assert(ret == BM_SUCCESS); + + tl_shape_t tdma_shape = { 1, oc, kh * kw, ic }; + + tg_t tdma_tg; + tdma_tg.base_reg_index = 0; + tdma_tg.start_address = gaddr; + tdma_tg.fmt = FMT_I8; + tdma_tg.shape.n = tdma_shape.n; + tdma_tg.shape.c = tdma_shape.c; + tdma_tg.shape.h = tdma_shape.h; + tdma_tg.shape.w = tdma_shape.w; + tdma_tg.stride = bmk1880v2_tensor_tgmem_default_stride(tdma_tg.shape, tdma_tg.fmt); + tdma_tg.base_reg_index = 1; + + tl_t tdma_tl = *tl; + tdma_tl.shape = tdma_shape; + tdma_tl.stride = bmk1880v2_tensor_lmem_default_stride(bk_ctx, tdma_shape, FMT_I8, 0); + + bmk1880v2_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tdma_tg; + p.dst = &tdma_tl; + + bmk1880v2_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + CVI_RT_MemFree(*ctx, dev_mem); + free(transformed_data); +} + +static s8 * transform_bias(int oc, s16 before[]) +{ + s8 *after = (s8 *)malloc(sizeof(s8) * 2 * oc); + if (!after) + return NULL; + + for (int i = 0; i < oc; i++){ + after[i] = before[i] & 0xff; + after[i + oc] = (before[i] >> 8) & 0xff; + } + return after; +} + +static void put_conv_bias( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + const tl_t *tl, + s16 *data) +{ + int oc = tl->shape.c; + + bmshape_t bms = BM_TENSOR_INT8(2, oc, 1, 1); + CVI_RT_MEM dev_mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = CVI_RT_MemGetPAddr(dev_mem); + s8 *transformed_data = transform_bias(oc, data); + int ret = CVI_RT_MemCopyS2D(*ctx, dev_mem, (u8 *)transformed_data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = 2; + tg.shape.c = oc; + tg.shape.h = 1; + tg.shape.w = 1; + tg.stride = bmk1880v2_tensor_tgmem_default_stride(tg.shape, tg.fmt); + + bmk1880v2_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1880v2_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + CVI_RT_MemFree(*ctx, dev_mem); + free(transformed_data); +} + +static 
int conv_kh_ext(const conv_param_t *p) +{ + return (p->kh - 1) * p->dh + 1; +} + +static int conv_kw_ext(const conv_param_t *p) +{ + return (p->kw - 1) * p->dw + 1; +} + +static int conv_ih_ext(const conv_param_t *p) +{ + return (p->input_h - 1) * (p->ins_h + 1) + + p->ins_h_last + 1 + p->pad_top + p->pad_bot; +} + +static int conv_iw_ext(const conv_param_t *p) +{ + return (p->input_w - 1) * (p->ins_w + 1) + + p->ins_w_last + 1 + p->pad_left + p->pad_right; +} + +static int conv_oh(const conv_param_t *p) +{ + return (conv_ih_ext(p) - conv_kh_ext(p)) / p->stride_h + 1; +} + +static int conv_ow(const conv_param_t *p) +{ + return (conv_iw_ext(p) - conv_kw_ext(p)) / p->stride_w + 1; +} + +static int conv_input_size(const conv_param_t *p) +{ + int in = p->input_n; + int ic = p->input_c; + int ih = p->input_h; + int iw = p->input_w; + return in * ic * ih * iw; +} + +static int conv_output_size(const conv_param_t *p) +{ + int in = p->input_n; + int oc = p->output_c; + int oh = conv_oh(p); + int ow = conv_ow(p); + return in * oc * oh * ow; +} + +static int conv_weight_size(const conv_param_t *p) +{ + int oc = p->output_c; + int ic = p->input_c; + int kh = p->kh; + int kw = p->kw; + return oc * ic * kh * kw; +} + +static s8 * alloc_input(const conv_param_t *p) +{ + int size = conv_input_size(p); + + s8 *buf = (s8 *)malloc(sizeof(s8) * size); + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static s8 * alloc_weight(const conv_param_t *p) +{ + int size = conv_weight_size(p); + + s8 *buf = (s8 *)malloc(sizeof(s8) * size); + for (int i = 0; i < size; i++) + buf[i] = rand() % 256 - 128; + + return buf; +} + +static s16 * alloc_bias(const conv_param_t *p) +{ + int oc = p->output_c; + + s16 *bias = (s16 *)malloc(sizeof(s16) * oc); + for (int i = 0; i < oc; i++) + bias[i] = rand() % 65536 - 32768; + + return bias; +} + +static tl_t * conv_ifmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd0_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_n; + s.c = p->input_c; + s.h = p->input_h; + s.w = p->input_w; + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 1); +} + +static tl_t * conv_weight_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd1_sign? FMT_I8: FMT_U8; + tl_shape_t s; + s.n = p->input_c; + s.c = p->output_c; + s.h = p->kh; + s.w = p->kw; + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static tl_t * conv_ofmap_tensor(bmk_ctx_t *ctx, const conv_param_t *p) +{ + tl_shape_t s; + s.n = p->input_n; + s.c = p->output_c; + s.h = conv_oh(p); + s.w = conv_ow(p); + return bmk1880v2_lmem_alloc_ps32_tensor(ctx, s, FMT_I8, 1); +} + +static tl_t * conv_bias_tensor( + bmk_ctx_t *ctx, const conv_param_t *p) +{ + fmt_t fmt = p->opd2_sign? 
FMT_I8: FMT_U8; + tl_shape_t s; + s.n = 2; + s.c = p->output_c; + s.h = 1; + s.w = 1; + return bmk1880v2_lmem_alloc_tensor(ctx, s, fmt, 0); +} + +static int conv_param_is_ok(const conv_param_t *p) +{ + int kh_ext = conv_kh_ext(p); + int kw_ext = conv_kw_ext(p); + int ih_ext = conv_ih_ext(p); + int iw_ext = conv_iw_ext(p); + + if ((kh_ext > ih_ext) + || (kw_ext > iw_ext) + || (kh_ext <= p->pad_top) + || (kh_ext <= p->pad_bot) + || (kw_ext <= p->pad_left) + || (kw_ext <= p->pad_right) + || (p->pad_top >= (1 << 4)) + || (p->pad_bot >= (1 << 4)) + || (p->pad_left >= (1 << 4)) + || (p->pad_right >= (1 << 4))) { + return 0; + } + + return 1; +} + +static int bmk_conv_param_alloc_ok( + const bmk1880v2_tiu_convolution_param_t *p, + const conv_param_t *param) +{ + if (!p->ifmap || !p->ofmap || !p->weight) + return 0; + if(p->ps32_mode==1) + if (param->using_bias) + if (!p->bias) + return 0; + + return 1; +} + +static void make_bmk_conv_param_ps32( + bmk_ctx_t *ctx, + bmk1880v2_tiu_convolution_param_t *dst, + const conv_param_t *p, u32 ps32_mode) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + if(ps32_mode==2) + { + dst->ifmap = conv_ifmap_tensor(ctx, p); + dst->weight = conv_weight_tensor(ctx, p); + dst->ofmap = conv_ofmap_tensor(ctx, p); + } + + dst->ps32_mode = ps32_mode; + + dst->bias = NULL; + dst->relu_enable = 0; + dst->rshift_bits = 0; + if(ps32_mode==1) + { + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + // only mode=1 can use bias + if (p->using_bias) + dst->bias = conv_bias_tensor(ctx, p); + } + + dst->w_is_const = 0; +} + +static void make_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1880v2_tiu_convolution_param_t *dst, + const conv_param_t *p) +{ + memset(dst, 0, sizeof(*dst)); + + dst->ins_h = p->ins_h; + dst->ins_last_h = p->ins_h_last; + dst->ins_w = p->ins_w; + dst->ins_last_w = p->ins_w_last; + dst->pad_top = p->pad_top; + dst->pad_bottom = p->pad_bot; + dst->pad_left = p->pad_left; + dst->pad_right = p->pad_right; + dst->stride_h = p->stride_h; + dst->stride_w = p->stride_w; + dst->dilation_h = p->dh; + dst->dilation_w = p->dw; + dst->relu_enable = p->bReLU_EN; + dst->rshift_bits = p->r_shift_m; + dst->ifmap = conv_ifmap_tensor(ctx, p); + dst->weight = conv_weight_tensor(ctx, p); + dst->ofmap = conv_ofmap_tensor(ctx, p); + dst->bias = NULL; + dst->ps32_mode = 0; + if (p->using_bias) + dst->bias = conv_bias_tensor(ctx, p); + + dst->w_is_const = 0; +} + +static void free_bmk_conv_param( + bmk_ctx_t *ctx, + bmk1880v2_tiu_convolution_param_t *r, + const conv_param_t *p) +{ + if (p->using_bias && r->bias) + free_tl(ctx, r->bias); + + if (r->ofmap) + free_tl(ctx, r->ofmap); + + if (r->weight) + free_tl(ctx, r->weight); + + if (r->ifmap) + free_tl(ctx, r->ifmap); +} + +static void init_conv_param(conv_param_t &p) +{ + printf("init_conv_param\n"); + memset(&p, 0, sizeof(p)); + p.random_seed = clock(); + srand(p.random_seed); + +retry: + p.input_n = 1; + p.input_c = rand() % (10) + 2; + p.kh = rand() % 7 + 1; + p.kw = rand() % 7 + 1; + p.input_h = rand() % 10 + p.kh; + p.input_w = rand() % 10 + p.kw; + p.output_c = rand() % 10 + 3; + p.stride_h = rand() % (p.kh) + 1; + p.stride_w = rand() % (p.kw) + 1; + p.ins_h = rand() % 
p.kh; + p.ins_w = rand() % p.kw; + p.ins_h_last = rand() % p.kh;; + p.ins_w_last = rand() % p.kw;; + p.dh = rand() % 3 + 1; + p.dw = rand() % 3 + 1; + + int kh_ext = conv_kh_ext(&p); + int kw_ext = conv_kw_ext(&p); + p.pad_top = rand() % kh_ext; + p.pad_bot = rand() % kh_ext; + p.pad_left = rand() % kw_ext; + p.pad_right = rand() % kw_ext; + + if (!conv_param_is_ok(&p)) { + printf("retry init_conv_param\n"); + goto retry; + } + + p.using_bias = rand() % 2; + p.r_shift_m = rand() % 8; + p.bReLU_EN = rand() % 2; + + p.opd0_sign = rand() % 2; + p.opd1_sign = 1; + p.opd2_sign = 1; + + assert(p.opd1_sign == 1 && p.opd2_sign == 1); + + int ih_ext = conv_ih_ext(&p); + int iw_ext = conv_iw_ext(&p); + assert(ih_ext >= kh_ext); + assert(iw_ext >= kw_ext); +} + +static void print_conv_param(const conv_param_t *p) +{ + printf("%s\n", "Conv parameters:"); + printf(" %s%d;\n", "p->random_seed = ", p->random_seed); + + printf(" %s%d;\n", "p->input_n = ", p->input_n); + printf(" %s%d;\n", "p->input_c = ", p->input_c); + printf(" %s%d;\n", "p->input_h = ", p->input_h); + printf(" %s%d;\n", "p->input_w = ", p->input_w); + printf(" %s%d;\n", "p->output_c = ", p->output_c); + + printf(" %s%d;\n", "p->kh = ", p->kh); + printf(" %s%d;\n", "p->kw = ", p->kw); + printf(" %s%d;\n", "p->dh = ", p->dh); + printf(" %s%d;\n", "p->dw = ", p->dw); + printf(" %s%d;\n", "p->pad_top = ", p->pad_top); + printf(" %s%d;\n", "p->pad_bot = ", p->pad_bot); + printf(" %s%d;\n", "p->pad_left = ", p->pad_left); + printf(" %s%d;\n", "p->pad_right = ", p->pad_right); + printf(" %s%d;\n", "p->stride_h = ", p->stride_h); + printf(" %s%d;\n", "p->stride_w = ", p->stride_w); + printf(" %s%d;\n", "p->ins_w = ", p->ins_w); + printf(" %s%d;\n", "p->ins_h = ", p->ins_h); + printf(" %s%d;\n", "p->ins_w_last = ", p->ins_w_last); + printf(" %s%d;\n", "p->ins_h_last = ", p->ins_h_last); + + printf(" %s%d;\n", "p->r_shift_m = ", p->r_shift_m); + printf(" %s%d;\n", "p->opd0_sign = ", p->opd0_sign); + printf(" %s%d;\n", "p->opd1_sign = ", p->opd1_sign); + printf(" %s%d;\n", "p->opd2_sign = ", p->opd2_sign); + printf(" %s%d;\n", "p->bReLU_EN = ", p->bReLU_EN); + printf(" %s%d;\n", "p->using_bias = ", p->using_bias); + + printf(" %s%d\n", "kh_ext = ", conv_kh_ext(p)); + printf(" %s%d\n", "kw_ext = ", conv_kw_ext(p)); + printf(" %s%d\n", "ih_ext = ", conv_ih_ext(p)); + printf(" %s%d\n", "iw_ext = ", conv_iw_ext(p)); + printf(" %s%d\n", "output_h = ", conv_oh(p)); + printf(" %s%d\n", "output_w = ", conv_ow(p)); +} + +/* Calculate the right shift value, m + * Steps: + * 1. Get the abs() of each weight; + * 2. Summary all the abs() in one kernel; + * 3. Get Log2 of each sum; + * 4. Downward rounding; + * After every r_shift value got, sort and find the middle one. + */ + +static int calc_rshift_m(const conv_param_t *p, s8* weight) +{ + int kernel_cnt = p->output_c * p->input_c; + int kernel_size = p->kh * p->kw; + int *kernel_shifts = (int *)malloc(sizeof(int) * kernel_cnt); + + // Tscan does not recognize C++ zero-initialized. + memset(kernel_shifts, 0, sizeof(int) * kernel_cnt); + + // Part 1: + // Get right shift value for each kernel + int sum = 0; + for (int i = 0; i < kernel_cnt; i++) { + // Step 1 & 2: Get the sum of abs() + for (int j = 0; j < kernel_size; j++) { + sum += (int)(*weight < 0 ? 
-(*weight) : (*weight)); + weight++; + } + // Step 3 & 4: log2 and downward rounding + sum >>= 1; + while (sum) { + sum >>= 1; + kernel_shifts[i]++; + } + } + + // Part 2: + // Find the middle of all the values + int tag[32] = {0}; + for (int cnt = 0; cnt < kernel_cnt; cnt++) { + tag[kernel_shifts[cnt]]++; + } + + int rshift_m = 0; + int mid = 0; + do { + mid += tag[rshift_m++]; + } while(mid < (kernel_cnt - 1) >> 1); + + free(kernel_shifts); + + return rshift_m - 1; +} + +static int test_ps32_ut( + conv_param_t &p_param, CVI_RT_HANDLE ctx, bmk_ctx_t *bk_ctx) +{ + printf("test_ps32_ut\n"); + s8 *input = alloc_input(&p_param); + s8 *weight = alloc_weight(&p_param); + s16 *bias = alloc_bias(&p_param); + p_param.r_shift_m = calc_rshift_m(&p_param, weight); + s8 *output_ref = (s8 *)malloc(sizeof(s8) * conv_output_size(&p_param) * sizeof(int)); + if (!output_ref) + return BM_ERR_FAILURE; + + bmerr_t ret = ps32_m2_conv_ref(&p_param, input, weight, output_ref); + assert(ret == BM_SUCCESS); + + bmk1880v2_tiu_convolution_param_t conv_param; + make_bmk_conv_param_ps32(bk_ctx, &conv_param, &p_param, 2); + + int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + + if (tl_alloc_success) { + + put_tensor_g2l(&ctx, bk_ctx, conv_param.ifmap, (u8 *)input); + put_conv_weight(&ctx, bk_ctx, conv_param.weight, (u8 *)weight); + bmk1880v2_tiu_convolution(bk_ctx, &conv_param); + + bmk1880v2_tensor_lmem_t ps32_ofmap; + ps32_ofmap = *conv_param.ofmap; + ps32_ofmap.shape.n = ps32_ofmap.shape.n * sizeof(int); + u8 *output = get_tensor_l2g(&ctx, bk_ctx, &ps32_ofmap); + + int has_error = array_cmp_int8( + "Comparing begin_mode results ...\n", + output_ref, (s8 *)output, conv_output_size(&p_param) * sizeof(int)); + + if (has_error) { + print_conv_param(&p_param); + printf("Comparison FAILED\n"); + exit(-1); + } + + free(output); + } + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + printf("test_ps32_intermediate_mode\n"); + for (int i=0; i < conv_input_size(&p_param); i++) + input[i] = rand() % 256 - 128; + + for (int i=0; i < conv_weight_size(&p_param); i++) + weight[i] = rand() % 256 - 128; + + ret = ps32_m3_conv_ref(&p_param, input, weight, output_ref); + assert(ret == BM_SUCCESS); + + make_bmk_conv_param_ps32(bk_ctx, &conv_param, &p_param, 3); + tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + + if (tl_alloc_success) { + put_tensor_g2l(&ctx, bk_ctx, conv_param.ifmap, (u8 *)input); + put_conv_weight(&ctx, bk_ctx, conv_param.weight, (u8 *)weight); + + bmk1880v2_tiu_convolution(bk_ctx, &conv_param); + + bmk1880v2_tensor_lmem_t ps32_ofmap; + ps32_ofmap = *conv_param.ofmap; + ps32_ofmap.shape.n = ps32_ofmap.shape.n * sizeof(int); + + u8 *output = get_tensor_l2g(&ctx, bk_ctx, &ps32_ofmap); + + int has_error = array_cmp_int8( + "Comparing intermediate results ...\n", + output_ref, (s8 *)output, conv_output_size(&p_param) * sizeof(int)); + + if (has_error) { + print_conv_param(&p_param); + printf("Comparison FAILED\n"); + exit(-1); + } + + free(output); + } + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + printf("test_ps32_end_mode\n"); + for (int i=0; i < conv_input_size(&p_param); i++) + input[i] = rand() % 256 - 128; + + for (int i=0; i < conv_weight_size(&p_param); i++) + weight[i] = rand() % 256 - 128; + + ret = ps32_m1_conv_ref(&p_param, input, weight, bias, output_ref); + assert(ret == BM_SUCCESS); + + make_bmk_conv_param_ps32(bk_ctx, &conv_param, &p_param, 1); + + tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param); + + if (tl_alloc_success) { + + 
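// The ps32 sequence under test is mode 2 -> 3 -> 1: mode 2 wrote the
+    // initial 32-bit partial sums, mode 3 accumulated on top of them, and
+    // this end mode folds in bias/ReLU/rshift and emits plain s8 data,
+    // which is why only conv_output_size() bytes are compared below
+    // instead of 4x that.
+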
put_tensor_g2l(&ctx, bk_ctx, conv_param.ifmap, (u8 *)input);
+    put_conv_weight(&ctx, bk_ctx, conv_param.weight, (u8 *)weight);
+    if (p_param.using_bias) {
+      put_conv_bias(&ctx, bk_ctx, conv_param.bias, bias);
+    }
+    bmk1880v2_tiu_convolution(bk_ctx, &conv_param);
+    u8 *output = get_tensor_l2g(&ctx, bk_ctx, conv_param.ofmap);
+
+    int has_error = array_cmp_int8(
+        "Comparing end results ...\n",
+        output_ref, (s8 *)output, conv_output_size(&p_param));
+
+    if (has_error) {
+      print_conv_param(&p_param);
+      printf("Comparison FAILED\n");
+      exit(-1);
+    }
+
+    free(output);
+  }
+
+  free_bmk_conv_param(bk_ctx, &conv_param, &p_param);
+
+  free(input);
+  free(weight);
+  free(bias);
+  free(output_ref);
+
+  return tl_alloc_success ? 1 : 0;
+}
+
+static int test_ic_tiling_conv(
+    conv_param_t &p_param, CVI_RT_HANDLE ctx, bmk_ctx_t *bk_ctx)
+{
+  printf("test tiled ps32 conv\n");
+  s8 *input = alloc_input(&p_param);
+  s8 *weight = alloc_weight(&p_param);
+  s16 *bias = alloc_bias(&p_param);
+  p_param.r_shift_m = calc_rshift_m(&p_param, weight);
+  s8 *output_ref = (s8 *)malloc(sizeof(s8) * conv_output_size(&p_param));
+  if (!output_ref) {
+    // Release what was already allocated before bailing out.
+    free(input);
+    free(weight);
+    free(bias);
+    return BM_ERR_FAILURE;
+  }
+
+  bmerr_t ret = conv_ref(&p_param, input, weight, bias, output_ref);
+  assert(ret == BM_SUCCESS);
+  bmk1880v2_tiu_convolution_param_t conv_tmp_param;
+  bmk1880v2_tiu_convolution_param_t conv_param;
+  make_bmk_conv_param(bk_ctx, &conv_param, &p_param);
+
+  int tl_alloc_success = bmk_conv_param_alloc_ok(&conv_param, &p_param);
+  if (tl_alloc_success) {
+    if (p_param.using_bias) {
+      conv_tmp_param.bias = conv_param.bias;
+      put_conv_bias(&ctx, bk_ctx, conv_param.bias, bias);
+      neuron_dump (
+          "test_ic_tiling_conv: bias",
+          1,
+          conv_param.bias->shape.c,
+          conv_param.bias->shape.h,
+          conv_param.bias->shape.w,
+          (s16 *)bias);
+    }
+    // For ps32_md[1] = 1, relu_enable & rshift_bits need to be set to 0,
+    // so we save those parameters in conv_tmp_param first.
+    conv_tmp_param.relu_enable = conv_param.relu_enable;
+    conv_tmp_param.rshift_bits = conv_param.rshift_bits;
+    conv_tmp_param.bias = conv_param.bias;
+
+    u32 ic_step = 1;
+    u32 n_step = 1;
+    tl_t ifmap = *conv_param.ifmap;
+    tl_t ofmap = *conv_param.ofmap;
+    tg_shape_t s;
+    s.n = conv_param.ifmap->shape.n;
+    s.c = conv_param.ifmap->shape.c;
+    s.h = conv_param.ifmap->shape.h;
+    s.w = conv_param.ifmap->shape.w;
+    tg_t *tg_ifmap = alloc_tg_gmem(&ctx, s, FMT_I8);
+    put_tg_gmem(&ctx, tg_ifmap, (u8 *)input);
+
+    s.n = conv_param.weight->shape.n;
+    s.c = conv_param.weight->shape.c;
+    s.h = conv_param.weight->shape.h;
+    s.w = conv_param.weight->shape.w;
+    u8 *transformed_weight =
+        transform_weight(&conv_param.weight->shape, (u8 *)weight);
+    tg_t *tg_weight = alloc_tg_gmem(&ctx, s, FMT_I8);
+    put_tg_gmem(&ctx, tg_weight, (u8 *)transformed_weight);
+
+    neuron_dump (
+        "test_ic_tiling_conv: input",
+        p_param.input_n,
+        p_param.input_c,
+        p_param.input_h,
+        p_param.input_w,
+        (s8 *)input);
+
+    neuron_dump (
+        "test_ic_tiling_conv: kernel",
+        1,
+        conv_param.weight->shape.c,
+        conv_param.weight->shape.h * conv_param.weight->shape.w,
+        conv_param.weight->shape.n,
+        (s8 *)transformed_weight);
+
+    // transformed_weight is dumped above, so it must only be freed here,
+    // after its last use.
+    free(transformed_weight);
+
+    tl_shape_t cur_tl_ifmap_shape = {
+      n_step,
+      ic_step,
+      ifmap.shape.h,
+      ifmap.shape.w
+    };
+
+    tg_shape_t cur_tg_ifmap_shape = {
+      cur_tl_ifmap_shape.n,
+      cur_tl_ifmap_shape.c,
+      cur_tl_ifmap_shape.h,
+      cur_tl_ifmap_shape.w
+    };
+
+    tg_stride_t cur_tg_ifmap_stride = {
+      tg_ifmap->stride.n,
+      tg_ifmap->stride.c,
+      tg_ifmap->stride.h,
+    };
+
+    tg_t cur_tg_ifmap;
+    cur_tg_ifmap.base_reg_index = 0;
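+    // cur_tg_ifmap describes an n_step x ic_step x h x w window into the
+    // full global ifmap. It keeps the full tensor's strides, so the tiling
+    // loop below can select input channel ci simply by offsetting
+    // start_address by ci * stride.c.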
cur_tg_ifmap.start_address = tg_ifmap->start_address; + cur_tg_ifmap.shape = cur_tg_ifmap_shape; + cur_tg_ifmap.stride = cur_tg_ifmap_stride; + cur_tg_ifmap.fmt = FMT_I8; + + tl_t cur_tl_ifmap; + cur_tl_ifmap.shape = cur_tl_ifmap_shape; + cur_tl_ifmap.stride = + bmk1880v2_tensor_lmem_default_stride(bk_ctx, cur_tl_ifmap_shape, FMT_I8, 1); + cur_tl_ifmap.start_address = ifmap.start_address; + cur_tl_ifmap.fmt = ifmap.fmt; + + tl_t cur_tl_ofmap; + cur_tl_ofmap.start_address = ofmap.start_address; + cur_tl_ofmap.shape = ofmap.shape; + cur_tl_ofmap.shape.n = n_step; + cur_tl_ofmap.stride = + bmk1880v2_tensor_lmem_default_stride(bk_ctx, cur_tl_ofmap.shape, FMT_I8, 1); + cur_tl_ofmap.fmt = ofmap.fmt; + + tl_t cur_tl_weight; + memset(&cur_tl_weight, 0, sizeof(cur_tl_weight)); + cur_tl_weight.start_address = conv_param.weight->start_address; + cur_tl_weight.shape = conv_param.weight->shape; + cur_tl_weight.shape.n = ic_step; + cur_tl_weight.stride = { + 1, + cur_tl_weight.shape.n * cur_tl_weight.shape.h * cur_tl_weight.shape.w, + cur_tl_weight.shape.n * cur_tl_weight.shape.w, + cur_tl_weight.shape.n + }; + cur_tl_weight.fmt = conv_param.weight->fmt; + + const tl_t *saved_tl_weight = conv_param.weight; + const tl_t *saved_tl_ifmap = conv_param.ifmap; + for (u32 ci = 0; ci < ifmap.shape.c; ci += ic_step) { + { + u32 ic = tg_weight->shape.n; + u32 oc = tg_weight->shape.c; + u32 kh = tg_weight->shape.h; + u32 kw = tg_weight->shape.w; + + tg_t cur_tdma_tg_weight; + cur_tdma_tg_weight.base_reg_index = tg_weight->base_reg_index; + cur_tdma_tg_weight.start_address = tg_weight->start_address + ci; + cur_tdma_tg_weight.fmt = tg_weight->fmt; + cur_tdma_tg_weight.shape = {1, oc, kh * kw, ic}; + cur_tdma_tg_weight.stride = + bmk1880v2_tensor_tgmem_default_stride(cur_tdma_tg_weight.shape, cur_tdma_tg_weight.fmt); + cur_tdma_tg_weight.shape = {1, oc, kh * kw, ic_step}; + + tl_t cur_tdma_tl_weight; + cur_tdma_tl_weight = cur_tl_weight; + cur_tdma_tl_weight.shape.n = cur_tdma_tg_weight.shape.n; + cur_tdma_tl_weight.shape.c = cur_tdma_tg_weight.shape.c; + cur_tdma_tl_weight.shape.h = cur_tdma_tg_weight.shape.h; + cur_tdma_tl_weight.shape.w = cur_tdma_tg_weight.shape.w; + cur_tdma_tl_weight.stride = bmk1880v2_tensor_lmem_default_stride( + bk_ctx, cur_tdma_tl_weight.shape, FMT_I8, 0); + + bmk1880v2_tdma_tg2l_tensor_copy_param_t p1; + p1.src = &cur_tdma_tg_weight; + p1.dst = &cur_tdma_tl_weight; + bmk1880v2_tdma_g2l_tensor_copy(bk_ctx, &p1); + test_submit(&ctx); + } + { + bmk1880v2_tdma_tg2l_tensor_copy_param_t p2; + cur_tg_ifmap.start_address = + tg_ifmap->start_address + ci * tg_ifmap->stride.c; + p2.src = &cur_tg_ifmap; + p2.dst = &cur_tl_ifmap; + bmk1880v2_tdma_g2l_tensor_copy(bk_ctx, &p2); + test_submit(&ctx); + } + + conv_param.ifmap = &cur_tl_ifmap; + conv_param.weight = &cur_tl_weight; + + // for ps32_md[1] = 1, relu_enable & rshift_bits need to set to 0 + // so we store those parameters to conv_tmp_para + if (ci == (ifmap.shape.c - 1)) + { + conv_param.relu_enable = conv_tmp_param.relu_enable; + conv_param.rshift_bits = conv_tmp_param.rshift_bits; + conv_param.bias = conv_tmp_param.bias; + conv_param.ps32_mode = 1; + } + else if (ci == 0) + { + conv_param.relu_enable = 0; + conv_param.rshift_bits = 0; + conv_param.bias = 0;; + conv_param.ps32_mode = 2; + } + else + { + conv_param.relu_enable = 0; + conv_param.rshift_bits = 0; + conv_param.bias = 0;; + conv_param.ps32_mode = 3; + } + bmk1880v2_tiu_convolution(bk_ctx, &conv_param); + conv_param.weight = saved_tl_weight; + conv_param.ifmap = saved_tl_ifmap; + } + 
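+    // The last tile ran with ps32_mode = 1, so the ofmap in local memory
+    // now holds finalized s8 results (bias, ReLU and rshift applied); a
+    // plain local-to-global copy of the original ofmap view reads it back
+    // for comparison against conv_ref().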
+ u8 *output = get_tensor_l2g(&ctx, bk_ctx, conv_param.ofmap); + + free_tg_gmem(&ctx, tg_ifmap); + free_tg_gmem(&ctx, tg_weight); + int has_error = array_cmp_int8( + "Comparing results ...\n", + output_ref, (s8 *)output, conv_output_size(&p_param)); + + if (has_error) { + print_conv_param(&p_param); + printf("Comparison FAILED\n"); + exit(-1); + } + free(output); + } + free_bmk_conv_param(bk_ctx, &conv_param, &p_param); + + free(input); + free(weight); + free(output_ref); + free(bias); + + return tl_alloc_success ? 1 : 0; +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + int test_finished_num = 0; + for (int i = 0; i < 5; i++) { + printf("random_test_conv iteration: %d\n", i); + conv_param_t test_conv_param; + init_conv_param(test_conv_param); + test_finished_num += test_ic_tiling_conv(test_conv_param, ctx, bk_ctx); + test_finished_num += test_ps32_ut(test_conv_param, ctx, bk_ctx); + if (!test_conv_param.using_bias) + test_conv_param.using_bias = 1; + if (test_conv_param.output_c <= 9) + test_conv_param.output_c += 3; + test_finished_num += test_ic_tiling_conv(test_conv_param, ctx, bk_ctx); + test_finished_num += test_ps32_ut(test_conv_param, ctx, bk_ctx); + } + printf("test_finished_num: %d\n", test_finished_num); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_g2g_general_copy.cpp b/cviruntime/test/1880v2/test_1880v2_g2g_general_copy.cpp new file mode 100644 index 000000000..6fa80a766 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_g2g_general_copy.cpp @@ -0,0 +1,106 @@ +#include "1880v2_test_util.h" + +typedef bmk1880v2_tdma_tg2tg_tensor_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tg_shape_t src_shape; + tg_stride_t src_stride; + tg_shape_t dst_shape; + tg_stride_t dst_stride; +} case_t; + +static case_t g_cases[] = { + { + {1, 3, 3, 3}, {27, 9, 3}, + {1, 3, 3, 3}, {27, 9, 3}, + }, + { + // YOLOv2 concat layer + {1, 256, 19, 19}, {92416, 361, 19}, + {1, 256, 19, 19}, {462080, 361, 19}, + } +}; + + +static void test_param_g2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + + u64 size = p->src->shape.c * p->src->shape.h * p->src->shape.w; + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + + put_tg_gmem(ctx, p->src, src_data); + + bmk1880v2_tdma_tg2tg_general_copy(bmk, p); + + test_submit(ctx); + + u8 *dst_data = get_tg_gmem(ctx, p->dst); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != src_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], src_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); +} + +static void destroy_param_g2g(CVI_RT_HANDLE *ctx, param_t *p) +{ + free_tg_gmem(ctx, p->src); + free_tg_gmem(ctx, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + param_t p; + bmk1880v2_tensor_tgmem_t *src, *dst; + + src = alloc_tg_gmem(ctx, c->src_shape, FMT_I8); + src->stride.n = c->src_stride.n; + src->stride.c = c->src_stride.c; + src->stride.h = c->src_stride.h; + + dst = alloc_tg_gmem(ctx, c->dst_shape, FMT_I8); + dst->stride.n = c->dst_stride.n; + 
dst->stride.c = c->dst_stride.c; + dst->stride.h = c->dst_stride.h; + + memset(&p, 0, sizeof(p)); + p.src = src; + p.dst = dst; + test_param_g2g(ctx, bmk, &p); + + destroy_param_g2g(ctx, &p); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_g2g_tensor_copy.cpp b/cviruntime/test/1880v2/test_1880v2_g2g_tensor_copy.cpp new file mode 100644 index 000000000..120580cb2 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_g2g_tensor_copy.cpp @@ -0,0 +1,106 @@ +#include "1880v2_test_util.h" + +typedef bmk1880v2_tdma_tg2tg_tensor_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tg_shape_t src_shape; + tg_stride_t src_stride; + tg_shape_t dst_shape; + tg_stride_t dst_stride; +} case_t; + +static case_t g_cases[] = { + { + {1, 3, 3, 3}, {27, 9, 3}, + {1, 3, 3, 3}, {27, 9, 3}, + }, + { + // YOLOv2 concat layer + {1, 256, 19, 19}, {92416, 361, 19}, + {1, 256, 19, 19}, {462080, 361, 19}, + } +}; + + +static void test_param_g2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + + u64 size = p->src->shape.c * p->src->shape.h * p->src->shape.w; + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + + put_tg_gmem(ctx, p->src, src_data); + + bmk1880v2_tdma_tg2tg_tensor_copy(bmk, p); + + test_submit(ctx); + + u8 *dst_data = get_tg_gmem(ctx, p->dst); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != src_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], src_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); +} + +static void destroy_param_g2g(CVI_RT_HANDLE *ctx, param_t *p) +{ + free_tg_gmem(ctx, p->src); + free_tg_gmem(ctx, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + param_t p; + bmk1880v2_tensor_tgmem_t *src, *dst; + + src = alloc_tg_gmem(ctx, c->src_shape, FMT_I8); + src->stride.n = c->src_stride.n; + src->stride.c = c->src_stride.c; + src->stride.h = c->src_stride.h; + + dst = alloc_tg_gmem(ctx, c->dst_shape, FMT_I8); + dst->stride.n = c->dst_stride.n; + dst->stride.c = c->dst_stride.c; + dst->stride.h = c->dst_stride.h; + + memset(&p, 0, sizeof(p)); + p.src = src; + p.dst = dst; + test_param_g2g(ctx, bmk, &p); + + destroy_param_g2g(ctx, &p); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} \ No newline at end of file diff --git a/cviruntime/test/1880v2/test_1880v2_get_matrix_stride.cpp b/cviruntime/test/1880v2/test_1880v2_get_matrix_stride.cpp new file mode 100644 index 000000000..d3825c43a --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_get_matrix_stride.cpp @@ -0,0 +1,125 @@ +#include "1880v2_test_util.h" + +static void get_matrix_l2g_stride_ref( + u8 *ref, u8 *a, + ml_shape_t ml_shape, + 
bmk1880v2_matrix_tgmem_stride_t gmem_stride) +{ + int row = ml_shape.n; + int col = ml_shape.col; + int row_stride = gmem_stride.row; + + /* + * Same as in get_matrix_l2g_stride(). + */ + int stride_size = row * row_stride; + for (int i = 0; i < stride_size; i++) + ref[i] = 0xaf; + + for (int ri = 0; ri < row; ri++) { + for (int ci = 0; ci < col; ci++) { + int src_i = ri * col + ci; + int dst_i = ri * row_stride + ci; + ref[dst_i] = a[src_i]; + } + } +} + +static u8 * get_matrix_l2g_stride( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + ml_t *ml, + bmk1880v2_matrix_tgmem_stride_t mg_stride) +{ + int row = ml->shape.n; + int row_stride = mg_stride.row; + int col = ml->shape.col; + int stride_size = row * row_stride; + + u8 *data = (u8 *)malloc(sizeof(u8) * stride_size); + if (!data) + return NULL; + + for (int i = 0; i < stride_size; i++) + data[i] = 0xaf; + + bmshape_t bms = BM_TENSOR_WITH_FMT(row, row_stride, 1, 1, BM_FMT_INT8); + CVI_RT_MEM devmem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + int ret = CVI_RT_MemCopyS2D(*ctx, devmem, data); + assert(ret == BM_SUCCESS); + + mg_t mg; + mg.base_reg_index = 0; + mg.start_address = CVI_RT_MemGetPAddr(devmem); + mg.shape.row = row; + mg.shape.col = col; + mg.stride = mg_stride; + + bmk1880v2_tdma_l2tg_matrix_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = ml; + p.dst = &mg; + + bmk1880v2_tdma_l2g_matrix_copy(bk_ctx, &p); + test_submit(ctx); + + ret = CVI_RT_MemCopyD2S(*ctx, data, devmem); + assert(ret == BM_SUCCESS); + + CVI_RT_MemFree(*ctx, devmem); + return data; +} + +static void test_get_matrix_l2g_stride( + CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx) +{ + int row = 80; + int col = 70; + int size = row * col; + int row_stride = col * 2; + + ml_shape_t ml_shape = + bmk1880v2_matrix_lmem_default_shape(bk_ctx, row, col, FMT_I8); + bmk1880v2_matrix_tgmem_stride_t gmem_stride; + gmem_stride.row = row_stride; + int stride_size = row * row_stride; + + u8 *data_x = (u8 *)xmalloc(size); + for (int i = 0; i < size; i++) + data_x[i] = i; + + ml_t *ml_x = + bmk1880v2_lmem_alloc_matrix(bk_ctx,ml_shape, FMT_I8, 1); + put_matrix_g2l(ctx, bk_ctx, ml_x, data_x); + u8 *result_x = get_matrix_l2g_stride(ctx, bk_ctx, ml_x, gmem_stride); + u8 *ref_x = (u8 *)xmalloc(stride_size); + if (!result_x || !ref_x) + goto fail_exit; + + get_matrix_l2g_stride_ref(ref_x, data_x, ml_shape, gmem_stride); + + for (int i = 0; i < stride_size; i++) { + if (result_x[i] != ref_x[i]) { + printf("compare failed at result_x[%d], got %d, exp %d\n", + i, result_x[i], ref_x[i]); + exit(-1); + } + } + + bmk1880v2_lmem_free_matrix(bk_ctx, ml_x); + +fail_exit: + free(data_x); + free(result_x); + free(ref_x); +} + +int main (void) +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + test_get_matrix_l2g_stride(&ctx, bk_ctx); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_get_tensor_gl_stride.cpp b/cviruntime/test/1880v2/test_1880v2_get_tensor_gl_stride.cpp new file mode 100644 index 000000000..704367cf5 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_get_tensor_gl_stride.cpp @@ -0,0 +1,158 @@ +#include "1880v2_test_util.h" + +static void get_tensor_l2g_stride_ref( + u8 *ref, u8 *a, + int n, int c, int h, int w, + bmk1880v2_tensor_lmem_stride_t tl_stride, + bmk1880v2_tensor_tgmem_stride_t tg_stride) +{ + /* + * Same as in get_tensor_l2g_stride(). 
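+ * Both buffers start from the same 0xcf fill, so the bytes in the stride gaps are verified as well.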
+ */ + int stride_size = n * tg_stride.n; + for (int i = 0; i < stride_size; i++) + ref[i] = 0xcf; + + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < c; ci++) { + for (int hi = 0; hi < h; hi++) { + for (int wi = 0; wi < w; wi++) { + u64 src_i = + (ni * c + ci) * tl_stride.c + + hi * tl_stride.h + + wi * 1; + u64 dst_i = + ni * tg_stride.n + + ci * tg_stride.c + + hi * tg_stride.h + + wi * 1; + ref[dst_i] = a[src_i]; + } + } + } + } +} + +static inline u8 * get_tensor_l2g_stride( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + tl_t *tl, + bmk1880v2_tensor_tgmem_stride_t tg_stride) +{ + int n = tl->shape.n; + int n_stride = tg_stride.n; + + int stride_size = n * n_stride; + u8 *data = (u8 *)malloc(sizeof(u8) * stride_size); + if (!data) + return NULL; + + for (int i = 0; i < stride_size; i++) + data[i] = 0xcf; + + bmshape_t bms = BM_TENSOR_WITH_FMT(n, n_stride, 1, 1, BM_FMT_INT8); + CVI_RT_MEM dev_mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = CVI_RT_MemGetPAddr(dev_mem); + int ret = CVI_RT_MemCopyS2D(*ctx, dev_mem, data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = tl->shape.n; + tg.shape.c = tl->shape.c; + tg.shape.h = tl->shape.h; + tg.shape.w = tl->shape.w; + tg.stride = tg_stride; + tg.base_reg_index = 0; + + bmk1880v2_tdma_l2tg_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = tl; + p.dst = &tg; + bmk1880v2_tdma_l2g_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + ret = CVI_RT_MemCopyD2S(*ctx, data, dev_mem); + assert(ret == BM_SUCCESS); + + CVI_RT_MemFree(*ctx, dev_mem); + return data; +} + +static void test_get_tensor_l2g_gl_stride( + CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx) +{ + int n = 2; + int c = 35; + int h = 2; + int w = 3; + + tg_shape_t tg_shape; + tg_shape.n = n; + tg_shape.c = c; + tg_shape.h = h; + tg_shape.w = w; + + bmk1880v2_tensor_tgmem_stride_t tg_stride = + bmk1880v2_tensor_tgmem_default_stride(tg_shape, FMT_I8); + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h * w; + tl_shape.w = 1; + + int size = tl_shape.n * tl_shape.c * tl_shape.h * tl_shape.w; + u8 *data_x = (u8 *)xmalloc(size); + for (int i = 0; i < size; i++) + data_x[i] = i; + + tl_t *tl_x = + alloc_tl(bk_ctx, tl_shape, FMT_I8, 0); + put_tensor_g2l(ctx, bk_ctx, tl_x, data_x); + + tl_x->shape.n = n; + tl_x->shape.c = c; + tl_x->shape.h = h; + tl_x->shape.w = w; + + tl_x->stride = bmk1880v2_tensor_lmem_default_stride(bk_ctx, tl_x->shape, FMT_I8, 0); + + u8 *result_x = get_tensor_l2g_stride(ctx, bk_ctx, tl_x, tg_stride); + int stride_size = tg_shape.n * tg_stride.n; + u8 *ref_x = (u8 *)xmalloc(stride_size); + if (!result_x || !ref_x) + goto fail_exit; + + get_tensor_l2g_stride_ref(ref_x, + data_x, tg_shape.n, + tg_shape.c, tg_shape.h, + tg_shape.w, tl_x->stride, tg_stride); + + for (int i = 0; i < stride_size; i++) { + if (result_x[i] != ref_x[i]) { + printf("compare failed at result_x[%d], got %d, exp %d\n", + i, result_x[i], ref_x[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_x); + +fail_exit: + free(data_x); + free(result_x); + free(ref_x); +} + +int main (void) +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + test_get_tensor_l2g_gl_stride(&ctx, bk_ctx); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_get_tensor_stride.cpp b/cviruntime/test/1880v2/test_1880v2_get_tensor_stride.cpp new file mode 100644 index 000000000..afaad6b66 --- /dev/null +++ 
b/cviruntime/test/1880v2/test_1880v2_get_tensor_stride.cpp @@ -0,0 +1,147 @@ +#include "1880v2_test_util.h" + +static void get_tensor_l2g_stride_ref( + u8 *ref, u8 *a, + tl_shape_t tl_shape, + bmk1880v2_tensor_tgmem_stride_t tg_stride) +{ + int n = tl_shape.n; + int c = tl_shape.c; + int h = tl_shape.h; + int w = tl_shape.w; + + int n_str = tg_stride.n; + int c_str = tg_stride.c; + int h_str = tg_stride.h; + int w_str = 1; + + /* + * Same as in get_tensor_l2g_stride(). + */ + int stride_size = n * tg_stride.n; + for (int i = 0; i < stride_size; i++) + ref[i] = 0xcf; + + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < c; ci++) { + for (int hi = 0; hi < h; hi++) { + for (int wi = 0; wi < w; wi++) { + u64 src_i = ni * c * h * w + ci * h * w + hi * w + wi; + u64 dst_i = ni * n_str + ci * c_str + hi * h_str + wi * w_str; + ref[dst_i] = a[src_i]; + } + } + } + } +} + +static inline u8 * get_tensor_l2g_stride( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + tl_t *tl, + bmk1880v2_tensor_tgmem_stride_t tg_stride) +{ + int n = tl->shape.n; + int n_stride = tg_stride.n; + + int stride_size = n * n_stride; + u8 *data = (u8 *)malloc(sizeof(u8) * stride_size); + if (!data) + return NULL; + + for (int i = 0; i < stride_size; i++) + data[i] = 0xcf; + + bmshape_t bms = BM_TENSOR_WITH_FMT(n, n_stride, 1, 1, BM_FMT_INT8); + CVI_RT_MEM dev_mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = CVI_RT_MemGetPAddr(dev_mem); + int ret = CVI_RT_MemCopyS2D(*ctx, dev_mem, data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = tl->shape.n; + tg.shape.c = tl->shape.c; + tg.shape.h = tl->shape.h; + tg.shape.w = tl->shape.w; + tg.stride = tg_stride; + + bmk1880v2_tdma_l2tg_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = tl; + p.dst = &tg; + bmk1880v2_tdma_l2g_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + ret = CVI_RT_MemCopyD2S(*ctx, data, dev_mem); + assert(ret == BM_SUCCESS); + + CVI_RT_MemFree(*ctx, dev_mem); + return data; +} + +static void test_get_tensor_l2g_stride( + CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx) +{ + int n = 2; + int c = 15; + int h = 10; + int w = 8; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + int size = n * c * h * w; + u8 *data_x = (u8 *)xmalloc(size); + if (!data_x) + return; + + for (int i = 0; i < size; i++) + data_x[i] = i; + + bmk1880v2_tensor_tgmem_stride_t tg_stride; + tg_stride.h = w * 2; + tg_stride.c = tg_stride.h * h * 2; + tg_stride.n = tg_stride.c * c * 2; + tl_t *tl_x = + alloc_tl(bk_ctx, tl_shape, FMT_I8, 1); + + put_tensor_g2l(ctx, bk_ctx, tl_x, data_x); + u8 *result_x = get_tensor_l2g_stride(ctx, bk_ctx ,tl_x, tg_stride); + int stride_size = n * tg_stride.n; + u8 *ref_x = (u8 *)xmalloc(stride_size); + if (!result_x || !ref_x) + goto fail_exit; + + get_tensor_l2g_stride_ref(ref_x, data_x, tl_shape, tg_stride); + + for (int i = 0; i < stride_size; i++) { + if (result_x[i] != ref_x[i]) { + printf("compare failed at result_x[%d], got %d, exp %d\n", + i, result_x[i], ref_x[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_x); + +fail_exit: + free(data_x); + free(result_x); + free(ref_x); +} + +int main (void) +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + test_get_tensor_l2g_stride(&ctx, bk_ctx); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_get_tensor_stride_unalign.cpp b/cviruntime/test/1880v2/test_1880v2_get_tensor_stride_unalign.cpp new file mode 
100644 index 000000000..61d33443f --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_get_tensor_stride_unalign.cpp @@ -0,0 +1,164 @@ +#include "1880v2_test_util.h" + +static void get_tensor_l2g_stride_unalign_ref( + u8 *ref, u8 *a, + tl_shape_t tl_shape, + bmk1880v2_tensor_tgmem_stride_t gmem_stride) +{ + int n = tl_shape.n; + int c = tl_shape.c; + int h = tl_shape.h; + int w = tl_shape.w; + int n_str = gmem_stride.n; + int c_str = gmem_stride.c; + int h_str = gmem_stride.h; + int new_n = n * 2; + int new_h = h / 2; + + /* + * Same as in get_tensor_l2g_stride_unalign(). + */ + int stride_size = new_n * gmem_stride.n; + for (int i = 0; i < stride_size; i++) + ref[i] = 0xcf; + + /* + * (n, c, h, w) => (n * 2, c, h / 2, w) + */ + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < c; ci++) { + for (int hi = 0; hi < h; hi++) { + for (int wi = 0; wi < w; wi++) { + u64 src_i = ni * c * h * w + ci * h * w + hi * w + wi; + u64 dst_i = (ni * 2 + hi / new_h) * n_str + + ci * c_str + (hi % new_h) * h_str + wi; + ref[dst_i] = a[src_i]; + } + } + } + } +} + +static inline u8 * get_tensor_l2g_stride( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + tl_t *tl, + bmk1880v2_tensor_tgmem_stride_t tg_stride) +{ + int n = tl->shape.n; + int n_stride = tg_stride.n; + int stride_size = n * n_stride; + u8 *data = (u8 *)malloc(sizeof(u8) * stride_size); + if (!data) + return NULL; + + for (int i = 0; i < stride_size; i++) + data[i] = 0xcf; + + bmshape_t bms = BM_TENSOR_WITH_FMT(n, n_stride, 1, 1, BM_FMT_INT8); + CVI_RT_MEM dev_mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + gaddr_t gaddr = CVI_RT_MemGetPAddr(dev_mem); + int ret = CVI_RT_MemCopyS2D(*ctx, dev_mem, data); + assert(ret == BM_SUCCESS); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = tl->shape.n; + tg.shape.c = tl->shape.c; + tg.shape.h = tl->shape.h; + tg.shape.w = tl->shape.w; + tg.stride = tg_stride; + tg.base_reg_index = 0; + + bmk1880v2_tdma_l2tg_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = tl; + p.dst = &tg; + bmk1880v2_tdma_l2g_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + ret = CVI_RT_MemCopyD2S(*ctx, data, dev_mem); + assert(ret == BM_SUCCESS); + + CVI_RT_MemFree(*ctx, dev_mem); + return data; +} + +static void test_get_tensor_l2g_stride_unalign( + CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx) +{ + /* + * Make sure (h / 2 * w) is not eu-aligned. 
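+ * With h = 18 and w = 7 below, (h / 2) * w = 63 bytes, an odd size that cannot fall on an EU-aligned boundary.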
+ */ + int n = 1; + int c = 5; + int h = 18; + int w = 7; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + int size = n * c * h * w; + u8 *data_x = (u8 *)xmalloc(size); + for (int i = 0; i < size; i++) + data_x[i] = i; + + int new_n = n * 2; + int new_h = h / 2; + + bmk1880v2_tensor_tgmem_stride_t tg_stride; + tg_stride.h = w * 2; + tg_stride.c = w * 2 * new_h * 2; + tg_stride.n = w * 2 * new_h * 2 * c * 2; + + tl_t *tl_x = + alloc_tl(bk_ctx, tl_shape, FMT_I8, 1); + put_tensor_g2l(ctx, bk_ctx, tl_x, data_x); + + tl_x->shape.n = new_n; + tl_x->shape.c = c; + tl_x->shape.h = new_h; + tl_x->shape.w = w; + + tl_x->stride = bmk1880v2_tensor_lmem_default_stride(bk_ctx, tl_x->shape, FMT_I8, 0); + u8 *result_x = get_tensor_l2g_stride(ctx, bk_ctx, tl_x, tg_stride); + tl_x->shape = tl_shape; + tl_x->stride = bmk1880v2_tensor_lmem_default_stride(bk_ctx, tl_x->shape, FMT_I8, 1); + + int stride_size = new_n * tg_stride.n; + u8 *ref_x = (u8 *)xmalloc(stride_size); + if (!result_x || !ref_x) + goto fail_exit; + + get_tensor_l2g_stride_unalign_ref(ref_x, data_x, tl_shape, tg_stride); + + for (int i = 0; i < stride_size; i++) { + if (result_x[i] != ref_x[i]) { + printf("compare failed at result_x[%d], got %d, exp %d\n", + i, result_x[i], ref_x[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_x); + +fail_exit: + free(data_x); + free(result_x); + free(ref_x); +} + +int main (void) +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + test_get_tensor_l2g_stride_unalign(&ctx, bk_ctx); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_lut.cpp b/cviruntime/test/1880v2/test_1880v2_lut.cpp new file mode 100644 index 000000000..34e23c25f --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_lut.cpp @@ -0,0 +1,96 @@ +#include "1880v2_test_util.h" + +static u64 shape_size(tl_shape_t s) +{ + return s.n * s.c * s.h * s.w; +} + +static void tl_lut_ref( + u8 *ofmap, + u8 *ifmap, + u8 *table, + tl_shape_t ifmap_shape, + tl_shape_t table_shape) +{ + int ih, iw; + int tn, th, tw; + + ih = ifmap_shape.h; + iw = ifmap_shape.w; + tn = table_shape.n; + th = table_shape.h; + tw = table_shape.w; + assert(tn == 1); + assert(th * tw == 256); + + for (u64 i = 0; i < shape_size(ifmap_shape); i++) { + int ici = i / (ih * iw) % 32; + ofmap[i] = table[ici * (th * tw) + ifmap[i]]; + } +} + +static void test_tl_lut(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx) +{ + tl_shape_t ifmap_shape = {1, 32, 1, 224}; + tl_shape_t table_shape = {1, 32, 16, 16}; + tl_shape_t ofmap_shape = ifmap_shape; + + u64 ifmap_size = shape_size(ifmap_shape); + u64 table_size = shape_size(table_shape); + u64 ofmap_size = shape_size(ofmap_shape); + + u8 *ifmap_data = (u8 *)xmalloc(ifmap_size); + for (u64 i = 0; i < ifmap_size; i++) + ifmap_data[i] = i - 20; + + u8 *table_data = (u8 *)xmalloc(table_size); + for (u64 i = 0; i < table_size; i++) + table_data[i] = i + i / 256 * 3; + + u8 *ref_data = (u8 *)xmalloc(ofmap_size); + tl_lut_ref(ref_data, ifmap_data, table_data, ifmap_shape, table_shape); + + tl_t *tl_ifmap = + alloc_tl(bk_ctx, ifmap_shape, FMT_I8, /*align*/1); + tl_t *tl_table = + alloc_tl(bk_ctx, table_shape, FMT_I8, /*align*/1); + tl_t *tl_ofmap = + alloc_tl(bk_ctx, ofmap_shape, FMT_I8, /*align*/1); + + put_tensor_g2l(ctx, bk_ctx, tl_ifmap, ifmap_data); + put_tensor_g2l(ctx, bk_ctx, tl_table, table_data); + bmk1880v2_tiu_lookup_table_param_t p12; + memset(&p12, 0, sizeof(p12)); + p12.ofmap = tl_ofmap; + p12.ifmap = tl_ifmap; + p12.table = tl_table; +
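/* Each of the 32 lanes holds its own 256-entry (16x16) table; lane c of the ifmap indexes table c, mirroring tl_lut_ref() above. */ +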
bmk1880v2_tiu_lookup_table(bk_ctx, &p12); + test_submit(ctx); + u8 *ofmap_data = get_tensor_l2g(ctx, bk_ctx, tl_ofmap); + for (u64 i = 0; i < ofmap_size; i++) { + if (ofmap_data[i] != ref_data[i]) { + fprintf(stderr, + "comparing failed at ofmap_data[%" PRIu64 "], got %d, exp %d\n", + i, ofmap_data[i], ref_data[i]); + exit(-1); + } + } + free_tl(bk_ctx, tl_ofmap); + free_tl(bk_ctx, tl_table); + free_tl(bk_ctx, tl_ifmap); + + free(ifmap_data); + free(table_data); + free(ref_data); + free(ofmap_data); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + test_tl_lut(&ctx, bk_ctx); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_matrix_mac.cpp b/cviruntime/test/1880v2/test_1880v2_matrix_mac.cpp new file mode 100644 index 000000000..795bf64bc --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_matrix_mac.cpp @@ -0,0 +1,1965 @@ +#include "1880v2_test_util.h" + +typedef bmk1880v2_tiu_matrix_multiplication_param_t param_t; + +static u64 matrix_size(const ml_t *ml) +{ + u64 row = ml->shape.n; + u64 col = ml->shape.col; + return row * col; +} + +static u64 res_size(param_t *p) +{ + if (p->res_is_int8 && !p->add_result) + return matrix_size(p->res); + else + return matrix_size(p->res) / 2; +} + +static u8 * alloc_left(param_t *p) +{ + u64 size = matrix_size(p->left); + + u8 *buf = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + buf[i] = i % 17 - 9; + + return buf; +} + +static u8 * alloc_right(param_t *p) +{ + u64 size = matrix_size(p->right); + + u8 *buf = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + buf[i] = i % 13 - 6; + + return buf; +} + +static u16 * alloc_bias(param_t *p) +{ + if (!p->bias) + return NULL; + + u64 size = matrix_size(p->bias) / 2; + + u16 *buf = (u16 *)malloc(sizeof(u16) * size); + for (u64 i = 0; i < size; i++) + buf[i] = 5 - (i % 7); + + return buf; +} + +static u16 * alloc_res(param_t *p) +{ + u64 size = res_size(p); + + u16 *buf = (u16 *)malloc(sizeof(u16) * size); + for (u64 i = 0; i < size; i++) + buf[i] = 17 - (i % 35); + + return buf; +} + +static void right_shift(param_t *p, s32 *buf, u64 size) +{ + int shift_bits = p->rshift_bits; + int round_up = 1; + if (1) + arith_right_shift(buf, size, shift_bits, round_up); + else + logic_right_shift(buf, size, shift_bits, round_up); +} + +static void matrix_mac_ref( + param_t *p, u8 left[], u8 right[], u16 bias[], u16 res[]) +{ + u64 size = res_size(p); + u32 left_col = p->left->shape.col; + u32 right_col = p->right->shape.col; + u32 res_row = p->left->shape.n; + u32 res_col = p->res->shape.col; + int left_sign = (p->left->fmt == FMT_I8); + int right_sign = (p->right->fmt == FMT_I8); + int res_sign = (p->res->fmt == FMT_I8); + + s32 *tmp_res = (s32 *)malloc(sizeof(s32) * size); + if (p->add_result) { + for (u32 i = 0; i < res_row * res_col; i++) { + tmp_res[i] = res_sign? (s16)res[i]: res[i]; + tmp_res[i] <<= p->lshift_bits; + } + } else { + for (u32 i = 0; i < res_row * res_col; i++) + tmp_res[i] = 0; + } + + for (u32 row = 0; row < res_row; row++) { + for (u32 col = 0; col < res_col; col++) { + for (u32 i = 0; i < left_col; i++) { + u32 li = row * left_col + i; + u32 ri = i * right_col + col; + s32 l = left_sign? (s8)left[li]: left[li]; + s32 r = right_sign? 
(s8)right[ri]: right[ri]; + tmp_res[row * res_col + col] += l * r; + } + } + } + + if (p->bias && bias) { + for (u32 row = 0; row < res_row; row++) { + for (u32 col = 0; col < res_col; col++) { + int bias_sign = (p->bias->fmt == FMT_I8); + s32 b = bias_sign? (s16)bias[col]: bias[col]; + tmp_res[row * res_col + col] += b; + } + } + } + + if (p->relu_enable) + relu(tmp_res, size); + + right_shift(p, tmp_res, size); + + if (p->res_is_int8) + saturate_to_int8(tmp_res, size, res_sign); + else + saturate_to_int16(tmp_res, size, res_sign); + + for (u64 i = 0; i < size; i++) + res[i] = tmp_res[i]; + + free(tmp_res); +} + +static void put_bias( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + const ml_t *ml, + u16 data[]) +{ + u64 size = ml->shape.col; + + u8 *tmp = (u8 *)malloc(sizeof(u8) * size * 2); + if (!tmp) + return; + + for (u64 i = 0; i < size; i++) { + tmp[i] = data[i] & 0xff; + tmp[i + size] = (data[i] >> 8) & 0xff; + } + + put_matrix_g2l(ctx, bk_ctx, ml, tmp); + + free(tmp); +} + +static void put_res( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + const ml_t *ml, + u16 data[]) +{ + u64 size = ml->shape.n / 2 * ml->shape.col; + + u8 *tmp = (u8 *)malloc(sizeof(u8) * size * 2); + if (!tmp) + return; + + for (u64 i = 0; i < size; i++) { + tmp[i] = data[i] & 0xff; + tmp[i + size] = (data[i] >> 8) & 0xff; + } + + put_matrix_g2l(ctx, bk_ctx, ml, tmp); + + free(tmp); +} + +static u16 * get_res( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + param_t *p) +{ + u64 size = res_size(p); + u16 *res = (u16 *)malloc(sizeof(u16) * size); + + u8 *tmp = get_matrix_l2g(ctx, bk_ctx, p->res); + if (p->res_is_int8) { + int res_sign = (p->res->fmt == FMT_I8); + for (u64 i = 0; i < size; i++) + res[i] = res_sign? (s8)tmp[i]: tmp[i]; + } else { + for (u64 i = 0; i < size; i++) + res[i] = tmp[i] + (tmp[i + size] << 8); + } + + free(tmp); + return res; +} + +static void test_param(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, param_t *p) +{ + u8 *left = alloc_left(p); + u8 *right = alloc_right(p); + u16 *bias = alloc_bias(p); + u16 *ref = alloc_res(p); + + put_matrix_g2l(ctx, bk_ctx, p->left, left); + put_matrix_g2l(ctx, bk_ctx, p->right, right); + if (bias) + put_bias(ctx, bk_ctx, p->bias, bias); + if (p->add_result) + put_res(ctx, bk_ctx, p->res, ref); + + bmk1880v2_tiu_matrix_multiplication(bk_ctx, p); + u16 *res = get_res(ctx, bk_ctx, p); + + matrix_mac_ref(p, left, right, bias, ref); + + u64 size = res_size(p); + for (u64 i = 0; i < size; i++) { + if (res[i] != ref[i]) { + fprintf(stderr, "comparing failed at out[%" PRIu64 "], got %d, exp %d\n", + i, (s16)res[i], (s16)ref[i]); + exit(-1); + } + } + + free(left); + free(right); + free(bias); + free(ref); + free(res); +} + +static void destroy_param(bmk_ctx_t *bk_ctx, param_t *p) +{ + bmk1880v2_lmem_free_matrix(bk_ctx, p->res); + if (p->bias) + bmk1880v2_lmem_free_matrix(bk_ctx, p->bias); + bmk1880v2_lmem_free_matrix(bk_ctx, p->right); + bmk1880v2_lmem_free_matrix(bk_ctx, p->left); +} + +static ml_t *alloc_param_res( + bmk_ctx_t *bk_ctx, param_t *p) +{ + ml_shape_t s; + s.n = p->left->shape.n; + if (p->add_result || !p->res_is_int8) + s.n *= 2; + s.c = p->right->shape.c; + s.w = p->right->shape.w; + s.col = p->right->shape.col; + + fmt_t fmt = FMT_U8; + if (p->left->fmt == FMT_I8) + fmt = FMT_I8; + if (p->right->fmt == FMT_I8) + fmt = FMT_I8; + if (p->bias) + if (p->bias->fmt == FMT_I8) + fmt = FMT_I8; + + if (p->relu_enable) + fmt = FMT_U8; + + return bmk1880v2_lmem_alloc_matrix(bk_ctx, s, fmt, 1); +} + +static param_t param_0(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 
0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + p.ps32_mode = 0; + + u32 left_row = 1; + u32 left_col = 1; + u32 left_c = 1; + u32 left_w = 1; + + u32 right_row = 1; + u32 right_col = 1; + u32 right_c = 1; + u32 right_w = 1; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_1(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 6; + u32 left_col = 1; + u32 left_c = 1; + u32 left_w = 1; + + u32 right_row = 1; + u32 right_col = 1; + u32 right_c = 1; + u32 right_w = 1; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_2(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 6; + u32 left_col = 25; + u32 left_c = 1; + u32 left_w = 25; + + u32 right_row = 25; + u32 right_col = 1; + u32 right_c = 1; + u32 right_w = 1; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_3(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 6; + u32 left_col = 25; + u32 left_c = 2; + u32 left_w = 18; + + u32 right_row = 25; + u32 right_col = 1; + u32 right_c = 1; + u32 right_w = 1; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_4(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable 
= false; + p.add_result = false; + + u32 left_row = 6; + u32 left_col = 39; + u32 left_c = 4; + u32 left_w = 10; + + u32 right_row = 39; + u32 right_col = 1; + u32 right_c = 1; + u32 right_w = 1; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_5(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 1; + u32 left_col = 1; + u32 left_c = 1; + u32 left_w = 1; + + u32 right_row = 1; + u32 right_col = 2; + u32 right_c = 1; + u32 right_w = 2; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_6(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 1; + u32 left_col = 1; + u32 left_c = 1; + u32 left_w = 1; + + u32 right_row = 1; + u32 right_col = 2; + u32 right_c = 2; + u32 right_w = 1; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_7(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 1; + u32 left_col = 1; + u32 left_c = 1; + u32 left_w = 1; + + u32 right_row = 1; + u32 right_col = 10; + u32 right_c = 3; + u32 right_w = 4; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_8(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 7; + u32 left_col = 1; + u32 left_c = 1; + u32 left_w = 1; + + u32 
right_row = 1; + u32 right_col = 10; + u32 right_c = 3; + u32 right_w = 4; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_9(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 7; + u32 left_col = 23; + u32 left_c = 3; + u32 left_w = 8; + + u32 right_row = 23; + u32 right_col = 10; + u32 right_c = 3; + u32 right_w = 4; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_10(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 7; + u32 left_col = 477; + u32 left_c = 60; + u32 left_w = 8; + + u32 right_row = 477; + u32 right_col = 10; + u32 right_c = 3; + u32 right_w = 4; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_11(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 7; + u32 left_col = 23; + u32 left_c = 3; + u32 left_w = 8; + + u32 right_row = 23; + u32 right_col = 477; + u32 right_c = 60; + u32 right_w = 8; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_12(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 1; + u32 left_col = 1; + u32 left_c = 1; + u32 left_w = 1; + + u32 right_row = 1; + u32 right_col = 1; + u32 right_c = 1; + u32 right_w = 1; + + ml_shape_t left_shape; + 
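/* Local-memory matrix layout: n rows by col logical columns, spread across c lanes of w entries each; here 60 * 8 = 480 just covers col = 477. */ +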
left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1880v2_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_I8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_13(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 1; + u32 left_col = 1; + u32 left_c = 1; + u32 left_w = 1; + + u32 right_row = 1; + u32 right_col = 2; + u32 right_c = 1; + u32 right_w = 2; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1880v2_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_I8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_14(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 7; + u32 left_col = 477; + u32 left_c = 60; + u32 left_w = 8; + + u32 right_row = 477; + u32 right_col = 10; + u32 right_c = 3; + u32 right_w = 4; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1880v2_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_I8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_15(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 3; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 7; + u32 left_col = 477; + u32 left_c = 60; + u32 left_w = 8; + + u32 right_row = 477; + u32 right_col = 10; + u32 right_c = 3; + u32 right_w = 4; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1880v2_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_I8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_16(bmk_ctx_t 
*bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 3; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 7; + u32 left_col = 477; + u32 left_c = 60; + u32 left_w = 8; + + u32 right_row = 477; + u32 right_col = 10; + u32 right_c = 3; + u32 right_w = 4; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1880v2_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_17(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 3; + p.res_is_int8 = true; + p.relu_enable = true; + p.add_result = false; + + u32 left_row = 7; + u32 left_col = 477; + u32 left_c = 60; + u32 left_w = 8; + + u32 right_row = 477; + u32 right_col = 10; + u32 right_c = 3; + u32 right_w = 4; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1880v2_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_18(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 7; + u32 left_col = 23; + u32 left_c = 3; + u32 left_w = 8; + + u32 right_row = 23; + u32 right_col = 477; + u32 right_c = 60; + u32 right_w = 8; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1880v2_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_19(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 1; + u32 left_col = 1; + u32 left_c = 1; + u32 left_w = 1; + + u32 right_row = 1; + u32 right_col = 1; + u32 right_c = 1; + u32 right_w = 1; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + 
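/* A 16-bit bias takes two int8 rows: put_bias() writes the low bytes to row 0 and the high bytes to row 1, hence bias_shape.n = 2. */ +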
ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1880v2_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_I8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_20(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 4; + u32 left_col = 1; + u32 left_c = 1; + u32 left_w = 1; + + u32 right_row = 1; + u32 right_col = 1; + u32 right_c = 1; + u32 right_w = 1; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1880v2_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_I8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_21(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 3; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 7; + u32 left_col = 477; + u32 left_c = 60; + u32 left_w = 8; + + u32 right_row = 477; + u32 right_col = 10; + u32 right_c = 3; + u32 right_w = 4; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1880v2_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_22(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 2; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 7; + u32 left_col = 23; + u32 left_c = 3; + u32 left_w = 8; + + u32 right_row = 23; + u32 right_col = 477; + u32 right_c = 60; + u32 right_w = 8; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1880v2_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_23(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 2; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 7; + u32 left_col = 23; + u32 left_c = 3; + u32 left_w = 8; + + 
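/* inner dimension: right_row must match left_col (23) for the multiply to be defined */ +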
u32 right_row = 23; + u32 right_col = 477; + u32 right_c = 60; + u32 right_w = 8; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1880v2_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_24(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = true; + + u32 left_row = 1; + u32 left_col = 1; + u32 left_c = 1; + u32 left_w = 1; + + u32 right_row = 1; + u32 right_col = 1; + u32 right_c = 1; + u32 right_w = 1; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1880v2_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_I8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_25(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = true; + + u32 left_row = 4; + u32 left_col = 1; + u32 left_c = 1; + u32 left_w = 1; + + u32 right_row = 1; + u32 right_col = 1; + u32 right_c = 1; + u32 right_w = 1; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1880v2_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_I8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_26(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = true; + + u32 left_row = 4; + u32 left_col = 1; + u32 left_c = 1; + u32 left_w = 1; + + u32 right_row = 1; + u32 right_col = 1; + u32 right_c = 1; + u32 right_w = 1; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1880v2_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_I8, 1); 
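+ /* add_result or a 16-bit result makes alloc_param_res() double res->shape.n (here both hold): the value is kept as two int8 rows, matching put_res()/get_res() */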
+ p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_27(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 3; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = true; + + u32 left_row = 7; + u32 left_col = 477; + u32 left_c = 60; + u32 left_w = 8; + + u32 right_row = 477; + u32 right_col = 10; + u32 right_c = 3; + u32 right_w = 4; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1880v2_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_28(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 2; + p.rshift_bits = 3; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = true; + + u32 left_row = 7; + u32 left_col = 477; + u32 left_c = 60; + u32 left_w = 8; + + u32 right_row = 477; + u32 right_col = 10; + u32 right_c = 3; + u32 right_w = 4; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1880v2_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_29(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 2; + p.rshift_bits = 3; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = true; + + u32 left_row = 7; + u32 left_col = 477; + u32 left_c = 60; + u32 left_w = 8; + + u32 right_row = 477; + u32 right_col = 10; + u32 right_c = 3; + u32 right_w = 4; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1880v2_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_30(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 2; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = true; + + u32 left_row = 7; + u32 left_col = 23; + u32 left_c = 3; + u32 left_w = 8; + + u32 right_row = 23; + u32 right_col = 477; + u32 right_c = 60; + u32 right_w = 8; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; 
+ right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1880v2_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_31(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 3; + p.rshift_bits = 2; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = true; + + u32 left_row = 7; + u32 left_col = 23; + u32 left_c = 3; + u32 left_w = 8; + + u32 right_row = 23; + u32 right_col = 477; + u32 right_c = 60; + u32 right_w = 8; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1880v2_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_32(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 6; + p.rshift_bits = 2; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = true; + + u32 left_row = 7; + u32 left_col = 23; + u32 left_c = 3; + u32 left_w = 8; + + u32 right_row = 23; + u32 right_col = 477; + u32 right_c = 60; + u32 right_w = 8; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1880v2_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_33(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 6; + p.rshift_bits = 2; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = true; + + u32 left_row = 7; + u32 left_col = 23; + u32 left_c = 3; + u32 left_w = 8; + + u32 right_row = 23; + u32 right_col = 477; + u32 right_c = 60; + u32 right_w = 8; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = bmk1880v2_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_34(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 1; + p.rshift_bits = 13; + p.res_is_int8 = true; + p.relu_enable = false; + p.add_result = 
true; + + u32 left_row = 7; + u32 left_col = 23; + u32 left_c = 3; + u32 left_w = 8; + + u32 right_row = 23; + u32 right_col = 477; + u32 right_c = 60; + u32 right_w = 8; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_U8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_U8, 1); + p.bias = bmk1880v2_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_35(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 1; + p.rshift_bits = 10; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = true; + + u32 left_row = 7; + u32 left_col = 23; + u32 left_c = 3; + u32 left_w = 8; + + u32 right_row = 23; + u32 right_col = 477; + u32 right_c = 60; + u32 right_w = 8; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_U8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_U8, 1); + p.bias = bmk1880v2_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_36(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 1; + p.rshift_bits = 10; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 7; + u32 left_col = 23; + u32 left_c = 3; + u32 left_w = 8; + + u32 right_row = 23; + u32 right_col = 477; + u32 right_c = 60; + u32 right_w = 8; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_U8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_U8, 1); + p.bias = bmk1880v2_lmem_alloc_matrix(bk_ctx, bias_shape, FMT_U8, 1); + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_37(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 1; + p.rshift_bits = 10; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 7; + u32 left_col = 23; + u32 left_c = 3; + u32 left_w = 8; + + u32 right_row = 23; + u32 right_col = 477; + u32 right_c = 60; + u32 right_w = 8; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_U8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_U8, 1); + p.bias = NULL; + 
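/* both operands are FMT_U8 and bias is NULL, so alloc_param_res() keeps the result unsigned */ +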
p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +static param_t param_38(bmk_ctx_t *bk_ctx) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 1; + p.rshift_bits = 6; + p.res_is_int8 = false; + p.relu_enable = false; + p.add_result = false; + + u32 left_row = 7; + u32 left_col = 23; + u32 left_c = 3; + u32 left_w = 8; + + u32 right_row = 23; + u32 right_col = 477; + u32 right_c = 60; + u32 right_w = 8; + + ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + p.left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_U8, 1); + p.right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1); + p.bias = NULL; + p.res = alloc_param_res(bk_ctx, &p); + + return p; +} + +#define test_one_param(n) \ + do { \ + param_t p = param_##n(bk_ctx); \ + test_param(&ctx, bk_ctx, &p); \ + destroy_param(bk_ctx, &p); \ + } while (0) + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_one_param(0); + test_one_param(1); + test_one_param(2); + test_one_param(3); + test_one_param(4); + test_one_param(5); + test_one_param(6); + test_one_param(7); + test_one_param(8); + test_one_param(9); + test_one_param(10); + test_one_param(11); + test_one_param(12); + test_one_param(13); + test_one_param(14); + test_one_param(15); + test_one_param(16); + test_one_param(17); + test_one_param(18); + test_one_param(19); + test_one_param(20); + test_one_param(21); + test_one_param(22); + test_one_param(23); + test_one_param(24); + test_one_param(25); + test_one_param(26); + test_one_param(27); + test_one_param(28); + test_one_param(29); + test_one_param(30); + test_one_param(31); + test_one_param(32); + test_one_param(33); + test_one_param(34); + test_one_param(35); + test_one_param(36); + test_one_param(37); + test_one_param(38); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_matrix_mac_ps32.cpp b/cviruntime/test/1880v2/test_1880v2_matrix_mac_ps32.cpp new file mode 100644 index 000000000..0d0ba0a79 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_matrix_mac_ps32.cpp @@ -0,0 +1,560 @@ +#include "1880v2_test_util.h" + +typedef bmk1880v2_tiu_matrix_multiplication_param_t param_t; + +typedef struct{ + fmt_t left_sign; + u32 left_row ; + u32 left_col ; + u32 left_c ; + u32 left_w ; + fmt_t right_sign; + u32 right_row ; + u32 right_col ; + u32 right_c ; + u32 right_w ; + u32 lshift_bits ; + u32 rshift_bits ; + u32 relu_enable ; + u32 using_bias; + fmt_t bias_sign; +} matrix_init_para_t; + +matrix_init_para_t matrix_para_t; + +static void make_bmk_matrix_param_ps32(bmk_ctx_t *bk_ctx, param_t *p, int ps32_mode); +static param_t param_init(); + +void print_param(param_t *p) +{ + printf("ps32_mode =%d\n",p->ps32_mode); + printf("left_shape.n =%d\n",p->left->shape.n); + printf("left_shape.col =%d\n",p->left->shape.col); + printf("left_shape.c =%d\n",p->left->shape.c); + printf("left_shape.w =%d\n",p->left->shape.w); + printf("left_fmt =%d\n",p->left->fmt); + printf("right_shape.n =%d\n",p->right->shape.n); + printf("right_shape.col =%d\n",p->right->shape.col); + printf("right_shape.c =%d\n",p->right->shape.c); + printf("right_shape.w =%d\n",p->right->shape.w); + printf("right_fmt =%d\n",p->right->fmt); + if(p->bias) + { + printf("bias_shape.n =%d\n",p->bias->shape.n); + printf("bias_shape.col 
=%d\n",p->bias->shape.col); + printf("bias_shape.c =%d\n",p->bias->shape.c); + printf("bias_shape.w =%d\n",p->bias->shape.w); + printf("bias_fmt =%d\n",p->bias->fmt); + } + printf("result_shape.n =%d\n",p->res->shape.n); + printf("result_shape.col =%d\n",p->res->shape.col); + printf("result_shape.c =%d\n",p->res->shape.c); + printf("result_shape.w =%d\n",p->res->shape.w); + printf("result_fmt =%d\n",p->res->fmt); + printf("relu_enable=%d\n",p->relu_enable); + printf("rshift_bits=%d\n",p->rshift_bits); +} + + +static u64 matrix_size(const ml_t *ml) +{ + u64 row = ml->shape.n; + u64 col = ml->shape.col; + return row * col; +} + +static u64 res_ps32_size(param_t *p) +{ + return matrix_size(p->res); +} + +static u64 res_size(param_t *p) +{ + if (p->res_is_int8 && !p->add_result) + return matrix_size(p->res); + else + return matrix_size(p->res) *2 ; +} + +static u8 * alloc_left(param_t *p) +{ + u64 size = matrix_size(p->left); + u8 *buf = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + buf[i] = i % 17 - 9; + + return buf; +} + +static u8 * alloc_right(param_t *p) +{ + u64 size = matrix_size(p->right); + + u8 *buf = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + buf[i] = i % 13 - 6; + + return buf; +} +static u16 * alloc_bias(param_t *p) +{ + if (!p->bias) + return NULL; + + u64 size = matrix_size(p->bias) / 2; + + u16 *buf = (u16 *)malloc(sizeof(u16) * size); + for (u64 i = 0; i < size; i++) + buf[i] = 5 - (i % 7); + + return buf; +} + +static u8 * alloc_ps32_res(param_t *p) +{ + u64 size = res_ps32_size(p)*4; + u8 *buf = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + buf[i] = 17 - (i % 35); + + return buf; +} + +static void right_shift(param_t *p, s32 *buf, u64 size) +{ + int shift_bits = p->rshift_bits; + int round_up = 1; + if (1) + arith_right_shift(buf, size, shift_bits, round_up); + else + logic_right_shift(buf, size, shift_bits, round_up); +} + +static int ps32_m2_matrix_mac_ref( + param_t *p, + u8 *left, + u8 *right, + u8 *res) +{ + u64 size = res_ps32_size(p); + u32 left_col = p->left->shape.col; + u32 right_col = p->right->shape.col; + u32 res_row = p->left->shape.n; + u32 res_col = p->res->shape.col; + int left_sign = (p->left->fmt == FMT_I8); + int right_sign = (p->right->fmt == FMT_I8); + int ret = BM_SUCCESS; + int bstride = res_row * res_col; + + s32 *tmp_res = (s32 *)malloc(sizeof(s32) * size); + for (u32 i = 0; i < res_row * res_col; i++) + tmp_res[i] = 0; + for (u32 row = 0; row < res_row; row++) { + for (u32 col = 0; col < res_col; col++) { + for (u32 i = 0; i < left_col; i++) { + u32 li = row * left_col + i; + u32 ri = i * right_col + col; + s32 l = left_sign? (s8)left[li]: left[li]; + s32 r = right_sign? 
(s8)right[ri]: right[ri]; + tmp_res[row * res_col + col] += l * r; + } + } + } + + for (u64 i = 0; i < size; i++) + res[i + bstride*0] = tmp_res[i]>>0; + for (u64 i = 0; i < size; i++) + res[i + bstride*1] = tmp_res[i]>>8; + for (u64 i = 0; i < size; i++) + res[i + bstride*2] = tmp_res[i]>>16; + for (u64 i = 0; i < size; i++) + res[i + bstride*3] = tmp_res[i]>>24; + + free(tmp_res); + + return ret; +} + +static int ps32_m3_matrix_mac_ref( + param_t *p, + u8 *left, + u8 *right, + u8 *res) +{ + u64 size = res_ps32_size(p); + u32 left_col = p->left->shape.col; + u32 right_col = p->right->shape.col; + u32 res_row = p->left->shape.n; + u32 res_col = p->res->shape.col; + int left_sign = (p->left->fmt == FMT_I8); + int right_sign = (p->right->fmt == FMT_I8); + int ret = BM_SUCCESS; + int bstride = res_row * res_col; + + u32 *tmp_res = (u32 *)malloc(sizeof(u32) * size); + + for (u64 i = 0; i < size; i++) + tmp_res[i] = res[i + bstride*0]; + for (u64 i = 0; i < size; i++) + tmp_res[i] |= res[i + bstride*1]<<8; + for (u64 i = 0; i < size; i++) + tmp_res[i] |= res[i + bstride*2]<<16; + for (u64 i = 0; i < size; i++) + tmp_res[i] |= res[i + bstride*3]<<24; + + for (u32 row = 0; row < res_row; row++) { + for (u32 col = 0; col < res_col; col++) { + for (u32 i = 0; i < left_col; i++) { + u32 li = row * left_col + i; + u32 ri = i * right_col + col; + s32 l = left_sign? (s8)left[li]: left[li]; + s32 r = right_sign? (s8)right[ri]: right[ri]; + tmp_res[row * res_col + col] += l * r; + } + } + } + + for (u64 i = 0; i < size; i++) + res[i + bstride*0] = tmp_res[i]>>0; + for (u64 i = 0; i < size; i++) + res[i + bstride*1] = tmp_res[i]>>8; + for (u64 i = 0; i < size; i++) + res[i + bstride*2] = tmp_res[i]>>16; + for (u64 i = 0; i < size; i++) + res[i + bstride*3] = tmp_res[i]>>24; + + free(tmp_res); + + return ret; +} + +static int ps32_m1_matrix_mac_ref( + param_t *p, + u8 *left, + u8 *right, + u16 * bias, + u8 *res) +{ + u64 size = res_ps32_size(p); + u32 left_col = p->left->shape.col; + u32 right_col = p->right->shape.col; + u32 res_row = p->left->shape.n; + u32 res_col = p->res->shape.col; + int left_sign = (p->left->fmt == FMT_I8); + int right_sign = (p->right->fmt == FMT_I8); + int res_sign = (p->res->fmt == FMT_I8); + int ret = BM_SUCCESS; + int bstride = res_row * res_col; + + s32 *tmp_res = (s32 *)malloc(sizeof(s32) * size); + + for (u64 i = 0; i < size; i++) + tmp_res[i] = res[i + bstride*0]; + for (u64 i = 0; i < size; i++) + tmp_res[i] |= res[i + bstride*1]<<8; + for (u64 i = 0; i < size; i++) + tmp_res[i] |= res[i + bstride*2]<<16; + for (u64 i = 0; i < size; i++) + tmp_res[i] |= res[i + bstride*3]<<24; + + for (u32 row = 0; row < res_row; row++) { + for (u32 col = 0; col < res_col; col++) { + for (u32 i = 0; i < left_col; i++) { + u32 li = row * left_col + i; + u32 ri = i * right_col + col; + s32 l = left_sign? (s8)left[li]: left[li]; + s32 r = right_sign? (s8)right[ri]: right[ri]; + tmp_res[row * res_col + col] += l * r; + } + } + } + + if (p->bias && bias) { + for (u32 row = 0; row < res_row; row++) { + for (u32 col = 0; col < res_col; col++) { + int bias_sign = (p->bias->fmt == FMT_I8); + s32 b = bias_sign? 
(s16)bias[col]: bias[col]; + tmp_res[row * res_col + col] += b; + } + } + } + + if (p->relu_enable) + relu(tmp_res, size); + right_shift(p, tmp_res, size); + if (p->res_is_int8) + saturate_to_int8(tmp_res, size, res_sign); + else + saturate_to_int16(tmp_res, size, res_sign); + + for (u64 i = 0; i < size; i++) + res[i + bstride*0] = tmp_res[i]>>0; + for (u64 i = 0; i < size; i++) + res[i + bstride*1] = tmp_res[i]>>8; + + free(tmp_res); + + return ret; +} + +static void put_bias( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + const ml_t *ml, + u16 data[]) +{ + u64 size = ml->shape.col; + + u8 *tmp = (u8 *)malloc(sizeof(u8) * size * 2); + if (!tmp) + return; + + for (u64 i = 0; i < size; i++) { + tmp[i] = data[i] & 0xff; + tmp[i + size] = (data[i] >> 8) & 0xff; + } + put_matrix_g2l(ctx, bk_ctx, ml, tmp); + + free(tmp); +} + + +static int test_matrix_ps32_ut(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, param_t *p) +{ + make_bmk_matrix_param_ps32(bk_ctx, p, 2); + u8 *left = alloc_left(p); + u8 *right = alloc_right(p); + u8 *ref = alloc_ps32_res(p); + + { + bmerr_t ret = ps32_m2_matrix_mac_ref(p, left, right, ref); + assert(ret == BM_SUCCESS); + + put_matrix_g2l(ctx, bk_ctx, p->left, left); + put_matrix_g2l(ctx, bk_ctx, p->right, right); + bmk1880v2_tiu_matrix_multiplication(bk_ctx, p); + bmk1880v2_matrix_lmem_t ps32_res; + ps32_res = *p->res; + ps32_res.shape.n *= sizeof(int); + u8 *res = get_matrix_l2g(ctx, bk_ctx, &ps32_res); + + int has_error = array_cmp_int8( + "Comparing begin_mode results ...\n", + (s8 *)ref, (s8 *)res ,(int)res_ps32_size(p)*sizeof(int)); + if (has_error) { + printf("Comparison M2 FAILED\n"); + print_param(p); + exit(-1); + }else + printf("Comparison M2 PASS\n"); + free(res); + } + + { + make_bmk_matrix_param_ps32(bk_ctx, p, 3); + + bmerr_t ret = ps32_m3_matrix_mac_ref(p, left, right, ref); + assert(ret == BM_SUCCESS); + + bmk1880v2_tiu_matrix_multiplication(bk_ctx, p); + bmk1880v2_matrix_lmem_t ps32_res; + ps32_res = *p->res; + ps32_res.shape.n *= sizeof(int); + u8 *res = get_matrix_l2g(ctx, bk_ctx, &ps32_res); + + int has_error = array_cmp_int8( + "Comparing m3 results ...\n", + (s8 *)ref, (s8 *)res ,(int)res_ps32_size(p)*sizeof(int)); + if (has_error) { + printf("Comparison M3 FAILED\n"); + print_param(p); + exit(-1); + }else + printf("Comparison M3 PASS\n"); + + free(res); + } + { + make_bmk_matrix_param_ps32(bk_ctx, p, 1); + u16 *bias = alloc_bias(p); + + bmerr_t ret = ps32_m1_matrix_mac_ref(p, left, right, bias, ref); + assert(ret == BM_SUCCESS); + + if(p->bias) + put_bias(ctx, bk_ctx, p->bias, bias); + + bmk1880v2_tiu_matrix_multiplication(bk_ctx, p); + bmk1880v2_matrix_lmem_t ps32_res; + ps32_res = *p->res; + ps32_res.shape.n *= 2; + + u8 *res = get_matrix_l2g(ctx, bk_ctx, &ps32_res); + int has_error = array_cmp_int8( + "Comparing m1 results ...\n", + (s8 *)ref, (s8 *)res ,(int)res_size(p)); + if (has_error) { + printf("Comparison M1 FAILED\n"); + print_param(p); + exit(-1); + }else + printf("Comparison M1 PASS\n"); + + free(res); + free(bias); + } + free(left); + free(right); + free(ref); + return 1; +} + +static void destroy_param(bmk_ctx_t *bk_ctx, param_t *p) +{ + if (p->bias) + bmk1880v2_lmem_free_matrix(bk_ctx, p->bias); + bmk1880v2_lmem_free_matrix(bk_ctx, p->res); + bmk1880v2_lmem_free_matrix(bk_ctx, p->right); + bmk1880v2_lmem_free_matrix(bk_ctx, p->left); +} + +static fmt_t modify_res_fmt() +{ + // Note: + // From 7/29/2019 update H/W relu design, + // res0_sign can be assigned and is not affected by relu. 
+ // Kernel will use result data format to set res0_sign. + fmt_t fmt = FMT_U8; + if (matrix_para_t.left_sign == FMT_I8) + fmt = FMT_I8; + if (matrix_para_t.right_sign == FMT_I8) + fmt = FMT_I8; + if (matrix_para_t.using_bias) + if (matrix_para_t.bias_sign == FMT_I8) + fmt = FMT_I8; + + return fmt; +} + +static ml_t *alloc_param_res( + bmk_ctx_t *bk_ctx, param_t *p) +{ + ml_shape_t s; + s.n = p->left->shape.n; + s.c = p->right->shape.c; + s.w = p->right->shape.w; + s.col = p->right->shape.col; + + fmt_t fmt = FMT_U8; + fmt = modify_res_fmt(); + return bmk1880v2_lmem_alloc_ps32_matrix(bk_ctx, s, fmt, 1); +} + + +static void make_bmk_matrix_param_ps32(bmk_ctx_t *bk_ctx, param_t *p, int ps32_mode) +{ + + ml_shape_t left_shape; + ml_shape_t right_shape; + + p->ps32_mode = ps32_mode; + p->relu_enable = 0; + p->lshift_bits = 0; + p->rshift_bits = 0; + + if(ps32_mode==2) + { + left_shape.n = matrix_para_t.left_row; + left_shape.c = matrix_para_t.left_c; + left_shape.w = matrix_para_t.left_w; + left_shape.col = matrix_para_t.left_col; + + right_shape.n = matrix_para_t.right_row; + right_shape.c = matrix_para_t.right_c; + right_shape.w = matrix_para_t.right_w; + right_shape.col = matrix_para_t.right_col; + p->left = bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, matrix_para_t.left_sign , 1); + p->right = bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, matrix_para_t.right_sign, 1); + p->bias = NULL; + p->res = alloc_param_res(bk_ctx, p); + }else if(ps32_mode==3) + { + + }else if(ps32_mode==1) + { + p->relu_enable = matrix_para_t.relu_enable; + p->rshift_bits = matrix_para_t.rshift_bits; + if(matrix_para_t.using_bias) + { + right_shape.n = matrix_para_t.right_row; + right_shape.c = matrix_para_t.right_c; + right_shape.w = matrix_para_t.right_w; + right_shape.col = matrix_para_t.right_col; + + ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + p->bias = bmk1880v2_lmem_alloc_matrix(bk_ctx, bias_shape, matrix_para_t.bias_sign, 1); + assert(p->bias); + } + } + +} +static param_t param_init(void) +{ + param_t p; + + //srand(clock()); + + memset(&p, 0, sizeof(param_t)); + memset(&matrix_para_t, 0, sizeof(matrix_init_para_t)); + + matrix_para_t.rshift_bits = rand()%4+2; + matrix_para_t.using_bias = rand()%2; + matrix_para_t.relu_enable = rand()%2; + matrix_para_t.right_sign = rand()%2? FMT_I8 : FMT_U8; + matrix_para_t.left_sign = rand()%2? FMT_I8 : FMT_U8; + + if(matrix_para_t.using_bias) + matrix_para_t.bias_sign = rand()%2? FMT_I8 : FMT_U8; + + if(matrix_para_t.right_sign != FMT_I8 && matrix_para_t.left_sign != FMT_I8) + matrix_para_t.relu_enable=0; + + matrix_para_t.left_row = rand()%60+1; + matrix_para_t.left_col = rand()%40+1; + matrix_para_t.left_w = matrix_para_t.left_col/0x10 ? rand()%8+8 : matrix_para_t.left_col; + //matrix_para_t.left_w = rand()%16+1; + matrix_para_t.left_c = + matrix_para_t.left_col%matrix_para_t.left_w? + matrix_para_t.left_col/matrix_para_t.left_w+1 : matrix_para_t.left_col/matrix_para_t.left_w; + + matrix_para_t.right_row = matrix_para_t.left_col; + matrix_para_t.right_col = rand()%50+1; + //matrix_para_t.right_w = 16; + matrix_para_t.right_w = rand()%16+1; + matrix_para_t.right_c = + matrix_para_t.right_col%matrix_para_t.right_w? 
+      matrix_para_t.right_col/matrix_para_t.right_w+1 : matrix_para_t.right_col/matrix_para_t.right_w;
+
+  return p;
+}
+
+int main()
+{
+  CVI_RT_HANDLE ctx;
+  bmk_ctx_t *bk_ctx;
+  test_init(&ctx, &bk_ctx);
+
+  int test_finished_num = 0;
+  for (int i = 0; i < 20; i++) {
+    printf("random_test_matrix_ps32 iteration: %d\n", i);
+    param_t p = param_init();
+
+    test_finished_num += test_matrix_ps32_ut(&ctx, bk_ctx, &p);
+    destroy_param(bk_ctx, &p);
+  }
+  printf("test_finished_num: %d\n", test_finished_num);
+
+  test_exit(&ctx);
+  return 0;
+}
diff --git a/cviruntime/test/1880v2/test_1880v2_matrix_mac_qdm.cpp b/cviruntime/test/1880v2/test_1880v2_matrix_mac_qdm.cpp
new file mode 100644
index 000000000..fb1941cf8
--- /dev/null
+++ b/cviruntime/test/1880v2/test_1880v2_matrix_mac_qdm.cpp
@@ -0,0 +1,846 @@
+#include <limits.h>
+#include "1880v2_test_util.h"
+#include "test_tf_quant_util.h"
+
+// #define ENABLE_DEBUG_MSG
+// #define ENABLE_TV_GEN_PATTERN
+
+#define MIN_EXEC_TESTS 20
+
+using param_t = bmk1880v2_tiu_matrix_multiplication_qdm_param_t;
+
+typedef struct {
+  int left_row;
+  int left_col;
+  int right_col;
+  int has_bias;
+  int relu_enable;
+  s8 *input_data;
+  s8 *filter_data;
+  s8 *output_data;
+  s32 *bias_data;
+  u32 multiplier;
+  s8 right_shift;
+  float float_multiplier;
+  int retry_cnt;
+} fc_test_param_t;
+
+void fully_connected_ref(fc_test_param_t *p_param)
+{
+  const s32 input_offset = 0;
+  const s32 filter_offset = 0;
+  const s32 output_offset = 0;
+  const s32 output_multiplier = p_param->multiplier;
+  const int output_rshift = p_param->right_shift;
+  const int batches = p_param->left_row;
+  const int output_depth = p_param->right_col;
+  const int accum_depth = p_param->left_col;
+  s8 *input_data = p_param->input_data;
+  s8 *filter_data = p_param->filter_data;
+  s8 *output_data = p_param->output_data;
+  s32 *bias_data = p_param->has_bias ? p_param->bias_data : nullptr;
+
+  const s32 output_activation_min = -128;
+  const s32 output_activation_max = 127;
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("fully_connected_ref:\n");
+  printf("  batches %d, output_depth %d, accum_depth %d, filter_offset %d, "
+         "input_offset %d\n",
+         batches, output_depth, accum_depth, filter_offset, input_offset);
+  printf("  output_multiplier %d, output_rshift %d\n", output_multiplier,
+         output_rshift);
+#endif
+
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      s32 acc = 0;
+      for (int d = 0; d < accum_depth; ++d) {
+        s32 input_val = input_data[b * accum_depth + d];
+        // s32 filter_val = filter_data[out_c * accum_depth + d];
+        s32 filter_val = filter_data[output_depth * d + out_c];
+        acc += (filter_val + filter_offset) * (input_val + input_offset);
+
+#ifdef ENABLE_DEBUG_MSG
+        printf("  [%d][%d][%d] acc(%d) += (%d + %d) * (%d + %d) = %d\n", b,
+               out_c, d,
+               acc - (filter_val + filter_offset) * (input_val + input_offset),
+               filter_val, filter_offset, input_val, input_offset, acc);
+#endif
+      }
+      if (bias_data) {
+        acc += bias_data[out_c];
+
+#ifdef ENABLE_DEBUG_MSG
+        printf("  [%d][%d] acc %d, bias %d\n", b, out_c, acc,
+               bias_data ?
+               bias_data[out_c] : 0);
+#endif
+      }
+      acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_rshift);
+
+#ifdef ENABLE_DEBUG_MSG
+      printf("  [%d][%d] acc %d, output_multiplier %d, output_rshift %d\n", b,
+             out_c, acc, output_multiplier, output_rshift);
+#endif
+
+      acc += output_offset;
+
+#ifdef ENABLE_DEBUG_MSG
+      printf("  [%d][%d] acc %d, output_offset %d\n", b, out_c, acc,
+             output_offset);
+#endif
+
+      acc = MAX(acc, output_activation_min);
+      acc = MIN(acc, output_activation_max);
+
+#ifdef ENABLE_DEBUG_MSG
+      printf("  [%d][%d] acc %d, output_activation_min %d, "
+             "output_activation_max %d\n",
+             b, out_c, acc, output_activation_min, output_activation_max);
+#endif
+
+      output_data[out_c + output_depth * b] = static_cast<s8>(acc);
+    }
+  }
+}
+
+void calc_fc_float_multiplier(fc_test_param_t *p_param)
+{
+  const s32 input_offset = 0;
+  const s32 filter_offset = 0;
+  const int batches = p_param->left_row;
+  const int output_depth = p_param->right_col;
+  const int accum_depth = p_param->left_col;
+  s8 *input_data = p_param->input_data;
+  s8 *filter_data = p_param->filter_data;
+  s32 *bias_data = p_param->has_bias ? p_param->bias_data : nullptr;
+
+  int output_accu_min = INT_MAX;
+  int output_accu_max = INT_MIN;
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("calc_fc_float_multiplier:\n");
+#endif
+
+  for (int b = 0; b < batches; ++b) {
+    for (int out_c = 0; out_c < output_depth; ++out_c) {
+      s32 acc = 0;
+      for (int d = 0; d < accum_depth; ++d) {
+        s32 input_val = input_data[b * accum_depth + d];
+        // s32 filter_val = filter_data[out_c * accum_depth + d];
+        s32 filter_val = filter_data[output_depth * d + out_c];
+        acc += (filter_val + filter_offset) * (input_val + input_offset);
+      }
+      if (bias_data) {
+        acc += bias_data[out_c];
+      }
+
+      output_accu_max = MAX(acc, output_accu_max);
+      output_accu_min = MIN(acc, output_accu_min);
+    }
+  }
+
+  // Since int8 ranges from -128 to 127, we need to squeeze the accumulator
+  // min/max into that range as much as possible.
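+  //
+  // Worked example (illustrative values, not from the random data): if the
+  // accumulators span [-20000, 12000], |min| dominates, so
+  // float_multiplier = 128.0 / 20000 = 0.0064; scaling any accumulator by
+  // it keeps the result inside the int8 range [-128, 127].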
+  if (abs(output_accu_max) > abs(output_accu_min)) {
+    p_param->float_multiplier = 127.0f / abs(output_accu_max);
+  } else {
+    p_param->float_multiplier = 128.0f / abs(output_accu_min);
+  }
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("  output_accu_min %d, output_accu_max %d, output_multiplier %f\n",
+         output_accu_min, output_accu_max, p_param->float_multiplier);
+#endif
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("<= calc_fc_float_multiplier\n");
+#endif
+}
+
+static void put_bias32(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, const ml_t *ml,
+                       s32 data[])
+{
+  u64 size = ml->shape.col;
+
+  u8 *tmp = (u8 *)malloc(size * 4);
+  if (!tmp)
+    return;
+
+  for (u64 i = 0; i < size; i++) {
+    u32 val = static_cast<u32>(data[i]);
+    tmp[i] = val & 0xff;
+    tmp[i + size] = (val >> 8) & 0xff;
+    tmp[i + 2 * size] = (val >> 16) & 0xff;
+    tmp[i + 3 * size] = (val >> 24) & 0xff;
+  }
+
+  put_matrix_g2l(ctx, bk_ctx, ml, tmp);
+
+  free(tmp);
+}
+
+#if 0
+typedef struct {
+  s32 input_offset;
+  s32 weights_offset;
+  s32 output_offset;
+  s32 output_multiplier;
+  int output_rshift;
+} FullyConnectedParams;
+
+int tfl_original_test()
+{
+  int ret = 0;
+
+  // 2x10
+  s8 input_data[20] = {
+      1, 3, 5, 7, 9, 11, 13, 15, -19, -21,
+      1, 3, 5, 7, 9, 11, 13, -17, 17, -21};
+
+  // 3x10
+  s8 filter_data[30] = {
+      1, 3, 5, 7, 9, 11, 13, 15, 17, 19,
+      1, 3, 5, 7, 9, 11, 13, 15, 17, 19,
+      1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
+
+  // 1x3
+  s32 bias_data[3] = {4, 8, 12};
+
+  // 2x3
+  s8 ref_output_data[6] = {
+      23, 24, 25,
+      57, 58, 59};
+
+  s8 output_rshift = 1;  // change to right shift
+  u32 output_multiplier = 1073741824;
+
+  s32 input_offset = 1;
+  s32 filter_offset = 1;
+  s32 output_offset = 1;  // change to right shift
+
+  FullyConnectedParams params;
+  params.input_offset = input_offset;
+  params.weights_offset = filter_offset;
+  params.output_offset = output_offset;
+  params.output_multiplier = output_multiplier;
+  params.output_rshift = output_rshift;
+
+  tl_shape_t input_shape = {2, 10, 1, 1};
+  tl_shape_t filter_shape = {3, 10, 1, 1};
+  tl_shape_t bias_shape = {1, 3, 1, 1};
+  tl_shape_t output_shape = {2, 3, 1, 1};
+
+  s8 output_data[6];
+  fully_connected_ref(params, input_shape,
+                      input_data, filter_shape,
+                      filter_data, bias_shape,
+                      bias_data, output_shape,
+                      output_data);
+  for (int i = 0; i < 6; i++) {
+    if (output_data[i] != ref_output_data[i]) {
+      printf("  output_data[%d] %d != %d\n",
+             i, output_data[i], ref_output_data[i]);
+      ret = -1;
+    }
+  }
+
+  return ret;
+}
+#endif
+
+int simple_test(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx)
+{
+  int ret = 0;
+
+  // 2x10
+  s8 input_data[20] = {1, 3, 5, 7, 9, 11, 13, 15, -19, -21,
+                       1, 3, 5, 7, 9, 11, 13, -17, 17, -21};
+
+#if 0
+  // 3x10
+  // tfl use transposed filter
+  s8 filter_data_tp[30] = {
+      1, 3, 5, 7, 9, 11, 13, 15, 17, 19,
+      1, 3, 5, 7, 9, 11, 13, 15, 17, 19,
+      1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
+#endif
+
+  // 10x3
+  s8 filter_data[30] = {1, 1, 1, 3, 3, 3, 5, 5, 5, 7,
+                        7, 7, 9, 9, 9, 11, 11, 11, 13, 13,
+                        13, 15, 15, 15, 17, 17, 17, 19, 19, 19};
+
+  // 1x3
+  s32 bias_data[3] = {4, 8, 12};
+
+  // 2x3, input/kernel/output zero_point = 0
+  s8 ref_output_data[6] = {-10, -9, -8, 24, 25, 26};
+  s8 output_data[6];
+
+  s8 output_rshift = 1;  // change to right shift
+  u32 output_multiplier = 1073741824;
+
+  int left_row = 2;
+  int left_col = 10;
+  int right_col = 3;
+
+  fc_test_param_t params;
+  memset(&params, 0, sizeof(params));
+  params.left_row = left_row;
+  params.left_col = left_col;
+  params.right_col = right_col;
+  params.has_bias = 1;
+  params.relu_enable = 0;
+  params.input_data = input_data;
+  params.filter_data = filter_data;
+  params.output_data = output_data;
+  params.bias_data = bias_data;
+  params.multiplier = output_multiplier;
+  params.right_shift = output_rshift;
+  fully_connected_ref(&params);
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("Compare ref and golden\n");
+#endif
+  for (int i = 0; i < 6; i++) {
+    if (output_data[i] != ref_output_data[i]) {
+      printf("  output_data[%d] %d(ref) != %d(golden)\n", i, output_data[i],
+             ref_output_data[i]);
+      ret = -1;
+    }
+  }
+
+  ml_shape_t left_shape =
+      bmk1880v2_matrix_lmem_default_shape(bk_ctx, left_row, left_col, FMT_I8);
+
+  ml_shape_t right_shape =
+      bmk1880v2_matrix_lmem_default_shape(bk_ctx, left_col, right_col, FMT_I8);
+
+  ml_shape_t b_shape =
+      bmk1880v2_matrix_lmem_default_shape(bk_ctx, 4, right_col, FMT_I8);  // 32bit
+
+  ml_shape_t y_shape =
+      bmk1880v2_matrix_lmem_default_shape(bk_ctx, left_row, right_col, FMT_I8);
+
+  bmk1880v2_matrix_lmem_t *tl_left =
+      bmk1880v2_lmem_alloc_matrix(bk_ctx, left_shape, FMT_I8, 1);
+  bmk1880v2_matrix_lmem_t *tl_right =
+      bmk1880v2_lmem_alloc_matrix(bk_ctx, right_shape, FMT_I8, 1);
+  bmk1880v2_matrix_lmem_t *tl_b =
+      bmk1880v2_lmem_alloc_matrix(bk_ctx, b_shape, FMT_I8, 1);
+  bmk1880v2_matrix_lmem_t *tl_y =
+      bmk1880v2_lmem_alloc_matrix(bk_ctx, y_shape, FMT_I8, 1);
+
+  put_matrix_g2l(ctx, bk_ctx, tl_left, reinterpret_cast<u8 *>(input_data));
+  put_matrix_g2l(ctx, bk_ctx, tl_right, reinterpret_cast<u8 *>(filter_data));
+  put_bias32(ctx, bk_ctx, tl_b, bias_data);
+
+  {
+    param_t p;
+    memset(&p, 0, sizeof(p));
+    p.left = tl_left;
+    p.right = tl_right;
+    p.bias = tl_b;
+    p.res = tl_y;
+    p.rshift_bits = output_rshift;
+    p.res_is_int8 = 1;
+    p.ps32_mode = 0;
+    p.quan_m = output_multiplier;
+    bmk1880v2_tiu_matrix_multiplication_qdm(bk_ctx, &p);
+  }
+
+  s8 *tiu_output_data =
+      reinterpret_cast<s8 *>(get_matrix_l2g(ctx, bk_ctx, tl_y));
+#ifdef ENABLE_DEBUG_MSG
+  printf("Compare tiu and ref\n");
+#endif
+  for (int i = 0; i < 6; i++) {
+    if (tiu_output_data[i] != ref_output_data[i]) {
+      printf("  output_data[%d] %d(tiu) != %d(ref)\n", i, tiu_output_data[i],
+             ref_output_data[i]);
+      ret = -1;
+    }
+  }
+
+  free(tiu_output_data);
+
+  bmk1880v2_lmem_free_matrix(bk_ctx, tl_y);
+  bmk1880v2_lmem_free_matrix(bk_ctx, tl_b);
+  bmk1880v2_lmem_free_matrix(bk_ctx, tl_right);
+  bmk1880v2_lmem_free_matrix(bk_ctx, tl_left);
+
+  return ret;
+}
+
+int choose_from_range(int table[], int size, int index)
+{
+  if (index >= size) {
+    return 0;
+  }
+
+  int val = table[index];
+  if (index < (size - 1)) {
+    int range = MAX(table[index + 1] - table[index] - 1, 1);
+    val += rand() % range;
+  }
+
+  return val;
+}
+
+bool check_valid_test_param(bmk_ctx_t *bk_ctx, fc_test_param_t *p_param)
+{
+  int left_row = p_param->left_row;
+  int left_col = p_param->left_col;
+  int right_col = p_param->right_col;
+  int has_bias = p_param->has_bias;
+
+  bmk1880v2_matrix_lmem_shape_t tl_input_shape =
+      bmk1880v2_matrix_lmem_default_shape(bk_ctx, left_row, left_col, FMT_I8);
+  bmk1880v2_matrix_lmem_stride_t tl_input_stride =
+      bmk1880v2_matrix_lmem_default_stride(bk_ctx, tl_input_shape, FMT_I8, 1);
+
+  bmk1880v2_matrix_lmem_shape_t tl_filter_shape =
+      bmk1880v2_matrix_lmem_default_shape(bk_ctx, left_col, right_col, FMT_I8);
+  bmk1880v2_matrix_lmem_stride_t tl_filter_stride =
+      bmk1880v2_matrix_lmem_default_stride(bk_ctx, tl_filter_shape, FMT_I8, 1);
+
+  bmk1880v2_matrix_lmem_shape_t tl_output_shape =
+      bmk1880v2_matrix_lmem_default_shape(bk_ctx, left_row, right_col, FMT_I8);
+  bmk1880v2_matrix_lmem_stride_t tl_output_stride =
+      bmk1880v2_matrix_lmem_default_stride(bk_ctx, tl_output_shape, FMT_I8, 1);
+
+  u32 bias_size = 0;
+  if (has_bias) {
+    bmk1880v2_matrix_lmem_shape_t tl_bias_shape =
+        bmk1880v2_matrix_lmem_default_shape(bk_ctx, 4, right_col, FMT_I8);  // 32bit
+    bmk1880v2_matrix_lmem_stride_t tl_bias_stride =
+        bmk1880v2_matrix_lmem_default_stride(bk_ctx, tl_bias_shape, FMT_I8, 1);
+    bias_size = tl_bias_shape.n * tl_bias_stride.n;
+  }
+
+  bmk1880v2_chip_info_t chip_info = bmk1880v2_chip_info();
+  u32 lmem_size_per_lane = chip_info.lmem_size;
+  // u32 total_lmem_size = chip_info.lmem_size * chip_info.npu_num;
+
+  u32 needed_size = tl_input_shape.n * tl_input_stride.n +
+                    tl_filter_shape.n * tl_filter_stride.n +
+                    tl_output_shape.n * tl_output_stride.n + bias_size;
+
+  if (needed_size > lmem_size_per_lane) {
+    return false;
+  }
+
+  return true;
+}
+
+void fill_random_data_s8(s8 *input_data, int size)
+{
+  for (int i = 0; i < size; ++i) {
+    int is_satured = ((rand() % 1000) == 1) ? 1 : 0;
+    int is_sign = rand() % 2 ? 1 : -1;
+
+    if (is_satured && is_sign < 0) {
+      input_data[i] = -128;
+    } else if (is_satured) {
+      input_data[i] = 127;
+    } else {
+      input_data[i] = is_sign * rand() % 128;
+    }
+  }
+}
+
+void fill_random_data_s32(s32 *input_data, int size)
+{
+  for (int i = 0; i < size; ++i) {
+    int is_satured = ((rand() % 1000) == 1) ? 1 : 0;
+    int is_sign = rand() % 2 ? 1 : -1;
+
+    if (is_satured && is_sign < 0) {
+      input_data[i] = INT_MIN;
+    } else if (is_satured) {
+      input_data[i] = INT_MAX;
+    } else {
+      input_data[i] = is_sign * rand() % 128;
+    }
+  }
+}
+
+void dump_test_param(fc_test_param_t *p_param, bool dump_content)
+{
+  printf("Dump test parameter:\n");
+  printf("  left_row %d\n", p_param->left_row);
+  printf("  left_col %d\n", p_param->left_col);
+  printf("  right_col %d\n", p_param->right_col);
+  printf("  has_bias %d\n", p_param->has_bias);
+  printf("  multiplier %d\n", p_param->multiplier);
+  printf("  right_shift %d\n", p_param->right_shift);
+
+  if (dump_content) {
+    printf("input_data(%d, %d)\n", p_param->left_row, p_param->left_col);
+    int left_row = p_param->left_row;
+    int left_col = p_param->left_col;
+    for (int i = 0; i < left_row; ++i) {
+      for (int j = 0; j < left_col; ++j) {
+        int offset = i * left_col + j;
+        printf("%d, ", p_param->input_data[offset]);
+      }
+      printf("\n");
+    }
+    printf("\n\n");
+
+    int right_col = p_param->right_col;
+    printf("kernel_data (%d, %d)\n", left_col, right_col);
+    for (int i = 0; i < left_col; ++i) {
+      for (int j = 0; j < right_col; ++j) {
+        int offset = i * right_col + j;
+        printf("%d, ", p_param->filter_data[offset]);
+      }
+      printf("\n");
+    }
+    printf("\n\n");
+
+    if (p_param->has_bias) {
+      for (int i = 0; i < right_col; ++i) {
+        printf("%d, ", p_param->bias_data[i]);
+      }
+      printf("\n\n");
+    }
+  }
+}
+
+int run_compare_fc(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, fc_test_param_t *p_param)
+{
+  int ret = 0;
+
+  int left_row = p_param->left_row;
+  int left_col = p_param->left_col;
+  int right_col = p_param->right_col;
+  int has_bias = p_param->has_bias;
+
+  int input_size = left_row * left_col;
+  s8 *input_data = (s8 *)malloc(input_size);
+
+  int kernel_size = left_col * right_col;
+  s8 *kernel_data = (s8 *)malloc(kernel_size);
+
+  int output_size = left_row * right_col;
+  s8 *output_data = (s8 *)malloc(output_size);
+
+  s32 *bias_data = (s32 *)malloc(sizeof(s32) * right_col);
+
+  p_param->input_data = input_data;
+  p_param->filter_data = kernel_data;
+  p_param->output_data = output_data;
+  p_param->bias_data = bias_data;
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("  run_compare_fc =>\n");
+  printf("    left (%d, %d), right (%d, %d), has_bias %d\n", left_row,
+           left_col, left_col, right_col, has_bias);
+#endif
+
+  int retry_cnt = p_param->retry_cnt;
+  do {
+    fill_random_data_s8(input_data, input_size);
+    fill_random_data_s8(kernel_data, kernel_size);
+    if (has_bias) {
+      fill_random_data_s32(bias_data, right_col);
+    }
+
+    p_param->float_multiplier = 100.0;  // should be < 1.0
+    calc_fc_float_multiplier(p_param);
+
+    if (p_param->float_multiplier > 0.f && p_param->float_multiplier < 1.0) {
+      break;
+    }
+
+  } while (--retry_cnt);
+
+  if (p_param->float_multiplier >= 1.0) {
+    printf("  run_compare_fc: unable to find valid multiplier\n");
+    free(input_data);
+    free(kernel_data);
+    free(output_data);
+    free(bias_data);
+    return -1;
+  }
+
+  u32 base_multiplier = 0;
+  int base_shift = 0;
+  QuantizeMultiplierSmallerThanOne(p_param->float_multiplier, &base_multiplier,
+                                   &base_shift);
+
+  // multipliers typically range in [2^30 ; 2^31 - 1].
+  // Values in [0, 2^30 - 1] are normally unused, but harmless.
+  // Thus a good way to randomize multipliers is to subtract from them
+  // a random value smaller than 2^30 but still significant compared to it.
+  u32 output_multiplier = base_multiplier - (rand() % (1 << 26));
+
+  // Our H/W only supports right shift
+  int right_shift = base_shift - 1 + (rand() % 4);
+  u8 output_right_shift = right_shift > 0 ? right_shift : 0;
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("  multiplier_data %d, shift_data %d\n", output_multiplier,
+         output_right_shift);
+#endif
+
+  p_param->multiplier = output_multiplier;
+  p_param->right_shift = output_right_shift;
+  fully_connected_ref(p_param);
+
+  bmk1880v2_matrix_lmem_shape_t tl_input_shape =
+      bmk1880v2_matrix_lmem_default_shape(bk_ctx, left_row, left_col, FMT_I8);
+
+  bmk1880v2_matrix_lmem_shape_t tl_filter_shape =
+      bmk1880v2_matrix_lmem_default_shape(bk_ctx, left_col, right_col, FMT_I8);
+
+  bmk1880v2_matrix_lmem_shape_t tl_output_shape =
+      bmk1880v2_matrix_lmem_default_shape(bk_ctx, left_row, right_col, FMT_I8);
+
+  bmk1880v2_matrix_lmem_shape_t tl_bias_shape =
+      bmk1880v2_matrix_lmem_default_shape(bk_ctx, 4, right_col, FMT_I8);  // 32bit
+
+  bmk1880v2_matrix_lmem_t *tl_input = bmk1880v2_lmem_alloc_matrix(
+      bk_ctx, tl_input_shape, FMT_I8, /*eu_align=*/1);
+  bmk1880v2_matrix_lmem_t *tl_filter = bmk1880v2_lmem_alloc_matrix(
+      bk_ctx, tl_filter_shape, FMT_I8, /*eu_align=*/1);
+  bmk1880v2_matrix_lmem_t *tl_output = bmk1880v2_lmem_alloc_matrix(
+      bk_ctx, tl_output_shape, FMT_I8, /*eu_align=*/1);
+
+  bmk1880v2_matrix_lmem_t *tl_bias = nullptr;
+  if (has_bias) {
+    tl_bias = bmk1880v2_lmem_alloc_matrix(bk_ctx, tl_bias_shape, FMT_I8,
+                                          /*eu_align=*/1);
+  }
+
+  if (tl_input == nullptr) {
+    printf("  fail to alloc tl_input (%d, %d)\n", left_row, left_col);
+    return -1;
+  }
+  if (tl_filter == nullptr) {
+    printf("  fail to alloc tl_filter (%d, %d)\n", left_col, right_col);
+    return -1;
+  }
+  if (tl_output == nullptr) {
+    printf("  fail to alloc tl_output (%d, %d)\n", left_row, right_col);
+    return -1;
+  }
+  if (has_bias && (tl_bias == nullptr)) {
+    printf("  fail to alloc bias (%d, %d)\n", 4, right_col);
+    return -1;
+  }
+
+  put_matrix_g2l(ctx, bk_ctx, tl_input, reinterpret_cast<u8 *>(input_data));
+  put_matrix_g2l(ctx, bk_ctx, tl_filter, reinterpret_cast<u8 *>(kernel_data));
+  if (tl_bias) {
+    put_bias32(ctx, bk_ctx, tl_bias, bias_data);
+  }
+
+  {
+    param_t p;
+    memset(&p, 0, sizeof(p));
+    p.left = tl_input;
+    p.right = tl_filter;
+    p.bias = tl_bias;
+    p.res = tl_output;
+    p.rshift_bits = output_right_shift;
+    p.res_is_int8 = 1;
+    p.ps32_mode = 0;
+    p.quan_m = output_multiplier;
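+    // The TIU op is configured to mirror fully_connected_ref: quan_m is a
+    // Q31 fixed-point multiplier and rshift_bits an arithmetic right shift,
+    // so the result is roughly sat_i8(round(acc * quan_m / 2^31) >> rshift),
+    // per the TFLite-style scheme in test_tf_quant_util.h (as understood
+    // from the reference code above).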
+    bmk1880v2_tiu_matrix_multiplication_qdm(bk_ctx, &p);
+  }
+
+  s8 *tiu_output_data =
+      reinterpret_cast<s8 *>(get_matrix_l2g(ctx, bk_ctx, tl_output));
+#ifdef ENABLE_DEBUG_MSG
+  printf("Compare tiu and ref\n");
+#endif
+  for (int i = 0; i < left_row; ++i) {
+    for (int j = 0; j < right_col; ++j) {
+      int offset = i * right_col + j;
+      if (tiu_output_data[offset] != output_data[offset]) {
+        printf("  output_data[%d][%d] %d(tiu) != %d(ref)\n", i, j,
+               tiu_output_data[offset], output_data[offset]);
+        ret = -1;
+      }
+    }
+  }
+
+  if (ret) {
+    dump_test_param(p_param, /*dump_content=*/true);
+  }
+
+  // Reverse order
+  if (tl_bias) {
+    bmk1880v2_lmem_free_matrix(bk_ctx, tl_bias);
+  }
+
+  bmk1880v2_lmem_free_matrix(bk_ctx, tl_output);
+  bmk1880v2_lmem_free_matrix(bk_ctx, tl_filter);
+  bmk1880v2_lmem_free_matrix(bk_ctx, tl_input);
+
+  free(tiu_output_data);
+
+  free(input_data);
+  free(kernel_data);
+  free(output_data);
+  free(bias_data);
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("  <= run_compare_fc, ret %d\n", ret);
+#endif
+
+  return ret;
+}
+
+int random_test(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx)
+{
+  int ret = 0;
+
+#if 0
+  int left_row_range[] = {1};
+  int left_col_range[] = {1};
+  int right_col_range[] = {1};
+#else
+#ifndef ENABLE_TV_GEN_PATTERN
+  int left_row_range[] = {1, 16, 32, 64, 128, 256, 1024, 2048, 4095};
+  int left_col_range[] = {1, 16, 32, 64, 128, 256, 1024, 2048, 4095};
+  int right_col_range[] = {1, 16, 32, 64, 128, 256, 1024, 2048, 4095};
+#else
+  // TV_GEN pattern
+  // Random Test, total 27, skipped 86, executed 22, failed 0, ret 0
+
+  int left_row_range[] = {1, 16, 4095};
+  int left_col_range[] = {1, 16, 4095};
+  int right_col_range[] = {1, 16, 4095};
+#endif
+#endif
+
+  const int left_row_range_size =
+      sizeof(left_row_range) / sizeof(left_row_range[0]);
+  const int left_col_range_size =
+      sizeof(left_col_range) / sizeof(left_col_range[0]);
+  const int right_col_range_size =
+      sizeof(right_col_range) / sizeof(right_col_range[0]);
+
+  int random_seed = clock();
+  srand(random_seed);
+
+  const int retry_test_count = 100;
+  bool stop_at_first_error = true;
+
+  int total_tests =
+      left_row_range_size * left_col_range_size * right_col_range_size;
+  int skipped_tests = 0;
+  int executed_tests = 0;
+  int failed_tests = 0;
+  int current_test = 0;
+
+  printf("Random Test =>\n");
+  for (int m = 0; m < retry_test_count; ++m) {
+    for (int i = 0; i < left_row_range_size; ++i) {
+      // randomly chosen from [range[i] : range[i+1]]
+      int left_row = choose_from_range(left_row_range, left_row_range_size, i);
+
+      for (int j = 0; j < left_col_range_size; ++j) {
+        int left_col =
+            choose_from_range(left_col_range, left_col_range_size, j);
+
+        for (int k = 0; k < right_col_range_size; ++k) {
+          int right_col =
+              choose_from_range(right_col_range, right_col_range_size, k);
+
+#ifdef ENABLE_DEBUG_MSG
+          printf("  [%d/%d] random test: left(%d, %d), right (%d, %d)\n",
+                 current_test, total_tests, left_row, left_col, left_col,
+                 right_col);
+#else
+          if ((current_test % 100) == 0) {
+            printf("  [%d/%d] random test: left(%d, %d), right (%d, %d)\n",
+                   current_test, total_tests, left_row, left_col, left_col,
+                   right_col);
+          }
+#endif
+
+          current_test++;
+
+          int has_bias = rand() % 2;
+
+          fc_test_param_t test_param;
+          memset(&test_param, 0, sizeof(test_param));
+          test_param.left_row = left_row;
+          test_param.left_col = left_col;
+          test_param.right_col = right_col;
+          test_param.has_bias = has_bias;
+          test_param.retry_cnt = 5;
+
+          bool is_valid_param = check_valid_test_param(bk_ctx, &test_param);
+          if (is_valid_param == false) {
+            skipped_tests++;
+#ifdef ENABLE_DEBUG_MSG
+            printf("  [%d/%d] random test: invalid parameter, skip\n",
+                   current_test, total_tests);
+#endif
+            continue;
+          }
+
+          int ret2 = run_compare_fc(ctx, bk_ctx, &test_param);
+          failed_tests = ret2 ? failed_tests + 1 : failed_tests;
+          ret |= ret2;
+          executed_tests++;
+
+#ifdef ENABLE_DEBUG_MSG
+          printf("  [%d/%d] random test: left(%d, %d), right (%d, %d), result "
+                 "%d\n",
+                 current_test, total_tests, left_row, left_col, left_col,
+                 right_col, ret2);
+#endif
+        }
+
+        // Stop at first error
+        if (ret && stop_at_first_error) {
+          break;
+        }
+      }
+
+      // Stop at first error
+      if (ret && stop_at_first_error) {
+        break;
+      }
+    }
+
+    // Stop at first error
+    if (ret && stop_at_first_error) {
+      break;
+    }
+
+    if (executed_tests >= MIN_EXEC_TESTS) {
+      break;
+    }
+  }
+
+  printf(
+      "<= Random Test, total %d, skipped %d, executed %d, failed %d, ret %d\n",
+      total_tests, skipped_tests, executed_tests, failed_tests, ret);
+
+  return ret;
+}
+
+int main()
+{
+  int ret = 0;
+  CVI_RT_HANDLE ctx;
+  bmk_ctx_t *bk_ctx;
+  test_init(&ctx, &bk_ctx);
+
+  // ret |= tfl_original_test();
+  ret |= simple_test(&ctx, bk_ctx);
+  ret |= random_test(&ctx, bk_ctx);
+
+  test_exit(&ctx);
+
+  return ret;
+}
diff --git a/cviruntime/test/1880v2/test_1880v2_matrix_transfer.cpp b/cviruntime/test/1880v2/test_1880v2_matrix_transfer.cpp
new file mode 100644
index 000000000..714c8a3d4
--- /dev/null
+++ b/cviruntime/test/1880v2/test_1880v2_matrix_transfer.cpp
@@ -0,0 +1,82 @@
+#include "1880v2_test_util.h"
+
+static void test_put_and_get_matrix_l2g(
+    CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx)
+{
+  int row = 5;
+  int col = 16 * 5 + 2;
+  int size = row * col;
+
+  ml_shape_t s =
+      bmk1880v2_matrix_lmem_default_shape(bk_ctx, row, col, FMT_I8);
+
+  u8 *data_x = (u8 *)xmalloc(size);
+  u8 *data_y = (u8 *)xmalloc(size);
+
+  for (int i = 0; i < size; i++)
+    data_x[i] = i - 100;
+
+  for (int i = 0; i < size; i++)
+    data_y[i] = -i;
+
+  ml_t *ml_x =
+      bmk1880v2_lmem_alloc_matrix(bk_ctx, s, FMT_I8, 1);
+  ml_t *ml_y =
+      bmk1880v2_lmem_alloc_matrix(bk_ctx, s, FMT_I8, 1);
+
+  /*
+   * Interleave two matrices in case the same devmem is reused between
+   * put_matrix_g2l() and get_matrix_l2g(), in which case the content of
+   * devmem is already what is expected before bmk1880v2_gdma_store_matrix().
+   */
+  put_matrix_g2l(ctx, bk_ctx, ml_x, data_x);
+  put_matrix_g2l(ctx, bk_ctx, ml_y, data_y);
+
+  u8 *result_x = get_matrix_l2g(ctx, bk_ctx, ml_x);
+  u8 *result_y = get_matrix_l2g(ctx, bk_ctx, ml_y);
+  for (int i = 0; i < size; i++) {
+    if (result_x[i] != data_x[i]) {
+      printf("compare failed at result_x[%d]\n", i);
+      exit(-1);
+    }
+    if (result_y[i] != data_y[i]) {
+      printf("compare failed at result_y[%d]\n", i);
+      exit(-1);
+    }
+  }
+  free(result_x);
+  free(result_y);
+
+  /*
+   * Get result_y before result_x.
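+   * Reading back in the reverse order exercises the same devmem-reuse
+   * hazard described above from the other direction.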
+ */ + result_y = get_matrix_l2g(ctx, bk_ctx, ml_y); + result_x = get_matrix_l2g(ctx, bk_ctx, ml_x); + for (int i = 0; i < size; i++) { + if (result_x[i] != data_x[i]) { + printf("compare failed at result_x[%d]\n", i); + exit(-1); + } + if (result_y[i] != data_y[i]) { + printf("compare failed at result_y[%d]\n", i); + exit(-1); + } + } + free(result_x); + free(result_y); + + bmk1880v2_lmem_free_matrix(bk_ctx, ml_y); + bmk1880v2_lmem_free_matrix(bk_ctx, ml_x); + free(data_x); + free(data_y); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + test_put_and_get_matrix_l2g(&ctx, bk_ctx); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_max_pooling.cpp b/cviruntime/test/1880v2/test_1880v2_max_pooling.cpp new file mode 100644 index 000000000..cfbaa6a57 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_max_pooling.cpp @@ -0,0 +1,192 @@ +#include "1880v2_test_util.h" + +typedef bmk1880v2_tiu_max_pooling_param_t param_t; + +static void print_pooling_param(param_t *p) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + + printf(" Pooling parameters:\n"); + printf(" ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw); + printf(" opd0_sign = %d\n", p->ifmap->fmt == FMT_I8); + printf(" weight = (%d, %d)\n", p->kh, p->kw); + printf(" padding = (%d, %d, %d, %d)\n", + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right); + printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w); +} + +static int pooling_ih_ext(param_t *p, int ih) +{ + int pad = p->pad_top + p->pad_bottom; + return ih + pad; +} + +static int pooling_iw_ext(param_t *p, int iw) +{ + int pad = p->pad_left + p->pad_right; + return iw + pad; +} + +static int pooling_oh(param_t *p, int ih) +{ + int ih_ext = pooling_ih_ext(p, ih); + return (ih_ext - p->kh) / p->stride_h + 1; +} + +static int pooling_ow(param_t *p, int iw) +{ + int iw_ext = pooling_iw_ext(p, iw); + return (iw_ext - p->kw) / p->stride_w + 1; +} + +static s8 *alloc_input(param_t *p) +{ + u64 size = tl_shape_size(&p->ifmap->shape); + s8 *data = (s8 *)xmalloc(size); + if (!data) + return NULL; + + for (u64 i = 0; i < size; i++) + data[i] = rand() % 256 - 128; + return data; +} + +static s8 *alloc_output(param_t *p) +{ + u64 size = tl_shape_size(&p->ofmap->shape); + return (s8 *)xmalloc(size); +} + +static void free_pooling_param( + bmk_ctx_t *ctx, + param_t *r) +{ + if (r->ifmap) + free_tl(ctx, r->ifmap); + if (r->ofmap) + free_tl(ctx, r->ofmap); +} + +static param_t random_pooling_param(bmk_ctx_t *ctx) +{ + srand(clock()); + param_t p; + + memset(&p, 0, sizeof(p)); + +retry: + int in = rand() % 5 + 1; + int ic = rand() % (3 * BM1880V2_HW_NPU_NUM) + 1; + int ih = rand() % 30 + 3; + int iw = rand() % 30 + 6; + int opd0_sign = rand() % 2; + + p.kh = rand() % 7 + 1; + p.kw = rand() % 7 + 1; + p.stride_h = rand() % (p.kh) + 1; + p.stride_w = rand() % (p.kw) + 1; + p.pad_top = rand() % p.kh; + p.pad_bottom = rand() % p.kh; + p.pad_left = rand() % p.kw; + p.pad_right = rand() % p.kw; + + tl_shape_t ifmap_shape; + ifmap_shape.n = in; + ifmap_shape.c = ic; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + tl_shape_t ofmap_shape; + ofmap_shape.n = in; + ofmap_shape.c = ic; + ofmap_shape.h = pooling_oh(&p, ih); + ofmap_shape.w = pooling_ow(&p, iw); + + fmt_t fmt = opd0_sign? 
FMT_I8: FMT_U8; + p.ofmap = bmk1880v2_lmem_alloc_tensor(ctx, ofmap_shape, FMT_I8, 1); + p.ifmap = bmk1880v2_lmem_alloc_tensor(ctx, ifmap_shape, fmt, 1); + + if ((p.kh > pooling_ih_ext(&p, ih)) + || (p.kw > pooling_iw_ext(&p, iw)) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || (p.kh * p.kw == 1) + || !p.ofmap || !p.ifmap) { + printf("retry init_pooling_param\n"); + free_pooling_param(ctx, &p); + goto retry; + } + + return p; +} + +static void compare_results( + param_t *p, + s8 input[], + s8 output[]) +{ + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int sign = (p->ifmap->fmt == FMT_I8); + + s8 *output_ref = alloc_output(p); + bmerr_t ret = native_pooling_max_int8( + input, output_ref, in, ic, ih, iw, p->kh, p->kw, + p->pad_top, p->pad_bottom, p->pad_left, p->pad_right, + p->stride_h, p->stride_w, 0, 0, 0, 0, sign); + assert(ret == BM_SUCCESS); + + int cmp_res = array_cmp_int8( + "Comparing results ...\n", output_ref, output, + tl_shape_size(&p->ofmap->shape)); + + if (cmp_res != 0) { + printf("Comparison FAILED!!!\n"); + print_pooling_param(p); + exit(-1); + } + + free(output_ref); +} + +static int test_pooling(CVI_RT_HANDLE ctx, bmk_ctx_t *bk_ctx) +{ + param_t param = random_pooling_param(bk_ctx); + s8 *input = alloc_input(¶m); + + put_tensor_g2l(&ctx, bk_ctx, param.ifmap, (u8 *)input); + bmk1880v2_tiu_max_pooling(bk_ctx, ¶m); + s8 *output = (s8 *)get_tensor_l2g(&ctx, bk_ctx, param.ofmap); + + compare_results(¶m, input, output); + + free_pooling_param(bk_ctx, ¶m); + free(output); + free(input); + + return 1; +} + +static void test_max_pooling(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx) +{ + int test_finished_num = 0; + for (u64 i = 0; i < 16; i++) + test_finished_num += test_pooling(*ctx, bk_ctx); + printf("Test finished %d\n", test_finished_num); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + test_max_pooling(&ctx, bk_ctx); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_put_matrix_stride.cpp b/cviruntime/test/1880v2/test_1880v2_put_matrix_stride.cpp new file mode 100644 index 000000000..12ea16a3f --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_put_matrix_stride.cpp @@ -0,0 +1,106 @@ +#include "1880v2_test_util.h" + +static void put_matrix_g2l_stride_ref(u8 *ref, + u8 *a, + ml_shape_t lmem_shape, + bmk1880v2_matrix_tgmem_stride_t gmem_stride) +{ + int row = lmem_shape.n; + int col = lmem_shape.col; + int row_stride = gmem_stride.row; + + for (int ri = 0; ri < row; ri++) + for (int ci = 0; ci < col; ci++) + ref[ri * col + ci] = a[ri * row_stride + ci]; +} + +static void put_matrix_g2l_stride( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + ml_t *ml, + bmk1880v2_matrix_tgmem_stride_t gmem_stride, + u8 *data) +{ + int row = ml->shape.n; + int col = ml->shape.col; + int row_stride = gmem_stride.row; + + bmshape_t bms = BM_MATRIX_INT8(row, row_stride); + CVI_RT_MEM devmem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + int ret = CVI_RT_MemCopyS2D(*ctx, devmem, data); + assert(ret == BM_SUCCESS); + + gaddr_t gaddr = CVI_RT_MemGetPAddr(devmem); + mg_t mg; + mg.base_reg_index = 0; + mg.start_address = gaddr; + mg.shape.row = row; + mg.shape.col = col; + mg.stride = gmem_stride; + mg.base_reg_index = 0; + + bmk1880v2_tdma_tg2l_matrix_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.dst = ml; + p.src = &mg; + + bmk1880v2_tdma_g2l_matrix_copy(bk_ctx, &p); + 
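+  /* The copy above only records a TDMA descriptor on bk_ctx; the actual
+   * G2L transfer is expected to happen when test_submit() below flushes
+   * the accumulated command buffer. */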
test_submit(ctx); + + CVI_RT_MemFree(*ctx, devmem); + return ; +} + +static void test_put_matrix_g2l_stride( + CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx) +{ + int row = 80; + int col = 70; + int size = row * col; + ml_shape_t mls = + bmk1880v2_matrix_lmem_default_shape(bk_ctx, row, col, FMT_I8); + ml_t *ml = + bmk1880v2_lmem_alloc_matrix(bk_ctx,mls, FMT_I8, 0); + + int row_stride = col * 2; + bmk1880v2_matrix_tgmem_stride_t gmem_stride; + gmem_stride.row = row_stride; + int stride_size = row * row_stride; + + u8 *data_x = (u8 *)xmalloc(stride_size); + for (int i = 0; i < stride_size; i++) + data_x[i] = i; + + put_matrix_g2l_stride(ctx, bk_ctx, ml, gmem_stride, data_x); + u8 *result_x = get_matrix_l2g(ctx, bk_ctx, ml); + u8 *ref_x = (u8 *)xmalloc(size); + if (!result_x || !ref_x) + goto fail_exit; + + put_matrix_g2l_stride_ref(ref_x, data_x, mls, gmem_stride); + + for (int i = 0; i < size; i++) { + if (result_x[i] != ref_x[i]) { + printf("compare failed at result_x[%d], got %d, exp %d\n", + i, result_x[i], ref_x[i]); + exit(-1); + } + } + + bmk1880v2_lmem_free_matrix(bk_ctx, ml); + +fail_exit: + free(data_x); + free(result_x); + free(ref_x); +} + +int main () +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + test_put_matrix_g2l_stride(&ctx, bk_ctx); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_put_matrix_tp.cpp b/cviruntime/test/1880v2/test_1880v2_put_matrix_tp.cpp new file mode 100644 index 000000000..93334a3ae --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_put_matrix_tp.cpp @@ -0,0 +1,108 @@ +#include "1880v2_test_util.h" + +static void matrix_tp_ref( + u8 *ref, u8 *a, ml_shape_t s) +{ + /* + * ref[] is transposed matrix in lmem. + * row/col are shape in DDR + */ + int row = s.col; + int col = s.n; + + for (int ri = 0; ri < row; ri++) { + for (int ci = 0; ci < col; ci++) { + ref[ci * row + ri] = a[ri * col + ci]; + } + } +} + +static void put_matrix_g2l_tp( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + ml_t *ml, + u8 *data) +{ + /* + * raw_row = row of src, raw_col = col of dst. 
+   * row and col of ml->shape are the transposed row and col.
+   */
+
+  int raw_row = ml->shape.col;
+  int raw_col = ml->shape.n;
+
+  bmshape_t bms = BM_MATRIX_INT8(raw_row, raw_col);
+  CVI_RT_MEM dev_mem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms));
+  gaddr_t gaddr = CVI_RT_MemGetPAddr(dev_mem);
+  int ret = CVI_RT_MemCopyS2D(*ctx, dev_mem, data);
+  assert(ret == BM_SUCCESS);
+
+  mg_t mg;
+  mg.base_reg_index = 0;
+  mg.start_address = gaddr;
+  mg.shape.row = raw_row;
+  mg.shape.col = raw_col;
+  mg.stride.row = raw_col;
+  mg.base_reg_index = 0;
+
+  bmk1880v2_tdma_tg2l_matrix_copy_row_col_transposed_param_t g2lp;
+  memset(&g2lp, 0, sizeof(g2lp));
+  g2lp.src = &mg;
+  g2lp.dst = ml;
+
+  bmk1880v2_tdma_g2l_matrix_copy_row_col_transposed(bk_ctx, &g2lp);
+  test_submit(ctx);
+
+  CVI_RT_MemFree(*ctx, dev_mem);
+  return;
+}
+
+static void test_put_matrix_g2l_tp(
+    CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx)
+{
+  int row = 80;
+  int col = 70;
+  int size = row * col;
+
+  u8 *data_x = (u8 *)xmalloc(size);
+  for (int i = 0; i < size; i++)
+    data_x[i] = i;
+
+  ml_shape_t mls =
+      bmk1880v2_matrix_lmem_default_shape(bk_ctx, col, row, FMT_I8);
+  ml_t *ml =
+      bmk1880v2_lmem_alloc_matrix(bk_ctx, mls, FMT_I8, 1);
+
+  put_matrix_g2l_tp(ctx, bk_ctx, ml, data_x);
+  u8 *result_x = get_matrix_l2g(ctx, bk_ctx, ml);
+  u8 *ref_x = (u8 *)xmalloc(size);
+  if (!result_x || !ref_x)
+    goto fail_exit;
+
+  matrix_tp_ref(ref_x, data_x, mls);
+
+  for (int i = 0; i < size; i++) {
+    if (result_x[i] != ref_x[i]) {
+      printf("compare failed at result_x[%d], got %d, exp %d\n",
+             i, result_x[i], ref_x[i]);
+      exit(-1);
+    }
+  }
+
+  bmk1880v2_lmem_free_matrix(bk_ctx, ml);
+
+fail_exit:
+  free(data_x);
+  free(result_x);
+  free(ref_x);
+}
+
+int main (void)
+{
+  CVI_RT_HANDLE ctx;
+  bmk_ctx_t *bk_ctx;
+  test_init(&ctx, &bk_ctx);
+  test_put_matrix_g2l_tp(&ctx, bk_ctx);
+  test_exit(&ctx);
+  return 0;
+}
diff --git a/cviruntime/test/1880v2/test_1880v2_put_tensor_stride.cpp b/cviruntime/test/1880v2/test_1880v2_put_tensor_stride.cpp
new file mode 100644
index 000000000..f23582621
--- /dev/null
+++ b/cviruntime/test/1880v2/test_1880v2_put_tensor_stride.cpp
@@ -0,0 +1,133 @@
+#include "1880v2_test_util.h"
+
+static void put_tensor_g2l_stride_ref(
+    u8 *ref, u8 *a,
+    tl_shape_t lmem_shape,
+    bmk1880v2_tensor_tgmem_stride_t gmem_stride)
+{
+  int n = lmem_shape.n;
+  int c = lmem_shape.c;
+  int h = lmem_shape.h;
+  int w = lmem_shape.w;
+
+  int n_str = gmem_stride.n;
+  int c_str = gmem_stride.c;
+  int h_str = gmem_stride.h;
+  int w_str = 1;
+
+  /*
+   * Put a strided DDR tensor into local memory with the default stride.
+ */ + + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < c; ci++) { + for (int hi = 0; hi < h; hi++) { + for (int wi = 0; wi < w; wi++) { + u64 src_i = ni * n_str + ci * c_str + hi * h_str + wi * w_str; + u64 dst_i = ni * c * h * w + ci * h * w + hi * w + wi; + ref[dst_i] = a[src_i]; + } + } + } + } +} + +static inline void put_tensor_g2l_stride( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + tl_t *tl, + bmk1880v2_tensor_tgmem_stride_t tg_stride, + u8 *data) +{ + int n = tl->shape.n; + int n_stride = tg_stride.n; + bmshape_t bms = BM_TENSOR_WITH_FMT(n, n_stride, 1, 1, BM_FMT_INT8); + CVI_RT_MEM devmem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + int ret = CVI_RT_MemCopyS2D(*ctx, devmem, data); + assert(ret == BM_SUCCESS); + + gaddr_t gaddr = CVI_RT_MemGetPAddr(devmem); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = tl->shape.n; + tg.shape.c = tl->shape.c; + tg.shape.h = tl->shape.h; + tg.shape.w = tl->shape.w; + tg.stride = tg_stride; + tg.base_reg_index = 0; + + bmk1880v2_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1880v2_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + CVI_RT_MemFree(*ctx, devmem); +} + +static void test_put_tensor_g2l_stride( + CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx) +{ + int n = 2; + int c = 15; + int h = 10; + int w = 8; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + bmk1880v2_tensor_tgmem_stride_t gmem_stride; + gmem_stride.h = w * 2; + gmem_stride.c = gmem_stride.h * h * 2; + gmem_stride.n = gmem_stride.c * c * 2; + + int size = n * c * h * w; + int stride_size = gmem_stride.n * n; + + u8 *data_x = (u8 *)xmalloc(stride_size); + for (int i = 0; i < stride_size; i++) + data_x[i] = i; + + tl_t *tl_x = + alloc_tl(bk_ctx, tl_shape, FMT_I8, 1); + put_tensor_g2l_stride(ctx, bk_ctx, tl_x, gmem_stride, data_x); + u8 *result_x = get_tensor_l2g(ctx, bk_ctx, tl_x); + u8 *ref_x = (u8 *)xmalloc(size); + if (!result_x || !ref_x) + goto fail_exit; + + put_tensor_g2l_stride_ref(ref_x, data_x, tl_shape, gmem_stride); + + for (int i = 0; i < size; i++) { + if (result_x[i] != ref_x[i]) { + printf("compare failed at result_x[%d], got %d, exp %d\n", + i, result_x[i], ref_x[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_x); + +fail_exit: + free(data_x); + free(result_x); + free(ref_x); +} + +int main (void) +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + test_put_tensor_g2l_stride(&ctx, bk_ctx); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_put_tensor_stride_unalign.cpp b/cviruntime/test/1880v2/test_1880v2_put_tensor_stride_unalign.cpp new file mode 100644 index 000000000..1be9b9258 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_put_tensor_stride_unalign.cpp @@ -0,0 +1,135 @@ +#include "1880v2_test_util.h" + +static void put_tensor_g2l_stride_unalign_ref( + u8 *ref, u8 *a, tl_shape_t tl_shape, + bmk1880v2_tensor_tgmem_stride_t gmem_stride) +{ + int n = tl_shape.n; + int c = tl_shape.c; + int h = tl_shape.h; + int w = tl_shape.w; + + int n_str = gmem_stride.n; + int c_str = gmem_stride.c; + int h_str = gmem_stride.h; + int w_str = 1; + + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < c; ci++) { + for (int hi = 0; hi < h; hi++) { + for (int wi = 0; wi < w; wi++) { + u64 src_i = ni * n_str + ci * c_str + hi * h_str + wi * w_str; + u64 dst_i = ci * n * h * w + ni * h * w + hi * w + wi; + ref[dst_i] = a[src_i]; + } + } + } + } +} + 
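+// Worked example of the mapping above, using the shapes chosen in
+// test_put_tensor_g2l_stride_unalign below (n=6, c=9, h=1, w=8 with gmem
+// strides n=576, c=32, h=16): element (ni, ci, hi, wi) = (1, 2, 0, 3) is
+// read from src index 1*576 + 2*32 + 0*16 + 3 = 643 and written to dst
+// index 2*6*1*8 + 1*1*8 + 0*8 + 3 = 107, i.e. the (1, c, n*h, w) layout
+// that the test reads back.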
+static inline void put_tensor_g2l_stride( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + tl_t *tl, + bmk1880v2_tensor_tgmem_stride_t tg_stride, + u8 *data) +{ + int n = tl->shape.n; + int n_stride = tg_stride.n; + bmshape_t bms = BM_TENSOR_WITH_FMT(n, n_stride, 1, 1, BM_FMT_INT8); + CVI_RT_MEM devmem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + int ret = CVI_RT_MemCopyS2D(*ctx, devmem, data); + assert(ret == BM_SUCCESS); + + gaddr_t gaddr = CVI_RT_MemGetPAddr(devmem); + + tg_t tg; + tg.base_reg_index = 0; + tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = tl->shape.n; + tg.shape.c = tl->shape.c; + tg.shape.h = tl->shape.h; + tg.shape.w = tl->shape.w; + tg.stride = tg_stride; + tg.base_reg_index = 0; + + bmk1880v2_tdma_tg2l_tensor_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1880v2_tdma_g2l_tensor_copy(bk_ctx, &p); + test_submit(ctx); + + CVI_RT_MemFree(*ctx, devmem); +} + +static void test_put_tensor_g2l_stride_unalign( + CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx) +{ + int n = 6; + int c = 9; //just larger than (npu_num/2) + int h = 1; + int w = 8; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + bmk1880v2_tensor_tgmem_stride_t gmem_stride; + gmem_stride.h = w * 2; + gmem_stride.c = gmem_stride.h * h * 2; + gmem_stride.n = gmem_stride.c * c * 2; + + int size = n * c * h * w; + int stride_size = gmem_stride.n * n; + + u8 *data_x = (u8 *)xmalloc(stride_size); + for (int i = 0; i < stride_size; i++) + data_x[i] = i; + + + tl_t *tl_x = + alloc_tl(bk_ctx, tl_shape, FMT_I8, 0); + + put_tensor_g2l_stride(ctx, bk_ctx, tl_x, gmem_stride, data_x); + + tl_x->shape.n = 1; + tl_x->shape.c = c; + tl_x->shape.h = n * h; + tl_x->shape.w = w; + + u8 *result_x = get_tensor_l2g(ctx, bk_ctx, tl_x); + u8 *ref_x = (u8 *)xmalloc(size); + if (!result_x || !ref_x) + goto fail_exit; + + put_tensor_g2l_stride_unalign_ref(ref_x, data_x, tl_shape, gmem_stride); + + for (int i = 0; i < size; i++) { + if (result_x[i] != ref_x[i]) { + printf("compare failed at result_x[%d], got %d, exp %d\n", + i, result_x[i], ref_x[i]); + exit(-1); + } + } + free_tl(bk_ctx, tl_x); + +fail_exit: + free(data_x); + free(result_x); + free(ref_x); +} + +int main (void) +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + test_put_tensor_g2l_stride_unalign(&ctx, bk_ctx); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_put_tensor_tp_unalign.cpp b/cviruntime/test/1880v2/test_1880v2_put_tensor_tp_unalign.cpp new file mode 100644 index 000000000..d88f0ceac --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_put_tensor_tp_unalign.cpp @@ -0,0 +1,122 @@ +#include "1880v2_test_util.h" + +static void put_tensor_g2l_tp_unalign_ref( + u8 *ref, u8 *a, tl_shape_t tl_shape) +{ + /* + * (c, n, h, w) => (n, c, h, w) => (1, c, n * h, w) + */ + + int n = tl_shape.n; + int c = tl_shape.c; + int h = tl_shape.h; + int w = tl_shape.w; + + int size = n * c * h * w; + for (int i = 0; i < size; i++) + ref[i] = a[i]; +} + + +static void put_tensor_g2l_tp( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + tl_t *tl, + u8 *data) +{ + int n = tl->shape.n; + int c = tl->shape.c; + int h = tl->shape.h; + int w = tl->shape.w; + + bmshape_t bms = BM_TENSOR_WITH_FMT(n, c, h, w, BM_FMT_INT8); + CVI_RT_MEM devmem = CVI_RT_MemAlloc(*ctx, bmshape_get_size(&bms)); + int ret = CVI_RT_MemCopyS2D(*ctx, devmem, data); + assert(ret == BM_SUCCESS); + + gaddr_t gaddr = CVI_RT_MemGetPAddr(devmem); + + tg_t tg; + tg.base_reg_index = 0; + 
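+  /* added note: the global view built here declares shape (c, n, h, w)
+   * over the same bytes, and the nc-transposed copy below swaps those
+   * axes in flight, so the data lands in local memory in (n, c, h, w)
+   * order */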
tg.start_address = gaddr; + tg.fmt = FMT_I8; + tg.shape.n = tl->shape.c; + tg.shape.c = tl->shape.n; + tg.shape.h = tl->shape.h; + tg.shape.w = tl->shape.w; + tg.stride = bmk1880v2_tensor_tgmem_default_stride(tg.shape, tg.fmt); + tg.base_reg_index = 0 ; + + bmk1880v2_tdma_tg2l_tensor_copy_nc_transposed_param_t p; + memset(&p, 0, sizeof(p)); + p.src = &tg; + p.dst = tl; + + bmk1880v2_tdma_g2l_tensor_copy_nc_transposed(bk_ctx, &p); + test_submit(ctx); + + CVI_RT_MemFree(*ctx, devmem); +} + +static void test_put_tensor_g2l_tp_unalign( + CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx) +{ + int n = 2; + int c = 15; + int h = 1; + int w = 8; + int size = n * c * h * w; + + u8 *data_x = (u8 *)xmalloc(size); + for (int i = 0; i < size; i++) + data_x[i] = i; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + tl_t *tl_x = + alloc_tl(bk_ctx, tl_shape, FMT_I8, 0); + + put_tensor_g2l_tp(ctx, bk_ctx, tl_x, data_x); + + tl_x->shape.n = 1; + tl_x->shape.c = c; + tl_x->shape.h = n * h; + tl_x->shape.w = w; + + u8 *result_x = get_tensor_l2g(ctx, bk_ctx, tl_x); + tl_x->shape = tl_shape; + u8 *ref_x = (u8 *)xmalloc(size); + if (!result_x || !ref_x) + goto fail_exit; + + put_tensor_g2l_tp_unalign_ref(ref_x, data_x, tl_shape); + + for (int i = 0; i < size; i++) { + if (result_x[i] != ref_x[i]) { + printf("compare failed at result_x[%d], got %d, exp %d\n", + i, result_x[i], ref_x[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_x); + +fail_exit: + free(data_x); + free(result_x); + free(ref_x); +} + +int main (void) +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + test_put_tensor_g2l_tp_unalign(&ctx, bk_ctx); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_put_tensor_unalign.cpp b/cviruntime/test/1880v2/test_1880v2_put_tensor_unalign.cpp new file mode 100644 index 000000000..3403a194d --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_put_tensor_unalign.cpp @@ -0,0 +1,86 @@ +#include "1880v2_test_util.h" + +static void put_tensor_g2l_unalign_ref( + u8 *ref, u8 *a, tl_shape_t tl_shape) +{ + int n = tl_shape.n; + int c = tl_shape.c; + int h = tl_shape.h; + int w = tl_shape.w; + + /* + * (n, c, h, w) => (1, c, n * h, w) + */ + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < c; ci++) { + for (int hi = 0; hi < h; hi++) { + for (int wi = 0; wi < w; wi++) { + u64 src_i = ni * c * h * w + ci * h * w + hi * w + wi; + u64 dst_i = ci * n * h * w + ni * h * w + hi * w + wi; + ref[dst_i] = a[src_i]; + } + } + } + } +} + +static void test_put_tensor_g2l_unalign( + CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx) +{ + int n = 4; + int c = 9; //just larger than (npu_num/2) + int h = 1; + int w = 8; + int size = n * c * h * w; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u8 *data_x = (u8 *)xmalloc(size); + for (int i = 0; i < size; i++) + data_x[i] = i; + + tl_t *tl_x = + alloc_tl(bk_ctx, tl_shape, FMT_I8, 0); + put_tensor_g2l(ctx, bk_ctx, tl_x, data_x); + + tl_x->shape.n = 1; + tl_x->shape.c = c; + tl_x->shape.h = n * h; + tl_x->shape.w = w; + + u8 *result_x = get_tensor_l2g(ctx, bk_ctx, tl_x); + u8 *ref_x = (u8 *)xmalloc(size); + if (!result_x || !ref_x) + goto fail_exit; + + put_tensor_g2l_unalign_ref(ref_x, data_x, tl_shape); + + for (int i = 0; i < size; i++) { + if (result_x[i] != ref_x[i]) { + printf("compare failed at result_x[%d], got %d, exp %d\n", + i, result_x[i], ref_x[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_x); + +fail_exit: + free(data_x); + 
free(result_x); + free(ref_x); +} + +int main (void) +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + test_put_tensor_g2l_unalign(&ctx, bk_ctx); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tdma_l2l_tensor_copy.cpp b/cviruntime/test/1880v2/test_1880v2_tdma_l2l_tensor_copy.cpp new file mode 100644 index 000000000..0bee57eab --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tdma_l2l_tensor_copy.cpp @@ -0,0 +1,133 @@ +#include "1880v2_test_util.h" + +typedef bmk1880v2_tdma_l2l_tensor_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tl_shape_t src_shape; + tl_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 2, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 2, 7 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 13, 17 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 120, 5 }, + }, { + { 1, 2, 1, 1 }, + { 1, 1, 1, 2 }, + }, { + { 2, 17, 1, 4 }, + { 2, 1, 4, 17 }, + }, { + { 2, 17, 1, 4 }, + { 2, 1, 17, 4 }, + }, { + { 3, 16, 1, 1 }, + { 3, 1, 2, 8 }, + }, { + { 3, 39, 17, 23 }, + { 3, 17, 39, 23 }, + }, { + { 3, 36, 16, 20 }, + { 3, 18, 1, 640 }, + }, { + { 5, 39, 17, 23 }, + { 5, 17, 39, 23 }, + }, { + { 20, 35, 2, 2 }, + { 20, 7, 10, 2 }, + } +}; + +static void destroy_param(bmk_ctx_t *bmk, param_t *p) +{ + free_tl(bmk, p->dst); + free_tl(bmk, p->src); +} + +static void l2l_tensor_copy_ref(param_t *p, u8 ref_data[], u8 src_data[]) +{ + u64 size = tl_shape_size(&p->src->shape); + + for (u64 i = 0; i < size; i++) + ref_data[i] = src_data[i]; +} + +static void test_param(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->src->shape); + + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + + put_tensor_g2l(ctx, bmk, p->src, src_data); + bmk1880v2_tdma_l2l_tensor_copy(bmk, p); + u8 *dst_data = get_tensor_l2g(ctx, bmk, p->dst); + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + l2l_tensor_copy_ref(p, ref_data, src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); + free(ref_data); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + for (int src_align = 0; src_align < 2; src_align++) { + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + p.src = alloc_tl(bmk, c->src_shape, FMT_I8, src_align); + p.dst = alloc_tl(bmk, c->dst_shape, FMT_I8, dst_align); + test_param(ctx, bmk, &p); + destroy_param(bmk, &p); + } + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tdma_l2l_tensor_lrn_shift.cpp b/cviruntime/test/1880v2/test_1880v2_tdma_l2l_tensor_lrn_shift.cpp new file mode 100644 index 000000000..c06bddd25 --- 
/dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tdma_l2l_tensor_lrn_shift.cpp @@ -0,0 +1,189 @@ +#include "1880v2_test_util.h" + +typedef bmk1880v2_tdma_l2l_tensor_lrn_shift_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) %s%u%s (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + (p->right_shift? "": "<-"), + p->lrn_step, + (p->right_shift? "->": ""), + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + u32 n; + u32 c; + u32 src_h; + u32 src_w; + u32 dst_h; + u32 dst_w; +} case_t; + +static case_t g_cases[] = { + { 0, 0, 1, 1, 1, 1 }, + { 0, 0, 3, 7, 7, 3 }, + { 0, 0, 4, 4, 2, 8 }, + { 0, 0, 7, 7, 1, 49 }, + { 0, 0, 7, 8, 14, 4 }, + { 0, 0, 14, 6, 12, 7 }, +}; + +static void destroy_param(bmk_ctx_t *bmk, param_t *p) +{ + free_tl(bmk, p->dst); + free_tl(bmk, p->src); +} + +static void lrn_left_shift_ref(param_t *p, u8 ref_data[], u8 src_data[]) +{ + u32 n = p->src->shape.n; + u32 c = p->src->shape.c; + u32 hw = p->src->shape.h * p->src->shape.w; + u64 size = tl_shape_size(&p->src->shape); + + for (u64 i = 0; i < size; i++) + ref_data[i] = 0; + + for (u32 ni = 0; ni < n; ni++) { + for (u32 ci = p->lrn_step; ci < c; ci++) { + for (u32 hwi = 0; hwi < hw; hwi++) { + u32 src_i = (ni * c + ci) * hw + hwi; + u32 dst_i = src_i - p->lrn_step * hw; + ref_data[dst_i] = src_data[src_i]; + } + } + } +} + +static void lrn_right_shift_ref(param_t *p, u8 ref_data[], u8 src_data[]) +{ + u32 n = p->src->shape.n; + u32 c = p->src->shape.c; + u32 hw = p->src->shape.h * p->src->shape.w; + u64 size = tl_shape_size(&p->src->shape); + + for (u64 i = 0; i < size; i++) + ref_data[i] = 0; + + for (u32 ni = 0; ni < n; ni++) { + for (u32 ci = 0; ci < c - p->lrn_step; ci++) { + for (u32 hwi = 0; hwi < hw; hwi++) { + u32 src_i = (ni * c + ci) * hw + hwi; + u32 dst_i = src_i + p->lrn_step * hw; + ref_data[dst_i] = src_data[src_i]; + } + } + } +} + +static void l2l_tensor_lrn_shift_ref( + param_t *p, u8 ref_data[], u8 src_data[]) +{ + if (p->right_shift) + return lrn_right_shift_ref(p, ref_data, src_data); + else + return lrn_left_shift_ref(p, ref_data, src_data); +} + +static void test_param(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->src->shape); + + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + + put_tensor_g2l(ctx, bmk, p->src, src_data); + bmk1880v2_tdma_l2l_tensor_lrn_shift(bmk, p); + u8 *dst_data = get_tensor_l2g(ctx, bmk, p->dst); + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + l2l_tensor_lrn_shift_ref(p, ref_data, src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); + free(ref_data); +} + +static void execute_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + static const u32 steps[] = { 1, 2, 4, 7 }; // less than npu_num/2 + u32 nr_steps = sizeof(steps) / sizeof(steps[0]); + + for (int src_align = 0; src_align < 2; src_align++) { + for (int dst_align = 0; dst_align < 2; dst_align++) { + tl_shape_t src_shape, dst_shape; + src_shape.n = c->n; + src_shape.c = c->c; + src_shape.h = c->src_h; + src_shape.w = c->src_w; + dst_shape.n = c->n; + dst_shape.c = c->c; + 
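+      /* added note: src and dst share n and c because the LRN shift
+       * only moves data along the channel axis; the h/w pairs in
+       * g_cases merely reshape, keeping h * w equal (e.g. 3x7 vs 7x3) */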
dst_shape.h = c->dst_h; + dst_shape.w = c->dst_w; + + param_t p; + memset(&p, 0, sizeof(p)); + p.src = alloc_tl(bmk, src_shape, FMT_I8, src_align); + p.dst = alloc_tl(bmk, dst_shape, FMT_I8, dst_align); + + for (u32 i = 0; i < nr_steps; i++) { + if (steps[i] >= p.src->shape.c) + break; + p.lrn_step = steps[i]; + + p.right_shift = 0; + test_param(ctx, bmk, &p); + + p.right_shift = 1; + test_param(ctx, bmk, &p); + } + + destroy_param(bmk, &p); + } + } +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *ca) +{ + for (u32 n = 1; n < 8; n += 2) { + ca->n = n; + for (u32 c = 1; c < 36; c += 3) { + ca->c = c; + execute_case(ctx, bmk, ca); + } + for (u32 c = 36; c < 66; c += 7) { + ca->c = c; + execute_case(ctx, bmk, ca); + } + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tdma_l2tg_general_copy.cpp b/cviruntime/test/1880v2/test_1880v2_tdma_l2tg_general_copy.cpp new file mode 100644 index 000000000..60c2533a1 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tdma_l2tg_general_copy.cpp @@ -0,0 +1,90 @@ +#include "1880v2_test_util.h" + +typedef bmk1880v2_tdma_l2tg_general_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: %u bytes from %x to %u:%lx\n", tag, + p->bytes, p->src_address, p->dst_base_reg_index, p->dst_address); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef param_t case_t; + +static case_t g_cases[] = { + { 0, 0, 0, 1 }, + { 0, 0, 0, 39 }, + { 0, 0, 0, 4096 }, + { 0, 0, 100, 1 }, + { 0, 0, 200, 39 }, + { 0, 0, 1024, 4096 }, + { 39, 0, 100, 1 }, + { 47, 0, 200, 39 }, + { 2048, 0, 1024, 4096 }, +}; + +static void l2tg_general_copy_ref(param_t *p, u8 ref_data[], u8 src_data[]) +{ + for (u32 i = 0; i < p->bytes; i++) + ref_data[i] = src_data[i]; +} + +static void test_param_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = p->bytes; + + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + + put_bytes_g2l(ctx, bmk, p->src_address, size, src_data); + + #if 1 + u8 *dst_data = get_bytes_l2g(ctx, bmk, p->src_address, size); + + #else + bmk1880v2_tdma_l2g_general_copy(bmk, p); + test_submit(ctx); + u8 *dst_data = get_bytes_gmem(ctx, p->dst_address, size); + #endif + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + l2tg_general_copy_ref(p, ref_data, src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); + free(ref_data); +} + + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + param_t *p = c; + + test_param_l2g(ctx, bmk, p); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tdma_l2tg_matrix_copy.cpp b/cviruntime/test/1880v2/test_1880v2_tdma_l2tg_matrix_copy.cpp new file mode 100644 index 000000000..f29a62315 --- /dev/null +++ 
b/cviruntime/test/1880v2/test_1880v2_tdma_l2tg_matrix_copy.cpp @@ -0,0 +1,136 @@ +#include "1880v2_test_util.h" + +typedef bmk1880v2_tdma_l2tg_matrix_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.w, p->src->shape.col, + p->dst->shape.row, p->dst->shape.col); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + ml_shape_t src_shape; + mg_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 1, 1, 1 }, + { 0, 1 }, + }, { + { 0, 1, 2, 2 }, + { 1, 2 }, + }, { + { 0, 2, 1, 2 }, + { 1, 2 }, + }, { + { 0, 1, 7, 7 }, + { 0, 7 }, + }, { + { 0, 2, 4, 7 }, + { 0, 7 }, + }, { + { 0, 7, 1, 7 }, + { 0, 7 }, + }, { + { 0, 1, 17, 17 }, + { 0, 17 }, + }, { + { 0, 3, 7, 17 }, + { 0, 17 }, + }, { + { 0, 17, 1, 17 }, + { 0, 17 }, + }, { + { 0, 1, 60, 60 }, + { 0, 60 }, + }, { + { 0, 30, 2, 60 }, + { 0, 60 }, + }, { + { 0, 60, 1, 60 }, + { 0, 60 }, + } +}; + +static void l2tg_matrix_copy_ref(param_t *p, u8 ref_data[], u8 src_data[]) +{ + u64 size = ml_shape_size(&p->src->shape); + + for (u64 i = 0; i < size; i++) + ref_data[i] = src_data[i]; +} + +static void test_param_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = ml_shape_size(&p->src->shape); + + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + + put_matrix_g2l(ctx, bmk, p->src, src_data); + bmk1880v2_tdma_l2g_matrix_copy(bmk, p); + test_submit(ctx); + u8 *dst_data = get_mg_gmem(ctx, p->dst); + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + l2tg_matrix_copy_ref(p, ref_data, src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); + free(ref_data); +} + + +static void destroy_param_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_ml(bmk, p->src); + free_mg_gmem(ctx, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + for (u32 row = 1; row < 13; row += 2) { + c->src_shape.n = row; + c->dst_shape.row = row; + for (int src_align = 0; src_align < 2; src_align++) { + param_t p; + + memset(&p, 0, sizeof(p)); + p.src = alloc_ml(bmk, c->src_shape, src_align); + p.dst = alloc_mg_gmem(ctx, c->dst_shape); + test_param_l2g(ctx, bmk, &p); + destroy_param_l2g(ctx, bmk, &p); + + } + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tdma_l2tg_matrix_vlc_copy_compressed.cpp b/cviruntime/test/1880v2/test_1880v2_tdma_l2tg_matrix_vlc_copy_compressed.cpp new file mode 100644 index 000000000..3dcde6602 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tdma_l2tg_matrix_vlc_copy_compressed.cpp @@ -0,0 +1,163 @@ +#include "1880v2_test_util.h" + +typedef bmk1880v2_tdma_l2tg_matrix_copy_compressed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.w, p->src->shape.col, + p->dst->m.shape.row, p->dst->m.shape.col); +} + +#define 
print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + ml_shape_t src_shape; + mg_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 2, 4, 7 }, + { 0, 7 }, + }, + { + { 0, 1, 2, 2 }, + { 1, 2 }, + }, + { + { 0, 3, 7, 17 }, + { 0, 17 }, + }, + { + { 0, 17, 1, 17 }, + { 0, 17 }, + }, + { + { 0, 60, 1, 60 }, + { 0, 60 }, + }, +#ifndef ENABEL_SIMPLE_BMK1880V2_VLC_TEST + { + { 0, 1, 1, 1 }, + { 0, 1 }, + }, + { + { 0, 2, 1, 2 }, + { 1, 2 }, + }, + { + { 0, 1, 7, 7 }, + { 0, 7 }, + }, + { + { 0, 7, 1, 7 }, + { 0, 7 }, + }, + { + { 0, 1, 17, 17 }, + { 0, 17 }, + }, + { + { 0, 1, 60, 60 }, + { 0, 60 }, + }, + { + { 0, 30, 2, 60 }, + { 0, 60 }, + }, +#endif /* ifndef ENABEL_SIMPLE_BMK1880V2_VLC_TEST*/ +}; + +static void test_param_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p, u8* src_data, CommandInfo * cmd_info) +{ + print_param(stderr, p); + u64 size = ml_shape_size(&p->src->shape); + + put_matrix_g2l(ctx, bmk, p->src, src_data); + bmk1880v2_tdma_l2g_matrix_copy_compressed(bmk, p); + test_submit(ctx); + + int is_signed = (p->src->fmt == FMT_I8); + int data_type = (p->src->fmt == FMT_BF16) ? 1 : 0; + size_t bs_size; + + size_t bs_buf_size = get_out_bs_buf_size(size, data_type); + u8 *ref_data = vlc_compress(src_data, size, is_signed, data_type, &bs_size, cmd_info, NULL); + u8 *dst_data = get_compressed_mg_gmem(ctx, p->dst, bs_buf_size); + + for (u64 i = 0; i < bs_size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(dst_data); + free(ref_data); +} + + +static void destroy_param_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_ml(bmk, p->src); + free_compressed_mg_gmem(ctx, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + fmt_t fmts[] = { FMT_I8, FMT_U8 }; + u8 fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + + for (u32 row = 1; row < 13; row += 2) { + c->src_shape.n = row; + c->dst_shape.row = row; + for (int src_align = 0; src_align < 2; src_align++) { + for (u8 fmt_i = 0; fmt_i < fmts_sz; fmt_i++) { + fmt_t fmt = fmts[fmt_i]; + param_t p; + memset(&p, 0, sizeof(p)); + p.src = alloc_ml(bmk, c->src_shape, fmt, src_align); + + u64 size = ml_shape_size(&p.src->shape); + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + vlc_init_testdata(src_data, size, fmt == FMT_I8, fmt == FMT_BF16); + + //size_t bs_size; + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + + // fmt, &bs_size, &cmd_info); + + int is_signed = (p.src->fmt == FMT_I8); + cmd_info.signedness = is_signed; + + // dst_shape, p.src->fmt, &cmd_info); + + //printf ("row %u is_align %d fmt %d\n", row, src_align, fmt); + test_param_l2g(ctx, bmk, &p, src_data, &cmd_info); + destroy_param_l2g(ctx, bmk, &p); + free(src_data); + } + } + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tdma_l2tg_tensor_copy.cpp b/cviruntime/test/1880v2/test_1880v2_tdma_l2tg_tensor_copy.cpp new file mode 100644 index 000000000..ccbdc02f7 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tdma_l2tg_tensor_copy.cpp @@ -0,0 +1,135 @@ +#include "1880v2_test_util.h" + +typedef bmk1880v2_tdma_l2tg_tensor_copy_param_t param_t; + +static void __print_param(const char *tag, FILE 
*f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tl_shape_t src_shape; + tg_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 2, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 2, 7 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 13, 17 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 120, 5 }, + }, { + { 1, 2, 1, 1 }, + { 1, 1, 1, 2 }, + }, { + { 2, 17, 1, 4 }, + { 2, 1, 4, 17 }, + }, { + { 2, 17, 1, 4 }, + { 2, 1, 17, 4 }, + }, { + { 3, 16, 1, 1 }, + { 3, 1, 2, 8 }, + }, { + { 3, 39, 17, 23 }, + { 3, 17, 39, 23 }, + }, { + { 3, 36, 16, 20 }, + { 3, 18, 1, 640 }, + }, { + { 5, 39, 17, 23 }, + { 5, 17, 39, 23 }, + }, { + { 20, 35, 2, 2 }, + { 20, 7, 10, 2 }, + } +}; + +static void l2tg_tensor_copy_ref(param_t *p, u8 ref_data[], u8 src_data[]) +{ + u64 size = tl_shape_size(&p->src->shape); + + for (u64 i = 0; i < size; i++) + ref_data[i] = src_data[i]; +} + +static void test_param_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->src->shape); + + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + + put_tensor_g2l(ctx, bmk, p->src, src_data); + bmk1880v2_tdma_l2g_tensor_copy(bmk, p); + test_submit(ctx); + u8 *dst_data = get_tg_gmem(ctx, p->dst); + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + l2tg_tensor_copy_ref(p, ref_data, src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); + free(ref_data); +} + + +static void destroy_param_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_tg_gmem(ctx, p->dst); + free_tl(bmk, p->src); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + for (int src_align = 0; src_align < 2; src_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_tl(bmk, c->src_shape, FMT_I8, src_align); + p.dst = alloc_tg_gmem(ctx, c->dst_shape, FMT_I8); + test_param_l2g(ctx, bmk, &p); + destroy_param_l2g(ctx, bmk, &p); + + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tdma_l2tg_tensor_copy_cw_transposed.cpp b/cviruntime/test/1880v2/test_1880v2_tdma_l2tg_tensor_copy_cw_transposed.cpp new file mode 100644 index 000000000..3f3e12585 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tdma_l2tg_tensor_copy_cw_transposed.cpp @@ -0,0 +1,152 @@ +#include "1880v2_test_util.h" + +typedef bmk1880v2_tdma_l2tg_tensor_copy_cw_transposed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tl_shape_t src_shape; + tg_shape_t 
dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 2, 1, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 2, 7, 1 }, + }, { + { 1, 1, 17, 13 }, + { 1, 13, 17, 1 }, + }, { + { 1, 1, 10, 60 }, + { 1, 60, 10, 1 }, + }, { + { 1, 2, 1, 1 }, + { 1, 1, 1, 2 }, + }, { + { 2, 17, 1, 4 }, + { 2, 4, 1, 17 }, + }, { + { 2, 17, 3, 4 }, + { 2, 4, 3, 17 }, + }, { + { 3, 16, 7, 1 }, + { 3, 1, 7, 16 }, + }, { + { 3, 39, 17, 23 }, + { 3, 23, 17, 39 }, + }, { + { 3, 36, 16, 20 }, + { 3, 20, 16, 36 }, + }, { + { 5, 39, 17, 23 }, + { 5, 23, 17, 39 }, + }, { + { 20, 35, 2, 2 }, + { 20, 2, 2, 35 }, + }, { + { 20, 35, 3, 2 }, + { 20, 2, 3, 35 }, + } +}; + +static void l2tg_tensor_copy_cw_transposed_ref( + param_t *p, u8 ref_data[], u8 src_data[]) +{ + tl_shape_t s = p->src->shape; + u32 n = s.n; + u32 c = s.c; + u32 h = s.h; + u32 w = s.w; + + for (u32 ni = 0; ni < n; ni++) { + for (u32 ci = 0; ci < c; ci++) { + for (u32 hi = 0; hi < h; hi++) { + for (u32 wi = 0; wi < w; wi++) { + u32 src_i = ni * c * h * w + ci * h * w + hi * w + wi; + u32 dst_i = ni * c * h * w + wi * h * c + hi * c + ci; + ref_data[dst_i] = src_data[src_i]; + } + } + } + } +} + +static void test_param_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->src->shape); + + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + + put_tensor_g2l(ctx, bmk, p->src, src_data); + bmk1880v2_tdma_l2g_tensor_copy_cw_transposed(bmk, p); + test_submit(ctx); + u8 *dst_data = get_tg_gmem(ctx, p->dst); + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + l2tg_tensor_copy_cw_transposed_ref(p, ref_data, src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); + free(ref_data); +} + + +static void destroy_param_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_tg_gmem(ctx, p->dst); + free_tl(bmk, p->src); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + for (int src_align = 0; src_align < 2; src_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_tl(bmk, c->src_shape, FMT_I8, src_align); + p.dst = alloc_tg_gmem(ctx, c->dst_shape, FMT_I8); + test_param_l2g(ctx, bmk, &p); + destroy_param_l2g(ctx, bmk, &p); + + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tdma_l2tg_tensor_copy_nc_transposed.cpp b/cviruntime/test/1880v2/test_1880v2_tdma_l2tg_tensor_copy_nc_transposed.cpp new file mode 100644 index 000000000..02670dd59 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tdma_l2tg_tensor_copy_nc_transposed.cpp @@ -0,0 +1,226 @@ +#include "1880v2_test_util.h" + +typedef bmk1880v2_tdma_l2tg_tensor_copy_nc_transposed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { 
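+  /* added note: dst swaps n and c relative to src; h and w may be
+   * reshaped freely as long as h * w is preserved */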
+ tl_shape_t src_shape; + tg_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 1, 2 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 2, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 7, 2 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 2, 7 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 17, 13 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 13, 17 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 10, 60 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 2, 300 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 3, 200 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 4, 150 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 5, 120 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 60, 10 }, + }, { + { 1, 1, 120, 5 }, + { 1, 1, 120, 5 }, + }, { + { 1, 2, 1, 1 }, + { 2, 1, 1, 1 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 1, 4 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 2, 2 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 4, 1 }, + }, { + { 17, 2, 2, 2 }, + { 2, 17, 2, 2 }, + }, { + { 17, 2, 4, 1 }, + { 2, 17, 4, 1 }, + }, { + { 3, 16, 1, 1 }, + { 16, 3, 1, 1 }, + }, { + { 3, 39, 23, 17 }, + { 39, 3, 23, 17 }, + }, { + { 3, 39, 17, 23 }, + { 39, 3, 17, 23 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 16, 20 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 2, 160 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 4, 80 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 8, 40 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 20, 16 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 32, 10 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 64, 5 }, + }, { + { 5, 39, 17, 23 }, + { 39, 5, 17, 23 }, + }, { + { 20, 35, 2, 2 }, + { 35, 20, 2, 2 }, + }, { + { 35, 20, 2, 2 }, + { 20, 35, 2, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 160, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 2, 160 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 4, 80 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 8, 40 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 10, 32 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 20, 16 }, + }, { + { 39, 5, 23, 17 }, + { 5, 39, 23, 17 }, + } +}; + +static void l2tg_tensor_copy_nc_transposed_ref( + param_t *p, u8 ref_data[], u8 src_data[]) +{ + tl_shape_t s = p->src->shape; + u32 n = s.n; + u32 c = s.c; + u32 hw = s.h * s.w; + + for (u32 ni = 0; ni < n; ni++) { + for (u32 ci = 0; ci < c; ci++) { + for (u32 hwi = 0; hwi < hw; hwi++) { + u32 src_i = ni * c * hw + ci * hw + hwi; + u32 dst_i = ci * n * hw + ni * hw + hwi; + ref_data[dst_i] = src_data[src_i]; + } + } + } +} + +static void test_param_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->src->shape); + + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + + put_tensor_g2l(ctx, bmk, p->src, src_data); + bmk1880v2_tdma_l2g_tensor_copy_nc_transposed(bmk, p); + test_submit(ctx); + u8 *dst_data = get_tg_gmem(ctx, p->dst); + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + l2tg_tensor_copy_nc_transposed_ref(p, ref_data, src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); + free(ref_data); +} + +static void destroy_param_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_tg_gmem(ctx, p->dst); + free_tl(bmk, p->src); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + for (int src_align = 0; src_align < 2; src_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + + p.src = alloc_tl(bmk, 
c->src_shape, FMT_I8, src_align); + p.dst = alloc_tg_gmem(ctx, c->dst_shape, FMT_I8); + test_param_l2g(ctx, bmk, &p); + destroy_param_l2g(ctx, bmk, &p); + + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tdma_l2tg_tensor_fill_constant.cpp b/cviruntime/test/1880v2/test_1880v2_tdma_l2tg_tensor_fill_constant.cpp new file mode 100644 index 000000000..8d1c431c3 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tdma_l2tg_tensor_fill_constant.cpp @@ -0,0 +1,136 @@ +#include "1880v2_test_util.h" + +typedef bmk1880v2_tdma_l2tg_tensor_fill_constant_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: %u => (%u, %u, %u, %u)\n", + tag, p->constant, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + u8 constant; + tg_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + 37, { 1, 1, 1, 1 } + }, { + 39, { 1, 1, 1, 2 } + }, { + 23, { 1, 1, 2, 1 } + }, { + 19, { 1, 1, 7, 2 } + }, { + 17, { 1, 1, 2, 7 } + }, { + 13, { 1, 1, 17, 13 } + }, { + 11, { 1, 1, 13, 17 } + }, { + 7, { 1, 1, 10, 60 } + }, { + 9, { 1, 1, 120, 5 } + }, { + 2, { 1, 2, 1, 1 } + }, { + 3, { 1, 1, 1, 2 } + }, { + 5, { 2, 17, 1, 4 } + }, { + 41, { 2, 1, 4, 17 } + }, { + 5, { 2, 17, 1, 4 } + }, { + 9, { 2, 1, 17, 4 } + }, { + 17, { 3, 16, 1, 1 } + }, { + 26, { 3, 1, 2, 8 } + }, { + 103, { 3, 39, 17, 23 } + }, { + 255, { 3, 17, 39, 23 } + }, { + 254, { 3, 36, 16, 20 } + }, { + 127, { 3, 18, 1, 640 } + }, { + 128, { 5, 39, 17, 23 } + }, { + 129, { 5, 17, 39, 23 } + }, { + 55, { 20, 35, 2, 2 } + }, { + 1, { 20, 7, 10, 2 } + } +}; + +static void l2tg_tensor_fill_constant_ref(param_t *p, u8 ref_data[]) +{ + u64 size = tg_shape_size(&p->dst->shape); + + for (u64 i = 0; i < size; i++) + ref_data[i] = p->constant; +} + +static void test_param_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tg_shape_size(&p->dst->shape); + + bmk1880v2_tdma_l2g_tensor_fill_constant(bmk, p); + test_submit(ctx); + u8 *dst_data = get_tg_gmem(ctx, p->dst); + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + l2tg_tensor_fill_constant_ref(p, ref_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(dst_data); + free(ref_data); +} + + +static void destroy_param_l2g(CVI_RT_HANDLE *ctx, param_t *p) +{ + free_tg_gmem(ctx, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + param_t p; + memset(&p, 0, sizeof(p)); + p.constant = c->constant; + + p.dst = alloc_tg_gmem(ctx, c->dst_shape, FMT_I8); + test_param_l2g(ctx, bmk, &p); + destroy_param_l2g(ctx, &p); + +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tdma_l2tg_tensor_vlc_copy_compressed.cpp b/cviruntime/test/1880v2/test_1880v2_tdma_l2tg_tensor_vlc_copy_compressed.cpp new file mode 100644 index 
000000000..8c35ce419
--- /dev/null
+++ b/cviruntime/test/1880v2/test_1880v2_tdma_l2tg_tensor_vlc_copy_compressed.cpp
@@ -0,0 +1,156 @@
+#include "1880v2_test_util.h"
+
+typedef bmk1880v2_tdma_l2tg_tensor_copy_compressed_param_t param_t;
+
+static void __print_param(const char *tag, FILE *f, param_t *p)
+{
+  fprintf(
+      f, "%s: (%u, %u, %u, %u) => %d-bit %s\n",
+      tag,
+      p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w,
+      p->dst->bit_length,
+      (p->src->fmt == FMT_I8)? "signed": "unsigned");
+}
+
+#define print_param(f, p) __print_param(__func__, f, p)
+
+typedef struct {
+  tl_shape_t lmem_shape;
+} case_t;
+
+static case_t g_cases[] = {
+  {
+    { 1, 1, 17, 13 }
+  },
+  {
+    { 3, 39, 17, 23 }
+  },
+  {
+    { 5, 39, 17, 23 }
+  },
+  {
+    { 20, 35, 2, 2 }
+  },
+#ifndef ENABEL_SIMPLE_BMK1880V2_VLC_TEST
+  {
+    { 1, 1, 1, 1 }
+  },
+  {
+    { 1, 1, 1, 2 }
+  },
+  {
+    { 1, 1, 7, 2 }
+  },
+  {
+    { 1, 1, 10, 60 }
+  },
+  {
+    { 1, 2, 1, 1 }
+  },
+  {
+    { 2, 17, 1, 4 }
+  },
+  {
+    { 2, 17, 1, 4 }
+  },
+  {
+    { 3, 16, 1, 1 }
+  },
+  {
+    { 3, 36, 16, 20 }
+  },
+#endif /* ifndef ENABEL_SIMPLE_BMK1880V2_VLC_TEST*/
+};
+
+static u64 l2tg_tensor_copy_vlc_compressed_ref(
+    param_t *p, u8 ref_data[], u8 src_data[], CommandInfo *cmd_info)
+{
+  u64 in_size = tl_shape_size(&p->src->shape);
+  size_t bs_size = 0;
+
+  bm_vlc_enc_int8(src_data, in_size, ref_data, &bs_size, cmd_info);
+  return bs_size;
+}
+
+static void test_param_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p, CommandInfo* cmd_info_est, u8 *src_data)
+{
+  print_param(stderr, p);
+
+  put_tensor_g2l(ctx, bmk, p->src, src_data);
+  bmk1880v2_tdma_l2g_tensor_copy_compressed(bmk, p);
+  test_submit(ctx);
+
+  u8 *dst_data = get_compressed_tg_gmem(ctx, p->dst);
+  u8 *ref_data = (u8 *)malloc(sizeof(u8) * p->dst->reserved_size);
+  u64 bs_size = l2tg_tensor_copy_vlc_compressed_ref(p, ref_data, src_data, cmd_info_est);
+
+  for (u64 i = 0; i < bs_size; i++) {
+    if (dst_data[i] != ref_data[i]) {
+      fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n",
+              i, dst_data[i], ref_data[i]);
+      exit(-1);
+    }
+  }
+
+  free(dst_data);
+  free(ref_data);
+}
+
+static void destroy_param_l2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p)
+{
+  free_compressed_tg_gmem(ctx, p->dst);
+  free_tl(bmk, p->src);
+}
+
+static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c)
+{
+  fmt_t fmts[] = { FMT_I8, FMT_U8 };
+
+  for (int src_align = 0; src_align < 2; src_align++) {
+    for (u8 fmt_i = 0; fmt_i < 2; fmt_i++) {
+      fmt_t fmt = fmts[fmt_i];
+      param_t p;
+      memset(&p, 0, sizeof(p));
+
+      p.src = alloc_tl(bmk, c->lmem_shape, fmt, src_align);
+      assert(p.src);
+
+      CommandInfo cmd_info;
+      memset(&cmd_info, 0, sizeof(CommandInfo));
+      u64 in_size = tl_shape_size(&p.src->shape);
+
+      u8 *src_data = (u8 *)malloc(sizeof(u8) * in_size);
+      vlc_init_testdata(src_data, in_size, fmt == FMT_I8, fmt == FMT_BF16);
+
+      int is_signed = (p.src->fmt == FMT_I8);
+      cmd_info.signedness = is_signed;
+
+      // lmem_shape, fmt, &cmd_info);
+      test_param_l2g(ctx, bmk, &p, &cmd_info, src_data);
+      destroy_param_l2g(ctx, bmk, &p);
+
+      free(src_data);
+    }
+  }
+}
+
+int main()
+{
+  CVI_RT_HANDLE ctx;
+  bmk_ctx_t *bmk;
+  test_init(&ctx, &bmk);
+
+  u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]);
+  for (u32 i = 0; i < nr_cases; i++)
+    test_one_case(&ctx, bmk, &g_cases[i]);
+
+  test_exit(&ctx);
+  return 0;
+}
diff --git a/cviruntime/test/1880v2/test_1880v2_tdma_matrix_vlc_decompress_compress.cpp b/cviruntime/test/1880v2/test_1880v2_tdma_matrix_vlc_decompress_compress.cpp
new file mode 100644
index 000000000..2703906d3
--- /dev/null
+++ b/cviruntime/test/1880v2/test_1880v2_tdma_matrix_vlc_decompress_compress.cpp
@@ -0,0 +1,189 @@
+#include "1880v2_test_util.h"
+
+typedef bmk1880v2_tdma_tg2l_matrix_copy_decompressed_param_t decompress_param_t;
+typedef bmk1880v2_tdma_l2tg_matrix_copy_compressed_param_t compress_param_t;
+
+typedef struct{
+  decompress_param_t dec_p;
+  compress_param_t com_p;
+}
param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => %s\n", + tag, + p->dec_p.dst->shape.n, p->dec_p.dst->shape.c, p->dec_p.dst->shape.w, p->dec_p.dst->shape.col, + (p->dec_p.dst->fmt == FMT_I8)? "signed": "unsigned"); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + mg_shape_t src_shape; + ml_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 1 }, + { 0, 1, 1, 1 }, + }, + { + { 0, 2 }, + { 0, 1, 2, 2 }, + }, + { + { 0, 2 }, + { 0, 2, 1, 2 }, + }, + { + { 0, 7 }, + { 0, 1, 7, 7 }, + }, +#ifndef ENABEL_SIMPLE_BMK1880V2_VLC_TEST + { + { 0, 7 }, + { 0, 2, 4, 7 }, + }, + { + { 0, 7 }, + { 0, 7, 1, 7 }, + }, + { + { 0, 17 }, + { 0, 1, 17, 17 }, + }, + { + { 0, 17 }, + { 0, 3, 7, 17 }, + }, + { + { 0, 17 }, + { 0, 17, 1, 17 }, + }, + { + { 0, 60 }, + { 0, 1, 60, 60 }, + }, + { + { 0, 60 }, + { 0, 30, 2, 60 }, + }, + { + { 0, 60 }, + { 0, 60, 1, 60 }, + } +#endif /* ifndef ENABEL_SIMPLE_BMK1880V2_VLC_TEST*/ +}; + +static void test_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p, u8 *src_data, + CommandInfo* cmd_info) +{ + print_param(stderr, p); + u64 size = ml_shape_size(&p->dec_p.dst->shape); + int is_signed = (p->dec_p.dst->fmt == FMT_I8); + + u8 *gmem_data; + size_t bs_size; + size_t data_type = (p->dec_p.dst->fmt == FMT_BF16) ? 1 : 0; + + // command info + gmem_data = vlc_compress(src_data, size, is_signed, data_type, &bs_size, cmd_info, NULL); + + //1. send compressed one to gaddr and decompress from gaddr to local + put_compressed_mg_gmem(ctx, p->dec_p.src, gmem_data, bs_size); + bmk1880v2_tdma_g2l_matrix_copy_decompressed(bmk, &p->dec_p); + test_submit(ctx); + + //2. decompress from sram + bmk1880v2_tdma_l2g_matrix_copy_compressed(bmk, &p->com_p); + test_submit(ctx); + + //3. get final data + size_t bs_buf_size = get_out_bs_buf_size(size, data_type); + u8 *dst_data = get_compressed_mg_gmem(ctx, p->com_p.dst, bs_buf_size); + + for (u64 i = 0; i < bs_size ; i++) { + if (dst_data[i] != gmem_data[i]) { + fprintf(stderr, "vlc compress comparing failed at dst[%" PRIx64 "], got %d, exp %d\n", + i, dst_data[i], gmem_data[i]); + exit(-1); + } + } + + free(dst_data); + free(gmem_data); +} + +static void destroy_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_compressed_mg_gmem(ctx, p->dec_p.src); + free_compressed_mg_gmem(ctx, p->com_p.dst); + free_ml(bmk, p->dec_p.dst); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + fmt_t fmts[] = { FMT_I8, FMT_U8 }; + u8 fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + + for (u32 row = 1; row < 13; row += 2) { + c->src_shape.row = row; + c->dst_shape.n = row; + for (int dst_align = 0; dst_align < 2; dst_align++) { + for (u8 fmt_i = 0; fmt_i < fmts_sz; fmt_i++) { + fmt_t fmt = fmts[fmt_i]; + param_t p; + memset(&p, 0, sizeof(p)); + //put compressed data to gaddr ->decompress to local -> compress to gaddr + + int is_signed = (fmt == FMT_I8); + + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + cmd_info.signedness = is_signed; + + // src_shape, fmt, &cmd_info); + p.dec_p.dst = alloc_ml(bmk, c->dst_shape, fmt, dst_align); + + u64 size = ml_shape_size(&p.dec_p.dst->shape); + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + vlc_init_testdata(src_data, size, fmt == FMT_I8, fmt == FMT_BF16); + + assert(p.dec_p.dst); + + //2. 
alloc compress + p.com_p.src = p.dec_p.dst; //alloc_tl(bmk, c->lmem_shape, fmt, align); + p.com_p.dst = alloc_vlc_compressed_mg_gmem(ctx, c->src_shape, fmt, &cmd_info); + + //3. test: the seqence like below: + //3.1 put compressed data to gaddr + //3.2 decompress to local + //3.3 compress to gaddr + //printf ("row %u is_align %d fmt %d\n", row, dst_align, fmt); + test_param_g2l(ctx, bmk, &p, src_data, &cmd_info); + destroy_param_g2l(ctx, bmk, &p); + free(src_data); + } + } + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tdma_tensor_vlc_decompress_compress.cpp b/cviruntime/test/1880v2/test_1880v2_tdma_tensor_vlc_decompress_compress.cpp new file mode 100644 index 000000000..699f17300 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tdma_tensor_vlc_decompress_compress.cpp @@ -0,0 +1,166 @@ +#include "1880v2_test_util.h" + +typedef bmk1880v2_tdma_tg2l_tensor_copy_decompressed_param_t decompress_param_t; +typedef bmk1880v2_tdma_l2tg_tensor_copy_compressed_param_t compress_param_t; + +typedef struct{ + decompress_param_t dec_p; + compress_param_t com_p; +} param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => %d-bit %s\n", + tag, + p->dec_p.dst->shape.n, p->dec_p.dst->shape.c, p->dec_p.dst->shape.h, p->dec_p.dst->shape.w, + p->dec_p.src->bit_length, + (p->dec_p.dst->fmt == FMT_I8)? "signed": "unsigned"); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tl_shape_t lmem_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 17, 13 } + }, + { + { 3, 39, 17, 23 } + }, + { + { 5, 39, 17, 23 } + }, + { + { 20, 35, 2, 2 } + }, +#ifndef ENABEL_SIMPLE_BMK1880V2_VLC_TEST + { + { 1, 1, 1, 1 } + }, + { + { 1, 1, 1, 2 } + }, + { + { 1, 1, 7, 2 } + }, + { + { 1, 1, 10, 60 } + }, + { + { 1, 2, 1, 1 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 3, 16, 1, 1 } + }, + { + { 3, 36, 16, 20 } + }, +#endif /* ifndef ENABEL_SIMPLE_BMK1880V2_VLC_TEST*/ +}; + +static void test_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p, compressed_tg_t* dst) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->dec_p.dst->shape); + int is_signed = (p->dec_p.dst->fmt == FMT_I8); + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + vlc_init_testdata(src_data, size, p->dec_p.dst->fmt == FMT_I8, p->dec_p.dst->fmt == FMT_BF16); + + u8 *gmem_data; + size_t total_size; + size_t data_type = (p->dec_p.dst->fmt == FMT_BF16) ? 
1 : 0;
+  size_t in_size = size;
+  size_t bs_buf_size = get_out_bs_buf_size(size, data_type);
+  gmem_data = (uint8_t *)malloc(bs_buf_size * sizeof(uint8_t));
+
+  // command info
+  CommandInfo cmd_info;
+  memset(&cmd_info, 0, sizeof(CommandInfo));
+  cmd_info.signedness = is_signed;
+
+  // compress the reference stream, stage it in global memory, then
+  // decompress it into local memory
+  bm_vlc_enc_int8(src_data, in_size, gmem_data, &total_size, &cmd_info);
+  put_compressed_tg_gmem(ctx, p->dec_p.src, gmem_data, total_size);
+  bmk1880v2_tdma_g2l_tensor_copy_decompressed(bmk, &p->dec_p);
+  test_submit(ctx);
+
+  dst->zero_guard_en = cmd_info.zero_guard_en;
+  dst->bias0 = cmd_info.bias0;
+  dst->bias1 = cmd_info.bias1;
+  p->com_p.dst = dst;
+  bmk1880v2_tdma_l2g_tensor_copy_compressed(bmk, &p->com_p);
+  test_submit(ctx);
+
+  u8 *dst_data = get_compressed_tg_gmem(ctx, p->com_p.dst);
+
+  for (u64 i = 0; i < total_size; i++) {
+    if (dst_data[i] != gmem_data[i]) {
+      fprintf(stderr, "vlc compress comparing failed at dst[%" PRIx64 "], got %d, exp %d\n",
+              i, dst_data[i], gmem_data[i]);
+      exit(-1);
+    }
+  }
+
+  free(src_data);
+  free(dst_data);
+  free(gmem_data);
+}
+
+static void destroy_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p)
+{
+  free_compressed_tg_gmem(ctx, p->dec_p.src);
+  free_compressed_tg_gmem(ctx, p->com_p.dst);
+  free_tl(bmk, p->dec_p.dst);
+}
+
+static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c)
+{
+  fmt_t fmts[2] = { FMT_I8, FMT_U8 };
+
+  for (int align = 0; align < 2; align++) {
+    for (u8 fmt_i = 0; fmt_i < 2; fmt_i++) {
+      fmt_t fmt = fmts[fmt_i];
+
+      param_t p;
+      memset(&p, 0, sizeof(p));
+      p.dec_p.src = alloc_vlc_compressed_tg_gmem(ctx,
+          &c->lmem_shape, fmt);
+      p.dec_p.dst = alloc_tl(bmk, c->lmem_shape, fmt, align);
+      assert(p.dec_p.dst);
+
+      p.com_p.src = p.dec_p.dst; //alloc_tl(bmk, c->lmem_shape, fmt, align);
+      assert(p.com_p.src);
+      compressed_tg_t* dst = alloc_vlc_compressed_tg_gmem(ctx,
+          &c->lmem_shape, fmt);
+
+      test_param_g2l(ctx, bmk, &p, dst);
+      destroy_param_g2l(ctx, bmk, &p);
+    }
+  }
+}
+
+int main()
+{
+  CVI_RT_HANDLE ctx;
+  bmk_ctx_t *bmk;
+  test_init(&ctx, &bmk);
+
+  u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]);
+  for (u32 i = 0; i < nr_cases; i++)
+    test_one_case(&ctx, bmk, &g_cases[i]);
+
+  test_exit(&ctx);
+  return 0;
+}
diff --git a/cviruntime/test/1880v2/test_1880v2_tdma_tg2l_general_copy.cpp b/cviruntime/test/1880v2/test_1880v2_tdma_tg2l_general_copy.cpp
new file mode 100644
index 000000000..295457447
--- /dev/null
+++ b/cviruntime/test/1880v2/test_1880v2_tdma_tg2l_general_copy.cpp
@@ -0,0 +1,92 @@
+#include "1880v2_test_util.h"
+
+typedef bmk1880v2_tdma_tg2l_general_copy_param_t param_t;
+
+static void __print_param(const char *tag, FILE *f, param_t *p)
+{
+  fprintf(
+      f, "%s: %u bytes from %u:%lx to %x\n", tag,
+      p->bytes, p->src_base_reg_index, p->src_address, p->dst_address);
+}
+
+#define print_param(f, p) __print_param(__func__, f, p)
+
+typedef param_t case_t;
+
+static case_t g_cases[] = {
+  { 0, 0, 0, 1 },
+  { 0, 0, 0, 39 },
+  { 0, 0, 0, 4096 },
+  { 0, 1, 0, 1 },
+  { 0, 1, 0, 39 },
+  { 0, 1, 0, 4096 },
+  { 0, 1, 100, 1 },
+  { 0, 1, 200, 39 },
+  { 0, 1, 4096, 4096 },
+  { 0, 257, 100, 1 },
+  { 0, 349, 200, 39 },
+  { 0, 3356, 4096, 4096 },
+};
+
+static void tg2l_general_copy_ref(param_t *p, u8 ref_data[], u8 src_data[])
+{
+  for (u32 i = 0; i < p->bytes; i++)
+    ref_data[i] = src_data[i];
+}
+
+static void test_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p)
+{
+  print_param(stderr, p);
+  u64 size = p->bytes;
+
+  u8 *src_data = (u8 *)malloc(sizeof(u8) * size);
+  for (u64 i = 0; i < size; i++)
+    src_data[i] = 200 + i;
+
+#if 1
+  put_bytes_g2l(ctx, bmk, p->dst_address, size, src_data);
+
+#else
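+  /* disabled reference path, kept for illustration: stage the bytes in
+   * global memory and let the TDMA general-copy engine pull them into
+   * local memory, instead of using the put_bytes_g2l helper above */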
put_bytes_gmem(ctx, p->src_address, size, src_data); + bmk1880v2_tdma_g2l_general_copy(bmk, p); + test_submit(ctx); +#endif + + u8 *dst_data = get_bytes_l2g(ctx, bmk, p->dst_address, size); + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + tg2l_general_copy_ref(p, ref_data, src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); + free(ref_data); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + param_t *p = c; + + test_param_g2l(ctx, bmk, p); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tdma_tg2l_matrix_copy.cpp b/cviruntime/test/1880v2/test_1880v2_tdma_tg2l_matrix_copy.cpp new file mode 100644 index 000000000..48edbb16f --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tdma_tg2l_matrix_copy.cpp @@ -0,0 +1,135 @@ +#include "1880v2_test_util.h" + +typedef bmk1880v2_tdma_tg2l_matrix_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.row, p->src->shape.col, + p->dst->shape.n, p->dst->shape.c, + p->dst->shape.w, p->dst->shape.col); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + mg_shape_t src_shape; + ml_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 1 }, + { 0, 1, 1, 1 }, + }, { + { 0, 2 }, + { 0, 1, 2, 2 }, + }, { + { 0, 2 }, + { 0, 2, 1, 2 }, + }, { + { 0, 7 }, + { 0, 1, 7, 7 }, + }, { + { 0, 7 }, + { 0, 2, 4, 7 }, + }, { + { 0, 7 }, + { 0, 7, 1, 7 }, + }, { + { 0, 17 }, + { 0, 1, 17, 17 }, + }, { + { 0, 17 }, + { 0, 3, 7, 17 }, + }, { + { 0, 17 }, + { 0, 17, 1, 17 }, + }, { + { 0, 60 }, + { 0, 1, 60, 60 }, + }, { + { 0, 60 }, + { 0, 30, 2, 60 }, + }, { + { 0, 60 }, + { 0, 60, 1, 60 }, + } +}; + +static void tg2l_matrix_copy_ref(param_t *p, u8 ref_data[], u8 src_data[]) +{ + u64 size = ml_shape_size(&p->dst->shape); + + for (u64 i = 0; i < size; i++) + ref_data[i] = src_data[i]; +} + +static void test_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = ml_shape_size(&p->dst->shape); + + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + + put_mg_gmem(ctx, p->src, src_data); + bmk1880v2_tdma_g2l_matrix_copy(bmk, p); + test_submit(ctx); + u8 *dst_data = get_matrix_l2g(ctx, bmk, p->dst); + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + tg2l_matrix_copy_ref(p, ref_data, src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); + free(ref_data); +} + +static void destroy_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_mg_gmem(ctx, p->src); + free_ml(bmk, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + for (u32 row = 1; row < 13; row += 2) { + c->src_shape.row = row; + c->dst_shape.n = row; + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + memset(&p, 0, 
sizeof(p)); + + p.src = alloc_mg_gmem(ctx, c->src_shape); + p.dst = alloc_ml(bmk, c->dst_shape, dst_align); + test_param_g2l(ctx, bmk, &p); + destroy_param_g2l(ctx, bmk, &p); + } + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tdma_tg2l_matrix_copy_row_col_transposed.cpp b/cviruntime/test/1880v2/test_1880v2_tdma_tg2l_matrix_copy_row_col_transposed.cpp new file mode 100644 index 000000000..17348005b --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tdma_tg2l_matrix_copy_row_col_transposed.cpp @@ -0,0 +1,383 @@ +#include "1880v2_test_util.h" + +typedef bmk1880v2_tdma_tg2l_matrix_copy_row_col_transposed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.row, p->src->shape.col, + p->dst->shape.n, p->dst->shape.c, + p->dst->shape.w, p->dst->shape.col); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + mg_shape_t src_shape; + ml_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 2 }, + { 2, 1, 1, 1 }, + }, { + { 1, 7 }, + { 7, 1, 1, 1 }, + }, { + { 1, 17 }, + { 17, 1, 1, 1 }, + }, { + { 1, 60 }, + { 60, 1, 1, 1 }, + }, { + { 1, 139 }, + { 139, 1, 1, 1 }, + }, { + { 2, 1 }, + { 1, 1, 2, 2 }, + }, { + { 2, 1 }, + { 1, 2, 1, 2 }, + }, { + { 2, 2 }, + { 2, 1, 2, 2 }, + }, { + { 2, 2 }, + { 2, 2, 1, 2 }, + }, { + { 2, 7 }, + { 7, 1, 2, 2 }, + }, { + { 2, 7 }, + { 7, 2, 1, 2 }, + }, { + { 2, 17 }, + { 17, 1, 2, 2 }, + }, { + { 2, 17 }, + { 17, 2, 1, 2 }, + }, { + { 2, 60 }, + { 60, 1, 2, 2 }, + }, { + { 2, 60 }, + { 60, 2, 1, 2 }, + }, { + { 2, 139 }, + { 139, 1, 2, 2 }, + }, { + { 2, 139 }, + { 139, 2, 1, 2 }, + }, { + { 7, 1 }, + { 1, 1, 7, 7 }, + }, { + { 7, 1 }, + { 1, 2, 4, 7 }, + }, { + { 7, 1 }, + { 1, 2, 5, 7 }, + }, { + { 7, 1 }, + { 1, 2, 6, 7 }, + }, { + { 7, 1 }, + { 1, 3, 3, 7 }, + }, { + { 7, 1 }, + { 1, 4, 2, 7 }, + }, { + { 7, 1 }, + { 1, 7, 1, 7 }, + }, { + { 7, 2 }, + { 2, 1, 7, 7 }, + }, { + { 7, 2 }, + { 2, 2, 4, 7 }, + }, { + { 7, 2 }, + { 2, 2, 5, 7 }, + }, { + { 7, 2 }, + { 2, 2, 6, 7 }, + }, { + { 7, 2 }, + { 2, 3, 3, 7 }, + }, { + { 7, 2 }, + { 2, 4, 2, 7 }, + }, { + { 7, 2 }, + { 2, 7, 1, 7 }, + }, { + { 7, 7 }, + { 7, 1, 7, 7 }, + }, { + { 7, 7 }, + { 7, 3, 3, 7 }, + }, { + { 7, 7 }, + { 7, 4, 2, 7 }, + }, { + { 7, 7 }, + { 7, 7, 1, 7 }, + }, { + { 7, 17 }, + { 17, 1, 7, 7 }, + }, { + { 7, 17 }, + { 17, 4, 2, 7 }, + }, { + { 7, 17 }, + { 17, 7, 1, 7 }, + }, { + { 7, 60 }, + { 60, 1, 7, 7 }, + }, { + { 7, 60 }, + { 60, 3, 3, 7 }, + }, { + { 7, 60 }, + { 60, 7, 1, 7 }, + }, { + { 7, 139 }, + { 139, 1, 7, 7 }, + }, { + { 7, 139 }, + { 139, 3, 3, 7 }, + }, { + { 7, 139 }, + { 139, 7, 1, 7 }, + }, { + { 43, 1 }, + { 1, 1, 43, 43 }, + }, { + { 43, 1 }, + { 1, 2, 22, 43 }, + }, { + { 43, 1 }, + { 1, 2, 25, 43 }, + }, { + { 43, 1 }, + { 1, 2, 37, 43 }, + }, { + { 43, 1 }, + { 1, 2, 41, 43 }, + }, { + { 43, 1 }, + { 1, 5, 9, 43 }, + }, { + { 43, 1 }, + { 1, 5, 10, 43 }, + }, { + { 43, 1 }, + { 1, 9, 5, 43 }, + }, { + { 43, 1 }, + { 1, 22, 2, 43 }, + }, { + { 43, 1 }, + { 1, 43, 1, 43 }, + }, { + { 43, 2 }, + { 2, 1, 43, 43 }, + }, { + { 43, 2 }, + { 2, 2, 27, 43 }, + }, { + { 43, 2 }, + { 2, 22, 2, 43 }, + }, { + { 43, 2 }, 
+ { 2, 43, 1, 43 }, + }, { + { 57, 7 }, + { 7, 1, 57, 57 }, + }, { + { 57, 7 }, + { 7, 2, 37, 57 }, + }, { + { 57, 7 }, + { 7, 2, 43, 57 }, + }, { + { 57, 7 }, + { 7, 2, 55, 57 }, + }, { + { 57, 7 }, + { 7, 2, 56, 57 }, + }, { + { 57, 7 }, + { 7, 7, 9, 57 }, + }, { + { 57, 7 }, + { 7, 8, 8, 57 }, + }, { + { 57, 7 }, + { 7, 29, 2, 57 }, + }, { + { 57, 7 }, + { 7, 57, 1, 57 }, + }, { + { 67, 17 }, + { 17, 1, 67, 67 }, + }, { + { 67, 17 }, + { 17, 2, 34, 67 }, + }, { + { 67, 17 }, + { 17, 2, 49, 67 }, + }, { + { 67, 17 }, + { 17, 2, 66, 67 }, + }, { + { 67, 17 }, + { 17, 6, 12, 67 }, + }, { + { 67, 17 }, + { 17, 6, 13, 67 }, + }, { + { 67, 17 }, + { 17, 17, 4, 67 }, + }, { + { 67, 17 }, + { 17, 34, 2, 67 }, + }, { + { 67, 17 }, + { 17, 67, 1, 67 }, + }, { + { 129, 139 }, + { 139, 1, 129, 129 }, + }, { + { 129, 139 }, + { 139, 2, 65, 129 }, + }, { + { 129, 139 }, + { 139, 2, 80, 129 }, + }, { + { 129, 139 }, + { 139, 2, 120, 129 }, + }, { + { 129, 139 }, + { 139, 2, 128, 129 }, + }, { + { 129, 139 }, + { 139, 3, 43, 129 }, + }, { + { 129, 139 }, + { 139, 3, 47, 129 }, + }, { + { 129, 139 }, + { 139, 3, 59, 129 }, + }, { + { 129, 139 }, + { 139, 3, 64, 129 }, + }, { + { 129, 139 }, + { 139, 7, 19, 129 }, + }, { + { 129, 139 }, + { 139, 7, 20, 129 }, + }, { + { 129, 139 }, + { 139, 7, 21, 129 }, + }, { + { 129, 139 }, + { 139, 43, 3, 129 }, + }, { + { 129, 139 }, + { 139, 65, 2, 129 }, + }, { + { 129, 139 }, + { 139, 129, 1, 129 }, + } +}; + +static void tg2l_matrix_copy_row_col_transposed_ref( + param_t *p, u8 ref_data[], u8 src_data[]) +{ + u64 row = p->src->shape.row; + u64 col = p->src->shape.col; + + for (u64 ri = 0; ri < row; ri++) { + for (u64 ci = 0; ci < col; ci++) { + u64 src_i = ri * col + ci; + u64 dst_i = ci * row + ri; + ref_data[dst_i] = src_data[src_i]; + } + } +} + +static void test_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = ml_shape_size(&p->dst->shape); + + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + + put_mg_gmem(ctx, p->src, src_data); + bmk1880v2_tdma_g2l_matrix_copy_row_col_transposed(bmk, p); + test_submit(ctx); + u8 *dst_data = get_matrix_l2g(ctx, bmk, p->dst); + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + tg2l_matrix_copy_row_col_transposed_ref(p, ref_data, src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); + free(ref_data); +} + + +static void destroy_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_mg_gmem(ctx, p->src); + free_ml(bmk, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + param_t p; + /* + * Matrix transpose must be n/c stride alignment + * for TDMA limitation + */ + int dst_align = 1; + + memset(&p, 0, sizeof(p)); + p.src = alloc_mg_gmem(ctx, c->src_shape); + p.dst = alloc_ml(bmk, c->dst_shape, dst_align); + test_param_g2l(ctx, bmk, &p); + destroy_param_g2l(ctx, bmk, &p); + +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tdma_tg2l_matrix_vlc_copy_decompressed.cpp 
b/cviruntime/test/1880v2/test_1880v2_tdma_tg2l_matrix_vlc_copy_decompressed.cpp new file mode 100644 index 000000000..c98277cec --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tdma_tg2l_matrix_vlc_copy_decompressed.cpp @@ -0,0 +1,181 @@ +#include "1880v2_test_util.h" + +typedef bmk1880v2_tdma_tg2l_matrix_copy_decompressed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->m.shape.row, p->src->m.shape.col, + p->dst->shape.n, p->dst->shape.c, + p->dst->shape.w, p->dst->shape.col); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + mg_shape_t src_shape; + ml_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 0, 17 }, + { 0, 3, 7, 17 }, + }, + { + { 0, 7 }, + { 0, 7, 1, 7 }, + }, + { + { 0, 60 }, + { 0, 30, 2, 60 }, + }, +#ifndef ENABEL_SIMPLE_BMK1880V2_VLC_TEST + { + { 0, 1 }, + { 0, 1, 1, 1 }, + }, + { + { 0, 2 }, + { 0, 1, 2, 2 }, + }, + { + { 0, 2 }, + { 0, 2, 1, 2 }, + }, + { + { 0, 7 }, + { 0, 1, 7, 7 }, + }, + { + { 0, 7 }, + { 0, 2, 4, 7 }, + }, + { + { 0, 17 }, + { 0, 1, 17, 17 }, + }, + { + { 0, 17 }, + { 0, 17, 1, 17 }, + }, + { + { 0, 60 }, + { 0, 1, 60, 60 }, + }, + { + { 0, 60 }, + { 0, 60, 1, 60 }, + } +#endif /* ifndef ENABEL_SIMPLE_BMK1880V2_VLC_TEST*/ +}; + +static void tg2l_matrix_copy_ref(param_t *p, u8 ref_data[], u8 src_data[]) +{ + u64 size = ml_shape_size(&p->dst->shape); + + for (u64 i = 0; i < size; i++) + ref_data[i] = src_data[i]; +} + +static void test_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p, u8 *src_data, CommandInfo* cmd_info) +{ + print_param(stderr, p); + + u64 in_size = ml_shape_size(&p->dst->shape); + size_t bs_size = 0; + int is_signed = (p->dst->fmt == FMT_I8); + size_t data_type = (p->dst->fmt == FMT_BF16) ? 1 : 0; + + u8 *bsbuf = vlc_compress(src_data, in_size, is_signed, data_type, &bs_size, cmd_info, NULL); + + put_compressed_mg_gmem(ctx, p->src, bsbuf, bs_size); + free(bsbuf); + bmk1880v2_tdma_g2l_matrix_copy_decompressed(bmk, p); + test_submit(ctx); + u8 *dst_data = get_matrix_l2g(ctx, bmk, p->dst); + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * in_size); + tg2l_matrix_copy_ref(p, ref_data, src_data); + + for (u64 i = 0; i < in_size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(dst_data); + free(ref_data); +} + +static void destroy_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_compressed_mg_gmem(ctx, p->src); + free_ml(bmk, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + fmt_t fmts[] = { FMT_I8, FMT_U8 }; + u8 fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + + for (u32 row = 1; row < 13; row += 2) { + c->src_shape.row = row; + c->dst_shape.n = row; + for (int mode = 0; mode < VLC_CMP_MODE_MAX; mode++) { + for (int dst_align = 0; dst_align < 2; dst_align++) { + for (u8 fmt_i = 0; fmt_i < fmts_sz; fmt_i++) { + fmt_t fmt = fmts[fmt_i]; + param_t p; + int is_signed = (fmt == FMT_I8); + size_t data_type = (fmt == FMT_BF16) ? 
1 : 0; + CommandInfo cmd_info; + + memset(&cmd_info, 0, sizeof(CommandInfo)); + cmd_info.signedness = is_signed; + + memset(&p, 0, sizeof(p)); + + /* The two allocation calls below were garbled into bare comments in the + * original diff; they are reconstructed here by analogy with the tensor + * VLC test in this directory, so the helper names and exact signatures + * are assumptions. */ + p.dst = alloc_ml(bmk, c->dst_shape, fmt, dst_align); + u64 in_size = ml_shape_size(&p.dst->shape); + + u8 *src_data = (u8 *)malloc(sizeof(u8) * in_size); + vlc_init_testdata(src_data, in_size, fmt == FMT_I8, fmt == FMT_BF16); + + /* in compiler mode, estimate bias0/bias1 from the sample data first */ + if (mode == VLC_CMP_MODE_COMPILER) + bm_vlc_est_weight_bias(src_data, in_size, (bool)is_signed, (bool)data_type, &cmd_info); + + p.src = alloc_vlc_compressed_mg_gmem(ctx, c->src_shape, fmt, &cmd_info); + + //printf ("row %u mode %d is_align %d fmt %d\n", row, mode, dst_align, fmt); + test_param_g2l(ctx, bmk, &p, src_data, &cmd_info); + + free(src_data); + destroy_param_g2l(ctx, bmk, &p); + } + } + } + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tdma_tg2l_tensor_copy.cpp b/cviruntime/test/1880v2/test_1880v2_tdma_tg2l_tensor_copy.cpp new file mode 100644 index 000000000..4816496a7 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tdma_tg2l_tensor_copy.cpp @@ -0,0 +1,133 @@ +#include "1880v2_test_util.h" + +typedef bmk1880v2_tdma_tg2l_tensor_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tg_shape_t src_shape; + tl_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 2, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 2, 7 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 13, 17 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 120, 5 }, + }, { + { 1, 2, 1, 1 }, + { 1, 1, 1, 2 }, + }, { + { 2, 17, 1, 4 }, + { 2, 1, 4, 17 }, + }, { + { 2, 17, 1, 4 }, + { 2, 1, 17, 4 }, + }, { + { 3, 16, 1, 1 }, + { 3, 1, 2, 8 }, + }, { + { 3, 39, 17, 23 }, + { 3, 17, 39, 23 }, + }, { + { 3, 36, 16, 20 }, + { 3, 18, 1, 640 }, + }, { + { 5, 39, 17, 23 }, + { 5, 17, 39, 23 }, + }, { + { 20, 35, 2, 2 }, + { 20, 7, 10, 2 }, + } +}; + +static void tg2l_tensor_copy_ref(param_t *p, u8 ref_data[], u8 src_data[]) +{ + u64 size = tl_shape_size(&p->dst->shape); + + for (u64 i = 0; i < size; i++) + ref_data[i] = src_data[i]; +} + +static void test_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->dst->shape); + + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + + put_tg_gmem(ctx, p->src, src_data); + bmk1880v2_tdma_g2l_tensor_copy(bmk, p); + test_submit(ctx); + u8 *dst_data = get_tensor_l2g(ctx, bmk, p->dst); + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + tg2l_tensor_copy_ref(p, ref_data, src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); + free(ref_data); +} + +static void destroy_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_tg_gmem(ctx, p->src); + free_tl(bmk, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + + memset(&p, 0, sizeof(p)); + p.src = alloc_tg_gmem(ctx, c->src_shape, FMT_I8); + p.dst = alloc_tl(bmk, c->dst_shape, FMT_I8, dst_align); + 
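// dst_align toggles EU-aligned stride allocation in local memory; the + // copied data must come back bit-identical for both layouts. + 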
test_param_g2l(ctx, bmk, &p); + destroy_param_g2l(ctx, bmk, &p); + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tdma_tg2l_tensor_copy_chw_rotated.cpp b/cviruntime/test/1880v2/test_1880v2_tdma_tg2l_tensor_copy_chw_rotated.cpp new file mode 100644 index 000000000..3c755acc4 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tdma_tg2l_tensor_copy_chw_rotated.cpp @@ -0,0 +1,179 @@ +#include "1880v2_test_util.h" + +typedef bmk1880v2_tdma_tg2l_tensor_copy_chw_rotated_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.h, p->src->shape.w, p->src->shape.c, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tg_shape_t src_shape; + tl_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 3, 1, 1 }, // nchw for neuron + { 1, 3, 1, 1 }, // nchw for neuron + }, { + { 1, 4, 1, 1 }, + { 1, 4, 1, 1 }, + }, { + { 1, 3, 1, 7 }, + { 1, 3, 1, 7 }, + }, { + { 1, 4, 1, 7 }, + { 1, 4, 1, 7 }, + }, { + { 1, 3, 1, 17 }, + { 1, 3, 1, 17 }, + }, { + { 1, 4, 1, 17 }, + { 1, 4, 1, 17 }, + }, { + { 1, 3, 2, 1 }, + { 1, 3, 2, 1 }, + }, { + { 1, 4, 2, 1 }, + { 1, 4, 2, 1 }, + }, { + { 2, 3, 17, 1 }, + { 2, 3, 17, 1 }, + }, { + { 2, 4, 17, 1 }, + { 2, 4, 17, 1 }, + }, { + { 2, 3, 17, 3 }, + { 2, 3, 17, 3 }, + }, { + { 2, 4, 17, 3 }, + { 2, 4, 17, 3 }, + }, { + { 3, 3, 16, 7 }, + { 3, 3, 16, 7 }, + }, { + { 3, 4, 16, 7 }, + { 3, 4, 16, 7 }, + }, { + { 3, 3, 39, 17 }, + { 3, 3, 39, 17 }, + }, { + { 3, 4, 39, 17 }, + { 3, 4, 39, 17 }, + }, { + { 3, 3, 36, 16 }, + { 3, 3, 36, 16 }, + }, { + { 3, 4, 36, 16 }, + { 3, 4, 36, 16 }, + }, { + { 5, 3, 39, 17 }, + { 5, 3, 39, 17 }, + }, { + { 5, 4, 39, 17 }, + { 5, 4, 39, 17 }, + }, { + { 20, 3, 35, 2 }, + { 20, 3, 35, 2 }, + }, { + { 20, 4, 35, 2 }, + { 20, 4, 35, 2 }, + }, { + { 20, 3, 35, 3 }, + { 20, 3, 35, 3 }, + }, { + { 20, 4, 35, 3 }, + { 20, 4, 35, 3 }, + } +}; + +static void tg2l_tensor_copy_chw_rotated_ref( + param_t *p, u8 ref_data[], u8 src_data[]) +{ + tg_shape_t s = p->src->shape; + // change nhwc -> nchw by HW design automatically + u32 n = s.n; + u32 c = s.h; + u32 h = s.w; + u32 w = s.c; + for (u32 ni = 0; ni < n; ni++) { + for (u32 ci = 0; ci < c; ci++) { + for (u32 hi = 0; hi < h; hi++) { + for (u32 wi = 0; wi < w; wi++) { + u64 src_i = ni * c * h * w + ci * h * w + hi * w + wi; + u64 dst_i = ni * w * c * h + wi * c * h + ci * h + hi; + ref_data[dst_i] = src_data[src_i]; + } + } + } + } +} + +static void test_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tg_shape_size(&p->src->shape); + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + + put_tg_gmem(ctx, p->src, src_data); + bmk1880v2_tdma_g2l_tensor_copy_chw_rotated(bmk, p); + test_submit(ctx); + u8 *dst_data = get_tensor_l2g(ctx, bmk, p->dst); + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + tg2l_tensor_copy_chw_rotated_ref(p, ref_data, src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, 
exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); + free(ref_data); +} + +static void destroy_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_tg_gmem(ctx, p->src); + free_tl(bmk, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + + param_t p; + + memset(&p, 0, sizeof(p)); + p.src = alloc_tg_gmem(ctx, c->src_shape, FMT_I8); + p.dst = alloc_tl(bmk, c->dst_shape, FMT_I8, 1); + test_param_g2l(ctx, bmk, &p); + destroy_param_g2l(ctx, bmk, &p); + +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tdma_tg2l_tensor_copy_nc_transposed.cpp b/cviruntime/test/1880v2/test_1880v2_tdma_tg2l_tensor_copy_nc_transposed.cpp new file mode 100644 index 000000000..8acda83ec --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tdma_tg2l_tensor_copy_nc_transposed.cpp @@ -0,0 +1,227 @@ +#include "1880v2_test_util.h" + +typedef bmk1880v2_tdma_tg2l_tensor_copy_nc_transposed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tg_shape_t src_shape; + tl_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 1, 1 }, + { 1, 1, 1, 1 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 1, 2 }, + }, { + { 1, 1, 1, 2 }, + { 1, 1, 2, 1 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 7, 2 }, + }, { + { 1, 1, 7, 2 }, + { 1, 1, 2, 7 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 17, 13 }, + }, { + { 1, 1, 17, 13 }, + { 1, 1, 13, 17 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 10, 60 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 2, 300 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 3, 200 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 4, 150 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 5, 120 }, + }, { + { 1, 1, 10, 60 }, + { 1, 1, 60, 10 }, + }, { + { 1, 1, 120, 5 }, + { 1, 1, 120, 5 }, + }, { + { 1, 2, 1, 1 }, + { 2, 1, 1, 1 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 1, 4 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 2, 2 }, + }, { + { 2, 17, 1, 4 }, + { 17, 2, 4, 1 }, + }, { + { 17, 2, 2, 2 }, + { 2, 17, 2, 2 }, + }, { + { 17, 2, 4, 1 }, + { 2, 17, 4, 1 }, + }, { + { 3, 16, 1, 1 }, + { 16, 3, 1, 1 }, + }, { + { 3, 39, 23, 17 }, + { 39, 3, 23, 17 }, + }, { + { 3, 39, 17, 23 }, + { 39, 3, 17, 23 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 16, 20 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 2, 160 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 4, 80 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 8, 40 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 20, 16 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 32, 10 }, + }, { + { 3, 36, 16, 20 }, + { 36, 3, 64, 5 }, + }, { + { 5, 39, 17, 23 }, + { 39, 5, 17, 23 }, + }, { + { 20, 35, 2, 2 }, + { 35, 20, 2, 2 }, + }, { + { 35, 20, 2, 2 }, + { 20, 35, 2, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 160, 2 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 2, 160 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 4, 80 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 8, 40 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 10, 32 }, + }, { + { 36, 3, 160, 2 }, + { 3, 36, 20, 16 }, + }, { + { 39, 5, 23, 17 }, + { 5, 39, 
23, 17 }, + } +}; + +static void tg2l_tensor_copy_nc_transposed_ref( + param_t *p, u8 ref_data[], u8 src_data[]) +{ + tg_shape_t s = p->src->shape; + u32 n = s.n; + u32 c = s.c; + u32 hw = s.h * s.w; + + for (u32 ni = 0; ni < n; ni++) { + for (u32 ci = 0; ci < c; ci++) { + for (u32 hwi = 0; hwi < hw; hwi++) { + u32 src_i = ni * c * hw + ci * hw + hwi; + u32 dst_i = ci * n * hw + ni * hw + hwi; + ref_data[dst_i] = src_data[src_i]; + } + } + } +} + +static void test_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->dst->shape); + + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + + put_tg_gmem(ctx, p->src, src_data); + bmk1880v2_tdma_g2l_tensor_copy_nc_transposed(bmk, p); + test_submit(ctx); + u8 *dst_data = get_tensor_l2g(ctx, bmk, p->dst); + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + tg2l_tensor_copy_nc_transposed_ref(p, ref_data, src_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); + free(ref_data); +} + + +static void destroy_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_tl(bmk, p->dst); + free_tg_gmem(ctx, p->src); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + + memset(&p, 0, sizeof(p)); + p.src = alloc_tg_gmem(ctx, c->src_shape, FMT_I8); + p.dst = alloc_tl(bmk, c->dst_shape, FMT_I8, dst_align); + test_param_g2l(ctx, bmk, &p); + destroy_param_g2l(ctx, bmk, &p); + + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tdma_tg2l_tensor_fill_constant.cpp b/cviruntime/test/1880v2/test_1880v2_tdma_tg2l_tensor_fill_constant.cpp new file mode 100644 index 000000000..25804a139 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tdma_tg2l_tensor_fill_constant.cpp @@ -0,0 +1,136 @@ +#include "1880v2_test_util.h" + +typedef bmk1880v2_tdma_tg2l_tensor_fill_constant_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: %u => (%u, %u, %u, %u)\n", + tag, p->constant, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + u8 constant; + tl_shape_t dst_shape; +} case_t; + +static case_t g_cases[] = { + { + 37, { 1, 1, 1, 1 } + }, { + 39, { 1, 1, 1, 2 } + }, { + 23, { 1, 1, 2, 1 } + }, { + 19, { 1, 1, 7, 2 } + }, { + 17, { 1, 1, 2, 7 } + }, { + 13, { 1, 1, 17, 13 } + }, { + 11, { 1, 1, 13, 17 } + }, { + 7, { 1, 1, 10, 60 } + }, { + 9, { 1, 1, 120, 5 } + }, { + 2, { 1, 2, 1, 1 } + }, { + 3, { 1, 1, 1, 2 } + }, { + 5, { 2, 17, 1, 4 } + }, { + 41, { 2, 1, 4, 17 } + }, { + 5, { 2, 17, 1, 4 } + }, { + 9, { 2, 1, 17, 4 } + }, { + 17, { 3, 16, 1, 1 } + }, { + 26, { 3, 1, 2, 8 } + }, { + 103, { 3, 39, 17, 23 } + }, { + 255, { 3, 17, 39, 23 } + }, { + 254, { 3, 36, 16, 20 } + }, { + 127, { 3, 18, 1, 640 } + }, { + 128, { 5, 39, 17, 23 } + }, { + 129, { 5, 17, 39, 23 } + }, { + 55, { 20, 35, 2, 2 } + }, { + 1, { 20, 7, 10, 2 } + } +}; + +static void 
tg2l_tensor_fill_constant_ref(param_t *p, u8 ref_data[]) +{ + u64 size = tl_shape_size(&p->dst->shape); + + for (u64 i = 0; i < size; i++) + ref_data[i] = p->constant; +} + +static void test_param_tg2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->dst->shape); + + bmk1880v2_tdma_tg2l_tensor_fill_constant(bmk, p); + test_submit(ctx); + u8 *dst_data = get_tensor_l2g(ctx, bmk, p->dst); + + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + tg2l_tensor_fill_constant_ref(p, ref_data); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free(dst_data); + free(ref_data); +} + +static void destroy_param_tg2l(bmk_ctx_t *bmk, param_t *p) +{ + free_tl(bmk, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + for (int dst_align = 0; dst_align < 2; dst_align++) { + param_t p; + memset(&p, 0, sizeof(p)); + p.constant = c->constant; + p.dst = alloc_tl(bmk, c->dst_shape, FMT_I8, dst_align); + + test_param_tg2l(ctx, bmk, &p); + destroy_param_tg2l(bmk, &p); + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tdma_tg2l_tensor_vlc_copy_decompressed.cpp b/cviruntime/test/1880v2/test_1880v2_tdma_tg2l_tensor_vlc_copy_decompressed.cpp new file mode 100644 index 000000000..d764aab1e --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tdma_tg2l_tensor_vlc_copy_decompressed.cpp @@ -0,0 +1,159 @@ +#include "1880v2_test_util.h" +#include "bm_vlc_compress.h" + +typedef bmk1880v2_tdma_tg2l_tensor_copy_decompressed_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ + fprintf( + f, "%s: (%u, %u, %u, %u) => fmt(%d) bias0/1/zero is (%u/%u/%u) %s\n", + tag, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w, + p->dst->fmt, + p->src->bias0, p->src->bias1, p->src->zero_guard_en, + (p->dst->fmt == FMT_I8)? "signed": "unsigned"); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tl_shape_t lmem_shape; +} case_t; + +static case_t g_cases[] = { + { + { 1, 1, 17, 13 } + }, + { + { 3, 39, 17, 23 } + }, + { + { 5, 39, 17, 23 } + }, + { + { 20, 35, 2, 2 } + }, +#ifndef ENABEL_SIMPLE_BMK1880V2_VLC_TEST + { + { 1, 1, 1, 1 } + }, + { + { 1, 1, 1, 2 } + }, + { + { 1, 1, 7, 2 } + }, + { + { 1, 1, 10, 60 } + }, + { + { 1, 2, 1, 1 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 2, 17, 1, 4 } + }, + { + { 3, 16, 1, 1 } + }, + { + { 3, 36, 16, 20 } + }, +#endif /* ifndef ENABEL_SIMPLE_BMK1880V2_VLC_TEST*/ +}; + +static void tg2l_tensor_copy_vlc_decompressed_ref( + u8 ref_data[], u64 ref_size, u8 src_data[]) +{ + bm_vlc_dec_int8(src_data, ref_size, ref_data); +} + +static void test_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p, u8 *src_data, CommandInfo* cmd_info) +{ + print_param(stderr, p); + u64 size = tl_shape_size(&p->dst->shape); + size_t bs_size = 0; + int is_signed = (p->dst->fmt == FMT_I8); + u8 data_type = (p->dst->fmt == FMT_BF16) ? 
1 : 0; + + u8 *bsbuf = vlc_compress(src_data, size, is_signed, data_type, &bs_size, cmd_info, NULL); + + put_compressed_tg_gmem(ctx, p->src, bsbuf, bs_size); + bmk1880v2_tdma_g2l_tensor_copy_decompressed(bmk, p); + test_submit(ctx); + + u8 *dst_data = get_tensor_l2g(ctx, bmk, p->dst); + u8 *ref_data = (u8 *)malloc(sizeof(u8) * size); + tg2l_tensor_copy_vlc_decompressed_ref(ref_data, size, bsbuf); + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != ref_data[i]) { + fprintf(stderr, "vlc decompress comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + free(bsbuf); + free(dst_data); + free(ref_data); +} + +static void destroy_param_g2l(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + free_compressed_tg_gmem(ctx, p->src); + free_tl(bmk, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + fmt_t fmts[] = { FMT_I8, FMT_U8 }; + + for (int dst_align = 0; dst_align < 2; dst_align++) { + for (int mode = 0; mode < VLC_CMP_MODE_MAX; mode++) { + for (u8 fmt_i = 0; fmt_i < 2; fmt_i++) { + fmt_t fmt = fmts[fmt_i]; + param_t p; + memset(&p, 0, sizeof(p)); + p.dst = alloc_tl(bmk, c->lmem_shape, fmt, dst_align); + assert(p.dst); + + u64 size = tl_shape_size(&p.dst->shape); + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + vlc_init_testdata(src_data, size, fmt == FMT_I8, fmt == FMT_BF16); + + CommandInfo cmd_info; + memset(&cmd_info, 0, sizeof(CommandInfo)); + int is_signed = (fmt == FMT_I8); + u8 data_type = (fmt == FMT_BF16) ? 1 : 0; + + cmd_info.signedness = is_signed; + + if (mode == VLC_CMP_MODE_COMPILER) { + bm_vlc_est_weight_bias(src_data, size, (bool)is_signed, (bool)data_type, &cmd_info); + } + + p.src = _alloc_vlc_compressed_tg_gmem(ctx, &c->lmem_shape, fmt, &cmd_info); + + test_param_g2l(ctx, bmk, &p, src_data, &cmd_info); + + free(src_data); + destroy_param_g2l(ctx, bmk, &p); + } + } + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tensor_add.cpp b/cviruntime/test/1880v2/test_1880v2_tensor_add.cpp new file mode 100644 index 000000000..3f9f03a4c --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tensor_add.cpp @@ -0,0 +1,148 @@ +#include "1880v2_test_util.h" + +static void tl_add_ref( + u8 *ref_high, u8 *ref_low, + u8 *a_high, u8 *a_low, + u8 *b_high, u8 *b_low, + int rshift_bits, + u64 size, int relu_enable) +{ + for (u64 i = 0; i < size; i++) { + s32 ta = ((s8)a_high[i] << 8) + a_low[i]; + s32 tb = ((s8)b_high[i] << 8) + b_low[i]; + s32 res = ta + tb; + + res += 1 << (rshift_bits - 1); + res >>= rshift_bits; + + if(relu_enable) + { + if (res > 127) + res = 127; + else if (res < -128) + res = -128; + + if(relu_enable) + if(res<0) + res=0; + ref_high[i] = 0; + ref_low[i] = res & 0xff; + + }else{ + if (res > 32767) + res = 32767; + else if (res < -32768) + res = -32768; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } + } +} + +static void test_tl_add(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + int rshift_bits; + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + u64 size = n * c * h * w; + + u8 *a_high_data = (u8 *)xmalloc(size); + u8 *a_low_data = 
(u8 *)xmalloc(size); + u8 *b_high_data = (u8 *)xmalloc(size); + u8 *b_low_data = (u8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) { + a_high_data[i] = rand() % 64+ i ; + a_low_data[i] = i; + b_high_data[i] = (i + 250) / 20; + b_low_data[i] = 100 - i; + } + if(relu_enable) + rshift_bits = 7; + else + rshift_bits = 1; + + u8 *ref_high_data = (u8 *)xmalloc(size); + u8 *ref_low_data = (u8 *)xmalloc(size); + tl_add_ref(ref_high_data, ref_low_data, + a_high_data, a_low_data, + b_high_data, b_low_data, + rshift_bits, + size, relu_enable); + + tl_t *tl_a_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_a_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a_low, a_low_data); + put_tensor_g2l(ctx, bk_ctx, tl_a_high, a_high_data); + put_tensor_g2l(ctx, bk_ctx, tl_b_low, b_low_data); + put_tensor_g2l(ctx, bk_ctx, tl_b_high, b_high_data); + bmk1880v2_tiu_element_wise_add_param_t p4; + memset(&p4, 0, sizeof(p4)); + p4.res_high = relu_enable ? 0 : tl_res_high; + p4.res_low = tl_res_low; + p4.a_high = tl_a_high; + p4.a_low = tl_a_low; + p4.b_is_const = 0; + p4.b_high = tl_b_high; + p4.b_low = tl_b_low; + p4.rshift_bits = rshift_bits; + p4.relu_enable = relu_enable; + bmk1880v2_tiu_element_wise_add(bk_ctx, &p4); + u8 *res_high_data = get_tensor_l2g(ctx, bk_ctx, tl_res_high); + u8 *res_low_data = get_tensor_l2g(ctx, bk_ctx, tl_res_low); + for (u64 i = 0; i < size; i++) { + if(!relu_enable) + if (res_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at res_high_data[%" PRIu64 "], got %d, exp %d\n", + i, res_high_data[i], ref_high_data[i]); + exit(-1); + } + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res_high); + free_tl(bk_ctx, tl_res_low); + free_tl(bk_ctx, tl_b_high); + free_tl(bk_ctx, tl_b_low); + free_tl(bk_ctx, tl_a_high); + free_tl(bk_ctx, tl_a_low); + + free(a_high_data); + free(a_low_data); + free(b_high_data); + free(b_low_data); + free(ref_high_data); + free(ref_low_data); + free(res_high_data); + free(res_low_data); + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_add(&ctx, bk_ctx, 0); + test_tl_add(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tensor_add_const.cpp b/cviruntime/test/1880v2/test_1880v2_tensor_add_const.cpp new file mode 100644 index 000000000..786de0a7c --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tensor_add_const.cpp @@ -0,0 +1,143 @@ +#include "1880v2_test_util.h" + +static void tl_add_const_ref( + u8 *ref_high, u8 *ref_low, + u8 *a_high, u8 *a_low, + s16 b, int b_is_signed, + int rshift_bits, + u64 size, int relu_enable) +{ + for (u64 i = 0; i < size; i++) { + s32 ta = ((s8)a_high[i] << 8) + a_low[i]; + s32 tb = b_is_signed? 
b: (u16)b; + s32 res = ta + tb; + res += 1 << (rshift_bits - 1); + res >>= rshift_bits; + + if(relu_enable) + { + if (res > 127) + res = 127; + else if (res < -128) + res = -128; + + if(relu_enable) + if(res<0) + res=0; + ref_high[i] = 0; + ref_low[i] = res & 0xff; + }else{ + if (res > 32767) + res = 32767; + else if (res < -32768) + res = -32768; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } + } +} + +static void test_tl_add_const(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + int rshift_bits; + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + u64 size = n * c * h * w; + + u8 *a_high_data = (u8 *)xmalloc(size); + u8 *a_low_data = (u8 *)xmalloc(size); + s16 b; + int b_is_signed = 1; + for (u64 i = 0; i < size; i++) { + a_high_data[i] = rand() % 64+ i; + a_low_data[i] = i; + } + + if(relu_enable) + { + b=-64; + rshift_bits = 7; + } + else + { + b=-278; + rshift_bits = 1; + } + u8 *ref_high_data = (u8 *)xmalloc(size); + u8 *ref_low_data = (u8 *)xmalloc(size); + tl_add_const_ref(ref_high_data, ref_low_data, + a_high_data, a_low_data, + b, b_is_signed, rshift_bits, size,relu_enable); + + tl_t *tl_a_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_a_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a_low, a_low_data); + put_tensor_g2l(ctx, bk_ctx, tl_a_high, a_high_data); + + bmk1880v2_tiu_element_wise_add_param_t p4; + memset(&p4, 0, sizeof(p4)); + p4.res_high = relu_enable ? 0 : tl_res_high; + p4.res_low = tl_res_low; + p4.a_high = tl_a_high; + p4.a_low = tl_a_low; + p4.b_is_const = 1; + p4.b_const.val = b; + p4.b_const.is_signed = b_is_signed; + p4.rshift_bits = rshift_bits; + p4.relu_enable = relu_enable; + bmk1880v2_tiu_element_wise_add(bk_ctx, &p4); + + u8 *res_high_data = get_tensor_l2g(ctx, bk_ctx, tl_res_high); + u8 *res_low_data = get_tensor_l2g(ctx, bk_ctx, tl_res_low); + for (u64 i = 0; i < size; i++) { + if(!relu_enable) + if (res_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at res_high_data[%" PRIu64 "], got %d, exp %d\n", + i, res_high_data[i], ref_high_data[i]); + exit(-1); + } + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res_high); + free_tl(bk_ctx, tl_res_low); + free_tl(bk_ctx, tl_a_high); + free_tl(bk_ctx, tl_a_low); + + free(a_high_data); + free(a_low_data); + free(ref_high_data); + free(ref_low_data); + free(res_high_data); + free(res_low_data); + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_add_const(&ctx, bk_ctx, 0); + test_tl_add_const(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tensor_and.cpp b/cviruntime/test/1880v2/test_1880v2_tensor_and.cpp new file mode 100644 index 000000000..f8e3533ef --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tensor_and.cpp @@ -0,0 +1,182 @@ +#include "1880v2_test_util.h" + +static void tl_and_int8_ref(s8 *a, s8 *b, s8 *res, u64 size) +{ + for (u64 i = 0; i < size; i++) + res[i] = a[i] & b[i]; +} + +static void tl_and_int16_ref( + u8 *ref_high, u8 
*ref_low, + u8 *a_high, u8 *a_low, + u8 *b_high, u8 *b_low, + u64 size) +{ + for (u64 i = 0; i < size; i++) { + s32 ta = ((s8)a_high[i] << 8) + a_low[i]; + s32 tb = ((s8)b_high[i] << 8) + b_low[i]; + s32 res = ta & tb; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } +} + +static void test_tl_and_int8(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + + s8 *a_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) + a_data[i] = (s8)(i % 256); + + s8 *b_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) + b_data[i] = (s8)(100 - i % 256); + + s8 *ref_data = (s8 *)xmalloc(size); + tl_and_int8_ref(a_data, b_data, ref_data, size); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a, (u8 *)a_data); + put_tensor_g2l(ctx, bk_ctx, tl_b, (u8 *)b_data); + bmk1880v2_tiu_element_wise_and_int8_param_t p9; + memset(&p9, 0, sizeof(p9)); + p9.res = tl_res; + p9.a = tl_a; + p9.b = tl_b; + bmk1880v2_tiu_element_wise_and_int8(bk_ctx, &p9); + u8 *res_data = get_tensor_l2g(ctx, bk_ctx, tl_res); + + for (u64 i = 0; i < size; i++) { + if ((s8)res_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, res_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res); + free_tl(bk_ctx, tl_b); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(b_data); + free(ref_data); + free(res_data); +} + +static void test_tl_and_int16(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + + u8 *a_high_data = (u8 *)xmalloc(size); + u8 *a_low_data = (u8 *)xmalloc(size); + u8 *b_high_data = (u8 *)xmalloc(size); + u8 *b_low_data = (u8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) { + a_high_data[i] = i / 10; + a_low_data[i] = i; + b_high_data[i] = (i + 250) / 20; + b_low_data[i] = 100 - i; + } + + u8 *ref_high_data = (u8 *)xmalloc(size); + u8 *ref_low_data = (u8 *)xmalloc(size); + tl_and_int16_ref( + ref_high_data, ref_low_data, + a_high_data, a_low_data, + b_high_data, b_low_data, + size); + + tl_t *tl_a_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_a_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a_low, a_low_data); + put_tensor_g2l(ctx, bk_ctx, tl_a_high, a_high_data); + put_tensor_g2l(ctx, bk_ctx, tl_b_low, b_low_data); + put_tensor_g2l(ctx, bk_ctx, tl_b_high, b_high_data); + bmk1880v2_tiu_element_wise_and_int16_param_t p8; + memset(&p8, 0, sizeof(p8)); + p8.res_high = tl_res_high; + p8.res_low = tl_res_low; + p8.a_high = tl_a_high; + p8.a_low = tl_a_low; + p8.b_high = tl_b_high; + p8.b_low = tl_b_low; + bmk1880v2_tiu_element_wise_and_int16(bk_ctx, &p8); + u8 *res_high_data = get_tensor_l2g(ctx, bk_ctx, tl_res_high); + u8 *res_low_data = get_tensor_l2g(ctx, bk_ctx, 
tl_res_low); + + for (u64 i = 0; i < size; i++) { + if (res_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at res_high_data[%" PRIu64 "], got %d, exp %d\n", + i, res_high_data[i], ref_high_data[i]); + exit(-1); + } + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res_high); + free_tl(bk_ctx, tl_res_low); + free_tl(bk_ctx, tl_b_high); + free_tl(bk_ctx, tl_b_low); + free_tl(bk_ctx, tl_a_high); + free_tl(bk_ctx, tl_a_low); + + free(a_high_data); + free(a_low_data); + free(b_high_data); + free(b_low_data); + free(ref_high_data); + free(ref_low_data); + free(res_high_data); + free(res_low_data); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_and_int8(&ctx, bk_ctx, 0); + test_tl_and_int8(&ctx, bk_ctx, 1); + test_tl_and_int16(&ctx, bk_ctx, 0); + test_tl_and_int16(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tensor_arith_shift.cpp b/cviruntime/test/1880v2/test_1880v2_tensor_arith_shift.cpp new file mode 100644 index 000000000..d9b327384 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tensor_arith_shift.cpp @@ -0,0 +1,118 @@ +#include "1880v2_test_util.h" + +static void tl_arith_shift_ref( + u8 *ref_high, u8 *ref_low, + u8 *a_high, u8 *a_low, + u8 *bits, u64 size) +{ + for (u64 i = 0; i < size; i++) { + s32 ta = ((s8)a_high[i] << 8) + a_low[i]; + s32 tbits = (s8)bits[i]; + + /* + * Yes, a @tbits bigger than zero means shifting LEFT, + * no matter whether the shift type is arithmetic + * RIGHT shift or logic RIGHT shift. + */ + s32 res; + if (tbits >= 0) + res = ta << tbits; + else + res = ta >> -tbits; + + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } +} + +static void test_tl_arith_shift(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + + u8 *a_high_data = (u8 *)xmalloc(size); + u8 *a_low_data = (u8 *)xmalloc(size); + u8 *bits_data = (u8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) { + a_high_data[i] = 240 + i; + a_low_data[i] = 200 + i; + bits_data[i] = (i % 33) - 16; + } + + u8 *ref_high_data = (u8 *)xmalloc(size); + u8 *ref_low_data = (u8 *)xmalloc(size); + tl_arith_shift_ref( + ref_high_data, ref_low_data, + a_high_data, a_low_data, + bits_data, size); + + tl_t *tl_a_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_a_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_bits = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a_low, a_low_data); + put_tensor_g2l(ctx, bk_ctx, tl_a_high, a_high_data); + put_tensor_g2l(ctx, bk_ctx, tl_bits, bits_data); + bmk1880v2_tiu_element_wise_arith_shift_param_t p8; + memset(&p8, 0, sizeof(p8)); + p8.res_high = tl_res_high; + p8.res_low = tl_res_low; + p8.a_high = tl_a_high; + p8.a_low = tl_a_low; + p8.bits = tl_bits; + bmk1880v2_tiu_element_wise_arith_shift(bk_ctx, &p8); + u8 *res_high_data = get_tensor_l2g(ctx, bk_ctx, tl_res_high); + u8 *res_low_data = get_tensor_l2g(ctx, bk_ctx, tl_res_low); + + for (u64 i = 0; i < size; i++) { + if (res_high_data[i] != 
ref_high_data[i]) { + fprintf(stderr, "comparing failed at res_high_data[%" PRIu64 "], got %d, exp %d\n", + i, res_high_data[i], ref_high_data[i]); + exit(-1); + } + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res_high); + free_tl(bk_ctx, tl_res_low); + free_tl(bk_ctx, tl_bits); + free_tl(bk_ctx, tl_a_high); + free_tl(bk_ctx, tl_a_low); + + free(a_high_data); + free(a_low_data); + free(bits_data); + free(ref_high_data); + free(ref_low_data); + free(res_high_data); + free(res_low_data); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_arith_shift(&ctx, bk_ctx, 0); + test_tl_arith_shift(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tensor_copy.cpp b/cviruntime/test/1880v2/test_1880v2_tensor_copy.cpp new file mode 100644 index 000000000..959cd6105 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tensor_copy.cpp @@ -0,0 +1,68 @@ +#include "1880v2_test_util.h" + +static void tl_copy_ref(s8 *a, s8 *res, u64 size) +{ + for (u64 i = 0; i < size; i++) + res[i] = a[i]; +} + +static void test_tl_copy(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + s8 *a_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) + a_data[i] = (s8)(i % 256); + + s8 *ref_data = (s8 *)xmalloc(size); + tl_copy_ref(a_data, ref_data, size); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a, (u8 *)a_data); + bmk1880v2_tiu_element_wise_copy_param_t p10; + memset(&p10, 0, sizeof(p10)); + p10.dst = tl_res; + p10.src = tl_a; + bmk1880v2_tiu_element_wise_copy(bk_ctx, &p10); + u8 *res_data = get_tensor_l2g(ctx, bk_ctx, tl_res); + + for (u64 i = 0; i < size; i++) { + if ((s8)res_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, res_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(ref_data); + free(res_data); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_copy(&ctx, bk_ctx, 0); + test_tl_copy(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tensor_copy_with_stride.cpp b/cviruntime/test/1880v2/test_1880v2_tensor_copy_with_stride.cpp new file mode 100644 index 000000000..cd0c1bc8e --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tensor_copy_with_stride.cpp @@ -0,0 +1,163 @@ +#include "1880v2_test_util.h" + +static int npu_num = 32; + +static u64 shape_size(tl_shape_t s) +{ + return s.n * s.c * s.h * s.w; +} + +static tl_shape_t shape_of_stride( + tl_shape_t tl_shape, + bmk1880v2_tensor_lmem_stride_t tl_stride) +{ + tl_shape_t shape; + shape.n = tl_shape.n; + shape.c = npu_num; + shape.h = tl_stride.n; + shape.w = 1; + + return shape; +} + +static void tl_copy_with_stride_ref( + s8 *src, + s8 *dst, + tl_shape_t shape, + bmk1880v2_tensor_lmem_stride_t src_stride, + bmk1880v2_tensor_lmem_stride_t dst_stride) +{ + int n = shape.n; + int c = shape.c; + int h = shape.h; + int w = shape.w; + + tl_shape_t 
dst_stride_shape = shape_of_stride(shape, dst_stride); + + u64 dst_size = + dst_stride_shape.n * + dst_stride_shape.c * + dst_stride_shape.h * + dst_stride_shape.w; + + for (u64 i = 0; i < dst_size; i++) + dst[i] = 0; + + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < c; ci++) { + for (int hi = 0; hi < h; hi++) { + for (int wi = 0; wi < w; wi++) { + int src_i = (ni * npu_num + ci % npu_num) * src_stride.n + + ci / npu_num * src_stride.c + + hi * src_stride.h + + wi; + int dst_i = (ni * npu_num + ci % npu_num) * dst_stride.n + + ci / npu_num * dst_stride.c + + hi * dst_stride.h + + wi; + dst[dst_i] = src[src_i]; + } + } + } + } +} + +static void test_tl_copy_with_stride( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx) +{ + int n = 3; + int c = 38; + int h = 2; + int w = 3; + int c_layers = ALIGN(c, npu_num) / npu_num; + + bmk1880v2_tensor_lmem_stride_t src_stride; + src_stride.w = 1; + src_stride.h = w + 3; + src_stride.c = h * src_stride.h + 13; + src_stride.n = c_layers * src_stride.c + 7; + + bmk1880v2_tensor_lmem_stride_t dst_stride; + dst_stride.w = 1; + dst_stride.h = w + 1; + dst_stride.c = h * dst_stride.h + 5; + dst_stride.n = c_layers * dst_stride.c + 19; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + tl_shape_t src_stride_shape = + shape_of_stride(tl_shape, src_stride); + + tl_shape_t dst_stride_shape = + shape_of_stride(tl_shape, dst_stride); + + u64 src_size = shape_size(src_stride_shape); + u64 dst_size = shape_size(dst_stride_shape); + + s8 *src_data = (s8 *)xmalloc(src_size); + for (u64 i = 0; i < src_size; i++) + src_data[i] = i; + + s8 *dst_init_data = (s8 *)xmalloc(dst_size); + for (u64 i = 0; i < dst_size; i++) + dst_init_data[i] = 0; + + tl_t *tl_src = alloc_tl( + bk_ctx, src_stride_shape, FMT_I8, /*eu_align*/0); + + tl_t *tl_dst = alloc_tl( + bk_ctx, dst_stride_shape, FMT_I8, /*eu_align*/0); + + put_tensor_g2l(ctx, bk_ctx, tl_src, (u8 *)src_data); + put_tensor_g2l(ctx, bk_ctx, tl_dst, (u8 *)dst_init_data); + + { + tl_t src = *tl_src; + tl_t dst = *tl_dst; + src.shape = dst.shape = tl_shape; + src.stride = src_stride; + dst.stride = dst_stride; + bmk1880v2_tiu_element_wise_copy_param_t p11; + memset(&p11, 0, sizeof(p11)); + p11.dst = &dst; + p11.src = &src; + bmk1880v2_tiu_element_wise_copy(bk_ctx, &p11); + } + + u8 *dst_data = get_tensor_l2g(ctx, bk_ctx, tl_dst); + + s8 *ref_data = (s8 *)xmalloc(dst_size); + tl_copy_with_stride_ref(src_data, ref_data, + tl_shape, src_stride, dst_stride); + + for (u64 i = 0; i < dst_size; i++) { + if ((s8)dst_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at dst_data[%" PRIu64 "], got %x, exp %x\n", + i, dst_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_dst); + free_tl(bk_ctx, tl_src); + + free(src_data); + free(dst_init_data); + free(dst_data); + free(ref_data); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + test_tl_copy_with_stride(&ctx, bk_ctx); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tensor_mac.cpp b/cviruntime/test/1880v2/test_1880v2_tensor_mac.cpp new file mode 100644 index 000000000..f3a1a5d78 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tensor_mac.cpp @@ -0,0 +1,150 @@ +#include "1880v2_test_util.h" + +static void tl_mac_ref( + u8 *ref_high, u8 *ref_low, + u8 *a, u8 *b, u8 *c_high, u8 *c_low, + int lshift_bits, int rshift_bits, u64 size, int relu_enable) +{ + for (u64 i = 0; i < size; i++) { + s32 ta = (s8)a[i]; + s32 tb = (s8)b[i]; + s32 tc 
= ((s8)c_high[i] << 8) + c_low[i]; + tc <<= lshift_bits; + s32 res = ta * tb + tc; + + res += 1 << (rshift_bits - 1); + res >>= rshift_bits; + if(relu_enable) + { + if (res > 127) + res = 127; + else if (res < -128) + res = -128; + + if(relu_enable) + if(res<0) + res=0; + ref_high[i] = 0; + ref_low[i] = res & 0xff; + + }else{ + if (res > 32767) + res = 32767; + else if (res < -32768) + res = -32768; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } + } +} + +static void test_tl_mac(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int lshift_bits; + int rshift_bits; + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + u64 size = n * c * h * w; + u8 *a_data = (u8 *)xmalloc(size); + u8 *b_data = (u8 *)xmalloc(size); + u8 *c_high_data = (u8 *)xmalloc(size); + u8 *c_low_data = (u8 *)xmalloc(size); + + for (u64 i = 0; i < size; i++) { + a_data[i] = rand() % 128; + b_data[i] = 100 - i; + c_high_data[i] = rand() % 64; + c_low_data[i] = 200 + 2 * i; + } + + if(relu_enable) { + lshift_bits= 1; + rshift_bits = 7; + }else { + lshift_bits = 1; + rshift_bits = 3; + } + + u8 *ref_high_data = (u8 *)xmalloc(size); + u8 *ref_low_data = (u8 *)xmalloc(size); + + tl_mac_ref(ref_high_data, ref_low_data, + a_data, b_data, c_high_data, c_low_data, + lshift_bits, rshift_bits, size, relu_enable); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_c_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_c_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a, a_data); + put_tensor_g2l(ctx, bk_ctx, tl_b, b_data); + put_tensor_g2l(ctx, bk_ctx, tl_c_low, c_low_data); + put_tensor_g2l(ctx, bk_ctx, tl_c_high, c_high_data); + bmk1880v2_tiu_element_wise_mac_param_t p2; + memset(&p2, 0, sizeof(p2)); + p2.res_high = tl_c_high; + p2.res_low = tl_c_low; + p2.res_is_int8 = relu_enable; + p2.a = tl_a; + p2.b_is_const = 0; + p2.b = tl_b; + p2.lshift_bits = lshift_bits; + p2.rshift_bits = rshift_bits; + p2.relu_enable = relu_enable; + bmk1880v2_tiu_element_wise_mac(bk_ctx, &p2); + u8 *mac_high_data = get_tensor_l2g(ctx, bk_ctx, tl_c_high); + u8 *mac_low_data = get_tensor_l2g(ctx, bk_ctx, tl_c_low); + + for (u64 i = 0; i < size; i++) { + if(!relu_enable) + if (mac_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at mac_high_data[%" PRIu64 "], got %d, exp %d\n", + i, mac_high_data[i], ref_high_data[i]); + exit(-1); + } + if (mac_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at mac_low_data[%" PRIu64 "], got %d, exp %d\n", + i, mac_low_data[i], ref_low_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_c_high); + free_tl(bk_ctx, tl_c_low); + free_tl(bk_ctx, tl_b); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(b_data); + free(c_high_data); + free(c_low_data); + free(ref_high_data); + free(ref_low_data); + free(mac_high_data); + free(mac_low_data); + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_mac(&ctx, bk_ctx, 0); + test_tl_mac(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tensor_mac_const.cpp b/cviruntime/test/1880v2/test_1880v2_tensor_mac_const.cpp new file mode 100644 index 000000000..28c535a29 --- /dev/null +++ 
b/cviruntime/test/1880v2/test_1880v2_tensor_mac_const.cpp @@ -0,0 +1,147 @@ +#include "1880v2_test_util.h" + +static void tl_mac_const_ref( + u8 *ref_high, u8 *ref_low, + u8 *a, u8 b_const, int b_is_signed, + u8 *c_high, u8 *c_low, + int lshift_bits, int rshift_bits, u64 size, int relu_enable) +{ + for (u64 i = 0; i < size; i++) { + s32 ta = (s8)a[i]; + s32 tb = b_is_signed? (s8)b_const: (u8)b_const; + s32 tc = ((s8)c_high[i] << 8) + c_low[i]; + tc <<= lshift_bits; + s32 res = ta * tb + tc; + + res += 1 << (rshift_bits - 1); + res >>= rshift_bits; + + if(relu_enable) + { + if (res > 127) + res = 127; + else if (res < -128) + res = -128; + + if(relu_enable) + if(res<0) + res=0; + ref_high[i] = 0; + ref_low[i] = res & 0xff; + + }else{ + if (res > 32767) + res = 32767; + else if (res < -32768) + res = -32768; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } + } +} + +static void test_tl_mac_const(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int lshift_bits; + int rshift_bits; + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + for(int relu_enable = 0; relu_enable < 2; relu_enable++) { + u64 size = n * c * h * w; + + u8 *a_data = (u8 *)xmalloc(size); + u8 *c_high_data = (u8 *)xmalloc(size); + u8 *c_low_data = (u8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) { + a_data[i] = rand() % 256; + c_high_data[i] = rand() % 64; + c_low_data[i] = 200 + 2 * i; + } + + u8 b_const = 37; + int b_is_signed = 1; + if(relu_enable) { + lshift_bits = 1; + rshift_bits = 8; + }else { + lshift_bits = 1; + rshift_bits = 3; + } + + u8 *ref_high_data = (u8 *)xmalloc(size); + u8 *ref_low_data = (u8 *)xmalloc(size); + tl_mac_const_ref(ref_high_data, ref_low_data, + a_data, b_const, b_is_signed, c_high_data, c_low_data, + lshift_bits, rshift_bits, size, relu_enable); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_c_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_c_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a, a_data); + put_tensor_g2l(ctx, bk_ctx, tl_c_low, c_low_data); + put_tensor_g2l(ctx, bk_ctx, tl_c_high, c_high_data); + bmk1880v2_tiu_element_wise_mac_param_t p3; + memset(&p3, 0, sizeof(p3)); + p3.res_high = tl_c_high; + p3.res_low = tl_c_low; + p3.res_is_int8 = relu_enable; + p3.a = tl_a; + p3.b_is_const = 1; + p3.b_const.val = b_const; + p3.b_const.is_signed = b_is_signed; + p3.lshift_bits = lshift_bits; + p3.rshift_bits = rshift_bits; + p3.relu_enable = relu_enable; + bmk1880v2_tiu_element_wise_mac(bk_ctx, &p3); + u8 *mac_high_data = get_tensor_l2g(ctx, bk_ctx, tl_c_high); + u8 *mac_low_data = get_tensor_l2g(ctx, bk_ctx, tl_c_low); + for (u64 i = 0; i < size; i++) { + if(!relu_enable) + if (mac_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at mac_high_data[%" PRIu64 "], got %d, exp %d\n", + i, mac_high_data[i], ref_high_data[i]); + exit(-1); + } + if (mac_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at mac_low_data[%" PRIu64 "], got %d, exp %d\n", + i, mac_low_data[i], ref_low_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_c_high); + free_tl(bk_ctx, tl_c_low); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(c_high_data); + free(c_low_data); + free(ref_high_data); + free(ref_low_data); + free(mac_high_data); + free(mac_low_data); + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + 
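// run once with compact (eu_align = 0) and once with EU-aligned + // (eu_align = 1) local-memory layout. + 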
test_tl_mac_const(&ctx, bk_ctx, 0); + test_tl_mac_const(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tensor_max.cpp b/cviruntime/test/1880v2/test_1880v2_tensor_max.cpp new file mode 100644 index 000000000..0976155c7 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tensor_max.cpp @@ -0,0 +1,83 @@ +#include "1880v2_test_util.h" + +static void tl_max_ref(s8 *a, s8 *b, s8 *max, u64 size) +{ + for (u64 i = 0; i < size; i++) { + if (a[i] > b[i]) + max[i] = a[i]; + else + max[i] = b[i]; + } +} + +static void test_tl_max(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + s8 *a_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) + a_data[i] = (s8)(i % 256); + + s8 *b_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) + b_data[i] = (s8)(100 - i % 256); + + s8 *ref_data = (s8 *)xmalloc(size); + tl_max_ref(a_data, b_data, ref_data, size); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_max = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a, (u8 *)a_data); + put_tensor_g2l(ctx, bk_ctx, tl_b, (u8 *)b_data); + + bmk1880v2_tiu_element_wise_max_param_t p; + memset(&p, 0, sizeof(p)); + p.max = tl_max; + p.a = tl_a; + p.b_is_const = 0; + p.b = tl_b; + bmk1880v2_tiu_element_wise_max(bk_ctx, &p); + u8 *max_data = get_tensor_l2g(ctx, bk_ctx, tl_max); + + for (u64 i = 0; i < size; i++) { + if ((s8)max_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, max_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_max); + free_tl(bk_ctx, tl_b); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(b_data); + free(ref_data); + free(max_data); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_max(&ctx, bk_ctx, 0); + test_tl_max(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tensor_max_const.cpp b/cviruntime/test/1880v2/test_1880v2_tensor_max_const.cpp new file mode 100644 index 000000000..0e3af8511 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tensor_max_const.cpp @@ -0,0 +1,76 @@ +#include "1880v2_test_util.h" + +static void tl_max_const_ref(s8 *a, s8 b, s8 *max, u64 size) +{ + for (u64 i = 0; i < size; i++) { + if (a[i] > b) + max[i] = a[i]; + else + max[i] = b; + } +} + +static void test_tl_max_const(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + s8 *a_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) + a_data[i] = (s8)(i % 256); + + s8 b = 47; + s8 *ref_data = (s8 *)xmalloc(size); + tl_max_const_ref(a_data, b, ref_data, size); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_max = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a, (u8 *)a_data); + bmk1880v2_tiu_element_wise_max_param_t p; + memset(&p, 0, sizeof(p)); + p.max = tl_max; + p.a = tl_a; + p.b_is_const = 1; + p.b_const.val = b; + p.b_const.is_signed = 1; + bmk1880v2_tiu_element_wise_max(bk_ctx, &p); + u8 *max_data = 
get_tensor_l2g(ctx, bk_ctx, tl_max); + + for (u64 i = 0; i < size; i++) { + if ((s8)max_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, max_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_max); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(ref_data); + free(max_data); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_max_const(&ctx, bk_ctx, 0); + test_tl_max_const(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tensor_mdsum.cpp b/cviruntime/test/1880v2/test_1880v2_tensor_mdsum.cpp new file mode 100644 index 000000000..2440ec8c5 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tensor_mdsum.cpp @@ -0,0 +1,227 @@ +#include "1880v2_test_util.h" + +typedef struct { + bmk1880v2_tiu_mdsum_param_t p; + s8 *res; + s8 *input; +} mdsum_case_t; + +static void destroy_mdsum_param( + bmk_ctx_t *bk_ctx, + bmk1880v2_tiu_mdsum_param_t *p) +{ + free_tl(bk_ctx, p->res); + free_tl(bk_ctx, p->input); +} + +static void destroy_mdsum_case( + bmk_ctx_t *bk_ctx, + mdsum_case_t *mc) +{ + destroy_mdsum_param(bk_ctx, &mc->p); + free(mc->res); + free(mc->input); +} + +static void mdsum_case_ref(mdsum_case_t *mc) +{ + bmk1880v2_tiu_mdsum_param_t *p = &mc->p; + int n = p->input->shape.n; + int c = p->input->shape.c; + int h = p->input->shape.h; + int w = p->input->shape.w; + int res_sign = (p->res->fmt == FMT_I8); + + s32 *tmp_res = (s32 *)xmalloc(c * sizeof(s32)); + for (int i = 0; i < c; i++) + tmp_res[i] = 0; + + for (int ni = 0; ni < n; ni++) { + for (int ci = 0; ci < c; ci++) { + for (int hi = 0; hi < h; hi++) { + for (int wi = 0; wi < w; wi++) { + u64 h_size = w; + u64 c_size = h * h_size; + u64 n_size = c * c_size; + u64 i = ni * n_size + ci * c_size + hi * h_size + wi; + s8 input = mc->input[i]; + tmp_res[ci] += res_sign? 
input: (u8)input; + } + } + } + } + + int arith_shift = (p->res->fmt == FMT_I8); + if (arith_shift) + arith_right_shift(tmp_res, c, p->rshift_bits, 1); + else + logic_right_shift(tmp_res, c, p->rshift_bits, 1); + + if (p->res_is_int8) + saturate_to_int8(tmp_res, c, res_sign); + else + saturate_to_int16(tmp_res, c, res_sign); + + for (int i = 0; i < c; i++) + mc->res[i] = tmp_res[i]; + + if (!p->res_is_int8) + for (int i = 0; i < c; i++) + mc->res[c + i] = tmp_res[i] >> 8; + + free(tmp_res); +} + +static void execute_mdsum_case( + CVI_RT_HANDLE *ctx, + bmk_ctx_t *bk_ctx, + mdsum_case_t *mc) +{ + bmk1880v2_tiu_mdsum_param_t *p = &mc->p; + + put_tensor_g2l(ctx, bk_ctx, p->input, (u8 *)mc->input); + bmk1880v2_tiu_mdsum(bk_ctx, p); + u8 *res = get_tensor_l2g(ctx, bk_ctx, p->res); + + mdsum_case_ref(mc); + + int size = p->input->shape.c; + if (!p->res_is_int8) + size *= 2; + + for (int i = 0; i < size; i++) { + if ((s8)res[i] != mc->res[i]) { + fprintf(stderr, "comparing failed at res[%d], got %d, exp %d\n", + i, (s8)res[i], mc->res[i]); + exit(-1); + } + } + + free(res); +} + +static void init_mdsum_case_0(bmk_ctx_t *bk_ctx, mdsum_case_t *mc) +{ + int n = 4; + int c = 16; + int h = 1; + int w = 17; + + tl_shape_t a_shape; + a_shape.n = n; + a_shape.c = c; + a_shape.h = h; + a_shape.w = w; + + tl_shape_t res_shape; + res_shape.n = 1; + res_shape.c = c; + res_shape.h = 1; + res_shape.w = 1; + + mc->p.res_is_int8 = 1; + mc->p.input = alloc_tl(bk_ctx, a_shape, FMT_I8, 1); + mc->p.res = alloc_tl(bk_ctx, res_shape, FMT_I8, 0); + mc->p.rshift_bits = 3; + + u64 input_size = n * c * h * w; + mc->input = (s8 *)xmalloc(input_size); + for (u64 i = 0; i < input_size; i++) + mc->input[i] = (i % 13) - (i % 17) + (i % 5) - (i % 3); + + u64 res_size = c * 2; + mc->res = (s8 *)xmalloc(res_size); +} + +static void init_mdsum_case_1(bmk_ctx_t *bk_ctx, mdsum_case_t *mc) +{ + int n = 4; + int c = 16; + int h = 1; + int w = 17; + + tl_shape_t a_shape; + a_shape.n = n; + a_shape.c = c; + a_shape.h = h; + a_shape.w = w; + + tl_shape_t res_shape; + res_shape.n = 2; + res_shape.c = c; + res_shape.h = 1; + res_shape.w = 1; + + mc->p.res_is_int8 = 0; + mc->p.input = alloc_tl(bk_ctx, a_shape, FMT_I8, 1); + mc->p.res = alloc_tl(bk_ctx, res_shape, FMT_I8, 0); + mc->p.rshift_bits = 3; + + u64 input_size = n * c * h * w; + mc->input = (s8 *)xmalloc(input_size); + for (u64 i = 0; i < input_size; i++) + mc->input[i] = (i % 13) - (i % 17) + i - 30; + + u64 res_size = c * 2; + mc->res = (s8 *)xmalloc(res_size); +} + +static void init_mdsum_case_2(bmk_ctx_t *bk_ctx, mdsum_case_t *mc) +{ + int n = 4; + int c = 16; + int h = 1; + int w = 17; + + tl_shape_t a_shape; + a_shape.n = n; + a_shape.c = c; + a_shape.h = h; + a_shape.w = w; + + tl_shape_t res_shape; + res_shape.n = 2; + res_shape.c = c; + res_shape.h = 1; + res_shape.w = 1; + + mc->p.res_is_int8 = 0; + mc->p.input = alloc_tl(bk_ctx, a_shape, FMT_U8, 1); + mc->p.res = alloc_tl(bk_ctx, res_shape, FMT_U8, 0); + mc->p.rshift_bits = 3; + + u64 input_size = n * c * h * w; + mc->input = (s8 *)xmalloc(input_size); + for (u64 i = 0; i < input_size; i++) + mc->input[i] = (i % 13) - (i % 17) + i - 30; + + u64 res_size = c * 2; + mc->res = (s8 *)xmalloc(res_size); +} + +static void test_tl_mdsum(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx) +{ + mdsum_case_t mc; + + init_mdsum_case_0(bk_ctx, &mc); + execute_mdsum_case(ctx, bk_ctx, &mc); + destroy_mdsum_case(bk_ctx, &mc); + + init_mdsum_case_1(bk_ctx, &mc); + execute_mdsum_case(ctx, bk_ctx, &mc); + destroy_mdsum_case(bk_ctx, &mc); + + 
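+  /* Case 2 repeats case 1 with FMT_U8 tensors: res_sign is then 0, so the
+   * reference takes the logic (unsigned) right-shift and unsigned-saturation path. */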
init_mdsum_case_2(bk_ctx, &mc); + execute_mdsum_case(ctx, bk_ctx, &mc); + destroy_mdsum_case(bk_ctx, &mc); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + test_tl_mdsum(&ctx, bk_ctx); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tensor_min.cpp b/cviruntime/test/1880v2/test_1880v2_tensor_min.cpp new file mode 100644 index 000000000..81102432a --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tensor_min.cpp @@ -0,0 +1,82 @@ +#include "1880v2_test_util.h" + +static void tl_min_ref(s8 *a, s8 *b, s8 *max, u64 size) +{ + for (u64 i = 0; i < size; i++) { + if (a[i] > b[i]) + max[i] = b[i]; + else + max[i] = a[i]; + } +} + +static void test_tl_min(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + s8 *a_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) + a_data[i] = (s8)(i % 256); + + s8 *b_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) + b_data[i] = (s8)(100 - i % 256); + + s8 *ref_data = (s8 *)xmalloc(size); + tl_min_ref(a_data, b_data, ref_data, size); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_min = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a, (u8 *)a_data); + put_tensor_g2l(ctx, bk_ctx, tl_b, (u8 *)b_data); + bmk1880v2_tiu_element_wise_min_param_t p6; + memset(&p6, 0, sizeof(p6)); + p6.min = tl_min; + p6.a = tl_a; + p6.b_is_const = 0; + p6.b = tl_b; + bmk1880v2_tiu_element_wise_min(bk_ctx, &p6); + u8 *min_data = get_tensor_l2g(ctx, bk_ctx, tl_min); + + for (u64 i = 0; i < size; i++) { + if ((s8)min_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, min_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_min); + free_tl(bk_ctx, tl_b); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(b_data); + free(ref_data); + free(min_data); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_min(&ctx, bk_ctx, 0); + test_tl_min(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tensor_min_const.cpp b/cviruntime/test/1880v2/test_1880v2_tensor_min_const.cpp new file mode 100644 index 000000000..4c294d5bb --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tensor_min_const.cpp @@ -0,0 +1,77 @@ +#include "1880v2_test_util.h" + +static void tl_min_const_ref(s8 *a, s8 b, s8 *max, u64 size) +{ + for (u64 i = 0; i < size; i++) { + if (a[i] > b) + max[i] = b; + else + max[i] = a[i]; + } +} + +static void test_tl_min_const(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + s8 *a_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) + a_data[i] = (s8)(i % 256); + + s8 b = 47; + + s8 *ref_data = (s8 *)xmalloc(size); + tl_min_const_ref(a_data, b, ref_data, size); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_min = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a, (u8 *)a_data); + bmk1880v2_tiu_element_wise_min_param_t p7; + memset(&p7, 0, sizeof(p7)); + p7.min = 
tl_min; + p7.a = tl_a; + p7.b_is_const = 1; + p7.b_const.val = b; + p7.b_const.is_signed = 1; + bmk1880v2_tiu_element_wise_min(bk_ctx, &p7); + u8 *min_data = get_tensor_l2g(ctx, bk_ctx, tl_min); + + for (u64 i = 0; i < size; i++) { + if ((s8)min_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, min_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_min); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(ref_data); + free(min_data); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_min_const(&ctx, bk_ctx, 0); + test_tl_min_const(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tensor_mul.cpp b/cviruntime/test/1880v2/test_1880v2_tensor_mul.cpp new file mode 100644 index 000000000..25ce3e3fc --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tensor_mul.cpp @@ -0,0 +1,99 @@ +#include "1880v2_test_util.h" + +static void tl_mul_ref(s8 *ofmap, s8 *a, s8 *b, u64 size, int shift_bits, int relu_enable) +{ + for (u64 i = 0; i < size; i++) { + s32 tmp = a[i] * b[i]; + tmp += 1 << (shift_bits - 1); + tmp >>= shift_bits; + if (tmp > 127) + tmp = 127; + else if (tmp < -128) + tmp = -128; + if(relu_enable) + if(tmp<0) + tmp=0; + ofmap[i] = tmp; + + } +} + +static void test_tl_mul(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + int shift_bits = 1; + + for (u32 relu_enable = 0; relu_enable < 2; relu_enable++) + { + s8 *a_data = (s8 *)xmalloc(size); + s8 *b_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) { + a_data[i] = random()%0x10; + b_data[i] = 128 - i; + } + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a, (u8 *)a_data); + put_tensor_g2l(ctx, bk_ctx, tl_b, (u8 *)b_data); + + bmk1880v2_tiu_element_wise_mul_param_t p1; + memset(&p1, 0, sizeof(p1)); + p1.res_high = NULL; + p1.res_low = tl_res_low; + p1.a = tl_a; + p1.b_is_const = 0; + p1.b = tl_b; + p1.rshift_bits = shift_bits; + p1.relu_enable = relu_enable; + bmk1880v2_tiu_element_wise_mul(bk_ctx, &p1); + + u8 *res_low_data = get_tensor_l2g(ctx, bk_ctx, tl_res_low); + + s8 *ref_data = (s8 *)xmalloc(size); + tl_mul_ref(ref_data, a_data, b_data, size, shift_bits, relu_enable); + + for (u64 i = 0; i < size; i++) { + if ((s8)res_low_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %x, exp %x\n", + i, res_low_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res_low); + free_tl(bk_ctx, tl_b); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(b_data); + free(ref_data); + free(res_low_data); + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_mul(&ctx, bk_ctx, 0); + test_tl_mul(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tensor_mul_const.cpp b/cviruntime/test/1880v2/test_1880v2_tensor_mul_const.cpp new file mode 100644 index 000000000..188717573 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tensor_mul_const.cpp @@ -0,0 +1,97 @@ +#include "1880v2_test_util.h" + +static void tl_mul_const_ref( + s8 *ofmap, s8 *ifmap, u64 
size, s8 mul_const, int shift_bits, int relu_enable) +{ + for (u64 i = 0; i < size; i++) { + s32 tmp = ifmap[i] * mul_const; + tmp += 1 << (shift_bits - 1); + tmp >>= shift_bits; + if (tmp > 127) + tmp = 127; + else if (tmp < -128) + tmp = -128; + if(relu_enable) + if(tmp<0) + tmp=0; + + ofmap[i] = tmp; + } +} + +static void test_tl_mul_const(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + + for (u32 relu_enable = 0; relu_enable < 2; relu_enable++) + { + s8 *ifmap_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) + ifmap_data[i] = (u8)(random() % 256); + + s8 mul_const = 20; + int shift_bits = 1; + + s8 *ref_data = (s8 *)xmalloc(size); + tl_mul_const_ref(ref_data, ifmap_data, size, mul_const, shift_bits, relu_enable); + + tl_t *tl_ifmap = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_ofmap = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_ifmap, (u8 *)ifmap_data); + + bmk1880v2_tiu_element_wise_mul_param_t p; + memset(&p, 0, sizeof(p)); + p.res_high = NULL; + p.res_low = tl_ofmap; + p.a = tl_ifmap; + p.b_is_const = 1; + p.b_const.val = mul_const; + p.b_const.is_signed = 1; + p.rshift_bits = shift_bits; + p.relu_enable = relu_enable; + + bmk1880v2_tiu_element_wise_mul(bk_ctx, &p); + + u8 *ofmap_data = get_tensor_l2g(ctx, bk_ctx, tl_ofmap); + + for (u64 i = 0; i < size; i++) { + if ((s8)ofmap_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, ofmap_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_ofmap); + free_tl(bk_ctx, tl_ifmap); + + free(ifmap_data); + free(ref_data); + free(ofmap_data); + } +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_mul_const(&ctx, bk_ctx, 0); + test_tl_mul_const(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tensor_mul_qdm.cpp b/cviruntime/test/1880v2/test_1880v2_tensor_mul_qdm.cpp new file mode 100644 index 000000000..4bc383b86 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tensor_mul_qdm.cpp @@ -0,0 +1,615 @@ +#include +#include "1880v2_test_util.h" +#include "test_tf_quant_util.h" + +// #define ENABLE_DEBUG_MSG +// #define ENABLE_TV_GEN_PATTERN + +#define MIN_EXEC_TESTS 20 + +typedef struct { + int input_n; + int input_c; + int input_h; + int input_w; + int relu_enable; + s8 *input1_data; + s8 *input2_data; + s8 *output_data; + u32 multiplier; + s8 right_shift; + float float_multiplier; + int retry_cnt; +} elt_mul_test_param_t; + +void elt_mul_ref(elt_mul_test_param_t *p_param) +{ + int input_n = p_param->input_n; + int input_c = p_param->input_c; + int input_h = p_param->input_h; + int input_w = p_param->input_w; + s32 output_multiplier = p_param->multiplier; + s8 output_rshift = p_param->right_shift; + s8 *input1_data = p_param->input1_data; + s8 *input2_data = p_param->input2_data; + s8 *output_data = p_param->output_data; + + s32 quantized_activation_min = -128; + s32 quantized_activation_max = 127; + + int size = input_n * input_c * input_h * input_w; +#ifdef ENABLE_DEBUG_MSG + printf("elt_mul_ref:\n"); + printf(" shape (%d, %d, %d, %d)\n", input_n, input_c, input_h, input_w); +#endif + for (int i = 0; i < size; ++i) { + const s32 input1_val = input1_data[i]; + const s32 input2_val = input2_data[i]; + const s32 
unclamped_result = MultiplyByQuantizedMultiplier(
+        input1_val * input2_val, output_multiplier, output_rshift);
+    const s32 clamped_output =
+        MIN(quantized_activation_max,
+            MAX(quantized_activation_min, unclamped_result));
+
+#ifdef ENABLE_DEBUG_MSG
+    printf(" [%d] unclamped_result %d, clamped_output %d\n", i,
+           unclamped_result, clamped_output);
+#endif
+
+    output_data[i] = static_cast<s8>(clamped_output);
+  }
+}
+
+void calc_elt_mul_float_multiplier(elt_mul_test_param_t *p_param)
+{
+  int input_n = p_param->input_n;
+  int input_c = p_param->input_c;
+  int input_h = p_param->input_h;
+  int input_w = p_param->input_w;
+  s8 *input1_data = p_param->input1_data;
+  s8 *input2_data = p_param->input2_data;
+
+  int output_min = INT_MAX;
+  int output_max = INT_MIN;
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("calc_elt_mul_float_multiplier =>\n");
+#endif
+
+  int size = input_n * input_c * input_h * input_w;
+  for (int i = 0; i < size; ++i) {
+    const s32 input1_val = input1_data[i];
+    const s32 input2_val = input2_data[i];
+
+    const s32 val = input1_val * input2_val;
+
+    output_max = MAX(val, output_max);
+    output_min = MIN(val, output_min);
+  }
+
+  // Since int8 ranges from -128 to 127, we need to squeeze the accumulator
+  // so that its MIN/MAX fit in that range as closely as possible.
+  if (abs(output_max) > abs(output_min)) {
+    p_param->float_multiplier = 127.0f / abs(output_max);
+  } else {
+    p_param->float_multiplier = 128.0f / abs(output_min);
+  }
+
+#ifdef ENABLE_DEBUG_MSG
+  printf(" output_accu_min %d, output_accu_max %d, output_multiplier %f\n",
+         output_min, output_max, p_param->float_multiplier);
+#endif
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("<= calc_elt_mul_float_multiplier\n");
+#endif
+}
+
+int simple_test(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx)
+{
+  int ret = 0;
+
+  // TFL: QuantizedMulOpTest.NoActivationInt8
+  int size = 4;
+  s8 input1_data[4] = {-102, 25, 115, 89};
+  s8 input2_data[4] = {77, 51, 115, 102};
+  s8 ref_output_data[4] = {-62, 10, 104, 71};
+  s8 output_data[4];
+  u32 output_multiplier = 1077952640;
+  s8 output_rshift = 6; // converted to a right shift
+
+  elt_mul_test_param_t test_param;
+  memset(&test_param, 0, sizeof(test_param));
+
+  test_param.input_n = 1;
+  test_param.input_c = 1;
+  test_param.input_h = 1;
+  test_param.input_w = 4;
+  test_param.input1_data = input1_data;
+  test_param.input2_data = input2_data;
+  test_param.output_data = output_data;
+  test_param.multiplier = output_multiplier;
+  test_param.right_shift = output_rshift;
+  elt_mul_ref(&test_param);
+
+  for (int i = 0; i < size; ++i) {
+    if (output_data[i] != ref_output_data[i]) {
+      printf(" Error ! output_data[%d] = %d != %d\n", i, output_data[i],
+             ref_output_data[i]);
+      ret = -1;
+    }
+  }
+
+  tl_shape_t tl_shape = {1, 1, 1, static_cast<u32>(size)};
+  tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, FMT_I8, /*align=*/1);
+  tl_t *tl_b = alloc_tl(bk_ctx, tl_shape, FMT_I8, /*align=*/1);
+  tl_t *tl_res = alloc_tl(bk_ctx, tl_shape, FMT_I8, /*align=*/1);
+
+  put_tensor_g2l(ctx, bk_ctx, tl_a, reinterpret_cast<u8 *>(input1_data));
+  put_tensor_g2l(ctx, bk_ctx, tl_b, reinterpret_cast<u8 *>(input2_data));
+
+  {
+    bmk1880v2_tiu_element_wise_mul_qdm_param_t p1;
+    memset(&p1, 0, sizeof(p1));
+    p1.res_high = nullptr;
+    p1.res_low = tl_res;
+    p1.a = tl_a;
+    p1.b_is_const = 0;
+    p1.b = tl_b;
+    p1.rshift_bits = output_rshift;
+    p1.relu_enable = 0;
+    p1.multiplier = output_multiplier;
+    bmk1880v2_tiu_element_wise_mul_qdm(bk_ctx, &p1);
+  }
+
+  s8 *res_tiu_data =
+      reinterpret_cast<s8 *>(get_tensor_l2g(ctx, bk_ctx, tl_res));
+  for (int i = 0; i < size; ++i) {
+    if (res_tiu_data[i] != ref_output_data[i]) {
+      printf(" Error ! result[%d] %d != %d\n", i, res_tiu_data[i],
+             ref_output_data[i]);
+      ret = -1;
+    }
+  }
+
+  free(res_tiu_data);
+
+  // Reverse order
+  free_tl(bk_ctx, tl_res);
+  free_tl(bk_ctx, tl_b);
+  free_tl(bk_ctx, tl_a);
+
+  return ret;
+}
+
+int choose_from_range(int table[], int size, int index)
+{
+  if (index >= size) {
+    return 0;
+  }
+
+  int val = table[index];
+  if (index < (size - 1)) {
+    int range = MAX(table[index + 1] - table[index] - 1, 1);
+    val += rand() % range;
+  }
+
+  return val;
+}
+
+bool check_valid_test_param(bmk_ctx_t *bk_ctx, elt_mul_test_param_t *p_param)
+{
+  u32 input_n = p_param->input_n;
+  u32 input_c = p_param->input_c;
+  u32 input_h = p_param->input_h;
+  u32 input_w = p_param->input_w;
+
+  // input1, input2, output
+  u32 total_needed_size = 3 * input_n * input_c * input_h * input_w;
+
+  bmk1880v2_chip_info_t chip_info = bmk1880v2_chip_info();
+  u32 lmem_size_per_lane = chip_info.lmem_size;
+  u32 total_lmem_size = chip_info.lmem_size * chip_info.npu_num;
+
+  if (total_needed_size > total_lmem_size) {
+    return false;
+  }
+
+  tl_shape_t input_shape = {input_n, input_c, input_h, input_w};
+
+  u32 needed_size =
+      3 * bmk1880v2_lmem_tensor_to_size(bk_ctx, input_shape, FMT_I8, /*eu_align=*/1);
+
+  // Skip invalid shape
+  if (needed_size > lmem_size_per_lane) {
+    return false;
+  }
+
+  return true;
+}
+
+void fill_random_data_s8(s8 *input_data, int size)
+{
+  for (int i = 0; i < size; ++i) {
+    int is_satured = ((rand() % 1000) == 1) ? 1 : 0;
+    int is_sign = rand() % 2 ?
1 : -1; + + if (is_satured && is_sign) { + input_data[i] = -128; + } else if (is_satured) { + input_data[i] = 127; + } else { + input_data[i] = is_sign * rand() % 128; + } + } +} + +void dump_test_param(elt_mul_test_param_t *p_param, bool dump_content) +{ + printf("Dump test parameter:\n"); + printf(" input_n %d\n", p_param->input_n); + printf(" input_c %d\n", p_param->input_c); + printf(" input_h %d\n", p_param->input_h); + printf(" input_w %d\n", p_param->input_w); + printf(" multiplier %d\n", p_param->multiplier); + printf(" right_shift %d\n", p_param->right_shift); + + if (dump_content) { + printf("input1_data(%d, %d, %d, %d) :\n", p_param->input_n, + p_param->input_c, p_param->input_h, p_param->input_w); + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + for (int i = 0; i < in; ++i) { + for (int j = 0; j < ic; ++j) { + for (int k = 0; k < ih; ++k) { + for (int l = 0; l < iw; ++l) { + int offset = i * (ic * ih * iw) + j * (ih * iw) + k * iw + l; + printf("%d, ", p_param->input1_data[offset]); + } + printf("\n"); + } + } + } + printf("\n\n"); + + printf("input2_data(%d, %d, %d, %d) :\n", p_param->input_n, + p_param->input_c, p_param->input_h, p_param->input_w); + for (int i = 0; i < in; ++i) { + for (int j = 0; j < ic; ++j) { + for (int k = 0; k < ih; ++k) { + for (int l = 0; l < iw; ++l) { + int offset = i * (ic * ih * iw) + j * (ih * iw) + k * iw + l; + printf("%d, ", p_param->input2_data[offset]); + } + printf("\n"); + } + } + } + printf("\n\n"); + } +} + +int run_compare_elt_mul(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, + elt_mul_test_param_t *p_param) +{ + int ret = 0; + + int input_n = p_param->input_n; + int input_c = p_param->input_c; + int input_h = p_param->input_h; + int input_w = p_param->input_w; + + int input_size = input_n * input_c * input_h * input_w; + s8 *input1_data = (s8 *)malloc(input_size); + s8 *input2_data = (s8 *)malloc(input_size); + s8 *output_data = (s8 *)malloc(input_size); + + p_param->input1_data = input1_data; + p_param->input2_data = input2_data; + p_param->output_data = output_data; + +#ifdef ENABLE_DEBUG_MSG + printf(" run_compare_elt_mul => \n"); +#endif + + int retry_cnt = p_param->retry_cnt; + do { + fill_random_data_s8(input1_data, input_size); + fill_random_data_s8(input2_data, input_size); + + p_param->float_multiplier = 100.0; // should be < 1.0 + calc_elt_mul_float_multiplier(p_param); + + if (p_param->float_multiplier > 0.f && p_param->float_multiplier < 1.0) { + break; + } + + } while (--retry_cnt); + + if (p_param->float_multiplier >= 1.0) { + printf(" run_compare_elt_mul: unable to find valid multiplier\n"); + free(input1_data); + free(input2_data); + free(output_data); + return -1; + } + + u32 base_multiplier = 0; + int base_shift = 0; + QuantizeMultiplierSmallerThanOne(p_param->float_multiplier, &base_multiplier, + &base_shift); + + // multipliers typically range in [2^30 ; 2^31 - 1]. + // Values in [0, 2^30 - 1] are normally unused, but harmless. + // Thus a good way to randomize multipliers is to subtract from them + // a random value smaller than 2^30 but still significant compared to it. + u32 output_multiplier = base_multiplier - (rand() % (1 << 26)); + + // Our H/W only supports right shift + int right_shift = base_shift - 1 + (rand() % 4); + u8 output_right_shift = right_shift > 0 ? 
right_shift : 0;
+
+#ifdef ENABLE_DEBUG_MSG
+  printf(" multiplier_data %d, shift_data %d\n", output_multiplier,
+         output_right_shift);
+#endif
+
+  p_param->multiplier = output_multiplier;
+  p_param->right_shift = output_right_shift;
+
+  elt_mul_ref(p_param);
+
+  tl_shape_t input_shape = {
+      static_cast<u32>(input_n), static_cast<u32>(input_c),
+      static_cast<u32>(input_h), static_cast<u32>(input_w)};
+
+  bmk1880v2_tensor_lmem_t *tl_input1 =
+      bmk1880v2_lmem_alloc_tensor(bk_ctx, input_shape, FMT_I8, /*eu_align=*/1);
+
+  bmk1880v2_tensor_lmem_t *tl_input2 =
+      bmk1880v2_lmem_alloc_tensor(bk_ctx, input_shape, FMT_I8, /*eu_align=*/1);
+
+  bmk1880v2_tensor_lmem_t *tl_output =
+      bmk1880v2_lmem_alloc_tensor(bk_ctx, input_shape, FMT_I8, /*eu_align=*/1);
+
+  if (tl_input1 == nullptr) {
+    printf(" fail to alloc tl_input1 (%d, %d, %d, %d)\n", input_n, input_c,
+           input_h, input_w);
+    return -1;
+  }
+  if (tl_input2 == nullptr) {
+    printf(" fail to alloc tl_input2 (%d, %d, %d, %d)\n", input_n, input_c,
+           input_h, input_w);
+    return -1;
+  }
+  if (tl_output == nullptr) {
+    printf(" fail to alloc tl_output (%d, %d, %d, %d)\n", input_n, input_c,
+           input_h, input_w);
+    return -1;
+  }
+
+  put_tensor_g2l(ctx, bk_ctx, tl_input1, reinterpret_cast<u8 *>(input1_data));
+  put_tensor_g2l(ctx, bk_ctx, tl_input2, reinterpret_cast<u8 *>(input2_data));
+
+  {
+    bmk1880v2_tiu_element_wise_mul_qdm_param_t p1;
+    memset(&p1, 0, sizeof(p1));
+    p1.res_high = nullptr;
+    p1.res_low = tl_output;
+    p1.a = tl_input1;
+    p1.b_is_const = 0;
+    p1.b = tl_input2;
+    p1.rshift_bits = output_right_shift;
+    p1.relu_enable = 0;
+    p1.multiplier = output_multiplier;
+    bmk1880v2_tiu_element_wise_mul_qdm(bk_ctx, &p1);
+  }
+
+  test_submit(ctx);
+
+#ifdef ENABLE_DEBUG_MSG
+  printf(" compare result:\n");
+#endif
+  s8 *tiu_output_data =
+      reinterpret_cast<s8 *>(get_tensor_l2g(ctx, bk_ctx, tl_output));
+  for (int i = 0; i < input_n; ++i) {
+    for (int j = 0; j < input_c; ++j) {
+      for (int k = 0; k < input_h; ++k) {
+        for (int l = 0; l < input_w; ++l) {
+          int offset = i * (input_c * input_h * input_w) +
+                       j * (input_h * input_w) + k * input_w + l;
+          if (tiu_output_data[offset] != output_data[offset]) {
+            printf(" [ni=%d][oci=%d][ohi=%d][owi=%d] output %d(tiu) != "
+                   "%d(ref)\n",
+                   i, j, k, l, tiu_output_data[offset], output_data[offset]);
+            ret = -1;
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  if (ret) {
+    dump_test_param(p_param, /*dump_content=*/true);
+  }
+
+  // Reverse order
+  bmk1880v2_lmem_free_tensor(bk_ctx, tl_output);
+  bmk1880v2_lmem_free_tensor(bk_ctx, tl_input2);
+  bmk1880v2_lmem_free_tensor(bk_ctx, tl_input1);
+
+  free(input1_data);
+  free(input2_data);
+  free(output_data);
+  free(tiu_output_data);
+
+#ifdef ENABLE_DEBUG_MSG
+  printf(" <= run_compare_elt_mul, ret %d\n", ret);
+#endif
+
+  return ret;
+}
+
+int random_test(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx)
+{
+  int ret = 0;
+
+  printf("Random Test =>\n");
+
+#if 0
+  int input_n_range[] = {1};
+  int input_c_range[] = {1};
+  int input_h_range[] = {1};
+  int input_w_range[] = {1};
+#else
+#ifndef ENABLE_TV_GEN_PATTERN
+  int input_n_range[] = {1, 2, 4, 8, 16, 32, 64, 4095 - 32};
+  int input_c_range[] = {1, 3, 11, 128, 512, 1024, 2048, 4095 - 32};
+  int input_h_range[] = {1, 3, 11, 128, 512, 1024, 2048, 4095 - 32};
+  int input_w_range[] = {1, 3, 11, 128, 512, 1024, 2048, 4095 - 32};
+#else
+  // TV_GEN
+  // Random Test, total 81, skipped 8095, executed 5, failed 0, ret 0
+
+  int input_n_range[] = {1, 2, 4095 - 32};
+  int input_c_range[] = {1, 512, 4095 - 32};
+  int input_h_range[] = {1, 512, 4095 - 32};
+  int input_w_range[] = {1, 512, 4095 -
32}; +#endif +#endif + + const int input_n_range_size = + sizeof(input_n_range) / sizeof(input_n_range[0]); + const int input_c_range_size = + sizeof(input_c_range) / sizeof(input_c_range[0]); + const int input_h_range_size = + sizeof(input_h_range) / sizeof(input_h_range[0]); + const int input_w_range_size = + sizeof(input_w_range) / sizeof(input_w_range[0]); + + int random_seed = clock(); + srand(random_seed); + + const int retry_test_count = 100; + bool stop_at_first_error = true; + + int total_tests = input_n_range_size * input_c_range_size * + input_h_range_size * input_w_range_size; + int skipped_tests = 0; + int executed_tests = 0; + int failed_tests = 0; + int current_test = 0; + + printf("Random Test =>\n"); + for (int m = 0; m < retry_test_count; ++m) { + for (int i = 0; i < input_n_range_size; ++i) { + int input_n = choose_from_range(input_n_range, input_n_range_size, i); + + for (int j = 0; j < input_c_range_size; ++j) { + int input_c = choose_from_range(input_c_range, input_c_range_size, j); + + for (int k = 0; k < input_h_range_size; ++k) { + int input_h = choose_from_range(input_h_range, input_h_range_size, k); + + for (int l = 0; l < input_w_range_size; ++l) { + int input_w = + choose_from_range(input_w_range, input_w_range_size, l); + +#ifdef ENABLE_DEBUG_MSG + printf(" [%d/%d] random test: input shape (%d, %d, %d, %d)\n", + current_test, total_tests, input_n, input_c, input_h, + input_w); +#else + if ((current_test % 1000) == 0) { + printf(" [%d/%d] random test: input shape (%d, %d, %d, %d)\n", + current_test, total_tests, input_n, input_c, input_h, + input_w); + } +#endif + + current_test++; + + elt_mul_test_param_t test_param; + memset(&test_param, 0, sizeof(test_param)); + test_param.input_n = input_n; + test_param.input_c = input_c; + test_param.input_h = input_h; + test_param.input_w = input_w; + test_param.retry_cnt = 5; + + bool is_valid_param = check_valid_test_param(bk_ctx, &test_param); + if (is_valid_param == false) { + skipped_tests++; +#ifdef ENABLE_DEBUG_MSG + printf(" [%d/%d] random test: invalid parameter, skip\n", + current_test, total_tests); +#endif + continue; + } + + int ret2 = run_compare_elt_mul(ctx, bk_ctx, &test_param); + failed_tests = ret2 ? 
failed_tests + 1 : failed_tests; + ret |= ret2; + executed_tests++; + +#ifdef ENABLE_DEBUG_MSG + printf( + " [%d/%d] random test: input shape (%d, %d, %d, %d), ret %d\n", + current_test, total_tests, input_n, input_c, input_h, input_w, + ret2); +#endif + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + + if (executed_tests >= MIN_EXEC_TESTS) { + break; + } + } + + printf( + "<= Random Test, total %d, skipped %d, executed %d, failed %d, ret %d\n", + total_tests, skipped_tests, executed_tests, failed_tests, ret); + + return ret; +} + +int main() +{ + int ret = 0; + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + ret = simple_test(&ctx, bk_ctx); + ret |= random_test(&ctx, bk_ctx); + + test_exit(&ctx); + + return ret; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tensor_or.cpp b/cviruntime/test/1880v2/test_1880v2_tensor_or.cpp new file mode 100644 index 000000000..d0907fa25 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tensor_or.cpp @@ -0,0 +1,181 @@ +#include "1880v2_test_util.h" + +static void tl_or_int8_ref(s8 *a, s8 *b, s8 *res, u64 size) +{ + for (u64 i = 0; i < size; i++) + res[i] = a[i] | b[i]; +} + +static void tl_or_int16_ref( + u8 *ref_high, u8 *ref_low, + u8 *a_high, u8 *a_low, + u8 *b_high, u8 *b_low, + u64 size) +{ + for (u64 i = 0; i < size; i++) { + s32 ta = ((s8)a_high[i] << 8) + a_low[i]; + s32 tb = ((s8)b_high[i] << 8) + b_low[i]; + s32 res = ta | tb; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } +} + +static void test_tl_or_int8(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + s8 *a_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) + a_data[i] = (s8)(i % 256); + + s8 *b_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) + b_data[i] = (s8)(100 - i % 256); + + s8 *ref_data = (s8 *)xmalloc(size); + tl_or_int8_ref(a_data, b_data, ref_data, size); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a, (u8 *)a_data); + put_tensor_g2l(ctx, bk_ctx, tl_b, (u8 *)b_data); + + bmk1880v2_tiu_element_wise_or_int8_param_t p9; + memset(&p9, 0, sizeof(p9)); + p9.res = tl_res; + p9.a = tl_a; + p9.b = tl_b; + bmk1880v2_tiu_element_wise_or_int8(bk_ctx, &p9); + u8 *res_data = get_tensor_l2g(ctx, bk_ctx, tl_res); + + for (u64 i = 0; i < size; i++) { + if ((s8)res_data[i] != ref_data[i]) { + fprintf(stderr, "comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, res_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res); + free_tl(bk_ctx, tl_b); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(b_data); + free(ref_data); + free(res_data); +} + +static void test_tl_or_int16(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + u8 *a_high_data = (u8 *)xmalloc(size); + u8 *a_low_data = (u8 
*)xmalloc(size); + u8 *b_high_data = (u8 *)xmalloc(size); + u8 *b_low_data = (u8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) { + a_high_data[i] = i / 10; + a_low_data[i] = i; + b_high_data[i] = (i + 250) / 20; + b_low_data[i] = 100 - i; + } + + u8 *ref_high_data = (u8 *)xmalloc(size); + u8 *ref_low_data = (u8 *)xmalloc(size); + tl_or_int16_ref( + ref_high_data, ref_low_data, + a_high_data, a_low_data, + b_high_data, b_low_data, + size); + + tl_t *tl_a_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_a_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a_low, a_low_data); + put_tensor_g2l(ctx, bk_ctx, tl_a_high, a_high_data); + put_tensor_g2l(ctx, bk_ctx, tl_b_low, b_low_data); + put_tensor_g2l(ctx, bk_ctx, tl_b_high, b_high_data); + bmk1880v2_tiu_element_wise_or_int16_param_t p9; + memset(&p9, 0, sizeof(p9)); + p9.res_high = tl_res_high; + p9.res_low = tl_res_low; + p9.a_high = tl_a_high; + p9.a_low = tl_a_low; + p9.b_high = tl_b_high; + p9.b_low = tl_b_low; + bmk1880v2_tiu_element_wise_or_int16(bk_ctx, &p9); + u8 *res_high_data = get_tensor_l2g(ctx, bk_ctx, tl_res_high); + u8 *res_low_data = get_tensor_l2g(ctx, bk_ctx, tl_res_low); + + for (u64 i = 0; i < size; i++) { + if (res_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at res_high_data[%" PRIu64 "], got %d, exp %d\n", + i, res_high_data[i], ref_high_data[i]); + exit(-1); + } + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res_high); + free_tl(bk_ctx, tl_res_low); + free_tl(bk_ctx, tl_b_high); + free_tl(bk_ctx, tl_b_low); + free_tl(bk_ctx, tl_a_high); + free_tl(bk_ctx, tl_a_low); + + free(a_high_data); + free(a_low_data); + free(b_high_data); + free(b_low_data); + free(ref_high_data); + free(ref_low_data); + free(res_high_data); + free(res_low_data); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_or_int8(&ctx, bk_ctx, 0); + test_tl_or_int8(&ctx, bk_ctx, 1); + test_tl_or_int16(&ctx, bk_ctx, 0); + test_tl_or_int16(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tensor_sub.cpp b/cviruntime/test/1880v2/test_1880v2_tensor_sub.cpp new file mode 100644 index 000000000..2d303adaa --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tensor_sub.cpp @@ -0,0 +1,118 @@ +#include "1880v2_test_util.h" + +static void tl_sub_ref( + u8 *ref_high, u8 *ref_low, + u8 *a_high, u8 *a_low, + u8 *b_high, u8 *b_low, + u64 size) +{ + for (u64 i = 0; i < size; i++) { + s32 ta = ((s8)a_high[i] << 8) + a_low[i]; + s32 tb = ((s8)b_high[i] << 8) + b_low[i]; + s32 res = ta - tb; + if (res > 32767) + res = 32767; + else if (res < -32768) + res = -32768; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } +} + +static void test_tl_sub(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + u8 *a_high_data = (u8 *)xmalloc(size); + u8 *a_low_data = (u8 *)xmalloc(size); 
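+  /* The 16-bit operands are carried as separate high-byte and low-byte tensors;
+   * tl_sub_ref recombines them as ((s8)high << 8) + low before subtracting. */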
+ u8 *b_high_data = (u8 *)xmalloc(size); + u8 *b_low_data = (u8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) { + a_high_data[i] = i / 10; + a_low_data[i] = i; + b_high_data[i] = (i + 250) / 20; + b_low_data[i] = 100 - i; + } + + u8 *ref_high_data = (u8 *)xmalloc(size); + u8 *ref_low_data = (u8 *)xmalloc(size); + tl_sub_ref(ref_high_data, ref_low_data, + a_high_data, a_low_data, + b_high_data, b_low_data, + size); + + tl_t *tl_a_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_a_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_low = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_high = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a_low, a_low_data); + put_tensor_g2l(ctx, bk_ctx, tl_a_high, a_high_data); + put_tensor_g2l(ctx, bk_ctx, tl_b_low, b_low_data); + put_tensor_g2l(ctx, bk_ctx, tl_b_high, b_high_data); + bmk1880v2_tiu_element_wise_sub_param_t p5; + memset(&p5, 0, sizeof(p5)); + p5.res_high = tl_res_high; + p5.res_low = tl_res_low; + p5.a_high = tl_a_high; + p5.a_low = tl_a_low; + p5.b_high = tl_b_high; + p5.b_low = tl_b_low; + p5.rshift_bits = 0; + bmk1880v2_tiu_element_wise_sub(bk_ctx, &p5); + u8 *res_high_data = get_tensor_l2g(ctx, bk_ctx, tl_res_high); + u8 *res_low_data = get_tensor_l2g(ctx, bk_ctx, tl_res_low); + + for (u64 i = 0; i < size; i++) { + if (res_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at res_high_data[%" PRIu64 "], got %d, exp %d\n", + i, res_high_data[i], ref_high_data[i]); + exit(-1); + } + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res_high); + free_tl(bk_ctx, tl_res_low); + free_tl(bk_ctx, tl_b_high); + free_tl(bk_ctx, tl_b_low); + free_tl(bk_ctx, tl_a_high); + free_tl(bk_ctx, tl_a_low); + + free(a_high_data); + free(a_low_data); + free(b_high_data); + free(b_low_data); + free(ref_high_data); + free(ref_low_data); + free(res_high_data); + free(res_low_data); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_sub(&ctx, bk_ctx, 0); + test_tl_sub(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tensor_transfer.cpp b/cviruntime/test/1880v2/test_1880v2_tensor_transfer.cpp new file mode 100644 index 000000000..a633632fc --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tensor_transfer.cpp @@ -0,0 +1,103 @@ +#include "1880v2_test_util.h" + +static void test_put_and_get_tensor_l2g( + CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx) +{ + int n = 2; + int c = 66; + int h = 3; + int w = 15; + int size = n * c * h * w; + u8 *data_x = (u8 *)xmalloc(size); + u8 *data_y = (u8 *)xmalloc(size); + + for (int i = 0; i < size; i++) + data_x[i] = i - 100; + + for (int i = 0; i < size; i++) + data_y[i] = -i; + + /* + * Interleave two tensors in case the same devmem is reused between + * put_tensor_g2l() and get_tensor_l2g(), in which case the content of + * devmem is already what is expected before bmk1880v2_gdma_store(bk_ctx, ). 
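+ * Reading result_y back before result_x in the second pass additionally
+ * checks that the order of get_tensor_l2g() calls does not matter.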
+ */ + + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + tg_shape_t ts_shape; + ts_shape.n = n; + ts_shape.c = c; + ts_shape.h = h; + ts_shape.w = w; + + tl_t *tl_x = + alloc_tl(bk_ctx, tl_shape, FMT_I8, 1); + tl_t *tl_y = + alloc_tl(bk_ctx, tl_shape, FMT_I8, 1); + + tg_t ts_x; + ts_x.base_reg_index = 0; + ts_x.start_address = 0; + ts_x.shape = ts_shape; + ts_x.stride = bmk1880v2_tensor_tgmem_default_stride(ts_shape, FMT_I8); + + put_tensor_g2l(ctx, bk_ctx, tl_x, data_x); + put_tensor_g2l(ctx, bk_ctx, tl_y, data_y); + + u8 *result_x = get_tensor_l2g(ctx, bk_ctx, tl_x); + u8 *result_y = get_tensor_l2g(ctx, bk_ctx, tl_y); + + for (int i = 0; i < size; i++) { + if (result_x[i] != data_x[i]) { + printf("compare failed at result_x[%d]\n", i); + exit(-1); + } + if (result_y[i] != data_y[i]) { + printf("compare failed at result_y[%d]\n", i); + exit(-1); + } + } + free(result_x); + free(result_y); + + /* + * Get result_y before result_x. + */ + + + result_y = get_tensor_l2g(ctx, bk_ctx, tl_y); + result_x = get_tensor_l2g(ctx, bk_ctx, tl_x); + for (int i = 0; i < size; i++) { + if (result_x[i] != data_x[i]) { + printf("compare failed at result_x[%d]\n", i); + exit(-1); + } + if (result_y[i] != data_y[i]) { + printf("compare failed at result_y[%d]\n", i); + exit(-1); + } + } + free(result_x); + free(result_y); + + free_tl(bk_ctx, tl_y); + free_tl(bk_ctx, tl_x); + free(data_x); + free(data_y); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + test_put_and_get_tensor_l2g(&ctx, bk_ctx); + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tensor_xor.cpp b/cviruntime/test/1880v2/test_1880v2_tensor_xor.cpp new file mode 100644 index 000000000..49eb54ac9 --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tensor_xor.cpp @@ -0,0 +1,182 @@ +#include "1880v2_test_util.h" + +static void tl_xor_int8_ref(s8 *a, s8 *b, s8 *res, u64 size) +{ + for (u64 i = 0; i < size; i++) + res[i] = a[i] ^ b[i]; +} + +static void tl_xor_int16_ref( + u8 *ref_high, u8 *ref_low, + u8 *a_high, u8 *a_low, + u8 *b_high, u8 *b_low, + u64 size) +{ + for (u64 i = 0; i < size; i++) { + s32 ta = ((s8)a_high[i] << 8) + a_low[i]; + s32 tb = ((s8)b_high[i] << 8) + b_low[i]; + s32 res = ta ^ tb; + ref_high[i] = (res >> 8) & 0xff; + ref_low[i] = res & 0xff; + } +} + +static void test_tl_xor_int8(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + s8 *a_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) + a_data[i] = (s8)(i % 256); + + s8 *b_data = (s8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) + b_data[i] = (s8)(100 - i % 256); + + s8 *ref_data = (s8 *)xmalloc(size); + tl_xor_int8_ref(a_data, b_data, ref_data, size); + + tl_t *tl_a = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res = alloc_tl(bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a, (u8 *)a_data); + put_tensor_g2l(ctx, bk_ctx, tl_b, (u8 *)b_data); + + bmk1880v2_tiu_element_wise_xor_int8_param_t p; + memset(&p, 0, sizeof(p)); + p.res = tl_res; + p.a = tl_a; + p.b = tl_b; + bmk1880v2_tiu_element_wise_xor_int8(bk_ctx, &p); + u8 *res_data = get_tensor_l2g(ctx, bk_ctx, tl_res); + + for (u64 i = 0; i < size; i++) { + if ((s8)res_data[i] != ref_data[i]) { + fprintf(stderr, 
"comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, res_data[i], ref_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res); + free_tl(bk_ctx, tl_b); + free_tl(bk_ctx, tl_a); + + free(a_data); + free(b_data); + free(ref_data); + free(res_data); +} + +static void test_tl_xor_int16(CVI_RT_HANDLE *ctx, bmk_ctx_t *bk_ctx, int eu_align) +{ + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + u64 size = n * c * h * w; + u8 *a_high_data = (u8 *)xmalloc(size); + u8 *a_low_data = (u8 *)xmalloc(size); + u8 *b_high_data = (u8 *)xmalloc(size); + u8 *b_low_data = (u8 *)xmalloc(size); + for (u64 i = 0; i < size; i++) { + a_high_data[i] = i / 10; + a_low_data[i] = i; + b_high_data[i] = (i + 250) / 20; + b_low_data[i] = 100 - i; + } + + u8 *ref_high_data = (u8 *)xmalloc(size); + u8 *ref_low_data = (u8 *)xmalloc(size); + tl_xor_int16_ref( + ref_high_data, ref_low_data, + a_high_data, a_low_data, + b_high_data, b_low_data, + size); + + tl_t *tl_a_low = alloc_tl( bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_a_high = alloc_tl( bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b_low = alloc_tl( bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_b_high = alloc_tl( bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_low = alloc_tl( bk_ctx, tl_shape, FMT_I8, eu_align); + tl_t *tl_res_high = alloc_tl( bk_ctx, tl_shape, FMT_I8, eu_align); + + put_tensor_g2l(ctx, bk_ctx, tl_a_low, a_low_data); + put_tensor_g2l(ctx, bk_ctx, tl_a_high, a_high_data); + put_tensor_g2l(ctx, bk_ctx, tl_b_low, b_low_data); + put_tensor_g2l(ctx, bk_ctx, tl_b_high, b_high_data); + + bmk1880v2_tiu_element_wise_xor_int16_param_t p; + memset(&p, 0, sizeof(p)); + p.res_high = tl_res_high; + p.res_low = tl_res_low; + p.a_high = tl_a_high; + p.a_low = tl_a_low; + p.b_high = tl_b_high; + p.b_low = tl_b_low; + bmk1880v2_tiu_element_wise_xor_int16(bk_ctx, &p); + u8 *res_high_data = get_tensor_l2g(ctx, bk_ctx, tl_res_high); + u8 *res_low_data = get_tensor_l2g(ctx, bk_ctx, tl_res_low); + + for (u64 i = 0; i < size; i++) { + if (res_high_data[i] != ref_high_data[i]) { + fprintf(stderr, "comparing failed at res_high_data[%" PRIu64 "], got %d, exp %d\n", + i, res_high_data[i], ref_high_data[i]); + exit(-1); + } + if (res_low_data[i] != ref_low_data[i]) { + fprintf(stderr, "comparing failed at res_low_data[%" PRIu64 "], got %d, exp %d\n", + i, res_low_data[i], ref_low_data[i]); + exit(-1); + } + } + + free_tl(bk_ctx, tl_res_high); + free_tl(bk_ctx, tl_res_low); + free_tl(bk_ctx, tl_b_high); + free_tl(bk_ctx, tl_b_low); + free_tl(bk_ctx, tl_a_high); + free_tl(bk_ctx, tl_a_low); + + free(a_high_data); + free(a_low_data); + free(b_high_data); + free(b_low_data); + free(ref_high_data); + free(ref_low_data); + free(res_high_data); + free(res_low_data); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bk_ctx; + test_init(&ctx, &bk_ctx); + + test_tl_xor_int8(&ctx, bk_ctx, 0); + test_tl_xor_int8(&ctx, bk_ctx, 1); + test_tl_xor_int16(&ctx, bk_ctx, 0); + test_tl_xor_int16(&ctx, bk_ctx, 1); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_1880v2_tg_copy_tensor.cpp b/cviruntime/test/1880v2/test_1880v2_tg_copy_tensor.cpp new file mode 100644 index 000000000..a42af86ce --- /dev/null +++ b/cviruntime/test/1880v2/test_1880v2_tg_copy_tensor.cpp @@ -0,0 +1,104 @@ +#include "1880v2_test_util.h" + +typedef bmk1880v2_tdma_tg2tg_tensor_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) +{ 
+ fprintf( + f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", + tag, + p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w, + p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + tg_shape_t src_shape; + tg_stride_t src_stride; + tg_shape_t dst_shape; + tg_stride_t dst_stride; +} case_t; + +static case_t g_cases[] = { + { + {1, 3, 3, 3}, {27, 9, 3}, + {1, 3, 3, 3}, {27, 9, 3}, + }, + { + // YOLOv2 concat layer + {1, 256, 19, 19}, {92416, 361, 19}, + {1, 256, 19, 19}, {462080, 361, 19}, + } +}; + +static void test_param_g2g(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, param_t *p) +{ + print_param(stderr, p); + + u64 size = p->src->shape.c * p->src->shape.h * p->src->shape.w; + u8 *src_data = (u8 *)malloc(sizeof(u8) * size); + for (u64 i = 0; i < size; i++) + src_data[i] = 200 + i; + + put_tg_gmem(ctx, p->src, src_data); + + bmk1880v2_tdma_tg2tg_tensor_copy(bmk, p); + test_submit(ctx); + + u8 *dst_data = get_tg_gmem(ctx, p->dst); + + for (u64 i = 0; i < size; i++) { + if (dst_data[i] != src_data[i]) { + fprintf(stderr, "comparing failed at dst[%" PRIu64 "], got %d, exp %d\n", + i, dst_data[i], src_data[i]); + exit(-1); + } + } + + free(src_data); + free(dst_data); +} + +static void destroy_param_g2g(CVI_RT_HANDLE *ctx, param_t *p) +{ + free_tg_gmem(ctx, p->src); + free_tg_gmem(ctx, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, bmk_ctx_t *bmk, case_t *c) +{ + param_t p; + bmk1880v2_tensor_tgmem_t *src, *dst; + + src = alloc_tg_gmem(ctx, c->src_shape, FMT_I8); + src->stride.n = c->src_stride.n; + src->stride.c = c->src_stride.c; + src->stride.h = c->src_stride.h; + + dst = alloc_tg_gmem(ctx, c->dst_shape, FMT_I8); + dst->stride.n = c->dst_stride.n; + dst->stride.c = c->dst_stride.c; + dst->stride.h = c->dst_stride.h; + + memset(&p, 0, sizeof(p)); + p.src = src; + p.dst = dst; + test_param_g2g(ctx, bmk, &p); + + destroy_param_g2g(ctx, &p); +} + +int main() +{ + CVI_RT_HANDLE ctx; + bmk_ctx_t *bmk; + test_init(&ctx, &bmk); + + u32 nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (u32 i = 0; i < nr_cases; i++) + test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx); + return 0; +} diff --git a/cviruntime/test/1880v2/test_cv1880v2_conv.c b/cviruntime/test/1880v2/test_cv1880v2_conv.c new file mode 100644 index 000000000..012b43b24 --- /dev/null +++ b/cviruntime/test/1880v2/test_cv1880v2_conv.c @@ -0,0 +1,1291 @@ +#include +#include +#include +#include +#include +#include +#include +#include "test_cvikernel_util.h" +#include "test_tf_quant_util.h" +#include "test_native_ref.h" + +// #define ENABLE_DEBUG_MSG +// #define ENABLE_FULL_REGRESSION +// #define ENABLE_TV_GEN_PATTERN + +#define TEST_CASE_NAME "test_cv1880v2_conv" +#define MIN_EXEC_TESTS 20 + +typedef struct { + int input_n; + int input_c; + int input_h; + int input_w; + int kw; + int kh; + int dh; + int dw; + int pad_top; + int pad_bot; + int pad_left; + int pad_right; + int ins_h; + int ins_h_last; + int ins_w; + int ins_w_last; + int stride_h; + int stride_w; + int output_c; + int output_h; + int output_w; + int has_bias; + int relu_enable; + int8_t *input_data; + int8_t *filter_data; + int8_t *output_data; + int32_t *bias_data; + uint32_t *multiplier_data; + int8_t *shift_data; + uint8_t *chl_quan_data; + uint32_t chl_quan_data_size; + float float_multiplier; + int retry_cnt; +} conv_test_param_t; + +static inline int Offset(cvk_tl_shape_t shape, int n, int c, int h, int w) +{ + return n * (shape.c * shape.h * 
shape.w) + c * (shape.h * shape.w) + + h * shape.w + w; +} + +void conv_per_channel_ref(conv_test_param_t *p_param) +{ + const int stride_width = p_param->stride_w; + const int stride_height = p_param->stride_h; + const int dilation_width_factor = 1; + const int dilation_height_factor = 1; + const int pad_width = p_param->pad_left; + const int pad_height = p_param->pad_top; + + const int32_t output_activation_min = -128; + const int32_t output_activation_max = 127; + + const int batches = p_param->input_n; + const int input_depth = p_param->input_c; + const int output_depth = p_param->output_c; + + const int input_height = p_param->input_h; + const int input_width = p_param->input_w; + const int filter_height = p_param->kh; + const int filter_width = p_param->kw; + const int output_height = p_param->output_h; + const int output_width = p_param->output_w; + int8_t *input_data = p_param->input_data; + int8_t *filter_data = p_param->filter_data; + int8_t *output_data = p_param->output_data; + int32_t *bias_data = p_param->has_bias ? p_param->bias_data : NULL; + uint32_t *output_multiplier = p_param->multiplier_data; + int8_t *output_rshift = p_param->shift_data; + + cvk_tl_shape_t input_shape = { + batches, input_depth, + input_height, input_width}; + cvk_tl_shape_t filter_shape = { + output_depth, filter_height, + filter_width, input_depth}; + cvk_tl_shape_t output_shape = { + batches, output_depth, + output_height, output_width}; + +#ifdef ENABLE_DEBUG_MSG + printf("conv_per_channel_ref: \n" + " input (n=%d, ic=%d, h=%d, w=%d)\n" + " kernel (oc=%d, kh=%d, kw=%d, ic=%d)\n", + batches, input_depth, input_height, input_width, output_depth, + filter_height, filter_width, input_depth); +#endif + + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + if (is_point_inside_image) { + int32_t input_val = input_data[Offset(input_shape, batch, + in_channel, in_y, in_x)]; + // int32_t filter_val = filter_data[Offset(filter_shape, + // out_channel, in_channel, + // filter_y, filter_x)]; + int32_t filter_val = + filter_data[Offset(filter_shape, out_channel, filter_y, + filter_x, in_channel)]; + acc += filter_val * input_val; + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d]" + "[filter_y=%d][filter_x=%d][in_channel=%d] acc(%d) += " + "%d * %d = %d\n", + batch, out_channel, out_y, out_x, filter_y, filter_x, + in_channel, acc - filter_val * input_val, filter_val, + input_val, acc); +#endif + } + } + } + } + + if (bias_data) { + acc += bias_data[out_channel]; + } + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d] acc = %d, " + "bias %d\n", + batch, out_channel, out_y, out_x, acc, + bias_data ? 
bias_data[out_channel] : 0); +#endif + + acc = MultiplyByQuantizedMultiplier( + acc, output_multiplier[out_channel], output_rshift[out_channel]); + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d] acc = %d, " + "multiplier %d, shift %d\n", + batch, out_channel, out_y, out_x, acc, + output_multiplier[out_channel], output_rshift[out_channel]); +#endif + + acc = MAX(acc, output_activation_min); + acc = MIN(acc, output_activation_max); + +#ifdef ENABLE_DEBUG_MSG + printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d] acc = %d\n", + batch, out_channel, out_y, out_x, acc); +#endif + + output_data[Offset(output_shape, batch, out_channel, out_y, out_x)] = + acc; + } + } + } + } +} + +void calc_conv_float_multiplier(conv_test_param_t *p_param) +{ + const int stride_width = p_param->stride_w; + const int stride_height = p_param->stride_h; + const int dilation_width_factor = 1; + const int dilation_height_factor = 1; + const int pad_width = p_param->pad_left; + const int pad_height = p_param->pad_top; + + const int batches = p_param->input_n; + const int input_depth = p_param->input_c; + const int output_depth = p_param->output_c; + + const int input_height = p_param->input_h; + const int input_width = p_param->input_w; + const int filter_height = p_param->kh; + const int filter_width = p_param->kw; + const int output_height = p_param->output_h; + const int output_width = p_param->output_w; + int8_t *input_data = p_param->input_data; + int8_t *filter_data = p_param->filter_data; + int32_t *bias_data = p_param->has_bias ? p_param->bias_data : NULL; + + cvk_tl_shape_t input_shape = { + batches, input_depth, + input_height, input_width}; + cvk_tl_shape_t filter_shape = { + output_depth, filter_height, + filter_width, input_depth}; + + int output_accu_min = INT_MAX; + int output_accu_max = INT_MIN; + +#ifdef ENABLE_DEBUG_MSG + printf("calc_conv_float_multiplier =>\n"); +#endif + for (int batch = 0; batch < batches; ++batch) { + for (int out_y = 0; out_y < output_height; ++out_y) { + for (int out_x = 0; out_x < output_width; ++out_x) { + for (int out_channel = 0; out_channel < output_depth; ++out_channel) { + const int in_x_origin = (out_x * stride_width) - pad_width; + const int in_y_origin = (out_y * stride_height) - pad_height; + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) { + for (int filter_x = 0; filter_x < filter_width; ++filter_x) { + for (int in_channel = 0; in_channel < input_depth; ++in_channel) { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + const int in_y = + in_y_origin + dilation_height_factor * filter_y; + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && + (in_y < input_height); + if (is_point_inside_image) { + int32_t input_val = input_data[Offset(input_shape, batch, + in_channel, in_y, in_x)]; + // int32_t filter_val = filter_data[Offset(filter_shape, + // out_channel, in_channel, + // filter_y, filter_x)]; + int32_t filter_val = + filter_data[Offset(filter_shape, out_channel, filter_y, + filter_x, in_channel)]; + acc += filter_val * input_val; + + // printf(" [batch=%d][out_channel=%d][out_y=%d][out_x=%d]" + // "[filter_y=%d][filter_x=%d][in_channel=%d] acc(%d) + // += %d * %d = %d\n", batch, out_channel, out_y, + // out_x, filter_y, filter_x, in_channel, acc - + // filter_val * input_val, filter_val, input_val, acc); + } + } + } + } + + if (bias_data) { + acc += bias_data[out_channel]; + } + + output_accu_max = MAX(acc, output_accu_max); + 
output_accu_min = MIN(acc, output_accu_min);
+        }
+      }
+    }
+  }
+
+  // Since int8 ranges from -128 to 127, pick the scale from whichever
+  // accumulator extreme has the larger magnitude, so that it lands exactly
+  // on the corresponding int8 bound.
+  if (abs(output_accu_max) > abs(output_accu_min)) {
+    p_param->float_multiplier = 127.0f / abs(output_accu_max);
+  } else {
+    p_param->float_multiplier = 128.0f / abs(output_accu_min);
+  }
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("  output_accu_min %d, output_accu_max %d, output_multiplier %f\n",
+         output_accu_min, output_accu_max, p_param->float_multiplier);
+#endif
+
+#ifdef ENABLE_DEBUG_MSG
+  printf("<= calc_conv_float_multiplier\n");
+#endif
+}
+
+
+static void fill_random_data_s8(int8_t *input_data, int size)
+{
+  for (int i = 0; i < size; ++i) {
+    // Saturate roughly one value in a thousand to exercise the extremes.
+    int is_saturated = ((rand() % 1000) == 1) ? 1 : 0;
+    int is_sign = rand() % 2 ? 1 : -1;
+
+    if (is_saturated && is_sign < 0) {
+      input_data[i] = -128;
+    } else if (is_saturated) {
+      input_data[i] = 127;
+    } else {
+      input_data[i] = is_sign * (rand() % 128);
+    }
+  }
+}
+
+static void fill_random_data_s32(int32_t *input_data, int size)
+{
+  for (int i = 0; i < size; ++i) {
+    int is_saturated = ((rand() % 1000) == 1) ? 1 : 0;
+    int is_sign = rand() % 2 ? 1 : -1;
+
+    if (is_saturated && is_sign < 0) {
+      input_data[i] = INT_MIN;
+    } else if (is_saturated) {
+      input_data[i] = INT_MAX;
+    } else {
+      input_data[i] = is_sign * (rand() % 128);
+    }
+  }
+}
+
+static int check_valid_test_param(cvk_context_t *cvk_ctx,
+                                  conv_test_param_t *p_param)
+{
+  int in = p_param->input_n;
+  int ic = p_param->input_c;
+  int ih = p_param->input_h;
+  int iw = p_param->input_w;
+  int oc = p_param->output_c;
+  int oh = p_param->output_h;
+  int ow = p_param->output_w;
+  int kh = p_param->kh;
+  int kw = p_param->kw;
+  int stride_h = p_param->stride_h;
+  int stride_w = p_param->stride_w;
+  int chl_quan_per_lane_data_size =
+      p_param->has_bias ?
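+      // Per-lane record size of the packed per-channel quantization data;
+      // it is budgeted against local memory below alongside the input,
+      // filter, and output tiles.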
9 : 5; // bias(4) + multiplier(4) + shift(1) + + // Skip invalid shape + if ((kh > ih) || (kw > iw) || (stride_h > ih) || (stride_w > iw)) { + return false; + } + + // multiply random-choosen value may exceeded than int32_t + uint32_t input_size = in * ic * ih * iw; + uint32_t kernel_size = oc * ic * kh * kw; + uint32_t output_size = in * oc * oh * ow; + + uint32_t lmem_size_per_lane = cvk_ctx->info.lmem_size; + uint32_t total_lmem_size = cvk_ctx->info.lmem_size * cvk_ctx->info.npu_num; + + uint32_t total_needed_size = + input_size + kernel_size + output_size + + chl_quan_per_lane_data_size * cvk_ctx->info.npu_num; + if (total_needed_size > total_lmem_size) { + return false; + } + + cvk_tl_shape_t input_shape = {in, ic, ih, iw}; + cvk_tl_shape_t filter_shape = {1, oc, kh * kw, ic}; + cvk_tl_shape_t output_shape = {in, oc, oh, ow}; + cvk_tl_shape_t chl_quan_shape = {1, oc, 1, chl_quan_per_lane_data_size}; + + uint32_t needed_size = + cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, input_shape, CVK_FMT_I8, /*eu_align=*/1) + + cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, filter_shape, CVK_FMT_I8, /*eu_align=*/0) + + cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, output_shape, CVK_FMT_I8, /*eu_align=*/1) + + cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, chl_quan_shape, CVK_FMT_I8, /*eu_align=*/0); + + // Skip invalid shape + if (needed_size > lmem_size_per_lane) { + return false; + } + + return true; +} + +static int choose_from_range(int table[], int size, int index) +{ + if (index >= size) { + return 0; + } + + int val = table[index]; + if (index < (size - 1)) { + int range = MAX(table[index + 1] - table[index] - 1, 1); + val += rand() % range; + } + + return val; +} + +static void save_input_data(conv_test_param_t *p_param) +{ + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + uint32_t input_size = in * ic * ih * iw; + char name[64]; + FILE *fp = NULL; + snprintf(name, sizeof(name), "%s_input_%d_%d_%d_%d.bin", + TEST_CASE_NAME, in, ic, ih, iw); + + fp = fopen(name, "wb"); + if (fp) { + printf("Write %d bytes to %s\n", input_size, name); + fwrite(p_param->input_data, input_size, 1, fp); + fclose(fp); + } else { + printf("Fail to open %s\n", name); + return; + } +} + +static void save_output_data(conv_test_param_t *p_param) +{ + int in = p_param->input_n; + int oc = p_param->output_c; + int oh = p_param->output_h; + int ow = p_param->output_w; + uint32_t output_size = in * oc * oh * ow; + char name[64]; + FILE *fp = NULL; + + snprintf(name, sizeof(name), "%s_%d_%d_%d_%d.bin", + TEST_CASE_NAME, in, oc, oh, ow); + + fp = fopen(name, "wb"); + if (fp) { + printf("Write %d bytes to %s\n", output_size, name); + fwrite(p_param->output_data, output_size, 1, fp); + fclose(fp); + } else { + printf("Fail to open %s\n", name); + return; + } +} + +static void save_kernel_data(conv_test_param_t *p_param) +{ + int ic = p_param->input_c; + int oc = p_param->output_c; + int kh = p_param->kh; + int kw = p_param->kw; + uint32_t kernel_size = oc * kh * kw * ic; + char name[64]; + FILE *fp = NULL; + + snprintf(name, sizeof(name), "%s_filter_oc%d_kh%d_kw%d_ic%d.bin", + TEST_CASE_NAME, oc, kh, kw, ic); + fp = fopen(name, "wb"); + if (fp) { + printf("Write %d bytes to %s\n", kernel_size, name); + fwrite(p_param->filter_data, kernel_size, 1, fp); + fclose(fp); + } else { + printf("Fail to open %s\n", name); + return; + } + + snprintf(name, sizeof(name), "%s_bias_oc%d.bin", + TEST_CASE_NAME, oc); + fp = fopen(name, "wb"); + if (fp) { + printf("Write %" PRIu64 " 
bytes to %s\n", sizeof(int32_t) * oc, name); + fwrite(p_param->bias_data, sizeof(int32_t) * oc, 1, fp); + fclose(fp); + } else { + printf("Fail to open %s\n", name); + return; + } + + snprintf(name, sizeof(name), "%s_multiplier_oc%d.bin", + TEST_CASE_NAME, oc); + fp = fopen(name, "wb"); + if (fp) { + printf("Write %" PRIu64 " bytes to %s\n", sizeof(int32_t) * oc, name); + fwrite(p_param->multiplier_data, sizeof(int32_t) * oc, 1, fp); + fclose(fp); + } else { + printf("Fail to open %s\n", name); + return; + } + + snprintf(name, sizeof(name), "%s_rshift_oc%d.bin", + TEST_CASE_NAME, oc); + fp = fopen(name, "wb"); + if (fp) { + printf("Write %d bytes to %s\n", oc, name); + fwrite(p_param->shift_data, oc, 1, fp); + fclose(fp); + } else { + printf("Fail to open %s\n", name); + return; + } +} + +static void save_test_param(conv_test_param_t *p_param) +{ + printf("Save test parameter:\n"); + printf(" input (%d, %d, %d, %d)\n", + p_param->input_n, p_param->input_c, p_param->input_h, + p_param->input_w); + printf(" filter (oc=%d, kh=%d, kw=%d, ic=%d), dh=%d, dw=%d\n", + p_param->output_c, p_param->kh, p_param->kw, p_param->input_c, + p_param->dh, p_param->dw); + printf("output (%d, %d, %d, %d)\n", + p_param->input_n, p_param->output_c, p_param->output_h, + p_param->output_w); + printf(" pad_top %d, pad_bot %d, pad_left %d, pad_right %d\n", + p_param->pad_top, p_param->pad_bot, p_param->pad_left, + p_param->pad_right); + printf(" ins_h %d, ins_h_last %d, ins_w %d, ins_w_last %d\n", + p_param->ins_h, p_param->ins_h_last, p_param->ins_w, + p_param->ins_w_last); + printf(" stride_h %d, stride_w %d\n", p_param->stride_h, p_param->stride_w); + printf(" has_bias %d, relu_enable %d\n", + p_param->has_bias, p_param->relu_enable); + + save_input_data(p_param); + save_output_data(p_param); + save_kernel_data(p_param); +} + +int run_compare_conv(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, + conv_test_param_t *p_param) +{ + int ret = 0; + + if (rt_handle == NULL || cvk_ctx == NULL) { + return -1; + } + + int in = p_param->input_n; + int ic = p_param->input_c; + int ih = p_param->input_h; + int iw = p_param->input_w; + int oc = p_param->output_c; + int oh = p_param->output_h; + int ow = p_param->output_w; + int kh = p_param->kh; + int kw = p_param->kw; + int dh = p_param->dh; + int dw = p_param->dw; + int pad_top = p_param->pad_top; + int pad_bot = p_param->pad_bot; + int pad_left = p_param->pad_left; + int pad_right = p_param->pad_right; + int ins_h = p_param->ins_h; + int ins_last_h = p_param->ins_h_last; + int ins_w = p_param->ins_w; + int ins_last_w = p_param->ins_w_last; + int stride_h = p_param->stride_h; + int stride_w = p_param->stride_w; + int has_bias = p_param->has_bias; + int relu_enable = p_param->relu_enable; + + int input_size = in * ic * iw * ih; + int8_t *input_data = (int8_t *)malloc(input_size); + + int kernel_size = oc * ic * kh * kw; + int8_t *kernel_data = (int8_t *)malloc(kernel_size); + + int output_size = in * oc * oh * ow; + int8_t *output_data = (int8_t *)malloc(output_size); + if (!kernel_data || !output_data) { + free(kernel_data); + free(output_data); + return -1; + } + + memset(output_data, 0, output_size); + + int32_t *bias_data = (int32_t *) malloc(sizeof(int32_t) * oc); + uint32_t *multiplier_data = (uint32_t *) malloc(sizeof(uint32_t) * oc); + int8_t *shift_data = (int8_t *)malloc(oc); + + p_param->input_data = input_data; + p_param->filter_data = kernel_data; + p_param->output_data = output_data; + p_param->has_bias = has_bias; + p_param->bias_data = bias_data; + 
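+  // Wire the freshly allocated host-side buffers into the parameter block
+  // shared by the reference model and the TIU run; this function keeps
+  // ownership of every allocation.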
p_param->multiplier_data = multiplier_data; + p_param->shift_data = shift_data; + +#ifdef ENABLE_DEBUG_MSG + printf(" run_compare_conv =>\n"); + printf(" input (n=%d, ic=%d, h=%d, w=%d), kernel (oc=%d, ic=%d, h=%d, " + "w=%d), output (, c=%d, h=%d, w=%d), has_bias %d\n", + in, ic, ih, iw, oc, ic, kh, kw, oc, oh, ow, has_bias); +#endif + + int retry_cnt = p_param->retry_cnt; + do { + fill_random_data_s8(input_data, input_size); + fill_random_data_s8(kernel_data, kernel_size); + if (has_bias) { + fill_random_data_s32(bias_data, oc); + } + + p_param->float_multiplier = 100.0; // should be < 1.0 + calc_conv_float_multiplier(p_param); + + if (p_param->float_multiplier > 0.f && p_param->float_multiplier < 1.0) { + break; + } + + } while (--retry_cnt); + + if (p_param->float_multiplier >= 1.0) { + printf(" run_compare_dw_conv: unable to find valid multiplier\n"); + free(input_data); + free(kernel_data); + free(output_data); + free(bias_data); + free(multiplier_data); + free(shift_data); + + return -1; + } + + uint32_t base_multiplier = 0; + int base_shift = 0; + QuantizeMultiplierSmallerThanOne(p_param->float_multiplier, &base_multiplier, + &base_shift); + + for (int i = 0; i < oc; ++i) { + // multipliers typically range in [2^30 ; 2^31 - 1]. + // Values in [0, 2^30 - 1] are normally unused, but harmless. + // Thus a good way to randomize multipliers is to subtract from them + // a random value smaller than 2^30 but still significant compared to it. + p_param->multiplier_data[i] = base_multiplier - (rand() % (1 << 26)); + + // Our H/W only supports right shift + int right_shift = base_shift - 1 + (rand() % 4); + p_param->shift_data[i] = right_shift > 0 ? right_shift : 0; + +#ifdef ENABLE_DEBUG_MSG + printf(" [oc=%d] multiplier_data %d, shift_data %d\n", i, + p_param->multiplier_data[i], p_param->shift_data[i]); +#endif + } + + conv_per_channel_ref(p_param); + + // w/ bias: bias(4) + multiplier(4) + shift(1) + // w/o bias: multiplier(4) + shift(1) + const int chl_quan_per_lane_data_size = + p_param->has_bias ? 
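+  // A sketch of the packed stream for oc = 2 with bias, matching the
+  // 18-byte buffer simple_test builds below: two 9-byte records back to
+  // back, each laid out per channel as
+  //   [bias:4B][multiplier:4B][rshift:1B]
+  // (the exact byte order inside pack_chl_quan_param is assumed to follow
+  // the comment above).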
9 : 5; + const int chl_quan_data_size = chl_quan_per_lane_data_size * oc; + uint8_t *chl_quan_data = (uint8_t *) malloc(chl_quan_data_size); + pack_chl_quan_param(oc, has_bias, bias_data, multiplier_data, shift_data, + chl_quan_data); + + p_param->chl_quan_data = chl_quan_data; + p_param->chl_quan_data_size = chl_quan_data_size; + + cvk_tl_shape_t input_shape = {in, ic, ih, iw}; + cvk_tl_shape_t filter_shape = {1, oc, kh * kw, ic}; + cvk_tl_shape_t output_shape = {in, oc, oh, ow}; + cvk_tl_shape_t chl_quan_shape = {1, oc, 1, chl_quan_per_lane_data_size}; + + cvk_tl_t *tl_input = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, input_shape, CVK_FMT_I8, + /*eu_aign=*/1); + + cvk_tl_t *tl_filter = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, filter_shape, CVK_FMT_I8, + /*eu_align=*/0); + + cvk_tl_t *tl_output = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, output_shape, CVK_FMT_I8, + /*eu_align=*/1); + + // Shape for TDMA load + cvk_tl_t *tl_quan_data = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, chl_quan_shape, CVK_FMT_U8, + /*eu_align*/ 0); + + if (tl_input == NULL) { + printf(" fail to alloc tl_input (%d, %d, %d, %d)\n", + input_shape.n, input_shape.c, input_shape.h, input_shape.w); + return -1; + } + if (tl_filter == NULL) { + printf(" fail to alloc tl_filter (%d, %d, %d, %d)\n", + filter_shape.n, filter_shape.c, filter_shape.h, filter_shape.w); + return -1; + } + if (tl_output == NULL) { + printf(" fail to alloc tl_output (%d, %d, %d, %d)\n", output_shape.n, + output_shape.c, output_shape.h, output_shape.w); + return -1; + } + if (tl_quan_data == NULL) { + printf(" fail to alloc tl_quan_data (%d, %d ,%d, %d)\n", + chl_quan_shape.n, chl_quan_shape.c, chl_quan_shape.h, + chl_quan_shape.w); + return -1; + } + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_quan_data, chl_quan_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_input, (uint8_t *)input_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_filter, (uint8_t *)kernel_data); + + { + // Reshape per channel quantization data for TIU + tl_quan_data->shape.n = 1; + tl_quan_data->shape.c = oc; + tl_quan_data->shape.h = 1; + tl_quan_data->shape.w = 1; + tl_quan_data->stride = cvk_ctx->ops->tl_default_stride( + cvk_ctx, tl_quan_data->shape, CVK_FMT_I8, /*eu_align=*/0); + + // Reshape weight for TIU + tl_filter->shape.n = ic; + tl_filter->shape.c = oc; + tl_filter->shape.h = kh; + tl_filter->shape.w = kw; + + cvk_tiu_convolution_param_t param; + memset(¶m, 0, sizeof(param)); + param.ofmap = tl_output; + param.ifmap = tl_input; + param.weight = tl_filter; + param.chl_quan_param = tl_quan_data; + param.ins_h = ins_h; + param.ins_last_h = ins_last_h; + param.ins_w = ins_w; + param.ins_last_w = ins_last_w; + param.stride_h = stride_h; + param.stride_w = stride_w; + param.dilation_h = dh; + param.dilation_w = dw; + param.pad_top = pad_top; + param.pad_bottom = pad_bot; + param.pad_left = pad_left; + param.pad_right = pad_right; + param.has_bias = has_bias; + param.relu_enable = relu_enable; + +#ifdef ENABLE_DEBUG_MSG + printf(" tiu_conv:\n"); + printf(" ifmap shape (%d, %d, %d, %d)\n", + param.ifmap->shape.n, param.ifmap->shape.c, param.ifmap->shape.h, + param.ifmap->shape.w); + printf(" weight shape (%d, %d, %d, %d)\n", + param.weight->shape.n, param.weight->shape.c, param.weight->shape.h, + param.weight->shape.w); + printf(" ofmap shape (%d, %d, %d, %d)\n", + param.ofmap->shape.n, param.ofmap->shape.c, param.ofmap->shape.h, + param.ofmap->shape.w); +#endif + + cvk_ctx->ops->tiu_convolution(cvk_ctx, ¶m); + } + + CVI_RT_Submit(cvk_ctx); + +#ifdef 
ENABLE_DEBUG_MSG + printf(" compare result:\n"); +#endif + int8_t *conv_output_data = + (int8_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_output); + for (int i = 0; i < in; ++i) { + for (int j = 0; j < oc; ++j) { + for (int k = 0; k < oh; ++k) { + for (int l = 0; l < ow; ++l) { + int offset = i * (oc * oh * ow) + j * (oh * ow) + k * ow + l; + if (conv_output_data[offset] != output_data[offset]) { + printf(" [ni=%d][oci=%d][ohi=%d][owi=%d] output %d(tiu) != " + "%d(ref)\n", + i, j, k, l, conv_output_data[offset], output_data[offset]); + ret = -1; + break; + } + } + } + } + } + + if (ret) { + save_test_param(p_param); + } + + // Reverse order + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_quan_data); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_filter); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input); + + free(conv_output_data); + + free(input_data); + free(kernel_data); + free(output_data); + free(bias_data); + free(multiplier_data); + free(shift_data); + free(chl_quan_data); + +#ifdef ENABLE_DEBUG_MSG + printf(" <= run_compare_conv\n"); +#endif + + return ret; +} + +static int simple_test(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + + const int batches = 1; + const int input_depth = 2; + const int input_height = 2; + const int input_width = 3; + cvk_tl_shape_t input_shape = {batches, input_depth, input_height, input_width}; + int8_t input_data[12] = { + 9, 1, -11, // ic = 0, h = 0 + 13, 5, -15, // ic = 0, h = 1 + 5, -7, -15, // ic = 1, h = 0 + 9, -11, -19 // ic = 1, h = 1 + }; + + const int output_depth = 2; + const int kernel_height = 2; + const int kernel_width = 2; + cvk_tl_shape_t filter_shape = {output_depth, input_depth, kernel_height, + kernel_width}; + + cvk_tl_shape_t quan_param_shape = {1, output_depth, 1, 9}; + + // TIU weight layout (1, oc, hw*kc, ic) + cvk_tl_shape_t filter_shape_for_dma = {1, output_depth, + kernel_height * kernel_width, input_depth}; + int8_t filter_data_for_dma[16] = { + 2, 4, 6, 8, 6, 8, 10, 12, // oc = 0 + 28, 32, 20, 24, 12, 16, 4, 8 // oc = 1 + }; + + int32_t bias_data[2] = {12, -16}; + + const int output_height = 1; + const int output_width = 2; + cvk_tl_shape_t output_shape = {1, output_depth, output_height, output_width}; + // zero_point = 0 + int8_t ref_output_data[4] = { + 17, -128, // oc = 0 + 60, -128, // oc = 1 + }; + + uint32_t output_multiplier[] = {1073741824, 1073741824}; + int8_t output_rshift[2] = {1, 2}; // changed to right shift + + int8_t output_data[4]; + + conv_test_param_t params; + memset(¶ms, 0, sizeof(params)); + + params.input_n = batches; + params.input_c = input_depth; + params.input_h = input_height; + params.input_w = input_width; + params.kh = kernel_height; + params.kw = kernel_width; + params.output_c = output_depth; + params.output_h = output_height; + params.output_w = output_width; + params.stride_w = 1; + params.stride_h = 1; + params.input_data = input_data; + params.filter_data = filter_data_for_dma; + params.output_data = output_data; + params.has_bias = 1; + params.bias_data = bias_data; + params.multiplier_data = output_multiplier; + params.shift_data = output_rshift; + conv_per_channel_ref(¶ms); + + printf("Compare ref and golden\n"); + for (int i = 0; i < 4; i++) { + if (output_data[i] != ref_output_data[i]) { + printf("Error ! 
output[%d]=%d != ref_output_data[%d]=%d\n", i, + output_data[i], i, ref_output_data[i]); + ret = -1; + } + } + + // cvk_tl_shape_t per_channel_cal_shape = {1, /*oc=*/2, 1, 9}; + uint8_t per_channel_quan_data[18]; + pack_chl_quan_param(2, /*has_bias=*/true, bias_data, output_multiplier, + output_rshift, per_channel_quan_data); + + cvk_tl_t *tl_per_channel_cal = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, quan_param_shape, CVK_FMT_U8, + /*eu_align*/ 0); + + cvk_tl_t *tl_input = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, input_shape, CVK_FMT_I8, + /*eu_aign=*/1); + + cvk_tl_t *tl_filter = cvk_ctx->ops->lmem_alloc_tensor( + cvk_ctx, filter_shape_for_dma, CVK_FMT_I8, /*eu_align=*/1); + + cvk_tl_t *tl_output = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, output_shape, CVK_FMT_I8, + /*eu_align=*/1); + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_per_channel_cal, + per_channel_quan_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_input, (uint8_t *)input_data); + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_filter, + (uint8_t *)filter_data_for_dma); + + // Restore filter shape for tiu operation + tl_filter->shape = filter_shape; + tl_filter->stride = cvk_ctx->ops->tl_default_stride( + cvk_ctx, tl_filter->shape, CVK_FMT_I8, /*eu_align=*/1); + + { + // Reshape per channel quantization data + tl_per_channel_cal->shape.n = 1; + tl_per_channel_cal->shape.c = 2; + tl_per_channel_cal->shape.h = 1; + tl_per_channel_cal->shape.w = 1; + tl_per_channel_cal->stride = cvk_ctx->ops->tl_default_stride( + cvk_ctx, tl_per_channel_cal->shape, CVK_FMT_I8, /*eu_align=*/0); + + cvk_tiu_convolution_param_t param; + memset(¶m, 0, sizeof(param)); + param.ofmap = tl_output; + param.ifmap = tl_input; + param.weight = tl_filter; + param.chl_quan_param = tl_per_channel_cal; + param.stride_h = 1; + param.stride_w = 1; + param.dilation_h = 1; + param.dilation_w = 1; + param.has_bias = 1; + cvk_ctx->ops->tiu_convolution(cvk_ctx, ¶m); + } + + CVI_RT_Submit(cvk_ctx); + + printf("Compare tiu and golden\n"); + int8_t *conv_output_data = + (int8_t *)tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_output); + for (uint64_t i = 0; i < sizeof(ref_output_data); i++) { + if (conv_output_data[i] != ref_output_data[i]) { + printf("output_data[%" PRIu64 "] %d != %d\n", i, conv_output_data[i], + ref_output_data[i]); + ret = -1; + } + } + + free(conv_output_data); + + // Reverse order + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_output); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_filter); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_input); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_per_channel_cal); + + return ret; +} + +static int random_test(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + + +#ifndef ENABLE_FULL_REGRESSION +#ifndef ENABLE_TV_GEN_PATTERN + // n: 12b, c: 12b, h: 12b(4095-32), wb: 12b(4095-32) + int batch_range[] = {1, 1, 2, 4095 - 32}; + int input_height_range[] = {1, 512, 1024, 4095 - 32}; + int input_width_range[] = {1, 512, 1024, 4095 - 32}; + int input_depth_range[] = {1, 16, 32, 4095 - 32}; + int output_depth_range[] = {1, 16, 32, 4095 - 32}; + + // h: 12b, w: 12b + // stride_h: 4b, stride_w: 4b + int kernel_height_range[] = {1, 11, 4095 - 32}; + int kernel_width_range[] = {1, 11, 4095 - 32}; + int kernel_stride_height_range[] = {1, 5, 15}; + int kernel_stride_width_range[] = {1, 5, 15}; +#else + // TV_GEN pattern + // Random Test, total 19683, skipped 118066, executed 32, failed 0, ret 0 + + // n: 12b, c: 12b, h: 12b(4095-32), wb: 12b(4095-32) + int batch_range[] = {1, 1, 32}; + int input_height_range[] 
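+  // Shape fields are 12 bits wide per the notes above, so 4095 is the
+  // largest encodable extent; the "- 32" below is presumably headroom so
+  // randomized values stay safely in range.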
= {1, 512, 4095 - 32}; + int input_width_range[] = {1, 512, 4095 - 32}; + int input_depth_range[] = {1, 16, 4095}; + int output_depth_range[] = {1, 16, 4095}; + + // h: 12b, w: 12b + // stride_h: 4b, stride_w: 4b + int kernel_height_range[] = {1, 11, 4095}; + int kernel_width_range[] = {1, 11, 4095}; + int kernel_stride_height_range[] = {1, 5, 15}; + int kernel_stride_width_range[] = {1, 5, 15}; + +#endif //ENABLE_TV_GEN_PATTERN +#else +#if 0 + // Input with same range size + int batch_range[] = {1}; + int input_height_range[] = {1}; + int input_width_range[] = {1}; + int input_depth_range[] = {1}; + const int input_range_size = sizeof(input_height_range)/sizeof(input_height_range[0]); + + // Kernel with same range size + int kernel_height_range[] = {1}; + int kernel_width_range[] = {1}; + int kernel_stride_height_range[] = {1}; + int kernel_stride_width_range[] = {1}; + int output_depth_range[] = {1}; + const int kernel_range_size = sizeof(kernel_height_range)/sizeof(kernel_height_range[0]); +#else + // 10/21/2019 overnight + // total 20480000, skipped 20301713, executed 178287, failed 0 + + // n: 12b, c: 12b, h: 12b(4095-32), wb: 12b(4095-32) + int batch_range[] = {1, 2, 4, 8, 16, 32, 64, 4095 - 32}; + int input_height_range[] = {1, 3, 11, 128, 512, 1024, 2048, 4095 - 32}; + int input_width_range[] = {1, 3, 11, 128, 512, 1024, 2048, 4095 - 32}; + int input_depth_range[] = {1, 3, 11, 32, 64, 1024, 2048, 4095}; + int output_depth_range[] = {1, 3, 11, 32, 64, 1024, 2048, 4095}; + + // h: 12b, w: 12b + // stride_h: 4b, stride_w: 4b + int kernel_height_range[] = {1, 3, 11, 511, 4095}; + int kernel_width_range[] = {1, 3, 11, 511, 4095}; + int kernel_stride_height_range[] = {1, 3, 5, 7, 15}; + int kernel_stride_width_range[] = {1, 3, 5, 7, 15}; +#endif +#endif /* ENABLE_FULL_REGRESSION */ + + const int batch_range_size = sizeof(batch_range) / sizeof(batch_range[0]); + const int input_height_range_size = + sizeof(input_height_range) / sizeof(input_height_range[0]); + const int input_width_range_size = + sizeof(input_width_range) / sizeof(input_width_range[0]); + const int input_depth_range_size = + sizeof(input_depth_range) / sizeof(input_depth_range[0]); + const int output_depth_range_size = + sizeof(output_depth_range) / sizeof(output_depth_range[0]); + + const int kernel_height_range_size = + sizeof(kernel_height_range) / sizeof(kernel_height_range[0]); + const int kernel_width_range_size = + sizeof(kernel_width_range) / sizeof(kernel_width_range[0]); + const int kernel_stride_height_range_size = + sizeof(kernel_stride_height_range) / + sizeof(kernel_stride_height_range[0]); + const int kernel_stride_width_range_size = + sizeof(kernel_stride_width_range) / sizeof(kernel_stride_width_range[0]); + + int random_seed = clock(); + srand(random_seed); + + const int retry_test_count = 100; + + bool stop_at_first_error = true; + + int total_tests = batch_range_size * input_depth_range_size * + input_height_range_size * input_width_range_size * + output_depth_range_size * kernel_height_range_size * + kernel_width_range_size * kernel_stride_height_range_size * + kernel_stride_width_range_size; + int skipped_tests = 0; + int executed_tests = 0; + int failed_tests = 0; + int current_test = 0; + + printf("Random Test =>\n"); + for (int m = 0; m < retry_test_count; ++m) { + for (int i = 0; i < batch_range_size; ++i) { + // random choosen from [range[i] : range[i+1]] + int batch = choose_from_range(batch_range, batch_range_size, i); + + for (int j = 0; j < input_height_range_size; ++j) { + int input_height 
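+        // choose_from_range(table, size, i) returns a value in
+        // [table[i], table[i + 1] - 1), e.g. with the default
+        // (non-TV_GEN) table {1, 512, 1024, 4095 - 32} and j = 1, the
+        // height falls in [512, 1022].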
= + choose_from_range(input_height_range, input_height_range_size, j); + + for (int k = 0; k < input_width_range_size; ++k) { + int input_width = + choose_from_range(input_width_range, input_width_range_size, k); + + for (int l = 0; l < input_depth_range_size; ++l) { + int input_depth = + choose_from_range(input_depth_range, input_depth_range_size, k); + + for (int m = 0; m < kernel_height_range_size; ++m) { + int kernel_height = choose_from_range( + kernel_height_range, kernel_height_range_size, m); + + for (int n = 0; n < kernel_width_range_size; ++n) { + int kernel_width = choose_from_range( + kernel_width_range, kernel_width_range_size, n); + + for (int x = 0; x < kernel_stride_height_range_size; ++x) { + int kernel_stride_height = + choose_from_range(kernel_stride_height_range, + kernel_stride_height_range_size, x); + + for (int y = 0; y < kernel_stride_width_range_size; ++y) { + int kernel_stride_width = + choose_from_range(kernel_stride_width_range, + kernel_stride_width_range_size, y); + + for (int z = 0; z < output_depth_range_size; ++z) { + int output_depth = choose_from_range( + output_depth_range, output_depth_range_size, y); + + current_test++; + + int has_bias = rand() % 2; + int dh = 1; + int dw = 1; + int ins_h = 0; + int ins_h_last = 0; + int ins_w = 0; + int ins_w_last = 0; + int pad_top = 0; + int pad_bot = 0; + int pad_left = 0; + int pad_right = 0; + + int ih_ext = calc_dilute_hw(input_height, ins_h, + ins_h_last, pad_top, pad_bot); + int iw_ext = calc_dilute_hw( + input_width, ins_w, ins_w_last, pad_left, pad_right); + int kh_ext = + calc_dilute_hw(kernel_height, dh - 1, 0, 0, 0); + int kw_ext = + calc_dilute_hw(kernel_width, dw - 1, 0, 0, 0); + + int oh = + calc_output_hw(ih_ext, kh_ext, kernel_stride_height); + int ow = + calc_output_hw(iw_ext, kw_ext, kernel_stride_width); + + conv_test_param_t test_param; + memset(&test_param, 0, sizeof(test_param)); + test_param.input_n = batch; + test_param.input_c = input_depth; + test_param.input_h = input_height; + test_param.input_w = input_width; + test_param.kh = kernel_height; + test_param.kw = kernel_width; + test_param.dh = dh; + test_param.dw = dw; + test_param.pad_top = pad_top; + test_param.pad_bot = pad_bot; + test_param.pad_left = pad_left; + test_param.pad_right = pad_right; + test_param.ins_h = ins_h; + test_param.ins_h_last = ins_h_last; + test_param.ins_w = ins_w; + test_param.ins_w_last = ins_w_last; + test_param.stride_h = kernel_stride_height; + test_param.stride_w = kernel_stride_width; + test_param.output_c = output_depth; + test_param.output_h = oh; + test_param.output_w = ow; + test_param.has_bias = has_bias; + test_param.retry_cnt = 5; + + bool is_valid_param = + check_valid_test_param(cvk_ctx, &test_param); + if (is_valid_param == false) { + skipped_tests++; + continue; + } + + int ret2 = run_compare_conv(rt_handle, cvk_ctx, &test_param); + failed_tests = ret2 ? 
failed_tests + 1 : failed_tests; + ret |= ret2; + executed_tests++; + +#ifdef ENABLE_DEBUG_MSG + printf(" [%d] random test: input shape(%d, %d, %d, %d)", + executed_tests, batch, input_depth, + input_height, input_width); + printf(", kernel shape (%d, %d, %d, %d), result %d\n", + output_depth, input_depth, kernel_height, + kernel_width, ret2); +#endif + + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + } + // Stop at first error + if (ret && stop_at_first_error) { + break; + } + + if (executed_tests >= MIN_EXEC_TESTS) { + break; + } + } + + printf( + "<= Random Test, total %d, skipped %d, executed %d, failed %d, ret %d\n", + total_tests, skipped_tests, executed_tests, failed_tests, ret); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_HANDLE rt_handle; + cvk_context_t *cvk_ctx = NULL; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + + ret |= simple_test(rt_handle, cvk_ctx); + ret |= random_test(rt_handle, cvk_ctx); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + if (!ret) + printf("%s pass\n", __FILENAME__); + else + printf("%s fail\n", __FILENAME__); + + return ret; +} diff --git a/cviruntime/test/1880v2/test_cv1880v2_tensor_copy.c b/cviruntime/test/1880v2/test_cv1880v2_tensor_copy.c new file mode 100644 index 000000000..d26cec25e --- /dev/null +++ b/cviruntime/test/1880v2/test_cv1880v2_tensor_copy.c @@ -0,0 +1,927 @@ +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +typedef enum { + NCHW_N = 0, + NCHW_C = 1, + NCHW_H = 2, + NCHW_W = 3, + NCHW_MAX_DIMS +} NCHW_DIMS; + + +void gmem_init_tensor( + struct cvikernel_context *cvk_ctx, + cvk_tg_t *tg, + cvk_tg_shape_t shape, + cvk_fmt_t fmt) { + + memset(tg, 0, sizeof(*tg)); + tg->fmt = fmt; + tg->shape = shape; + tg->stride = cvk_ctx->ops->tg_default_stride(cvk_ctx, tg->shape, tg->fmt); +} + +static void tl_copy_ref(int8_t *a, int8_t *res, uint64_t size) +{ + for (uint64_t i = 0; i < size; i++) + res[i] = a[i]; +} + +static int test_tl_copy(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx, int eu_align) +{ + int ret = 0; + + int n = 3; + int c = 39; + int h = 7; + int w = 37; + + cvk_tl_shape_t tl_shape; + tl_shape.n = n; + tl_shape.c = c; + tl_shape.h = h; + tl_shape.w = w; + + uint32_t size = n * c * h * w; + int8_t *a_data = (int8_t *)malloc(size); + assert(a_data && "Expect allocated a_data"); + for (uint32_t i = 0; i < size; i++) + a_data[i] = (int8_t)(i % 256); + + int8_t *ref_data = (int8_t *)malloc(size); + assert(ref_data && "Expect allocated ref_data"); + tl_copy_ref(a_data, ref_data, size); + + cvk_tl_t *tl_a = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + cvk_tl_t *tl_res = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, CVK_FMT_I8, eu_align); + + 
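+  // Round trip: stage the source through global memory into local memory
+  // (s2d + g2l), run tiu_copy entirely inside local memory, then read the
+  // result back (l2g + d2s) for a byte-for-byte compare against the CPU
+  // reference.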
tensor_copy_s2d_g2l(rt_handle, cvk_ctx, tl_a, (uint8_t *)a_data); + + cvk_tiu_copy_param_t p10; + p10.dst = tl_res; + p10.src = tl_a; + cvk_ctx->ops->tiu_copy(cvk_ctx, &p10); + uint8_t *res_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, tl_res); + + for (uint64_t i = 0; i < size; i++) { + if ((int8_t)res_data[i] != ref_data[i]) { + printf(" comparing failed at ofmap_data[%" PRIu64 "], got %x, exp %x\n", + i, res_data[i], ref_data[i]); + ret = -1; + } + } + + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_res); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_a); + + free(a_data); + free(ref_data); + free(res_data); + + return ret; +} + +static int test_hw_tp(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + int8_t data[6] = {1, 2, 3, 4, 5, 6}; + int8_t ref_data_hw_tp[6] = {1, 4, 2, 5, 3, 6}; + + cvk_tl_shape_t src_shape = {1, 1, 2, 3}; + cvk_tl_shape_t dst_shape = {1, 1, 3, 2}; + + int eu_align = 0; // contiguous memory layout + uint32_t offset = 0; + cvk_tl_t tl_src; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tl_src, src_shape, CVK_FMT_I8, + eu_align); + offset += cvk_ctx->ops->lmem_tensor_to_size(cvk_ctx, src_shape, CVK_FMT_I8, + eu_align); + + // HW transpose, still use source shape for data transfer + cvk_tl_t tl_dst; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tl_dst, src_shape, CVK_FMT_I8, + eu_align); + tl_dst.start_address = offset; + tl_dst.stride.h = tl_dst.stride.w; // unit of data type size (int8/bf16) + tl_dst.stride.w = dst_shape.w * tl_dst.stride.w; + + tensor_copy_s2d_g2l(rt_handle, cvk_ctx, &tl_src, (uint8_t *)data); + + cvk_tiu_copy_param_t param; + param.src = &tl_src; + param.dst = &tl_dst; + cvk_ctx->ops->tiu_copy(cvk_ctx, ¶m); + + CVI_RT_Submit(cvk_ctx); + + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tl_dst, dst_shape, CVK_FMT_I8, + eu_align); + tl_dst.start_address = offset; + + uint8_t *res_data = tensor_copy_l2g_d2s(rt_handle, cvk_ctx, &tl_dst); + + printf(" test_hw_tp: compare\n"); + for (uint64_t i = 0; i < sizeof(ref_data_hw_tp); i++) { + if (res_data[i] != ref_data_hw_tp[i]) { + printf(" res_data[%" PRIu64 "] %d != %d\n", + i, res_data[i], ref_data_hw_tp[i]); + ret = -1; + } + } + + free(res_data); + + return ret; +} + +static int test_tp_0213(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + const uint32_t input_n = 1; + const uint32_t input_c = 3; + const uint32_t input_h = 2; + const uint32_t input_w = 4; + // const uint32_t output_n = 1; + // const uint32_t output_c = 3; + // const uint32_t output_h = 2; + // const uint32_t output_w = 4; + uint32_t order[4] = {0, 2, 1, 3}; + + uint32_t src_shape[4] = {input_n, input_c, input_h, input_w}; + uint32_t dst_shape[4] = {src_shape[order[0]], src_shape[order[1]], + src_shape[order[2]], src_shape[order[3]]}; + + // Shape (1, 3, 2, 4) -> (1, 2, 3, 4) + int8_t data[] = { + // H0 | H1 + 1, 2, 3 , 4, 5, 6, 7, 8, // C0 + 9, 10, 11, 12, 13, 14, 15, 16, // C1 + 17, 18, 19, 20, 21, 22, 23, 24 // C2 + }; + int8_t ref_dst_data[] = { + // H0 | H1 | H2 + 1, 2, 3, 4, 9, 10, 11, 12, 17, 18, 19, 20, // C0 + 5, 6, 7, 8, 13, 14, 15, 16, 21, 22, 23, 24 // C1 + }; + + int eu_align = 0; // contiguous memory layout + cvk_fmt_t fmt = CVK_FMT_I8; + + // Alloc global memory + cvk_tg_shape_t tg_src_shape = { + src_shape[0], src_shape[1], src_shape[2], src_shape[3]}; + cvk_tg_t *tg_mem_1 = + alloc_tensor_dev_mem(rt_handle, cvk_ctx, tg_src_shape, fmt); + if (!tg_mem_1) + return -1; + + uint64_t ga_ifmap = tg_mem_1->start_address; + uint32_t tg_src_stride[3] = { + tg_mem_1->stride.n, tg_mem_1->stride.c, 
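+      // w stride omitted: it is implicitly one int8 element, and the
+      // offset computations below add the raw w index directly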
tg_mem_1->stride.h}; + + cvk_tg_shape_t tg_dst_shape = { + dst_shape[0], dst_shape[1], dst_shape[2], dst_shape[3]}; + cvk_tg_t *tg_mem_2 = + alloc_tensor_dev_mem(rt_handle, cvk_ctx, tg_dst_shape, fmt); + uint32_t tg_dst_strides[3] = { + tg_mem_2->stride.n, tg_mem_2->stride.c, tg_mem_2->stride.h}; + uint64_t ga_ofmap = tg_mem_2->start_address; + + // test stride + { + int8_t test_dst_data[1 * 3 * 2 * 4]; + uint32_t dst_strides[3] = { + tg_dst_strides[order[0]], tg_dst_strides[order[1]], + tg_dst_strides[order[2]]}; + + for (uint32_t i = 0; i < input_n; i++) { + for (uint32_t j = 0; j < input_c; j++) { + for (uint32_t k = 0; k < input_h; k++) { + for (uint32_t l = 0; l < input_w; l++) { + uint32_t src_offset = i * tg_src_stride[0] + j * tg_src_stride[1] + + k * tg_src_stride[2] + l; + uint32_t dst_offset = i * dst_strides[0] + j * dst_strides[1] + + k * dst_strides[2] + l; + test_dst_data[dst_offset] = data[src_offset]; + } + } + } + } + + printf(" test_tp_0213: compare test\n"); + for (uint32_t i = 0; i < sizeof(ref_dst_data); i++) { + if (test_dst_data[i] != ref_dst_data[i]) + printf(" [%d] test_dst_data %d != %d\n", + i, test_dst_data[i], ref_dst_data[i]); + } + + } + + // Fill data in global memory + tensor_copy_s2d(rt_handle, tg_mem_1, (uint8_t *)data); + + // 1. tensor load + { + cvk_tg_t tg_src; + memset(&tg_src, 0, sizeof(tg_src)); + tg_src.base_reg_index = 0; + tg_src.start_address = ga_ifmap; + tg_src.fmt = fmt; + tg_src.shape.n = src_shape[0]; + tg_src.shape.c = src_shape[1]; + tg_src.shape.h = src_shape[2]; + tg_src.shape.w = src_shape[3]; + tg_src.stride = cvk_ctx->ops->tg_default_stride(cvk_ctx, tg_src.shape, fmt); + + cvk_tl_shape_t tl_dst_shape = { + src_shape[0], src_shape[1], src_shape[2], src_shape[3]}; + cvk_tl_t tl_dst; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tl_dst, tl_dst_shape, fmt, + eu_align); + + cvk_tdma_g2l_tensor_copy_param_t param; + param.src = &tg_src; + param.dst = &tl_dst; + cvk_ctx->ops->tdma_g2l_tensor_copy(cvk_ctx, ¶m); + } + + // 2. 
tensor store w/ (1, 3, 2, 4) transpose + { + cvk_tl_shape_t tl_src_shape = { + src_shape[0], src_shape[1], src_shape[2], src_shape[3]}; + + cvk_tl_t tl_src; + cvk_ctx->ops->lmem_init_tensor(cvk_ctx, &tl_src, tl_src_shape, fmt, + eu_align); + + cvk_tg_t tg_dst; + memset(&tg_dst, 0, sizeof(tg_dst)); + tg_dst.base_reg_index = 0; + tg_dst.start_address = ga_ofmap; + tg_dst.fmt = fmt; + tg_dst.shape.n = src_shape[0]; + tg_dst.shape.c = src_shape[1]; + tg_dst.shape.h = src_shape[2]; + tg_dst.shape.w = src_shape[3]; + tg_dst.stride.n = tg_dst_strides[order[0]]; + tg_dst.stride.c = tg_dst_strides[order[1]]; + tg_dst.stride.h = tg_dst_strides[order[2]]; + + cvk_tdma_l2g_tensor_copy_param_t param; + param.src = &tl_src; + param.dst = &tg_dst; + cvk_ctx->ops->tdma_l2g_tensor_copy(cvk_ctx, ¶m); + } + + CVI_RT_Submit(cvk_ctx); + + int8_t *dst_data = (int8_t *)tensor_copy_d2s(rt_handle, tg_mem_2); + + printf(" test_tp_0213: compare\n"); + for (uint64_t i = 0; i < sizeof(ref_dst_data); i++) { + if (dst_data[i] != ref_dst_data[i]) { + printf(" dst_data[%" PRIu64 "] %d != %d\n", + i, dst_data[i], ref_dst_data[i]); + ret = -1; + } + } + + // Free global memory + free(dst_data); + free_tensor_dev_mem(rt_handle, tg_mem_1); + free_tensor_dev_mem(rt_handle, tg_mem_2); + + return ret; +} + +// +// Permute 0231, (N, C, H, W) -> (N, H, W, C) +// tensor load +// tensor move, hw transpose +// tensor store, cw transpose +// +// 0 1 2 3 +// (N, C, H, W) -> (N, H, W, C) +// (1, 2, 4, 4) -> (1, 4, 4, 2) +// +// Source (1, 2, 4, 4) +// +// Tile 1 || Tile 0 +// H3 H2 || H1 H0 +// || 16 15 14 13 | 12 11 10 9 || 8 7 6 5 | 4 3 2 1 || C0 +// || 32 31 30 29 | 28 27 26 25 || 24 23 22 21 | 20 19 18 17 || C1 +// +// +// Destination (1, 4, 4, 2) +// +// 20 4 | 19 3 | 18 2 | 17 1 C0 Tile 0 +// 24 8 | 23 7 | 22 6 | 21 5 C1 +// ============================================== +// 28 12 | 27 11 | 26 10 | 25 9 C2 Tile 1 +// 32 16 | 31 15 | 30 14 | 29 13 C3 +// +// 1. Tile 0 +// 1.1. Tensor load +// src shape (1, 2, 2, 4), stride (32, 16, 4), offset 0 +// dst shape (1, 2, 2, 4), stride (8, 8, 4) +// +// H1 H0 +// 8 7 6 5 | 4 3 2 1 C0 +// 24 23 22 21 | 20 19 18 17 C1 +// +// 1.2. Tensor move, HW transpose +// src shape (1, 2, 2, 4), stride (8, 8, 4, 1) +// dst shape (1, 2, 2, 4), stride (8, 8, 1, 2) +// +// H3 H2 H1 H0 +// 8 4 | 7 3 | 6 2 | 5 1 C0 +// 24 20 |23 19 | 22 18 | 21 17 C1 +// +// 1.3. Tensor store, CW transpose +// src shape (1, 2, 4, 2), stride (8, 8, 2) +// dst shape (1, 2, 4, 2), stride (16, 8, 2), offset 0 +// +// H3 H2 H1 H0 +// 20 4 | 19 3 | 18 2 | 17 1 C0 +// 24 8 | 23 7 | 22 6 | 21 5 C1 +// +// +// 2. Tile 1 +// 2.1. Tensor load +// src shape (1, 2, 2, 4), stride (32, 16, 4), offset 8 +// dst shape (1, 2, 2, 4), stride (8, 8, 4) +// +// H1 H0 +// 16 15 14 13 | 12 11 10 9 C0 +// 32 31 30 29 | 28 27 26 25 C1 +// +// 2.2. Tensor move, HW transpose +// src shape (1, 2, 2, 4), stride (8, 8, 4, 1) +// dst shape (1, 2, 2, 4), stride (8, 8, 1, 2) +// +// H3 H2 H1 H0 +// 16 12 | 15 11 | 14 10 | 13 9 C0 +// 32 28 | 31 27 | 30 26 | 29 25 C1 +// +// 2.3. 
Tensor store, CW transpose +// src shape (1, 2, 4, 2), stride (8, 8, 2) +// dst shape (1, 2, 4, 2), stride (16, 8, 2), offset 16 +// +// H3 H2 H1 H0 +// 28 12 | 27 11 | 26 10 | 25 9 C0 +// 32 16 | 31 15 | 30 14 | 29 13 C1 +// +// destination in global memory +// shape (1, 4, 4, 2), stride (32, 8, 2) +// gm_permuted_strides[order_n 0] = dst_gm_stride.n 32 +// gm_permuted_strides[order_c 2] = dst_gm_stride.c 8 +// gm_permuted_strides[order_h 3] = dst_gm_stride.h 2 +// +// tile1 1 +// source in global memory, offset [0][0][2][0], 9 +// src_gm_offset = 2 * h_stride = 2 * 4 = 8, used in first load +// +// destination in global memory, offset [0][2][0][0], 9 +// dst_gm_offset = 2 * c_stride = 2 * 8 = 16 +// +// src[i][j][k][l] = dst[i][k][l][j] +// src[0][0][2][0] = dst[0][2][0][0] +// +static int test_tp_0231(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + cvk_fmt_t fmt = CVK_FMT_I8; + uint8_t eu_align = 0; // No need to align eu + uint32_t orders[NCHW_MAX_DIMS] = {0, 2, 3, 1}; // NCHW -> NHWC + + uint32_t src_shapes[NCHW_MAX_DIMS] = {1, 2, 4, 4}; + uint32_t src_strides[NCHW_MAX_DIMS]; + src_strides[NCHW_W] = 1; // int8 + src_strides[NCHW_H] = src_shapes[NCHW_W] * src_strides[NCHW_W]; + src_strides[NCHW_C] = src_shapes[NCHW_H] * src_strides[NCHW_H]; + src_strides[NCHW_N] = src_shapes[NCHW_C] * src_strides[NCHW_C]; + + uint32_t dst_shapes[NCHW_MAX_DIMS] = { + src_shapes[orders[NCHW_N]], src_shapes[orders[NCHW_C]], + src_shapes[orders[NCHW_H]], src_shapes[orders[NCHW_W]]}; + uint32_t dst_strides[NCHW_MAX_DIMS]; + dst_strides[NCHW_W] = 1; // int8 + dst_strides[NCHW_H] = dst_shapes[NCHW_W] * dst_strides[NCHW_W]; + dst_strides[NCHW_C] = dst_shapes[NCHW_H] * dst_strides[NCHW_H]; + dst_strides[NCHW_N] = dst_shapes[NCHW_C] * dst_strides[NCHW_C]; + + // Source shape (1, 2, 4, 4) + const int8_t src_data[] = { + // H0 | H1 | H2 | H3 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, // C0 + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 // C1 + }; + + // Destination shape (1, 4, 4, 2) + const int8_t ref_dst_data[] = { + // H0 | H1 | H2 | H3 + 1, 17, 2, 18, 3, 19, 4, 20, // C0 + 5, 21, 6, 22, 7, 23, 8, 24, // C1 + 9, 25, 10, 26, 11, 27, 12, 28, // C2 + 13, 29, 14, 30, 15, 31, 16, 32 // C3 + }; + + int8_t dst_data_cpu[sizeof(ref_dst_data)]; + + // Derive destination offset from source position + uint32_t dst_index[NCHW_MAX_DIMS]; + dst_index[orders[NCHW_N]] = 0; + dst_index[orders[NCHW_C]] = 1; + dst_index[orders[NCHW_H]] = 2; + dst_index[orders[NCHW_W]] = 3; + + // test element-wise copy + { + // source is contiguous + for (uint32_t i = 0; i < src_shapes[NCHW_N]; ++i) { + for (uint32_t j = 0; j < src_shapes[NCHW_C]; ++j) { + for (uint32_t k = 0; k < src_shapes[NCHW_H]; ++k) { + for (uint32_t l = 0; l < src_shapes[NCHW_W]; ++l) { + uint32_t src_offset = + i * src_strides[NCHW_N] + j * src_strides[NCHW_C] + + k * src_strides[NCHW_H] + l * src_strides[NCHW_W]; + uint32_t dst_offset = i * dst_strides[dst_index[NCHW_N]] + + j * dst_strides[dst_index[NCHW_C]] + + k * dst_strides[dst_index[NCHW_H]] + + l * dst_strides[dst_index[NCHW_W]]; + dst_data_cpu[dst_offset] = src_data[src_offset]; + } + } + } + } + + printf(" test_tp_0231: elt copy, compare test\n"); + for (uint32_t i = 0; i < sizeof(dst_data_cpu); ++i) { + if (dst_data_cpu[i] != ref_dst_data[i]) { + printf(" [%d] dst_data %d != %d\n", + i, dst_data_cpu[i], ref_dst_data[i]); + ret = -1; + } + } + } + + // + // Data initialization in runtime. 
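+  // The device-side path mirrors the element-wise CPU copy verified above:
+  // allocate global-memory source and destination, then walk the tensor in
+  // H tiles, each pass doing load -> HW transpose (tiu_copy through swapped
+  // h/w strides) -> CW-transposed store, as sketched in the header comment.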
+ // + cvk_tg_shape_t tg_src_shape = { + src_shapes[NCHW_N], src_shapes[NCHW_C], src_shapes[NCHW_H], + src_shapes[NCHW_W]}; + cvk_tg_t *tg_src = + alloc_tensor_dev_mem(rt_handle, cvk_ctx, tg_src_shape, fmt); + + // Fill data in global memory + tensor_copy_s2d(rt_handle, tg_src, (uint8_t *)src_data); + + cvk_tg_shape_t tg_dst_shape = { + dst_shapes[NCHW_N], dst_shapes[NCHW_C], dst_shapes[NCHW_H], + dst_shapes[NCHW_W]}; + cvk_tg_t *tg_dst = + alloc_tensor_dev_mem(rt_handle, cvk_ctx, tg_dst_shape, fmt); + + + // + // Main tiled transpose routine + // + uint32_t src_h_step = src_shapes[NCHW_H] / 2; // 2 tiles + uint32_t src_poss[NCHW_MAX_DIMS] = {0, 0, 0, 0}; + for (src_poss[NCHW_H] = 0; src_poss[NCHW_H] < src_shapes[NCHW_H]; + src_poss[NCHW_H] += src_h_step) { + uint32_t src_tiled_shapes[NCHW_MAX_DIMS] = { + src_shapes[NCHW_N], src_shapes[NCHW_C], src_shapes[NCHW_H], + src_shapes[NCHW_W]}; + src_tiled_shapes[NCHW_H] = + ((src_poss[NCHW_H] + src_h_step) > src_shapes[NCHW_H]) ? + (src_shapes[NCHW_H] - src_poss[NCHW_H]) : src_h_step; + + uint32_t src_offset = + src_poss[NCHW_N] * src_strides[NCHW_N] + + src_poss[NCHW_C] * src_strides[NCHW_C] + + src_poss[NCHW_H] * src_strides[NCHW_H] + + src_poss[NCHW_W] * src_strides[NCHW_W]; + uint32_t dst_offset = src_poss[NCHW_N] * dst_strides[dst_index[NCHW_N]] + + src_poss[NCHW_C] * dst_strides[dst_index[NCHW_C]] + + src_poss[NCHW_H] * dst_strides[dst_index[NCHW_H]] + + src_poss[NCHW_W] * dst_strides[dst_index[NCHW_W]]; + + // 1. Tensor load, tiled shape, global stride + cvk_tl_t *tl_load_dst_tiled = NULL; + { + cvk_tg_t tg_src_tiled; + memset(&tg_src_tiled, 0, sizeof(tg_src_tiled)); + tg_src_tiled.base_reg_index = 0; + tg_src_tiled.start_address = tg_src->start_address + src_offset; + tg_src_tiled.fmt = fmt; + tg_src_tiled.shape.n = src_tiled_shapes[NCHW_N]; + tg_src_tiled.shape.c = src_tiled_shapes[NCHW_C]; + tg_src_tiled.shape.h = src_tiled_shapes[NCHW_H]; + tg_src_tiled.shape.w = src_tiled_shapes[NCHW_W]; + tg_src_tiled.stride.n = tg_src->stride.n; + tg_src_tiled.stride.c = tg_src->stride.c; + tg_src_tiled.stride.h = tg_src->stride.h; + + cvk_tl_shape_t tl_dst_tiled_shape = { + src_tiled_shapes[NCHW_N], src_tiled_shapes[NCHW_C], + src_tiled_shapes[NCHW_H], src_tiled_shapes[NCHW_W]}; + tl_load_dst_tiled = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_dst_tiled_shape, fmt, + eu_align); + + cvk_tdma_g2l_tensor_copy_param_t param; + param.src = &tg_src_tiled; + param.dst = tl_load_dst_tiled; + cvk_ctx->ops->tdma_g2l_tensor_copy(cvk_ctx, ¶m); + } + + // 2. Tensor move, HW transpose + cvk_tl_t *tl_move_dst = NULL; + { + cvk_tl_shape_t tl_move_dst_shape = { + src_tiled_shapes[NCHW_N], src_tiled_shapes[NCHW_C], + src_tiled_shapes[NCHW_W], src_tiled_shapes[NCHW_H]}; + tl_move_dst = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_move_dst_shape, fmt, + eu_align); + + // HW transpose, still use source shape for data transfer + cvk_tl_t tl_dst_hw_tp; + cvk_ctx->ops->lmem_init_tensor( + cvk_ctx, &tl_dst_hw_tp, tl_load_dst_tiled->shape, fmt, eu_align); + tl_dst_hw_tp.start_address = tl_move_dst->start_address; + tl_dst_hw_tp.stride.h = tl_move_dst->stride.w; + tl_dst_hw_tp.stride.w = tl_move_dst->stride.h; + + cvk_tiu_copy_param_t param; + param.src = tl_load_dst_tiled; + param.dst = &tl_dst_hw_tp; + cvk_ctx->ops->tiu_copy(cvk_ctx, ¶m); + } + + // 3. 
Tensor store, CW transpose + { + cvk_tg_t tg_dst_tiled; + memset(&tg_dst_tiled, 0, sizeof(tg_dst_tiled)); + tg_dst_tiled.base_reg_index = 0; + tg_dst_tiled.start_address = tg_dst->start_address + dst_offset; + tg_dst_tiled.fmt = fmt; + tg_dst_tiled.shape.n = tl_move_dst->shape.n; + tg_dst_tiled.shape.c = tl_move_dst->shape.w; // CW transpose + tg_dst_tiled.shape.h = tl_move_dst->shape.h; + tg_dst_tiled.shape.w = tl_move_dst->shape.c; // CW transpose + tg_dst_tiled.stride = + cvk_ctx->ops->tg_default_stride(cvk_ctx, tg_dst_tiled.shape, fmt); + + cvk_tdma_l2g_tensor_copy_cw_transposed_param_t param; + param.src = tl_move_dst; + param.dst = &tg_dst_tiled; + cvk_ctx->ops->tdma_l2g_tensor_copy_cw_transposed(cvk_ctx, ¶m); + } + + // Free local memory + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_move_dst); + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_load_dst_tiled); + } + + CVI_RT_Submit(cvk_ctx); + + int8_t *dst_data = (int8_t *)tensor_copy_d2s(rt_handle, tg_dst); + + printf(" test_tp_0231: compare\n"); + for (uint64_t i = 0; i < sizeof(ref_dst_data); i++) { + if (dst_data[i] != ref_dst_data[i]) { + printf(" dst_data[%" PRIu64 "] %d != %d\n", + i, dst_data[i], ref_dst_data[i]); + ret = -1; + } + } + + // Free global memory + free(dst_data); + free_tensor_dev_mem(rt_handle, tg_src); + free_tensor_dev_mem(rt_handle, tg_dst); + + return ret; +} + +// +// Permute 0321, (N, C, H, W) -> (N, H, W, C) +// tensor load +// tensor store, cw transpose +// +// 0 1 2 3 +// (N, C, H, W) -> (N, W, H, C) +// (1, 4, 2, 2) -> (1, 2, 2, 4) +// +// +// Source (1, 4, 2, 2) +// +// Tile 1 Tile 0 +// H1 || H0 +// 3 2 || 1 0 C0 +// 7 6 || 5 4 C1 +// 11 10 || 9 8 C2 +// 15 14 || 13 12 C3 +// +// +// Destination (1, 2, 2, 4) +// +// Tile 1 Tile 0 +// H1 || H0 +// 14 10 6 2 || 12 8 4 0 C0 +// 15 11 7 3 || 13 9 5 1 C1 +// +// 1. Tile 0 +// 1.1. Tensor load +// src shape (1, 4, 1, 2), stride (16, 4, 2), offset 0 +// dst shape (1, 4, 1, 2), stride (2, 2, 2) +// +// H0 +// 1 0 C0 +// 5 4 C1 +// 9 8 C2 +// 13 12 C3 +// +// 1.2. Tensor store, CW transpose +// src shape (1, 4, 1, 2), stride (2, 2, 2) +// dst shape (1, 2, 1, 4), stride (8, 2, 4), offset 0 +// +// H0 +// 12 8 4 0 C0 +// 13 9 5 1 C1 +// +// +// 2. Tile 1 +// 2.1. Tensor load +// src shape (1, 4, 1, 2), stride (16, 4, 2), offset 2 +// dst shape (1, 4, 1, 2), stride (2, 1, 2) +// +// H0 +// 3 2 C0 +// 7 6 C1 +// 11 10 C2 +// 15 14 C3 +// +// 2.2. Tensor store, CW transpose +// src shape (1, 4, 1, 2), stride (1, 2, 1, 2) +// dst shape (1, 2, 1, 4), stride (8, 2, 4), offset 4 +// +// H1 +// 14 10 6 2 C0 +// 15 11 7 3 C1 +// +static int test_tp_0321(CVI_RT_HANDLE rt_handle, cvk_context_t *cvk_ctx) +{ + int ret = 0; + cvk_fmt_t fmt = CVK_FMT_I8; + int64_t data_type_size = (fmt == CVK_FMT_BF16) ? 
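+  // element size in bytes: fmt is CVK_FMT_I8 in this test, so this
+  // resolves to 1; the ternary presumably keeps the routine ready for a
+  // bf16 variant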
2 : 1; + uint8_t eu_align = 0; // No need to align eu + uint32_t orders[NCHW_MAX_DIMS] = {0, 3, 2, 1}; // NCHW -> NWHC + + uint32_t src_shapes[NCHW_MAX_DIMS] = {1, 4, 2, 2}; + uint32_t src_strides[NCHW_MAX_DIMS]; + src_strides[NCHW_W] = data_type_size; + src_strides[NCHW_H] = src_shapes[NCHW_W] * src_strides[NCHW_W]; + src_strides[NCHW_C] = src_shapes[NCHW_H] * src_strides[NCHW_H]; + src_strides[NCHW_N] = src_shapes[NCHW_C] * src_strides[NCHW_C]; + + uint32_t dst_shapes[NCHW_MAX_DIMS] = { + src_shapes[orders[NCHW_N]], src_shapes[orders[NCHW_C]], + src_shapes[orders[NCHW_H]], src_shapes[orders[NCHW_W]]}; + uint32_t dst_strides[NCHW_MAX_DIMS]; + dst_strides[NCHW_W] = 1; // int8 + dst_strides[NCHW_H] = dst_shapes[NCHW_W] * dst_strides[NCHW_W]; + dst_strides[NCHW_C] = dst_shapes[NCHW_H] * dst_strides[NCHW_H]; + dst_strides[NCHW_N] = dst_shapes[NCHW_C] * dst_strides[NCHW_C]; + + // Source shape (1, 4, 2, 2) + const int8_t src_data[] = { + // H0 | H1 + 0, 1, 2, 3, // C0 + 4, 5, 6, 7, // C1 + 8, 9, 10, 11, // C2 + 12, 13, 14, 15 // C3 + }; + + // Destination shape (1, 2, 2, 4) + const int8_t ref_dst_data[] = { + // H0 | H1 + 0, 4, 8, 12, 2, 6, 10, 14, // C0 + 1, 5, 9, 13, 3, 7, 11, 15 // C1 + }; + + int8_t dst_data_cpu[sizeof(ref_dst_data)]; + + // Derive destination offset from source position + uint32_t dst_index[NCHW_MAX_DIMS]; + dst_index[orders[NCHW_N]] = 0; + dst_index[orders[NCHW_C]] = 1; + dst_index[orders[NCHW_H]] = 2; + dst_index[orders[NCHW_W]] = 3; + + // test element-wise copy + { + // source is contiguous + for (uint32_t i = 0; i < src_shapes[NCHW_N]; ++i) { + for (uint32_t j = 0; j < src_shapes[NCHW_C]; ++j) { + for (uint32_t k = 0; k < src_shapes[NCHW_H]; ++k) { + for (uint32_t l = 0; l < src_shapes[NCHW_W]; ++l) { + uint32_t src_offset = + i * src_strides[NCHW_N] + j * src_strides[NCHW_C] + + k * src_strides[NCHW_H] + l * src_strides[NCHW_W]; + uint32_t dst_offset = i * dst_strides[dst_index[NCHW_N]] + + j * dst_strides[dst_index[NCHW_C]] + + k * dst_strides[dst_index[NCHW_H]] + + l * dst_strides[dst_index[NCHW_W]]; + dst_data_cpu[dst_offset] = src_data[src_offset]; + } + } + } + } + + printf(" test_tp_0321: elt copy, compare test\n"); + for (uint32_t i = 0; i < sizeof(dst_data_cpu); ++i) { + if (dst_data_cpu[i] != ref_dst_data[i]) { + printf(" [%d] dst_data %d != %d\n", + i, dst_data_cpu[i], ref_dst_data[i]); + ret = -1; + } + } + } + + // + // Data initialization in runtime. 
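+  // Unlike the 0231 case, no in-lane HW transpose is needed: each H tile
+  // is loaded with global strides and written straight back with a
+  // CW-transposed store, exactly the two steps listed in the header
+  // comment.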
+ // + cvk_tg_shape_t tg_src_shape = { + src_shapes[NCHW_N], src_shapes[NCHW_C], src_shapes[NCHW_H], + src_shapes[NCHW_W]}; + int64_t src_length = + src_shapes[NCHW_N] * src_shapes[NCHW_C] * src_shapes[NCHW_H] * + src_shapes[NCHW_W]; + + CVI_RT_MEM gm_src_dev_mem = + CVI_RT_MemAlloc(rt_handle, src_length * data_type_size); + + cvk_tg_t tg_src; + gmem_init_tensor(cvk_ctx, &tg_src, tg_src_shape, fmt); + tg_src.start_address = CVI_RT_MemGetPAddr(gm_src_dev_mem); + + // Copy from system memory to device memory + CVI_RT_MemCopyS2D(rt_handle, gm_src_dev_mem, (uint8_t *)src_data); + + cvk_tg_shape_t tg_dst_shape = { + dst_shapes[NCHW_N], dst_shapes[NCHW_C], dst_shapes[NCHW_H], + dst_shapes[NCHW_W]}; + int64_t dst_length = + dst_shapes[NCHW_N] * dst_shapes[NCHW_C] * dst_shapes[NCHW_H] * + dst_shapes[NCHW_W]; + + CVI_RT_MEM gm_dst_dev_mem = + CVI_RT_MemAlloc(rt_handle, dst_length * data_type_size); + + cvk_tg_t tg_dst; + gmem_init_tensor(cvk_ctx, &tg_dst, tg_dst_shape, fmt); + tg_dst.start_address = CVI_RT_MemGetPAddr(gm_dst_dev_mem); + + // + // Main tiled transpose routine + // + uint32_t src_h_step = src_shapes[NCHW_H] / 2; // 2 tiles + uint32_t src_poss[NCHW_MAX_DIMS] = {0, 0, 0, 0}; + for (src_poss[NCHW_H] = 0; src_poss[NCHW_H] < src_shapes[NCHW_H]; + src_poss[NCHW_H] += src_h_step) { + uint32_t src_tiled_shapes[NCHW_MAX_DIMS] = { + src_shapes[NCHW_N], src_shapes[NCHW_C], src_shapes[NCHW_H], + src_shapes[NCHW_W]}; + src_tiled_shapes[NCHW_H] = + ((src_poss[NCHW_H] + src_h_step) > src_shapes[NCHW_H]) ? + (src_shapes[NCHW_H] - src_poss[NCHW_H]) : src_h_step; + + uint32_t src_offset = + src_poss[NCHW_N] * src_strides[NCHW_N] + + src_poss[NCHW_C] * src_strides[NCHW_C] + + src_poss[NCHW_H] * src_strides[NCHW_H] + + src_poss[NCHW_W] * src_strides[NCHW_W]; + uint32_t dst_offset = src_poss[NCHW_N] * dst_strides[dst_index[NCHW_N]] + + src_poss[NCHW_C] * dst_strides[dst_index[NCHW_C]] + + src_poss[NCHW_H] * dst_strides[dst_index[NCHW_H]] + + src_poss[NCHW_W] * dst_strides[dst_index[NCHW_W]]; + + // 1. Tensor load, tiled shape, global stride + cvk_tl_t *tl_load_dst_tiled = NULL; + { + cvk_tg_t tg_src_tiled; + memset(&tg_src_tiled, 0, sizeof(tg_src_tiled)); + tg_src_tiled.base_reg_index = 0; + tg_src_tiled.start_address = tg_src.start_address + src_offset; + tg_src_tiled.fmt = fmt; + tg_src_tiled.shape.n = src_tiled_shapes[NCHW_N]; + tg_src_tiled.shape.c = src_tiled_shapes[NCHW_C]; + tg_src_tiled.shape.h = src_tiled_shapes[NCHW_H]; + tg_src_tiled.shape.w = src_tiled_shapes[NCHW_W]; + tg_src_tiled.stride.n = tg_src.stride.n; + tg_src_tiled.stride.c = tg_src.stride.c; + tg_src_tiled.stride.h = tg_src.stride.h; + + cvk_tl_shape_t tl_dst_tiled_shape = { + src_tiled_shapes[NCHW_N], src_tiled_shapes[NCHW_C], + src_tiled_shapes[NCHW_H], src_tiled_shapes[NCHW_W]}; + tl_load_dst_tiled = + cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_dst_tiled_shape, fmt, + eu_align); + + cvk_tdma_g2l_tensor_copy_param_t param; + param.src = &tg_src_tiled; + param.dst = tl_load_dst_tiled; + cvk_ctx->ops->tdma_g2l_tensor_copy(cvk_ctx, ¶m); + } + + // 2. 
Tensor store, CW transpose + { + cvk_tg_t tg_dst_tiled; + memset(&tg_dst_tiled, 0, sizeof(tg_dst_tiled)); + tg_dst_tiled.base_reg_index = 0; + tg_dst_tiled.start_address = tg_dst.start_address + dst_offset; + tg_dst_tiled.fmt = fmt; + tg_dst_tiled.shape.n = src_tiled_shapes[NCHW_N]; + tg_dst_tiled.shape.c = src_tiled_shapes[NCHW_W]; // CW transpose + tg_dst_tiled.shape.h = src_tiled_shapes[NCHW_H]; + tg_dst_tiled.shape.w = src_tiled_shapes[NCHW_C]; // CW transpose + tg_dst_tiled.stride.n = tg_dst.stride.n; + tg_dst_tiled.stride.c = tg_dst.stride.c; + tg_dst_tiled.stride.h = tg_dst.stride.h; + + cvk_tdma_l2g_tensor_copy_cw_transposed_param_t param; + param.src = tl_load_dst_tiled; + param.dst = &tg_dst_tiled; + cvk_ctx->ops->tdma_l2g_tensor_copy_cw_transposed(cvk_ctx, ¶m); + } + + // Free local memory + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_load_dst_tiled); + } + + CVI_RT_Submit(cvk_ctx); + + int8_t dst_data[src_length]; + + // copy from device memory to system memory + CVI_RT_MemCopyD2S(rt_handle, (uint8_t *) dst_data, gm_dst_dev_mem); + + printf(" test_tp_0321: compare\n"); + for (uint64_t i = 0; i < sizeof(ref_dst_data); i++) { + if (dst_data[i] != ref_dst_data[i]) { + printf(" dst_data[%" PRIu64 "] %d != %d\n", + i, dst_data[i], ref_dst_data[i]); + ret = -1; + } + } + + CVI_RT_MemFree(rt_handle, gm_dst_dev_mem); + CVI_RT_MemFree(rt_handle, gm_src_dev_mem); + + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + + if (!argc) + return -1; + if (!argv) + return -1; + + CVI_RT_HANDLE rt_handle; + cvk_context_t *cvk_ctx = NULL; + + CVI_RT_Init(&rt_handle); + cvk_ctx = CVI_RT_RegisterKernel(rt_handle, CMDBUF_SIZE); + + ret = test_tl_copy(rt_handle, cvk_ctx, 0); + ret |= test_tl_copy(rt_handle, cvk_ctx, 1); + ret |= test_hw_tp(rt_handle, cvk_ctx); + ret |= test_tp_0213(rt_handle, cvk_ctx); + ret |= test_tp_0231(rt_handle, cvk_ctx); + ret |= test_tp_0321(rt_handle, cvk_ctx); + + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_DeInit(rt_handle); + + if (!ret) + printf("%s pass\n", __FILENAME__); + else + printf("%s fail\n", __FILENAME__); + + return ret; +} diff --git a/cviruntime/test/CMakeLists.txt b/cviruntime/test/CMakeLists.txt new file mode 100644 index 000000000..cde21c79c --- /dev/null +++ b/cviruntime/test/CMakeLists.txt @@ -0,0 +1,137 @@ +cmake_minimum_required(VERSION 2.8.0) +enable_testing() + +include(CTest) + +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/test_utils) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../include) +include_directories(${CMAKE_SYSROOT}/include) + +set(CVI_LIBS ${CVI_LIBS} cviruntime) + +set(_TEST_UTILS + test_utils/test_tf_quant_util.c + test_utils/test_native_ref.c + test_utils/test_cvikernel_util.c +) +add_library(cviruntime_test STATIC ${_TEST_UTILS}) + +if (CHIP STREQUAL "cv183x" OR RUNTIME STREQUAL "CMODEL") + # SOC BM1880v2 or CMODEL all in one + file(GLOB_RECURSE TEST_1880v2_CASES 1880v2/*.cpp) + foreach(TEST_SRC ${TEST_1880v2_CASES}) + get_filename_component(TEST_NAME ${TEST_SRC} NAME_WE) + + add_executable(${TEST_NAME} ${TEST_SRC}) + target_link_libraries(${TEST_NAME} ${CVI_LIBS} ${EXTRA_LIBS} cviruntime_test) + set_target_properties(${TEST_NAME} PROPERTIES COMPILE_FLAGS "-Werror -Wall -Wextra") + install(TARGETS ${TEST_NAME} DESTINATION bin) + + add_test(${TEST_NAME} ${TEST_NAME} ctest_test) + endforeach() + + # C version test + file(GLOB TEST_1880v2_NEW_CASES 1880v2/test_*.c) + + foreach(TEST_SRC ${TEST_1880v2_NEW_CASES}) + get_filename_component(TEST_NAME ${TEST_SRC} NAME_WE) + + add_executable(${TEST_NAME} ${TEST_SRC}) + 
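+    # One test binary per discovered C source: link the runtime, the shared
+    # test-util library, and libm, then install it and register it with
+    # CTest.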
+    target_link_libraries(${TEST_NAME} ${CVI_LIBS} ${EXTRA_LIBS}
+                          cviruntime_test m)
+    set_target_properties(${TEST_NAME} PROPERTIES COMPILE_FLAGS "-Werror -Wall -Wextra")
+    install(TARGETS ${TEST_NAME} DESTINATION bin)
+
+    add_test(${TEST_NAME} ${TEST_NAME} ctest_test)
+  endforeach()
+endif()
+
+if(CHIP STREQUAL "cv182x" OR RUNTIME STREQUAL "CMODEL")
+  # SOC BM1822 or CMODEL all in one
+  file(GLOB_RECURSE TEST_1822_CASES 1822/*.cpp)
+  foreach(TEST_SRC ${TEST_1822_CASES})
+    get_filename_component(TEST_NAME ${TEST_SRC} NAME_WE)
+
+    add_executable(${TEST_NAME} ${TEST_SRC})
+    target_link_libraries(${TEST_NAME} ${CVI_LIBS} ${EXTRA_LIBS} cviruntime_test)
+    set_target_properties(${TEST_NAME} PROPERTIES COMPILE_FLAGS "-Werror -Wall -Wextra")
+    install(TARGETS ${TEST_NAME} DESTINATION bin)
+
+    add_test(${TEST_NAME} ${TEST_NAME} ctest_test)
+  endforeach()
+
+  # C version test
+  file(GLOB TEST_1822_NEW_CASES 1822/test_*.c)
+
+  foreach(TEST_SRC ${TEST_1822_NEW_CASES})
+    get_filename_component(TEST_NAME ${TEST_SRC} NAME_WE)
+
+    add_executable(${TEST_NAME} ${TEST_SRC})
+    target_link_libraries(${TEST_NAME} ${CVI_LIBS} ${EXTRA_LIBS}
+                          cviruntime_test m)
+    set_target_properties(${TEST_NAME} PROPERTIES COMPILE_FLAGS "-Werror -Wall -Wextra")
+    install(TARGETS ${TEST_NAME} DESTINATION bin)
+
+    add_test(${TEST_NAME} ${TEST_NAME} ctest_test)
+  endforeach()
+endif()
+
+if(CHIP STREQUAL "cv181x" OR RUNTIME STREQUAL "CMODEL")
+  # SOC BM1810 or CMODEL all in one
+  file(GLOB_RECURSE TEST_CV181X_CASES 181x/*.cpp)
+  foreach(TEST_SRC ${TEST_CV181X_CASES})
+    get_filename_component(TEST_NAME ${TEST_SRC} NAME_WE)
+
+    add_executable(${TEST_NAME} ${TEST_SRC})
+    target_link_libraries(${TEST_NAME} ${CVI_LIBS} ${EXTRA_LIBS} cviruntime_test)
+    set_target_properties(${TEST_NAME} PROPERTIES COMPILE_FLAGS "-Werror -Wall -Wextra")
+    install(TARGETS ${TEST_NAME} DESTINATION bin)
+
+    add_test(${TEST_NAME} ${TEST_NAME} ctest_test)
+  endforeach()
+
+  # C version test
+  file(GLOB TEST_CV181X_NEW_CASES 181x/test_*.c)
+
+  foreach(TEST_SRC ${TEST_CV181X_NEW_CASES})
+    get_filename_component(TEST_NAME ${TEST_SRC} NAME_WE)
+
+    add_executable(${TEST_NAME} ${TEST_SRC})
+    target_link_libraries(${TEST_NAME} ${CVI_LIBS} ${EXTRA_LIBS}
+                          cviruntime_test m)
+    set_target_properties(${TEST_NAME} PROPERTIES COMPILE_FLAGS "-Werror -Wall -Wextra")
+    install(TARGETS ${TEST_NAME} DESTINATION bin)
+
+    add_test(${TEST_NAME} ${TEST_NAME} ctest_test)
+  endforeach()
+endif()
+
+if(CHIP STREQUAL "cv180x" OR RUNTIME STREQUAL "CMODEL")
+  # SOC cv180x or CMODEL all in one
+  file(GLOB_RECURSE TEST_CV180X_CASES 180x/*.cpp)
+  foreach(TEST_SRC ${TEST_CV180X_CASES})
+    get_filename_component(TEST_NAME ${TEST_SRC} NAME_WE)
+
+    add_executable(${TEST_NAME} ${TEST_SRC})
+    target_link_libraries(${TEST_NAME} ${CVI_LIBS} ${EXTRA_LIBS} cviruntime_test)
+    set_target_properties(${TEST_NAME} PROPERTIES COMPILE_FLAGS "-Werror -Wall -Wextra")
+    install(TARGETS ${TEST_NAME} DESTINATION bin)
+
+    add_test(${TEST_NAME} ${TEST_NAME} ctest_test)
+  endforeach()
+
+  # C version test
+  file(GLOB TEST_CV180X_NEW_CASES 180x/test_*.c)
+
+  foreach(TEST_SRC ${TEST_CV180X_NEW_CASES})
+    get_filename_component(TEST_NAME ${TEST_SRC} NAME_WE)
+
+    add_executable(${TEST_NAME} ${TEST_SRC})
+    target_link_libraries(${TEST_NAME} ${CVI_LIBS} ${EXTRA_LIBS}
+                          cviruntime_test m)
+    set_target_properties(${TEST_NAME} PROPERTIES COMPILE_FLAGS "-Werror -Wall -Wextra")
+    install(TARGETS ${TEST_NAME} DESTINATION bin)
+
+    add_test(${TEST_NAME} ${TEST_NAME} ctest_test)
+  endforeach()
+endif()
diff --git a/cviruntime/test/test_utils/test_cvikernel_util.c b/cviruntime/test/test_utils/test_cvikernel_util.c
new file mode 100644
index 000000000..8496e0318
--- /dev/null
+++ b/cviruntime/test/test_utils/test_cvikernel_util.c
@@ -0,0 +1,811 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stddef.h>
+#include <assert.h>
+#include "test_cvikernel_util.h"
+
+// Fail:
+//   test_1810_tdma_bf16_matrix_vlc_decompress_compress
+//   test_1810_tdma_bf16_tensor_vlc_decompress_compress
+//   test_1810_tdma_g2l_bf16_matrix_vlc_copy_decompressed
+//   test_1810_tdma_g2l_bf16_tensor_vlc_copy_decompressed
+//   test_1810_tdma_l2g_bf16_matrix_vlc_copy_compressed
+//   test_1810_tdma_l2g_bf16_tensor_vlc_copy_compressed
+//#define ENABEL_GAUSSIANRANDOM_VLC_TEST
+
+#define RAND_SEED_MOD 10
+
+#define container_of(ptr, type, member) ({                 \
+    const typeof( ((type *)0)->member ) *__mptr = (ptr);   \
+    (type *)( (char *)__mptr - offsetof(type,member) );})
+
+typedef struct {
+  cvk_tg_t tg;
+  CVI_RT_MEM mem;
+} test_tg_wrapper_t;
+
+typedef struct {
+  cvk_cmpr_tg_t cmpr_tg;
+  CVI_RT_MEM mem;
+} test_cmpr_tg_wrapper_t;
+
+typedef struct {
+  cvk_mg_t mg;
+  CVI_RT_MEM mem;
+} test_mg_wrapper_t;
+
+typedef struct {
+  cvk_cmpr_mg_t cmpr_mg;
+  CVI_RT_MEM mem;
+} test_cmpr_mg_wrapper_t;
+
+#define CHECK(_cond) assert((_cond))
+#define CHECK_LE(a, b) CHECK((a) <= (b))
+#define CHECK_GT(a, b) CHECK((a) > (b))
+
+cvk_tg_t *alloc_tensor_dev_mem(
+    CVI_RT_HANDLE rt_handle,
+    cvk_context_t *cvk_ctx,
+    cvk_tg_shape_t shape,
+    cvk_fmt_t fmt)
+{
+  test_tg_wrapper_t *w = malloc(sizeof(*w));
+  if (!w)
+    return NULL;
+
+  w->mem = CVI_RT_MemAlloc(rt_handle, tg_shape_size(&shape, fmt));
+  cvk_ctx->ops->gmem_init_tensor(cvk_ctx, &w->tg, shape, fmt);
+  w->tg.start_address = CVI_RT_MemGetPAddr(w->mem);
+
+  return &w->tg;
+}
+
+void free_tensor_dev_mem(
+    CVI_RT_HANDLE rt_handle,
+    const cvk_tg_t *tg)
+{
+  test_tg_wrapper_t *w = container_of(tg, test_tg_wrapper_t, tg);
+  CVI_RT_MemFree(rt_handle, w->mem);
+
+  free(w);
+}
+
+cvk_cmpr_tg_t *alloc_cmpr_tensor_dev_mem(
+    CVI_RT_HANDLE rt_handle,
+    cvk_context_t *cvk_ctx,
+    cvk_tg_shape_t shape,
+    cvk_fmt_t fmt,
+    CommandInfo *cmd_info)
+{
+  if (fmt != CVK_FMT_I8 && fmt != CVK_FMT_U8 && fmt != CVK_FMT_BF16)
+    return NULL;
+
+  test_cmpr_tg_wrapper_t *w = malloc(sizeof(*w));
+  if (!w)
+    return NULL;
+
+  size_t bs_buf_size = cmpr_tg_shape_size(&shape, fmt);
+  w->mem = CVI_RT_MemAlloc(rt_handle, bs_buf_size);
+
+  memset(&w->cmpr_tg, 0, sizeof(w->cmpr_tg));
+  cvk_ctx->ops->gmem_init_tensor(cvk_ctx, &w->cmpr_tg.t, shape, fmt);
+  w->cmpr_tg.t.start_address = CVI_RT_MemGetPAddr(w->mem);
+  w->cmpr_tg.reserved_size = bs_buf_size;
+
+  if (cmd_info) {
+    w->cmpr_tg.bias0 = cmd_info->bias0;
+    w->cmpr_tg.bias1 = cmd_info->bias1;
+    w->cmpr_tg.zero_guard_en = cmd_info->zero_guard_en;
+  }
+  else {
+    if (fmt == CVK_FMT_BF16) {
+      w->cmpr_tg.bias0 = 127;
+    }
+    else if (fmt == CVK_FMT_I8 || fmt == CVK_FMT_U8) {
+      w->cmpr_tg.bias0 = 0;
+    }
+
+    w->cmpr_tg.bias1 = 0;
+    w->cmpr_tg.zero_guard_en = 0;
+  }
+
+  return &w->cmpr_tg;
+}
+
+void free_cmpr_tensor_dev_mem(
+    CVI_RT_HANDLE rt_handle,
+    const cvk_cmpr_tg_t *cmpr_tg)
+{
+  test_cmpr_tg_wrapper_t *w = container_of(cmpr_tg, test_cmpr_tg_wrapper_t, cmpr_tg);
+  CVI_RT_MemFree(rt_handle, w->mem);
+
+  free(w);
+}
+
+CVI_RC tensor_copy_s2d(
+    CVI_RT_HANDLE rt_handle,
+    const cvk_tg_t *tg,
+    uint8_t *data)
+{
+  test_tg_wrapper_t *w = container_of(tg, test_tg_wrapper_t, tg);
+  return CVI_RT_MemCopyS2D(rt_handle, w->mem, data);
+}
+
+CVI_RC cmpr_tensor_copy_s2d(
+    CVI_RT_HANDLE rt_handle,
+    const cvk_cmpr_tg_t *cmpr_tg,
+    uint8_t *data)
+{
+  test_cmpr_tg_wrapper_t *w = container_of(cmpr_tg, test_cmpr_tg_wrapper_t, cmpr_tg);
+  return CVI_RT_MemCopyS2D(rt_handle, w->mem, data);
+}
+
+uint8_t *tensor_copy_d2s(
+    CVI_RT_HANDLE rt_handle,
+    const cvk_tg_t *tg)
+{
+  uint32_t size = tg_shape_size(&tg->shape, tg->fmt);
+  uint8_t *data = (uint8_t *)malloc(size);
+  if (!data)
+    return NULL;
+
+  test_tg_wrapper_t *w = container_of(tg, test_tg_wrapper_t, tg);
+  CVI_RC ret = CVI_RT_MemCopyD2S(rt_handle, data, w->mem);
+  if (ret != CVI_SUCCESS) {
+    free(data);
+    data = NULL;
+  }
+
+  return data;
+}
+
+uint8_t *cmpr_tensor_copy_d2s(
+    CVI_RT_HANDLE rt_handle,
+    const cvk_cmpr_tg_t *cmpr_tg)
+{
+  size_t bs_buf_size = cmpr_tg_shape_size(&cmpr_tg->t.shape, cmpr_tg->t.fmt);
+  uint8_t *data = (uint8_t *)malloc(bs_buf_size);
+  if (!data)
+    return NULL;
+
+  test_cmpr_tg_wrapper_t *w = container_of(cmpr_tg, test_cmpr_tg_wrapper_t, cmpr_tg);
+  CVI_RC ret = CVI_RT_MemCopyD2S(rt_handle, data, w->mem);
+  if (ret != CVI_SUCCESS) {
+    free(data);
+    data = NULL;
+  }
+
+  return data;
+}
+
+uint8_t *tensor_copy_l2g_d2s_stride(
+    CVI_RT_HANDLE rt_handle,
+    cvk_context_t *cvk_ctx,
+    const cvk_tl_t *tl,
+    cvk_tg_stride_t tg_stride)
+{
+  // The plain (non-transposed) L2G copy keeps the NCHW layout, so the
+  // global shape must match the local shape dimension by dimension.
+  cvk_tg_shape_t s;
+  s.n = tl->shape.n;
+  s.c = tl->shape.c;
+  s.h = tl->shape.h;
+  s.w = tl->shape.w;
+  cvk_tg_t *tg = alloc_tensor_dev_mem(rt_handle, cvk_ctx, s, tl->fmt);
+  if (!tg)
+    return NULL;
+
+  tg->stride = tg_stride;
+
+  cvk_tdma_l2g_tensor_copy_param_t p;
+  memset(&p, 0, sizeof(p));
+  p.src = tl;
+  p.dst = tg;
+  cvk_ctx->ops->tdma_l2g_tensor_copy(cvk_ctx, &p);
+  CVI_RT_Submit(cvk_ctx);
+
+  uint8_t *data = tensor_copy_d2s(rt_handle, tg);
+  free_tensor_dev_mem(rt_handle, tg);
+
+  return data;
+}
+
+uint8_t *tensor_copy_l2g_d2s(
+    CVI_RT_HANDLE rt_handle,
+    cvk_context_t *cvk_ctx,
+    const cvk_tl_t *tl)
+{
+  cvk_tg_shape_t s;
+  s.n = tl->shape.n;
+  s.c = tl->shape.c;
+  s.h = tl->shape.h;
+  s.w = tl->shape.w;
+  cvk_tg_stride_t tg_stride =
+      cvk_ctx->ops->tg_default_stride(cvk_ctx, s, tl->fmt);
+
+  uint8_t *data = tensor_copy_l2g_d2s_stride(rt_handle, cvk_ctx, tl, tg_stride);
+
+  return data;
+}
+
+void tensor_copy_s2d_g2l_stride(
+    CVI_RT_HANDLE rt_handle,
+    cvk_context_t *cvk_ctx,
+    const cvk_tl_t *tl,
+    cvk_tg_stride_t tg_stride,
+    uint8_t *data)
+{
+  cvk_tg_shape_t tg_shape;
+  tg_shape.n = tl->shape.n;
+  tg_shape.c = tl->shape.c;
+  tg_shape.h = tl->shape.h;
+  tg_shape.w = tl->shape.w;
+
+  cvk_tg_t *tg = alloc_tensor_dev_mem(rt_handle, cvk_ctx, tg_shape, tl->fmt);
+  if (!tg)
+    return;
+
+  tg->stride = tg_stride;
+  tensor_copy_s2d(rt_handle, tg, data);
+
+  cvk_tdma_g2l_tensor_copy_param_t p;
+  memset(&p, 0, sizeof(p));
+  p.src = tg;
+  p.dst = tl;
+  cvk_ctx->ops->tdma_g2l_tensor_copy(cvk_ctx, &p);
+  CVI_RT_Submit(cvk_ctx);
+
+  free_tensor_dev_mem(rt_handle, tg);
+}
+
+void tensor_copy_s2d_g2l(
+    CVI_RT_HANDLE rt_handle,
+    cvk_context_t *cvk_ctx,
+    const cvk_tl_t *tl,
+    uint8_t *data)
+{
+  cvk_tg_shape_t tg_shape;
+  tg_shape.n = tl->shape.n;
+  tg_shape.c = tl->shape.c;
+  tg_shape.h = tl->shape.h;
+  tg_shape.w = tl->shape.w;
+
+  cvk_tg_stride_t tg_stride =
+      cvk_ctx->ops->tg_default_stride(cvk_ctx, tg_shape, tl->fmt);
+
+  tensor_copy_s2d_g2l_stride(rt_handle, cvk_ctx, tl, tg_stride, data);
+}
+
+void tensor_copy_s2d_g2l_tp_stride(
+    CVI_RT_HANDLE rt_handle,
+    cvk_context_t *cvk_ctx,
+    const cvk_tl_t *tl,
+    cvk_tg_stride_t tg_stride,
+    uint8_t *data)
+{
+  cvk_tg_shape_t tg_shape =
+      tg_shape_t4(tl->shape.n, tl->shape.c, tl->shape.h, tl->shape.w);
+
+  cvk_tg_t *tg = alloc_tensor_dev_mem(rt_handle, cvk_ctx,
tg_shape, tl->fmt); + if (!tg) + return; + + tg->stride = tg_stride; + tensor_copy_s2d(rt_handle, tg, data); + + cvk_tdma_g2l_tensor_copy_nc_transposed_param_t p; + memset(&p, 0, sizeof(p)); + p.src = tg; + p.dst = tl; + cvk_ctx->ops->tdma_g2l_tensor_copy_nc_transposed(cvk_ctx, &p); + CVI_RT_Submit(cvk_ctx); + + free_tensor_dev_mem(rt_handle, tg); +} + +void tensor_copy_s2d_g2l_tp( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + uint8_t *data) +{ + cvk_tg_shape_t tg_shape = + tg_shape_t4(tl->shape.n, tl->shape.c, tl->shape.h, tl->shape.w); + + cvk_tg_stride_t tg_stride = + cvk_ctx->ops->tg_default_stride(cvk_ctx, tg_shape, tl->fmt); + + tensor_copy_s2d_g2l_tp_stride(rt_handle, cvk_ctx, tl, tg_stride, data); +} + +cvk_mg_t *alloc_matrix_dev_mem( + CVI_RT_HANDLE rt_handle, + cvk_mg_shape_t shape, + cvk_fmt_t fmt) +{ + uint32_t w_stride = fmt_size(fmt); + test_mg_wrapper_t *w = malloc(sizeof(*w)); + if (!w) + return NULL; + + memset(&w->mg, 0, sizeof(w->mg)); + + w->mem = CVI_RT_MemAlloc(rt_handle, mg_shape_size(&shape, fmt)); + w->mg.base_reg_index = 0; + w->mg.start_address = CVI_RT_MemGetPAddr(w->mem); + w->mg.fmt = fmt; + w->mg.shape = shape; + w->mg.stride.row = shape.col * w_stride; + + return &w->mg; +} + +void free_matrix_dev_mem( + CVI_RT_HANDLE rt_handle, + const cvk_mg_t *mg) +{ + test_mg_wrapper_t *w = container_of(mg, test_mg_wrapper_t, mg); + CVI_RT_MemFree(rt_handle, w->mem); + + free(w); +} + +cvk_cmpr_mg_t *alloc_cmpr_matrix_dev_mem( + CVI_RT_HANDLE rt_handle, + cvk_mg_shape_t shape, + cvk_fmt_t fmt, + CommandInfo *cmd_info) +{ + if (fmt != CVK_FMT_I8 && fmt != CVK_FMT_U8 && fmt != CVK_FMT_BF16) + return NULL; + + uint32_t w_stride = fmt_size(fmt); + test_cmpr_mg_wrapper_t *w = malloc(sizeof(*w)); + if (!w) + return NULL; + + size_t bs_buf_size = cmpr_mg_shape_size(&shape, fmt); + w->mem = CVI_RT_MemAlloc(rt_handle, bs_buf_size); + + memset(&w->cmpr_mg, 0, sizeof(w->cmpr_mg)); + w->cmpr_mg.m.base_reg_index = 0; + w->cmpr_mg.m.start_address = CVI_RT_MemGetPAddr(w->mem); + w->cmpr_mg.m.fmt = fmt; + w->cmpr_mg.m.shape = shape; + w->cmpr_mg.m.stride.row = shape.col * w_stride; + + if (cmd_info) { + w->cmpr_mg.bias0 = cmd_info->bias0; + w->cmpr_mg.bias1 = cmd_info->bias1; + w->cmpr_mg.zero_guard_en = cmd_info->zero_guard_en; + } + else { + w->cmpr_mg.bias0 = 0; + + if (fmt == CVK_FMT_BF16) { + w->cmpr_mg.bias0 = 127; + } + else if (fmt == CVK_FMT_I8 || fmt == CVK_FMT_U8) { + w->cmpr_mg.bias0 = 0; + } + + w->cmpr_mg.bias1 = 0; + w->cmpr_mg.zero_guard_en = 0; + } + + return &w->cmpr_mg; +} + +void free_cmpr_matrix_dev_mem( + CVI_RT_HANDLE rt_handle, + const cvk_cmpr_mg_t *cmpr_mg) +{ + test_cmpr_mg_wrapper_t *w = container_of(cmpr_mg, test_cmpr_mg_wrapper_t, cmpr_mg); + CVI_RT_MemFree(rt_handle, w->mem); + + free(w); +} + +CVI_RC matrix_copy_s2d( + CVI_RT_HANDLE rt_handle, + const cvk_mg_t *mg, + uint8_t *data) +{ + test_mg_wrapper_t *w = container_of(mg, test_mg_wrapper_t, mg); + return CVI_RT_MemCopyS2D(rt_handle, w->mem, data); +} + +void matrix_copy_s2d_g2l_stride( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_ml_t *ml, + cvk_mg_stride_t mg_stride, + uint8_t *data) +{ + cvk_mg_shape_t mg_shape = { + .row = ml->shape.n, + .col = ml->shape.col + }; + + cvk_mg_t *mg = alloc_matrix_dev_mem(rt_handle, mg_shape, ml->fmt); + mg->stride = mg_stride; + matrix_copy_s2d(rt_handle, mg, data); + + cvk_tdma_g2l_matrix_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = mg; + p.dst = ml; + cvk_ctx->ops->tdma_g2l_matrix_copy(cvk_ctx, &p); + 
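+  // Note: the matrix copy above is only recorded into the command buffer;
+  // CVI_RT_Submit below flushes the queued TDMA commands for execution
+  // before the temporary device matrix is released.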
CVI_RT_Submit(cvk_ctx); + + free_matrix_dev_mem(rt_handle, mg); +} + +void matrix_copy_s2d_g2l( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_ml_t *ml, + uint8_t *data) +{ + cvk_mg_shape_t mg_shape = { + .row = ml->shape.n, + .col = ml->shape.col + }; + + cvk_mg_stride_t mg_stride = { .row = mg_shape.col * fmt_size(ml->fmt) }; + matrix_copy_s2d_g2l_stride(rt_handle, cvk_ctx, ml, mg_stride, data); +} + +CVI_RC cmpr_matrix_copy_s2d( + CVI_RT_HANDLE rt_handle, + const cvk_cmpr_mg_t *cmpr_mg, + uint8_t *data) +{ + test_cmpr_mg_wrapper_t *w = container_of(cmpr_mg, test_cmpr_mg_wrapper_t, cmpr_mg); + return CVI_RT_MemCopyS2D(rt_handle, w->mem, data); +} + +uint8_t *matrix_copy_d2s( + CVI_RT_HANDLE rt_handle, + const cvk_mg_t *mg) +{ + uint32_t size = mg_shape_size(&mg->shape, mg->fmt); + uint8_t *data = (uint8_t *)malloc(size); + if (!data) + return NULL; + + test_mg_wrapper_t *w = container_of(mg, test_mg_wrapper_t, mg); + CVI_RC ret = CVI_RT_MemCopyD2S(rt_handle, data, w->mem); + if (ret != CVI_SUCCESS) { + free(data); + data = NULL; + } + + return data; +} + +uint8_t *matrix_copy_l2g_d2s_stride( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_ml_t *ml, + cvk_mg_stride_t mg_stride) +{ + cvk_mg_shape_t s; + s.row = ml->shape.n; + s.col = ml->shape.col; + + cvk_mg_t *mg = alloc_matrix_dev_mem(rt_handle, s, ml->fmt); + mg->stride = mg_stride; + + cvk_tdma_l2g_matrix_copy_param_t p; + memset(&p, 0, sizeof(p)); + p.src = ml; + p.dst = mg; + cvk_ctx->ops->tdma_l2g_matrix_copy(cvk_ctx, &p); + CVI_RT_Submit(cvk_ctx); + + uint8_t *data = matrix_copy_d2s(rt_handle, mg); + free_matrix_dev_mem(rt_handle, mg); + + return data; +} + +uint8_t *matrix_copy_l2g_d2s( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_ml_t *ml) +{ + cvk_mg_stride_t mg_stride = { ml->shape.col * fmt_size(ml->fmt)}; + return matrix_copy_l2g_d2s_stride(rt_handle, cvk_ctx, ml, mg_stride); +} + +uint8_t *cmpr_matrix_copy_d2s( + CVI_RT_HANDLE rt_handle, + const cvk_cmpr_mg_t *cmpr_mg) +{ + size_t bs_buf_size = cmpr_mg_shape_size(&cmpr_mg->m.shape, cmpr_mg->m.fmt); + uint8_t *data = (uint8_t *)malloc(bs_buf_size); + if (!data) + return NULL; + + test_cmpr_mg_wrapper_t *w = container_of(cmpr_mg, test_cmpr_mg_wrapper_t, cmpr_mg); + CVI_RC ret = CVI_RT_MemCopyD2S(rt_handle, data, w->mem); + if (ret != CVI_SUCCESS) { + free(data); + data = NULL; + } + + return data; +} + +void convert_fp32_to_bf16_data( + cvk_context_t *cvk_ctx, uint16_t *bf16_data, float *fp32_data, + int length) +{ + // printf(" convert_fp32_to_bf16_data, len %d\n", length); + for (int i = 0; i < length; i++) { + bf16_data[i] = cvk_ctx->misc_ops->float_to_bfloat16(cvk_ctx, fp32_data[i]); + // printf(" [%d] %f -> 0x%x\n", i, fp32_data[i], bf16_data[i]); + } +} + +void get_strides_from_shapes5d(int strides[5], const int shapes[5], int ws) +{ + strides[5 - 1] = ws; + for (int i = 5 - 2; i >= 0; i--) + strides[i] = shapes[i + 1] * strides[i + 1]; +} + +int get_tensor5d_offset(int poss[5], const int strides[5]) +{ + int offset = 0; + for (int i = 0; i < 5; i++) + offset += poss[i] * strides[i]; + + return offset; +} + +void arith_right_shift( + int32_t *buf, uint64_t size, int shift_bits, int round_up) +{ + if (shift_bits == 0) + return; + + for (uint64_t i = 0; i < size; i++) { + buf[i] >>= shift_bits - 1; + if (round_up) + buf[i] += 1; + buf[i] >>= 1; + } +} + +void logic_right_shift( + int32_t *buf, uint64_t size, int shift_bits, int round_up) +{ + if (shift_bits == 0) + return; + + for (uint64_t i = 0; i < size; i++) { + 
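+    // Rounded logical shift: shift by (shift_bits - 1) first, add the
+    // optional rounding bit, then shift out the last bit. The uint32_t
+    // cast keeps the shifts logical rather than arithmetic.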
+    buf[i] = (uint32_t)buf[i] >> (shift_bits - 1);
+    if (round_up)
+      buf[i] += 1;
+    buf[i] = (uint32_t)buf[i] >> 1;
+  }
+}
+
+void saturate_to_int8(int32_t *buf, uint64_t size, int res_sign)
+{
+  int32_t max, min;
+  if (res_sign) {
+    max = 127;
+    min = -128;
+  } else {
+    max = 255;
+    min = 0;
+  }
+
+  for (uint64_t i = 0; i < size; i++) {
+    if (buf[i] > max)
+      buf[i] = max;
+    else if (buf[i] < min)
+      buf[i] = min;
+  }
+}
+
+void saturate_to_int16(int32_t *buf, uint64_t size, int res_sign)
+{
+  int32_t max, min;
+  if (res_sign) {
+    max = 32767;
+    min = -32768;
+  } else {
+    max = 65535;
+    min = 0;
+  }
+
+  for (uint64_t i = 0; i < size; i++) {
+    if (buf[i] > max)
+      buf[i] = max;
+    else if (buf[i] < min)
+      buf[i] = min;
+  }
+}
+
+/**
+ * \cmd_info_est_in   manually-set compression parameters; possible inputs:
+ *   1. NULL: parameters are estimated via \bm_vlc_est_weight_bias
+ *   2. non-NULL: passed directly to \bm_vlc_enc_int8
+ * \cmd_info_est_out  estimation result; possible values:
+ *   1. \cmd_info_est_out = \cmd_info_est_in once cmd_info_est_in != NULL
+ *   2. \cmd_info_est_out = the estimated result once cmd_info_est_in == NULL
+ *   3. NULL if you don't care
+ */
+uint8_t *test_vlc_compress (
+    uint8_t *src_data, uint64_t size, int is_signed, int data_type,
+    size_t* bs_size, const CommandInfo* cmd_info_est_in,
+    CommandInfo* cmd_info_est_out)
+{
+  CommandInfo cmd_info;
+  size_t bs_buf_size = get_out_bs_buf_size(size, data_type);
+
+  uint8_t *bsbuf = (uint8_t *)malloc(bs_buf_size);
+  if (!bsbuf)
+    return NULL;
+
+  memset(&cmd_info, 0x00, sizeof(CommandInfo));
+
+  /* generate compressed data (bsbuf) */
+  if (cmd_info_est_in) {
+    memcpy(&cmd_info, cmd_info_est_in, sizeof(CommandInfo));
+  }
+  else {
+    cvk_vlc_est_weight_bias(src_data, size, (int8_t)is_signed, (int8_t)data_type, &cmd_info);
+  }
+
+  if (cmd_info_est_out) {
+    memcpy(cmd_info_est_out, &cmd_info, sizeof(CommandInfo));
+  }
+
+  if (data_type) {
+    cvk_vlc_enc_bf16((uint16_t *)src_data, size, bsbuf, bs_size, &cmd_info);
+  }
+  else {
+    cvk_vlc_enc_int8(src_data, size, bsbuf, bs_size, &cmd_info);
+  }
+
+  return bsbuf;
+}
+
+#ifdef ENABEL_GAUSSIANRANDOM_VLC_TEST
+// --- constrained random test ---
+static double getGaussianRandomVar(double mean, double std)
+{
+  double PI = 3.1415926;
+  double u0 = (double)rand() / RAND_MAX;
+  double u1 = (double)rand() / RAND_MAX;
+  double n = sqrt(-2 * log(u0)) * cos(2 * PI * u1);
+  return n * std + mean;
+}
+
+static double getExpRandomVar(double lambda)
+{
+  double x = (double)rand() / RAND_MAX;
+  return log(1 - x) / (-lambda);
+}
+
+static void random_gen_nn_data(uint8_t *ibuf, size_t in_num, int8_t signedness, int8_t data_type, double zero_ratio)
+{
+  float *random_buf = (float *)malloc(in_num * sizeof(float));
+  int zero_thr = (int)(100 * zero_ratio);
+  double lambda = getGaussianRandomVar(0, 0.5);
+  double mean = getGaussianRandomVar(0, 8);
+  int8_t pdf_sel = ((rand() % 10) < 9); // 9 out of 10 choose the exponential pdf
+  double max_v = 0;
+  double eps = 0.0001;
+  lambda += (lambda > 0) ? eps : -eps;
+  for (size_t i = 0; i < in_num; i++)
+  {
+    double val = (pdf_sel) ? getExpRandomVar(lambda) : getGaussianRandomVar(mean, lambda);
+    val = ((signedness || data_type) && rand() % 2) ? -val : val;
+    random_buf[i] = ((rand() % 100) < zero_thr) ? 0 : val;
+    max_v = (fabs(random_buf[i]) > max_v) ? fabs(random_buf[i]) : max_v;
+  }
+
+  if (data_type == 0) // INT8
+  {
+    double cali_decay = (signedness) ? (rand() / (double)RAND_MAX) + 1 : 1; // weight decay by calibration
+    uint8_t pruned_thr = (signedness && !data_type && (rand() % 2)) ? rand() % 12 : 0;
+    for (size_t i = 0; i < in_num; i++)
+    {
+      int val = (int)((random_buf[i] * 127) / (max_v * cali_decay));
+      ibuf[i] = (abs(val) < pruned_thr)
+                    ? 0
+                    : (val > 127)
+                          ? 127
+                          : (val < (-128))
+                                ? -128
+                                : val;
+    }
+  }
+  else // BFloat16
+  {
+    uint16_t *bf16_buf = (uint16_t *)random_buf;
+    for (size_t i = 0; i < in_num; i++)
+    {
+      short bf16_val = bf16_buf[(i << 1) + 1];
+      // WARNING: set subnormal values to zero since the HW does NOT support them
+      int exp = ((bf16_val >> 7) & 0xFF);
+      bf16_val = (exp) ? bf16_val : 0;
+
+      ibuf[i << 1] = (uint8_t)(bf16_val & 0xFF);
+      ibuf[(i << 1) + 1] = (uint8_t)(bf16_val >> 8);
+    }
+  }
+  free(random_buf);
+}
+#endif /* ENABEL_GAUSSIANRANDOM_VLC_TEST */
+
+void test_vlc_init_testdata(
+    uint8_t *src_data,
+    uint64_t shape_size,
+    int8_t signedness,
+    int8_t data_type)
+{
+#ifdef ENABEL_GAUSSIANRANDOM_VLC_TEST
+  float zero_ratio = 0;
+  assert(data_type == 0); // random bf16 vlc cases fail, see the list on top
+  random_gen_nn_data(src_data, shape_size, signedness, data_type, zero_ratio);
+#else
+  (void)signedness;
+
+  if (data_type) {
+    // bf16
+    uint16_t *src_data_16 = (uint16_t *)src_data;
+    for (uint64_t i = 0; i < shape_size; i++)
+      src_data_16[i] = test_generate_bf16_corner_val((float)i);
+
+    uint64_t zero_range = 20; // <= shape_size
+    if (shape_size > zero_range) {
+      for (uint64_t i = 0; i < shape_size - zero_range; i++) {
+        src_data_16[i] = 0;
+      }
+    }
+  } else {
+    // int8
+    memset(src_data, 0x00, shape_size);
+    for (uint64_t i = 0; i < shape_size; i++)
+      src_data[i] = 200 + i;
+
+    uint64_t zero_range = 20; // <= shape_size
+    if (shape_size > zero_range) {
+      for (uint64_t i = 0; i < shape_size - zero_range; i++) {
+        src_data[i] = 0;
+      }
+    }
+  }
+#endif /* ENABEL_GAUSSIANRANDOM_VLC_TEST */
+}
+
+uint16_t corner_val[] = {
+  0x0000, // 0 00000000 0000000 = zero
+  0x8000, // 1 00000000 0000000 = −zero
+  0x7f80, // 0 11111111 0000000 = infinity
+  0xff80, // 1 11111111 0000000 = −infinity
+  0x4049, // 0 10000000 1001001 = 3.140625 ≈ π ( pi )
+  0x3eab, // 0 01111101 0101011 = 0.333984375 ≈ 1/3
+  0xffc1, // x 11111111 1000001 => qNaN
+  0xff81, // x 11111111 0000001 => sNaN
+  0x00ff, // x 00000000 1111111 => denormal
+};
+
+uint16_t test_generate_bf16_corner_val(float val)
+{
+  if( rand()%RAND_SEED_MOD == 0 ) {
+    return corner_val[ rand() % (sizeof(corner_val)/sizeof(uint16_t)) ];
+  } else {
+    return cvk_convert_fp32_bf16(val);
+  }
+}
\ No newline at end of file
diff --git a/cviruntime/test/test_utils/test_cvikernel_util.h b/cviruntime/test/test_utils/test_cvikernel_util.h
new file mode 100644
index 000000000..77bda7669
--- /dev/null
+++ b/cviruntime/test/test_utils/test_cvikernel_util.h
@@ -0,0 +1,269 @@
+#ifndef CVIRUNTIME_TEST_UTIL_H
+#define CVIRUNTIME_TEST_UTIL_H
+
+#include <string.h>
+#include "cvikernel/cvikernel.h"
+#include "cvikernel/cvk_fp_convert.h"
+#include "cvikernel/cvk_vlc_compress.h"
+#include "cviruntime_context.h"
+#include "cvitpu_debug.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define CMDBUF_SIZE (512*1024) // Adjust based on test case
+
+#define __FILENAME__ (strrchr(__FILE__, '/') ? \
+    strrchr(__FILE__, '/') + 1 : __FILE__)
+
+#define math_min(x, y) ((x) < (y) ? (x) : (y))
+#define math_max(x, y) ((x) > (y) ? (x) : (y))
+
+typedef enum {
+  VLC_CMP_MODE_HW = 0,
+  VLC_CMP_MODE_COMPILER,
+} vlc_cmp_mode_t;
+
+static inline uint32_t fmt_size(cvk_fmt_t fmt)
+{
+  return (fmt == CVK_FMT_BF16) ? 2 : 1;
+}
+
+static inline uint32_t tl_shape_size(const cvk_tl_shape_t *s, cvk_fmt_t fmt)
+{
+  return s->n * s->c * s->h * s->w * fmt_size(fmt);
+}
+
+static inline uint32_t tg_shape_size(const cvk_tg_shape_t *s, cvk_fmt_t fmt)
+{
+  return s->n * s->c * s->h * s->w * fmt_size(fmt);
+}
+
+static inline uint32_t cmpr_tg_shape_size(const cvk_tg_shape_t *s, cvk_fmt_t fmt)
+{
+  uint8_t data_type = (fmt == CVK_FMT_BF16) ?
1 : 0; + uint64_t size = tg_shape_size(s, fmt); + return get_out_bs_buf_size(size, data_type); +} + +static inline uint32_t mg_shape_size(const cvk_mg_shape_t *s, cvk_fmt_t fmt) +{ + return s->row * s->col * fmt_size(fmt); +} + +static inline uint32_t cmpr_mg_shape_size(const cvk_mg_shape_t *s, cvk_fmt_t fmt) +{ + uint8_t data_type = (fmt == CVK_FMT_BF16) ? 1 : 0; + uint64_t size = mg_shape_size(s, fmt); + return get_out_bs_buf_size(size, data_type); +} + +static inline uint32_t ml_shape_size(const cvk_ml_shape_t *s, cvk_fmt_t fmt) +{ + return s->n * s->col * fmt_size(fmt); +} + +static inline cvk_tl_shape_t tl_shape_t4(uint32_t n, uint32_t c, uint32_t h, uint32_t w) +{ + cvk_tl_shape_t shape = {.n = n, .c = c, .h = h, .w = w}; + return shape; +} + +static inline cvk_tg_shape_t tg_shape_t4(uint32_t n, uint32_t c, uint32_t h, uint32_t w) +{ + cvk_tg_shape_t shape = {.n = n, .c = c, .h = h, .w = w}; + return shape; +} + +static inline uint64_t align_up(uint64_t x, uint64_t n) +{ + return (x + n - 1) / n * n; +} + +cvk_tg_t *alloc_tensor_dev_mem( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + cvk_tg_shape_t shape, + cvk_fmt_t fmt); + +void free_tensor_dev_mem( + CVI_RT_HANDLE rt_handle, + const cvk_tg_t *tg); + +cvk_cmpr_tg_t *alloc_cmpr_tensor_dev_mem( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + cvk_tg_shape_t shape, + cvk_fmt_t fmt, + CommandInfo *cmd_info); + +void free_cmpr_tensor_dev_mem( + CVI_RT_HANDLE rt_handle, + const cvk_cmpr_tg_t *cmpr_tg); + +CVI_RC tensor_copy_s2d( + CVI_RT_HANDLE rt_handle, + const cvk_tg_t *tg, + uint8_t *data); + +CVI_RC cmpr_tensor_copy_s2d( + CVI_RT_HANDLE rt_handle, + const cvk_cmpr_tg_t *cmpr_tg, + uint8_t *data); + +uint8_t *tensor_copy_d2s( + CVI_RT_HANDLE rt_handle, + const cvk_tg_t *tg); + +uint8_t *cmpr_tensor_copy_d2s( + CVI_RT_HANDLE rt_handle, + const cvk_cmpr_tg_t *cmpr_tg); + +uint8_t *tensor_copy_l2g_d2s_stride( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + cvk_tg_stride_t tg_stride); + +uint8_t *tensor_copy_l2g_d2s( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl); + +void tensor_copy_s2d_g2l_stride( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + cvk_tg_stride_t tg_stride, + uint8_t *data); + +void tensor_copy_s2d_g2l( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + uint8_t *data); + +void tensor_copy_s2d_g2l_tp_stride( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + cvk_tg_stride_t tg_stride, + uint8_t *data); + +void tensor_copy_s2d_g2l_tp( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_tl_t *tl, + uint8_t *data); + +cvk_mg_t *alloc_matrix_dev_mem( + CVI_RT_HANDLE rt_handle, + cvk_mg_shape_t shape, + cvk_fmt_t fmt); + +void free_matrix_dev_mem( + CVI_RT_HANDLE rt_handle, + const cvk_mg_t *mg); + +cvk_cmpr_mg_t *alloc_cmpr_matrix_dev_mem( + CVI_RT_HANDLE rt_handle, + cvk_mg_shape_t shape, + cvk_fmt_t fmt, + CommandInfo* cmd_info); + +void free_cmpr_matrix_dev_mem( + CVI_RT_HANDLE rt_handle, + const cvk_cmpr_mg_t *cmpr_mg); + +CVI_RC matrix_copy_s2d( + CVI_RT_HANDLE rt_handle, + const cvk_mg_t *mg, + uint8_t *data); + +void matrix_copy_s2d_g2l_stride( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_ml_t *ml, + cvk_mg_stride_t mg_stride, + uint8_t *data); + +void matrix_copy_s2d_g2l( + CVI_RT_HANDLE rt_handle, + cvk_context_t *cvk_ctx, + const cvk_ml_t *ml, + uint8_t *data); + +CVI_RC cmpr_matrix_copy_s2d( + CVI_RT_HANDLE rt_handle, + const 
cvk_cmpr_mg_t *cmpr_mg,
+    uint8_t *data);
+
+uint8_t *matrix_copy_d2s(
+    CVI_RT_HANDLE rt_handle,
+    const cvk_mg_t *mg);
+
+uint8_t *matrix_copy_l2g_d2s_stride(
+    CVI_RT_HANDLE rt_handle,
+    cvk_context_t *cvk_ctx,
+    const cvk_ml_t *ml,
+    cvk_mg_stride_t mg_stride);
+
+uint8_t *matrix_copy_l2g_d2s(
+    CVI_RT_HANDLE rt_handle,
+    cvk_context_t *cvk_ctx,
+    const cvk_ml_t *ml);
+
+uint8_t *cmpr_matrix_copy_d2s(
+    CVI_RT_HANDLE rt_handle,
+    const cvk_cmpr_mg_t *cmpr_mg);
+
+void convert_fp32_to_bf16_data(
+    cvk_context_t *cvk_ctx, uint16_t *bf16_data, float *fp32_data,
+    int length);
+
+void get_strides_from_shapes5d(int strides[5], const int shapes[5], int ws);
+int get_tensor5d_offset(int poss[5], const int strides[5]);
+
+void arith_right_shift(
+    int32_t *buf, uint64_t size, int shift_bits, int round_up);
+void logic_right_shift(
+    int32_t *buf, uint64_t size, int shift_bits, int round_up);
+
+void saturate_to_int8(int32_t *buf, uint64_t size, int res_sign);
+void saturate_to_int16(int32_t *buf, uint64_t size, int res_sign);
+
+uint8_t *test_vlc_compress(
+    uint8_t *src_data,
+    uint64_t size,
+    int is_signed,
+    int data_type,
+    size_t* bs_size,
+    const CommandInfo* cmd_info_est_in,
+    CommandInfo* cmd_info_est_out);
+
+void test_vlc_init_testdata(
+    uint8_t *src_data,
+    uint64_t shape_size,
+    int8_t signedness,
+    int8_t data_type);
+
+uint16_t test_generate_bf16_corner_val(float val);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // CVIRUNTIME_TEST_UTIL_H
\ No newline at end of file
diff --git a/cviruntime/test/test_utils/test_native_ref.c b/cviruntime/test/test_utils/test_native_ref.c
new file mode 100644
index 000000000..441823b38
--- /dev/null
+++ b/cviruntime/test/test_utils/test_native_ref.c
@@ -0,0 +1,1203 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <math.h>
+#include <float.h>
+
+#include "test_native_ref.h"
+
+#define math_min(x, y) ((x) < (y) ? (x) : (y))
+#define math_max(x, y) ((x) > (y) ? (x) : (y))
+
+typedef uint8_t u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef uint64_t u64;
+
+typedef int8_t s8;
+typedef int16_t s16;
+typedef int32_t s32;
+typedef int64_t s64;
+typedef u32 bmerr_t;
+
+#define BM_SUCCESS 0              // The operation was successful
+#define BM_ERR_AGAIN 1            // Not ready yet
+#define BM_ERR_FAILURE 2          // General failure
+#define BM_ERR_TIMEOUT 3          // Timeout
+#define BM_ERR_UNINITIALIZED 4    // Uninitialized
+#define BM_ERR_INVALID_ARGUMENT 5 // Arguments invalid
+#define BM_ERR_NOMEM 6            // Not enough memory
+#define BM_ERR_DATA 7             // Data error
+#define BM_ERR_BUSY 8             // Busy
+#define BM_ERR_NOT_SUPPORTED 9    // Not supported yet
+
+typedef u32 BLOB_OP;
+#define BLOB_ADD 0
+#define BLOB_SUB 1
+#define BLOB_MUL 2
+#define BLOB_DIV 3
+#define BLOB_INVALID 4
+
+static inline int calc_offset(int *shape, int *offset)
+{
+  return ((offset[0] * shape[1] + offset[1]) * shape[2] + offset[2])
+      * shape[3] + offset[3];
+}
+
+static int index_get(int h, int w1, int w2)
+{
+  return h * w1 + w2;
+}
+
+int array_cmp_float_rel(const char * const info, float *p_exp, float *p_got,
+    int count, float delta)
+{
+  int idx = 0;
+  for (idx = 0; idx < count; idx++) {
+    if (math_max( fabs(p_exp[idx]), fabs(p_got[idx])) > 1.0 ) {
+      // compare rel
+      if( math_min(fabs(p_exp[idx]), fabs(p_got[idx])) < 1e-20 ) {
+        printf("%s rel error at index %d exp %.20f got %.20f\n",
+               info, idx, p_exp[idx], p_got[idx]);
+        if(isnan(p_exp[idx]) && isnan(p_got[idx])){
+          printf("both exp and got are NAN\n");
+          return 0;
+        }
+        return -1;
+      }
+      if (fabs(p_exp[idx] - p_got[idx]) >
+          delta * math_min(fabs(p_exp[idx]), fabs( p_got[idx]))) {
+        printf("%s rel error at index %d exp %.20f got %.20f\n",
+               info, idx, p_exp[idx], p_got[idx]);
+        if(isnan(p_exp[idx]) && isnan(p_got[idx])){
+          printf("both exp and got are NAN\n");
+          return 0;
+        }
+        return -1;
+      }
+    } else {
+      if ( fabs(p_exp[idx] - p_got[idx]) > delta ) {
+        printf("%s abs error at index %d exp %.20f got %.20f\n",
+               info, idx, p_exp[idx], p_got[idx]);
+        if(isnan(p_exp[idx]) && isnan(p_got[idx])){
+          printf("both exp and got are NAN\n");
+          return 0;
+        }
+        return -1;
+      }
+    }
+
+    if ( isnan(p_got[idx]) && !isnan(p_exp[idx])) {
+      printf("%s, found nans idx %d\n", info , idx);
+      printf("floating from exp %.10f got %.10f\n", p_exp[idx], p_got[idx]);
+      IF_VAL exp, got;
+      exp.fval = p_exp[idx];
+      got.fval = p_got[idx];
+      printf("hex form exp %8.8x got %8.8x\n", exp.ival, got.ival);
+      return -2;
+    }
+  }
+  return 0;
+}
+
+int array_cmp_float(const char * const info, float *p_exp, float *p_got,
+    int count, float delta)
+{
+  if (delta == 0.0f) {
+    for (int idx = 0; idx < count; idx++) {
+      if (p_exp[idx] != p_got[idx]) {
+        printf("%s error at index %d exp %.20f got %.20f\n",
+               info, idx, p_exp[idx], p_got[idx]);
+        if(isnan(p_exp[idx]) && isnan(p_got[idx])){
+          printf("both exp and got are NAN\n");
+          return 0;
+        }
+        return -1;
+      }
+    }
+  } else {
+    return array_cmp_float_rel(info, p_exp, p_got, count, delta);
+  }
+  return 0;
+}
+
+int array_cmp_int(const char * const info, int *p_exp, int *p_got, int count)
+{
+  int idx;
+  for (idx = 0; idx < count; idx++) {
+    if (p_exp[idx] != p_got[idx]) {
+      printf("%s error at index %d exp %d got %d\n",
+             info, idx, p_exp[idx], p_got[idx]);
+      return -1;
+    }
+  }
+  return 0;
+}
+
+int array_cmp_int8(const char * const info, const int8_t *p_exp,
+    const int8_t *p_got, int count)
+{
+  int idx;
+  for (idx = 0; idx < count; idx++) {
+    if (p_exp[idx] != p_got[idx]) {
+      printf("%s error at index %d exp %d got %d\n",
+             info, idx, p_exp[idx], p_got[idx]);
+      return -1;
+    }
+  }
+  return 0;
+}
+
+int calc_dilute_hw(int h, int ins_h, int ins_h_l, int pad_h_b, int pad_h_t)
+{
+  return (h - 1) * (ins_h + 1) + ins_h_l +
+      1 + pad_h_t + pad_h_b;
+}
+
+int calc_output_hw(int hw, int khw, int stride)
+{
+  return (hw - khw)/stride + 1;
+}
+
+int fill_pad_fmap_int8(
+    const int8_t *before, int8_t **pafter, int val,
+    int pad_l, int pad_r, int pad_t, int pad_b,
+    int ins_h, int ins_w, int ins_h_last, int ins_w_last,
+    int h_before, int w_before)
+{
+  if (!before || !pafter)
+    return BM_ERR_INVALID_ARGUMENT;
+
+  int w_after = (w_before - 1) * (ins_w + 1) + ins_w_last + 1 + pad_l + pad_r;
+  int h_after = (h_before - 1) * (ins_h + 1) + ins_h_last + 1 + pad_t + pad_b;
+  int8_t *after = *pafter;
+
+  if (!after) {
+    after = malloc(sizeof(int8_t) * w_after * h_after);
+    if (!after)
+      return BM_ERR_NOMEM;
+  }
+
+  memset(after, val, w_after * h_after);
+  for (int h = 0; h < h_before; h++) {
+    for (int w = 0; w < w_before; w++) {
+      int i = (h * (ins_h + 1) + pad_t) * w_after + w * (ins_w + 1) + pad_l;
+      after[i] = before[h * w_before + w];
+    }
+  }
+
+  *pafter = after;
+  return BM_SUCCESS;
+}
+
+int fill_pad_fmap_bf16(
+    const u16 *before, u16 **pafter, int val,
+    int pad_l, int pad_r, int pad_t, int pad_b,
+    int ins_h, int ins_w, int ins_h_last, int ins_w_last,
+    int h_before, int w_before)
+{
+  if (!before || !pafter)
+    return BM_ERR_INVALID_ARGUMENT;
+
+  int w_after = (w_before - 1) * (ins_w + 1) + ins_w_last + 1 + pad_l + pad_r;
+  int h_after = (h_before - 1) * (ins_h + 1) + ins_h_last + 1 + pad_t + pad_b;
+  u16 *after = *pafter;
+  if (!after) {
+    after = malloc(sizeof(u16) * w_after * h_after);
+    if (!after)
+      return BM_ERR_NOMEM;
+  }
+  for(int i=0 ; i < w_after * h_after; i ++)
+    after[i] = val;
+
+  for (int h = 0; h < h_before; h++) {
+    for (int w = 0; w < w_before; w++) {
+      int i = (h * (ins_h + 1) + pad_t) * w_after + w * (ins_w + 1) + pad_l;
+      after[i] = before[h * w_before + w];
+    }
+  }
+#if 0
+  printf("bf16 padding:\n");
+  for (int i = 0; i < w_after * h_after; i++)
+    printf("%d ", after[i]);
+  printf("\n");
+#endif
+
+  *pafter = after;
+  return BM_SUCCESS;
+}
+
+int fill_pad_fmap_fp32(
+    const float *before, float **pafter, float pad_val,
+    int pad_t, int pad_b, int pad_l, int pad_r,
+    int ins_h, int ins_w, int ins_h_last, int ins_w_last,
+    int h_before, int w_before)
+{
+  if (!before || !pafter)
+    return BM_ERR_INVALID_ARGUMENT;
+
+  int w_after = (w_before - 1) * (ins_w + 1) + ins_w_last + 1 + pad_l + pad_r;
+  int h_after = (h_before - 1) * (ins_h + 1) + ins_h_last + 1 + pad_t + pad_b;
+  float *after = *pafter;
+  if (!after) {
+    after = malloc(sizeof(float) * w_after * h_after);
+    if (!after)
+      return BM_ERR_NOMEM;
+  }
+  for (int i = 0; i < w_after * h_after; i++)
+    after[i] = pad_val;
+
+  for (int h = 0; h < h_before; h++) {
+    for (int w = 0; w < w_before; w++) {
+      int i = (h * (ins_h + 1) + pad_t) * w_after + w * (ins_w + 1) + pad_l;
+      after[i] = before[h * w_before + w];
+    }
+  }
+
+  *pafter = after;
+  return BM_SUCCESS;
+}
+
+void native_md_scalar(float *a, float *b, float *r,
+    int N, int C, int H, int W, int op, bool result_add)
+{
+  int len = N * C * H * W;
+  for (int i = 0; i < len; i++) {
+    float res = 0;
+    switch (op) {
+    case BLOB_ADD:
+      res = a[i] + b[i];
+      break;
+    case BLOB_SUB:
+      res = a[i] - b[i];
+      break;
+    case BLOB_MUL:
+      res = a[i] * b[i];
+      break;
+    case BLOB_DIV:
+      res = a[i] / b[i];
+      break;
+    default:
+      break;
+    }
+    r[i] = result_add ? r[i] + res : res;
+  }
+}
+
+void native_conv_ref(
+    const void *ifmap, void *ofmap, const void *weight,
+    int input_n, int input_c, int input_h, int input_w,
+    int output_c, int output_h, int output_w,
+    int groups,
+    int kh, int kw,
+    int dilation_h, int dilation_w,
+    int pad_h, int pad_w,
+    int stride_h, int stride_w,
+    int flip,
+    int using_bias,
+    const void *bias,
+    int result_add)
+{
+  const float *ifmap_f = ifmap;
+  float *ofmap_f = ofmap;
+  const float *weight_f = weight;
+  const float *bias_f = bias;
+
+  int i_shape[4] = { input_n, input_c, input_h, input_w };
+  int o_shape[4] = { input_n, output_c, output_h, output_w };
+  int o_g = output_c / groups;
+  int k_g = input_c / groups;
+  int k_shape[4] = { output_c, k_g, kh, kw };
+  int in_offset[4], out_offset[4], weight_offset[4];
+
+  for (int n = 0; n < input_n; n++) {
+    for (int g = 0; g < groups; g++) {
+      int o_head = o_g * g;
+      int k_head = k_g * g;
+      for (int o = 0; o < o_g; o++) {
+        for (int y = 0; y < output_h; y++) {
+          for (int x = 0; x < output_w; x++) {
+            out_offset[0] = n;
+            out_offset[1] = o + o_head;
+            out_offset[2] = y;
+            out_offset[3] = x;
+            float result_init =
+                result_add ? ofmap_f[calc_offset(o_shape, out_offset)] : 0;
+            ofmap_f[calc_offset(o_shape, out_offset)] = 0;
+            for (int k = 0; k < k_g; k++) {
+              for (int p = 0; p < kh; p++) {
+                for (int q = 0; q < kw; q++) {
+                  int in_y = y * stride_h - pad_h + p * dilation_h;
+                  int in_x = x * stride_w - pad_w + q * dilation_w;
+                  if (in_y >= 0 && in_y < input_h
+                      && in_x >= 0 && in_x < input_w) {
+                    weight_offset[0] = o + o_head;
+                    weight_offset[1] = k;
+                    if (flip) {
+                      weight_offset[2] = (kh - 1 - p);
+                      weight_offset[3] = (kw - 1 - q);
+                    } else {
+                      weight_offset[2] = p;
+                      weight_offset[3] = q;
+                    }
+                    in_offset[0] = n;
+                    in_offset[1] = k + k_head;
+                    in_offset[2] = in_y;
+                    in_offset[3] = in_x;
+                    ofmap_f[calc_offset(o_shape, out_offset)] +=
+                        ifmap_f[calc_offset(i_shape, in_offset)]
+                        * weight_f[calc_offset(k_shape, weight_offset)];
+                    if(k_g==1&&kh==1&&kw==1){
+                      ofmap_f[calc_offset(o_shape, out_offset)] =
+                          ifmap_f[calc_offset(i_shape, in_offset)]
+                          * weight_f[calc_offset(k_shape, weight_offset)];
+                    }
+                  }
+                }
+              }
+            }
+            if (using_bias) {
+              ofmap_f[calc_offset(o_shape, out_offset)] += bias_f[o + o_head];
+            }
+            if (result_add) {
+              ofmap_f[calc_offset(o_shape, out_offset)] += result_init;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+int native_fc_int8(
+    const int8_t *L, const int8_t *R, const int16_t *B, const int16_t *Y,
+    int *Y_ref,
+    int L_row_num, int L_col_num, int R_col_num,
+    int L_sign, int R_sign, int B_sign,
+    int l_shift_width, int r_shift_width,
+    int is_result_int8, int do_relu)
+{
+  const uint8_t *uL = (const uint8_t*)L;
+  const uint8_t *uR = (const uint8_t*)R;
+  const uint16_t *uB = (const uint16_t*)B;
+
+  int opd0, opd1, opd2;
+  int ret = BM_SUCCESS;
+
+  for (int hidx = 0; hidx < L_row_num; hidx++) {
+    for (int widx = 0; widx < R_col_num; widx++) {
+      int Y1 = 0;
+      int Y2 = 0;
+      int sum_idx = 0;
+      for (sum_idx = 0; sum_idx < L_col_num; sum_idx++) {
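+        // The reference keeps two partial sums (Y1/Y2) and splits terms
+        // between them in a fixed odd/even pattern, presumably mirroring
+        // the interleaved accumulation order of the hardware MAC.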
+ int idx_L = index_get(hidx, L_col_num, sum_idx); + int idx_R = index_get(sum_idx, R_col_num, widx); + opd0 = (L_sign) ? L[idx_L] : uL[idx_L]; + opd1 = (R_sign) ? R[idx_R] : uR[idx_R]; + if((sum_idx % 2 == 0 && sum_idx != 2) || sum_idx == 1){ + Y1 += opd0 * opd1; + } else { + Y2 += opd0 * opd1; + } + } + sum_idx++; + + if (B){ + opd2 = (B_sign) ? (int)B[widx] : (int)uB[widx]; + if((sum_idx % 2 == 0 && sum_idx != 2) || sum_idx == 1){ + Y1 += opd2; + } else { + Y2 += opd2; + } + sum_idx++; + } + + int idx_Y = index_get(hidx, R_col_num, widx); + if (Y){ + if((sum_idx % 2 == 0 && sum_idx != 2) || sum_idx == 1){ + Y1 += (Y[idx_Y] << l_shift_width); + } else { + Y2 += (Y[idx_Y] << l_shift_width); + } + } + + Y_ref[idx_Y] = Y1 + Y2; + } + } + uint8_t* Yout_int8 = malloc(sizeof(int8_t) * L_row_num * R_col_num); + uint16_t* Yout_int16 = malloc(sizeof(int16_t) * L_row_num * R_col_num); + + if(is_result_int8) { + ret = satu_2_8bit(Y_ref, L_row_num * R_col_num, + (int8_t *)Yout_int8, r_shift_width, 1, !do_relu); + if (ret != BM_SUCCESS) + goto error_release; + + fill_int_with_int8(Y_ref, (int8_t *)Yout_int8, L_row_num * R_col_num); + } else { + ret = satu_2_16bit(Y_ref, L_row_num * R_col_num, + (int16_t *)Yout_int16, r_shift_width, 1, !do_relu); + if (ret != BM_SUCCESS) + goto error_release; + + fill_int_with_int16(Y_ref, (int16_t *)Yout_int16, L_row_num * R_col_num); + } + +error_release: + free(Yout_int8); + free(Yout_int16); + + return ret; +} + +int native_pooling_ave_int8( + const int8_t* i_fmap, + const void* weight, + const int16_t *bias, + int8_t* o_fmap, + int input_n, int input_c, int input_h, int input_w, + int kh, int kw, + int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, + int stride_h, int stride_w, + int ins_h, int ins_w, + int ins_h_last, int ins_w_last, + int input_sign, int satu_sign, + int r_shift_width, int const_weight) +{ + if (kh * kw <= 0) + return BM_ERR_INVALID_ARGUMENT; + + int *avg_pooling_mac_a = (int *)malloc(kh * kw * sizeof(int)); + int *avg_pooling_mac_b = (int *)malloc(kh * kw * sizeof(int)); + + uint8_t avg_const_weight = *(uint8_t *)weight; + const int8_t *weight_arr = weight; + + int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); + int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); + + int output_h = calc_output_hw(h_after, kh, stride_h); + int output_w = calc_output_hw(w_after, kw, stride_w); + + int8_t *i_fmap_pad = NULL; + for (int n = 0; n < input_n; n++) { + if (const_weight == 0) + weight_arr = weight; + + for (int c = 0; c < input_c; ++c) { + fill_pad_fmap_int8(i_fmap, &i_fmap_pad, 0, + pad_w_l, pad_w_r, pad_h_t, pad_h_b, + ins_h, ins_w, ins_h_last, ins_w_last, + input_h, input_w); + for (int ph = 0; ph < output_h; ++ph) { + for (int pw = 0; pw < output_w; ++pw) { + int hstart = ph * stride_h; + int wstart = pw * stride_w; + int pool_index = index_get(ph, output_w, pw); + int mac_index = 0; + int avg_pool_result; + + for (int h = 0; h < kh; h++) { + for (int w = 0; w < kw; w++) { + int index = index_get((hstart+h), w_after, (w+wstart)); + mac_index = index_get(h, kw, w); + avg_pooling_mac_a[mac_index] = input_sign ? + i_fmap_pad[index] : (uint8_t)(i_fmap_pad[index]); + + avg_pooling_mac_b[mac_index] = const_weight ? 
+ avg_const_weight : weight_arr[mac_index]; + } + } + + inner_product(avg_pooling_mac_a, avg_pooling_mac_b, kh * kw, + &avg_pool_result); + + if(bias) { + avg_pool_result += bias[c]; + } + + int ret = satu_2_8bit(&avg_pool_result, sizeof(int8_t), + o_fmap + pool_index, r_shift_width, 1, + satu_sign); + + if (ret != BM_SUCCESS) { + free(i_fmap_pad); + free(avg_pooling_mac_a); + free(avg_pooling_mac_b); + + return BM_ERR_INVALID_ARGUMENT; + } + } + } + i_fmap += input_w * input_h; + if (const_weight == 0) + weight_arr += kh * kw; + + o_fmap += output_w * output_h; + } + } + free(i_fmap_pad); + + free(avg_pooling_mac_a); + free(avg_pooling_mac_b); + + return BM_SUCCESS; +} + +int native_pooling_max_int8( + const int8_t* i_fmap, + int8_t* o_fmap, + int input_n, int input_c, int input_h, int input_w, + int kh, int kw, + int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, + int stride_h, int stride_w, + int ins_h, int ins_w, + int ins_h_last, int ins_w_last, + int input_sign) +{ + if (ins_h != 0 || ins_w != 0 || ins_h_last != 0 || ins_w_last !=0) + return BM_ERR_INVALID_ARGUMENT; + + int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); + int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); + + int output_h = calc_output_hw(h_after, kh, stride_h); + int output_w = calc_output_hw(w_after, kw, stride_w); + + const int max_init = input_sign? -128: 0; + int8_t *i_fmap_pad = NULL; + for (int nc = 0; nc < input_n * input_c; nc++) { + fill_pad_fmap_int8(i_fmap, &i_fmap_pad, max_init, + pad_w_l, pad_w_r, pad_h_t, pad_h_b, + 0, 0, 0, 0, input_h, input_w); + + for (int ph = 0; ph < output_h; ++ph) { + for (int pw = 0; pw < output_w; ++pw) { + int hstart = ph * stride_h; + int wstart = pw * stride_w; + int pool_index = index_get(ph, output_w, pw); + int max = max_init; + for (int h = 0; h < kh; h++) { + for (int w = 0; w < kw; w++) { + int index = index_get((hstart + h), (input_w + pad_w_l + pad_w_r), + (w + wstart)); + int val = input_sign ? i_fmap_pad[index]: (uint8_t)i_fmap_pad[index]; + max = (val > max)? val: max; + } + } + o_fmap[pool_index] = max; + } + } + i_fmap += input_w * input_h; + o_fmap += output_w * output_h; + } + free(i_fmap_pad); + + return BM_SUCCESS; +} + +int native_pooling_min_int8( + const int8_t* i_fmap, + int8_t* o_fmap, + int input_n, int input_c, int input_h, int input_w, + int kh, int kw, + int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, + int stride_h, int stride_w, + int ins_h, int ins_w, + int ins_h_last, int ins_w_last, + int input_sign) +{ + if (ins_h != 0 || ins_w != 0 || ins_h_last != 0 || ins_w_last !=0) + return BM_ERR_INVALID_ARGUMENT; + + int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); + int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); + + int output_h = calc_output_hw(h_after, kh, stride_h); + int output_w = calc_output_hw(w_after, kw, stride_w); + + const int min_init = input_sign? 
127: 0xFF;
+  int8_t *i_fmap_pad = NULL;
+  for (int nc = 0; nc < input_n * input_c; nc++) {
+    fill_pad_fmap_int8(i_fmap, &i_fmap_pad, min_init,
+        pad_w_l, pad_w_r, pad_h_t, pad_h_b,
+        0, 0, 0, 0, input_h, input_w);
+
+    for (int ph = 0; ph < output_h; ++ph) {
+      for (int pw = 0; pw < output_w; ++pw) {
+        int hstart = ph * stride_h;
+        int wstart = pw * stride_w;
+        int pool_index = index_get(ph, output_w, pw);
+        int min = min_init;
+        for (int h = 0; h < kh; h++) {
+          for (int w = 0; w < kw; w++) {
+            int index = index_get((hstart + h), (input_w + pad_w_l + pad_w_r),
+                (w + wstart));
+            int val = input_sign ? i_fmap_pad[index]: (uint8_t)i_fmap_pad[index];
+            min = (val < min)? val: min;
+          }
+        }
+        o_fmap[pool_index] = min;
+      }
+    }
+    i_fmap += input_w * input_h;
+    o_fmap += output_w * output_h;
+  }
+  free(i_fmap_pad);
+
+  return BM_SUCCESS;
+}
+
+int native_pooling_max_fp32(
+    const float *ifmap, float *ofmap,
+    int input_n, int input_c, int input_h, int input_w,
+    int kh, int kw,
+    int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r,
+    int stride_h, int stride_w,
+    int ins_h, int ins_w,
+    int ins_h_last, int ins_w_last
+    )
+{
+  int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b);
+  int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r);
+  int output_h = calc_output_hw(h_after, kh, stride_h);
+  int output_w = calc_output_hw(w_after, kw, stride_w);
+  float *ifmap_after = malloc(sizeof(float)*h_after*w_after);
+
+  if (ifmap_after == NULL) {
+    printf("Not enough memory [h_after, w_after]: [%d, %d].\n", h_after, w_after);
+    return BM_ERR_NOMEM;
+  }
+
+  for (int n = 0; n < input_n; n++) {
+    for (int c = 0; c < input_c; c++) {
+      int ret = fill_pad_fmap_fp32(ifmap, &ifmap_after, -FLT_MAX,
+          pad_h_t, pad_h_b, pad_w_l, pad_w_r,
+          ins_h, ins_w, ins_h_last, ins_w_last,
+          input_h, input_w);
+
+      if (ret != BM_SUCCESS) {
+        printf("Failed to pad input fmap.\n");
+        free(ifmap_after);
+        return BM_ERR_FAILURE;
+      }
+
+      for (int h = 0; h < output_h; h++) {
+        for (int w = 0; w < output_w; w++) {
+          int rf_h = h*stride_h, rf_w = w*stride_w;
+          int kh_end = math_min(kh, h_after-rf_h);
+          int kw_end = math_min(kw, w_after-rf_w);
+          float *rf_addr = ifmap_after + rf_h*w_after + rf_w;
+          float max_val = -FLT_MAX;
+
+          for (int i = 0; i < kh_end; i++) {
+            for (int j = 0; j < kw_end; j++) {
+              max_val = math_max(rf_addr[i*w_after+j], max_val);
+            }
+          }
+          ofmap[h*output_w+w] = max_val;
+        }
+      }
+
+      ifmap += input_h*input_w;
+      ofmap += output_h*output_w;
+    }
+  }
+
+  free(ifmap_after);
+  return BM_SUCCESS;
+}
+
+int native_pooling_avg_fp32(
+    const float* ifmap,
+    float* ofmap,
+    int input_n, int input_c, int input_h, int input_w,
+    int kh, int kw,
+    int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r,
+    int stride_h, int stride_w,
+    int ins_h, int ins_w,
+    int ins_h_last, int ins_w_last,
+    float avg_pooling_const
+    )
+{
+  int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b);
+  int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r);
+  int output_h = calc_output_hw(h_after, kh, stride_h);
+  int output_w = calc_output_hw(w_after, kw, stride_w);
+  float *ifmap_after = malloc(sizeof(float)*h_after*w_after);
+
+  if (ifmap_after == NULL) {
+    printf("Not enough memory [h_after, w_after]: [%d, %d].\n", h_after, w_after);
+    return BM_ERR_NOMEM;
+  }
+
+  for (int n = 0; n < input_n; n++) {
+    for (int c = 0; c < input_c; c++) {
+      int ret = fill_pad_fmap_fp32(ifmap, &ifmap_after, 0,
+          pad_h_t, pad_h_b, pad_w_l, pad_w_r,
+          ins_h, ins_w, ins_h_last, ins_w_last,
+          input_h,
input_w); + + if (ret != BM_SUCCESS) { + printf("Failed to pad input fmap.\n"); + free(ifmap_after); + return BM_ERR_FAILURE; + } + + for (int h = 0; h < output_h; h++) { + for (int w = 0; w < output_w; w++) { + int rf_h = h*stride_h, rf_w = w*stride_w; + int kh_end = math_min(kh, h_after-rf_h); + int kw_end = math_min(kw, w_after-rf_w); + float *rf_addr = ifmap_after + rf_h*w_after + rf_w; + float dot_product_even = 0.0, dot_product_odd = 0.0; + + for (int i = 0; i < kh_end; i++) { + for (int j = 0; j < kw_end; j++) { + if ((i*kw_end+j)%2) { + dot_product_odd += rf_addr[i*w_after+j]*avg_pooling_const; + } + else { + dot_product_even += rf_addr[i*w_after+j]*avg_pooling_const; + } + } + } + ofmap[h*output_w+w] = dot_product_even + dot_product_odd; + } + } + + ifmap += input_h*input_w; + ofmap += output_h*output_w; + } + } + + free(ifmap_after); + return BM_SUCCESS; +} + +void native_pooling_forward_max( + const float* bottom_data, float* top_data, + int* mask_data, + const int count, + const int num, const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w) +{ + (void)num; + for (int index = 0; index < count; ++index) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + const int hend = math_min(hstart + kernel_h, height); + const int wend = math_min(wstart + kernel_w, width); + hstart = math_max(hstart, 0); + wstart = math_max(wstart, 0); + float maxval = -FLT_MAX; + int maxidx = -1; + const float* const bottom_slice = + bottom_data + (n * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + if (bottom_slice[h * width + w] > maxval) { + maxidx = h * width + w; + maxval = bottom_slice[maxidx]; + } + } + } + top_data[index] = maxval; + mask_data[index] = maxidx; + } +} + +void native_pooling_forward_ave( + const float* bottom_data, float* top_data, + const int count, + const int num, const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int pad_h, const int pad_w) +{ + (void)num; + for (int index = 0; index < count; ++index) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = math_min(hstart + kernel_h, height + pad_h); + int wend = math_min(wstart + kernel_w, width + pad_w); + const int pool_size = (hend - hstart) * (wend - wstart); + hstart = math_max(hstart, 0); + wstart = math_max(wstart, 0); + hend = math_min(hend, height); + wend = math_min(wend, width); + float aveval = 0; + const float* const bottom_slice = + bottom_data + (n * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + aveval += bottom_slice[h * width + w]; + } + } + top_data[index] = aveval / pool_size; + } +} + +int satu_2_8bit( + const int* pBuff, int len, int8_t* pByteOut, int 
rshiftbits,
+    int round_floor, int sign_unsign)
+{
+  if (rshiftbits < 0)
+    return BM_ERR_INVALID_ARGUMENT;
+
+  int temp;
+  int satu_max = sign_unsign ? 127 : 255;
+  int satu_min = sign_unsign ? -128 : 0;
+  if(rshiftbits == 0) {
+    for (int ii=0; ii<len; ii++) {
+      temp = (pBuff[ii]>satu_max) ? satu_max : ((pBuff[ii]<satu_min) ? satu_min : pBuff[ii]);
+      pByteOut[ii] = (int8_t)temp;
+    }
+  } else { // rshiftbits>0
+    for (int ii=0; ii<len; ii++) {
+      if (round_floor)
+        temp = ((pBuff[ii]>>(rshiftbits-1))+1)>>1;
+      else
+        temp = pBuff[ii]>>rshiftbits;
+      temp = (temp>satu_max) ? satu_max : ((temp<satu_min) ? satu_min : temp);
+      pByteOut[ii] = (int8_t)temp;
+    }
+  }
+  return BM_SUCCESS;
+}
+
+int satu_2_16bit(
+    const int* pBuff, int len, int16_t* pWordOut, int rshiftbits,
+    int round_floor, int sign_unsign)
+{
+  if (rshiftbits < 0)
+    return BM_ERR_INVALID_ARGUMENT;
+
+  int temp;
+  int ii;
+  int satu_max = sign_unsign ? 32767 : 65535;
+  int satu_min = sign_unsign ? -32768 : 0;
+  if(rshiftbits == 0) {
+    for(ii=0; ii<len; ii++) {
+      temp = (pBuff[ii]>satu_max) ? satu_max : ((pBuff[ii]<satu_min) ? satu_min : pBuff[ii]);
+      pWordOut[ii] = (int16_t)temp;
+    }
+  } else { // rshiftbits>0
+    for(ii=0; ii<len; ii++) {
+      if (round_floor)
+        temp = ((pBuff[ii]>>(rshiftbits-1))+1)>>1;
+      else
+        temp = pBuff[ii]>>rshiftbits;
+      temp = (temp>satu_max) ? satu_max : ((temp<satu_min) ? satu_min : temp);
+      pWordOut[ii] = (int16_t)temp;
+    }
+  }
+  return BM_SUCCESS;
+}
+
+void fill_int_with_int8(int* pdest, int8_t * psrc, int len)
+{
+  for (int i = 0; i < len; i++)
+    pdest[i] = (int)psrc[i];
+}
+
+void fill_int_with_uint8(int *pdest, uint8_t *psrc, int len)
+{
+  for (int i = 0; i < len; i++)
+    pdest[i] = (int)psrc[i];
+}
+
+void fill_int_with_int16(int* pdest, int16_t* psrc, int len)
+{
+  for (int i = 0; i < len; i++)
+    pdest[i] = (int)psrc[i];
+}
+
+void inner_product(const int* a, const int* b, int len, int *c)
+{
+  int sum = 0;
+  for (int i = 0; i < len; i++)
+    sum += a[i] * b[i];
+  *c = sum;
+}
+
+void inner_float_product(const float* a, const float* b, int len, float *c)
+{
+  float sum = 0;
+  for (int i = 0; i < len; i++)
+    sum += a[i] * b[i];
+  *c = sum;
+}
diff --git a/cviruntime/test/test_utils/test_native_ref.h b/cviruntime/test/test_utils/test_native_ref.h
new file mode 100644
--- /dev/null
+++ b/cviruntime/test/test_utils/test_native_ref.h
+#ifndef TEST_NATIVE_REF_H
+#define TEST_NATIVE_REF_H
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef union {
+  uint32_t ival;
+  float fval;
+} IF_VAL;
+
+/*
+ * fp32 version
+ */
+
+int array_cmp_float(const char * const info, float *p_exp, float *p_got,
+    int count, float delta);
+int array_cmp_int(const char * const info, int *p_exp, int *p_got, int count);
+
+/**
+ * @name    calc_dilute_hw
+ * @brief   calculate the diluted dimension
+ * @ingroup libbmutils
+ *
+ * @param [in] h       origin dimension
+ * @param [in] ins_h   scaling factor, 0 -> no scaling
+ * @param [in] ins_h_l compensation value after the last value in each row
+ * @param [in] pad_h_b extra padding left or bottom
+ * @param [in] pad_h_t extra padding right or top
+ *
+ * @retval diluted value
+ */
+int calc_dilute_hw(int h, int ins_h, int ins_h_l, int pad_h_b, int pad_h_t);
+
+/**
+ * @name    calc_output_hw
+ * @brief   calculate the output dimension from kernel and stride size
+ * @ingroup libbmutils
+ *
+ * @param [in] hw     origin dimension
+ * @param [in] khw    kernel size
+ * @param [in] stride stride size
+ *
+ * @retval output dimension
+ */
+int calc_output_hw(int hw, int khw, int stride);
+
+/**
+ * @name    fill_pad_fmap_fp32
+ * @brief   fill a padded feature map from an unpadded map
+ * @ingroup libbmutils
+ *
+ * @param [in]  before     input array
+ * @param [out] pafter     output array reference; if NULL, alloc a new one
+ * @param [in]  pad_val    padding value
+ * @param [in]  pad_l      padding left size
+ * @param [in]  pad_r      padding right size
+ * @param [in]  pad_t      padding top size
+ * @param [in]  pad_b      padding bottom size
+ * @param [in]  ins_h      scaling factor h
+ * @param [in]  ins_w      scaling factor w
+ * @param [in]  ins_h_last compensation value after the last value in each row
+ * @param [in]  ins_w_last compensation value after the last value in each col
+ * @param [in]  h_before   origin height
+ * @param [in]  w_before   origin width
+ *
+ * @retval BM_SUCCESS success
+ * @retval BM_ERR_INVALID_ARGUMENT before or pafter is a null pointer
+ * @retval BM_ERR_NOMEM can't alloc a new output array
+ */
+int fill_pad_fmap_fp32(const float *before, float **after, float pad_value,
+    int pad_t, int pad_b, int pad_l, int pad_r,
+    int ins_h, int ins_w, int ins_h_last, int ins_w_last,
+    int h_before, int w_before);
+
+void native_md_scalar(float *a, float *b, float *r,
+    int N, int C, int H, int W, int op, bool result_add);
+
+void native_conv_ref(
+    const void *ifmap, void *ofmap, const void *weight,
+    int input_n, int input_c, int input_h, int input_w,
+    int output_c, int output_h, int output_w,
+    int groups,
+    int kh, int kw,
+    int dilation_h, int dilation_w,
+    int pad_h, int pad_w,
+    int stride_h, int stride_w,
+    int flip,
+    int using_bias,
+    const void *bias,
+    int result_add);
+
+void native_pooling_forward_max(
+    const float* bottom_data, float* top_data,
+    int* mask_data,
+    const int count,
+    const int num, const int channels,
+    const int height, const int width,
+    const int pooled_height, const int pooled_width,
+    const int kernel_h, const int kernel_w,
+    const int stride_h, const int stride_w,
+    const int pad_h, const int pad_w);
+
+void native_pooling_forward_ave(
+    const float* bottom_data, float* top_data,
+    const int count,
+    const int num, const int channels,
+    const int height, const int width,
+    const int pooled_height, const int pooled_width,
+    const int kernel_h, const int kernel_w,
+    const int stride_h, const int stride_w,
+    const int pad_h, const int pad_w);
+
+/*
+ * int8 version
+ */
+
+/**
+ * @name    array_cmp_int8
+ * @brief   compare the content of p_exp and p_got and print the error index
+ *          and value
+ * @ingroup libbmutils
+ *
+ * @param [in] info  information string printed when an error is encountered
+ * @param [in] p_exp expected array
+ * @param [in] p_got actual array
+ * @param [in] count length of the input arrays
+ * @retval 0   no error
+ * @retval -1  error occurred
+ */
+int array_cmp_int8(
+    const char * const info,
+    const int8_t *p_exp, const int8_t *p_got,
+    int count);
+
+/**
+ * @name    fill_pad_fmap_int8
+ * @brief   fill a padded feature map from an unpadded map
+ * @ingroup libbmutils
+ *
+ * @param [in]  before     input array
+ * @param [out] pafter     output array reference; if NULL, alloc a new one
+ * @param [in]  pad_val    padding value
+ * @param [in]  pad_l      padding left size
+ * @param [in]  pad_r      padding right size
+ * @param [in]  pad_t      padding top size
+ * @param [in]  pad_b      padding bottom size
+ * @param [in]  ins_h      scaling factor h
+ * @param [in]  ins_w      scaling factor w
+ * @param [in]  ins_h_last compensation value after the last value in each row
+ * @param [in]  ins_w_last compensation value after the last value in each col
+ * @param [in]  h_before   origin height
+ * @param [in]  w_before   origin width
+ *
+ * @retval BM_SUCCESS success
+ * @retval BM_ERR_INVALID_ARGUMENT before or pafter is a null pointer
+ * @retval BM_ERR_NOMEM can't alloc a new output array
+ */
+int fill_pad_fmap_int8(
+    const int8_t *before, int8_t **pafter, int pad_val,
+    int pad_l, int pad_r, int pad_t, int pad_b,
+    int ins_h, int ins_w, int ins_h_last, int ins_w_last,
+    int h_before, int w_before);
+
+int fill_pad_fmap_bf16(
+    const unsigned short *before, unsigned short **pafter, int pad_val,
+    int pad_l, int pad_r, int pad_t, int pad_b,
+    int ins_h, int ins_w, int ins_h_last, int ins_w_last,
+    int h_before, int w_before);
+
+/**
+ * @name    fill_int_with_int8
+ * @brief   (int) pdest[i] = (int8_t)psrc[i] for each element
+ * @ingroup libbmutils
+ *
+ * @param [out] pdest output array
+ * @param [in]  psrc  input array
+ * @param [in]  len   length of the input array
+ */
+void fill_int_with_int8(int* pdest, int8_t * psrc, int len);
+
+/**
+ * @name    fill_int_with_uint8
+ * @brief   (int) pdest[i] = (uint8_t)psrc[i] for each element
+ * @ingroup libbmutils
+ *
+ * @param [out] pdest output array
+ * @param [in]  psrc  input array
+ * @param [in]  len   length of the input array
+ */
+void fill_int_with_uint8(int *pdest, uint8_t *psrc, int len);
+
+/**
+ * @name    fill_int_with_int16
+ * @brief   (int) pdest[i] = (int16_t)psrc[i] for each element
+ * @ingroup libbmutils
+ *
+ * @param [out] pdest output array
+ * @param [in]  psrc  input array
+ * @param [in]  len   length of the input array
+ */
+void fill_int_with_int16(int* pdest, int16_t* psrc, int len);
+
+void native_md_scalar_int8(int8_t *a, int8_t *b, int8_t *r,
+    int N, int C, int H, int W, int op, bool result_add);
+
+/**
+ * @name    inner_product
+ * @brief   inner product of two arrays
+ * @ingroup libbmutils
+ *
+ * @param [in] a input array 0
+ * @param [in] b input array 1
+
+/**
+ * @name    fill_int_with_int8
+ * @brief   pdest[i] = (int)psrc[i] for each element
+ * @ingroup libbmutils
+ *
+ * @param [out] pdest  output array
+ * @param [in]  psrc   input array
+ * @param [in]  len    length of input array
+ */
+void fill_int_with_int8(int* pdest, int8_t * psrc, int len);
+
+/**
+ * @name    fill_int_with_uint8
+ * @brief   pdest[i] = (int)psrc[i] for each element
+ * @ingroup libbmutils
+ *
+ * @param [out] pdest  output array
+ * @param [in]  psrc   input array
+ * @param [in]  len    length of input array
+ */
+void fill_int_with_uint8(int *pdest, uint8_t *psrc, int len);
+
+/**
+ * @name    fill_int_with_int16
+ * @brief   pdest[i] = (int)psrc[i] for each element
+ * @ingroup libbmutils
+ *
+ * @param [out] pdest  output array
+ * @param [in]  psrc   input array
+ * @param [in]  len    length of input array
+ */
+void fill_int_with_int16(int* pdest, int16_t* psrc, int len);
+
+void native_md_scalar_int8(int8_t *a, int8_t *b, int8_t *r,
+    int N, int C, int H, int W, int op, bool result_add);
+
+/**
+ * @name    inner_product
+ * @brief   inner product of two arrays
+ * @ingroup libbmutils
+ *
+ * @param [in]  a    input array 0
+ * @param [in]  b    input array 1
+ * @param [in]  len  length of a or b
+ * @param [out] c    stores the summation
+ */
+void inner_product(const int* a, const int* b, int len, int *c);
+void inner_float_product(const float* a, const float* b, int len, float *c);
+
+/**
+ * @name    native_conv_int8
+ * @brief   do convolution on a specific 8-bit feature map
+ * @ingroup libbmutils
+ *
+ * @param [in]  ifmap          input array
+ * @param [in]  weight         weight data array
+ * @param [in]  bias           bias array; if !NULL, add bias
+ * @param [out] ofmap          output array
+ * @param [in]  in             input batch size
+ * @param [in]  ic             input channel size
+ * @param [in]  ih             input height
+ * @param [in]  iw             input width
+ * @param [in]  oc             output channel size
+ * @param [in]  kh             kernel height
+ * @param [in]  kw             kernel width
+ * @param [in]  dh             kernel dilation height factor
+ * @param [in]  dw             kernel dilation width factor
+ * @param [in]  pad_h_t        padding top size
+ * @param [in]  pad_h_b        padding bottom size
+ * @param [in]  pad_w_l        padding left size
+ * @param [in]  pad_w_r        padding right size
+ * @param [in]  stride_h       stride height
+ * @param [in]  stride_w       stride width
+ * @param [in]  ins_h          insert extra elements for each i_fmap row
+ * @param [in]  ins_w          insert extra elements for each i_fmap col
+ * @param [in]  ins_h_last     insert extra elements for the last i_fmap row
+ * @param [in]  ins_w_last     insert extra elements for the last i_fmap col
+ * @param [in]  input_sign     i_fmap data type. 0 => signed, 1 => unsigned
+ * @param [in]  r_shift_width  scale bit for saturation
+ * @param [in]  do_relu        apply relu to the result if set
+ *
+ * @retval BM_SUCCESS  success
+ * @retval other       saturation failed
+ */
+int native_conv_int8(
+    const int8_t *ifmap, const int8_t *weight, const int16_t *bias,
+    int8_t *ofmap,
+    int in, int ic, int ih, int iw, int oc,
+    int kh, int kw, int dh, int dw,
+    int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r,
+    int stride_h, int stride_w,
+    int ins_h, int ins_w,
+    int ins_h_last, int ins_w_last,
+    int input_sign, int r_shift_width, int do_relu);
+
+/**
+ * @name    native_fc_int8
+ * @brief   do a fully-connected layer for a specific feature map
+ * @ingroup libbmutils
+ *
+ * @param [in]  L               input array
+ * @param [in]  R               weight array
+ * @param [in]  B               bias array; if !NULL, add bias
+ * @param [in]  Y               accumulation array; if !NULL, add this
+ * @param [out] Y_ref           output array
+ * @param [in]  L_row_num       input row size
+ * @param [in]  L_col_num       input col size
+ * @param [in]  R_col_num       weight col size
+ * @param [in]  L_sign          whether L data is signed
+ * @param [in]  R_sign          whether R data is signed
+ * @param [in]  B_sign          whether B data is signed
+ * @param [in]  l_shift_width   left shift bits applied to the accumulation
+ * @param [in]  r_shift_width   right shift bits (scale bit for saturation)
+ * @param [in]  is_result_int8  saturate the result to int8 if set
+ * @param [in]  do_relu         apply relu to the result if set
+ *
+ * @retval BM_SUCCESS  success
+ * @retval other       saturation failed
+ */
+int native_fc_int8(
+    const int8_t *L, const int8_t *R, const int16_t *B, const int16_t *Y,
+    int *Y_ref,
+    int L_row_num, int L_col_num, int R_col_num,
+    int L_sign, int R_sign, int B_sign,
+    int l_shift_width, int r_shift_width,
+    int is_result_int8, int do_relu);
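+
+/*
+ * Illustrative note: for native_conv_int8 above and the pooling helpers
+ * below, the output spatial size follows the usual convolution formula
+ * (with the dilated kernel extent kh_ext = dh * (kh - 1) + 1):
+ *   oh = (ih + pad_h_t + pad_h_b - kh_ext) / stride_h + 1
+ *   ow = (iw + pad_w_l + pad_w_r - kw_ext) / stride_w + 1
+ */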
+
+/**
+ * @name    native_pooling_ave_int8
+ * @brief   do average pooling for a specific feature map
+ * @ingroup libbmutils
+ *
+ * @param [in]  i_fmap         input array
+ * @param [in]  weight         weight data array
+ * @param [in]  bias           bias array; if !NULL, add bias
+ * @param [out] o_fmap         output array
+ * @param [in]  pad_h_t        padding top size
+ * @param [in]  pad_h_b        padding bottom size
+ * @param [in]  pad_w_l        padding left size
+ * @param [in]  pad_w_r        padding right size
+ * @param [in]  stride_h       stride height
+ * @param [in]  stride_w       stride width
+ * @param [in]  ins_h          insert extra elements for each i_fmap row
+ * @param [in]  ins_w          insert extra elements for each i_fmap col
+ * @param [in]  ins_h_last     insert extra elements for the last i_fmap row
+ * @param [in]  ins_w_last     insert extra elements for the last i_fmap col
+ * @param [in]  input_sign     i_fmap data type. 0 => signed, 1 => unsigned
+ * @param [in]  satu_sign      saturation data type. 0 => unsigned, 1 => signed
+ * @param [in]  r_shift_width  scale bit for saturation
+ * @param [in]  const_weight   set if the weight array holds one uint8_t value
+ *
+ * @retval BM_SUCCESS               success
+ * @retval BM_ERR_INVALID_ARGUMENT  illegal kh/kw or r_shift_width
+ */
+int native_pooling_ave_int8(
+    const int8_t *i_fmap,
+    const void *weight,
+    const int16_t *bias,
+    int8_t* o_fmap,
+    int input_n, int input_c, int input_h, int input_w,
+    int kh, int kw,
+    int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r,
+    int stride_h, int stride_w,
+    int ins_w, int ins_h,
+    int ins_w_last, int ins_h_last,
+    int input_sign, int satu_sign,
+    int r_shift_width, int const_weight);
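+
+/*
+ * Caution: unlike native_conv_int8 above and native_pooling_max_int8 below,
+ * this signature takes ins_w before ins_h (and ins_w_last before ins_h_last),
+ * so positional call sites must order the arguments accordingly.
+ */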
+
+/**
+ * @name    native_pooling_max_int8
+ * @brief   do max pooling for a specific feature map
+ * @ingroup libbmutils
+ *
+ * @param [in]  i_fmap      input array
+ * @param [out] o_fmap      output array
+ * @param [in]  pad_h_t     padding top size
+ * @param [in]  pad_h_b     padding bottom size
+ * @param [in]  pad_w_l     padding left size
+ * @param [in]  pad_w_r     padding right size
+ * @param [in]  stride_h    stride height
+ * @param [in]  stride_w    stride width
+ * @param [in]  ins_h       insert extra elements for each i_fmap row
+ * @param [in]  ins_w       insert extra elements for each i_fmap col
+ * @param [in]  ins_h_last  insert extra elements for the last i_fmap row
+ * @param [in]  ins_w_last  insert extra elements for the last i_fmap col
+ * @param [in]  input_sign  i_fmap data type. 0 => unsigned, 1 => signed
+ *
+ * @retval BM_SUCCESS               success
+ * @retval BM_ERR_INVALID_ARGUMENT  illegal ins_h/w or ins_[hw]_last
+ */
+int native_pooling_max_int8(
+    const int8_t* i_fmap,
+    int8_t* o_fmap,
+    int input_n, int input_c, int input_h, int input_w,
+    int kh, int kw,
+    int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r,
+    int stride_h, int stride_w,
+    int ins_h, int ins_w,
+    int ins_h_last, int ins_w_last,
+    int input_sign);
+
+int native_pooling_max_fp32(
+    const float* i_fmap,
+    float* o_fmap,
+    int input_n, int input_c, int input_h, int input_w,
+    int kh, int kw,
+    int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r,
+    int stride_h, int stride_w,
+    int ins_h, int ins_w,
+    int ins_h_last, int ins_w_last);
+
+int native_pooling_min_int8(
+    const int8_t* i_fmap,
+    int8_t* o_fmap,
+    int input_n, int input_c, int input_h, int input_w,
+    int kh, int kw,
+    int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r,
+    int stride_h, int stride_w,
+    int ins_h, int ins_w,
+    int ins_h_last, int ins_w_last,
+    int input_sign);
+
+int native_pooling_avg_fp32(
+    const float* i_fmap,
+    float* o_fmap,
+    int input_n, int input_c, int input_h, int input_w,
+    int kh, int kw,
+    int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r,
+    int stride_h, int stride_w,
+    int ins_h, int ins_w,
+    int ins_h_last, int ins_w_last,
+    float avg_pooling_const);
+
+int native_depthwise_fp32(
+    const float *ifmap, const float *weight, const float *bias, float *ofmap,
+    int in, int ic, int ih, int iw,
+    int kh, int kw, int dh, int dw,
+    int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r,
+    int stride_h, int stride_w,
+    int ins_h, int ins_w,
+    int ins_h_last, int ins_w_last);
+
+/**
+ * @name    satu_2_8bit
+ * @brief   saturate each signed or unsigned 8-bit element in an array
+ * @ingroup libbmutils
+ *
+ * @param [in]  pBuff        input array
+ * @param [in]  len          length of input array
+ * @param [out] pByteOut     output array
+ * @param [in]  rshiftbits   right shift bits if round_floor && value != 0
+ * @param [in]  round_floor  enable floor rounding
+ * @param [in]  sign_unsign  0 => unsigned, 1 => signed
+ *
+ * @retval BM_SUCCESS               success
+ * @retval BM_ERR_INVALID_ARGUMENT  rshiftbits < 0
+ */
+int satu_2_8bit(
+    const int* pBuff, int len, int8_t* pByteOut, int rshiftbits,
+    int round_floor, int sign_unsign);
+
+/**
+ * @name    satu_2_16bit
+ * @brief   saturate each signed or unsigned 16-bit element in an array
+ * @ingroup libbmutils
+ *
+ * @param [in]  pBuff        input array
+ * @param [in]  len          length of input array
+ * @param [out] pByteOut     output array
+ * @param [in]  rshiftbits   right shift bits if round_floor && value != 0
+ * @param [in]  round_floor  enable floor rounding
+ * @param [in]  sign_unsign  0 => unsigned, 1 => signed
+ *
+ * @retval BM_SUCCESS               success
+ * @retval BM_ERR_INVALID_ARGUMENT  rshiftbits < 0
+ */
+int satu_2_16bit(
+    const int* pBuff, int len, short* pByteOut,
+    int rshiftbits, int round_floor, int sign_unsign);
+
+void relu(int32_t *buf, uint32_t size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _BM_NATIVE_REF_H_ */
diff --git a/cviruntime/test/test_utils/test_neuron_dump.hpp b/cviruntime/test/test_utils/test_neuron_dump.hpp
new file mode 100644
index 000000000..1998f4ca9
--- /dev/null
+++ b/cviruntime/test/test_utils/test_neuron_dump.hpp
@@ -0,0 +1,46 @@
+#ifndef __BM_NEURON_DUMP_HPP_
+#define __BM_NEURON_DUMP_HPP_
+
+#include <stdint.h>
+#include <stdio.h>
+
+typedef uint32_t u32;
+
+#define DEBUG_NEURON_DUMP 0
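+
+// Usage sketch (hypothetical call site): set DEBUG_NEURON_DUMP to 1 and call
+//   neuron_dump("ofmap", 1, 16, 8, 8, ofmap_int8);
+// to print a tensor in NCHW order; with the flag left at 0 the stub below
+// compiles the call away.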
%d, %d, %d)\n", str, n, c, h, w); + for (u32 ni = 0; ni < n; ni ++) { + for (u32 ci = 0; ci < c; ci ++) { + printf("n = %d, c = %d\n", ni, ci); + for(u32 hi = 0; hi < h; hi++) { + printf("\t| "); + for(u32 wi = 0; wi < w; wi ++) { + u32 n_stride = c * h * w; + u32 c_stride = h * w; + u32 h_stride = w; + u32 index = ni * n_stride + ci * c_stride + hi * h_stride + wi; + printf("%4d ", data[index]); + } + printf("| \n"); + } + } + } +} + +#else +template +static void neuron_dump(const char str[], u32 n, u32 c, u32 h, u32 w, dtype data[]) { + (void)str; + (void)n; + (void)c; + (void)h; + (void)w; + (void)data; + return; +} + +#endif //DEBUG_NEURON_DUMP +#endif //__BM_NEURON_DUMP_HPP_ diff --git a/cviruntime/test/test_utils/test_tf_quant_util.c b/cviruntime/test/test_utils/test_tf_quant_util.c new file mode 100644 index 000000000..c64528c38 --- /dev/null +++ b/cviruntime/test/test_utils/test_tf_quant_util.c @@ -0,0 +1,149 @@ +#include +#include +#include +#include +#include +#include "test_tf_quant_util.h" + +// Correctly-rounded-to-nearest division by a power-of-two. +// Also known as a rounding arithmetic right shift. +int32_t RoundingDivideByPOT(int32_t x, int exponent) +{ + const int32_t mask = (1ll << exponent) - 1; + const int32_t remainder = x & mask; + const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0); + return ((x >> exponent) + ((remainder > threshold) ? 1 : 0)); +} + +int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b) +{ + int64_t a_64 = a; + int64_t b_64 = b; + int64_t ab_64 = a_64 * b_64; + int32_t nudge = ab_64 >= 0 ? (1 << 30) : (1 - (1 << 30)); + int32_t ab_x2_high32 = (int32_t)((ab_64 + nudge) / (1ll << 31)); + + return ab_x2_high32; +} + +int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier, + int rshift) +{ + int left_shift = rshift > 0 ? 0 : -rshift; + int right_shift = rshift > 0 ? rshift : 0; + + int32_t x1 = SaturatingRoundingDoublingHighMul(x * (1 << left_shift), + quantized_multiplier); + int32_t x2 = RoundingDivideByPOT(x1, right_shift); + + return x2; +} + +// 1880v2: 5bit right shift +// 1822: 6bit left/right shift, 1b sign, 5bit shift +static uint8_t pack_6b_shift(int8_t rshift) +{ + // 8b -> 6b + uint8_t sign_bit = (((uint8_t)rshift) & 0x80) >> 7; // 1bit + uint8_t shift_val = ((uint8_t)rshift) & 0x1f; // 5bit + uint8_t val = (sign_bit << 5) | shift_val; + + return val; +} + +void pack_chl_quan_param(uint32_t channels, int has_bias, int32_t *bias, + uint32_t *multiplier, int8_t *rshift, + uint8_t *packed_data) +{ + uint8_t *ptr = packed_data; + + for (uint32_t i = 0; i < channels; i++) { + if (has_bias) { + uint32_t val = (uint32_t)bias[i]; + *ptr = val & 0xff; + ptr++; + *ptr = (val >> 8) & 0xff; + ptr++; + *ptr = (val >> 16) & 0xff; + ptr++; + *ptr = (val >> 24) & 0xff; + ptr++; + } + + { + uint32_t val = multiplier[i]; + *ptr = val & 0xff; + ptr++; + *ptr = (val >> 8) & 0xff; + ptr++; + *ptr = (val >> 16) & 0xff; + ptr++; + *ptr = (val >> 24) & 0xff; + ptr++; + } + + { + uint8_t val = pack_6b_shift(rshift[i]); + *ptr = val; + ptr++; + } + } +} + +void QuantizeMultiplierSmallerThanOne(float real_multiplier, + uint32_t *quantized_multiplier, + int *right_shift) +{ + assert(real_multiplier > 0.f); + assert(real_multiplier < 1.f); + int s = 0; + // We want to bring the real multiplier into the interval [1/2, 1). + // We can do so by multiplying it by two, and recording how many times + // we multiplied by two so that we can compensate that by a right + // shift by the same amount. 
+  while (real_multiplier < 0.5f) {
+    real_multiplier *= 2.0f;
+    s++;
+  }
+  // Now that the real multiplier is in [1/2, 1), we convert it
+  // into a fixed-point number.
+  int64_t q = (int64_t)(round(real_multiplier * (1ll << 31)));
+  assert(q <= (1ll << 31));
+  // Handle the special case when the real multiplier was so close to 1
+  // that its fixed-point approximation was indistinguishable from 1.
+  // We handle this by dividing it by two, and remembering to decrement
+  // the right shift amount.
+  if (q == (1ll << 31)) {
+    q /= 2;
+    s--;
+  }
+  assert(s >= 0);
+  assert(q <= (int64_t)LONG_MAX);
+  *quantized_multiplier = (uint32_t)q;
+  *right_shift = s;
+
+#ifdef ENABLE_DEBUG_MSG
+  printf(
+      "    QuantizeMultiplierSmallerThanOne: %f -> multiplier %u, rshift %d\n",
+      real_multiplier, *quantized_multiplier, *right_shift);
+#endif
+}
+
+int8_t truncate_rshift(int8_t rshift, int8_t allow_lshift)
+{
+  int8_t lower_bound = 0;
+  int8_t upper_bound = 0;
+  const int8_t BITS = 6;
+
+  if (rshift < 0) {
+    printf("truncate_rshift rshift %d\n", rshift);
+  }
+
+  upper_bound = (1 << (BITS - 1)) - 1;
+  lower_bound = allow_lshift ? (-1 * (1 << (BITS - 1))) : 0;
+
+  rshift = (rshift < lower_bound) ? lower_bound : rshift;
+  rshift = (rshift > upper_bound) ? upper_bound : rshift;
+
+  return rshift;
+}
diff --git a/cviruntime/test/test_utils/test_tf_quant_util.h b/cviruntime/test/test_utils/test_tf_quant_util.h
new file mode 100644
index 000000000..f81e959db
--- /dev/null
+++ b/cviruntime/test/test_utils/test_tf_quant_util.h
@@ -0,0 +1,40 @@
+#ifndef TEST_TF_QUANT_UTIL_H
+#define TEST_TF_QUANT_UTIL_H
+
+#include <stdint.h>
+
+#define MAX(a,b) \
+  ({ __typeof__ (a) _a = (a); \
+     __typeof__ (b) _b = (b); \
+     _a > _b ? _a : _b; })
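+
+// Note: MAX/MIN are GNU statement-expression macros; __typeof__ binds each
+// argument to a local once, so an expression like MAX(i++, j) evaluates
+// each argument a single time.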
+#define MIN(a,b) \
+  ({ __typeof__ (a) _a = (a); \
+     __typeof__ (b) _b = (b); \
+     _a > _b ? _b : _a; })
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int32_t RoundingDivideByPOT(int32_t x, int exponent);
+int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b);
+int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier,
+                                      int rshift);
+void QuantizeMultiplierSmallerThanOne(float real_multiplier,
+                                      uint32_t *quantized_multiplier,
+                                      int *right_shift);
+
+void pack_chl_quan_param(uint32_t channels, int has_bias, int32_t *bias,
+                         uint32_t *multiplier, int8_t *rshift,
+                         uint8_t *packed_data);
+
+// 1880v2: 5bit right shift, [0, 31]
+// 1822:   1bit sign, 5bit shift, [-32, 31]
+int8_t truncate_rshift(int8_t rshift, int8_t allow_lshift);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // TEST_TF_QUANT_UTIL_H
diff --git a/cviruntime/tool/CMakeLists.txt b/cviruntime/tool/CMakeLists.txt
new file mode 100644
index 000000000..4f5365feb
--- /dev/null
+++ b/cviruntime/tool/CMakeLists.txt
@@ -0,0 +1,40 @@
+cmake_minimum_required(VERSION 2.8.0)
+
+include_directories(${PROJECT_SOURCE_DIR}/../../third_party/cnpy)
+if(DEFINED CNPY_PATH)
+  include_directories(${CNPY_PATH}/include)
+  link_directories(${CNPY_PATH}/lib)
+endif()
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+include_directories(${PROJECT_SOURCE_DIR}/include)
+if(CMAKE_CROSSCOMPILING)
+  include_directories(${CMAKE_SYSROOT}/include)
+  link_directories(${CNPY_PATH})
+endif()
+
+set(CVI_LIBS ${CVI_LIBS} cviruntime)
+
+set(CVI_LIBS ${CVI_LIBS} cnpy z)
+add_executable(model_runner model_runner.cpp)
+target_link_libraries(model_runner ${CVI_LIBS} ${EXTRA_LIBS})
+
+add_executable(multi_model_tester multi_model_tester.cpp)
+target_link_libraries(multi_model_tester ${CVI_LIBS} ${EXTRA_LIBS})
+
+add_executable(multi_thread_tester multi_thread_tester.cpp)
+target_link_libraries(multi_thread_tester ${CVI_LIBS} ${EXTRA_LIBS})
+
+add_executable(model_interface_tester model_interface_tester.cpp)
+target_link_libraries(model_interface_tester ${CVI_LIBS} ${EXTRA_LIBS})
+
+add_executable(stress_tester stress_tester.cpp)
+target_link_libraries(stress_tester ${CVI_LIBS} ${EXTRA_LIBS})
+
+add_executable(cvimodel_tool cvimodel_tool.cpp md5.cpp)
+target_link_libraries(cvimodel_tool cviruntime)
+
+install(TARGETS model_runner
+        multi_model_tester cvimodel_tool
+        model_interface_tester stress_tester
+        multi_thread_tester
+        DESTINATION bin)
\ No newline at end of file
diff --git a/cviruntime/tool/argparse.hpp b/cviruntime/tool/argparse.hpp
new file mode 100644
index 000000000..2d508e14d
--- /dev/null
+++ b/cviruntime/tool/argparse.hpp
@@ -0,0 +1,602 @@
+#ifndef ARGPARSE_HPP_
+#define ARGPARSE_HPP_
+
+#include <string>
+#if __cplusplus >= 201103L
+#include <unordered_map>
+typedef std::unordered_map<std::string, size_t> IndexMap;
+#else
+#include <map>
+typedef std::map<std::string, size_t> IndexMap;
+#endif
+#include <algorithm>
+#include <cctype>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <typeinfo>
+#include <vector>
+
+namespace argparse {
+// Modified from https://github.com/davisking/dlib/blob/master/dlib/algs.h
+template <typename T>
+struct is_standard_type {
+  const static bool value = false;
+};
+
+template <>
+struct is_standard_type<char> {
+  const static bool value = true;
+};
+template <>
+struct is_standard_type<unsigned char> {
+  const static bool value = true;
+};
+template <>
+struct is_standard_type<short> {
+  const static bool value = true;
+};
+template <>
+struct is_standard_type<unsigned short> {
+  const static bool value = true;
+};
+template <>
+struct is_standard_type<int> {
+  const static bool value = true;
+};
+template <>
+struct is_standard_type<unsigned int> {
+  const static bool value = true;
+};
+template <>
+struct is_standard_type<long> {
+  const static bool value = true;
+};
+template <>
+struct is_standard_type<unsigned long> {
+  const static bool value = true;
+};
+template <>
+struct is_standard_type<long long> {
+  const static bool value = true;
+};
+template <>
+struct is_standard_type<unsigned long long> {
+  const static bool value = true;
+};
+template <>
+struct is_standard_type<float> {
+  const static bool value = true;
+};
+template <>
+struct is_standard_type<double> {
+  const static bool value = true;
+};
+template <>
+struct is_standard_type<std::string> {
+  const static bool value = true;
+};
+
+// Copied from https://github.com/davisking/dlib/blob/master/dlib/enable_if.h
+template <bool B, class T = void>
+struct enable_if_c {
+  typedef T type;
+};
+
+template <class T>
+struct enable_if_c<false, T> {};
+
+template <class Cond, class T = void>
+struct enable_if : public enable_if_c<Cond::value, T> {};
+
+template <bool B, class T = void>
+struct disable_if_c {
+  typedef T type;
+};
+
+template <class T>
+struct disable_if_c<true, T> {};
+
+template <class Cond, class T = void>
+struct disable_if : public disable_if_c<Cond::value, T> {};
+
+template <typename T>
+T castTo(const std::string &item) {
+  std::istringstream sin(item);
+  T value;
+  sin >> value;
+  return value;
+}
+
+template <typename T>
+std::string toString(const T &item) {
+  std::ostringstream sout;
+  sout << item;
+  return sout.str();
+}
+
+void remove_space(std::string &str) {
+  str.erase(std::remove_if(str.begin(), str.end(),
+                           [](unsigned char x) { return std::isspace(x); }),
+            str.end());
+}
+
+void strip_brackets(std::string &str) {
+  auto first_bracket = str.find_first_of('[');
+  if (first_bracket == std::string::npos) {
+    std::ostringstream sout;
+    sout << "Could not find a left bracket in " << str;
+    throw std::runtime_error(sout.str());
+  }
+  str.erase(str.begin() + first_bracket);
+
+  auto last_bracket = str.find_last_of(']');
+  if (last_bracket == std::string::npos) {
+    std::ostringstream sout;
+    sout << "Could not find a right bracket in " << str;
+    throw std::runtime_error(sout.str());
+  }
+  str.erase(str.begin() + last_bracket);
+}
+
+/*! @class ArgumentParser
+ *  @brief A simple command-line argument parser based on the design of
+ *  python's parser of the same name.
+ *
+ *  ArgumentParser is a simple C++ class that can parse arguments from
+ *  the command-line or any array of strings. The syntax is familiar to
+ *  anyone who has used python's ArgumentParser:
+ *  \code
+ *    // create a parser and add the options
+ *    ArgumentParser parser;
+ *    parser.addArgument("-n", "--name");
+ *    parser.addArgument("--inputs", '+');
+ *
+ *    // parse the command-line arguments
+ *    parser.parse(argc, argv);
+ *
+ *    // get the inputs and iterate over them
+ *    string name = parser.retrieve<string>("name");
+ *    vector<string> inputs = parser.retrieve<vector<string>>("inputs");
+ *  \endcode
+ *  https://github.com/jiwoong-choi/argparse/blob/master/argparse.hpp
+ */
+class ArgumentParser {
+private:
+  class Argument;
+  typedef std::string String;
+  typedef std::vector<String> StringVector;
+  typedef std::vector<Argument> ArgumentVector;
+
+  // --------------------------------------------------------------------------
+  // Argument
+  // --------------------------------------------------------------------------
+  static String delimit(const String &name) {
+    return String(std::min(name.size(), (size_t)2), '-').append(name);
+  }
+  static String strip(const String &name) {
+    size_t begin = 0;
+    begin += name.size() > 0 ? name[0] == '-' : 0;
+    begin += name.size() > 3 ?
name[1] == '-' : 0; + return name.substr(begin); + } + static String upper(const String &in) { + String out(in); + std::transform(out.begin(), out.end(), out.begin(), ::toupper); + return out; + } + static String escape(const String &in) { + String out(in); + if (in.find(' ') != std::string::npos) + out = String("\"").append(out).append("\""); + return out; + } + + struct Argument { + Argument() + : short_name(""), name(""), optional(true), fixed_nargs(0), + fixed(true) {} + Argument(const String &_short_name, const String &_name, bool _optional, + char nargs) + : short_name(_short_name), name(_name), optional(_optional) { + if (nargs == '+' || nargs == '*') { + variable_nargs = nargs; + fixed = false; + } else { + fixed_nargs = nargs; + fixed = true; + } + } + String short_name; + String name; + bool optional; + union { + size_t fixed_nargs; + char variable_nargs; + }; + bool fixed; + bool specified = false; + String canonicalName() const { return (name.empty()) ? short_name : name; } + String toString(bool named = true) const { + std::ostringstream s; + String uname = + name.empty() ? upper(strip(short_name)) : upper(strip(name)); + if (named && optional) + s << "["; + if (named) + s << canonicalName(); + if (fixed) { + size_t N = std::min((size_t)3, fixed_nargs); + for (size_t n = 0; n < N; ++n) + s << " " << uname; + if (N < fixed_nargs) + s << " ..."; + } + if (!fixed) { + s << " "; + if (variable_nargs == '*') + s << "["; + s << uname << " "; + if (variable_nargs == '+') + s << "["; + s << uname << "...]"; + } + if (named && optional) + s << "]"; + return s.str(); + } + }; + + void insertArgument(const Argument &arg) { + size_t N = arguments_.size(); + arguments_.push_back(arg); + if (arg.fixed && arg.fixed_nargs <= 1) { + variables_.push_back(String()); + } else { + variables_.push_back(String()); + } + if (!arg.short_name.empty()) + index_[arg.short_name] = N; + if (!arg.name.empty()) + index_[arg.name] = N; + if (!arg.optional) + required_++; + } + + // -------------------------------------------------------------------------- + // Error handling + // -------------------------------------------------------------------------- + void argumentError(const std::string &msg, bool show_usage = false) { + if (use_exceptions_) + throw std::invalid_argument(msg); + std::cerr << "ArgumentParser error: " << msg << std::endl; + if (show_usage) + std::cerr << usage() << std::endl; + exit(-5); + } + + // -------------------------------------------------------------------------- + // Member variables + // -------------------------------------------------------------------------- + IndexMap index_; + bool ignore_first_; + bool use_exceptions_; + size_t required_; + String app_name_; + String final_name_; + ArgumentVector arguments_; + StringVector variables_; + +public: + ArgumentParser() + : ignore_first_(true), use_exceptions_(false), required_(0) {} + // -------------------------------------------------------------------------- + // addArgument + // -------------------------------------------------------------------------- + void appName(const String &name) { app_name_ = name; } + void addArgument(const String &name, char nargs = 0, bool optional = true) { + if (name.size() > 2) { + Argument arg("", verify(name), optional, nargs); + insertArgument(arg); + } else { + Argument arg(verify(name), "", optional, nargs); + insertArgument(arg); + } + } + void addArgument(const String &short_name, const String &name, char nargs = 0, + bool optional = true) { + Argument arg(verify(short_name), 
verify(name), optional, nargs);
+    insertArgument(arg);
+  }
+  void addFinalArgument(const String &name, char nargs = 1,
+                        bool optional = false) {
+    final_name_ = delimit(name);
+    Argument arg("", final_name_, optional, nargs);
+    insertArgument(arg);
+  }
+  void ignoreFirstArgument(bool ignore_first) { ignore_first_ = ignore_first; }
+  String verify(const String &name) {
+    if (name.empty())
+      argumentError("argument names must be non-empty");
+    if ((name.size() == 2 && name[0] != '-') || name.size() == 3)
+      argumentError(String("invalid argument '")
+                        .append(name)
+                        .append("'. Short names must begin with '-'"));
+    if (name.size() > 3 && (name[0] != '-' || name[1] != '-'))
+      argumentError(
+          String("invalid argument '")
+              .append(name)
+              .append("'. Multi-character names must begin with '--'"));
+    return name;
+  }
+
+  // --------------------------------------------------------------------------
+  // Parse
+  // --------------------------------------------------------------------------
+  void parse(size_t argc, const char **argv) {
+    parse(StringVector(argv, argv + argc));
+  }
+
+  void parse(const StringVector &argv) {
+    // check if the app is named
+    if (app_name_.empty() && ignore_first_ && !argv.empty())
+      app_name_ = argv[0];
+
+    // set up the working set
+    Argument active;
+    Argument final =
+        final_name_.empty() ? Argument() : arguments_[index_[final_name_]];
+    size_t consumed = 0;
+    size_t nrequired = final.optional ? required_ : required_ - 1;
+    size_t nfinal = final.optional
+                        ? 0
+                        : (final.fixed ? final.fixed_nargs
+                                       : (final.variable_nargs == '+' ? 1 : 0));
+
+    // iterate over each element of the array
+    for (StringVector::const_iterator in = argv.begin() + ignore_first_;
+         in < argv.end() - nfinal; ++in) {
+      String active_name = active.canonicalName();
+      String el = *in;
+
+      // check if the element is a key
+      if (index_.count(el) == 0) {
+        // input
+        // is the current active argument expecting more inputs?
+        if (active.fixed && active.fixed_nargs <= consumed)
+          argumentError(
+              String("attempt to pass too many inputs to ").append(active_name),
+              true);
+        if (active.fixed && active.fixed_nargs == 1) {
+          variables_[index_[active_name]] = el;
+        } else {
+          String &variable = variables_[index_[active_name]];
+          StringVector value = castTo<StringVector>(variable);
+          value.push_back(el);
+          variable = toString(value);
+        }
+        consumed++;
+      } else {
+        // new key!
+        arguments_[index_[el]].specified = true;
+        // has the active argument consumed enough elements?
+        if ((active.fixed && active.fixed_nargs != consumed) ||
+            (!active.fixed && active.variable_nargs == '+' && consumed < 1))
+          argumentError(String("encountered argument ")
+                            .append(el)
+                            .append(" when expecting more inputs to ")
+                            .append(active_name),
+                        true);
+        active = arguments_[index_[el]];
+        // check if we've satisfied the required arguments
+        /*
+        if ((!active.optional) && nrequired > 0)
+          argumentError(String("encountered optional argument ")
+                            .append(el)
+                            .append(" when expecting more required arguments"),
+                        true);
+        */
+        // are there enough arguments for the new argument to consume?
+        if ((active.fixed &&
+             active.fixed_nargs > (argv.end() - in - nfinal - 1)) ||
+            (!active.fixed && active.variable_nargs == '+' &&
+             !(argv.end() - in - nfinal - 1)))
+          argumentError(String("too few inputs passed to argument ").append(el),
+                        true);
+        if (!active.optional)
+          nrequired--;
+        consumed = 0;
+      }
+    }
+
+    for (StringVector::const_iterator in =
+             std::max(argv.begin() + ignore_first_, argv.end() - nfinal);
+         in != argv.end(); ++in) {
+      String el = *in;
+      // check if we accidentally find an argument specifier
+      if (index_.count(el))
+        argumentError(String("encountered argument specifier ")
+                          .append(el)
+                          .append(" while parsing final required inputs"),
+                      true);
+      if (final.fixed && final.fixed_nargs == 1) {
+        variables_[index_[final_name_]] = el;
+      } else {
+        String &variable = variables_[index_[final_name_]];
+        StringVector value = castTo<StringVector>(variable);
+        value.push_back(el);
+        variable = toString(value);
+      }
+      nfinal--;
+    }
+
+    // check that all of the required arguments have been encountered
+    if (nrequired > 0 || nfinal > 0)
+      argumentError(
+          String("too few required arguments passed to ").append(app_name_),
+          true);
+  }
+
+  // --------------------------------------------------------------------------
+  // Retrieve
+  // --------------------------------------------------------------------------
+  template <typename T>
+  T retrieve(const String &name) {
+    if (index_.count(delimit(name)) == 0)
+      throw std::out_of_range("Key not found");
+    size_t N = index_[delimit(name)];
+    return castTo<T>(variables_[N]);
+  }
+
+  // --------------------------------------------------------------------------
+  // Properties
+  // --------------------------------------------------------------------------
+  String usage() {
+    // preamble: app name
+    std::ostringstream help;
+    help << "Usage: " << escape(app_name_);
+    size_t indent = help.str().size();
+    size_t linelength = 0;
+
+    // get the required arguments
+    for (ArgumentVector::const_iterator it = arguments_.begin();
+         it != arguments_.end(); ++it) {
+      Argument arg = *it;
+      if (arg.optional)
+        continue;
+      if (arg.name.compare(final_name_) == 0)
+        continue;
+      help << " ";
+      String argstr = arg.toString();
+      if (argstr.size() + linelength > 80) {
+        help << "\n" << String(indent, ' ');
+        linelength = 0;
+      } else {
+        linelength += argstr.size();
+      }
+      help << argstr;
+    }
+
+    // get the optional arguments
+    for (ArgumentVector::const_iterator it = arguments_.begin();
+         it != arguments_.end(); ++it) {
+      Argument arg = *it;
+      if (!arg.optional)
+        continue;
+      if (arg.name.compare(final_name_) == 0)
+        continue;
+      help << " ";
+      String argstr = arg.toString();
+      if (argstr.size() + linelength > 80) {
+        help << "\n" << String(indent, ' ');
+        linelength = 0;
+      } else {
+        linelength += argstr.size();
+      }
+      help << argstr;
+    }
+
+    // get the final argument
+    if (!final_name_.empty()) {
+      Argument arg = arguments_[index_[final_name_]];
+      String argstr = arg.toString(false);
+      if (argstr.size() + linelength > 80) {
+        help << "\n" << String(indent, ' ');
+        linelength = 0;
+      } else {
+        linelength += argstr.size();
+      }
+      help << argstr;
+    }
+
+    return help.str();
+  }
+  void useExceptions(bool state) { use_exceptions_ = state; }
+  bool empty() const { return index_.empty(); }
+  void clear() {
+    ignore_first_ = true;
+    required_ = 0;
+    index_.clear();
+    arguments_.clear();
+    variables_.clear();
+  }
+  bool exists(const String &name) const {
+    return index_.count(delimit(name)) > 0;
+  }
+  bool gotArgument(const String &name) {
+    // check if the name is an argument
+    if (index_.count(delimit(name)) == 0)
+      return false;
+    size_t N = index_[delimit(name)];
+    Argument arg = arguments_[N];
+    return arg.specified;
+  }
+};
+} // namespace argparse
+
+template <typename T>
+std::ostream &operator<<(std::ostream &out, const std::vector<T> &v) {
+  out << "[";
+  for (unsigned long i = 0; i < v.size(); ++i) {
+    if (i > 0)
+      out << ", ";
+    out << v[i];
+  }
+  out << "]";
+
+  return out;
+}
+
+template <typename T>
+typename argparse::enable_if<argparse::is_standard_type<T>,
+                             std::istream &>::type
+operator>>(std::istream &in, std::vector<T> &v) {
+  using namespace argparse;
+  v.clear();
+
+  std::string str;
+  std::getline(in, str, '\n');
+
+  if (str.empty())
+    return in;
+  remove_space(str);
+  strip_brackets(str);
+
+  std::istringstream sin(str);
+  while (sin.good()) {
+    std::string substr;
+    std::getline(sin, substr, ',');
+    if (!substr.empty())
+      v.push_back(castTo<T>(substr));
+  }
+
+  return in;
+}
+
+template <typename T>
+typename argparse::enable_if<argparse::is_standard_type<T>,
+                             std::istream &>::type
+operator>>(std::istream &in, std::vector<std::vector<T>> &v) {
+  using namespace argparse;
+  static const std::string delimiter = "]";
+  v.clear();
+
+  std::string str;
+  std::getline(in, str, '\n');
+
+  if (str.empty())
+    return in;
+  remove_space(str);
+  strip_brackets(str);
+
+  size_t pos = 0;
+  while ((pos = str.find(delimiter)) != std::string::npos) {
+    std::string substr = str.substr(0, pos + 1);
+    v.push_back(castTo<std::vector<T>>(substr));
+    str.erase(0, pos + delimiter.length());
+  }
+
+  return in;
+}
+
+#endif
diff --git a/cviruntime/tool/cvimodel_tool.cpp b/cviruntime/tool/cvimodel_tool.cpp
new file mode 100644
index 000000000..b9b433454
--- /dev/null
+++ b/cviruntime/tool/cvimodel_tool.cpp
@@ -0,0 +1,1387 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "argparse.hpp"
+#include "assert.h"
+#include "md5.hpp"
+
+#ifdef ENABLE_COMPRESS_CMDBUF
+#include "lz4.h"
+#endif
+
+using FBWeightVector =
+    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<cvi::model::Weight>>>;
+using FBTensorVector =
+    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<cvi::model::Tensor>>>;
+using FBRoutineVector =
+    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<cvi::model::Routine>>>;
+using FBProgramVector =
+    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<cvi::model::Program>>>;
+using FBSectionVector =
+    flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<cvi::model::Section>>>;
+using SectionInfoVector =
+    std::vector>;
+
+class Model {
+public:
+  Model(const std::string &model_file);
+  ~Model() {
+    if (fb_model_buffer)
+      delete[] fb_model_buffer;
+  }
+
+  void dump();
+  void extract();
+  void encrypt(std::string &encrypter, std::string &output);
+  void merge(std::vector<std::shared_ptr<Model>> &models, std::string &dst);
+  void compress_inst(std::string &output);
+
+  cvi::runtime::MODEL_HEADER header;
+  cvi::runtime::FileStream stream;
+  size_t binary_offset;
+  const cvi::model::Model *fb_model;
+
+private:
+  uint8_t *fb_model_buffer;
+  flatbuffers::FlatBufferBuilder fbb;
+  float _qscale = 0;
+  std::string input_quanted_tensor;
+
+  void storeSectionToFile(const cvi::model::Section *section, std::string dst);
+  size_t compressSectionToFile(const cvi::model::Section *section, std::string dst);
+  std::string calcSectionMD5(const cvi::model::Section *section, int size);
+  const char *sectionTypeToStr(cvi::model::SectionType type);
+  const char *dtypeToStr(cvi::model::DType type);
+  size_t dtypeSize(cvi::model::DType type);
+  bool getQscaleFromDequantCpuOp(const cvi::cpu_op::Parameter *param);
+
+  void dumpBaseInfo();
+  void dumpSections();
+  void dumpWeightMap();
+  void dumpPrograms();
+  void dumpTensors(const cvi::model::Program *p, bool oldVersion);
+  void dumpRoutines(const cvi::model::Program *p);
+
+  void extractSections();
+  FBWeightVector cloneWeightMap(std::vector<std::shared_ptr<Model>> &models);
+  FBTensorVector cloneTensorMap(const cvi::model::Program *program);
+  FBRoutineVector
+  cloneRoutines(const cvi::model::Program *program, bool rename, int index,
+                std::map<std::string, std::string> *routine_name_map);
+  FBProgramVector clonePrograms(
+      std::vector<std::shared_ptr<Model>> &models,
+      std::vector<std::map<std::string, std::string>> &routine_name_maps);
+  FBSectionVector cloneSections(
+      std::vector<std::shared_ptr<Model>> &models,
+      std::vector<uint8_t> &sections_buf,
+      std::vector<std::map<std::string, std::string>> &routine_name_maps);
+
+  FBWeightVector rebuildWeightMap();
+  FBProgramVector rebuildPrograms();
+  FBSectionVector rebuildSections(std::string dir, std::string model_name,
+                                  std::vector<uint8_t> &enc_buf);
+  FBSectionVector rebuildSections(std::string tmp_dir, std::string model_name,
+                                  std::vector<uint8_t> &out_buf,
+                                  std::map &cmpr_info);
+  int decompress_section(const cvi::model::Section *section,
+                         std::vector<char> &out_buf);
+};
+
+Model::Model(const std::string &model_file) : stream(model_file), fbb(0) {
+  if (stream.length() <= sizeof(header)) {
+    printf("Error, invalid cvimodel file\n");
+  }
+  stream.read((uint8_t *)&header, 0, sizeof(header));
+  size_t header_size;
+  /* before version 1.1, header size is 32 bytes */
+  if (header.major == 1 && header.minor == 0)
+    header_size = 0x20;
+  else
+    header_size = sizeof(cvi::runtime::MODEL_HEADER);
+  binary_offset = header_size + header.body_size;
+  fb_model_buffer = new uint8_t[header.body_size];
+  if (!fb_model_buffer) {
+    printf("Failed to allocate memory\n");
+  }
+  stream.read(fb_model_buffer, header_size, header.body_size);
+  fb_model = cvi::model::GetModel(fb_model_buffer);
+}
+
+void Model::dump() {
+  dumpBaseInfo();
+  dumpSections();
+  dumpWeightMap();
+  dumpPrograms();
+}
+
+void Model::extract() {
+  auto name = fb_model->name()->str();
+  for (auto p : *fb_model->programs()) {
+    auto neuron_size = p->neuron_size();
+    auto private_gmem_size = p->private_gmem();
+    auto shared_gmem_size = p->shared_gmem();
+    if (neuron_size) {
+      std::cout << "neuron_size: " << neuron_size << "\n";
+    } else {
+      std::cout << "private_mem_size: " << private_gmem_size << "\n";
+      std::cout << "shared_gmem_size: " << shared_gmem_size << "\n";
+    }
+    // dump cmdbuf
+    for (auto r : *p->routines()) {
+      if (r->type() != cvi::model::RoutineType_TPU)
+        continue;
+
+      auto getTensorByName = [&](std::string name) {
+        for (const auto &t : *p->tensor_map()) {
+          if (t->name()->str() == name) {
+            return t;
+          }
+        }
+        assert(0);
+      };
+
+      std::string buf_name;
+      std::string buf_type_name;
+      if (r->tpu_routine()->cmdbuf_section()) {
+        buf_name = r->tpu_routine()->cmdbuf_section()->str();
+        buf_type_name = "cmdbuf";
+      } else if (r->tpu_routine()->dmabuf_section()) {
+        buf_name = r->tpu_routine()->dmabuf_section()->str();
+        buf_type_name = "dmabuf";
+      } else {
+        assert(0 && "model has neither cmdbuf nor dmabuf");
+      }
+
+      printf("routine #%s\n", buf_name.c_str());
+      printf("   %-6s %-4s %-4s %-4s %-4s %-5s %-7s %s\n", " ", "n", "c", "h", "w",
+             "dtype", "offset", "name");
+      for (auto name : *r->in_tensors()) {
+        auto tensor = getTensorByName(name->str());
+        auto &shape = *tensor->shape()->dim();
+        printf("   %-6s %-4d %-4d %-4d %-4d %-5s %-7d %s\n", "[IN ]", (int)shape[0],
+               (int)shape[1], (int)shape[2], (int)shape[3], dtypeToStr(tensor->dtype()),
+               (int)tensor->offset(), name->c_str());
+      }
+      for (auto name : *r->out_tensors()) {
+        auto tensor = getTensorByName(name->str());
+        auto &shape = *tensor->shape()->dim();
+        printf("   %-6s %-4d %-4d %-4d %-4d %-5s %-7d %s\n", "[OUT]", (int)shape[0],
+               (int)shape[1], (int)shape[2], (int)shape[3],
dtypeToStr(tensor->dtype()), + (int)tensor->offset(), name->c_str()); + } + for (auto s : *fb_model->sections()) { + if (s->name()->str() == buf_name) { + std::string dst = name + "_program_" + std::to_string(p->batch_num()) + + "_" + buf_type_name + "_" + s->name()->str() + ".bin"; + storeSectionToFile(s, dst); + break; + } + } + } + } + // dump weight + for (auto s : *fb_model->sections()) { + if (s->type() != cvi::model::SectionType_WEIGHT) + continue; + storeSectionToFile(s, name + "_weight.bin"); + } +} + +int Model::decompress_section(const cvi::model::Section *section, std::vector& out_buf) { + if (!section->compress()) { + return 0; + } +#ifdef ENABLE_COMPRESS_CMDBUF + auto size = section->size(); + auto offset = section->offset(); + + char *in_buf = new (std::nothrow) char[size]; + if (!in_buf) { + printf("Alloc buff for decompress failed! buff size:%d\n", size); + return -1; + } + out_buf.resize(section->decompressed_size()); + + stream.read(reinterpret_cast(in_buf), binary_offset + offset, size); + + size_t rc = LZ4_decompress_safe(in_buf, out_buf.data(), size, section->decompressed_size()); + delete[] in_buf; + if (rc != section->decompressed_size()) { + printf("Decompress section failed! section name:%s\n", section->name()->c_str()); + return -1; + } + return 0; +#else + printf("Compressed section is not supported! please recompile with ENABLE_COMPRESS_CMDBUF\n"); + return -1; +#endif +} + +void Model::storeSectionToFile(const cvi::model::Section *section, std::string dst) { + auto offset = section->offset(); + auto size = section->size(); + std::ofstream of(dst, std::ofstream::out | std::ofstream::binary | + std::ofstream::trunc); + if (section->compress()) { + std::vector out_buf; + if (0 != decompress_section(section, out_buf)) { + printf("store section failed\n"); + return; + } + of.write(out_buf.data(), out_buf.size()); + } else { + uint8_t *buf = new uint8_t[1024]; + do { + auto len = size > 1024 ? 1024 : size; + stream.read(buf, binary_offset + offset, len); + of.write((const char *)buf, len); + offset += len; + size -= len; + } while (size); + of.close(); + delete[] buf; + } + printf("store section to %s\n", dst.c_str()); +} + +std::string Model::calcSectionMD5(const cvi::model::Section *section, int size) { + auto offset = section->offset(); + MD5 md5; + if (section->compress()) { + std::vector out_buf; + if (0 != decompress_section(section, out_buf)) { + printf("store section failed\n"); + return std::string(); + } + md5.update(reinterpret_cast(out_buf.data()), out_buf.size()); + } else { + uint8_t *buf = new uint8_t[1024]; + do { + auto len = size > 1024 ? 
1024 : size; + stream.read(buf, binary_offset + offset, len); + md5.update(buf, len); + offset += len; + size -= len; + } while (size); + delete[] buf; + } + return md5.finalize().hexdigest(); +} + +void Model::dumpBaseInfo() { + if (fb_model->mlir_version()) { + printf("Mlir Version: %s\n", fb_model->mlir_version()->c_str()); + } + auto version = fb_model->version(); + printf("Cvimodel Version: %d.%d.%d\n", (int)version->major_(), (int)version->minor_(), + (int)version->sub_minor()); + printf("%s Build at %s\n", fb_model->name()->c_str(), fb_model->build_time()->c_str()); + if (fb_model->target()) { + printf("For %s chip ONLY\n", fb_model->target()->c_str()); + } + + // dump peak memory usage, summary static size(weight/cmdbuf) and runtime (private_gmem_size+shared_gmem_size+io_mem) + size_t total_size = 0; + auto §ions = *fb_model->sections(); + // static + for (auto s : sections) { + total_size += s->size(); + } + + // runtime + auto &programs = *fb_model->programs(); + size_t share_size = 0; + size_t io_size = 0; + for (auto p : programs) { + total_size += p->neuron_size(); + total_size += p->private_gmem(); + if (share_size < p->shared_gmem()) { + share_size = p->shared_gmem(); + } + auto &tensor_map = *p->tensor_map(); + for (auto t : tensor_map) { + auto gaddr = (int64_t)t->offset(); + if (gaddr != -1) { + auto memTypeIndx = (gaddr >> 40) & 0x07; + bool oldVersion = p->neuron_size() > 0; + if (memTypeIndx > 1 || oldVersion) { + if (memTypeIndx > 2) { + // io_mem + auto &shape = *t->shape()->dim(); + size_t type_size = dtypeSize(t->dtype()); + size_t tensor_size = shape[0] * shape[1] * shape[2] * shape[3] * type_size; + io_size += tensor_size; + } + } + } + } + } + total_size += share_size; + total_size += io_size; + printf("CviModel Need ION Memory Size: (%.2f MB)\n", total_size / (float)(1024*1024)); +} + +void Model::dumpSections() { + printf("\nSections:\n"); + printf("%-3s %-10s%-25s%-12s%-12s%-12s%-12s%-s\n", "ID", "TYPE", "NAME", "SIZE", "OFFSET", + "ENCRYPT", "COMPRESS", "MD5"); + auto §ions = *fb_model->sections(); + int i = 0; + for (auto s : sections) { + auto type = sectionTypeToStr(s->type()); + auto name = s->name()->c_str(); + auto size = s->size(); + auto offset = s->offset(); + auto md5 = calcSectionMD5(s, s->size()); + auto encrypt = s->encrypt(); + auto compress = s->compress(); + printf("%03d %-10s%-25s%-12d%-12d%-12s%-12s%-s\n", i++, type, name, size, offset, + encrypt ? "True" : "False", compress ? 
"True" :"False", md5.c_str()); + } +} + +void Model::dumpWeightMap() { + printf("\nWeightMap:\n"); + printf("%-3s %-10s%-10s%-8s%-4s %-4s %-4s %-4s %-s\n", "ID", "OFFSET", "SIZE", "TYPE", + "N", "C", "H", "W", "NAME"); + + auto &weights = *fb_model->weight_map(); + int i = 0; + for (auto w : weights) { + auto &shape = *w->shape()->dim(); + printf("%03d %-10d%-10d%-8s%-4d %-4d %-4d %-4d %-s\n", i++, (int)w->offset(), + w->size(), dtypeToStr(w->type()), (int)shape[0], (int)shape[1], (int)shape[2], + (int)shape[3], w->name()->c_str()); + } +} + +void Model::dumpPrograms() { + auto &programs = *fb_model->programs(); + int idx = 0; + for (auto p : programs) { + auto batch_num = p->batch_num(); + auto neuron_size = p->neuron_size(); + auto private_gmem_size = p->private_gmem(); + auto shared_gmem_size = p->shared_gmem(); + auto &input_tensors = *p->input_tensors(); + auto &output_tensors = *p->output_tensors(); + printf("\nProgram #%d\n", idx++); + printf(" %-12s: %d\n", "batch_num", batch_num); + if (neuron_size) { + printf(" %-12s: %d\n", "neuron_size", neuron_size); + } else { + printf(" %-12s: %d\n", "private_gmem_size", private_gmem_size); + printf(" %-12s: %d\n", "shared_gmem_size", shared_gmem_size); + } + printf(" %-12s: ", "inputs"); + for (int i = 0; i < (int)input_tensors.size(); i++) { + if (i != 0) + printf(","); + printf("%s", input_tensors[i]->c_str()); + } + printf("\n %-12s: ", "outputs"); + for (int i = 0; i < (int)output_tensors.size(); i++) { + if (i != 0) + printf(","); + printf("%s", output_tensors[i]->c_str()); + } + printf("\n %-12s:\n", "routines"); + dumpRoutines(p); + printf("\n %-12s:\n", "tensor_map"); + // The cvimodel is old version(blow 1.1.0) + // if neuson size is greater than 0, + dumpTensors(p, neuron_size > 0); + } +} + +void Model::dumpTensors(const cvi::model::Program *p, bool oldVersion) { + printf(" "); + printf("%-3s %-12s%-6s%-4s %-4s %-4s %-4s %-10s %-7s %-s\n", "ID", "OFFSET", "TYPE", + "N", "C", "H", "W", "QSCALE", "MEM", "NAME"); + auto &tensors = *p->tensor_map(); + int i = 0; + for (auto t : tensors) { + auto &shape = *t->shape()->dim(); + std::string memType = " -"; + auto gaddr = (int64_t)t->offset(); + if (gaddr != -1) { + auto memTypeIndx = (gaddr >> 40) & 0x07; + if (memTypeIndx > 1 || oldVersion) { + if (memTypeIndx > 2) { + memType = "io_mem"; + } else { + memType = "private"; + } + } else { + memType = "shared"; + } + } + float qscale = t->quant() ? 
t->quant()->qscale() : 0; + if (t->name()->str() == input_quanted_tensor) { + qscale = _qscale; + } + printf(" "); + if (qscale <= 0.000001 || qscale > 400.0f) { + printf("%03d %-12d%-6s%-4d %-4d %-4d %-4d %-10s %-7s %-s\n", i++, (int)t->offset(), + dtypeToStr(t->dtype()), (int)shape[0], (int)shape[1], (int)shape[2], + (int)shape[3], "-", memType.c_str(), t->name()->c_str()); + } else { + printf("%03d %-12d%-6s%-4d %-4d %-4d %-4d %-10f %-7s %-s\n", i++, (int)t->offset(), + dtypeToStr(t->dtype()), (int)shape[0], (int)shape[1], (int)shape[2], + (int)shape[3], qscale, memType.c_str(), t->name()->c_str()); + } + } +} + +bool Model::getQscaleFromDequantCpuOp(const cvi::cpu_op::Parameter *param) { + std::string from; + std::string to; + float threshold = 0; + auto &attributes = *param->attributes(); + for (auto attr : attributes) { + if (attr->float_attr()) { + auto _float = attr->float_attr(); + if (_float->key()->str() == "threshold") { + threshold = _float->value(); + } + } else if (attr->str_attr()) { + auto _str = attr->str_attr(); + if (_str->key()->str() == "from") { + from = _str->value()->str(); + } else if (_str->key()->str() == "to") { + to = _str->value()->str(); + } + } + } + if (threshold != 0 && from == "NONE" && to == "INT8") { + _qscale = 128.0 / threshold; + return true; + } + return false; +} + +void Model::dumpRoutines(const cvi::model::Program *p) { + auto &routines = *p->routines(); + int i = 0; + for (auto r : routines) { + bool tpu = r->type() == cvi::model::RoutineType_TPU; + printf(" #%02d %s\n", i++, tpu ? "tpu" : "cpu"); + printf(" %-8s: ", "inputs"); + int j = 0; + for (auto name : *r->in_tensors()) { + if (j++ != 0) + printf(","); + printf("%s", name->c_str()); + } + printf("\n %-8s: ", "outputs"); + j = 0; + for (auto name : *r->out_tensors()) { + if (j++ != 0) + printf(","); + printf("%s", name->c_str()); + } + if (tpu) { + std::string buf_name; + if (r->tpu_routine()->cmdbuf_section()) { + buf_name = r->tpu_routine()->cmdbuf_section()->str(); + } else if (r->tpu_routine()->dmabuf_section()) { + buf_name = r->tpu_routine()->dmabuf_section()->str(); + } else { + assert(0 && "model has not cmdbuf and dmabuf"); + } + printf("\n %-8s: %s\n", "section", + buf_name.c_str()); + } else { + if (r->cpu_routine()->function_section()->str() == "quant" && _qscale == 0) { + auto param = cvi::cpu_op::GetParameter(r->cpu_routine()->function_args()->data()); + if (getQscaleFromDequantCpuOp(param)) { + input_quanted_tensor = (*r->out_tensors())[0]->str(); + } + } + printf("\n %-8s: %s\n", "function", + r->cpu_routine()->function_section()->c_str()); + } + } +} + +const char *Model::sectionTypeToStr(cvi::model::SectionType type) { + switch (type) { + case cvi::model::SectionType_WEIGHT: + return "weight"; + case cvi::model::SectionType_CMDBUF: + return "cmdbuf"; + case cvi::model::SectionType_DMABUF: + return "dmabuf"; + case cvi::model::SectionType_FUNC_X86: + return "x86_64"; + case cvi::model::SectionType_FUNC_AARCH64: + return "aarch64"; + default: + printf("unknown section type\n"); + } + return ""; +} + +const char *Model::dtypeToStr(cvi::model::DType type) { + switch (type) { + case cvi::model::DType_FP32: + return "fp32"; + case cvi::model::DType_INT32: + return "int32"; + case cvi::model::DType_UINT32: + return "uint32"; + case cvi::model::DType_BF16: + return "bf16"; + case cvi::model::DType_INT16: + return "int16"; + case cvi::model::DType_UINT16: + return "uint16"; + case cvi::model::DType_INT8: + return "int8"; + case cvi::model::DType_UINT8: + return "uint8"; + 
default: + printf("unknown dtype\n"); + } + return ""; +} + +size_t Model::dtypeSize(cvi::model::DType type) { + switch (type) { + case cvi::model::DType_FP32: + return 4; + case cvi::model::DType_INT32: + return 4; + case cvi::model::DType_UINT32: + return 4; + case cvi::model::DType_BF16: + return 2; + case cvi::model::DType_INT16: + return 2; + case cvi::model::DType_UINT16: + return 2; + case cvi::model::DType_INT8: + return 1; + case cvi::model::DType_UINT8: + return 1; + default: + printf("unknown dtype\n"); + } + return 0; +} + +static std::string getStrOfCurrentTime() { + std::stringstream ssTime; + auto clockNow = std::chrono::system_clock::now(); + auto t = std::chrono::system_clock::to_time_t(clockNow); + ssTime << std::put_time(std::localtime(&t), "%Y-%m-%d %H:%M:%S"); + return ssTime.str(); +} + +FBWeightVector Model::cloneWeightMap(std::vector> &models) { + std::vector> tensor_vec; + std::vector> weight_tensors; + bool redundant = false; + for (auto &model : models) { + for (auto w : *model->fb_model->weight_map()) { + redundant = false; + auto name = w->name()->c_str(); + for (auto t : weight_tensors) { + if (std::get<0>(t) == name && std::get<1>(t) == w->offset() && std::get<2>(t) == w->size()) { + redundant = true; + break; + } + } + if (redundant) continue; + std::vector dim; + for (auto s : *w->shape()->dim()) { + dim.push_back(s); + } + auto shape = cvi::model::CreateShapeDirect(fbb, &dim); + auto weight = cvi::model::CreateWeightDirect(fbb, name, w->offset(), w->size(), shape, + w->type()); + tensor_vec.push_back(weight); + weight_tensors.push_back(std::make_tuple(name, w->offset(), w->size())); + } + } + return fbb.CreateVector(tensor_vec); +} + +FBTensorVector Model::cloneTensorMap(const cvi::model::Program *program) { + std::vector> tensor_vec; + for (auto t : *program->tensor_map()) { + auto name = t->name()->c_str(); + std::vector dim; + for (auto s : *t->shape()->dim()) { + dim.push_back(s); + } + auto shape = cvi::model::CreateShapeDirect(fbb, &dim); + auto tensor = cvi::model::CreateTensorDirect(fbb, t->tensor_id(), name, t->offset(), t->dtype(), + shape, 0, 0, t->overwrote()); + if (t->quant()) { + auto quant = cvi::model::CreateQuantInfo(fbb, t->quant()->type(), 0, 0, + t->quant()->zero_point(), + t->quant()->qscale()); + std::vector scale; + if (t->scale()) { + for (int i = 0; i < (int)t->scale()->size(); ++i) { + scale.push_back(t->scale()->Get(i)); + } + } + + std::vector mean; + if (t->mean()) { + for (int i = 0; i < (int)t->mean()->size(); ++i) { + mean.push_back(t->mean()->Get(i)); + } + } + + std::string pixel_format; + if (t->pixel_format()) { + pixel_format = t->pixel_format()->str(); + } + + tensor = cvi::model::CreateTensorDirect(fbb, t->tensor_id(), name, t->offset(), t->dtype(), + shape, 0, quant, t->overwrote(), + scale.size() > 0 ? &scale : nullptr, + mean.size() > 0 ? &mean : nullptr, + pixel_format.length() > 0 ? 
pixel_format.c_str() : nullptr, + t->aligned(), t->size()); + } + tensor_vec.push_back(tensor); + } + return fbb.CreateVector(tensor_vec); +} + +FBRoutineVector +Model::cloneRoutines(const cvi::model::Program *program, bool rename, int index, + std::map *routine_name_map) { + std::vector> routines; + for (auto r : *program->routines()) { + std::vector> fbStrVec; + for (auto name : *r->in_tensors()) { + fbStrVec.push_back(fbb.CreateString(name)); + } + auto inputs = fbb.CreateVector(fbStrVec); + fbStrVec.clear(); + for (auto name : *r->out_tensors()) { + fbStrVec.push_back(fbb.CreateString(name)); + } + auto outputs = fbb.CreateVector(fbStrVec); + if (r->type() == cvi::model::RoutineType_TPU) { + flatbuffers::Offset tpuRoutine; + if (rename) { + std::stringstream new_name; + if (r->tpu_routine()->cmdbuf_section()) { + new_name << r->tpu_routine()->cmdbuf_section()->c_str() << "_" << index; + tpuRoutine = cvi::model::CreateTpuRoutineDirect( + fbb, new_name.str().c_str(), nullptr); + routine_name_map->emplace(r->tpu_routine()->cmdbuf_section()->c_str(), new_name.str()); + } else { + new_name << r->tpu_routine()->dmabuf_section()->c_str() << "_" << index; + tpuRoutine = cvi::model::CreateTpuRoutineDirect( + fbb, nullptr, new_name.str().c_str()); + routine_name_map->emplace(r->tpu_routine()->dmabuf_section()->c_str(), new_name.str()); + } + } else { + const char *cmdbuf = r->tpu_routine()->cmdbuf_section() + ? r->tpu_routine()->cmdbuf_section()->c_str() + : nullptr; + const char *dmabuf = r->tpu_routine()->dmabuf_section() + ? r->tpu_routine()->dmabuf_section()->c_str() + : nullptr; + tpuRoutine = + cvi::model::CreateTpuRoutineDirect(fbb, cmdbuf, dmabuf); + } + auto routine = cvi::model::CreateRoutine(fbb, cvi::model::RoutineType_TPU, + inputs, outputs, tpuRoutine, 0); + routines.push_back(routine); + } else { + std::vector args; + for (auto byte : *r->cpu_routine()->function_args()) { + args.push_back(byte); + } + auto cpuRoutine = cvi::model::CreateCpuRoutineDirect( + fbb, r->cpu_routine()->function_section()->c_str(), &args); + auto routine = + cvi::model::CreateRoutine(fbb, r->type(), inputs, outputs, 0, cpuRoutine); + routines.push_back(routine); + } + } + return fbb.CreateVector(routines); +} + +FBProgramVector Model::clonePrograms( + std::vector> &models, + std::vector> &routine_name_maps) { + std::vector> programs; + routine_name_maps.clear(); + for (uint32_t i = 0; i < models.size(); ++i) { + for (auto p : *models[i]->fb_model->programs()) { + auto tensor_map = cloneTensorMap(p); + std::vector> fbStrVec; + for (auto name : *p->input_tensors()) { + fbStrVec.push_back(fbb.CreateString(name)); + } + auto inputs = fbb.CreateVector(fbStrVec); + fbStrVec.clear(); + for (auto name : *p->output_tensors()) { + fbStrVec.push_back(fbb.CreateString(name)); + } + auto outputs = fbb.CreateVector(fbStrVec); + std::map routine_name_map; + auto routines = cloneRoutines(p, true, i, &routine_name_map); + routine_name_maps.emplace_back(std::move(routine_name_map)); + auto program = cvi::model::CreateProgram(fbb, p->batch_num(), p->neuron_size(), + inputs, outputs, tensor_map, routines, + p->shared_gmem(), p->private_gmem()); + programs.push_back(program); + } + } + return fbb.CreateVector(programs); +} + +typedef struct { + int id; + const cvi::model::Section* section; + uint32_t size; +} weight_section_t; + +FBSectionVector Model::cloneSections(std::vector> &models, + std::vector §ions_buf, + std::vector > &routine_name_maps) { + uint32_t offset = 0; + std::string weight_md5 = ""; + std::vector> 
section_vec; + std::vector weight_sections; + assert(models.size() == routine_name_maps.size()); + + uint8_t bit_buf_type = 0; + // first select candiate weight section + for (int i = 0; i < (int)models.size(); ++i) { + for (auto s : *models[i]->fb_model->sections()) { + if (s->type() == cvi::model::SectionType_WEIGHT) { + weight_section_t ws={0}; + ws.id = i; + ws.section = s; + ws.size = s->size(); + weight_sections.emplace_back(ws); + } else if (s->type() == cvi::model::SectionType_CMDBUF) { + bit_buf_type |= 0x01; + } else if (s->type() == cvi::model::SectionType_DMABUF) { + bit_buf_type |= 0x10; + } + } + } + + if (bit_buf_type == 0x11) { + printf("WARN: models can't include both dmabuf and cmdbuf!\n"); + exit(1); + } + + std::sort(weight_sections.begin(), weight_sections.end(), + [](weight_section_t &s1, weight_section_t &s2) { + return s1.size < s2.size; + }); + + std::vector weight_compare_size; + std::vector weight_index; + for (auto pair : weight_sections) { + weight_index.push_back(pair.id); + weight_compare_size.push_back(pair.section->size()); + } + + for (int i = weight_compare_size.size() - 1; i > 0; --i) { + weight_compare_size[i] = weight_compare_size[i-1]; + } + + int candidate_index = weight_index[weight_index.size()-1]; + + for (int i = 0; i < (int)weight_sections.size(); ++i) { + auto &ws = weight_sections[i]; + auto model = models[ws.id]; + auto section = ws.section; + auto md5 = model->calcSectionMD5(section, weight_compare_size[i]); + if (weight_md5.empty()) { + weight_md5 = md5; + } else { + if (weight_md5 != md5) { + printf("WARN: weight binary of cvimodels should be same, model index (%d) vs (%d)\n", + weight_index[i-1], weight_index[i]); + exit(1); + } else { + weight_md5 = model->calcSectionMD5(section, section->size()); + } + } + } + + printf("cvimodels weight compare pass!\n"); + + int model_index = 0; + for (uint32_t i = 0; i < models.size(); ++i) { + for (auto s : *models[i]->fb_model->sections()) { + auto md5 = models[i]->calcSectionMD5(s, s->size()); + printf("section type: %d, name: %s, size: %d, offset: %d, compress:%s " + "md5:%s\n", + (int)s->type(), s->name()->c_str(), s->size(), s->offset(), + s->compress() ? "True" : "False", md5.c_str()); + // check if weight binary of all cvimodels are same. 
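+      // Clarifying the check below: the weight sections were sorted by size
+      // and candidate_index identifies the model holding the largest weight
+      // blob; only that model's weight section is copied into the merged
+      // file, so weight sections from every other model are skipped.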
+ if (s->type() == cvi::model::SectionType_WEIGHT) { + if (model_index != candidate_index) { + continue; + } + } + + std::string section_name; + if (s->type() == cvi::model::SectionType_CMDBUF || s->type() == cvi::model::SectionType_DMABUF) { + section_name = routine_name_maps[i][s->name()->c_str()]; + assert(!section_name.empty()); + } else { + section_name = s->name()->c_str(); + } + + printf("add section, name:%s type:%d, md5:%s\n", section_name.c_str(), (int)s->type(), md5.c_str()); + auto section = cvi::model::CreateSectionDirect(fbb, s->type(), section_name.c_str(), + s->size(), offset, s->encrypt(), + s->compress(), s->decompressed_size()); + section_vec.push_back(section); + std::vector buf(s->size()); + models[i]->stream.read(buf.data(), models[i]->binary_offset + s->offset(), s->size()); + sections_buf.insert(sections_buf.end(), buf.begin(), buf.end()); + offset += s->size(); + } + model_index++; + } + return fbb.CreateVector(section_vec); +} + +void Model::merge(std::vector> &models, std::string &dst) { + cvi::model::Version modelVersion = + cvi::model::Version(cvi::model::MajorVersion_value, cvi::model::MinorVersion_value, + cvi::model::SubMinorVersion_value); + std::vector > routine_name_maps; + auto modelName = fbb.CreateString(fb_model->name()); + auto modelBuildTime = fbb.CreateString(getStrOfCurrentTime()); + auto modelMlirVersion = fb_model->mlir_version() ? + fbb.CreateString(fb_model->mlir_version()): + fbb.CreateString("unknown"); + auto fbTarget = fbb.CreateString(fb_model->target()); + auto modelWeight = cloneWeightMap(models); + auto modelPrograms = clonePrograms(models, routine_name_maps); + std::vector sections_buf; + auto modelSections = cloneSections(models, sections_buf, routine_name_maps); + auto newModel = + cvi::model::CreateModel(fbb, &modelVersion, modelName, modelBuildTime, 0, 0, + modelWeight, modelPrograms, modelSections, + fbTarget, modelMlirVersion); + fbb.Finish(newModel); + + cvi::runtime::MODEL_HEADER modelHeader; + std::string magic = u8"CviModel"; + std::string pad = u8"AA"; + memcpy(modelHeader.magic, magic.c_str(), sizeof(modelHeader.magic)); + memcpy(modelHeader.padding, pad.c_str(), sizeof(modelHeader.padding)); + memset(modelHeader.chip, 0, 16); + memcpy(modelHeader.chip, this->header.chip, sizeof(modelHeader.chip)); + modelHeader.body_size = fbb.GetSize(); + modelHeader.major = cvi::model::MajorVersion_value; + modelHeader.minor = cvi::model::MinorVersion_value; + + std::ofstream of(dst, + std::ofstream::out | std::ofstream::binary | std::ofstream::trunc); + of.write((const char *)&modelHeader, sizeof(modelHeader)); + of.write((const char *)fbb.GetBufferPointer(), fbb.GetSize()); + of.write((const char *)sections_buf.data(), sections_buf.size()); + of.close(); + printf("store cvimodel to %s\n", dst.c_str()); +} + +FBWeightVector Model::rebuildWeightMap() { + std::vector> tensor_vec; + for (auto w : *fb_model->weight_map()) { + auto name = w->name()->c_str(); + std::vector dim; + for (auto s : *w->shape()->dim()) { + dim.push_back(s); + } + auto shape = cvi::model::CreateShapeDirect(fbb, &dim); + auto weight = cvi::model::CreateWeightDirect(fbb, name, w->offset(), w->size(), shape, + w->type()); + tensor_vec.push_back(weight); + } + return fbb.CreateVector(tensor_vec); +} + +FBProgramVector Model::rebuildPrograms() { + std::vector> programs; + for (auto p : *fb_model->programs()) { + auto tensor_map = cloneTensorMap(p); + std::vector> fbStrVec; + for (auto name : *p->input_tensors()) { + fbStrVec.push_back(fbb.CreateString(name)); + } + auto 
inputs = fbb.CreateVector(fbStrVec); + fbStrVec.clear(); + for (auto name : *p->output_tensors()) { + fbStrVec.push_back(fbb.CreateString(name)); + } + auto outputs = fbb.CreateVector(fbStrVec); + auto routines = cloneRoutines(p, false, 0, nullptr); + auto program = cvi::model::CreateProgram(fbb, p->batch_num(), p->neuron_size(), + inputs, outputs, tensor_map, routines, + p->shared_gmem(), p->private_gmem()); + programs.push_back(program); + } + return fbb.CreateVector(programs); +} + +FBSectionVector Model::rebuildSections(std::string tmp_dir, std::string model_name, + std::vector &enc_buf) { + uint32_t offset = 0; + uint32_t enc_offset = 0; + std::vector> section_vec; + for (auto s : *fb_model->sections()) { + if (s->type() == cvi::model::SectionType_WEIGHT || + s->type() == cvi::model::SectionType_CMDBUF || + s->type() == cvi::model::SectionType_DMABUF) { + std::string enc_filename = + tmp_dir + "/" + model_name + "_" + s->name()->c_str() + ".bin.enc"; + std::ifstream ifs(enc_filename, std::ios::in | std::ios::binary); + ifs.seekg(0, ifs.end); + size_t size = ifs.tellg(); + ifs.seekg(0, ifs.beg); + printf("filename: %s, filesize: %d\n", enc_filename.c_str(), (int)size); + char *buf = new char[size]; + if (buf == nullptr) { + printf("alloc buf fail\n"); + exit(-1); + } + ifs.read(buf, size); + enc_buf.insert(enc_buf.end(), buf, buf + size); + delete[] buf; + ifs.close(); + + auto section = cvi::model::CreateSectionDirect(fbb, s->type(), s->name()->c_str(), + size, enc_offset, true); + section_vec.push_back(section); + enc_offset += size; + } else { + uint8_t *buf = new uint8_t[s->size()]; + if (buf == nullptr) { + printf("alloc buf fail\n"); + exit(-1); + } + stream.read(buf, binary_offset + offset, s->size()); + enc_buf.insert(enc_buf.end(), buf, buf + s->size()); + delete[] buf; + + auto section = cvi::model::CreateSectionDirect(fbb, s->type(), s->name()->c_str(), + s->size(), enc_offset, false); + section_vec.push_back(section); + enc_offset += s->size(); + } + offset += s->size(); + } + + return fbb.CreateVector(section_vec); +} + +void Model::encrypt(std::string &encrypter, std::string &output) { + // create tmp dir to keep extracted cmdbuf and weight + std::string tmp_dir = ".tmp_dir"; + struct stat file_stat; + if (stat(tmp_dir.c_str(), &file_stat) != 0) { + mkdir(tmp_dir.c_str(), 0777); + } else { + printf("create tmp directory fail: file exist!\n"); + return; + } + + auto model_name = fb_model->name()->str(); + std::vector cmdbuf_names; + std::vector dmabuf_names; + std::string weight_name; + + for (auto p : *fb_model->programs()) { + for (auto r : *p->routines()) { + if (r->type() != cvi::model::RoutineType_TPU) + continue; + + std::vector *names = nullptr; + std::string name; + if (r->tpu_routine()->cmdbuf_section()) { + names = &cmdbuf_names; + name = r->tpu_routine()->cmdbuf_section()->str(); + } else if (r->tpu_routine()->dmabuf_section()) { + assert(0 && "unsupport encrypt dmabuf"); + names = &dmabuf_names; + name = r->tpu_routine()->dmabuf_section()->str(); + } else { + assert(0); + } + + printf("routine #%s\n", name.c_str()); + + for (auto s : *fb_model->sections()) { + if (s->name()->str() == name) { + std::string dst = model_name + "_" + s->name()->str() + ".bin"; + names->push_back(dst); + storeSectionToFile(s, std::string(tmp_dir + "/" + dst)); + break; + } + } + } + } + + for (auto s : *fb_model->sections()) { + if (s->type() != cvi::model::SectionType_WEIGHT) + continue; + weight_name = model_name + "_" + s->name()->str() + ".bin"; + storeSectionToFile(s, 
std::string(tmp_dir + "/" + weight_name)); + } + + for (auto cmdbuf : cmdbuf_names) { + int pid = fork(); + if (pid == 0) { + printf("encrypt cmdbuf: %s in pid: %d\n", cmdbuf.c_str(), getpid()); + execl(encrypter.c_str(), "cvi_crypt", "encrypt_sign_aimodel", + std::string(tmp_dir + "/" + cmdbuf).c_str(), + std::string(tmp_dir + "/" + weight_name).c_str(), nullptr); + } + } + for (auto dmabuf : dmabuf_names) { + int pid = fork(); + if (pid == 0) { + printf("encrypt dmabuf: %s in pid: %d\n", dmabuf.c_str(), getpid()); + execl(encrypter.c_str(), "cvi_crypt", "encrypt_sign_aimodel", + std::string(tmp_dir + "/" + dmabuf).c_str(), + std::string(tmp_dir + "/" + weight_name).c_str(), nullptr); + } + } + + while (wait(nullptr) > 0) + ; + + printf("encrypt cmdbuf and weight success!\n"); + + std::vector enc_buf; + cvi::model::Version modelVersion = + cvi::model::Version(cvi::model::MajorVersion_value, cvi::model::MinorVersion_value, + cvi::model::SubMinorVersion_value); + auto fbModelName = fbb.CreateString(model_name); + auto fbBuildTime = fbb.CreateString(getStrOfCurrentTime()); + auto fbMlirVersion = fb_model->mlir_version() ? + fbb.CreateString(fb_model->mlir_version()): + fbb.CreateString("unknown"); + auto fbTarget = fbb.CreateString(fb_model->target()); + + auto fbWeightMap = rebuildWeightMap(); + auto fbSections = rebuildSections(tmp_dir, model_name, enc_buf); + auto fbProgram = rebuildPrograms(); + auto encryptModel = + cvi::model::CreateModel(fbb, &modelVersion, fbModelName, fbBuildTime, 0, 0, + fbWeightMap, fbProgram, fbSections, fbTarget, fbMlirVersion); + fbb.Finish(encryptModel); + + std::vector total_buf; + total_buf.insert(total_buf.end(), fbb.GetBufferPointer(), + fbb.GetBufferPointer() + fbb.GetSize()); + total_buf.insert(total_buf.end(), enc_buf.data(), enc_buf.data() + enc_buf.size()); + cvi::runtime::MODEL_HEADER header; + std::string magic = u8"CviModel"; + std::string padding = u8"AA"; + memcpy(header.magic, magic.c_str(), sizeof(header.magic)); + memcpy(header.padding, padding.c_str(), sizeof(header.padding)); + memset(header.chip, 0, 16); + memcpy(header.chip, this->header.chip, sizeof(header.chip)); + header.body_size = fbb.GetSize(); + header.major = cvi::model::MajorVersion_value; // defined in cvimodel.fbs + header.minor = cvi::model::MinorVersion_value; // defined in cvimodel.fbs + + std::ofstream ofs(output, std::ios::out | std::ios::binary); + ofs.write(reinterpret_cast(&header), sizeof(header)); + ofs.write(reinterpret_cast(total_buf.data()), total_buf.size()); + ofs.close(); + + // delete tmp_dir and file + for (auto cmdbuf : cmdbuf_names) { + std::string cmdbuf_file = tmp_dir + "/" + cmdbuf; + std::string enc_cmdbuf = cmdbuf_file + ".enc"; + remove(cmdbuf_file.c_str()); + remove(enc_cmdbuf.c_str()); + } + for (auto dmabuf : dmabuf_names) { + std::string dmabuf_file = tmp_dir + "/" + dmabuf; + std::string enc_cmdbuf = dmabuf_file + ".enc"; + remove(dmabuf_file.c_str()); + remove(enc_cmdbuf.c_str()); + } + + std::string weight_file = tmp_dir + "/" + weight_name; + std::string enc_weight = weight_file + ".enc"; + remove(weight_file.c_str()); + remove(enc_weight.c_str()); + + rmdir(tmp_dir.c_str()); +} + +size_t Model::compressSectionToFile(const cvi::model::Section *section, std::string dst) { +#ifdef ENABLE_COMPRESS_CMDBUF + auto offset = section->offset(); + auto size = section->size(); + std::ofstream of(dst, std::ofstream::out | std::ofstream::binary | + std::ofstream::trunc); + uint8_t *in_buf = new (std::nothrow) uint8_t[size]; + if (!in_buf) { + printf("Failed 
to allocate buffer buff size:%d\n", size); + exit(1); + } + auto got = stream.read(in_buf, binary_offset + offset, size); + if (got != size) { + printf("Failed to read data from cvimodel cmdbuf sections\n"); + exit(1); + } + // if compressed, exit + if (section->compress()) { + printf("cmdbuf or dmabuf already compressed! exit\n"); + exit(1); + } + size_t max_out_size = LZ4_compressBound(size); + std::vector out_buf(max_out_size); + + auto out_size = LZ4_compress_default( + reinterpret_cast(in_buf), + reinterpret_cast(out_buf.data()), size, max_out_size); + if (out_size < 1) { + printf("compress cmdbuf failed!\n"); + exit(1); + } + printf("compress cmdbuf, %d => %d\n", size, out_size); + if (out_size > (int)size) { + printf("compressed size large than decompressed size don't need compress!\n"); + exit(1); + } + of.write((const char *)out_buf.data(), out_size); + of.close(); + delete[] in_buf; + printf("store section to %s\n", dst.c_str()); + return size; +#else + printf("Compressed section is not supported! please recompile with ENABLE_COMPRESS_CMDBUF\n"); + exit(1); +#endif +} + +FBSectionVector Model::rebuildSections( + std::string tmp_dir, std::string model_name, + std::vector &out_buf, + std::map &cmpr_info) { + uint32_t offset = 0; + uint32_t out_offset = 0; + std::vector> section_vec; + for (auto s : *fb_model->sections()) { + if (s->type() == cvi::model::SectionType_CMDBUF || + s->type() == cvi::model::SectionType_DMABUF) { + std::string filename = tmp_dir + "/" + model_name + + "_" + s->name()->c_str() + ".bin"; + std::ifstream ifs(filename, std::ios::in | std::ios::binary); + ifs.seekg(0, ifs.end); + size_t size = ifs.tellg(); + ifs.seekg(0, ifs.beg); + printf("filename: %s, filesize: %d\n", filename.c_str(), (int)size); + char *buf = new char[size]; + if (buf == nullptr) { + printf("alloc buf fail\n"); + exit(-1); + } + ifs.read(buf, size); + out_buf.insert(out_buf.end(), buf, buf + size); + delete[] buf; + ifs.close(); + remove(filename.c_str()); + + auto section = cvi::model::CreateSectionDirect(fbb, s->type(), s->name()->c_str(), + size, out_offset, false, true, + cmpr_info[s->name()->str()]); + section_vec.push_back(section); + out_offset += size; + } else { + uint8_t *buf = new uint8_t[s->size()]; + if (buf == nullptr) { + printf("alloc buf fail\n"); + exit(-1); + } + stream.read(buf, binary_offset + offset, s->size()); + out_buf.insert(out_buf.end(), buf, buf + s->size()); + delete[] buf; + + auto section = cvi::model::CreateSectionDirect(fbb, s->type(), s->name()->c_str(), + s->size(), out_offset, false); + section_vec.push_back(section); + out_offset += s->size(); + } + offset += s->size(); + } + + return fbb.CreateVector(section_vec); +} + +void Model::compress_inst(std::string &output) { + // create tmp dir to keep extracted cmdbuf and weight + std::string tmp_dir = ".tmp_dir"; + struct stat file_stat; + if (stat(tmp_dir.c_str(), &file_stat) != 0) { + mkdir(tmp_dir.c_str(), 0777); + } else { + printf("create tmp directory fail: file exist!\n"); + return; + } + + auto model_name = fb_model->name()->str(); + std::map cmdbuf_compr_info; + std::string weight_name; + + for (auto p : *fb_model->programs()) { + for (auto r : *p->routines()) { + if (r->type() != cvi::model::RoutineType_TPU) + continue; + + std::string name; + if (r->tpu_routine()->cmdbuf_section()) { + name = r->tpu_routine()->cmdbuf_section()->str(); + } else if (r->tpu_routine()->dmabuf_section()) { + name = r->tpu_routine()->dmabuf_section()->str(); + } else { + assert(0); + } + printf("routine #%s\n", 
name.c_str()); + + // compress cmdbuf + for (auto s : *fb_model->sections()) { + if (s->name()->str() == name) { + std::string dst = model_name + "_" + s->name()->str() + ".bin"; + auto cmpr_sz = compressSectionToFile(s, std::string(tmp_dir + "/" + dst)); + cmdbuf_compr_info[s->name()->str()] = cmpr_sz; + break; + } + } + } + } + + std::vector enc_buf; + cvi::model::Version modelVersion = + cvi::model::Version(cvi::model::MajorVersion_value, cvi::model::MinorVersion_value, + cvi::model::SubMinorVersion_value); + auto fbModelName = fbb.CreateString(model_name); + auto fbBuildTime = fbb.CreateString(getStrOfCurrentTime()); + auto fbMlirVersion = fb_model->mlir_version() ? + fbb.CreateString(fb_model->mlir_version()): + fbb.CreateString("unknown"); + auto fbTarget = fbb.CreateString(fb_model->target()); + auto fbWeightMap = rebuildWeightMap(); + auto fbSections = rebuildSections(tmp_dir, model_name, enc_buf, cmdbuf_compr_info); + auto fbProgram = rebuildPrograms(); + auto encryptModel = + cvi::model::CreateModel(fbb, &modelVersion, fbModelName, fbBuildTime, 0, 0, + fbWeightMap, fbProgram, fbSections, fbTarget, fbMlirVersion); + fbb.Finish(encryptModel); + + std::vector total_buf; + total_buf.insert(total_buf.end(), fbb.GetBufferPointer(), + fbb.GetBufferPointer() + fbb.GetSize()); + total_buf.insert(total_buf.end(), enc_buf.data(), enc_buf.data() + enc_buf.size()); + cvi::runtime::MODEL_HEADER header; + std::string magic = u8"CviModel"; + std::string padding = u8"AA"; + memcpy(header.magic, magic.c_str(), sizeof(header.magic)); + memcpy(header.padding, padding.c_str(), sizeof(header.padding)); + memset(header.chip, 0, 16); + memcpy(header.chip, this->header.chip, sizeof(header.chip)); + header.body_size = fbb.GetSize(); + header.major = cvi::model::MajorVersion_value; // defined in cvimodel.fbs + header.minor = cvi::model::MinorVersion_value; // defined in cvimodel.fbs + + std::ofstream ofs(output, std::ios::out | std::ios::binary); + ofs.write(reinterpret_cast(&header), sizeof(header)); + ofs.write(reinterpret_cast(total_buf.data()), total_buf.size()); + ofs.close(); + + // delete tmp_dir + rmdir(tmp_dir.c_str()); +} + +int main(int argc, const char **argv) { + std::cout << argv[0] << "\n"; + showRuntimeVersion(); + + argparse::ArgumentParser parser; + parser.addArgument("-a", "--action", 1, false); // required + parser.addArgument("-i", "--input", '+'); // inference count + parser.addArgument("-o", "--output", 1); + parser.parse(argc, argv); + + auto action = parser.retrieve("action"); + auto inputs = parser.retrieve>("input"); + auto model = std::make_shared(inputs[0]); + if (action == "dump") { + assert(inputs.size() == 1); + model->dump(); + } else if (action == "extract") { + assert(inputs.size() == 1); + model->extract(); + } else if (action == "compress") { + assert(inputs.size() == 1); + auto output = parser.retrieve("output"); + if (output.empty()) { + printf("ERROR: Please set output cvimodel\n"); + exit(1); + } + model->compress_inst(output); + } else if (action == "encrypt") { + assert(inputs.size() == 2); + auto encrypter = inputs[1]; + auto output = parser.retrieve("output"); + if (output.empty()) { + printf("ERROR: Please set output cvimodel\n"); + exit(1); + } + model->encrypt(encrypter, output); + } else if (action == "merge") { + if (inputs.size() < 2) { + printf("ERROR: Please set more than one cvimodels\n"); + exit(1); + } + auto output = parser.retrieve("output"); + if (output.empty()) { + printf("ERROR: Please set output cvimodel\n"); + exit(1); + } + std::vector> 
models; + models.push_back(model); + for (int i = 1; i < (int)inputs.size(); i++) { + auto other = std::make_shared(inputs[i]); + models.push_back(other); + } + model->merge(models, output); + } + return 0; +} + diff --git a/cviruntime/tool/md5.cpp b/cviruntime/tool/md5.cpp new file mode 100644 index 000000000..a89718b0b --- /dev/null +++ b/cviruntime/tool/md5.cpp @@ -0,0 +1,313 @@ +/* MD5 + converted to C++ class by Frank Thilo (thilo@unix-ag.org) + for bzflag (http://www.bzflag.org) + + based on: + + md5.h and md5.c + reference implementation of RFC 1321 + + Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All +rights reserved. + +License to copy and use this software is granted provided that it +is identified as the "RSA Data Security, Inc. MD5 Message-Digest +Algorithm" in all material mentioning or referencing this software +or this function. + +License is also granted to make and use derivative works provided +that such works are identified as "derived from the RSA Data +Security, Inc. MD5 Message-Digest Algorithm" in all material +mentioning or referencing the derived work. + +RSA Data Security, Inc. makes no representations concerning either +the merchantability of this software or the suitability of this +software for any particular purpose. It is provided "as is" +without express or implied warranty of any kind. + +These notices must be retained in any copies of any part of this +documentation and/or software. +*/ + +#include "md5.hpp" + +/* system implementation headers */ +#include + +// Constants for MD5Transform routine. +#define S11 7 +#define S12 12 +#define S13 17 +#define S14 22 +#define S21 5 +#define S22 9 +#define S23 14 +#define S24 20 +#define S31 4 +#define S32 11 +#define S33 16 +#define S34 23 +#define S41 6 +#define S42 10 +#define S43 15 +#define S44 21 + +/////////////////////////////////////////////// + +// F, G, H and I are basic MD5 functions. +inline uint32_t MD5::F(uint32_t x, uint32_t y, uint32_t z) { + return (x & y) | ((~x) & z); +} + +inline uint32_t MD5::G(uint32_t x, uint32_t y, uint32_t z) { + return (x & z) | (y & (~z)); +} + +inline uint32_t MD5::H(uint32_t x, uint32_t y, uint32_t z) { + return x ^ y ^ z; +} + +inline uint32_t MD5::I(uint32_t x, uint32_t y, uint32_t z) { + return y ^ (x | ~z); +} + +// rotate_left rotates x left n bits. +inline uint32_t MD5::rotate_left(uint32_t x, int n) { + return (x << n) | (x >> (32 - n)); +} + +// FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4. +// Rotation is separate from addition to prevent recomputation. +inline void MD5::FF(uint32_t &a, uint32_t b, uint32_t c, uint32_t d, uint32_t x, + uint32_t s, uint32_t ac) { + a = rotate_left(a + F(b, c, d) + x + ac, s) + b; +} + +inline void MD5::GG(uint32_t &a, uint32_t b, uint32_t c, uint32_t d, uint32_t x, + uint32_t s, uint32_t ac) { + a = rotate_left(a + G(b, c, d) + x + ac, s) + b; +} + +inline void MD5::HH(uint32_t &a, uint32_t b, uint32_t c, uint32_t d, uint32_t x, + uint32_t s, uint32_t ac) { + a = rotate_left(a + H(b, c, d) + x + ac, s) + b; +} + +inline void MD5::II(uint32_t &a, uint32_t b, uint32_t c, uint32_t d, uint32_t x, + uint32_t s, uint32_t ac) { + a = rotate_left(a + I(b, c, d) + x + ac, s) + b; +} + +////////////////////////////////////////////// + +// default ctor, just initailize +MD5::MD5() { + finalized = false; + + count[0] = 0; + count[1] = 0; + + // load magic initialization constants. 
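The md5.cpp and md5.hpp files added here are the classic bzflag port of the RFC 1321 reference code; the cvimodel tool uses them via calcSectionMD5 to fingerprint sections. The constants initialized just below are the standard MD5 initialization vector. As a usage sketch of the streaming interface the header documents (update, then finalize, then hexdigest), assuming only the bundled header:

```cpp
// Minimal usage sketch of the bundled MD5 class. Feeding the data in two
// chunks shows the streaming behavior calcSectionMD5 relies on.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include "md5.hpp"

int main() {
  const char *msg = "CviModel";
  MD5 md5;
  // Chunks of any size are fine; partial 64-byte blocks are buffered
  // internally and transformed once a full block accumulates.
  md5.update(reinterpret_cast<const uint8_t *>(msg), 3);
  md5.update(reinterpret_cast<const uint8_t *>(msg) + 3,
             static_cast<uint32_t>(std::strlen(msg)) - 3);
  // finalize() pads the message, appends the bit length, and freezes the
  // digest; hexdigest() then renders it as 32 hex characters.
  std::printf("md5(\"%s\") = %s\n", msg, md5.finalize().hexdigest().c_str());
  return 0;
}
```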
+ state[0] = 0x67452301; + state[1] = 0xefcdab89; + state[2] = 0x98badcfe; + state[3] = 0x10325476; + + buffer[0] = 0; + digest[0] = 0; +} + +////////////////////////////// + +// decodes input (uint8_t) into output (uint32_t). Assumes len is a multiple of 4. +void MD5::decode(uint32_t output[], const uint8_t input[], uint32_t len) { + for (unsigned int i = 0, j = 0; j < len; i++, j += 4) + output[i] = ((uint32_t)input[j]) | (((uint32_t)input[j + 1]) << 8) | + (((uint32_t)input[j + 2]) << 16) | (((uint32_t)input[j + 3]) << 24); +} + +////////////////////////////// + +// encodes input (uint32_t) into output (uint8_t). Assumes len is +// a multiple of 4. +void MD5::encode(uint8_t output[], const uint32_t input[], uint32_t len) { + for (uint32_t i = 0, j = 0; j < len; i++, j += 4) { + output[j] = input[i] & 0xff; + output[j + 1] = (input[i] >> 8) & 0xff; + output[j + 2] = (input[i] >> 16) & 0xff; + output[j + 3] = (input[i] >> 24) & 0xff; + } +} + +////////////////////////////// + +// apply MD5 algo on a block +void MD5::transform(const uint8_t block[blocksize]) { + uint32_t a = state[0], b = state[1], c = state[2], d = state[3], x[16]; + decode(x, block, blocksize); + + /* Round 1 */ + FF(a, b, c, d, x[0], S11, 0xd76aa478); /* 1 */ + FF(d, a, b, c, x[1], S12, 0xe8c7b756); /* 2 */ + FF(c, d, a, b, x[2], S13, 0x242070db); /* 3 */ + FF(b, c, d, a, x[3], S14, 0xc1bdceee); /* 4 */ + FF(a, b, c, d, x[4], S11, 0xf57c0faf); /* 5 */ + FF(d, a, b, c, x[5], S12, 0x4787c62a); /* 6 */ + FF(c, d, a, b, x[6], S13, 0xa8304613); /* 7 */ + FF(b, c, d, a, x[7], S14, 0xfd469501); /* 8 */ + FF(a, b, c, d, x[8], S11, 0x698098d8); /* 9 */ + FF(d, a, b, c, x[9], S12, 0x8b44f7af); /* 10 */ + FF(c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */ + FF(b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */ + FF(a, b, c, d, x[12], S11, 0x6b901122); /* 13 */ + FF(d, a, b, c, x[13], S12, 0xfd987193); /* 14 */ + FF(c, d, a, b, x[14], S13, 0xa679438e); /* 15 */ + FF(b, c, d, a, x[15], S14, 0x49b40821); /* 16 */ + + /* Round 2 */ + GG(a, b, c, d, x[1], S21, 0xf61e2562); /* 17 */ + GG(d, a, b, c, x[6], S22, 0xc040b340); /* 18 */ + GG(c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */ + GG(b, c, d, a, x[0], S24, 0xe9b6c7aa); /* 20 */ + GG(a, b, c, d, x[5], S21, 0xd62f105d); /* 21 */ + GG(d, a, b, c, x[10], S22, 0x2441453); /* 22 */ + GG(c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */ + GG(b, c, d, a, x[4], S24, 0xe7d3fbc8); /* 24 */ + GG(a, b, c, d, x[9], S21, 0x21e1cde6); /* 25 */ + GG(d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */ + GG(c, d, a, b, x[3], S23, 0xf4d50d87); /* 27 */ + GG(b, c, d, a, x[8], S24, 0x455a14ed); /* 28 */ + GG(a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */ + GG(d, a, b, c, x[2], S22, 0xfcefa3f8); /* 30 */ + GG(c, d, a, b, x[7], S23, 0x676f02d9); /* 31 */ + GG(b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */ + + /* Round 3 */ + HH(a, b, c, d, x[5], S31, 0xfffa3942); /* 33 */ + HH(d, a, b, c, x[8], S32, 0x8771f681); /* 34 */ + HH(c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */ + HH(b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */ + HH(a, b, c, d, x[1], S31, 0xa4beea44); /* 37 */ + HH(d, a, b, c, x[4], S32, 0x4bdecfa9); /* 38 */ + HH(c, d, a, b, x[7], S33, 0xf6bb4b60); /* 39 */ + HH(b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */ + HH(a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */ + HH(d, a, b, c, x[0], S32, 0xeaa127fa); /* 42 */ + HH(c, d, a, b, x[3], S33, 0xd4ef3085); /* 43 */ + HH(b, c, d, a, x[6], S34, 0x4881d05); /* 44 */ + HH(a, b, c, d, x[9], S31, 0xd9d4d039); /* 45 */ + HH(d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */ + HH(c, d, 
a, b, x[15], S33, 0x1fa27cf8); /* 47 */ + HH(b, c, d, a, x[2], S34, 0xc4ac5665); /* 48 */ + + /* Round 4 */ + II(a, b, c, d, x[0], S41, 0xf4292244); /* 49 */ + II(d, a, b, c, x[7], S42, 0x432aff97); /* 50 */ + II(c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */ + II(b, c, d, a, x[5], S44, 0xfc93a039); /* 52 */ + II(a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */ + II(d, a, b, c, x[3], S42, 0x8f0ccc92); /* 54 */ + II(c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */ + II(b, c, d, a, x[1], S44, 0x85845dd1); /* 56 */ + II(a, b, c, d, x[8], S41, 0x6fa87e4f); /* 57 */ + II(d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */ + II(c, d, a, b, x[6], S43, 0xa3014314); /* 59 */ + II(b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */ + II(a, b, c, d, x[4], S41, 0xf7537e82); /* 61 */ + II(d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */ + II(c, d, a, b, x[2], S43, 0x2ad7d2bb); /* 63 */ + II(b, c, d, a, x[9], S44, 0xeb86d391); /* 64 */ + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + + // Zeroize sensitive information. + memset(x, 0, sizeof x); +} + +////////////////////////////// + +// MD5 block update operation. Continues an MD5 message-digest +// operation, processing another message block +void MD5::update(const uint8_t input[], uint32_t length) { + // compute number of bytes mod 64 + uint32_t index = count[0] / 8 % blocksize; + + // Update number of bits + if ((count[0] += (length << 3)) < (length << 3)) + count[1]++; + count[1] += (length >> 29); + + // number of bytes we need to fill in buffer + uint32_t firstpart = 64 - index; + + uint32_t i; + + // transform as many times as possible. + if (length >= firstpart) { + // fill buffer first, transform + memcpy(&buffer[index], input, firstpart); + transform(buffer); + + // transform chunks of blocksize (64 bytes) + for (i = firstpart; i + blocksize <= length; i += blocksize) + transform(&input[i]); + + index = 0; + } else + i = 0; + + // buffer remaining input + memcpy(&buffer[index], &input[i], length - i); +} + +// MD5 finalization. Ends an MD5 message-digest operation, writing the +// the message digest and zeroizing the context. +MD5 &MD5::finalize() { + static uint8_t padding[64] = {0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + if (!finalized) { + // Save number of bits + uint8_t bits[8]; + encode(bits, count, 8); + + // pad out to 56 mod 64. + uint32_t index = count[0] / 8 % 64; + uint32_t padLen = (index < 56) ? (56 - index) : (120 - index); + update(padding, padLen); + + // Append length (before padding) + update(bits, 8); + + // Store state in digest + encode(digest, state, 16); + + // Zeroize sensitive information. 
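An aside on the padding arithmetic in finalize() above: `padLen = (index < 56) ? (56 - index) : (120 - index)` always brings the buffered byte count to 56 mod 64, leaving exactly eight bytes for the appended 64-bit bit-length, and always emits at least the mandatory leading 0x80 pad byte. A small self-contained check of that arithmetic:

```cpp
// Verify the MD5 padding rule for every possible buffer residue.
#include <cassert>
#include <cstdio>

int main() {
  for (unsigned msgLen = 0; msgLen < 256; ++msgLen) {
    unsigned index = msgLen % 64;
    unsigned padLen = (index < 56) ? (56 - index) : (120 - index);
    assert(padLen >= 1 && padLen <= 64);   // the 0x80 byte always fits
    assert((msgLen + padLen) % 64 == 56);  // room left for the length field
  }
  std::printf("padding rule holds for every residue mod 64\n");
  return 0;
}
```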
+ memset(buffer, 0, sizeof buffer); + memset(count, 0, sizeof count); + + finalized = true; + } + + return *this; +} + +////////////////////////////// + +// return hex representation of digest as string +std::string MD5::hexdigest() const { + if (!finalized) + return ""; + + char buf[33]; + for (int i = 0; i < 16; i++) + sprintf(buf + i * 2, "%02x", digest[i]); + buf[32] = 0; + + return std::string(buf); +} diff --git a/cviruntime/tool/md5.hpp b/cviruntime/tool/md5.hpp new file mode 100644 index 000000000..dc35bce74 --- /dev/null +++ b/cviruntime/tool/md5.hpp @@ -0,0 +1,82 @@ +/* MD5 + converted to C++ class by Frank Thilo (thilo@unix-ag.org) + for bzflag (http://www.bzflag.org) + + based on: + + md5.h and md5.c + reference implementation of RFC 1321 + + Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All +rights reserved. + +License to copy and use this software is granted provided that it +is identified as the "RSA Data Security, Inc. MD5 Message-Digest +Algorithm" in all material mentioning or referencing this software +or this function. + +License is also granted to make and use derivative works provided +that such works are identified as "derived from the RSA Data +Security, Inc. MD5 Message-Digest Algorithm" in all material +mentioning or referencing the derived work. + +RSA Data Security, Inc. makes no representations concerning either +the merchantability of this software or the suitability of this +software for any particular purpose. It is provided "as is" +without express or implied warranty of any kind. + +These notices must be retained in any copies of any part of this +documentation and/or software. + +*/ + +#ifndef BZF_MD5_H +#define BZF_MD5_H + +#include +#include +#include + +// a small class for calculating MD5 hashes of strings or byte arrays +// it is not meant to be fast or secure +// +// usage: 1) feed it blocks of uchars with update() +// 2) finalize() +// 3) get hexdigest() string +// or +// MD5(std::string).hexdigest() +// +// assumes that char is 8 bit and int is 32 bit +class MD5 { +public: + MD5(); + void update(const uint8_t *buf, uint32_t length); + MD5 &finalize(); + std::string hexdigest() const; + +private: + enum { blocksize = 64 }; // VC6 won't eat a const static int here + + void transform(const uint8_t block[blocksize]); + static void decode(uint32_t output[], const uint8_t input[], uint32_t len); + static void encode(uint8_t output[], const uint32_t input[], uint32_t len); + + bool finalized; + uint8_t buffer[blocksize]; // bytes that didn't fit in last 64 byte chunk + uint32_t count[2]; // 64bit counter for number of bits (lo, hi) + uint32_t state[4]; // digest so far + uint8_t digest[16]; // the result + + // low level logic operations + static inline uint32_t F(uint32_t x, uint32_t y, uint32_t z); + static inline uint32_t G(uint32_t x, uint32_t y, uint32_t z); + static inline uint32_t H(uint32_t x, uint32_t y, uint32_t z); + static inline uint32_t I(uint32_t x, uint32_t y, uint32_t z); + static inline uint32_t rotate_left(uint32_t x, int n); + static inline void FF(uint32_t &a, uint32_t b, uint32_t c, uint32_t d, uint32_t x, uint32_t s, uint32_t ac); + static inline void GG(uint32_t &a, uint32_t b, uint32_t c, uint32_t d, uint32_t x, uint32_t s, uint32_t ac); + static inline void HH(uint32_t &a, uint32_t b, uint32_t c, uint32_t d, uint32_t x, uint32_t s, uint32_t ac); + static inline void II(uint32_t &a, uint32_t b, uint32_t c, uint32_t d, uint32_t x, uint32_t s, uint32_t ac); +}; + +#endif \ No newline at end of file diff --git 
a/cviruntime/tool/model_interface_tester.cpp b/cviruntime/tool/model_interface_tester.cpp new file mode 100644 index 000000000..59dab7319 --- /dev/null +++ b/cviruntime/tool/model_interface_tester.cpp @@ -0,0 +1,298 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cviruntime.h" +#include +#include "argparse.hpp" +#include "similarity.hpp" +#include "cnpy.h" +#include "assert.h" + +static bool isNpzFile(const std::string &name) { + std::string extension = name.substr(name.size() - 4); + if (extension == ".npz") + return true; + return false; +} + +class ModelTester { +public: + ModelTester(const std::string &model_file, const std::string &ref_npz); + ~ModelTester(); + void loadInputData(const std::string &input_npz); + void prepareAlignedData(); + void testEmulateSendDataFromSystemMem(); + void testEmulateSendDataFromVpss(); + +private: + bool compareResults(); + void forwardAndCompareResults(std::string api); + + CVI_RT_HANDLE ctx; + CVI_MODEL_HANDLE model = NULL; + CVI_TENSOR *input_tensors; + CVI_TENSOR *output_tensors; + int32_t input_num; + int32_t output_num; + int32_t _n, _c, _h, _w; + int32_t _aligned_w; + bool _nhwc = false; + + std::vector origin_input; + std::vector aligned_input; + std::string ref_npz; +}; + +ModelTester::ModelTester(const std::string &model_file, const std::string &ref_npz) + : ref_npz(ref_npz) { + _n = _c = _h = _w = _aligned_w = 0; + + CVI_RT_Init(&ctx); + + if (CVI_RC_SUCCESS != CVI_NN_RegisterModel(model_file.c_str(), &model)) { + exit(1); + } + + CVI_NN_SetConfig(model, OPTION_OUTPUT_ALL_TENSORS, true); + + CVI_NN_GetInputOutputTensors( + model, &input_tensors, &input_num, + &output_tensors, &output_num); +} + +ModelTester::~ModelTester() { + CVI_NN_CleanupModel(model); + CVI_RT_DeInit(ctx); +} + + +bool ModelTester::compareResults() { + float euclidean = 0; + float cosine = 0; + float correlation = 0; + + int err_cnt = 0; + for (int i = 0; i < output_num; i++) { + auto &tensor = output_tensors[i]; + std::string name(tensor.name); + auto refData = cnpy::npz_load(ref_npz, name); + if (refData.num_vals == 0) { + printf("Warning, Cannot find %s in reference\n", name.c_str()); + continue; + } + if (tensor.count != refData.num_vals) { + printf("%s %zu vs %zu, size are not equal.\n", name.c_str(), tensor.count, refData.num_vals); + continue; + } + + if (refData.type == 'f') { + if (tensor.fmt == CVI_FMT_INT8) { + array_similarity((int8_t *)CVI_NN_TensorPtr(&tensor), + refData.data(), tensor.count, + euclidean, cosine, correlation); + } else if (tensor.fmt == CVI_FMT_UINT8) { + array_similarity((uint8_t *)CVI_NN_TensorPtr(&tensor), + refData.data(), tensor.count, + euclidean, cosine, correlation); + } else if (tensor.fmt == CVI_FMT_BF16) { + array_similarity((uint16_t *)CVI_NN_TensorPtr(&tensor), + refData.data(), tensor.count, + euclidean, cosine, correlation); + } else { + array_similarity((float *)CVI_NN_TensorPtr(&tensor), + refData.data(), tensor.count, + euclidean, cosine, correlation); + } + } else if (refData.type == 'u') { + if (tensor.fmt == CVI_FMT_BF16) { + assert(refData.word_size == 2); + array_similarity((uint16_t *)CVI_NN_TensorPtr(&tensor), + refData.data(), tensor.count, euclidean, + cosine, correlation); + } else if (tensor.fmt == CVI_FMT_UINT8) { + assert(refData.word_size == 1); + array_similarity((uint8_t *)CVI_NN_TensorPtr(&tensor), + refData.data(), tensor.count, euclidean, + cosine, correlation); + } else { + assert(0); + } + } else if (refData.type == 
'i') { + assert(refData.word_size == 1); + assert(tensor.fmt == CVI_FMT_INT8); + array_similarity((int8_t *)CVI_NN_TensorPtr(&tensor), + refData.data(), tensor.count, euclidean, + cosine, correlation); + } + + if (cosine < 1 || correlation < 1 || euclidean < 1) { + err_cnt++; + printf("Error, [%s] cosine:%f correlation:%f euclidean:%f\n", name.c_str(), cosine, + correlation, euclidean); + } else { + printf("[%s] cosine:%f correlation:%f euclidean:%f\n", name.c_str(), cosine, + correlation, euclidean); + } + } + if (err_cnt > 0) { + printf("Compare failed\n"); + return false; + } + printf("Compare passed\n"); + return true; +} + +void ModelTester::loadInputData(const std::string &input_npz) { + assert(isNpzFile(input_npz)); + auto npz = cnpy::npz_load(input_npz); + assert(1 == (int)npz.size()); + auto &tensor = input_tensors[0]; + auto &arr = npz.begin()->second; + auto size = CVI_NN_TensorSize(&tensor); + assert(arr.num_vals == size); + + origin_input.resize(size); + if (arr.type == 'f') { + auto src = arr.data(); + auto qscale = CVI_NN_TensorQuantScale(&tensor); + for (size_t i = 0; i < arr.num_vals; i++) { + int val = std::round(src[i] * qscale); + if (tensor.fmt == CVI_FMT_INT8) { + if (val > 127) { + val = 127; + } else if (val < -128) { + val = -128; + } + origin_input[i] = (int8_t)val; + } else { + if (val > 255) { + val = 255; + } + origin_input[i] = static_cast(val); + } + } + } else { + auto src = arr.data(); + for (size_t i = 0; i < arr.num_vals; i++) { + origin_input[i] = static_cast(src[i]); + } + } +} + +static inline int align_up(int x, int n) { + return ((x + n - 1) / n) * n; +} + +void ModelTester::prepareAlignedData() { + auto &tensor = input_tensors[0]; + if (tensor.aligned) { + aligned_input.assign(origin_input.begin(), + origin_input.end()); + } + _n = tensor.shape.dim[0]; + _c = tensor.shape.dim[1]; + _h = tensor.shape.dim[2]; + _w = tensor.shape.dim[3]; + if (_w == 3) { + _c = 1; + _h = tensor.shape.dim[1]; + _w = tensor.shape.dim[2] * tensor.shape.dim[3]; + _nhwc = true; + } + + _aligned_w = align_up(_w, 32); + size_t aligned_size = (size_t)(_n * _c * _h * _aligned_w); + auto dst = [&](int i) { + return ((i / _w) * _aligned_w) + (i % _w); + }; + aligned_input.resize(aligned_size); + for (int i = 0; i < (int)origin_input.size(); i++) { + aligned_input[dst(i)] = origin_input[i]; + } +} + +void ModelTester::forwardAndCompareResults(std::string api) { + std::cout << "\ntest " << api << "....\n"; + CVI_NN_Forward(model, input_tensors, input_num, output_tensors, output_num); + if (!compareResults()) { + std::cout << "test " << api << " failed\n"; + exit(1); + } + std::cout << "test " << api << " successed\n"; +} + +void ModelTester::testEmulateSendDataFromVpss() { + auto &tensor = input_tensors[0]; + + uint8_t *ptr = (uint8_t *)aligned_input.data(); + auto devMem = CVI_RT_MemAlloc(ctx, aligned_input.size()); + auto paddr = (uint64_t)CVI_RT_MemGetPAddr(devMem); + CVI_RT_MemCopyS2D(ctx, devMem, ptr); + + if (tensor.shape.dim[0] == 1) { + CVI_VIDEO_FRAME_INFO frame; + frame.type = tensor.pixel_format; + for (int i = 0; i < _c; ++i) { + frame.pyaddr[i] = paddr + i * _h * _aligned_w; + } + CVI_NN_SetTensorWithVideoFrame(nullptr, &tensor, &frame); + forwardAndCompareResults("CVI_NN_SetTensorWithVideoFrame"); + + if (!tensor.aligned) { + int channel_num = _n * _c; + uint64_t channel_paddrs[channel_num]; + for (int i = 0; i < channel_num; ++i) { + channel_paddrs[i] = paddr + i * _h * _aligned_w; + } + CVI_NN_FeedTensorWithFrames(nullptr, &tensor, tensor.pixel_format, + 
CVI_FMT_INT8, channel_num, channel_paddrs, 0, 0, 0); + forwardAndCompareResults("CVI_NN_FeedTensorWithFrames"); + } + } + + CVI_NN_SetTensorWithAlignedFrames(&tensor, &paddr, 1, tensor.pixel_format); + forwardAndCompareResults("CVI_NN_SetTensorWithAlignedFrames"); + + CVI_RT_MemFree(ctx, devMem); +} + +void ModelTester::testEmulateSendDataFromSystemMem() { + auto sys_mem = new uint8_t[origin_input.size()]; + memcpy(sys_mem, origin_input.data(), origin_input.size()); + CVI_NN_SetTensorPtr(&input_tensors[0], sys_mem); + forwardAndCompareResults("CVI_NN_SetTensorPtr"); + delete[] sys_mem; +} + +int main(int argc, const char **argv) { + showRuntimeVersion(); + + argparse::ArgumentParser parser; + parser.addArgument("-i", "--input", 1, false); // required + parser.addArgument("-m", "--model", 1, false); // required + parser.addArgument("-r", "--reference", 1, false); // must be npz file + parser.addArgument("-v", "--verbose", + 1); // set verbose level, 0: only error & warning, 1: info, 2: debug + parser.parse(argc, argv); + + auto inputFile = parser.retrieve("input"); + auto modelFile = parser.retrieve("model"); + auto referenceFile = parser.retrieve("reference"); + + ModelTester tester(modelFile, referenceFile); + tester.loadInputData(inputFile); + tester.prepareAlignedData(); + + tester.testEmulateSendDataFromSystemMem(); + tester.testEmulateSendDataFromVpss(); + return 0; +} diff --git a/cviruntime/tool/model_runner.cpp b/cviruntime/tool/model_runner.cpp new file mode 100644 index 000000000..9846cc264 --- /dev/null +++ b/cviruntime/tool/model_runner.cpp @@ -0,0 +1,567 @@ +#include "argparse.hpp" +#include "assert.h" +#include "cnpy.h" +#include "cviruntime.h" +#include "similarity.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static std::string optInputFile; +static std::string optModelFile; +static std::string optOutputFile; +static int32_t optProgramId = 0; +static int32_t optInferenceCount = 1; +static bool optEnableTimer = false; +static bool optDumpAllTensors = false; +static float optCosineTolerance = 0.99f; +static float optCorrelationTolerance = 0.99f; +static float optEuclideanTolerance = 0.90f; + +#define EXIT_IF_ERROR(cond, statement) \ + if ((cond)) { \ + printf("%s\n", statement); \ + exit(1); \ + } + +static const char *formatToStr(CVI_FMT fmt) { + switch (fmt) { + case CVI_FMT_FP32: return "fp32"; + case CVI_FMT_INT32: return "i32"; + case CVI_FMT_UINT32: return "u32"; + case CVI_FMT_BF16: return "bf16"; + case CVI_FMT_INT16: return "i16"; + case CVI_FMT_UINT16: return "u16"; + case CVI_FMT_INT8: return "i8"; + case CVI_FMT_UINT8: return "u8"; + default: + printf("unknown fmt:%d\n", fmt); + } + return nullptr; +} + +static const char* +pixelFormatToStr(CVI_NN_PIXEL_FORMAT_E pixel_format) { + switch(pixel_format) { + case CVI_NN_PIXEL_RGB_PACKED: return "RGB_PACKED"; + case CVI_NN_PIXEL_BGR_PACKED: return "BGR_PACKED"; + case CVI_NN_PIXEL_RGB_PLANAR: return "RGB_PLANAR"; + case CVI_NN_PIXEL_BGR_PLANAR: return "BGR_PLANAR"; + case CVI_NN_PIXEL_YUV_420_PLANAR: return "YUV420_PLANAR"; + case CVI_NN_PIXEL_YUV_NV12: return "YUV_NV12"; + case CVI_NN_PIXEL_YUV_NV21: return "YUV_NV21"; + case CVI_NN_PIXEL_GRAYSCALE: return "GRAYSCALE"; + case CVI_NN_PIXEL_TENSOR: return "TENSOR"; + case CVI_NN_PIXEL_RGBA_PLANAR: return "RGBA_PLANAR"; + default: + printf("unknown pixel format:%d\n", pixel_format); + } + return nullptr; +} + +static bool isNpzFile(const std::string &name) { + std::string 
extension = name.substr(name.size() - 4); + if (extension == ".npz") + return true; + return false; +} + +static int8_t *readFileToMemory(const std::string &fileName, size_t &size) { + std::ifstream file(fileName, std::ios::binary | std::ios::ate); + size = file.tellg(); + file.seekg(0, std::ios::beg); + char *buf = new char[size]; + if (!buf) { + printf("failed to allocate memory, size:%zu\n", size); + return nullptr; + } + file.read(buf, size); + return (int8_t *)buf; +} + +static bool compareResultWithNpz(CVI_TENSOR *tensors, int32_t num, + std::string &ref_npz) { + float euclidean = 0; + float cosine = 0; + float correlation = 0; + int32_t errCnt = 0; + for (int i = 0; i < num; i++) { + auto &tensor = tensors[i]; + std::string name(tensor.name); + auto refData = cnpy::npz_load(ref_npz, name); + if (refData.num_vals == 0) { + printf("Warning, Cannot find %s in reference\n", name.c_str()); + continue; + } + if (tensor.count != refData.num_vals) { + printf("%s %zu vs %zu, size are not equal\n", name.c_str(), tensor.count, refData.num_vals); + return false; + } + + if (refData.type == 'f') { + if (tensor.fmt == CVI_FMT_INT8) { + array_similarity((int8_t *)CVI_NN_TensorPtr(&tensor), + refData.data(), tensor.count, + euclidean, cosine, correlation); + } else if (tensor.fmt == CVI_FMT_UINT8) { + array_similarity((uint8_t *)CVI_NN_TensorPtr(&tensor), + refData.data(), tensor.count, + euclidean, cosine, correlation); + } else if (tensor.fmt == CVI_FMT_BF16) { + array_similarity((uint16_t *)CVI_NN_TensorPtr(&tensor), + refData.data(), tensor.count, + euclidean, cosine, correlation); + } else if (tensor.fmt == CVI_FMT_INT16) { + array_similarity((int16_t *)CVI_NN_TensorPtr(&tensor), + refData.data(), tensor.count, + euclidean, cosine, correlation); + } else if (tensor.fmt == CVI_FMT_UINT16) { + array_similarity((uint16_t *)CVI_NN_TensorPtr(&tensor), + refData.data(), tensor.count, + euclidean, cosine, correlation); + } else { + array_similarity((float *)CVI_NN_TensorPtr(&tensor), + refData.data(), tensor.count, + euclidean, cosine, correlation); + } + } else if (refData.type == 'u') { + if (tensor.fmt == CVI_FMT_BF16) { + assert(refData.word_size == 2); + array_similarity((uint16_t *)CVI_NN_TensorPtr(&tensor), + refData.data(), tensor.count, euclidean, + cosine, correlation); + } else if (tensor.fmt == CVI_FMT_UINT8) { + assert(refData.word_size == 1); + array_similarity((uint8_t *)CVI_NN_TensorPtr(&tensor), + refData.data(), tensor.count, euclidean, + cosine, correlation); + } else { + assert(0); + } + } else if (refData.type == 'i') { + assert(refData.word_size == 1); + assert(tensor.fmt == CVI_FMT_INT8); + array_similarity((int8_t *)CVI_NN_TensorPtr(&tensor), + refData.data(), tensor.count, euclidean, + cosine, correlation); + } + + if (cosine < optCosineTolerance || correlation < optCorrelationTolerance || + euclidean < optEuclideanTolerance) { + printf("Error, [%s] cosine:%f correlation:%f euclidean:%f\n", + name.c_str(), cosine, correlation, euclidean); + errCnt++; + } else { + printf("Pass, [%s] cosine:%f correlation:%f euclidean:%f\n", + name.c_str(), cosine, correlation, euclidean); + } + } + if (errCnt) { + printf("Compare Failed.\n"); + return false; + } + printf("Compare Pass.\n"); + return true; +} + +static void saveResultToNpz(const std::string &name, CVI_TENSOR *tensors, + int32_t num) { + assert(isNpzFile(name) && "output should be a npz file"); + + cnpy::npz_t npz; + for (int i = 0; i < num; i++) { + auto &tensor = tensors[i]; + std::vector shape = { + 
(size_t)tensor.shape.dim[0], (size_t)tensor.shape.dim[1], + (size_t)tensor.shape.dim[2], (size_t)tensor.shape.dim[3]}; + switch (tensor.fmt) { + case CVI_FMT_FP32: + cnpy::npz_add_array( + npz, tensor.name, + (float *)CVI_NN_TensorPtr(&tensor), shape); + break; + case CVI_FMT_UINT16: { + // uint16 format is used in numpy to store bf16 data. + // so we use float type to represent uint16 + auto size = CVI_NN_TensorCount(&tensor); + auto ptr = (uint16_t *)CVI_NN_TensorPtr(&tensor); + std::vector tmp(size); + for (size_t i = 0; i < size; ++i) { + tmp[i] = (float)ptr[i]; + } + cnpy::npz_add_array( + npz, tensor.name, tmp.data(), shape); + break; + } + case CVI_FMT_INT16: + cnpy::npz_add_array( + npz, tensor.name, + (int16_t *)CVI_NN_TensorPtr(&tensor), shape); + break; + case CVI_FMT_BF16: // we use uint16_t to represent BF16 + cnpy::npz_add_array( + npz, tensor.name, + (uint16_t *)CVI_NN_TensorPtr(&tensor), shape); + break; + case CVI_FMT_INT8: + if (CVI_NN_TensorCount(&tensor) != CVI_NN_TensorSize(&tensor)) { + shape[1] = shape[2] = 1; + shape[3] = CVI_NN_TensorSize(&tensor) / shape[0]; + } + cnpy::npz_add_array( + npz, tensor.name, + (int8_t *)CVI_NN_TensorPtr(&tensor), shape); + break; + case CVI_FMT_UINT8: + if (CVI_NN_TensorCount(&tensor) != CVI_NN_TensorSize(&tensor)) { + shape[1] = shape[2] = 1; + shape[3] = CVI_NN_TensorSize(&tensor) / shape[0]; + } + cnpy::npz_add_array( + npz, tensor.name, + (uint8_t *)CVI_NN_TensorPtr(&tensor), shape); + break; + default: + printf("Error, Current unsupported type:%d\n", tensor.fmt); + assert(0); + } + } + cnpy::npz_save_all(name, npz); +} + +static void ConvertFp32ToInt8(float *src, int8_t *dst, int count, float qscale, + int zero_point = 0) { + for (int i = 0; i < count; i++) { + int val = std::round((*src++) * qscale) + zero_point; + if (val > 127) { + val = 127; + } else if (val < -128) { + val = -128; + } + *dst++ = (int8_t)val; + } +} + +static void ConvertFp32ToUint8(float *src, uint8_t *dst, int count, + float qscale, int zero_point = 0) { + for (int i = 0; i < count; i++) { + int val = std::round((*src++) * qscale) + zero_point; + if (val > 255) { + val = 255; + } + *dst++ = (uint8_t)val; + } +} + +static void ConvertFp32ToInt16(float *src, int16_t *dst, int count) { + for (int i = 0; i < count; i++) { + int val = std::round((*src++)); + *dst++ = (int16_t)val; + } +} + +static void ConvertFp32ToUInt16(float *src, uint16_t *dst, int count) { + for (int i = 0; i < count; i++) { + int val = std::round((*src++)); + *dst++ = (uint16_t)val; + } +} + +static void ConvertFp32ToBf16(float *src, uint16_t *dst, int size, + bool rounding) { + // const uint16_t* p = reinterpret_cast(src); + const uint16_t *p = nullptr; + /// if rounding is prefered than trancating + /// float_val *= 1.001957f; + float *src_round = nullptr; + if (rounding) { + src_round = (float *)malloc(size * sizeof(float)); + for (int i = 0; i < size; i++) { + float value = src[i]; + uint32_t *u32_val = reinterpret_cast(&value); + uint32_t lsb = (*u32_val >> 16) & 1; + *u32_val += (0x7fff + lsb); // rounding_bias + float *ret = reinterpret_cast(u32_val); + src_round[i] = *ret; + } + p = reinterpret_cast(src_round); + } else { + p = reinterpret_cast(src); + } + + uint16_t *q = reinterpret_cast(dst); +#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + for (; size != 0; p += 2, q++, size--) { + *q = p[0]; + /* HW behavior */ + // infinity set to max finite positive value + if ((*q & 0x7f80) == 0x7f80) { + *q = 0x7f7f; + } + } +#else + for (; size != 0; p += 2, q++, size--) { + *q = p[1]; + /* 
HW behavior */ + // infinity set to max finite positive value + if ((*q & 0x7f80) == 0x7f80) { + *q = 0x7f7f; + } + } +#endif + if (rounding) { + free(src_round); + } +} + +static void loadInput(std::string &input_file, CVI_TENSOR *tensors, int num) { + assert(isNpzFile(input_file) && "input should be a npz file"); + + cnpy::npz_t input_npz = cnpy::npz_load(input_file); + EXIT_IF_ERROR(!input_npz.size(), "cannot open input npz file"); + assert(num == (int)input_npz.size()); + + for (auto &npy : input_npz) { + auto &arr = npy.second; + auto name = npy.first.c_str(); + int idx = 0; + // search the target tensor, if not found, use the first one. + for (int i = 0; i < num; i++) { + if (strncmp((char *)name, tensors[i].name, strlen(name)) == 0) { + idx = i; + break; + } + } + auto &tensor = tensors[idx]; + if (arr.type == 'f' && tensor.fmt == CVI_FMT_INT8) { + assert(arr.num_vals == tensor.mem_size); + ConvertFp32ToInt8(arr.data(), + (int8_t *)CVI_NN_TensorPtr(&tensor), + CVI_NN_TensorSize(&tensor), + CVI_NN_TensorQuantScale(&tensor), + CVI_NN_TensorQuantZeroPoint(&tensor)); + } else if (arr.type == 'f' && tensor.fmt == CVI_FMT_UINT8) { + assert(arr.num_vals == tensor.mem_size); + ConvertFp32ToUint8(arr.data(), + (uint8_t *)CVI_NN_TensorPtr(&tensor), + CVI_NN_TensorSize(&tensor), + CVI_NN_TensorQuantScale(&tensor), + CVI_NN_TensorQuantZeroPoint(&tensor)); + } else if (arr.type == 'f' && tensor.fmt == CVI_FMT_BF16) { + assert(arr.num_vals == tensor.count); + ConvertFp32ToBf16(arr.data(), + (uint16_t *)CVI_NN_TensorPtr(&tensor), + CVI_NN_TensorCount(&tensor), false); + } else if (arr.type == 'f' && tensor.fmt == CVI_FMT_INT16) { + assert(arr.num_vals == tensor.count); + ConvertFp32ToInt16(arr.data(), + (int16_t *)CVI_NN_TensorPtr(&tensor), + CVI_NN_TensorCount(&tensor)); + } else if (arr.type == 'f' && tensor.fmt == CVI_FMT_UINT16) { + assert(arr.num_vals == tensor.count); + ConvertFp32ToUInt16(arr.data(), + (uint16_t *)CVI_NN_TensorPtr(&tensor), + CVI_NN_TensorCount(&tensor)); + } else { + if (arr.num_bytes() != tensor.mem_size) { + std::stringstream err; + err << "arr.num_bytes: (" << arr.num_bytes() + << ")not same as mem.size: (" << tensor.mem_size << ")\n"; + throw std::runtime_error(err.str()); + } + memcpy(CVI_NN_TensorPtr(&tensor), arr.data(), tensor.mem_size); + } + } +} + +static void dumpTensorsInfo( + CVI_TENSOR *input_tensors, int32_t input_num, + CVI_TENSOR *output_tensors, int32_t output_num) { + + printf("Inputs:\n"); + for (int i = 0; i < input_num; ++i) { + auto &tensor = input_tensors[i]; + if (tensor.pixel_format != CVI_NN_PIXEL_TENSOR) { + printf(" [%d] %s <%d,%d,%d,%d>,%s, qscale:%f, zero_point: %d\n", + i, tensor.name, tensor.shape.dim[0], tensor.shape.dim[1], tensor.shape.dim[2], + tensor.shape.dim[3], formatToStr(tensor.fmt), tensor.qscale, tensor.zero_point); + printf(" pixel_format:%s, aligned:%s\n", + pixelFormatToStr(tensor.pixel_format), tensor.aligned ? 
"True" : "False"); + printf(" scale:<%f, %f, %f>, mean:<%f, %f, %f>\n", + tensor.scale[0], tensor.scale[1], tensor.scale[2], + tensor.mean[0], tensor.mean[1], tensor.mean[2]); + } else { + printf(" [%d] %s <%d,%d,%d,%d>,%s, qscale:%f, zero_point: %d\n", + i, tensor.name, tensor.shape.dim[0], tensor.shape.dim[1], tensor.shape.dim[2], + tensor.shape.dim[3], formatToStr(tensor.fmt), tensor.qscale, tensor.zero_point); + } + } + printf("Outputs:\n"); + for (int i = 0; i < output_num; ++i) { + auto &tensor = output_tensors[i]; + printf(" [%d] %s <%d,%d,%d,%d>,%s\n", + i, tensor.name, tensor.shape.dim[0], tensor.shape.dim[1], tensor.shape.dim[2], + tensor.shape.dim[3], formatToStr(tensor.fmt)); + } +} + +int main(int argc, const char **argv) { + showRuntimeVersion(); + + argparse::ArgumentParser parser; + parser.addArgument("-i", "--input", 1); // required + parser.addArgument("-m", "--model", 1, false); // required + parser.addArgument("-o", "--output", 1); // required + parser.addArgument("-p", "--pmu", 1); + parser.addArgument("-s", "--program-id", 1); // select program by id + parser.addArgument("-b", "--batch-num", 1); // deprecated + parser.addArgument("-c", "--count", 1); // inference count + parser.addArgument("-r", "--reference", 1); // must be npz file + // cosine_tol,correlation_tol,euclidean_tol + parser.addArgument("-t", "--tolerances", 1); + // set verbose level, 0: only error & warning, 1: info, 2: debug + parser.addArgument("-v", "--verbose", 1); + parser.addArgument("--dump-all-tensors"); + parser.addArgument("--load-from-memory"); + parser.addArgument("--enable-timer"); + parser.parse(argc, argv); + + if (parser.gotArgument("input")) { + optInputFile = parser.retrieve("input"); + } + optModelFile = parser.retrieve("model"); + std::string ref_fname; + + if (parser.gotArgument("output")) { + optOutputFile = parser.retrieve("output"); + } + if (parser.gotArgument("pmu")) { + std::string pmu = parser.retrieve("pmu"); + setenv("TPU_PMUBUF_OUTPUT_FILE", pmu.c_str(), true); + } + if (parser.gotArgument("dump-all-tensors")) { + optDumpAllTensors = true; + } + if (parser.gotArgument("count")) { + optInferenceCount = parser.retrieve("count"); + } + if (parser.gotArgument("program-id")) { + optProgramId = parser.retrieve("program-id"); + } + if (parser.gotArgument("reference")) { + auto name = parser.retrieve("reference"); + assert(isNpzFile(name)); + ref_fname = name; + EXIT_IF_ERROR(!ref_fname.size(), "reference npz file name is empty"); + } + + if (parser.gotArgument("tolerances")) { + std::istringstream option(parser.retrieve("tolerances")); + std::vector tolerances; + std::string tol; + while (std::getline(option, tol, ',')) { + tolerances.push_back(std::move(tol)); + } + assert(tolerances.size() == 3); + optCosineTolerance = std::stof(tolerances[0]); + optCorrelationTolerance = std::stof(tolerances[1]); + optEuclideanTolerance = std::stof(tolerances[2]); + printf("Tolerance, cosine:%f, correlation:%f, euclidean:%f\n", + optCosineTolerance, optCorrelationTolerance, + optEuclideanTolerance); + } + if (parser.gotArgument("enable-timer")) { + optEnableTimer = true; + } + + CVI_MODEL_HANDLE model = NULL; + CVI_RC ret; + if (parser.gotArgument("load-from-memory")) { + size_t size = 0; + int8_t *buffer = readFileToMemory(optModelFile, size); + EXIT_IF_ERROR(!buffer, "failed to read civmodel file to memory"); + ret = CVI_NN_RegisterModelFromBuffer(buffer, size, &model); + delete[] buffer; + printf("load cvimodel from memory\n"); + } else { + ret = CVI_NN_RegisterModel(optModelFile.c_str(), 
&model); + } + EXIT_IF_ERROR(ret != CVI_RC_SUCCESS, "failed to register cvimodel"); + + int major_ver, minor_ver; + CVI_NN_GetModelVersion(model, &major_ver, &minor_ver); + printf("cvimodel's version:%d.%d\n", major_ver ,minor_ver); + + CVI_NN_SetConfig(model, OPTION_PROGRAM_INDEX, optProgramId); + CVI_NN_SetConfig(model, OPTION_OUTPUT_ALL_TENSORS, optDumpAllTensors); + + CVI_TENSOR *input_tensors, *output_tensors; + int32_t input_num, output_num; + ret = CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num, + &output_tensors, &output_num); + EXIT_IF_ERROR(ret != CVI_RC_SUCCESS, "get input output tensors failed"); + + dumpTensorsInfo(input_tensors, input_num, + output_tensors, output_num); + if (optInputFile.empty() == false) { + loadInput(optInputFile, input_tensors, input_num); + } + + int err = 0; + if (optEnableTimer) { + struct timeval t0, t1; + long elapsed; + gettimeofday(&t0, NULL); + + for (int i = 0; i < optInferenceCount; ++i) { + ret = CVI_NN_Forward(model, input_tensors, input_num, + output_tensors, output_num); + EXIT_IF_ERROR(ret != CVI_RC_SUCCESS, "forward failed"); + } + + gettimeofday(&t1, NULL); + elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; + double ms_per_iter = elapsed/optInferenceCount/1000.0; + double fps = 1000.0/ms_per_iter; + std::cout << "Performance result: " + << optInferenceCount << " runs take " + << elapsed/1000.0 << " ms, each run takes " + << std::to_string(ms_per_iter) << " ms, fps " + << std::to_string(fps) << std::endl; + } else { + int fail_cnt = 0; + for (int i = 0; i < optInferenceCount; ++i) { + ret = CVI_NN_Forward(model, input_tensors, input_num, output_tensors, + output_num); + EXIT_IF_ERROR(ret != CVI_RC_SUCCESS, "forward failed"); + if (ref_fname.size() && + !compareResultWithNpz(output_tensors, output_num, ref_fname)) { + fail_cnt++; + } + } + if (ref_fname.size()) { + std::cout << "Compare result: " << (optInferenceCount - fail_cnt) << "/" + << optInferenceCount << " passed.\n"; + if (fail_cnt) + err = 1; + } + } + + if (!optOutputFile.empty()) { + saveResultToNpz(optOutputFile, output_tensors, output_num); + } + + CVI_NN_CleanupModel(model); + + return err; +} diff --git a/cviruntime/tool/multi_model_tester.cpp b/cviruntime/tool/multi_model_tester.cpp new file mode 100644 index 000000000..e733985be --- /dev/null +++ b/cviruntime/tool/multi_model_tester.cpp @@ -0,0 +1,161 @@ +#include "pthread.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "cviruntime.h" +#include +#include "argparse.hpp" +#include "assert.h" + +static int32_t optCount = 1; + +#define EXIT_IF_ERROR(cond, statement) \ + if ((cond)) { \ + printf("%s\n", statement); \ + exit(1); \ + } + +static bool compare = false; +static void *thread_entry(void *p) { + CVI_MODEL_HANDLE model = (CVI_MODEL_HANDLE)p; + + CVI_TENSOR *input_tensors, *output_tensors; + int32_t input_num, output_num; + int32_t count = optCount; + bool init_in_out = false; + std::vector> results; + while (count--) { + CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num, + &output_tensors, &output_num); + if (!compare || !init_in_out) { + // fill random data to inputs. 
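Each tester thread fills its inputs with clamped Gaussian noise, as the loop that follows shows: FP32 tensors draw from N(0.3, 0.2) clamped to [0, 1], presumably to resemble normalized image data, while other tensors are treated as int8, drawn from N(50, 50), rounded and clamped to [0, 127]. A standalone sketch of the same fill pattern, with plain vectors standing in for CVI_TENSOR buffers:

```cpp
// Standalone sketch of the tester's input fuzzing: draw from a normal
// distribution, then clamp into the tensor's valid range.
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <random>
#include <vector>

int main() {
  std::mt19937 gen{std::random_device{}()};

  // FP32-style input: N(0.3, 0.2) clamped to [0, 1].
  std::normal_distribution<float> df(0.3f, 0.2f);
  std::vector<float> fp32(16);
  for (size_t i = 0; i < fp32.size(); ++i) {
    float v = df(gen);
    fp32[i] = v < 0 ? 0 : (v > 1 ? 1 : v);
  }

  // INT8-style input: N(50, 50), rounded and clamped to [0, 127].
  std::normal_distribution<float> di(50.0f, 50.0f);
  std::vector<int8_t> i8(16);
  for (size_t i = 0; i < i8.size(); ++i) {
    float v = std::round(di(gen));
    v = v < 0 ? 0 : (v > 127 ? 127 : v);
    i8[i] = static_cast<int8_t>(v);
  }
  std::printf("fp32[0]=%f i8[0]=%d\n", fp32[0], (int)i8[0]);
  return 0;
}
```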
diff --git a/cviruntime/tool/multi_model_tester.cpp b/cviruntime/tool/multi_model_tester.cpp
new file mode 100644
index 000000000..e733985be
--- /dev/null
+++ b/cviruntime/tool/multi_model_tester.cpp
@@ -0,0 +1,161 @@
+#include "pthread.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sched.h>
+#include <string>
+#include <vector>
+#include <utility>
+#include <random>
+#include <cmath>
+#include "cviruntime.h"
+#include "argparse.hpp"
+#include "assert.h"
+
+static int32_t optCount = 1;
+
+#define EXIT_IF_ERROR(cond, statement) \
+  if ((cond)) {                        \
+    printf("%s\n", statement);         \
+    exit(1);                           \
+  }
+
+static bool compare = false;
+
+static void *thread_entry(void *p) {
+  CVI_MODEL_HANDLE model = (CVI_MODEL_HANDLE)p;
+
+  CVI_TENSOR *input_tensors, *output_tensors;
+  int32_t input_num, output_num;
+  int32_t count = optCount;
+  bool init_in_out = false;
+  std::vector<std::vector<uint8_t>> results;
+  while (count--) {
+    CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num,
+                                 &output_tensors, &output_num);
+    if (!compare || !init_in_out) {
+      // fill random data to inputs.
+      std::random_device rd{};
+      std::mt19937 gen{rd()};
+      for (int i = 0; i < input_num; i++) {
+        CVI_TENSOR *tensor = &input_tensors[i];
+        if (tensor->fmt == CVI_FMT_FP32) {
+          std::normal_distribution<double> d{0.3, 0.2};
+          float *data = (float *)CVI_NN_TensorPtr(tensor);
+          for (int j = 0; j < (int)CVI_NN_TensorCount(tensor); j++) {
+            float rand = d(gen);
+            rand = rand < 0 ? 0 : rand;
+            rand = rand > 1 ? 1 : rand;
+            data[j] = rand;
+          }
+        } else {
+          std::normal_distribution<double> d{50, 50};
+          int8_t *data = (int8_t *)CVI_NN_TensorPtr(tensor);
+          for (int j = 0; j < (int)CVI_NN_TensorCount(tensor); j++) {
+            float rand = std::round(d(gen));
+            rand = rand < 0 ? 0 : rand;
+            rand = rand > 127 ? 127 : rand;
+            data[j] = (int8_t)rand;
+          }
+        }
+      }
+    }
+
+    CVI_RC rc = CVI_NN_Forward(model, input_tensors, input_num,
+                               output_tensors, output_num);
+    TPU_ASSERT(rc == CVI_RC_SUCCESS, nullptr);
+    if (compare) {
+      if (!init_in_out) {
+        // keep the first run's outputs as the golden result
+        for (int i = 0; i < output_num; ++i) {
+          uint8_t *data = (uint8_t *)CVI_NN_TensorPtr(&output_tensors[i]);
+          std::vector<uint8_t> rst;
+          rst.assign(data, data + CVI_NN_TensorSize(&output_tensors[i]));
+          results.emplace_back(std::move(rst));
+        }
+        init_in_out = true;
+      } else {
+        for (int i = 0; i < output_num; ++i) {
+          uint8_t *data = (uint8_t *)CVI_NN_TensorPtr(&output_tensors[i]);
+          // compare every byte against the stored golden output
+          for (size_t j = 0; j < CVI_NN_TensorSize(&output_tensors[i]); ++j) {
+            if (data[j] != results[i][j]) {
+              printf("check result fail! tensor:%s\n",
+                     CVI_NN_TensorName(&output_tensors[i]));
+            }
+          }
+        }
+      }
+    }
+  }
+  return NULL;
+}
+
+int main(int argc, const char **argv) {
+  showRuntimeVersion();
+
+  argparse::ArgumentParser parser;
+  parser.addArgument("-m", "--models", '+', false); // required
+  parser.addArgument("-c", "--count", 1);           // inference count
+  parser.addArgument("-v", "--verbose", 1);
+  parser.addArgument("-s", "--shmsize", 1);
+  parser.addArgument("-p", "--compare", 1);
+  parser.parse(argc, argv);
+
+  if (parser.gotArgument("count")) {
+    optCount = parser.retrieve<int32_t>("count");
+  }
+
+  if (parser.gotArgument("shmsize")) {
+    int shmsize = parser.retrieve<int>("shmsize");
+    CVI_NN_Global_SetSharedMemorySize(shmsize);
+    printf("set global shared memory size:%d\n", shmsize);
+  }
+
+  if (parser.gotArgument("compare")) {
+    compare = parser.retrieve<bool>("compare");
+  }
+
+  std::vector<std::string> optModelFiles;
+  optModelFiles = parser.retrieve<std::vector<std::string>>("models");
+  EXIT_IF_ERROR(optModelFiles.size() == 0, "please set at least one cvimodel");
+
+  dumpSysfsDebugFile("/sys/kernel/debug/ion/cvi_carveout_heap_dump/summary");
+
+  std::vector<CVI_MODEL_HANDLE> models;
+  for (auto &modelFile : optModelFiles) {
+    CVI_MODEL_HANDLE model;
+    printf("get model file:%s\n", modelFile.c_str());
+    CVI_RC ret = CVI_NN_RegisterModel(modelFile.c_str(), &model);
+    EXIT_IF_ERROR(ret != CVI_RC_SUCCESS, "failed to register cvimodel");
+    models.push_back(model);
+  }
+
+  int thread_num = optModelFiles.size();
+  pthread_t *thread = new pthread_t[thread_num];
+  for (int i = 0; i < thread_num; i++) {
+    pthread_create(&thread[i], NULL, thread_entry, models[i]);
+  }
+
+#ifdef __riscv_d
+  sched_yield();
+#else
+  pthread_yield();
+#endif
+  for (int i = 0; i < thread_num; i++) {
+    if (pthread_join(thread[i], NULL)) {
+      printf("failed to join thread #%d\n", i);
+      exit(1);
+    }
+  }
+  delete[] thread;
+
+  dumpSysfsDebugFile("/sys/kernel/debug/ion/cvi_carveout_heap_dump/summary");
+  for (int i = 0; i < thread_num; i++) {
+    CVI_NN_CleanupModel(models[i]);
+  }
+  return 0;
+}
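The clipped-Gaussian input filler above appears again, almost verbatim, in multi_thread_tester.cpp below. Factored out it is just the following; fill_random is a hypothetical helper name, and the clamp ranges ([0,1] for fp32, [0,127] for int8) mirror the tester's choices.

// Fill one tensor with normally distributed values, clamped to the
// tester's ranges: [0,1] for fp32 tensors, [0,127] for int8 tensors.
#include <random>
#include <cmath>
#include "cviruntime.h"

static void fill_random(CVI_TENSOR *tensor, std::mt19937 &gen) {
  if (tensor->fmt == CVI_FMT_FP32) {
    std::normal_distribution<double> d{0.3, 0.2};
    float *data = (float *)CVI_NN_TensorPtr(tensor);
    for (size_t i = 0; i < CVI_NN_TensorCount(tensor); i++) {
      float v = (float)d(gen);
      data[i] = v < 0 ? 0 : (v > 1 ? 1 : v);
    }
  } else {
    std::normal_distribution<double> d{50, 50};
    int8_t *data = (int8_t *)CVI_NN_TensorPtr(tensor);
    for (size_t i = 0; i < CVI_NN_TensorCount(tensor); i++) {
      float v = (float)std::round(d(gen));
      data[i] = (int8_t)(v < 0 ? 0 : (v > 127 ? 127 : v));
    }
  }
}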
diff --git a/cviruntime/tool/multi_thread_tester.cpp b/cviruntime/tool/multi_thread_tester.cpp
new file mode 100644
index 000000000..4d4df10f6
--- /dev/null
+++ b/cviruntime/tool/multi_thread_tester.cpp
@@ -0,0 +1,121 @@
+#include "pthread.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sched.h>
+#include <string>
+#include <vector>
+#include <mutex>
+#include <random>
+#include <cmath>
+#include "cviruntime.h"
+#include "argparse.hpp"
+#include "assert.h"
+
+static int g_infer_cnt;
+static std::string g_model_file;
+static std::mutex g_ctx_mutex;
+static CVI_MODEL_HANDLE g_model_handle = nullptr;
+
+static void *thread_entry(void *p) {
+  (void)p;
+  CVI_RC ret;
+  CVI_MODEL_HANDLE model;
+  do {
+    // register the model once; later threads share its weights via clone
+    const std::lock_guard<std::mutex> lock(g_ctx_mutex);
+    if (!g_model_handle) {
+      ret = CVI_NN_RegisterModel(g_model_file.c_str(), &g_model_handle);
+      assert(ret == CVI_RC_SUCCESS);
+      model = g_model_handle;
+    } else {
+      ret = CVI_NN_CloneModel(g_model_handle, &model);
+      assert(ret == CVI_RC_SUCCESS);
+    }
+  } while (0);
+
+  CVI_TENSOR *input_tensors, *output_tensors;
+  int32_t input_num, output_num;
+  ret = CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num,
+                                     &output_tensors, &output_num);
+  assert(ret == CVI_RC_SUCCESS);
+
+  int32_t count = g_infer_cnt;
+  while (count--) {
+    // fill random data to inputs.
+    std::random_device rd{};
+    std::mt19937 gen{rd()};
+    for (int i = 0; i < input_num; i++) {
+      CVI_TENSOR *tensor = &input_tensors[i];
+      if (tensor->fmt == CVI_FMT_FP32) {
+        std::normal_distribution<double> d{0.3, 0.2};
+        float *data = (float *)CVI_NN_TensorPtr(tensor);
+        for (int j = 0; j < (int)CVI_NN_TensorCount(tensor); j++) {
+          float rand = d(gen);
+          rand = rand < 0 ? 0 : rand;
+          rand = rand > 1 ? 1 : rand;
+          data[j] = rand;
+        }
+      } else {
+        std::normal_distribution<double> d{50, 50};
+        int8_t *data = (int8_t *)CVI_NN_TensorPtr(tensor);
+        for (int j = 0; j < (int)CVI_NN_TensorCount(tensor); j++) {
+          float rand = std::round(d(gen));
+          rand = rand < 0 ? 0 : rand;
+          rand = rand > 127 ? 127 : rand;
+          data[j] = (int8_t)rand;
+        }
+      }
+    }
+
+    CVI_RC rc = CVI_NN_Forward(model, input_tensors, input_num,
+                               output_tensors, output_num);
+    TPU_ASSERT(rc == CVI_RC_SUCCESS, nullptr);
+  }
+
+  CVI_NN_CleanupModel(model);
+  return NULL;
+}
+
+int main(int argc, const char **argv) {
+  showRuntimeVersion();
+
+  argparse::ArgumentParser parser;
+  parser.addArgument("-m", "--model", 1, false);   // required
+  parser.addArgument("-n", "--threads", 1, false); // thread count
+  parser.addArgument("-c", "--count", 1, false);   // inference count
+  parser.addArgument("-v", "--verbose", 1);
+  parser.parse(argc, argv);
+
+  int thread_num = parser.retrieve<int>("threads");
+  g_model_file = parser.retrieve<std::string>("model");
+  g_infer_cnt = parser.retrieve<int>("count");
+
+  pthread_t *thread = new pthread_t[thread_num];
+  for (int i = 0; i < thread_num; i++) {
+    pthread_create(&thread[i], NULL, thread_entry, nullptr);
+  }
+#ifdef __riscv_d
+  sched_yield();
+#else
+  pthread_yield();
+#endif
+  for (int i = 0; i < thread_num; i++) {
+    if (pthread_join(thread[i], NULL)) {
+      printf("failed to join thread #%d\n", i);
+      exit(1);
+    }
+  }
+  delete[] thread;
+
+  return 0;
+}
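Since this file already relies on std::mutex, the pthread scaffolding in main could equally be written with C++11 std::thread, which also drops the non-portable, glibc-specific pthread_yield() call. A sketch under that assumption (spawn_and_join is a hypothetical name; it would live in the same file so it can call the static thread_entry):

// Equivalent thread scaffolding using std::thread instead of pthreads.
#include <thread>
#include <vector>

static void spawn_and_join(int thread_num) {
  std::vector<std::thread> workers;
  for (int i = 0; i < thread_num; i++) {
    workers.emplace_back([] { thread_entry(nullptr); });
  }
  for (auto &w : workers) {
    w.join(); // join() reports failure by throwing std::system_error
  }
}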
diff --git a/cviruntime/tool/similarity.hpp b/cviruntime/tool/similarity.hpp
new file mode 100644
index 000000000..eae0d49f6
--- /dev/null
+++ b/cviruntime/tool/similarity.hpp
@@ -0,0 +1,124 @@
+#ifndef RUNTIME_SIMILARITY_H
+#define RUNTIME_SIMILARITY_H
+
+#include <stdint.h>
+#include <cmath>
+#include <vector>
+#include <typeinfo>
+
+static float u16_to_bf16(uint16_t val) {
+  // a bf16 value is the high 16 bits of an fp32; zero the low bits first
+  float ret = 0;
+  auto *q = reinterpret_cast<uint16_t *>(&ret);
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+  q[0] = val;
+#else
+  q[1] = val;
+#endif
+  return ret;
+}
+
+template <typename U, typename V>
+static bool array_convert(U *u, V *v, std::vector<float> &uu, std::vector<float> &vv) {
+  // convert both arrays to fp32 and report whether they are bit-exact equal
+  size_t equal_cnt = 0;
+  for (size_t i = 0; i < uu.size(); i++) {
+    uu[i] = (typeid(U) == typeid(uint16_t)) ? u16_to_bf16(u[i]) : static_cast<float>(u[i]);
+    vv[i] = (typeid(V) == typeid(uint16_t)) ? u16_to_bf16(v[i]) : static_cast<float>(v[i]);
+    if (uu[i] == vv[i])
+      equal_cnt++;
+  }
+  return equal_cnt == uu.size();
+}
+
+static float array_average(float *u, float *v, size_t size) {
+  double average = 0;
+  for (size_t i = 0; i < size; i++) {
+    average += u[i] * v[i];
+  }
+  return average / size;
+}
+
+static float array_average(float *u, size_t size, int power = 1) {
+  double average = 0;
+  for (size_t i = 0; i < size; i++) {
+    if (power != 1) {
+      average += pow(u[i], power);
+    } else {
+      average += u[i];
+    }
+  }
+  return average / size;
+}
+
+static float euclidean_similarity(float *u, float *v, size_t size) {
+  double distance = 0;
+  double root = 0;
+  for (size_t i = 0; i < size; i++) {
+    distance += pow(u[i] - v[i], 2);
+    root += pow((u[i] + v[i]) / 2, 2);
+  }
+  distance = sqrt(distance);
+  root = sqrt(root);
+  return (float)(1 - distance / root);
+}
+
+static float correlation_similarity(float *u, float *v, size_t size, bool centered) {
+  if (centered) {
+    float umu = array_average(u, size);
+    float vmu = array_average(v, size);
+    for (size_t i = 0; i < size; i++) {
+      u[i] -= umu;
+      v[i] -= vmu;
+    }
+  }
+
+  float uv = array_average(u, v, size);
+  float uu = array_average(u, size, 2);
+  float vv = array_average(v, size, 2);
+  return uv / sqrt(uu * vv);
+}
+
+static void truncateArray(std::vector<float> &uu, std::vector<float> &vv,
+                          float epsilon = 0.001, float ratio = 0.9) {
+  // if an array is mostly near-zero, lift those entries to epsilon so the
+  // similarity metrics are not dominated by numerical noise
+  size_t size = uu.size();
+  float u_count = 0.0, v_count = 0.0;
+  for (size_t i = 0; i < size; i++) {
+    if (std::abs(uu[i]) < epsilon) {
+      u_count += 1.0;
+    }
+    if (std::abs(vv[i]) < epsilon) {
+      v_count += 1.0;
+    }
+  }
+
+  float percent_u = u_count / size;
+  float percent_v = v_count / size;
+  if (percent_u > ratio) {
+    for (size_t i = 0; i < size; i++) {
+      uu[i] = (std::abs(uu[i]) < epsilon) ? epsilon : uu[i];
+    }
+  }
+  if (percent_v > ratio) {
+    for (size_t i = 0; i < size; i++) {
+      vv[i] = (std::abs(vv[i]) < epsilon) ? epsilon : vv[i];
+    }
+  }
+}
+
+template <typename U, typename V>
+static void array_similarity(U *u, V *v, size_t size, float &euclidean, float &cosine,
+                             float &correlation) {
+  std::vector<float> uu(size, 0);
+  std::vector<float> vv(size, 0);
+  if (array_convert(u, v, uu, vv)) {
+    // bit-exact arrays are trivially similar
+    euclidean = 1;
+    cosine = 1;
+    correlation = 1;
+    return;
+  }
+  truncateArray(uu, vv);
+  euclidean = euclidean_similarity(uu.data(), vv.data(), uu.size());
+  cosine = correlation_similarity(uu.data(), vv.data(), uu.size(), false);
+  correlation = cosine;
+}
+
+#endif
\ No newline at end of file
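In the notation of the header above, euclidean_similarity computes 1 - ||u - v|| / ||(u + v)/2|| and correlation_similarity computes <u,v> / (||u|| * ||v||), i.e. cosine similarity over the raw arrays (centered == false) or Pearson correlation over mean-centered ones (centered == true); array_similarity reports the uncentered value for both cosine and correlation. A self-contained check, using two int8 buffers that differ in one element so the bit-exact early return is skipped (all three printed values come out slightly below 1):

// Drive similarity.hpp directly on two small int8 arrays.
#include <cstdio>
#include "similarity.hpp"

int main() {
  int8_t a[4] = {1, 2, 3, 4};
  int8_t b[4] = {1, 2, 3, 5};
  float euclidean, cosine, correlation;
  array_similarity(a, b, (size_t)4, euclidean, cosine, correlation);
  printf("euclidean:%f cosine:%f correlation:%f\n",
         euclidean, cosine, correlation);
  return 0;
}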
diff --git a/cviruntime/tool/stress_tester.cpp b/cviruntime/tool/stress_tester.cpp
new file mode 100644
index 000000000..1a716c3d2
--- /dev/null
+++ b/cviruntime/tool/stress_tester.cpp
@@ -0,0 +1,210 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <string>
+#include <vector>
+#include <cmath>
+#include "cviruntime.h"
+#include <cviruntime_context.h>
+#include "argparse.hpp"
+#include "similarity.hpp"
+#include "cnpy.h"
+#include "assert.h"
+
+static bool isNpzFile(const std::string &name) {
+  if (name.size() < 4)
+    return false;
+  std::string extension = name.substr(name.size() - 4);
+  return extension == ".npz";
+}
+
+class ModelTester {
+public:
+  ModelTester(const std::string &model_file, const std::string &ref_npz);
+  ~ModelTester();
+  void loadInputData(const std::string &input_npz);
+  void run();
+  bool compareResults();
+
+private:
+  CVI_RT_HANDLE ctx;
+  CVI_MODEL_HANDLE model = NULL;
+  CVI_TENSOR *input_tensors;
+  CVI_TENSOR *output_tensors;
+  int32_t input_num;
+  int32_t output_num;
+
+  std::vector<int8_t> input_vec;
+  std::string ref_npz;
+};
+
+ModelTester::ModelTester(const std::string &model_file, const std::string &ref_npz)
+    : ref_npz(ref_npz) {
+
+  CVI_RT_Init(&ctx);
+
+  if (CVI_RC_SUCCESS != CVI_NN_RegisterModel(model_file.c_str(), &model)) {
+    exit(1);
+  }
+  if (!model)
+    return;
+
+  CVI_NN_SetConfig(model, OPTION_OUTPUT_ALL_TENSORS, true);
+
+  CVI_NN_GetInputOutputTensors(model, &input_tensors, &input_num,
+                               &output_tensors, &output_num);
+}
+
+ModelTester::~ModelTester() {
+  CVI_NN_CleanupModel(model);
+  CVI_RT_DeInit(ctx);
+}
+
+bool ModelTester::compareResults() {
+  float euclidean = 0;
+  float cosine = 0;
+  float correlation = 0;
+
+  int err_cnt = 0;
+  for (int i = 0; i < output_num; i++) {
+    auto &tensor = output_tensors[i];
+    std::string name(tensor.name);
+    auto refData = cnpy::npz_load(ref_npz, name);
+    if (refData.num_vals == 0) {
+      printf("Warning, cannot find %s in reference\n", name.c_str());
+      continue;
+    }
+    if (tensor.count != refData.num_vals) {
+      printf("%s %zu vs %zu, sizes are not equal\n", name.c_str(), tensor.count,
+             refData.num_vals);
+      return false;
+    }
+
+    if (refData.type == 'f') {
+      if (tensor.fmt == CVI_FMT_INT8) {
+        array_similarity((int8_t *)CVI_NN_TensorPtr(&tensor), refData.data<float>(),
+                         tensor.count, euclidean, cosine, correlation);
+      } else if (tensor.fmt == CVI_FMT_BF16) {
+        array_similarity((uint16_t *)CVI_NN_TensorPtr(&tensor), refData.data<float>(),
+                         tensor.count, euclidean, cosine, correlation);
+      } else {
+        array_similarity((float *)CVI_NN_TensorPtr(&tensor), refData.data<float>(),
+                         tensor.count, euclidean, cosine, correlation);
+      }
+    } else if (refData.type == 'u') {
+      assert(refData.word_size == 2);
+      if (tensor.fmt == CVI_FMT_BF16) {
+        array_similarity((uint16_t *)CVI_NN_TensorPtr(&tensor), refData.data<uint16_t>(),
+                         tensor.count, euclidean, cosine, correlation);
+      } else {
+        array_similarity((float *)CVI_NN_TensorPtr(&tensor), refData.data<uint16_t>(),
+                         tensor.count, euclidean, cosine, correlation);
+      }
+    } else if (refData.type == 'i') {
+      assert(refData.word_size == 1);
+      if (tensor.fmt == CVI_FMT_INT8) {
+        array_similarity((int8_t *)CVI_NN_TensorPtr(&tensor), refData.data<int8_t>(),
+                         tensor.count, euclidean, cosine, correlation);
+      } else {
+        array_similarity((float *)CVI_NN_TensorPtr(&tensor), refData.data<int8_t>(),
+                         tensor.count, euclidean, cosine, correlation);
+      }
+    }
+
+    if (cosine < 1 || correlation < 1 || euclidean < 1) {
+      err_cnt++;
+      printf("Error, [%s] cosine:%f correlation:%f euclidean:%f\n", name.c_str(),
+             cosine, correlation, euclidean);
+    } else {
+      printf("[%s] cosine:%f correlation:%f euclidean:%f\n", name.c_str(), cosine,
+             correlation, euclidean);
+    }
+  }
+  if (err_cnt > 0) {
+    printf("Compare failed\n");
+    return false;
+  }
+  printf("Compare passed\n");
+  return true;
+}
+
+void ModelTester::loadInputData(const std::string &input_npz) {
+  assert(isNpzFile(input_npz));
+  auto npz = cnpy::npz_load(input_npz);
+  assert(1 == (int)npz.size());
+  auto &tensor = input_tensors[0];
+  for (auto &npy : npz) {
+    auto &arr = npy.second;
+    input_vec.resize(arr.num_vals);
+    if (arr.type == 'f') {
+      // quantize fp32 input to int8 with the tensor's quant scale
+      auto src = arr.data<float>();
+      auto qscale = CVI_NN_TensorQuantScale(&tensor);
+      for (size_t i = 0; i < input_vec.size(); i++) {
+        int val = std::round(src[i] * qscale);
+        if (val > 127) {
+          val = 127;
+        } else if (val < -128) {
+          val = -128;
+        }
+        input_vec[i] = (int8_t)val;
+      }
+    } else {
+      auto src = arr.data<int8_t>();
+      for (size_t i = 0; i < input_vec.size(); i++) {
+        input_vec[i] = src[i];
+      }
+    }
+    //break;
+  }
+}
+
+void ModelTester::run() {
+  CVI_NN_SetTensorPtr(&input_tensors[0], input_vec.data());
+  CVI_NN_Forward(model, input_tensors, input_num, output_tensors, output_num);
+}
+
+int main(int argc, const char **argv) {
+  showRuntimeVersion();
+
+  argparse::ArgumentParser parser;
+  parser.addArgument("-i", "--input", 1, false);     // required
+  parser.addArgument("-m", "--model", 1, false);     // required
+  parser.addArgument("-r", "--reference", 1, false); // must be an npz file
+  parser.addArgument("-c", "--count", 1, false);
+  parser.parse(argc, argv);
+
+  auto inputFile = parser.retrieve<std::string>("input");
+  auto modelFile = parser.retrieve<std::string>("model");
+  auto referenceFile = parser.retrieve<std::string>("reference");
+  auto count = parser.retrieve<int>("count");
+
+  printf("TEST 1, create & destroy the model %d times.\n", count);
+  for (int i = 0; i < count; ++i) {
+    ModelTester tester(modelFile, referenceFile);
+    tester.loadInputData(inputFile);
+    tester.run();
+    if (i == count - 1) {
+      assert(tester.compareResults());
+    }
+  }
+  printf("TEST 1 passed\n");
+
+  // TEST 2 runs in its own scope so the tester is destroyed before return
+  {
+    printf("TEST 2, run inference %d times on one model.\n", count);
+    ModelTester tester(modelFile, referenceFile);
+    tester.loadInputData(inputFile);
+    for (int i = 0; i < count; ++i) {
+      tester.run();
+    }
+    assert(tester.compareResults());
+    printf("TEST 2 passed\n");
+  }
+  return 0;
+}
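loadInputData's fp32-to-int8 conversion above is the usual saturating quantization step: round(x * qscale), clamped to [-128, 127], with qscale taken from CVI_NN_TensorQuantScale(). Isolated as a helper (quantize is a hypothetical name, not part of the tool):

// round(x * qscale), saturated to the int8 range [-128, 127]
#include <cmath>
#include <stdint.h>

static inline int8_t quantize(float x, float qscale) {
  int val = (int)std::round(x * qscale);
  if (val > 127) val = 127;
  if (val < -128) val = -128;
  return (int8_t)val;
}

// e.g. with qscale = 64: quantize(0.5f, 64) == 32, while
// quantize(3.0f, 64) saturates to 127 since round(3.0 * 64) = 192 > 127.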